Release: v4.35.1

fix copies
Fix FA2 import + deprecation cycle (#27330 )
2025-10-21 17:48:57 +08:00 · 2023-11-14 15:37:41 +01:00 · 2023-11-14 15:37:41 +01:00 · 2023-11-14 15:17:23 +01:00 · 2023-11-14 15:15:50 +01:00 · 2023-11-14 15:15:41 +01:00
586 changed files with 35205 additions and 7709 deletions
--- a/.circleci/create_circleci_config.py
+++ b/.circleci/create_circleci_config.py
@ -127,6 +127,7 @@ class CircleCIJob:
            },
        ]
        steps.extend([{"run": l} for l in self.install_steps])
+        steps.extend([{"run": 'pip install "fsspec>=2023.5.0,<2023.10.0"'}])
        steps.extend([{"run": "pip install pytest-subtests"}])
        steps.append(
            {
--- a/.github/conda/meta.yaml
+++ b/.github/conda/meta.yaml
@ -26,6 +26,8 @@ requirements:
    - protobuf
    - tokenizers >=0.11.1,!=0.11.3,<0.13
    - pyyaml >=5.1
+    - safetensors
+    - fsspec
  run:
    - python
    - numpy >=1.17
@ -40,6 +42,8 @@ requirements:
    - protobuf
    - tokenizers >=0.11.1,!=0.11.3,<0.13
    - pyyaml >=5.1
+    - safetensors
+    - fsspec

 test:
  imports:
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@ -15,7 +15,7 @@ jobs:
      commit_sha: ${{ github.sha }}
      package: transformers
      notebook_folder: transformers_doc
-      languages: de en es fr it ko pt zh ja te
+      languages: de en es fr hi it ko pt zh ja te
    secrets:
      token: ${{ secrets.HUGGINGFACE_PUSH }}
      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@ -14,4 +14,4 @@ jobs:
      commit_sha: ${{ github.event.pull_request.head.sha }}
      pr_number: ${{ github.event.number }}
      package: transformers
-      languages: de en es fr it ko pt zh ja te
+      languages: de en es fr hi it ko pt zh ja te
--- a/.github/workflows/self-nightly-scheduled.yml
+++ b/.github/workflows/self-nightly-scheduled.yml
@ -21,36 +21,8 @@ env:
  RUN_PT_TF_CROSS_TESTS: 1

 jobs:
-  check_runner_status:
-    name: Check Runner Status
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout transformers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-
-      - name: Check Runner Status
-        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-past-ci-runner-docker,multi-gpu-past-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-
-  check_runners:
-    name: Check Runners
-    needs: check_runner_status
-    strategy:
-      matrix:
-        machine_type: [single-gpu, multi-gpu]
-    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci]
-    container:
-      image: huggingface/transformers-all-latest-torch-nightly-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
  setup:
    name: Setup
-    needs: check_runners
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
@ -276,8 +248,6 @@ jobs:
    runs-on: ubuntu-latest
    if: always()
    needs: [
-      check_runner_status,
-      check_runners,
      setup,
      run_tests_single_gpu,
      run_tests_multi_gpu,
@ -288,8 +258,6 @@ jobs:
        shell: bash
        # For the meaning of these environment variables, see the job `Setup`
        run: |
-          echo "Runner availability: ${{ needs.check_runner_status.result }}"
-          echo "Runner status: ${{ needs.check_runners.result }}"
          echo "Setup status: ${{ needs.setup.result }}"

      - uses: actions/checkout@v3
@ -303,8 +271,6 @@ jobs:
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
          CI_EVENT: Nightly CI
-          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
-          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
          SETUP_STATUS: ${{ needs.setup.result }}
        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
--- a/.github/workflows/self-past.yml
+++ b/.github/workflows/self-past.yml
@ -32,36 +32,8 @@ env:
  RUN_PT_TF_CROSS_TESTS: 1

 jobs:
-  check_runner_status:
-    name: Check Runner Status
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout transformers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-
-      - name: Check Runner Status
-        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-past-ci-runner-docker,multi-gpu-past-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-
-  check_runners:
-    name: Check Runners
-    needs: check_runner_status
-    strategy:
-      matrix:
-        machine_type: [single-gpu, multi-gpu]
-    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, past-ci]
-    container:
-      image: huggingface/transformers-${{ inputs.framework }}-past-${{ inputs.version }}-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
  setup:
    name: Setup
-    needs: check_runners
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
@ -319,8 +291,6 @@ jobs:
    runs-on: ubuntu-latest
    if: always()
    needs: [
-      check_runner_status,
-      check_runners,
      setup,
      run_tests_single_gpu,
      run_tests_multi_gpu,
@ -331,8 +301,6 @@ jobs:
        shell: bash
        # For the meaning of these environment variables, see the job `Setup`
        run: |
-          echo "Runner availability: ${{ needs.check_runner_status.result }}"
-          echo "Runner status: ${{ needs.check_runners.result }}"
          echo "Setup status: ${{ needs.setup.result }}"

      - uses: actions/checkout@v3
@ -351,8 +319,6 @@ jobs:
          CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
          ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
          CI_EVENT: Past CI - ${{ inputs.framework }}-${{ inputs.version }}
-          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
-          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
          SETUP_STATUS: ${{ needs.setup.result }}
        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@ -27,36 +27,8 @@ env:
  RUN_PT_TF_CROSS_TESTS: 1

 jobs:
-  check_runner_status:
-    name: Check Runner Status
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout transformers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-
-      - name: Check Runner Status
-        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-ci-runner-docker,multi-gpu-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-
-  check_runners:
-    name: Check Runners
-    needs: check_runner_status
-    strategy:
-      matrix:
-        machine_type: [single-gpu, multi-gpu]
-    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, push-ci]
-    container:
-      image: huggingface/transformers-all-latest-gpu-push-ci
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
  setup:
    name: Setup
-    needs: check_runners
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
@ -521,8 +493,6 @@ jobs:
    runs-on: ubuntu-latest
    if: always()
    needs: [
-        check_runner_status,
-        check_runners,
        setup,
        run_tests_single_gpu,
        run_tests_multi_gpu,
@ -534,9 +504,7 @@ jobs:
        shell: bash
        # For the meaning of these environment variables, see the job `Setup`
        run: |
-          echo "Runner availability: ${{ needs.check_runner_status.result }}"
          echo "Setup status: ${{ needs.setup.result }}"
-          echo "Runner status: ${{ needs.check_runners.result }}"

      # Necessary to get the correct branch name and commit SHA for `workflow_run` event
      # We also take into account the `push` event (we might want to test some changes in a branch)
@ -589,8 +557,6 @@ jobs:
          CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
          CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
          CI_SHA: ${{ env.CI_SHA }}
-          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
-          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
          SETUP_STATUS: ${{ needs.setup.result }}

        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@ -25,36 +25,8 @@ env:
  RUN_PT_TF_CROSS_TESTS: 1

 jobs:
-  check_runner_status:
-    name: Check Runner Status
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout transformers
-        uses: actions/checkout@v3
-        with:
-          fetch-depth: 2
-
-      - name: Check Runner Status
-        run: python utils/check_self_hosted_runner.py --target_runners single-gpu-scheduled-ci-runner-docker,multi-gpu-scheduled-ci-runner-docker --token ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
-
-  check_runners:
-    name: Check Runners
-    needs: check_runner_status
-    strategy:
-      matrix:
-        machine_type: [single-gpu, multi-gpu]
-    runs-on: ['${{ matrix.machine_type }}', nvidia-gpu, t4, daily-ci]
-    container:
-      image: huggingface/transformers-all-latest-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
  setup:
    name: Setup
-    needs: check_runners
    strategy:
      matrix:
        machine_type: [single-gpu, multi-gpu]
@ -430,8 +402,6 @@ jobs:
    runs-on: ubuntu-latest
    if: always()
    needs: [
-      check_runner_status,
-      check_runners,
      setup,
      run_tests_single_gpu,
      run_tests_multi_gpu,
@ -480,8 +450,6 @@ jobs:
    runs-on: ubuntu-latest
    if: always()
    needs: [
-      check_runner_status,
-      check_runners,
      setup,
      run_tests_single_gpu,
      run_tests_multi_gpu,
@ -496,8 +464,6 @@ jobs:
        shell: bash
        # For the meaning of these environment variables, see the job `Setup`
        run: |
-          echo "Runner availability: ${{ needs.check_runner_status.result }}"
-          echo "Runner status: ${{ needs.check_runners.result }}"
          echo "Setup status: ${{ needs.setup.result }}"

      - uses: actions/checkout@v3
@ -513,8 +479,6 @@ jobs:
          CI_EVENT: scheduled
          CI_SHA: ${{ github.sha }}
          CI_WORKFLOW_REF: ${{ github.workflow_ref }}
-          RUNNER_STATUS: ${{ needs.check_runner_status.result }}
-          RUNNER_ENV_STATUS: ${{ needs.check_runners.result }}
          SETUP_STATUS: ${{ needs.setup.result }}
        # We pass `needs.setup.outputs.matrix` as the argument. A processing in `notification_service.py` to change
        # `models/bert` to `models_bert` is required, as the artifact names use `_` instead of `/`.
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -40,8 +40,8 @@ There are several ways you can contribute to 🤗 Transformers:

 If you don't know where to start, there is a special [Good First
 Issue](https://github.com/huggingface/transformers/contribute) listing. It will give you a list of
-open issues that are beginner-friendly and help you start contributing to open-source. Just comment in the issue that you'd like to work
-on it. 
+open issues that are beginner-friendly and help you start contributing to open-source. Just comment on the issue that you'd like to work
+on. 

 For something slightly more challenging, you can also take a look at the [Good Second Issue](https://github.com/huggingface/transformers/labels/Good%20Second%20Issue) list. In general though, if you feel like you know what you're doing, go for it and we'll help you get there! 🚀

@ -62,7 +62,7 @@ feedback.
 The 🤗 Transformers library is robust and reliable thanks to users who report the problems they encounter.

 Before you report an issue, we would really appreciate it if you could **make sure the bug was not
-already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the library itself, and not your code. If you're unsure whether the bug is in your code or the library, please ask on the [forum](https://discuss.huggingface.co/) first. This helps us respond quicker to fixing issues related to the library versus general questions.
+already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the library itself, and not your code. If you're unsure whether the bug is in your code or the library, please ask in the [forum](https://discuss.huggingface.co/) first. This helps us respond quicker to fixing issues related to the library versus general questions.

 Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so we can quickly resolve it:

@ -105,7 +105,7 @@ We have added [templates](https://github.com/huggingface/transformers/tree/main/

 New models are constantly released and if you want to implement a new model, please provide the following information

-* A short description of the model and link to the paper.
+* A short description of the model and a link to the paper.
 * Link to the implementation if it is open-sourced.
 * Link to the model weights if they are available.

@ -172,7 +172,7 @@ You'll need **[Python 3.8]((https://github.com/huggingface/transformers/blob/mai

   which should be enough for most use cases.

-5. Develop the features on your branch.
+5. Develop the features in your branch.

   As you work on your code, you should make sure the test suite
   passes. Run the tests impacted by your changes like this:
@ -208,7 +208,7 @@ You'll need **[Python 3.8]((https://github.com/huggingface/transformers/blob/mai
   make quality
   ```

-   Finally, we have a lot of scripts to make sure we didn't forget to update
+   Finally, we have a lot of scripts to make sure we don't forget to update
   some files when adding a new model. You can run these scripts with:

   ```bash
@ -218,7 +218,7 @@ You'll need **[Python 3.8]((https://github.com/huggingface/transformers/blob/mai
   To learn more about those checks and how to fix any issues with them, check out the
   [Checks on a Pull Request](https://huggingface.co/docs/transformers/pr_checks) guide.

-   If you're modifying documents under `docs/source` directory, make sure the documentation can still be built. This check will also run in the CI when you open a pull request. To run a local check
+   If you're modifying documents under the `docs/source` directory, make sure the documentation can still be built. This check will also run in the CI when you open a pull request. To run a local check
   make sure you install the documentation builder:
   
   ```bash
@ -234,7 +234,7 @@ You'll need **[Python 3.8]((https://github.com/huggingface/transformers/blob/mai
   This will build the documentation in the `~/tmp/test-build` folder where you can inspect the generated
   Markdown files with your favorite editor. You can also preview the docs on GitHub when you open a pull request.

-   Once you're happy with your changes, add changed files with `git add` and
+   Once you're happy with your changes, add the changed files with `git add` and
   record your changes locally with `git commit`:

   ```bash
@ -261,7 +261,7 @@ You'll need **[Python 3.8]((https://github.com/huggingface/transformers/blob/mai

   If you've already opened a pull request, you'll need to force push with the `--force` flag. Otherwise, if the pull request hasn't been opened yet, you can just push your changes normally.

-6. Now you can go to your fork of the repository on GitHub and click on **Pull request** to open a pull request. Make sure you tick off all the boxes in our [checklist](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md/#pull-request-checklist) below. When you're ready, you can send your changes to the project maintainers for review.
+6. Now you can go to your fork of the repository on GitHub and click on **Pull Request** to open a pull request. Make sure you tick off all the boxes on our [checklist](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md/#pull-request-checklist) below. When you're ready, you can send your changes to the project maintainers for review.

 7. It's ok if maintainers request changes, it happens to our core contributors
   too! So everyone can see the changes in the pull request, work in your local
--- a/README.md
+++ b/README.md
@ -54,6 +54,7 @@ limitations under the License.
        <a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/README_ru.md">Русский</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/README_pt-br.md">Рortuguês</a> |
+        <a href="https://github.com/huggingface/transformers//blob/main/README_te.md">తెలుగు</a> |
    </p>
 </h4>

@ -69,7 +70,7 @@ limitations under the License.

 These models can be applied on:

-* 📝 Text, for tasks like text classification, information extraction, question answering, summarization, translation, text generation, in over 100 languages.
+* 📝 Text, for tasks like text classification, information extraction, question answering, summarization, translation, and text generation, in over 100 languages.
 * 🖼️ Images, for tasks like image classification, object detection, and segmentation.
 * 🗣️ Audio, for tasks like speech recognition and audio classification.

@ -147,7 +148,7 @@ To immediately use a model on a given input (text, image, audio, ...), we provid
 [{'label': 'POSITIVE', 'score': 0.9996980428695679}]
 ```

-The second line of code downloads and caches the pretrained model used by the pipeline, while the third evaluates it on the given text. Here the answer is "positive" with a confidence of 99.97%.
+The second line of code downloads and caches the pretrained model used by the pipeline, while the third evaluates it on the given text. Here, the answer is "positive" with a confidence of 99.97%.

 Many tasks have a pre-trained `pipeline` ready to go, in NLP but also in computer vision and speech. For example, we can easily extract detected objects in an image:

@ -181,7 +182,7 @@ Many tasks have a pre-trained `pipeline` ready to go, in NLP but also in compute
  'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}]
 ```

-Here we get a list of objects detected in the image, with a box surrounding the object and a confidence score. Here is the original image on the left, with the predictions displayed on the right:
+Here, we get a list of objects detected in the image, with a box surrounding the object and a confidence score. Here is the original image on the left, with the predictions displayed on the right:

 <h3 align="center">
    <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png" width="400"></a>
@ -212,7 +213,7 @@ And here is the equivalent code for TensorFlow:
 >>> outputs = model(**inputs)
 ```

-The tokenizer is responsible for all the preprocessing the pretrained model expects, and can be called directly on a single string (as in the above examples) or a list. It will output a dictionary that you can use in downstream code or simply directly pass to your model using the ** argument unpacking operator.
+The tokenizer is responsible for all the preprocessing the pretrained model expects and can be called directly on a single string (as in the above examples) or a list. It will output a dictionary that you can use in downstream code or simply directly pass to your model using the ** argument unpacking operator.

 The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) or a [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (depending on your backend) which you can use as usual. [This tutorial](https://huggingface.co/docs/transformers/training) explains how to integrate such a model into a classic PyTorch or TensorFlow training loop, or how to use our `Trainer` API to quickly fine-tune on a new dataset.

@ -232,7 +233,7 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta
 1. Choose the right framework for every part of a model's lifetime:
    - Train state-of-the-art models in 3 lines of code.
    - Move a single model between TF2.0/PyTorch/JAX frameworks at will.
-    - Seamlessly pick the right framework for training, evaluation and production.
+    - Seamlessly pick the right framework for training, evaluation, and production.

 1. Easily customize a model or an example to your needs:
    - We provide examples for each architecture to reproduce the results published by its original authors.
@ -243,19 +244,19 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta

 - This library is not a modular toolbox of building blocks for neural nets. The code in the model files is not refactored with additional abstractions on purpose, so that researchers can quickly iterate on each of the models without diving into additional abstractions/files.
 - The training API is not intended to work on any model but is optimized to work with the models provided by the library. For generic machine learning loops, you should use another library (possibly, [Accelerate](https://huggingface.co/docs/accelerate)).
- While we strive to present as many use cases as possible, the scripts in our [examples folder](https://github.com/huggingface/transformers/tree/main/examples) are just that: examples. It is expected that they won't work out-of-the box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs.
+- While we strive to present as many use cases as possible, the scripts in our [examples folder](https://github.com/huggingface/transformers/tree/main/examples) are just that: examples. It is expected that they won't work out-of-the-box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs.

 ## Installation

 ### With pip

-This repository is tested on Python 3.8+, Flax 0.4.1+, PyTorch 1.10+ and TensorFlow 2.6+.
+This repository is tested on Python 3.8+, Flax 0.4.1+, PyTorch 1.10+, and TensorFlow 2.6+.

 You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).

 First, create a virtual environment with the version of Python you're going to use and activate it.

-Then, you will need to install at least one of Flax, PyTorch or TensorFlow.
+Then, you will need to install at least one of Flax, PyTorch, or TensorFlow.
 Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/), [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or [Flax](https://github.com/google/flax#quick-install) and [Jax](https://github.com/google/jax#installation) installation pages regarding the specific installation command for your platform.

 When one of those backends has been installed, 🤗 Transformers can be installed using pip as follows:
@ -282,7 +283,7 @@ Follow the installation pages of Flax, PyTorch or TensorFlow to see how to insta

 ## Model architectures

-**[All the model checkpoints](https://huggingface.co/models)** provided by 🤗 Transformers are seamlessly integrated from the huggingface.co [model hub](https://huggingface.co/models) where they are uploaded directly by [users](https://huggingface.co/users) and [organizations](https://huggingface.co/organizations).
+**[All the model checkpoints](https://huggingface.co/models)** provided by 🤗 Transformers are seamlessly integrated from the huggingface.co [model hub](https://huggingface.co/models), where they are uploaded directly by [users](https://huggingface.co/users) and [organizations](https://huggingface.co/organizations).

 Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)

@ -294,11 +295,11 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
 1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
 1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team.
-1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
+1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer.
 1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
 1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
 1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
-1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
+1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova.
 1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
 1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
 1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
@ -385,6 +386,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
+1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
@ -432,10 +434,10 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
+1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
-1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
+1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
@ -459,6 +461,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
 1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
+1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
 1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
@ -497,7 +500,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
 1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
 1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
-1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) rreleased with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
+1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
 1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
 1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
 1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.
@ -519,7 +522,7 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
 1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
 1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
-1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
+1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.

 To check if each model has an implementation in Flax, PyTorch or TensorFlow, or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/docs/transformers/index#supported-frameworks).

--- a/README_es.md
+++ b/README_es.md
@ -46,7 +46,8 @@ limitations under the License.
        <a href="https://github.com/huggingface/transformers/blob/main/README_ko.md">한국어</a> |
        <b>Español</b> |
        <a href="https://github.com/huggingface/transformers/blob/main/README_ja.md">日本語</a> |
-        <a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a>
+        <a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a> |
+        <a href="https://github.com/huggingface/transformers//blob/main/README_te.md">తెలుగు</a> |
    </p>
 </h4>

@ -360,6 +361,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
+1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
@ -407,10 +409,10 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
 1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
+1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
-1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
+1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
@ -434,6 +436,7 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
 1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng) released with the paper [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
+1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
 1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
@ -528,4 +531,4 @@ Ahora nosotros tenemos un [papel](https://www.aclweb.org/anthology/2020.emnlp-de
    url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
    pages = "38--45"
 }
-```
+```
--- a/README_hd.md
+++ b/README_hd.md
@ -72,6 +72,7 @@ checkpoint: जाँच बिंदु
        <a href="https://github.com/huggingface/transformers/blob/main/README_es.md">Español</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/README_ja.md">日本語</a> |
        <b>हिन्दी</b> |
+        <a href="https://github.com/huggingface/transformers//blob/main/README_te.md">తెలుగు</a> |
    </p>
 </h4>

@ -334,6 +335,7 @@ conda install -c huggingface transformers
 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (Salesforce से) Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. द्वाराअनुसंधान पत्र [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) के साथ जारी किया गया
 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
+1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (माइक्रोसॉफ्ट रिसर्च एशिया से) साथ देने वाला पेपर [लेआउटएलएमवी3: यूनिफाइड टेक्स्ट और इमेज मास्किंग के साथ दस्तावेज़ एआई के लिए पूर्व-प्रशिक्षण](https://arxiv.org/abs/2204.08387) युपन हुआंग, टेंगचाओ लव, लेई कुई, युटोंग लू, फुरु वेई द्वारा पोस्ट किया गया।
@ -381,10 +383,10 @@ conda install -c huggingface transformers
 1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (Meta AI से) Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic. द्वाराअनुसंधान पत्र [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) के साथ जारी किया गया
 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (विस्कॉन्सिन विश्वविद्यालय - मैडिसन से) साथ में कागज [Nyströmformer: A Nyström- आधारित एल्गोरिथम आत्म-ध्यान का अनुमान लगाने के लिए ](https://arxiv.org/abs/2102.03902) युनयांग ज़िओंग, झानपेंग ज़ेंग, रुद्रसिस चक्रवर्ती, मिंगक्सिंग टैन, ग्लेन फंग, यिन ली, विकास सिंह द्वारा पोस्ट किया गया।
 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs से) पेपर [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) जितेश जैन, जिआचेन ली, मांगटिक चिउ, अली हसनी, निकिता ओरलोव, हम्फ्री शि के द्वारा जारी किया गया है।
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
+1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI से) साथ में कागज [विज़न ट्रांसफॉर्मर्स के साथ सिंपल ओपन-वोकैबुलरी ऑब्जेक्ट डिटेक्शन](https:/ /arxiv.org/abs/2205.06230) मैथियास मिंडरर, एलेक्सी ग्रिट्सेंको, ऑस्टिन स्टोन, मैक्सिम न्यूमैन, डिर्क वीसेनबोर्न, एलेक्सी डोसोवित्स्की, अरविंद महेंद्रन, अनुराग अर्नब, मुस्तफा देहघानी, ज़ुओरन शेन, जिओ वांग, ज़ियाओहुआ झाई, थॉमस किफ़, और नील हॉल्सबी द्वारा पोस्ट किया गया।
-1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (Google AI से) Matthias Minderer, Alexey Gritsenko, Neil Houlsby. द्वाराअनुसंधान पत्र [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) के साथ जारी किया गया
+1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (Google AI से) Matthias Minderer, Alexey Gritsenko, Neil Houlsby. द्वाराअनुसंधान पत्र [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) के साथ जारी किया गया
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google की ओर से) साथ में दिया गया पेपर [लंबे इनपुट सारांश के लिए ट्रांसफ़ॉर्मरों को बेहतर तरीके से एक्सटेंड करना](https://arxiv .org/abs/2208.04347) जेसन फांग, याओ झाओ, पीटर जे लियू द्वारा।
 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (दीपमाइंड से) साथ में पेपर [पर्सीवर आईओ: संरचित इनपुट और आउटपुट के लिए एक सामान्य वास्तुकला] (https://arxiv.org/abs/2107.14795) एंड्रयू जेगल, सेबेस्टियन बोरग्यूड, जीन-बैप्टिस्ट अलायराक, कार्ल डोर्श, कैटलिन इओनेस्कु, डेविड द्वारा डिंग, स्कंद कोप्पुला, डैनियल ज़ोरान, एंड्रयू ब्रॉक, इवान शेलहैमर, ओलिवियर हेनाफ, मैथ्यू एम। बोट्विनिक, एंड्रयू ज़िसरमैन, ओरिओल विनियल्स, जोआओ कैरेरा द्वारा पोस्ट किया गया।
@ -408,6 +410,7 @@ conda install -c huggingface transformers
 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (झुईई टेक्नोलॉजी से), साथ में पेपर [रोफॉर्मर: रोटरी पोजिशन एंबेडिंग के साथ एन्हांस्ड ट्रांसफॉर्मर] (https://arxiv.org/pdf/2104.09864v1.pdf) जियानलिन सु और यू लू और शेंगफेंग पैन और बो वेन और युनफेंग लियू द्वारा प्रकाशित।
 1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (Bo Peng से) Bo Peng. द्वाराअनुसंधान पत्र [this repo](https://github.com/BlinkDL/RWKV-LM) के साथ जारी किया गया
+1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
 1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI से) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. द्वाराअनुसंधान पत्र [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) के साथ जारी किया गया
 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP से) साथ देने वाला पेपर [भाषण पहचान के लिए अनसुपरवाइज्ड प्री-ट्रेनिंग में परफॉर्मेंस-एफिशिएंसी ट्रेड-ऑफ्स](https ://arxiv.org/abs/2109.06870) फेलिक्स वू, क्वांगयुन किम, जिंग पैन, क्यू हान, किलियन क्यू. वेनबर्गर, योव आर्टज़ी द्वारा।
--- a/README_ja.md
+++ b/README_ja.md
@ -82,6 +82,7 @@ user: ユーザ
        <a href="https://github.com/huggingface/transformers/blob/main/README_es.md">Español</a> |
        <b>日本語</b> |
        <a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a>
+        <a href="https://github.com/huggingface/transformers//blob/main/README_te.md">తెలుగు</a> |
    </p>
 </h4>

@ -394,6 +395,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (Salesforce から) Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi. から公開された研究論文 [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500)
 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (OpenAI から) Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever から公開された研究論文: [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf)
+1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (Microsoft Research Asia から) Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou から公開された研究論文: [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318)
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (Microsoft Research Asia から) Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou から公開された研究論文: [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740)
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (Microsoft Research Asia から) Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei から公開された研究論文: [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387)
@ -441,10 +443,10 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
 1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (Meta AI から) Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic. から公開された研究論文 [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418)
 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (the University of Wisconsin - Madison から) Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh から公開された研究論文: [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902)
 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs から) Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi から公開された研究論文: [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220)
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
+1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI から) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al から公開された研究論文: [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068)
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI から) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby から公開された研究論文: [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230)
-1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (Google AI から) Matthias Minderer, Alexey Gritsenko, Neil Houlsby. から公開された研究論文 [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683)
+1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (Google AI から) Matthias Minderer, Alexey Gritsenko, Neil Houlsby. から公開された研究論文 [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683)
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google から) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu から公開された研究論文: [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)
 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google から) Jason Phang, Yao Zhao, and Peter J. Liu から公開された研究論文: [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347)
 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind から) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira から公開された研究論文: [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795)
@ -468,6 +470,7 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (WeChatAI から) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou から公開された研究論文: [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf)
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology から), Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu から公開された研究論文: [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864)
 1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (Bo Peng から) Bo Peng. から公開された研究論文 [this repo](https://github.com/BlinkDL/RWKV-LM)
+1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA から) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo から公開された研究論文: [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203)
 1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI から) Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick. から公開された研究論文 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf)
 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP から) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi から公開された研究論文: [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870)
--- a/README_ko.md
+++ b/README_ko.md
@ -47,6 +47,7 @@ limitations under the License.
        <a href="https://github.com/huggingface/transformers/blob/main/README_es.md">Español</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/README_ja.md">日本語</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a>
+        <a href="https://github.com/huggingface/transformers//blob/main/README_te.md">తెలుగు</a> |
    </p>
 </h4>

@ -309,6 +310,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (Salesforce 에서 제공)은 Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.의 [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500)논문과 함께 발표했습니다.
 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (OpenAI 에서) Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever 의 [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) 논문과 함께 발표했습니다.
+1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (Microsoft Research Asia 에서) Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou 의 [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 논문과 함께 발표했습니다.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (Microsoft Research Asia 에서) Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou 의 [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) 논문과 함께 발표했습니다.
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (Microsoft Research Asia 에서) Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei 의 [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) 논문과 함께 발표했습니다.
@ -356,10 +358,10 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (Meta AI 에서 제공)은 Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.의 [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418)논문과 함께 발표했습니다.
 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (the University of Wisconsin - Madison 에서) Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh 의 [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) 논문과 함께 발표했습니다.
 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (SHI Labs 에서) Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi 의 [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) 논문과 함께 발표했습니다.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
+1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (Meta AI 에서) Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 의 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 논문과 함께 발표했습니다.
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (Google AI 에서) Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 의 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 논문과 함께 발표했습니다.
-1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (Google AI 에서 제공)은 Matthias Minderer, Alexey Gritsenko, Neil Houlsby.의 [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683)논문과 함께 발표했습니다.
+1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (Google AI 에서 제공)은 Matthias Minderer, Alexey Gritsenko, Neil Houlsby.의 [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683)논문과 함께 발표했습니다.
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (Google 에서) Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 의 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 논문과 함께 발표했습니다.
 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (Google 에서) Jason Phang, Yao Zhao, Peter J. Liu 의 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 논문과 함께 발표했습니다.
 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (Deepmind 에서) Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 의 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 논문과 함께 발표했습니다.
@ -383,6 +385,7 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (WeChatAI 에서) HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 의 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 논문과 함께 발표했습니다.
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (ZhuiyiTechnology 에서) Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 의 a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 논문과 함께 발표했습니다.
 1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (Bo Peng 에서 제공)은 Bo Peng.의 [this repo](https://github.com/BlinkDL/RWKV-LM)논문과 함께 발표했습니다.
+1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (NVIDIA 에서) Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 의 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 논문과 함께 발표했습니다.
 1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (Meta AI 에서 제공)은 Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.의 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf)논문과 함께 발표했습니다.
 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (ASAPP 에서) Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 의 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 논문과 함께 발표했습니다.
--- a/README_pt-br.md
+++ b/README_pt-br.md
@ -54,6 +54,7 @@ limitations under the License.
        <a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/README_ru.md">Русский</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/README_pt-br.md">Рortuguês</a> |
+        <a href="https://github.com/huggingface/transformers//blob/main/README_te.md">తెలుగు</a> |
    </p>
 </h4>

@ -440,7 +441,7 @@ Número atual de pontos de verificação: ![](https://img.shields.io/endpoint?ur
 1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
+1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
@ -562,14 +563,3 @@ Agora temos um [artigo](https://www.aclweb.org/anthology/2020.emnlp-demos.6/) qu
    pages = "38--45"
 }
 ```
-
-
-
-
-
-
-
-
-
-
-
--- a/README_ru.md
+++ b/README_ru.md
@ -53,6 +53,7 @@ limitations under the License.
        <a href="https://github.com/huggingface/transformers/blob/main/README_ja.md">日本語</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a> |
        <b>Русский</b>
+        <a href="https://github.com/huggingface/transformers//blob/main/README_te.md">తెలుగు</a> |
    <p>
 </h4>

@ -428,7 +429,7 @@ conda install -c huggingface transformers
 1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
+1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
--- a/README_te.md
+++ b/README_te.md
@ -0,0 +1,557 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<p align="center">
+  <picture>
+    <source media="(prefers-color-scheme: dark)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-dark.svg">
+    <source media="(prefers-color-scheme: light)" srcset="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg">
+    <img alt="Hugging Face Transformers Library" src="https://huggingface.co/datasets/huggingface/documentation-images/raw/main/transformers-logo-light.svg" width="352" height="59" style="max-width: 100%;">
+  </picture>
+  <br/>
+  <br/>
+</p>
+
+
+<p align="center">
+    <a href="https://circleci.com/gh/huggingface/transformers">
+        <img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/main">
+    </a>
+    <a href="https://github.com/huggingface/transformers/blob/main/LICENSE">
+        <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue">
+    </a>
+    <a href="https://huggingface.co/docs/transformers/index">
+        <img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/transformers/index.svg?down_color=red&down_message=offline&up_message=online">
+    </a>
+    <a href="https://github.com/huggingface/transformers/releases">
+        <img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
+    </a>
+    <a href="https://github.com/huggingface/transformers/blob/main/CODE_OF_CONDUCT.md">
+        <img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg">
+    </a>
+    <a href="https://zenodo.org/badge/latestdoi/155220641"><img src="https://zenodo.org/badge/155220641.svg" alt="DOI"></a>
+</p>
+
+
+<h4 align="center">
+    <p>
+        <a href="https://github.com/huggingface/transformers/">English</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/README_zh-hans.md">简体中文</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/README_zh-hant.md">繁體中文</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/README_ko.md">한국어</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/README_es.md">Español</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/README_ja.md">日本語</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/README_ru.md">Русский</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/README_pt-br.md">Рortuguês</a> |
+        <b>తెలుగు</b> |
+    </p>
+</h4>
+
+<h3 align="center">
+    <p>JAX, PyTorch మరియు TensorFlow కోసం అత్యాధునిక యంత్ర అభ్యాసం</p>
+</h3>
+
+<h3 align="center">
+    <a href="https://hf.co/course"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/course_banner.png"></a>
+</h3>
+
+🤗 ట్రాన్స్‌ఫార్మర్లు టెక్స్ట్, విజన్ మరియు ఆడియో వంటి విభిన్న పద్ధతులపై టాస్క్‌లను నిర్వహించడానికి వేలాది ముందుగా శిక్షణ పొందిన మోడల్‌లను అందిస్తాయి.
+
+ఈ నమూనాలు వర్తించవచ్చు:
+
+* 📝 టెక్స్ట్, 100కి పైగా భాషల్లో టెక్స్ట్ క్లాసిఫికేషన్, ఇన్ఫర్మేషన్ ఎక్స్‌ట్రాక్షన్, ప్రశ్నలకు సమాధానాలు, సారాంశం, అనువాదం, టెక్స్ట్ జనరేషన్ వంటి పనుల కోసం.
+* 🖼️ ఇమేజ్‌లు, ఇమేజ్ వర్గీకరణ, ఆబ్జెక్ట్ డిటెక్షన్ మరియు సెగ్మెంటేషన్ వంటి పనుల కోసం.
+* 🗣️ ఆడియో, స్పీచ్ రికగ్నిషన్ మరియు ఆడియో వర్గీకరణ వంటి పనుల కోసం.
+
+ట్రాన్స్‌ఫార్మర్ మోడల్‌లు టేబుల్ క్వశ్చన్ ఆన్సర్ చేయడం, ఆప్టికల్ క్యారెక్టర్ రికగ్నిషన్, స్కాన్ చేసిన డాక్యుమెంట్‌ల నుండి ఇన్ఫర్మేషన్ ఎక్స్‌ట్రాక్షన్, వీడియో క్లాసిఫికేషన్ మరియు విజువల్ క్వశ్చన్ ఆన్సర్ చేయడం వంటి **అనేక పద్ధతులతో కలిపి** పనులను కూడా చేయగలవు.
+
+🤗 ట్రాన్స్‌ఫార్మర్లు అందించిన టెక్స్ట్‌లో ప్రీట్రైన్డ్ మోడల్‌లను త్వరగా డౌన్‌లోడ్ చేయడానికి మరియు ఉపయోగించడానికి, వాటిని మీ స్వంత డేటాసెట్‌లలో ఫైన్-ట్యూన్ చేయడానికి మరియు వాటిని మా [మోడల్ హబ్](https://huggingface.co/models)లో సంఘంతో భాగస్వామ్యం చేయడానికి API లను అందిస్తుంది. అదే సమయంలో, ఆర్కిటెక్చర్‌ని నిర్వచించే ప్రతి పైథాన్ మాడ్యూల్ పూర్తిగా స్వతంత్రంగా ఉంటుంది మరియు త్వరిత పరిశోధన ప్రయోగాలను ప్రారంభించడానికి సవరించవచ్చు.
+
+🤗 ట్రాన్స్‌ఫార్మర్‌లకు మూడు అత్యంత ప్రజాదరణ పొందిన డీప్ లెర్నింగ్ లైబ్రరీలు ఉన్నాయి — [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) మరియు [TensorFlow](https://www.tensorflow.org/) — వాటి మధ్య అతుకులు లేని ఏకీకరణతో. మీ మోడల్‌లను ఒకదానితో మరొకదానితో అనుమితి కోసం లోడ్ చేసే ముందు వాటికి శిక్షణ ఇవ్వడం చాలా సులభం.
+
+## ఆన్‌లైన్ డెమోలు
+
+మీరు [మోడల్ హబ్](https://huggingface.co/models) నుండి మా మోడళ్లలో చాలా వరకు వాటి పేజీలలో నేరుగా పరీక్షించవచ్చు. మేము పబ్లిక్ మరియు ప్రైవేట్ మోడల్‌ల కోసం [ప్రైవేట్ మోడల్ హోస్టింగ్, సంస్కరణ & అనుమితి API](https://huggingface.co/pricing)ని కూడా అందిస్తాము.
+
+ఇక్కడ కొన్ని ఉదాహరణలు ఉన్నాయి:
+
+సహజ భాషా ప్రాసెసింగ్‌లో:
+- [BERT తో మాస్క్‌డ్ వర్డ్ కంప్లీషన్](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
+- [Electra తో పేరు ఎంటిటీ గుర్తింపు](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
+- [GPT-2 తో టెక్స్ట్ జనరేషన్](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
+- [RoBERTa తో సహజ భాషా అనుమితి](https://huggingface.co/roberta-large-mnli?text=The+dog+was+Lost.+Nobody+lost+any+animal)
+- [BART తో సారాంశం](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
+- [DistilBERT తో ప్రశ్న సమాధానం](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
+- [T5 తో అనువాదం](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
+
+కంప్యూటర్ దృష్టిలో:
+- [VIT తో చిత్ర వర్గీకరణ](https://huggingface.co/google/vit-base-patch16-224)
+- [DETR తో ఆబ్జెక్ట్ డిటెక్షన్](https://huggingface.co/facebook/detr-resnet-50)
+- [SegFormer తో సెమాంటిక్ సెగ్మెంటేషన్](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512)
+- [MaskFormer తో పానోప్టిక్ సెగ్మెంటేషన్](https://huggingface.co/facebook/maskformer-swin-small-coco)
+- [DPT తో లోతు అంచనా](https://huggingface.co/docs/transformers/model_doc/dpt)
+- [VideoMAE తో వీడియో వర్గీకరణ](https://huggingface.co/docs/transformers/model_doc/videomae)
+- [OneFormer తో యూనివర్సల్ సెగ్మెంటేషన్](https://huggingface.co/shi-labs/oneformer_ade20k_dinat_large)
+
+ఆడియోలో:
+- [Wav2Vec2 తో ఆటోమేటిక్ స్పీచ్ రికగ్నిషన్](https://huggingface.co/facebook/wav2vec2-base-960h)
+- [Wav2Vec2 తో కీవర్డ్ స్పాటింగ్](https://huggingface.co/superb/wav2vec2-base-superb-ks)
+- [ఆడియో స్పెక్ట్రోగ్రామ్ ట్రాన్స్‌ఫార్మర్‌తో ఆడియో వర్గీకరణ](https://huggingface.co/MIT/ast-finetuned-audioset-10-10-0.4593)
+
+మల్టీమోడల్ టాస్క్‌లలో:
+- [TAPAS తో టేబుల్ ప్రశ్న సమాధానాలు](https://huggingface.co/google/tapas-base-finetuned-wtq)
+- [ViLT తో దృశ్యమాన ప్రశ్నకు సమాధానం](https://huggingface.co/dandelin/vilt-b32-finetuned-vqa)
+- [CLIP తో జీరో-షాట్ ఇమేజ్ వర్గీకరణ](https://huggingface.co/openai/clip-vit-large-patch14)
+- [LayoutLM తో డాక్యుమెంట్ ప్రశ్నకు సమాధానం](https://huggingface.co/impira/layoutlm-document-qa)
+- [X-CLIP తో జీరో-షాట్ వీడియో వర్గీకరణ](https://huggingface.co/docs/transformers/model_doc/xclip)
+
+## ట్రాన్స్‌ఫార్మర్‌లను ఉపయోగించి 100 ప్రాజెక్టులు
+
+ట్రాన్స్‌ఫార్మర్లు ప్రీట్రైన్డ్ మోడల్‌లను ఉపయోగించడానికి టూల్‌కిట్ కంటే ఎక్కువ: ఇది దాని చుట్టూ నిర్మించిన ప్రాజెక్ట్‌ల సంఘం మరియు
+హగ్గింగ్ ఫేస్ హబ్. డెవలపర్‌లు, పరిశోధకులు, విద్యార్థులు, ప్రొఫెసర్‌లు, ఇంజనీర్లు మరియు ఎవరినైనా అనుమతించేలా ట్రాన్స్‌ఫార్మర్‌లను మేము కోరుకుంటున్నాము
+వారి కలల ప్రాజెక్టులను నిర్మించడానికి.
+
+ట్రాన్స్‌ఫార్మర్‌ల 100,000 నక్షత్రాలను జరుపుకోవడానికి, మేము స్పాట్‌లైట్‌ని ఉంచాలని నిర్ణయించుకున్నాము
+సంఘం, మరియు మేము 100 జాబితాలను కలిగి ఉన్న [awesome-transformers](./awesome-transformers.md) పేజీని సృష్టించాము.
+ట్రాన్స్‌ఫార్మర్ల పరిసరాల్లో అద్భుతమైన ప్రాజెక్టులు నిర్మించబడ్డాయి.
+
+జాబితాలో భాగమని మీరు విశ్వసించే ప్రాజెక్ట్‌ను మీరు కలిగి ఉంటే లేదా ఉపయోగిస్తుంటే, దయచేసి దానిని జోడించడానికి PRని తెరవండి!
+
+## మీరు హగ్గింగ్ ఫేస్ టీమ్ నుండి అనుకూల మద్దతు కోసం చూస్తున్నట్లయితే
+
+<a target="_blank" href="https://huggingface.co/support">
+    <img alt="HuggingFace Expert Acceleration Program" src="https://cdn-media.huggingface.co/marketing/transformers/new-support-improved.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
+</a><br>
+
+## త్వరిత పర్యటన
+
+ఇచ్చిన ఇన్‌పుట్ (టెక్స్ట్, ఇమేజ్, ఆడియో, ...)పై తక్షణమే మోడల్‌ను ఉపయోగించడానికి, మేము `pipeline` API ని అందిస్తాము. పైప్‌లైన్‌లు ఆ మోడల్ శిక్షణ సమయంలో ఉపయోగించిన ప్రీప్రాసెసింగ్‌తో కూడిన ప్రీట్రైన్డ్ మోడల్‌ను సమూహపరుస్తాయి. సానుకూల మరియు ప్రతికూల పాఠాలను వర్గీకరించడానికి పైప్‌లైన్‌ను త్వరగా ఎలా ఉపయోగించాలో ఇక్కడ ఉంది:
+
+```python
+>>> from transformers import pipeline
+
+# Allocate a pipeline for sentiment-analysis
+>>> classifier = pipeline('sentiment-analysis')
+>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
+[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
+```
+
+రెండవ లైన్ కోడ్ డౌన్‌లోడ్ మరియు పైప్‌లైన్ ఉపయోగించే ప్రీట్రైన్డ్ మోడల్‌ను కాష్ చేస్తుంది, మూడవది ఇచ్చిన టెక్స్ట్‌పై మూల్యాంకనం చేస్తుంది. ఇక్కడ సమాధానం 99.97% విశ్వాసంతో "పాజిటివ్".
+
+చాలా పనులు NLPలో కానీ కంప్యూటర్ విజన్ మరియు స్పీచ్‌లో కూడా ముందుగా శిక్షణ పొందిన `pipeline` సిద్ధంగా ఉన్నాయి. ఉదాహరణకు, మనం చిత్రంలో గుర్తించిన వస్తువులను సులభంగా సంగ్రహించవచ్చు:
+
+``` python
+>>> import requests
+>>> from PIL import Image
+>>> from transformers import pipeline
+
+# Download an image with cute cats
+>>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png"
+>>> image_data = requests.get(url, stream=True).raw
+>>> image = Image.open(image_data)
+
+# Allocate a pipeline for object detection
+>>> object_detector = pipeline('object-detection')
+>>> object_detector(image)
+[{'score': 0.9982201457023621,
+  'label': 'remote',
+  'box': {'xmin': 40, 'ymin': 70, 'xmax': 175, 'ymax': 117}},
+ {'score': 0.9960021376609802,
+  'label': 'remote',
+  'box': {'xmin': 333, 'ymin': 72, 'xmax': 368, 'ymax': 187}},
+ {'score': 0.9954745173454285,
+  'label': 'couch',
+  'box': {'xmin': 0, 'ymin': 1, 'xmax': 639, 'ymax': 473}},
+ {'score': 0.9988006353378296,
+  'label': 'cat',
+  'box': {'xmin': 13, 'ymin': 52, 'xmax': 314, 'ymax': 470}},
+ {'score': 0.9986783862113953,
+  'label': 'cat',
+  'box': {'xmin': 345, 'ymin': 23, 'xmax': 640, 'ymax': 368}}]
+```
+
+ఇక్కడ మనం ఆబ్జెక్ట్ చుట్టూ ఉన్న బాక్స్ మరియు కాన్ఫిడెన్స్ స్కోర్‌తో చిత్రంలో గుర్తించబడిన వస్తువుల జాబితాను పొందుతాము. ఇక్కడ ఎడమవైపున ఉన్న అసలు చిత్రం, కుడివైపున అంచనాలు ప్రదర్శించబడతాయి:
+
+<h3 align="center">
+    <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample.png" width="400"></a>
+    <a><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/coco_sample_post_processed.png" width="400"></a>
+</h3>
+
+మీరు [ఈ ట్యుటోరియల్](https://huggingface.co/docs/transformers/task_summary)లో `pipeline` API ద్వారా సపోర్ట్ చేసే టాస్క్‌ల గురించి మరింత తెలుసుకోవచ్చు.
+
+`pipeline`తో పాటు, మీరు ఇచ్చిన టాస్క్‌లో ఏదైనా ప్రీట్రైన్డ్ మోడల్‌లను డౌన్‌లోడ్ చేయడానికి మరియు ఉపయోగించడానికి, దీనికి మూడు లైన్ల కోడ్ సరిపోతుంది. ఇక్కడ PyTorch వెర్షన్ ఉంది:
+```python
+>>> from transformers import AutoTokenizer, AutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = AutoModel.from_pretrained("bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="pt")
+>>> outputs = model(**inputs)
+```
+
+మరియు TensorFlow కి సమానమైన కోడ్ ఇక్కడ ఉంది:
+```python
+>>> from transformers import AutoTokenizer, TFAutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = TFAutoModel.from_pretrained("bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="tf")
+>>> outputs = model(**inputs)
+```
+
+ప్రిట్రైన్డ్ మోడల్ ఆశించే అన్ని ప్రీప్రాసెసింగ్‌లకు టోకెనైజర్ బాధ్యత వహిస్తుంది మరియు నేరుగా ఒకే స్ట్రింగ్ (పై ఉదాహరణలలో వలె) లేదా జాబితాపై కాల్ చేయవచ్చు. ఇది మీరు డౌన్‌స్ట్రీమ్ కోడ్‌లో ఉపయోగించగల నిఘంటువుని అవుట్‌పుట్ చేస్తుంది లేదా ** ఆర్గ్యుమెంట్ అన్‌ప్యాకింగ్ ఆపరేటర్‌ని ఉపయోగించి నేరుగా మీ మోడల్‌కి పంపుతుంది.
+
+మోడల్ కూడా సాధారణ [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) లేదా [TensorFlow `tf.keras.Model`]( https://www.tensorflow.org/api_docs/python/tf/keras/Model) (మీ బ్యాకెండ్‌ని బట్టి) మీరు మామూలుగా ఉపయోగించవచ్చు. [ఈ ట్యుటోరియల్](https://huggingface.co/docs/transformers/training) అటువంటి మోడల్‌ని క్లాసిక్ PyTorch లేదా TensorFlow ట్రైనింగ్ లూప్‌లో ఎలా ఇంటిగ్రేట్ చేయాలో లేదా మా `Trainer` API ని ఎలా ఉపయోగించాలో వివరిస్తుంది కొత్త డేటాసెట్.
+
+## నేను ట్రాన్స్‌ఫార్మర్‌లను ఎందుకు ఉపయోగించాలి?
+
+1. ఉపయోగించడానికి సులభమైన స్టేట్ ఆఫ్ ది ఆర్ట్ మోడల్‌లు:
+    - సహజ భాషా అవగాహన & ఉత్పత్తి, కంప్యూటర్ దృష్టి మరియు ఆడియో పనులపై అధిక పనితీరు.
+    - విద్యావేత్తలు మరియు అభ్యాసకుల ప్రవేశానికి తక్కువ అవరోధం.
+    - తెలుసుకోవడానికి కేవలం మూడు తరగతులతో కొన్ని వినియోగదారు-ముఖ సంగ్రహణలు.
+    - మా అన్ని ప్రీట్రైన్డ్ మోడల్‌లను ఉపయోగించడం కోసం ఏకీకృత API.
+  
+2. తక్కువ గణన ఖర్చులు, చిన్న కార్బన్ పాదముద్ర:
+    - పరిశోధకులు ఎల్లప్పుడూ మళ్లీ శిక్షణ పొందే బదులు శిక్షణ పొందిన నమూనాలను పంచుకోవచ్చు.
+    - అభ్యాసకులు గణన సమయాన్ని మరియు ఉత్పత్తి ఖర్చులను తగ్గించగలరు.
+    - అన్ని పద్ధతుల్లో 60,000 కంటే ఎక్కువ ప్రీట్రైన్డ్ మోడల్‌లతో డజన్ల కొద్దీ ఆర్కిటెక్చర్‌లు.
+  
+3. మోడల్ జీవితకాలంలో ప్రతి భాగానికి సరైన ఫ్రేమ్‌వర్క్‌ను ఎంచుకోండి:
+    - 3 లైన్ల కోడ్‌లో స్టేట్ ఆఫ్ ది ఆర్ట్ మోడల్‌లకు శిక్షణ ఇవ్వండి.
+    - TF2.0/PyTorch/JAX ఫ్రేమ్‌వర్క్‌ల మధ్య ఒకే మోడల్‌ను ఇష్టానుసారంగా తరలించండి.
+    - శిక్షణ, మూల్యాంకనం మరియు ఉత్పత్తి కోసం సరైన ఫ్రేమ్‌వర్క్‌ను సజావుగా ఎంచుకోండి.
+
+4. మీ అవసరాలకు అనుగుణంగా మోడల్ లేదా ఉదాహరణను సులభంగా అనుకూలీకరించండి:
+    - ప్రతి ఆర్కిటెక్చర్ దాని అసలు రచయితలు ప్రచురించిన ఫలితాలను పునరుత్పత్తి చేయడానికి మేము ఉదాహరణలను అందిస్తాము.
+    - మోడల్ ఇంటర్నల్‌లు వీలైనంత స్థిరంగా బహిర్గతమవుతాయి.
+    - శీఘ్ర ప్రయోగాల కోసం లైబ్రరీ నుండి స్వతంత్రంగా మోడల్ ఫైల్‌లను ఉపయోగించవచ్చు.
+
+## నేను ట్రాన్స్‌ఫార్మర్‌లను ఎందుకు ఉపయోగించకూడదు?
+
+- ఈ లైబ్రరీ న్యూరల్ నెట్‌ల కోసం బిల్డింగ్ బ్లాక్‌ల మాడ్యులర్ టూల్‌బాక్స్ కాదు. మోడల్ ఫైల్‌లలోని కోడ్ ఉద్దేశపూర్వకంగా అదనపు సంగ్రహణలతో రీఫ్యాక్టరింగ్ చేయబడదు, తద్వారా పరిశోధకులు అదనపు సంగ్రహణలు/ఫైళ్లలోకి ప్రవేశించకుండా ప్రతి మోడల్‌పై త్వరగా మళ్లించగలరు.
+- శిక్షణ API ఏ మోడల్‌లో పని చేయడానికి ఉద్దేశించబడలేదు కానీ లైబ్రరీ అందించిన మోడల్‌లతో పని చేయడానికి ఆప్టిమైజ్ చేయబడింది. సాధారణ మెషిన్ లెర్నింగ్ లూప్‌ల కోసం, మీరు మరొక లైబ్రరీని ఉపయోగించాలి (బహుశా, [Accelerate](https://huggingface.co/docs/accelerate)).
+- మేము వీలైనన్ని ఎక్కువ వినియోగ సందర్భాలను ప్రదర్శించడానికి ప్రయత్నిస్తున్నప్పుడు, మా [ఉదాహరణల ఫోల్డర్](https://github.com/huggingface/transformers/tree/main/examples)లోని స్క్రిప్ట్‌లు కేవలం: ఉదాహరణలు. మీ నిర్దిష్ట సమస్యపై అవి పని చేయవు మరియు వాటిని మీ అవసరాలకు అనుగుణంగా మార్చుకోవడానికి మీరు కొన్ని కోడ్ లైన్‌లను మార్చవలసి ఉంటుంది.
+
+## సంస్థాపన
+
+### పిప్ తో
+
+ఈ రిపోజిటరీ పైథాన్ 3.8+, ఫ్లాక్స్ 0.4.1+, PyTorch 1.10+ మరియు TensorFlow 2.6+లో పరీక్షించబడింది.
+
+మీరు [వర్చువల్ వాతావరణం](https://docs.python.org/3/library/venv.html)లో 🤗 ట్రాన్స్‌ఫార్మర్‌లను ఇన్‌స్టాల్ చేయాలి. మీకు పైథాన్ వర్చువల్ పరిసరాల గురించి తెలియకుంటే, [యూజర్ గైడ్](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) చూడండి.
+
+ముందుగా, మీరు ఉపయోగించబోతున్న పైథాన్ వెర్షన్‌తో వర్చువల్ వాతావరణాన్ని సృష్టించండి మరియు దానిని సక్రియం చేయండి.
+
+అప్పుడు, మీరు ఫ్లాక్స్, పైటార్చ్ లేదా టెన్సర్‌ఫ్లోలో కనీసం ఒకదానిని ఇన్‌స్టాల్ చేయాలి.
+దయచేసి [TensorFlow ఇన్‌స్టాలేషన్ పేజీ](https://www.tensorflow.org/install/), [PyTorch ఇన్‌స్టాలేషన్ పేజీ](https://pytorch.org/get-started/locally/#start-locally) మరియు/ని చూడండి లేదా మీ ప్లాట్‌ఫారమ్ కోసం నిర్దిష్ట ఇన్‌స్టాలేషన్ కమాండ్‌కు సంబంధించి [Flax](https://github.com/google/flax#quick-install) మరియు [Jax](https://github.com/google/jax#installation) ఇన్‌స్టాలేషన్ పేజీలు .
+
+ఆ బ్యాకెండ్‌లలో ఒకటి ఇన్‌స్టాల్ చేయబడినప్పుడు, 🤗 ట్రాన్స్‌ఫార్మర్‌లను ఈ క్రింది విధంగా పిప్‌ని ఉపయోగించి ఇన్‌స్టాల్ చేయవచ్చు:
+
+```bash
+pip install transformers
+```
+
+మీరు ఉదాహరణలతో ప్లే చేయాలనుకుంటే లేదా కోడ్ యొక్క బ్లీడింగ్ ఎడ్జ్ అవసరం మరియు కొత్త విడుదల కోసం వేచి ఉండలేకపోతే, మీరు తప్పనిసరిగా [మూలం నుండి లైబ్రరీని ఇన్‌స్టాల్ చేయాలి](https://huggingface.co/docs/transformers/installation#installing-from-source).
+
+### కొండా తో
+
+ట్రాన్స్‌ఫార్మర్స్ వెర్షన్ v4.0.0 నుండి, మేము ఇప్పుడు కొండా ఛానెల్‌ని కలిగి ఉన్నాము: `huggingface`.
+
+🤗 కింది విధంగా కొండా ఉపయోగించి ట్రాన్స్‌ఫార్మర్‌లను ఇన్‌స్టాల్ చేయవచ్చు:
+
+```shell script
+conda install -c huggingface transformers
+```
+
+Flax, PyTorch లేదా TensorFlow యొక్క ఇన్‌స్టాలేషన్ పేజీలను కొండాతో ఎలా ఇన్‌స్టాల్ చేయాలో చూడటానికి వాటిని అనుసరించండి.
+
+> **_గమనిక:_** Windowsలో, కాషింగ్ నుండి ప్రయోజనం పొందేందుకు మీరు డెవలపర్ మోడ్‌ని సక్రియం చేయమని ప్రాంప్ట్ చేయబడవచ్చు. ఇది మీకు ఎంపిక కాకపోతే, దయచేసి [ఈ సంచిక](https://github.com/huggingface/huggingface_hub/issues/1062)లో మాకు తెలియజేయండి.
+
+## మోడల్ ఆర్కిటెక్చర్లు
+
+**[అన్ని మోడల్ చెక్‌పాయింట్‌లు](https://huggingface.co/models)** 🤗 అందించిన ట్రాన్స్‌ఫార్మర్లు huggingface.co [model hub](https://huggingface.co/models) నుండి సజావుగా ఏకీకృతం చేయబడ్డాయి [users](https://huggingface.co/users) మరియు [organizations](https://huggingface.co/organizations) ద్వారా నేరుగా అప్‌లోడ్ చేయబడతాయి.
+
+ప్రస్తుత తనిఖీ కేంద్రాల సంఖ్య: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
+
+🤗 ట్రాన్స్‌ఫార్మర్లు ప్రస్తుతం కింది ఆర్కిటెక్చర్‌లను అందజేస్తున్నాయి (వాటిలో ప్రతి ఒక్కటి ఉన్నత స్థాయి సారాంశం కోసం [ఇక్కడ](https://huggingface.co/docs/transformers/model_summary) చూడండి):
+
+1. **[ALBERT](https://huggingface.co/docs/transformers/model_doc/albert)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
+1. **[ALIGN](https://huggingface.co/docs/transformers/model_doc/align)** (from Google Research) released with the paper [Scaling Up Visual and Vision-Language Representation Learning With Noisy Text Supervision](https://arxiv.org/abs/2102.05918) by Chao Jia, Yinfei Yang, Ye Xia, Yi-Ting Chen, Zarana Parekh, Hieu Pham, Quoc V. Le, Yunhsuan Sung, Zhen Li, Tom Duerig.
+1. **[AltCLIP](https://huggingface.co/docs/transformers/model_doc/altclip)** (from BAAI) released with the paper [AltCLIP: Altering the Language Encoder in CLIP for Extended Language Capabilities](https://arxiv.org/abs/2211.06679) by Chen, Zhongzhi and Liu, Guang and Zhang, Bo-Wen and Ye, Fulong and Yang, Qinghong and Wu, Ledell.
+1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
+1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
+1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team.
+1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/abs/1910.13461) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer.
+1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
+1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
+1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
+1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee, and Kristina Toutanova.
+1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
+1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
+1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
+1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
+1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
+1. **[BiT](https://huggingface.co/docs/transformers/model_doc/bit)** (from Google AI) released with the paper [Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370) by Alexander Kolesnikov, Lucas Beyer, Xiaohua Zhai, Joan Puigcerver, Jessica Yung, Sylvain Gelly, Neil Houlsby.
+1. **[Blenderbot](https://huggingface.co/docs/transformers/model_doc/blenderbot)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+1. **[BlenderbotSmall](https://huggingface.co/docs/transformers/model_doc/blenderbot-small)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+1. **[BLIP](https://huggingface.co/docs/transformers/model_doc/blip)** (from Salesforce) released with the paper [BLIP: Bootstrapping Language-Image Pre-training for Unified Vision-Language Understanding and Generation](https://arxiv.org/abs/2201.12086) by Junnan Li, Dongxu Li, Caiming Xiong, Steven Hoi.
+1. **[BLIP-2](https://huggingface.co/docs/transformers/model_doc/blip-2)** (from Salesforce) released with the paper [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) by Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi.
+1. **[BLOOM](https://huggingface.co/docs/transformers/model_doc/bloom)** (from BigScience workshop) released by the [BigScience Workshop](https://bigscience.huggingface.co/).
+1. **[BORT](https://huggingface.co/docs/transformers/model_doc/bort)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
+1. **[BridgeTower](https://huggingface.co/docs/transformers/model_doc/bridgetower)** (from Harbin Institute of Technology/Microsoft Research Asia/Intel Labs) released with the paper [BridgeTower: Building Bridges Between Encoders in Vision-Language Representation Learning](https://arxiv.org/abs/2206.08657) by Xiao Xu, Chenfei Wu, Shachar Rosenman, Vasudev Lal, Wanxiang Che, Nan Duan.
+1. **[BROS](https://huggingface.co/docs/transformers/model_doc/bros)** (from NAVER CLOVA) released with the paper [BROS: A Pre-trained Language Model Focusing on Text and Layout for Better Key Information Extraction from Documents](https://arxiv.org/abs/2108.04539) by Teakgyu Hong, Donghyun Kim, Mingi Ji, Wonseok Hwang, Daehyun Nam, Sungrae Park.
+1. **[ByT5](https://huggingface.co/docs/transformers/model_doc/byt5)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
+1. **[CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
+1. **[CANINE](https://huggingface.co/docs/transformers/model_doc/canine)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
+1. **[Chinese-CLIP](https://huggingface.co/docs/transformers/model_doc/chinese_clip)** (from OFA-Sys) released with the paper [Chinese CLIP: Contrastive Vision-Language Pretraining in Chinese](https://arxiv.org/abs/2211.01335) by An Yang, Junshu Pan, Junyang Lin, Rui Men, Yichang Zhang, Jingren Zhou, Chang Zhou.
+1. **[CLAP](https://huggingface.co/docs/transformers/model_doc/clap)** (from LAION-AI) released with the paper [Large-scale Contrastive Language-Audio Pretraining with Feature Fusion and Keyword-to-Caption Augmentation](https://arxiv.org/abs/2211.06687) by Yusong Wu, Ke Chen, Tianyu Zhang, Yuchen Hui, Taylor Berg-Kirkpatrick, Shlomo Dubnov.
+1. **[CLIP](https://huggingface.co/docs/transformers/model_doc/clip)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
+1. **[CLIPSeg](https://huggingface.co/docs/transformers/model_doc/clipseg)** (from University of Göttingen) released with the paper [Image Segmentation Using Text and Image Prompts](https://arxiv.org/abs/2112.10003) by Timo Lüddecke and Alexander Ecker.
+1. **[CodeGen](https://huggingface.co/docs/transformers/model_doc/codegen)** (from Salesforce) released with the paper [A Conversational Paradigm for Program Synthesis](https://arxiv.org/abs/2203.13474) by Erik Nijkamp, Bo Pang, Hiroaki Hayashi, Lifu Tu, Huan Wang, Yingbo Zhou, Silvio Savarese, Caiming Xiong.
+1. **[CodeLlama](https://huggingface.co/docs/transformers/model_doc/llama_code)** (from MetaAI) released with the paper [Code Llama: Open Foundation Models for Code](https://ai.meta.com/research/publications/code-llama-open-foundation-models-for-code/) by Baptiste Rozière, Jonas Gehring, Fabian Gloeckle, Sten Sootla, Itai Gat, Xiaoqing Ellen Tan, Yossi Adi, Jingyu Liu, Tal Remez, Jérémy Rapin, Artyom Kozhevnikov, Ivan Evtimov, Joanna Bitton, Manish Bhatt, Cristian Canton Ferrer, Aaron Grattafiori, Wenhan Xiong, Alexandre Défossez, Jade Copet, Faisal Azhar, Hugo Touvron, Louis Martin, Nicolas Usunier, Thomas Scialom, Gabriel Synnaeve.
+1. **[Conditional DETR](https://huggingface.co/docs/transformers/model_doc/conditional_detr)** (from Microsoft Research Asia) released with the paper [Conditional DETR for Fast Training Convergence](https://arxiv.org/abs/2108.06152) by Depu Meng, Xiaokang Chen, Zejia Fan, Gang Zeng, Houqiang Li, Yuhui Yuan, Lei Sun, Jingdong Wang.
+1. **[ConvBERT](https://huggingface.co/docs/transformers/model_doc/convbert)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
+1. **[ConvNeXT](https://huggingface.co/docs/transformers/model_doc/convnext)** (from Facebook AI) released with the paper [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) by Zhuang Liu, Hanzi Mao, Chao-Yuan Wu, Christoph Feichtenhofer, Trevor Darrell, Saining Xie.
+1. **[ConvNeXTV2](https://huggingface.co/docs/transformers/model_doc/convnextv2)** (from Facebook AI) released with the paper [ConvNeXt V2: Co-designing and Scaling ConvNets with Masked Autoencoders](https://arxiv.org/abs/2301.00808) by Sanghyun Woo, Shoubhik Debnath, Ronghang Hu, Xinlei Chen, Zhuang Liu, In So Kweon, Saining Xie.
+1. **[CPM](https://huggingface.co/docs/transformers/model_doc/cpm)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
+1. **[CPM-Ant](https://huggingface.co/docs/transformers/model_doc/cpmant)** (from OpenBMB) released by the [OpenBMB](https://www.openbmb.org/).
+1. **[CTRL](https://huggingface.co/docs/transformers/model_doc/ctrl)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
+1. **[CvT](https://huggingface.co/docs/transformers/model_doc/cvt)** (from Microsoft) released with the paper [CvT: Introducing Convolutions to Vision Transformers](https://arxiv.org/abs/2103.15808) by Haiping Wu, Bin Xiao, Noel Codella, Mengchen Liu, Xiyang Dai, Lu Yuan, Lei Zhang.
+1. **[Data2Vec](https://huggingface.co/docs/transformers/model_doc/data2vec)** (from Facebook) released with the paper [Data2Vec:  A General Framework for Self-supervised Learning in Speech, Vision and Language](https://arxiv.org/abs/2202.03555) by Alexei Baevski, Wei-Ning Hsu, Qiantong Xu, Arun Babu, Jiatao Gu, Michael Auli.
+1. **[DeBERTa](https://huggingface.co/docs/transformers/model_doc/deberta)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
+1. **[DeBERTa-v2](https://huggingface.co/docs/transformers/model_doc/deberta-v2)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
+1. **[Decision Transformer](https://huggingface.co/docs/transformers/model_doc/decision_transformer)** (from Berkeley/Facebook/Google) released with the paper [Decision Transformer: Reinforcement Learning via Sequence Modeling](https://arxiv.org/abs/2106.01345) by Lili Chen, Kevin Lu, Aravind Rajeswaran, Kimin Lee, Aditya Grover, Michael Laskin, Pieter Abbeel, Aravind Srinivas, Igor Mordatch.
+1. **[Deformable DETR](https://huggingface.co/docs/transformers/model_doc/deformable_detr)** (from SenseTime Research) released with the paper [Deformable DETR: Deformable Transformers for End-to-End Object Detection](https://arxiv.org/abs/2010.04159) by Xizhou Zhu, Weijie Su, Lewei Lu, Bin Li, Xiaogang Wang, Jifeng Dai.
+1. **[DeiT](https://huggingface.co/docs/transformers/model_doc/deit)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
+1. **[DePlot](https://huggingface.co/docs/transformers/model_doc/deplot)** (from Google AI) released with the paper [DePlot: One-shot visual language reasoning by plot-to-table translation](https://arxiv.org/abs/2212.10505) by Fangyu Liu, Julian Martin Eisenschlos, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Wenhu Chen, Nigel Collier, Yasemin Altun.
+1. **[DETA](https://huggingface.co/docs/transformers/model_doc/deta)** (from The University of Texas at Austin) released with the paper [NMS Strikes Back](https://arxiv.org/abs/2212.06137) by Jeffrey Ouyang-Zhang, Jang Hyun Cho, Xingyi Zhou, Philipp Krähenbühl.
+1. **[DETR](https://huggingface.co/docs/transformers/model_doc/detr)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
+1. **[DialoGPT](https://huggingface.co/docs/transformers/model_doc/dialogpt)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
+1. **[DiNAT](https://huggingface.co/docs/transformers/model_doc/dinat)** (from SHI Labs) released with the paper [Dilated Neighborhood Attention Transformer](https://arxiv.org/abs/2209.15001) by Ali Hassani and Humphrey Shi.
+1. **[DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2)** (from Meta AI) released with the paper [DINOv2: Learning Robust Visual Features without Supervision](https://arxiv.org/abs/2304.07193) by Maxime Oquab, Timothée Darcet, Théo Moutakanni, Huy Vo, Marc Szafraniec, Vasil Khalidov, Pierre Fernandez, Daniel Haziza, Francisco Massa, Alaaeldin El-Nouby, Mahmoud Assran, Nicolas Ballas, Wojciech Galuba, Russell Howes, Po-Yao Huang, Shang-Wen Li, Ishan Misra, Michael Rabbat, Vasu Sharma, Gabriel Synnaeve, Hu Xu, Hervé Jegou, Julien Mairal, Patrick Labatut, Armand Joulin, Piotr Bojanowski.
+1. **[DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/main/examples/research_projects/distillation) and a German version of DistilBERT.
+1. **[DiT](https://huggingface.co/docs/transformers/model_doc/dit)** (from Microsoft Research) released with the paper [DiT: Self-supervised Pre-training for Document Image Transformer](https://arxiv.org/abs/2203.02378) by Junlong Li, Yiheng Xu, Tengchao Lv, Lei Cui, Cha Zhang, Furu Wei.
+1. **[Donut](https://huggingface.co/docs/transformers/model_doc/donut)** (from NAVER), released together with the paper [OCR-free Document Understanding Transformer](https://arxiv.org/abs/2111.15664) by Geewook Kim, Teakgyu Hong, Moonbin Yim, Jeongyeon Nam, Jinyoung Park, Jinyeong Yim, Wonseok Hwang, Sangdoo Yun, Dongyoon Han, Seunghyun Park.
+1. **[DPR](https://huggingface.co/docs/transformers/model_doc/dpr)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
+1. **[DPT](https://huggingface.co/docs/transformers/master/model_doc/dpt)** (from Intel Labs) released with the paper [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) by René Ranftl, Alexey Bochkovskiy, Vladlen Koltun.
+1. **[EfficientFormer](https://huggingface.co/docs/transformers/model_doc/efficientformer)** (from Snap Research) released with the paper [EfficientFormer: Vision Transformers at MobileNetSpeed](https://arxiv.org/abs/2206.01191) by Yanyu Li, Geng Yuan, Yang Wen, Ju Hu, Georgios Evangelidis, Sergey Tulyakov, Yanzhi Wang, Jian Ren.
+1. **[EfficientNet](https://huggingface.co/docs/transformers/model_doc/efficientnet)** (from Google Brain) released with the paper [EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks](https://arxiv.org/abs/1905.11946) by Mingxing Tan, Quoc V. Le.
+1. **[ELECTRA](https://huggingface.co/docs/transformers/model_doc/electra)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
+1. **[EnCodec](https://huggingface.co/docs/transformers/model_doc/encodec)** (from Meta AI) released with the paper [High Fidelity Neural Audio Compression](https://arxiv.org/abs/2210.13438) by Alexandre Défossez, Jade Copet, Gabriel Synnaeve, Yossi Adi.
+1. **[EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder-decoder)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
+1. **[ERNIE](https://huggingface.co/docs/transformers/model_doc/ernie)** (from Baidu) released with the paper [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/abs/1904.09223) by Yu Sun, Shuohuan Wang, Yukun Li, Shikun Feng, Xuyi Chen, Han Zhang, Xin Tian, Danxiang Zhu, Hao Tian, Hua Wu.
+1. **[ErnieM](https://huggingface.co/docs/transformers/model_doc/ernie_m)** (from Baidu) released with the paper [ERNIE-M: Enhanced Multilingual Representation by Aligning Cross-lingual Semantics with Monolingual Corpora](https://arxiv.org/abs/2012.15674) by Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang.
+1. **[ESM](https://huggingface.co/docs/transformers/model_doc/esm)** (from Meta AI) are transformer protein language models.  **ESM-1b** was released with the paper [Biological structure and function emerge from scaling unsupervised learning to 250 million protein sequences](https://www.pnas.org/content/118/15/e2016239118) by Alexander Rives, Joshua Meier, Tom Sercu, Siddharth Goyal, Zeming Lin, Jason Liu, Demi Guo, Myle Ott, C. Lawrence Zitnick, Jerry Ma, and Rob Fergus. **ESM-1v** was released with the paper [Language models enable zero-shot prediction of the effects of mutations on protein function](https://doi.org/10.1101/2021.07.09.450648) by Joshua Meier, Roshan Rao, Robert Verkuil, Jason Liu, Tom Sercu and Alexander Rives. **ESM-2 and ESMFold** were released with the paper [Language models of protein sequences at the scale of evolution enable accurate structure prediction](https://doi.org/10.1101/2022.07.20.500902) by Zeming Lin, Halil Akin, Roshan Rao, Brian Hie, Zhongkai Zhu, Wenting Lu, Allan dos Santos Costa, Maryam Fazel-Zarandi, Tom Sercu, Sal Candido, Alexander Rives.
+1. **[Falcon](https://huggingface.co/docs/transformers/model_doc/falcon)** (from Technology Innovation Institute) by Almazrouei, Ebtesam and Alobeidli, Hamza and Alshamsi, Abdulaziz and Cappelli, Alessandro and Cojocaru, Ruxandra and Debbah, Merouane and Goffinet, Etienne and Heslow, Daniel and Launay, Julien and Malartic, Quentin and Noune, Badreddine and Pannier, Baptiste and Penedo, Guilherme.
+1. **[FLAN-T5](https://huggingface.co/docs/transformers/model_doc/flan-t5)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-t5-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
+1. **[FLAN-UL2](https://huggingface.co/docs/transformers/model_doc/flan-ul2)** (from Google AI) released in the repository [google-research/t5x](https://github.com/google-research/t5x/blob/main/docs/models.md#flan-ul2-checkpoints) by Hyung Won Chung, Le Hou, Shayne Longpre, Barret Zoph, Yi Tay, William Fedus, Eric Li, Xuezhi Wang, Mostafa Dehghani, Siddhartha Brahma, Albert Webson, Shixiang Shane Gu, Zhuyun Dai, Mirac Suzgun, Xinyun Chen, Aakanksha Chowdhery, Sharan Narang, Gaurav Mishra, Adams Yu, Vincent Zhao, Yanping Huang, Andrew Dai, Hongkun Yu, Slav Petrov, Ed H. Chi, Jeff Dean, Jacob Devlin, Adam Roberts, Denny Zhou, Quoc V. Le, and Jason Wei
+1. **[FlauBERT](https://huggingface.co/docs/transformers/model_doc/flaubert)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
+1. **[FLAVA](https://huggingface.co/docs/transformers/model_doc/flava)** (from Facebook AI) released with the paper [FLAVA: A Foundational Language And Vision Alignment Model](https://arxiv.org/abs/2112.04482) by Amanpreet Singh, Ronghang Hu, Vedanuj Goswami, Guillaume Couairon, Wojciech Galuba, Marcus Rohrbach, and Douwe Kiela.
+1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
+1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
+1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
+1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar. Released with the paper [blog post](https://www.adept.ai/blog/fuyu-8b)
+1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
+1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
+1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://openai.com/research/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+1. **[GPT Neo](https://huggingface.co/docs/transformers/model_doc/gpt_neo)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
+1. **[GPT NeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox)** (from EleutherAI) released with the paper [GPT-NeoX-20B: An Open-Source Autoregressive Language Model](https://arxiv.org/abs/2204.06745) by Sid Black, Stella Biderman, Eric Hallahan, Quentin Anthony, Leo Gao, Laurence Golding, Horace He, Connor Leahy, Kyle McDonell, Jason Phang, Michael Pieler, USVSN Sai Prashanth, Shivanshu Purohit, Laria Reynolds, Jonathan Tow, Ben Wang, Samuel Weinbach
+1. **[GPT NeoX Japanese](https://huggingface.co/docs/transformers/model_doc/gpt_neox_japanese)** (from ABEJA) released by Shinya Otani, Takayoshi Makabe, Anuj Arora, and Kyo Hattori.
+1. **[GPT-2](https://huggingface.co/docs/transformers/model_doc/gpt2)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://openai.com/research/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
+1. **[GPT-J](https://huggingface.co/docs/transformers/model_doc/gptj)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
+1. **[GPT-Sw3](https://huggingface.co/docs/transformers/model_doc/gpt-sw3)** (from AI-Sweden) released with the paper [Lessons Learned from GPT-SW3: Building the First Large-Scale Generative Language Model for Swedish](http://www.lrec-conf.org/proceedings/lrec2022/pdf/2022.lrec-1.376.pdf) by Ariel Ekgren, Amaru Cuba Gyllensten, Evangelia Gogoulou, Alice Heiman, Severine Verlinden, Joey Öhman, Fredrik Carlsson, Magnus Sahlgren.
+1. **[GPTBigCode](https://huggingface.co/docs/transformers/model_doc/gpt_bigcode)** (from BigCode) released with the paper [SantaCoder: don't reach for the stars!](https://arxiv.org/abs/2301.03988) by Loubna Ben Allal, Raymond Li, Denis Kocetkov, Chenghao Mou, Christopher Akiki, Carlos Munoz Ferrandis, Niklas Muennighoff, Mayank Mishra, Alex Gu, Manan Dey, Logesh Kumar Umapathi, Carolyn Jane Anderson, Yangtian Zi, Joel Lamy Poirier, Hailey Schoelkopf, Sergey Troshin, Dmitry Abulkhanov, Manuel Romero, Michael Lappert, Francesco De Toni, Bernardo García del Río, Qian Liu, Shamik Bose, Urvashi Bhattacharyya, Terry Yue Zhuo, Ian Yu, Paulo Villegas, Marco Zocca, Sourab Mangrulkar, David Lansky, Huu Nguyen, Danish Contractor, Luis Villa, Jia Li, Dzmitry Bahdanau, Yacine Jernite, Sean Hughes, Daniel Fried, Arjun Guha, Harm de Vries, Leandro von Werra.
+1. **[GPTSAN-japanese](https://huggingface.co/docs/transformers/model_doc/gptsan-japanese)** released in the repository [tanreinama/GPTSAN](https://github.com/tanreinama/GPTSAN/blob/main/report/model.md) by Toshiyuki Sakamoto(tanreinama).
+1. **[Graphormer](https://huggingface.co/docs/transformers/model_doc/graphormer)** (from Microsoft) released with the paper [Do Transformers Really Perform Bad for Graph Representation?](https://arxiv.org/abs/2106.05234) by Chengxuan Ying, Tianle Cai, Shengjie Luo, Shuxin Zheng, Guolin Ke, Di He, Yanming Shen, Tie-Yan Liu.
+1. **[GroupViT](https://huggingface.co/docs/transformers/model_doc/groupvit)** (from UCSD, NVIDIA) released with the paper [GroupViT: Semantic Segmentation Emerges from Text Supervision](https://arxiv.org/abs/2202.11094) by Jiarui Xu, Shalini De Mello, Sifei Liu, Wonmin Byeon, Thomas Breuel, Jan Kautz, Xiaolong Wang.
+1. **[HerBERT](https://huggingface.co/docs/transformers/model_doc/herbert)** (from Allegro.pl, AGH University of Science and Technology) released with the paper [KLEJ: Comprehensive Benchmark for Polish Language Understanding](https://www.aclweb.org/anthology/2020.acl-main.111.pdf) by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, Ireneusz Gawlik.
+1. **[Hubert](https://huggingface.co/docs/transformers/model_doc/hubert)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
+1. **[I-BERT](https://huggingface.co/docs/transformers/model_doc/ibert)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
+1. **[IDEFICS](https://huggingface.co/docs/transformers/model_doc/idefics)** (from HuggingFace) released with the paper [OBELICS: An Open Web-Scale Filtered Dataset of Interleaved Image-Text Documents](https://huggingface.co/papers/2306.16527) by Hugo Laurençon, Lucile Saulnier, Léo Tronchon, Stas Bekman, Amanpreet Singh, Anton Lozhkov, Thomas Wang, Siddharth Karamcheti, Alexander M. Rush, Douwe Kiela, Matthieu Cord, Victor Sanh.
+1. **[ImageGPT](https://huggingface.co/docs/transformers/model_doc/imagegpt)** (from OpenAI) released with the paper [Generative Pretraining from Pixels](https://openai.com/blog/image-gpt/) by Mark Chen, Alec Radford, Rewon Child, Jeffrey Wu, Heewoo Jun, David Luan, Ilya Sutskever.
+1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
+1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
+1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
+1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
+1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
+1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
+1. **[LayoutXLM](https://huggingface.co/docs/transformers/model_doc/layoutxlm)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
+1. **[LED](https://huggingface.co/docs/transformers/model_doc/led)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+1. **[LeViT](https://huggingface.co/docs/transformers/model_doc/levit)** (from Meta AI) released with the paper [LeViT: A Vision Transformer in ConvNet's Clothing for Faster Inference](https://arxiv.org/abs/2104.01136) by Ben Graham, Alaaeldin El-Nouby, Hugo Touvron, Pierre Stock, Armand Joulin, Hervé Jégou, Matthijs Douze.
+1. **[LiLT](https://huggingface.co/docs/transformers/model_doc/lilt)** (from South China University of Technology) released with the paper [LiLT: A Simple yet Effective Language-Independent Layout Transformer for Structured Document Understanding](https://arxiv.org/abs/2202.13669) by Jiapeng Wang, Lianwen Jin, Kai Ding.
+1. **[LLaMA](https://huggingface.co/docs/transformers/model_doc/llama)** (from The FAIR team of Meta AI) released with the paper [LLaMA: Open and Efficient Foundation Language Models](https://arxiv.org/abs/2302.13971) by Hugo Touvron, Thibaut Lavril, Gautier Izacard, Xavier Martinet, Marie-Anne Lachaux, Timothée Lacroix, Baptiste Rozière, Naman Goyal, Eric Hambro, Faisal Azhar, Aurelien Rodriguez, Armand Joulin, Edouard Grave, Guillaume Lample.
+1. **[Llama2](https://huggingface.co/docs/transformers/model_doc/llama2)** (from The FAIR team of Meta AI) released with the paper [Llama2: Open Foundation and Fine-Tuned Chat Models](https://ai.meta.com/research/publications/llama-2-open-foundation-and-fine-tuned-chat-models/) by Hugo Touvron, Louis Martin, Kevin Stone, Peter Albert, Amjad Almahairi, Yasmine Babaei, Nikolay Bashlykov, Soumya Batra, Prajjwal Bhargava, Shruti Bhosale, Dan Bikel, Lukas Blecher, Cristian Canton Ferrer, Moya Chen, Guillem Cucurull, David Esiobu, Jude Fernandes, Jeremy Fu, Wenyin Fu, Brian Fuller, Cynthia Gao, Vedanuj Goswami, Naman Goyal, Anthony Hartshorn, Saghar Hosseini, Rui Hou, Hakan Inan, Marcin Kardas, Viktor Kerkez Madian Khabsa, Isabel Kloumann, Artem Korenev, Punit Singh Koura, Marie-Anne Lachaux, Thibaut Lavril, Jenya Lee, Diana Liskovich, Yinghai Lu, Yuning Mao, Xavier Martinet, Todor Mihaylov, Pushka rMishra, Igor Molybog, Yixin Nie, Andrew Poulton, Jeremy Reizenstein, Rashi Rungta, Kalyan Saladi, Alan Schelten, Ruan Silva, Eric Michael Smith, Ranjan Subramanian, Xiaoqing EllenTan, Binh Tang, Ross Taylor, Adina Williams, Jian Xiang Kuan, Puxin Xu, Zheng Yan, Iliyan Zarov, Yuchen Zhang, Angela Fan, Melanie Kambadur, Sharan Narang, Aurelien Rodriguez, Robert Stojnic, Sergey Edunov, Thomas Scialom.
+1. **[Longformer](https://huggingface.co/docs/transformers/model_doc/longformer)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+1. **[LongT5](https://huggingface.co/docs/transformers/model_doc/longt5)** (from Google AI) released with the paper [LongT5: Efficient Text-To-Text Transformer for Long Sequences](https://arxiv.org/abs/2112.07916) by Mandy Guo, Joshua Ainslie, David Uthus, Santiago Ontanon, Jianmo Ni, Yun-Hsuan Sung, Yinfei Yang.
+1. **[LUKE](https://huggingface.co/docs/transformers/model_doc/luke)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
+1. **[LXMERT](https://huggingface.co/docs/transformers/model_doc/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
+1. **[M-CTC-T](https://huggingface.co/docs/transformers/model_doc/mctct)** (from Facebook) released with the paper [Pseudo-Labeling For Massively Multilingual Speech Recognition](https://arxiv.org/abs/2111.00161) by Loren Lugosch, Tatiana Likhomanenko, Gabriel Synnaeve, and Ronan Collobert.
+1. **[M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
+1. **[MarianMT](https://huggingface.co/docs/transformers/model_doc/marian)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
+1. **[MarkupLM](https://huggingface.co/docs/transformers/model_doc/markuplm)** (from Microsoft Research Asia) released with the paper [MarkupLM: Pre-training of Text and Markup Language for Visually-rich Document Understanding](https://arxiv.org/abs/2110.08518) by Junlong Li, Yiheng Xu, Lei Cui, Furu Wei.
+1. **[Mask2Former](https://huggingface.co/docs/transformers/model_doc/mask2former)** (from FAIR and UIUC) released with the paper [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) by Bowen Cheng, Ishan Misra, Alexander G. Schwing, Alexander Kirillov, Rohit Girdhar.
+1. **[MaskFormer](https://huggingface.co/docs/transformers/model_doc/maskformer)** (from Meta and UIUC) released with the paper [Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) by Bowen Cheng, Alexander G. Schwing, Alexander Kirillov.
+1. **[MatCha](https://huggingface.co/docs/transformers/model_doc/matcha)** (from Google AI) released with the paper [MatCha: Enhancing Visual Language Pretraining with Math Reasoning and Chart Derendering](https://arxiv.org/abs/2212.09662) by Fangyu Liu, Francesco Piccinno, Syrine Krichene, Chenxi Pang, Kenton Lee, Mandar Joshi, Yasemin Altun, Nigel Collier, Julian Martin Eisenschlos.
+1. **[mBART](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
+1. **[mBART-50](https://huggingface.co/docs/transformers/model_doc/mbart)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
+1. **[MEGA](https://huggingface.co/docs/transformers/model_doc/mega)** (from Meta/USC/CMU/SJTU) released with the paper [Mega: Moving Average Equipped Gated Attention](https://arxiv.org/abs/2209.10655) by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig, Jonathan May, and Luke Zettlemoyer.
+1. **[Megatron-BERT](https://huggingface.co/docs/transformers/model_doc/megatron-bert)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
+1. **[Megatron-GPT2](https://huggingface.co/docs/transformers/model_doc/megatron_gpt2)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
+1. **[MGP-STR](https://huggingface.co/docs/transformers/model_doc/mgp-str)** (from Alibaba Research) released with the paper [Multi-Granularity Prediction for Scene Text Recognition](https://arxiv.org/abs/2209.03592) by Peng Wang, Cheng Da, and Cong Yao.
+1. **[Mistral](https://huggingface.co/docs/transformers/model_doc/mistral)** (from Mistral AI) by The [Mistral AI](https://mistral.ai) team: Albert Jiang, Alexandre Sablayrolles, Arthur Mensch, Chris Bamford, Devendra Singh Chaplot, Diego de las Casas, Florian Bressand, Gianna Lengyel, Guillaume Lample, Lélio Renard Lavaud, Lucile Saulnier, Marie-Anne Lachaux, Pierre Stock, Teven Le Scao, Thibaut Lavril, Thomas Wang, Timothée Lacroix, William El Sayed.
+1. **[mLUKE](https://huggingface.co/docs/transformers/model_doc/mluke)** (from Studio Ousia) released with the paper [mLUKE: The Power of Entity Representations in Multilingual Pretrained Language Models](https://arxiv.org/abs/2110.08151) by Ryokan Ri, Ikuya Yamada, and Yoshimasa Tsuruoka.
+1. **[MMS](https://huggingface.co/docs/transformers/model_doc/mms)** (from Facebook) released with the paper [Scaling Speech Technology to 1,000+ Languages](https://arxiv.org/abs/2305.13516) by Vineel Pratap, Andros Tjandra, Bowen Shi, Paden Tomasello, Arun Babu, Sayani Kundu, Ali Elkahky, Zhaoheng Ni, Apoorv Vyas, Maryam Fazel-Zarandi, Alexei Baevski, Yossi Adi, Xiaohui Zhang, Wei-Ning Hsu, Alexis Conneau, Michael Auli.
+1. **[MobileBERT](https://huggingface.co/docs/transformers/model_doc/mobilebert)** (from CMU/Google Brain) released with the paper [MobileBERT: a Compact Task-Agnostic BERT for Resource-Limited Devices](https://arxiv.org/abs/2004.02984) by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou.
+1. **[MobileNetV1](https://huggingface.co/docs/transformers/model_doc/mobilenet_v1)** (from Google Inc.) released with the paper [MobileNets: Efficient Convolutional Neural Networks for Mobile Vision Applications](https://arxiv.org/abs/1704.04861) by Andrew G. Howard, Menglong Zhu, Bo Chen, Dmitry Kalenichenko, Weijun Wang, Tobias Weyand, Marco Andreetto, Hartwig Adam.
+1. **[MobileNetV2](https://huggingface.co/docs/transformers/model_doc/mobilenet_v2)** (from Google Inc.) released with the paper [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381) by Mark Sandler, Andrew Howard, Menglong Zhu, Andrey Zhmoginov, Liang-Chieh Chen.
+1. **[MobileViT](https://huggingface.co/docs/transformers/model_doc/mobilevit)** (from Apple) released with the paper [MobileViT: Light-weight, General-purpose, and Mobile-friendly Vision Transformer](https://arxiv.org/abs/2110.02178) by Sachin Mehta and Mohammad Rastegari.
+1. **[MobileViTV2](https://huggingface.co/docs/transformers/model_doc/mobilevitv2)** (from Apple) released with the paper [Separable Self-attention for Mobile Vision Transformers](https://arxiv.org/abs/2206.02680) by Sachin Mehta and Mohammad Rastegari.
+1. **[MPNet](https://huggingface.co/docs/transformers/model_doc/mpnet)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
+1. **[MPT](https://huggingface.co/docs/transformers/model_doc/mpt)** (from MosaiML) released with the repository [llm-foundry](https://github.com/mosaicml/llm-foundry/) by the MosaicML NLP Team.
+1. **[MRA](https://huggingface.co/docs/transformers/model_doc/mra)** (from the University of Wisconsin - Madison) released with the paper [Multi Resolution Analysis (MRA) for Approximate Self-Attention](https://arxiv.org/abs/2207.10284) by Zhanpeng Zeng, Sourav Pal, Jeffery Kline, Glenn M Fung, Vikas Singh.
+1. **[MT5](https://huggingface.co/docs/transformers/model_doc/mt5)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
+1. **[MusicGen](https://huggingface.co/docs/transformers/model_doc/musicgen)** (from Meta) released with the paper [Simple and Controllable Music Generation](https://arxiv.org/abs/2306.05284) by Jade Copet, Felix Kreuk, Itai Gat, Tal Remez, David Kant, Gabriel Synnaeve, Yossi Adi and Alexandre Défossez.
+1. **[MVP](https://huggingface.co/docs/transformers/model_doc/mvp)** (from RUC AI Box) released with the paper [MVP: Multi-task Supervised Pre-training for Natural Language Generation](https://arxiv.org/abs/2206.12131) by Tianyi Tang, Junyi Li, Wayne Xin Zhao and Ji-Rong Wen.
+1. **[NAT](https://huggingface.co/docs/transformers/model_doc/nat)** (from SHI Labs) released with the paper [Neighborhood Attention Transformer](https://arxiv.org/abs/2204.07143) by Ali Hassani, Steven Walton, Jiachen Li, Shen Li, and Humphrey Shi.
+1. **[Nezha](https://huggingface.co/docs/transformers/model_doc/nezha)** (from Huawei Noah’s Ark Lab) released with the paper [NEZHA: Neural Contextualized Representation for Chinese Language Understanding](https://arxiv.org/abs/1909.00204) by Junqiu Wei, Xiaozhe Ren, Xiaoguang Li, Wenyong Huang, Yi Liao, Yasheng Wang, Jiashu Lin, Xin Jiang, Xiao Chen and Qun Liu.
+1. **[NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
+1. **[NLLB-MOE](https://huggingface.co/docs/transformers/model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
+1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
+1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
+1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
+1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
+1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
+1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
+1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
+1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
+1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, and Peter J. Liu.
+1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
+1. **[Persimmon](https://huggingface.co/docs/transformers/model_doc/persimmon)** (from ADEPT) released in a [blog post](https://www.adept.ai/blog/persimmon-8b) by Erich Elsen, Augustus Odena, Maxwell Nye, Sağnak Taşırlar, Tri Dao, Curtis Hawthorne, Deepak Moparthi, Arushi Somani.
+1. **[PhoBERT](https://huggingface.co/docs/transformers/model_doc/phobert)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
+1. **[Pix2Struct](https://huggingface.co/docs/transformers/model_doc/pix2struct)** (from Google) released with the paper [Pix2Struct: Screenshot Parsing as Pretraining for Visual Language Understanding](https://arxiv.org/abs/2210.03347) by Kenton Lee, Mandar Joshi, Iulia Turc, Hexiang Hu, Fangyu Liu, Julian Eisenschlos, Urvashi Khandelwal, Peter Shaw, Ming-Wei Chang, Kristina Toutanova.
+1. **[PLBart](https://huggingface.co/docs/transformers/model_doc/plbart)** (from UCLA NLP) released with the paper [Unified Pre-training for Program Understanding and Generation](https://arxiv.org/abs/2103.06333) by Wasi Uddin Ahmad, Saikat Chakraborty, Baishakhi Ray, Kai-Wei Chang.
+1. **[PoolFormer](https://huggingface.co/docs/transformers/model_doc/poolformer)** (from Sea AI Labs) released with the paper [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) by Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng.
+1. **[Pop2Piano](https://huggingface.co/docs/transformers/model_doc/pop2piano)** released with the paper [Pop2Piano : Pop Audio-based Piano Cover Generation](https://arxiv.org/abs/2211.00895) by Jongho Choi and Kyogu Lee.
+1. **[ProphetNet](https://huggingface.co/docs/transformers/model_doc/prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
+1. **[PVT](https://huggingface.co/docs/transformers/model_doc/pvt)** (from Nanjing University, The University of Hong Kong etc.) released with the paper [Pyramid Vision Transformer: A Versatile Backbone for Dense Prediction without Convolutions](https://arxiv.org/pdf/2102.12122.pdf) by Wenhai Wang, Enze Xie, Xiang Li, Deng-Ping Fan, Kaitao Song, Ding Liang, Tong Lu, Ping Luo, Ling Shao.
+1. **[QDQBert](https://huggingface.co/docs/transformers/model_doc/qdqbert)** (from NVIDIA) released with the paper [Integer Quantization for Deep Learning Inference: Principles and Empirical Evaluation](https://arxiv.org/abs/2004.09602) by Hao Wu, Patrick Judd, Xiaojie Zhang, Mikhail Isaev and Paulius Micikevicius.
+1. **[RAG](https://huggingface.co/docs/transformers/model_doc/rag)** (from Facebook) released with the paper [Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks](https://arxiv.org/abs/2005.11401) by Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela.
+1. **[REALM](https://huggingface.co/docs/transformers/model_doc/realm.html)** (from Google Research) released with the paper [REALM: Retrieval-Augmented Language Model Pre-Training](https://arxiv.org/abs/2002.08909) by Kelvin Guu, Kenton Lee, Zora Tung, Panupong Pasupat and Ming-Wei Chang.
+1. **[Reformer](https://huggingface.co/docs/transformers/model_doc/reformer)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+1. **[RegNet](https://huggingface.co/docs/transformers/model_doc/regnet)** (from META Platforms) released with the paper [Designing Network Design Space](https://arxiv.org/abs/2003.13678) by Ilija Radosavovic, Raj Prateek Kosaraju, Ross Girshick, Kaiming He, Piotr Dollár.
+1. **[RemBERT](https://huggingface.co/docs/transformers/model_doc/rembert)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/abs/2010.12821) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
+1. **[ResNet](https://huggingface.co/docs/transformers/model_doc/resnet)** (from Microsoft Research) released with the paper [Deep Residual Learning for Image Recognition](https://arxiv.org/abs/1512.03385) by Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun.
+1. **[RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
+1. **[RoBERTa-PreLayerNorm](https://huggingface.co/docs/transformers/model_doc/roberta-prelayernorm)** (from Facebook) released with the paper [fairseq: A Fast, Extensible Toolkit for Sequence Modeling](https://arxiv.org/abs/1904.01038) by Myle Ott, Sergey Edunov, Alexei Baevski, Angela Fan, Sam Gross, Nathan Ng, David Grangier, Michael Auli.
+1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
+1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/abs/2104.09864) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
+1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng), released on [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
+1. **[SeamlessM4T](https://huggingface.co/docs/transformers/main/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
+1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
+1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
+1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
+1. **[SEW-D](https://huggingface.co/docs/transformers/model_doc/sew_d)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
+1. **[SpeechT5](https://huggingface.co/docs/transformers/model_doc/speecht5)** (from Microsoft Research) released with the paper [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205) by Junyi Ao, Rui Wang, Long Zhou, Chengyi Wang, Shuo Ren, Yu Wu, Shujie Liu, Tom Ko, Qing Li, Yu Zhang, Zhihua Wei, Yao Qian, Jinyu Li, Furu Wei.
+1. **[SpeechToTextTransformer](https://huggingface.co/docs/transformers/model_doc/speech_to_text)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
+1. **[SpeechToTextTransformer2](https://huggingface.co/docs/transformers/model_doc/speech_to_text_2)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
+1. **[Splinter](https://huggingface.co/docs/transformers/model_doc/splinter)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
+1. **[SqueezeBERT](https://huggingface.co/docs/transformers/model_doc/squeezebert)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
+1. **[SwiftFormer](https://huggingface.co/docs/transformers/model_doc/swiftformer)** (from MBZUAI) released with the paper [SwiftFormer: Efficient Additive Attention for Transformer-based Real-time Mobile Vision Applications](https://arxiv.org/abs/2303.15446) by Abdelrahman Shaker, Muhammad Maaz, Hanoona Rasheed, Salman Khan, Ming-Hsuan Yang, Fahad Shahbaz Khan.
+1. **[Swin Transformer](https://huggingface.co/docs/transformers/model_doc/swin)** (from Microsoft) released with the paper [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) by Ze Liu, Yutong Lin, Yue Cao, Han Hu, Yixuan Wei, Zheng Zhang, Stephen Lin, Baining Guo.
+1. **[Swin Transformer V2](https://huggingface.co/docs/transformers/model_doc/swinv2)** (from Microsoft) released with the paper [Swin Transformer V2: Scaling Up Capacity and Resolution](https://arxiv.org/abs/2111.09883) by Ze Liu, Han Hu, Yutong Lin, Zhuliang Yao, Zhenda Xie, Yixuan Wei, Jia Ning, Yue Cao, Zheng Zhang, Li Dong, Furu Wei, Baining Guo.
+1. **[Swin2SR](https://huggingface.co/docs/transformers/model_doc/swin2sr)** (from University of Würzburg) released with the paper [Swin2SR: SwinV2 Transformer for Compressed Image Super-Resolution and Restoration](https://arxiv.org/abs/2209.11345) by Marcos V. Conde, Ui-Jin Choi, Maxime Burchi, Radu Timofte.
+1. **[SwitchTransformers](https://huggingface.co/docs/transformers/model_doc/switch_transformers)** (from Google) released with the paper [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://arxiv.org/abs/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
+1. **[T5](https://huggingface.co/docs/transformers/model_doc/t5)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+1. **[T5v1.1](https://huggingface.co/docs/transformers/model_doc/t5v1.1)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+1. **[Table Transformer](https://huggingface.co/docs/transformers/model_doc/table-transformer)** (from Microsoft Research) released with the paper [PubTables-1M: Towards Comprehensive Table Extraction From Unstructured Documents](https://arxiv.org/abs/2110.00061) by Brandon Smock, Rohith Pesala, Robin Abraham.
+1. **[TAPAS](https://huggingface.co/docs/transformers/model_doc/tapas)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
+1. **[TAPEX](https://huggingface.co/docs/transformers/model_doc/tapex)** (from Microsoft Research) released with the paper [TAPEX: Table Pre-training via Learning a Neural SQL Executor](https://arxiv.org/abs/2107.07653) by Qian Liu, Bei Chen, Jiaqi Guo, Morteza Ziyadi, Zeqi Lin, Weizhu Chen, Jian-Guang Lou.
+1. **[Time Series Transformer](https://huggingface.co/docs/transformers/model_doc/time_series_transformer)** (from HuggingFace).
+1. **[TimeSformer](https://huggingface.co/docs/transformers/model_doc/timesformer)** (from Facebook) released with the paper [Is Space-Time Attention All You Need for Video Understanding?](https://arxiv.org/abs/2102.05095) by Gedas Bertasius, Heng Wang, Lorenzo Torresani.
+1. **[Trajectory Transformer](https://huggingface.co/docs/transformers/model_doc/trajectory_transformers)** (from the University of California at Berkeley) released with the paper [Offline Reinforcement Learning as One Big Sequence Modeling Problem](https://arxiv.org/abs/2106.02039) by Michael Janner, Qiyang Li, Sergey Levine
+1. **[Transformer-XL](https://huggingface.co/docs/transformers/model_doc/transfo-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+1. **[TrOCR](https://huggingface.co/docs/transformers/model_doc/trocr)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
+1. **[TVLT](https://huggingface.co/docs/transformers/model_doc/tvlt)** (from UNC Chapel Hill) released with the paper [TVLT: Textless Vision-Language Transformer](https://arxiv.org/abs/2209.14156) by Zineng Tang, Jaemin Cho, Yixin Nie, Mohit Bansal.
+1. **[UL2](https://huggingface.co/docs/transformers/model_doc/ul2)** (from Google Research) released with the paper [Unifying Language Learning Paradigms](https://arxiv.org/abs/2205.05131v1) by Yi Tay, Mostafa Dehghani, Vinh Q. Tran, Xavier Garcia, Dara Bahri, Tal Schuster, Huaixiu Steven Zheng, Neil Houlsby, Donald Metzler
+1. **[UMT5](https://huggingface.co/docs/transformers/model_doc/umt5)** (from Google Research) released with the paper [UniMax: Fairer and More Effective Language Sampling for Large-Scale Multilingual Pretraining](https://openreview.net/forum?id=kXwdL1cWOAi) by Hyung Won Chung, Xavier Garcia, Adam Roberts, Yi Tay, Orhan Firat, Sharan Narang, Noah Constant.
+1. **[UniSpeech](https://huggingface.co/docs/transformers/model_doc/unispeech)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
+1. **[UniSpeechSat](https://huggingface.co/docs/transformers/model_doc/unispeech-sat)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
+1. **[UPerNet](https://huggingface.co/docs/transformers/model_doc/upernet)** (from Peking University) released with the paper [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/abs/1807.10221) by Tete Xiao, Yingcheng Liu, Bolei Zhou, Yuning Jiang, Jian Sun.
+1. **[VAN](https://huggingface.co/docs/transformers/model_doc/van)** (from Tsinghua University and Nankai University) released with the paper [Visual Attention Network](https://arxiv.org/abs/2202.09741) by Meng-Hao Guo, Cheng-Ze Lu, Zheng-Ning Liu, Ming-Ming Cheng, Shi-Min Hu.
+1. **[VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae)** (from Multimedia Computing Group, Nanjing University) released with the paper [VideoMAE: Masked Autoencoders are Data-Efficient Learners for Self-Supervised Video Pre-Training](https://arxiv.org/abs/2203.12602) by Zhan Tong, Yibing Song, Jue Wang, Limin Wang.
+1. **[ViLT](https://huggingface.co/docs/transformers/model_doc/vilt)** (from NAVER AI Lab/Kakao Enterprise/Kakao Brain) released with the paper [ViLT: Vision-and-Language Transformer Without Convolution or Region Supervision](https://arxiv.org/abs/2102.03334) by Wonjae Kim, Bokyung Son, Ildoo Kim.
+1. **[Vision Transformer (ViT)](https://huggingface.co/docs/transformers/model_doc/vit)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
+1. **[VisualBERT](https://huggingface.co/docs/transformers/model_doc/visual_bert)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
+1. **[ViT Hybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
+1. **[VitDet](https://huggingface.co/docs/transformers/model_doc/vitdet)** (from Meta AI) released with the paper [Exploring Plain Vision Transformer Backbones for Object Detection](https://arxiv.org/abs/2203.16527) by Yanghao Li, Hanzi Mao, Ross Girshick, Kaiming He.
+1. **[ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae)** (from Meta AI) released with the paper [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) by Kaiming He, Xinlei Chen, Saining Xie, Yanghao Li, Piotr Dollár, Ross Girshick.
+1. **[ViTMatte](https://huggingface.co/docs/transformers/model_doc/vitmatte)** (from HUST-VL) released with the paper [ViTMatte: Boosting Image Matting with Pretrained Plain Vision Transformers](https://arxiv.org/abs/2305.15272) by Jingfeng Yao, Xinggang Wang, Shusheng Yang, Baoyuan Wang.
+1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (from Meta AI) released with the paper [Masked Siamese Networks for Label-Efficient Learning](https://arxiv.org/abs/2204.07141) by Mahmoud Assran, Mathilde Caron, Ishan Misra, Piotr Bojanowski, Florian Bordes, Pascal Vincent, Armand Joulin, Michael Rabbat, Nicolas Ballas.
+1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (from Kakao Enterprise) released with the paper [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) by Jaehyeon Kim, Jungil Kong, Juhee Son.
+1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.
+1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
+1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (from Facebook AI) released with the paper [FAIRSEQ S2T: Fast Speech-to-Text Modeling with FAIRSEQ](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Sravya Popuri, Dmytro Okhonko, Juan Pino.
+1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (from Facebook AI) released with the paper [Simple and Effective Zero-shot Cross-lingual Phoneme Recognition](https://arxiv.org/abs/2109.11680) by Qiantong Xu, Alexei Baevski, Michael Auli.
+1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (from Microsoft Research) released with the paper [WavLM: Large-Scale Self-Supervised Pre-Training for Full Stack Speech Processing](https://arxiv.org/abs/2110.13900) by Sanyuan Chen, Chengyi Wang, Zhengyang Chen, Yu Wu, Shujie Liu, Zhuo Chen, Jinyu Li, Naoyuki Kanda, Takuya Yoshioka, Xiong Xiao, Jian Wu, Long Zhou, Shuo Ren, Yanmin Qian, Yao Qian, Jian Wu, Michael Zeng, Furu Wei.
+1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (from OpenAI) released with the paper [Robust Speech Recognition via Large-Scale Weak Supervision](https://cdn.openai.com/papers/whisper.pdf) by Alec Radford, Jong Wook Kim, Tao Xu, Greg Brockman, Christine McLeavey, Ilya Sutskever.
+1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (from Microsoft Research) released with the paper [Expanding Language-Image Pretrained Models for General Video Recognition](https://arxiv.org/abs/2208.02816) by Bolin Ni, Houwen Peng, Minghao Chen, Songyang Zhang, Gaofeng Meng, Jianlong Fu, Shiming Xiang, Haibin Ling.
+1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (from Meta AI) released with the paper [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) by Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe.
+1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
+1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
+1. **[XLM-ProphetNet](https://huggingface.co/docs/transformers/model_doc/xlm-prophetnet)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
+1. **[XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
+1. **[XLM-RoBERTa-XL](https://huggingface.co/docs/transformers/model_doc/xlm-roberta-xl)** (from Facebook AI), released together with the paper [Larger-Scale Transformers for Multilingual Masked Language Modeling](https://arxiv.org/abs/2105.00572) by Naman Goyal, Jingfei Du, Myle Ott, Giri Anantharaman, Alexis Conneau.
+1. **[XLM-V](https://huggingface.co/docs/transformers/model_doc/xlm-v)** (from Meta AI) released with the paper [XLM-V: Overcoming the Vocabulary Bottleneck in Multilingual Masked Language Models](https://arxiv.org/abs/2301.10472) by Davis Liang, Hila Gonen, Yuning Mao, Rui Hou, Naman Goyal, Marjan Ghazvininejad, Luke Zettlemoyer, Madian Khabsa.
+1. **[XLNet](https://huggingface.co/docs/transformers/model_doc/xlnet)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+1. **[XLS-R](https://huggingface.co/docs/transformers/model_doc/xls_r)** (from Facebook AI) released with the paper [XLS-R: Self-supervised Cross-lingual Speech Representation Learning at Scale](https://arxiv.org/abs/2111.09296) by Arun Babu, Changhan Wang, Andros Tjandra, Kushal Lakhotia, Qiantong Xu, Naman Goyal, Kritika Singh, Patrick von Platen, Yatharth Saraf, Juan Pino, Alexei Baevski, Alexis Conneau, Michael Auli.
+1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
+1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (from Huazhong University of Science & Technology) released with the paper [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://arxiv.org/abs/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
+1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (from the University of Wisconsin - Madison) released with the paper [You Only Sample (Almost) Once: Linear Cost Self-Attention Via Bernoulli Sampling](https://arxiv.org/abs/2111.09714) by Zhanpeng Zeng, Yunyang Xiong, Sathya N. Ravi, Shailesh Acharya, Glenn Fung, Vikas Singh.
+1. కొత్త మోడల్‌ను అందించాలనుకుంటున్నారా? కొత్త మోడల్‌ను జోడించే ప్రక్రియలో మీకు మార్గనిర్దేశం చేసేందుకు మేము **వివరణాత్మక గైడ్ మరియు టెంప్లేట్‌లను** జోడించాము. మీరు వాటిని రిపోజిటరీ యొక్క [`టెంప్లేట్లు`](./టెంప్లేట్లు) ఫోల్డర్‌లో కనుగొనవచ్చు. మీ PRని ప్రారంభించడానికి ముందు [సహకార మార్గదర్శకాలు](./CONTRIBUTING.md)ని తనిఖీ చేసి, నిర్వహణదారులను సంప్రదించండి లేదా అభిప్రాయాన్ని సేకరించడానికి సమస్యను తెరవండి.
+
+ప్రతి మోడల్ ఫ్లాక్స్, పైటార్చ్ లేదా టెన్సర్‌ఫ్లోలో అమలు చేయబడిందా లేదా 🤗 Tokenizers లైబ్రరీ ద్వారా అనుబంధించబడిన టోకెనైజర్‌ని కలిగి ఉందో లేదో తనిఖీ చేయడానికి, [ఈ పట్టిక](https://huggingface.co/docs/transformers/index#supported-frameworks).
+
+ఈ అమలులు అనేక డేటాసెట్‌లలో పరీక్షించబడ్డాయి (ఉదాహరణ స్క్రిప్ట్‌లను చూడండి) మరియు అసలైన అమలుల పనితీరుతో సరిపోలాలి. మీరు [డాక్యుమెంటేషన్](https://github.com/huggingface/transformers/tree/main/examples) యొక్క ఉదాహరణల విభాగంలో పనితీరుపై మరిన్ని వివరాలను కనుగొనవచ్చు.
+
+## ఇంకా నేర్చుకో
+
+| విభాగం | వివరణ |
+|-|-|
+| [డాక్యుమెంటేషన్](https://huggingface.co/docs/transformers/) | పూర్తి API డాక్యుమెంటేషన్ మరియు ట్యుటోరియల్స్ |
+| [టాస్క్ సారాంశం](https://huggingface.co/docs/transformers/task_summary) | 🤗 ట్రాన్స్‌ఫార్మర్‌ల ద్వారా సపోర్ట్ చేయబడిన విధులు |
+| [ప్రీప్రాసెసింగ్ ట్యుటోరియల్](https://huggingface.co/docs/transformers/preprocessing) | మోడల్‌ల కోసం డేటాను సిద్ధం చేయడానికి `Tokenizer` క్లాస్‌ని ఉపయోగించడం |
+| [ట్రైనింగ్ మరియు ఫైన్-ట్యూనింగ్](https://huggingface.co/docs/transformers/training) | PyTorch/TensorFlow ట్రైనింగ్ లూప్ మరియు `Trainer` APIలో 🤗 ట్రాన్స్‌ఫార్మర్లు అందించిన మోడల్‌లను ఉపయోగించడం |
+| [త్వరిత పర్యటన: ఫైన్-ట్యూనింగ్/యూసేజ్ స్క్రిప్ట్‌లు](https://github.com/huggingface/transformers/tree/main/examples) | విస్తృత శ్రేణి టాస్క్‌లపై ఫైన్-ట్యూనింగ్ మోడల్స్ కోసం ఉదాహరణ స్క్రిప్ట్‌లు |
+| [మోడల్ భాగస్వామ్యం మరియు అప్‌లోడ్ చేయడం](https://huggingface.co/docs/transformers/model_sharing) | కమ్యూనిటీతో మీ ఫైన్-ట్యూన్డ్ మోడల్‌లను అప్‌లోడ్ చేయండి మరియు భాగస్వామ్యం చేయండి |
+
+## అనులేఖనం
+
+🤗 ట్రాన్స్‌ఫార్మర్స్ లైబ్రరీ కోసం మీరు ఉదహరించగల [పేపర్](https://www.aclweb.org/anthology/2020.emnlp-demos.6/) ఇప్పుడు మా వద్ద ఉంది:
+```bibtex
+@inproceedings{wolf-etal-2020-transformers,
+    title = "Transformers: State-of-the-Art Natural Language Processing",
+    author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
+    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
+    month = oct,
+    year = "2020",
+    address = "Online",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
+    pages = "38--45"
+}
+```
--- a/README_zh-hans.md
+++ b/README_zh-hans.md
@ -72,6 +72,7 @@ checkpoint: 检查点
        <a href="https://github.com/huggingface/transformers/blob/main/README_es.md">Español</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/README_ja.md">日本語</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a>
+        <a href="https://github.com/huggingface/transformers//blob/main/README_te.md">తెలుగు</a> |
    </p>
 </h4>

@ -333,6 +334,7 @@ conda install -c huggingface transformers
 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (来自 Salesforce) 伴随论文 [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) 由 Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi 发布。
 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
+1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 由 Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou 发布。
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) 由 Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou 发布。
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) 由 Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei 发布。
@ -380,10 +382,10 @@ conda install -c huggingface transformers
 1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (来自 Meta AI) 伴随论文 [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) 由 Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic 发布。
 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (来自 the University of Wisconsin - Madison) 伴随论文 [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) 由 Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh 发布。
 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (来自 SHI Labs)  伴随论文 [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) 由 Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi 发布。
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (来自 [s-JoL](https://huggingface.co/s-JoL)) 由 [Open-Llama](https://github.com/s-JoL/Open-Llama) 发布.
+1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (来自 [s-JoL](https://huggingface.co/s-JoL)) 由 GitHub (现已删除).
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (来自 Meta AI) 伴随论文 [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) 由 Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al 发布。
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (来自 Google AI) 伴随论文 [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) 由 Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby 发布。
-1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (来自 Google AI) 伴随论文 [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) 由 Matthias Minderer, Alexey Gritsenko, Neil Houlsby 发布。
+1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (来自 Google AI) 伴随论文 [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) 由 Matthias Minderer, Alexey Gritsenko, Neil Houlsby 发布。
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。
 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (来自 Google) 伴随论文 [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) 由 Jason Phang, Yao Zhao, Peter J. Liu 发布。
 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (来自 Deepmind) 伴随论文 [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) 由 Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira 发布。
@ -407,6 +409,7 @@ conda install -c huggingface transformers
 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (来自 WeChatAI), 伴随论文 [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) 由 HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou 发布。
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (来自 ZhuiyiTechnology), 伴随论文 [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 由 Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 发布。
 1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (来自 Bo Peng) 伴随论文 [this repo](https://github.com/BlinkDL/RWKV-LM) 由 Bo Peng 发布。
+1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (来自 NVIDIA) 伴随论文 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 由 Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 发布。
 1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (来自 Meta AI) 伴随论文 [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) 由 Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick 发布。
 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (来自 ASAPP) 伴随论文 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 由 Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 发布。
--- a/README_zh-hant.md
+++ b/README_zh-hant.md
@ -84,6 +84,7 @@ user: 使用者
        <a href="https://github.com/huggingface/transformers/blob/main/README_es.md">Español</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/README_ja.md">日本語</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/README_hd.md">हिन्दी</a>
+        <a href="https://github.com/huggingface/transformers//blob/main/README_te.md">తెలుగు</a> |
    </p>
 </h4>

@ -345,6 +346,7 @@ conda install -c huggingface transformers
 1. **[Informer](https://huggingface.co/docs/transformers/model_doc/informer)** (from Beihang University, UC Berkeley, Rutgers University, SEDD Company) released with the paper [Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting](https://arxiv.org/abs/2012.07436) by Haoyi Zhou, Shanghang Zhang, Jieqi Peng, Shuai Zhang, Jianxin Li, Hui Xiong, and Wancai Zhang.
 1. **[InstructBLIP](https://huggingface.co/docs/transformers/model_doc/instructblip)** (from Salesforce) released with the paper [InstructBLIP: Towards General-purpose Vision-Language Models with Instruction Tuning](https://arxiv.org/abs/2305.06500) by Wenliang Dai, Junnan Li, Dongxu Li, Anthony Meng Huat Tiong, Junqi Zhao, Weisheng Wang, Boyang Li, Pascale Fung, Steven Hoi.
 1. **[Jukebox](https://huggingface.co/docs/transformers/model_doc/jukebox)** (from OpenAI) released with the paper [Jukebox: A Generative Model for Music](https://arxiv.org/pdf/2005.00341.pdf) by Prafulla Dhariwal, Heewoo Jun, Christine Payne, Jong Wook Kim, Alec Radford, Ilya Sutskever.
+1. **[KOSMOS-2](https://huggingface.co/docs/transformers/model_doc/kosmos-2)** (from Microsoft Research Asia) released with the paper [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
 1. **[LayoutLM](https://huggingface.co/docs/transformers/model_doc/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
 1. **[LayoutLMv2](https://huggingface.co/docs/transformers/model_doc/layoutlmv2)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
 1. **[LayoutLMv3](https://huggingface.co/docs/transformers/model_doc/layoutlmv3)** (from Microsoft Research Asia) released with the paper [LayoutLMv3: Pre-training for Document AI with Unified Text and Image Masking](https://arxiv.org/abs/2204.08387) by Yupan Huang, Tengchao Lv, Lei Cui, Yutong Lu, Furu Wei.
@ -392,10 +394,10 @@ conda install -c huggingface transformers
 1. **[Nougat](https://huggingface.co/docs/transformers/model_doc/nougat)** (from Meta AI) released with the paper [Nougat: Neural Optical Understanding for Academic Documents](https://arxiv.org/abs/2308.13418) by Lukas Blecher, Guillem Cucurull, Thomas Scialom, Robert Stojnic.
 1. **[Nyströmformer](https://huggingface.co/docs/transformers/model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
 1. **[OneFormer](https://huggingface.co/docs/transformers/model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama).
+1. **[OpenLlama](https://huggingface.co/docs/transformers/model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
 1. **[OPT](https://huggingface.co/docs/transformers/master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
 1. **[OWL-ViT](https://huggingface.co/docs/transformers/model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
-1. **[OWLv2](https://huggingface.co/docs/transformers/main/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
+1. **[OWLv2](https://huggingface.co/docs/transformers/model_doc/owlv2)** (from Google AI) released with the paper [Scaling Open-Vocabulary Object Detection](https://arxiv.org/abs/2306.09683) by Matthias Minderer, Alexey Gritsenko, Neil Houlsby.
 1. **[Pegasus](https://huggingface.co/docs/transformers/model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 1. **[PEGASUS-X](https://huggingface.co/docs/transformers/model_doc/pegasus_x)** (from Google) released with the paper [Investigating Efficiently Extending Transformers for Long Input Summarization](https://arxiv.org/abs/2208.04347) by Jason Phang, Yao Zhao, Peter J. Liu.
 1. **[Perceiver IO](https://huggingface.co/docs/transformers/model_doc/perceiver)** (from Deepmind) released with the paper [Perceiver IO: A General Architecture for Structured Inputs & Outputs](https://arxiv.org/abs/2107.14795) by Andrew Jaegle, Sebastian Borgeaud, Jean-Baptiste Alayrac, Carl Doersch, Catalin Ionescu, David Ding, Skanda Koppula, Daniel Zoran, Andrew Brock, Evan Shelhamer, Olivier Hénaff, Matthew M. Botvinick, Andrew Zisserman, Oriol Vinyals, João Carreira.
@ -419,6 +421,7 @@ conda install -c huggingface transformers
 1. **[RoCBert](https://huggingface.co/docs/transformers/model_doc/roc_bert)** (from WeChatAI) released with the paper [RoCBert: Robust Chinese Bert with Multimodal Contrastive Pretraining](https://aclanthology.org/2022.acl-long.65.pdf) by HuiSu, WeiweiShi, XiaoyuShen, XiaoZhou, TuoJi, JiaruiFang, JieZhou.
 1. **[RoFormer](https://huggingface.co/docs/transformers/model_doc/roformer)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
 1. **[RWKV](https://huggingface.co/docs/transformers/model_doc/rwkv)** (from Bo Peng) released with the paper [this repo](https://github.com/BlinkDL/RWKV-LM) by Bo Peng.
+1. **[SeamlessM4T](https://huggingface.co/docs/transformers/model_doc/seamless_m4t)** (from Meta AI) released with the paper [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team.
 1. **[SegFormer](https://huggingface.co/docs/transformers/model_doc/segformer)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
 1. **[Segment Anything](https://huggingface.co/docs/transformers/model_doc/sam)** (from Meta AI) released with the paper [Segment Anything](https://arxiv.org/pdf/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.
 1. **[SEW](https://huggingface.co/docs/transformers/model_doc/sew)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
--- a/SECURITY.md
+++ b/SECURITY.md
@ -0,0 +1,6 @@
+# Security Policy
+
+## Reporting a Vulnerability
+
+🤗 We have our bug bounty program set up with HackerOne. Please feel free to submit vulnerability reports to our private program at https://hackerone.com/hugging_face.
+Note that you'll need to be invited to our program, so send us a quick email at security@huggingface.co if you've found a vulnerability.
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@ -55,6 +55,9 @@ RUN python3 -m pip install --no-cache-dir auto-gptq --extra-index-url https://hu
 # Add einops for additional model testing
 RUN python3 -m pip install --no-cache-dir einops

+# Add autoawq for quantization testing
+RUN python3 -m pip install --no-cache-dir autoawq
+
 # For bettertransformer + gptq 
 RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum

--- a/docs/source/en/_redirects.yml
+++ b/docs/source/en/_redirects.yml
@ -0,0 +1,3 @@
+# Optimizing inference
+
+perf_infer_gpu_many: perf_infer_gpu_one
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -155,13 +155,9 @@
    title: Efficient training techniques
  - sections:
    - local: perf_infer_cpu
-      title: Inference on CPU
+      title: CPU inference
    - local: perf_infer_gpu_one
-      title: Inference on one GPU
-    - local: perf_infer_gpu_many
-      title: Inference on many GPUs
-    - local: perf_infer_special
-      title: Inference on Specialized Hardware
+      title: GPU inference
    title: Optimizing inference
  - local: big_models
    title: Instantiating a big model
@ -614,6 +610,8 @@
        title: MusicGen
      - local: model_doc/pop2piano
        title: Pop2Piano
+      - local: model_doc/seamless_m4t
+        title: Seamless-M4T
      - local: model_doc/sew
        title: SEW
      - local: model_doc/sew-d
@ -681,6 +679,8 @@
        title: IDEFICS
      - local: model_doc/instructblip
        title: InstructBLIP
+      - local: model_doc/kosmos-2
+        title: KOSMOS-2
      - local: model_doc/layoutlm
        title: LayoutLM
      - local: model_doc/layoutlmv2
--- a/docs/source/en/glossary.md
+++ b/docs/source/en/glossary.md
@ -112,6 +112,12 @@ A type of layer in a neural network where the input matrix is multiplied element

 ## D

+### DataParallel (DP)
+
+Parallelism technique for training on multiple GPUs where the same setup is replicated multiple times, with each instance 
+receiving a distinct data slice. The processing is done in parallel and all setups are synchronized at the end of each training step.
+Learn more about how DataParallel works [here](perf_train_gpu_many#dataparallel-vs-distributeddataparallel).
+
 ### decoder input IDs

 This input is specific to encoder-decoder models, and contains the input IDs that will be fed to the decoder. These
@ -340,6 +346,12 @@ A pipeline in 🤗 Transformers is an abstraction referring to a series of steps

 For more details, see [Pipelines for inference](https://huggingface.co/docs/transformers/pipeline_tutorial).

+### PipelineParallel (PP)
+
+Parallelism technique in which the model is split up vertically (layer-level) across multiple GPUs, so that only one or 
+several layers of the model are placed on a single GPU. Each GPU processes in parallel different stages of the pipeline 
+and working on a small chunk of the batch. Learn more about how PipelineParallel works [here](perf_train_gpu_many#from-naive-model-parallelism-to-pipeline-parallelism).
+
 ### pixel values

 A tensor of the numerical representations of an image that is passed to a model. The pixel values have a shape of [`batch_size`, `num_channels`, `height`, `width`], and are generated from an image processor.
@ -410,6 +422,10 @@ An example of a semi-supervised learning approach is "self-training", in which a
 Models that generate a new sequence from an input, like translation models, or summarization models (such as
 [Bart](model_doc/bart) or [T5](model_doc/t5)).

+### Sharded DDP
+
+Another name for the foundational [ZeRO](#zero-redundancy-optimizer--zero-) concept as used by various other implementations of ZeRO.
+
 ### stride

 In [convolution](#convolution) or [pooling](#pooling), the stride refers to the distance the kernel is moved over a matrix. A stride of 1 means the kernel is moved one pixel over at a time, and a stride of 2 means the kernel is moved two pixels over at a time.
@ -420,6 +436,14 @@ A form of model training that directly uses labeled data to correct and instruct

 ## T

+### Tensor Parallelism (TP)
+
+Parallelism technique for training on multiple GPUs in which each tensor is split up into multiple chunks, so instead of 
+having the whole tensor reside on a single GPU, each shard of the tensor resides on its designated GPU. Shards gets 
+processed separately and in parallel on different GPUs and the results are synced at the end of the processing step. 
+This is what is sometimes called horizontal parallelism, as the splitting happens on horizontal level.
+Learn more about Tensor Parallelism [here](perf_train_gpu_many#tensor-parallelism).
+
 ### token

 A part of a sentence, usually a word, but can also be a subword (non-common words are often split in subwords) or a
@ -489,3 +513,12 @@ Self-attention based deep learning model architecture.
 ### unsupervised learning

 A form of model training in which data provided to the model is not labeled. Unsupervised learning techniques leverage statistical information of the data distribution to find patterns useful for the task at hand.
+
+## Z
+
+### Zero Redundancy Optimizer (ZeRO)
+
+Parallelism technique which performs sharding of the tensors somewhat similar to [TensorParallel](#tensorparallel--tp-), 
+except the whole tensor gets reconstructed in time for a forward or backward computation, therefore the model doesn't need 
+to be modified. This method also supports various offloading techniques to compensate for limited GPU memory. 
+Learn more about ZeRO [here](perf_train_gpu_many#zero-data-parallelism).
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@ -97,7 +97,7 @@ Flax), PyTorch, and/or TensorFlow.
 |              [Conditional DETR](model_doc/conditional_detr)              |       ✅        |         ❌         |      ❌      |
 |                      [ConvBERT](model_doc/convbert)                      |       ✅        |         ✅         |      ❌      |
 |                      [ConvNeXT](model_doc/convnext)                      |       ✅        |         ✅         |      ❌      |
-|                    [ConvNeXTV2](model_doc/convnextv2)                    |       ✅        |         ❌         |      ❌      |
+|                    [ConvNeXTV2](model_doc/convnextv2)                    |       ✅        |         ✅         |      ❌      |
 |                           [CPM](model_doc/cpm)                           |       ✅        |         ✅         |      ✅      |
 |                       [CPM-Ant](model_doc/cpmant)                        |       ✅        |         ❌         |      ❌      |
 |                          [CTRL](model_doc/ctrl)                          |       ✅        |         ✅         |      ❌      |
@ -158,6 +158,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                      [Informer](model_doc/informer)                      |       ✅        |         ❌         |      ❌      |
 |                  [InstructBLIP](model_doc/instructblip)                  |       ✅        |         ❌         |      ❌      |
 |                       [Jukebox](model_doc/jukebox)                       |       ✅        |         ❌         |      ❌      |
+|                      [KOSMOS-2](model_doc/kosmos-2)                      |       ✅        |         ❌         |      ❌      |
 |                      [LayoutLM](model_doc/layoutlm)                      |       ✅        |         ✅         |      ❌      |
 |                    [LayoutLMv2](model_doc/layoutlmv2)                    |       ✅        |         ❌         |      ❌      |
 |                    [LayoutLMv3](model_doc/layoutlmv3)                    |       ✅        |         ✅         |      ❌      |
@ -236,6 +237,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                      [RoFormer](model_doc/roformer)                      |       ✅        |         ✅         |      ✅      |
 |                          [RWKV](model_doc/rwkv)                          |       ✅        |         ❌         |      ❌      |
 |                           [SAM](model_doc/sam)                           |       ✅        |         ✅         |      ❌      |
+|                  [SeamlessM4T](model_doc/seamless_m4t)                   |       ✅        |         ❌         |      ❌      |
 |                     [SegFormer](model_doc/segformer)                     |       ✅        |         ✅         |      ❌      |
 |                           [SEW](model_doc/sew)                           |       ✅        |         ❌         |      ❌      |
 |                         [SEW-D](model_doc/sew-d)                         |       ✅        |         ❌         |      ❌      |
--- a/docs/source/en/main_classes/output.md
+++ b/docs/source/en/main_classes/output.md
@ -44,6 +44,7 @@ an optional `attentions` attribute. Here we have the `loss` since we passed alon

 When passing `output_hidden_states=True` you may expect the `outputs.hidden_states[-1]` to match `outputs.last_hidden_states` exactly.
 However, this is not always the case. Some models apply normalization or subsequent process to the last hidden state when it's returned.
+
 </Tip>


--- a/docs/source/en/main_classes/pipelines.md
+++ b/docs/source/en/main_classes/pipelines.md
@ -481,6 +481,12 @@ Pipelines available for multimodal tasks include the following.
    - __call__
    - all

+### MaskGenerationPipeline
+
+[[autodoc]] MaskGenerationPipeline
+    - __call__
+    - all
+
 ### VisualQuestionAnsweringPipeline

 [[autodoc]] VisualQuestionAnsweringPipeline
--- a/docs/source/en/main_classes/quantization.md
+++ b/docs/source/en/main_classes/quantization.md
@ -16,6 +16,97 @@ rendered properly in your Markdown viewer.

 # Quantize 🤗 Transformers models

+## AWQ integration
+
+AWQ method has been introduced in the [*AWQ: Activation-aware Weight Quantization for LLM Compression and Acceleration* paper](https://arxiv.org/abs/2306.00978). With AWQ you can run models in 4-bit precision, while preserving its original quality (i.e. no performance degradation) with a superior throughput that other quantization methods presented below - reaching similar throughput as pure `float16` inference.
+
+We now support inference with any AWQ model, meaning anyone can load and use AWQ weights that are pushed on the Hub or saved locally. Note that using AWQ requires to have access to a NVIDIA GPU. CPU inference is not supported yet. 
+
+### Quantizing a model
+
+We advise users to look at different existing tools in the ecosystem to quantize their models with AWQ algorithm, such as:
+
+- [`llm-awq`](https://github.com/mit-han-lab/llm-awq) from MIT Han Lab
+- [`autoawq`](https://github.com/casper-hansen/AutoAWQ) from [`casper-hansen`](https://github.com/casper-hansen)
+- Intel neural compressor from Intel - through [`optimum-intel`](https://huggingface.co/docs/optimum/main/en/intel/optimization_inc)
+
+Many other tools might exist in the ecosystem, please feel free to open a PR to add them to the list.
+Currently the integration with 🤗 Transformers is only available for models that have been quantized using `autoawq` library and `llm-awq`. Most of the models quantized with `auto-awq` can be found under [`TheBloke`](https://huggingface.co/TheBloke) namespace of 🤗 Hub, and to quantize models with `llm-awq` please refer to the [`convert_to_hf.py`](https://github.com/mit-han-lab/llm-awq/blob/main/examples/convert_to_hf.py) script in the examples folder of [`llm-awq`](https://github.com/mit-han-lab/llm-awq/).
+
+### Load a quantized model
+
+You can load a quantized model from the Hub using the `from_pretrained` method. Make sure that the pushed weights are quantized, by checking that the attribute `quantization_config` is present in the model's configuration file (`configuration.json`). You can confirm that the model is quantized in the AWQ format by checking the field `quantization_config.quant_method` which should be set to `"awq"`. Note that loading the model will set other weights in `float16` by default for performance reasons. If you want to change that behavior, you can pass `torch_dtype` argument to `torch.float32` or `torch.bfloat16`. You can find in the sections below some example snippets and notebook.
+
+## Example usage
+
+First, you need to install [`autoawq`](https://github.com/casper-hansen/AutoAWQ) library
+
+```bash
+pip install autoawq
+```
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "TheBloke/zephyr-7B-alpha-AWQ"
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="cuda:0")
+```
+
+In case you first load your model on CPU, make sure to move it to your GPU device before using 
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "TheBloke/zephyr-7B-alpha-AWQ"
+model = AutoModelForCausalLM.from_pretrained(model_id).to("cuda:0")
+```
+
+### Combining AWQ and Flash Attention
+
+You can combine AWQ quantization with Flash Attention to get a model that is both quantized and faster. Simply load the model using `from_pretrained` and pass `use_flash_attention_2=True` argument.
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model = AutoModelForCausalLM.from_pretrained("TheBloke/zephyr-7B-alpha-AWQ", use_flash_attention_2=True, device_map="cuda:0")
+```
+
+### Benchmarks
+
+We performed some speed, throughput and latency benchmarks using [`optimum-benchmark`](https://github.com/huggingface/optimum-benchmark) library. 
+
+Note at that time of writing this documentation section, the available quantization methods were: `awq`, `gptq` and `bitsandbytes`.
+
+The benchmark was run on a NVIDIA-A100 instance and the model used was [`TheBloke/Mistral-7B-v0.1-AWQ`](https://huggingface.co/TheBloke/Mistral-7B-v0.1-AWQ) for the AWQ model, [`TheBloke/Mistral-7B-v0.1-GPTQ`](https://huggingface.co/TheBloke/Mistral-7B-v0.1-GPTQ) for the GPTQ model. We also benchmarked it against `bitsandbytes` quantization methods and native `float16` model. Some results are shown below:
+
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/forward_memory_plot.png">
+</div>
+
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/generate_memory_plot.png">
+</div>
+
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/generate_throughput_plot.png">
+</div>
+
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/quantization/forward_latency_plot.png">
+</div>
+
+You can find the full results together with packages versions in [this link](https://github.com/huggingface/optimum-benchmark/tree/main/examples/running-mistral).
+
+From the results it appears that AWQ quantization method is the fastest quantization method for inference, text generation and among the lowest peak memory for text generation. However, AWQ seems to have the largest forward latency per batch size. 
+
+### Google colab demo
+
+Check out how to use this integration throughout this [Google Colab demo](https://colab.research.google.com/drive/1HzZH89yAXJaZgwJDhQj9LqSBux932BvY)!
+
+### AwqConfig
+
+[[autodoc]] AwqConfig
+
 ## `AutoGPTQ` Integration

 🤗 Transformers has integrated `optimum` API to perform GPTQ quantization on language models. You can load and quantize your model in 8, 4, 3 or even 2 bits without a big drop of performance and faster inference speed! This is supported by most GPU hardwares.
@ -48,6 +139,7 @@ Note that GPTQ integration supports for now only text models and you may encount
 GPTQ is a quantization method that requires weights calibration before using the quantized models. If you want to quantize transformers model from scratch, it might take some time before producing the quantized model (~5 min on a Google colab for `facebook/opt-350m` model). 

 Hence, there are two different scenarios where you want to use GPTQ-quantized models. The first use case would be to load models that has been already quantized by other users that are available on the Hub, the second use case would be to quantize your model from scratch and save it or push it on the Hub so that other users can also use it.
+
 #### GPTQ Configuration

 In order to load and quantize a model, you need to create a [`GPTQConfig`]. You need to pass the number of `bits`, a `dataset` in order to calibrate the quantization and the `tokenizer` of the model in order prepare the dataset.
@ -59,6 +151,7 @@ gptq_config = GPTQConfig(bits=4, dataset = "c4", tokenizer=tokenizer)
 ```

 Note that you can pass your own dataset as a list of string. However, it is highly recommended to use the dataset from the GPTQ paper. 
+
 ```python
 dataset = ["auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
 quantization = GPTQConfig(bits=4, dataset = dataset, tokenizer=tokenizer)
@ -71,14 +164,17 @@ You can quantize a model by using `from_pretrained` and setting the `quantizatio
 ```python
 from transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=gptq_config)
+
 ```
 Note that you will need a GPU to quantize a model. We will put the model in the cpu and move the modules back and forth to the gpu in order to quantize them.

 If you want to maximize your gpus usage while using cpu offload, you can set `device_map = "auto"`.
+
 ```python
 from transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=gptq_config)
 ```
+
 Note that disk offload is not supported. Furthermore, if you are out of memory because of the dataset, you may have to pass `max_memory` in `from_pretained`. Checkout this [guide](https://huggingface.co/docs/accelerate/usage_guides/big_modeling#designing-a-device-map) to learn more about `device_map` and `max_memory`.

 <Tip warning={true}>
@ -95,12 +191,14 @@ tokenizer.push_to_hub("opt-125m-gptq")
 ```

 If you want to save your quantized model on your local machine, you can also do it with `save_pretrained`: 
+
 ```python
 quantized_model.save_pretrained("opt-125m-gptq")
 tokenizer.save_pretrained("opt-125m-gptq")
 ```

-Note that if you have quantized your model with a `device_map`, make sure to move the entire model to one of your gpus or the `cpu` before saving it. 
+Note that if you have quantized your model with a `device_map`, make sure to move the entire model to one of your gpus or the `cpu` before saving it.
+
 ```python
 quantized_model.to("cpu")
 quantized_model.save_pretrained("opt-125m-gptq")
@ -117,6 +215,7 @@ model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq")
 ```

 If you want to load a model faster and without allocating more memory than needed, the `device_map` argument also works with quantized model. Make sure that you have `accelerate` library installed.
+
 ```python
 from transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto")
@ -124,16 +223,25 @@ model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", de

 ### Exllama kernels for faster inference

-For 4-bit model, you can use the exllama kernels in order to a faster inference speed. It is activated by default. You can change that behavior by passing `disable_exllama` in [`GPTQConfig`]. This will overwrite the quantization config stored in the config. Note that you will only be able to overwrite the attributes related to the kernels. Furthermore, you need to have the entire model on gpus if you want to use exllama kernels. 
+For 4-bit model, you can use the exllama kernels in order to a faster inference speed. It is activated by default. You can change that behavior by passing `use_exllama` in [`GPTQConfig`]. This will overwrite the quantization config stored in the config. Note that you will only be able to overwrite the attributes related to the kernels. Furthermore, you need to have the entire model on gpus if you want to use exllama kernels. Also, you can perform CPU inference using Auto-GPTQ for Auto-GPTQ version > 0.4.2 by passing `device_map` = "cpu". For CPU inference, you have to pass `use_exllama = False` in the `GPTQConfig.`

 ```py
 import torch
-gptq_config = GPTQConfig(bits=4, disable_exllama=False)
+gptq_config = GPTQConfig(bits=4)
+model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto", quantization_config=gptq_config)
+```
+
+With the release of the exllamav2 kernels, you can get faster inference speed compared to the exllama kernels. You just need to pass `exllama_config={"version": 2}` in [`GPTQConfig`]:
+
+```py
+import torch
+gptq_config = GPTQConfig(bits=4, exllama_config={"version":2})
 model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto", quantization_config = gptq_config)
 ```

 Note that only 4-bit models are supported for now. Furthermore, it is recommended to deactivate the exllama kernels if you are finetuning a quantized model with peft. 

+You can find the benchmark of these kernels [here](https://github.com/huggingface/optimum/tree/main/tests/benchmark#gptq-benchmark)
 #### Fine-tune a quantized model 

 With the official support of adapters in the Hugging Face ecosystem, you can fine-tune models that have been quantized with GPTQ. 
@ -336,6 +444,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer

 model = AutoModelForCausalLM.from_pretrained("{your_username}/bloom-560m-8bit", device_map="auto")
 ```
+
 Note that in this case, you don't need to specify the arguments `load_in_8bit=True`, but you need to make sure that `bitsandbytes` and `accelerate` are installed.
 Note also that `device_map` is optional but setting `device_map = 'auto'` is prefered for inference as it will dispatch efficiently the model on the available ressources.

@ -356,6 +465,7 @@ quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)
 ```

 Let's say you want to load `bigscience/bloom-1b7` model, and you have just enough GPU RAM to fit the entire model except the `lm_head`. Therefore write a custom device_map as follows:
+
 ```python
 device_map = {
    "transformer.word_embeddings": 0,
--- a/docs/source/en/main_classes/trainer.md
+++ b/docs/source/en/main_classes/trainer.md
@ -18,6 +18,12 @@ rendered properly in your Markdown viewer.

 The [`Trainer`] class provides an API for feature-complete training in PyTorch for most standard use cases. It's used in most of the [example scripts](https://github.com/huggingface/transformers/tree/main/examples).

+<Tip>
+
+If you're looking to fine-tune a language model like Llama-2 or Mistral on a text dataset using autoregressive techniques, consider using [`trl`](https://github.com/huggingface/trl)'s [`~trl.SFTTrainer`]. The [`~trl.SFTTrainer`] wraps the [`Trainer`] and is specially optimized for this particular task and supports sequence packing, LoRA, quantization, and DeepSpeed for efficient scaling to any model size. On the other hand, the [`Trainer`] is a more versatile option, suitable for a broader spectrum of tasks.
+
+</Tip>
+
 Before instantiating your [`Trainer`], create a [`TrainingArguments`] to access all the points of customization during training.

 The API supports distributed training on multiple GPUs/TPUs, mixed precision through [NVIDIA Apex](https://github.com/NVIDIA/apex) and Native AMP for PyTorch.
@ -204,6 +210,7 @@ python -m torch.distributed.launch --nproc_per_node=2  trainer-program.py ...
 ```

 if you have either [`accelerate`](https://github.com/huggingface/accelerate) or [`deepspeed`](https://github.com/microsoft/DeepSpeed) installed you can also accomplish the same by using one of:
+
 ```bash
 accelerate launch --num_processes 2 trainer-program.py ...
 ```
@ -240,6 +247,7 @@ CUDA_VISIBLE_DEVICES=2,0 python -m torch.distributed.launch trainer-program.py .
 Here your physical GPUs 0 and 2 are mapped to `cuda:1` and `cuda:0` correspondingly.

 The above examples were all for `DistributedDataParallel` use pattern, but the same method works for [`DataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) as well:
+
 ```bash
 CUDA_VISIBLE_DEVICES=2,0 python trainer-program.py ...
 ```
@ -732,3 +740,27 @@ Sections that were moved:
 | <a href="./deepspeed#deepspeed-grad-clip">Gradient Clipping</a><a id="gradient-clipping"></a>
 | <a href="./deepspeed#deepspeed-weight-extraction">Getting The Model Weights Out</a><a id="getting-the-model-weights-out"></a>
 ]
+
+## Boost your fine-tuning performances using NEFTune
+
+
+NEFTune is a technique to boost the performance of chat models and was introduced by the paper “NEFTune: Noisy Embeddings Improve Instruction Finetuning” from Jain et al. it consists of adding noise to the embedding vectors during training. According to the abstract of the paper:
+
+> Standard finetuning of LLaMA-2-7B using Alpaca achieves 29.79% on AlpacaEval, which rises to 64.69% using noisy embeddings. NEFTune also improves over strong baselines on modern instruction datasets. Models trained with Evol-Instruct see a 10% improvement, with ShareGPT an 8% improvement, and with OpenPlatypus an 8% improvement. Even powerful models further refined with RLHF such as LLaMA-2-Chat benefit from additional training with NEFTune.
+
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/neft-screenshot.png">
+</div>
+
+To use it in `Trainer` simply pass `neftune_noise_alpha` when creating your `TrainingArguments` instance. Note that to avoid any surprising behaviour, NEFTune is disabled after training to retrieve back the original behaviour of the embedding layer.
+
+```python
+from transformers import Trainer, TrainingArguments
+
+args = TrainingArguments(..., neftune_noise_alpha=0.1)
+trainer = Trainer(..., args=args)
+
+...
+
+trainer.train()
+```
--- a/docs/source/en/model_doc/convnextv2.md
+++ b/docs/source/en/model_doc/convnextv2.md
@ -58,4 +58,15 @@ If you're interested in submitting a resource to be included here, please feel f
 ## ConvNextV2ForImageClassification

 [[autodoc]] ConvNextV2ForImageClassification
-    - forward
+    - forward
+
+## TFConvNextV2Model
+
+[[autodoc]] TFConvNextV2Model
+    - call
+
+
+## TFConvNextV2ForImageClassification
+
+[[autodoc]] TFConvNextV2ForImageClassification
+    - call
--- a/docs/source/en/model_doc/gpt_bigcode.md
+++ b/docs/source/en/model_doc/gpt_bigcode.md
@ -42,6 +42,45 @@ The main differences compared to GPT2.

 You can read more about the optimizations in the [original pull request](https://github.com/huggingface/transformers/pull/22575)

+## Combining Starcoder and Flash Attention 2
+
+First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature.
+
+```bash
+pip install -U flash-attn --no-build-isolation
+```
+
+Make also sure that you have a hardware that is compatible with Flash-Attention 2. Read more about it in the official documentation of flash-attn repository. Make also sure to load your model in half-precision (e.g. `torch.float16``)
+
+To load and run a model using Flash Attention 2, refer to the snippet below:
+
+```python
+>>> import torch
+>>> from transformers import AutoModelForCausalLM, AutoTokenizer
+>>> device = "cuda" # the device to load the model onto
+
+>>> model = AutoModelForCausalLM.from_pretrained("bigcode/gpt_bigcode-santacoder", torch_dtype=torch.float16, use_flash_attention_2=True)
+>>> tokenizer = AutoTokenizer.from_pretrained("bigcode/gpt_bigcode-santacoder")
+
+>>> prompt = "def hello_world():"
+
+>>> model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
+>>> model.to(device)
+
+>>> generated_ids = model.generate(**model_inputs, max_new_tokens=30, do_sample=False)
+>>> tokenizer.batch_decode(generated_ids)[0]
+'def hello_world():\n    print("hello world")\n\nif __name__ == "__main__":\n    print("hello world")\n<|endoftext|>'
+```
+
+### Expected speedups
+
+Below is a expected speedup diagram that compares pure inference time between the native implementation in transformers using `bigcode/starcoder` checkpoint and the Flash Attention 2 version of the model using two different sequence lengths.
+
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/starcoder-speedup.png">
+</div>
+
+
 ## GPTBigCodeConfig

 [[autodoc]] GPTBigCodeConfig
--- a/docs/source/en/model_doc/kosmos-2.md
+++ b/docs/source/en/model_doc/kosmos-2.md
@ -0,0 +1,98 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# KOSMOS-2
+
+## Overview
+
+The KOSMOS-2 model was proposed in [Kosmos-2: Grounding Multimodal Large Language Models to the World](https://arxiv.org/abs/2306.14824) by Zhiliang Peng, Wenhui Wang, Li Dong, Yaru Hao, Shaohan Huang, Shuming Ma, Furu Wei.
+
+KOSMOS-2 is a Transformer-based causal language model and is trained using the next-word prediction task on a web-scale
+dataset of grounded image-text pairs [GRIT](https://huggingface.co/datasets/zzliang/GRIT). The spatial coordinates of
+the bounding boxes in the dataset are converted to a sequence of location tokens, which are appended to their respective
+entity text spans (for example, `a snowman` followed by `<patch_index_0044><patch_index_0863>`). The data format is
+similar to “hyperlinks” that connect the object regions in an image to their text span in the corresponding caption.
+
+The abstract from the paper is the following:
+
+*We introduce Kosmos-2, a Multimodal Large Language Model (MLLM), enabling new capabilities of perceiving object descriptions (e.g., bounding boxes) and grounding text to the visual world. Specifically, we represent refer expressions as links in Markdown, i.e., ``[text span](bounding boxes)'', where object descriptions are sequences of location tokens. Together with multimodal corpora, we construct large-scale data of grounded image-text pairs (called GrIT) to train the model. In addition to the existing capabilities of MLLMs (e.g., perceiving general modalities, following instructions, and performing in-context learning), Kosmos-2 integrates the grounding capability into downstream applications. We evaluate Kosmos-2 on a wide range of tasks, including (i) multimodal grounding, such as referring expression comprehension, and phrase grounding, (ii) multimodal referring, such as referring expression generation, (iii) perception-language tasks, and (iv) language understanding and generation. This work lays out the foundation for the development of Embodiment AI and sheds light on the big convergence of language, multimodal perception, action, and world modeling, which is a key step toward artificial general intelligence. Code and pretrained models are available at https://aka.ms/kosmos-2.*
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/kosmos_2_overview.jpg"
+alt="drawing" width="600"/>
+
+<small> Overview of tasks that KOSMOS-2 can handle. Taken from the <a href="https://arxiv.org/abs/2306.14824">original paper</a>. </small>
+
+## Example
+
+```python
+>>> from PIL import Image
+>>> import requests
+>>> from transformers import AutoProcessor, Kosmos2ForConditionalGeneration
+
+>>> model = Kosmos2ForConditionalGeneration.from_pretrained("microsoft/kosmos-2-patch14-224")
+>>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")
+
+>>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
+>>> image = Image.open(requests.get(url, stream=True).raw)
+
+>>> prompt = "<grounding> An image of"
+
+>>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+
+>>> generated_ids = model.generate(
+...     pixel_values=inputs["pixel_values"],
+...     input_ids=inputs["input_ids"],
+...     attention_mask=inputs["attention_mask"],
+...     image_embeds=None,
+...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
+...     use_cache=True,
+...     max_new_tokens=64,
+... )
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+>>> processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)
+>>> processed_text
+'<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.'
+
+>>> caption, entities = processor.post_process_generation(generated_text)
+>>> caption
+'An image of a snowman warming himself by a fire.'
+
+>>> entities
+[('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
+```
+
+This model was contributed by [Yih-Dar SHIEH](https://huggingface.co/ydshieh). The original code can be found [here](https://github.com/microsoft/unilm/tree/master/kosmos-2).
+
+## Kosmos2Config
+
+[[autodoc]] Kosmos2Config
+
+## Kosmos2ImageProcessor
+
+## Kosmos2Processor
+
+[[autodoc]] Kosmos2Processor
+    - __call__
+
+## Kosmos2Model
+
+[[autodoc]] Kosmos2Model
+    - forward
+
+## Kosmos2ForConditionalGeneration
+
+[[autodoc]] Kosmos2ForConditionalGeneration
+    - forward
--- a/docs/source/en/model_doc/llama2.md
+++ b/docs/source/en/model_doc/llama2.md
@ -28,12 +28,12 @@ Checkout all Llama2 models [here](https://huggingface.co/models?search=llama2)

 <Tip warning={true}>

-The `Llama2` models were trained using `bfloat16`, but the original inference uses `float16. The checkpoints uploaded on the hub use `torch_dtype = 'float16'` which will be
+The `Llama2` models were trained using `bfloat16`, but the original inference uses `float16`. The checkpoints uploaded on the Hub use `torch_dtype = 'float16'`, which will be
 used by the `AutoModel` API to cast the checkpoints from `torch.float32` to `torch.float16`. 

-The `dtype` of the online weights is mostly irrelevant, unless you are using `torch_dtype="auto"` when initializing a model using `model = AutoModelForCausalLM.from_pretrained("path", torch_dtype = "auto")`. The reason is that the model will first be downloaded ( using the `dtype` of the checkpoints online) then it will be casted to the default `dtype` of `torch` (becomes `torch.float32`) and finally, if there is a `torch_dtype` provided in the config, it will be used. 
+The `dtype` of the online weights is mostly irrelevant unless you are using `torch_dtype="auto"` when initializing a model using `model = AutoModelForCausalLM.from_pretrained("path", torch_dtype = "auto")`. The reason is that the model will first be downloaded ( using the `dtype` of the checkpoints online), then it will be casted to the default `dtype` of `torch` (becomes `torch.float32`), and finally, if there is a `torch_dtype` provided in the config, it will be used. 

-Training the model in `float16` is not recommended and known to produce `nan`, as such the model should be trained in `bfloat16`.
+Training the model in `float16` is not recommended and is known to produce `nan`; as such, the model should be trained in `bfloat16`.

 </Tip>

--- a/docs/source/en/model_doc/open-llama.md
+++ b/docs/source/en/model_doc/open-llama.md
@ -33,15 +33,13 @@ This model differs from the [OpenLLaMA models](https://huggingface.co/models?sea

 ## Overview

-The Open-Llama model was proposed in [Open-Llama project](https://github.com/s-JoL/Open-Llama) by community developer s-JoL.
+The Open-Llama model was proposed in the open source Open-Llama project by community developer s-JoL.

 The model is mainly based on LLaMA with some modifications, incorporating memory-efficient attention from Xformers, stable embedding from Bloom, and shared input-output embedding from PaLM.
 And the model is pre-trained on both Chinese and English, which gives it better performance on Chinese language tasks.

 This model was contributed by [s-JoL](https://huggingface.co/s-JoL).
-The original code can be found [Open-Llama](https://github.com/s-JoL/Open-Llama).
-Checkpoint and usage can be found at [s-JoL/Open-Llama-V1](https://huggingface.co/s-JoL/Open-Llama-V1).
-
+The original code was released on GitHub by [s-JoL](https://github.com/s-JoL), but is now removed.

 ## OpenLlamaConfig

--- a/docs/source/en/model_doc/seamless_m4t.md
+++ b/docs/source/en/model_doc/seamless_m4t.md
@ -0,0 +1,218 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+-->
+
+# SeamlessM4T
+
+## Overview
+
+The SeamlessM4T model was proposed in [SeamlessM4T — Massively Multilingual & Multimodal Machine Translation](https://dl.fbaipublicfiles.com/seamless/seamless_m4t_paper.pdf) by the Seamless Communication team from Meta AI.
+
+SeamlessM4T is a collection of models designed to provide high quality translation, allowing people from different linguistic communities to communicate effortlessly through speech and text.
+
+SeamlessM4T enables multiple tasks without relying on separate models:
+
+- Speech-to-speech translation (S2ST)
+- Speech-to-text translation (S2TT)
+- Text-to-speech translation (T2ST)
+- Text-to-text translation (T2TT)
+- Automatic speech recognition (ASR)
+
+[`SeamlessM4TModel`] can perform all the above tasks, but each task also has its own dedicated sub-model.
+
+The abstract from the paper is the following:
+
+*What does it take to create the Babel Fish, a tool that can help individuals translate speech between any two languages? While recent breakthroughs in text-based models have pushed machine translation coverage beyond 200 languages, unified speech-to-speech translation models have yet to achieve similar strides. More specifically, conventional speech-to-speech translation systems rely on cascaded systems that perform translation progressively, putting high-performing unified systems out of reach. To address these gaps, we introduce SeamlessM4T, a single model that supports speech-to-speech translation, speech-to-text translation, text-to-speech translation, text-to-text translation, and automatic speech recognition for up to 100 languages. To build this, we used 1 million hours of open speech audio data to learn self-supervised speech representations with w2v-BERT 2.0. Subsequently, we created a multimodal corpus of automatically aligned speech translations. Filtered and combined with human-labeled and pseudo-labeled data, we developed the first multilingual system capable of translating from and into English for both speech and text. On FLEURS, SeamlessM4T sets a new standard for translations into multiple target languages, achieving an improvement of 20% BLEU over the previous SOTA in direct speech-to-text translation. Compared to strong cascaded models, SeamlessM4T improves the quality of into-English translation by 1.3 BLEU points in speech-to-text and by 2.6 ASR-BLEU points in speech-to-speech. Tested for robustness, our system performs better against background noises and speaker variations in speech-to-text tasks compared to the current SOTA model. Critically, we evaluated SeamlessM4T on gender bias and added toxicity to assess translation safety. Finally, all contributions in this work are open-sourced and accessible at https://github.com/facebookresearch/seamless_communication*
+
+## Usage
+
+First, load the processor and a checkpoint of the model:
+
+```python
+>>> from transformers import AutoProcessor, SeamlessM4TModel
+
+>>> processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")
+>>> model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")
+```
+
+You can seamlessly use this model on text or on audio, to generated either translated text or translated audio.
+
+Here is how to use the processor to process text and audio:
+
+```python
+>>> # let's load an audio sample from an Arabic speech corpus
+>>> from datasets import load_dataset
+>>> dataset = load_dataset("arabic_speech_corpus", split="test", streaming=True)
+>>> audio_sample = next(iter(dataset))["audio"]
+
+>>> # now, process it
+>>> audio_inputs = processor(audios=audio_sample["array"], return_tensors="pt")
+
+>>> # now, process some English test as well
+>>> text_inputs = processor(text = "Hello, my dog is cute", src_lang="eng", return_tensors="pt")
+```
+
+
+### Speech
+
+[`SeamlessM4TModel`] can *seamlessly* generate text or speech with few or no changes. Let's target Russian voice translation:
+
+```python
+>>> audio_array_from_text = model.generate(**text_inputs, tgt_lang="rus")[0].cpu().numpy().squeeze()
+>>> audio_array_from_audio = model.generate(**audio_inputs, tgt_lang="rus")[0].cpu().numpy().squeeze()
+```
+
+With basically the same code, I've translated English text and Arabic speech to Russian speech samples.
+
+### Text
+
+Similarly, you can generate translated text from audio files or from text with the same model. You only have to pass `generate_speech=False` to [`SeamlessM4TModel.generate`].
+This time, let's translate to French.
+
+```python 
+>>> # from audio
+>>> output_tokens = model.generate(**audio_inputs, tgt_lang="fra", generate_speech=False)
+>>> translated_text_from_audio = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
+
+>>> # from text
+>>> output_tokens = model.generate(**text_inputs, tgt_lang="fra", generate_speech=False)
+>>> translated_text_from_text = processor.decode(output_tokens[0].tolist()[0], skip_special_tokens=True)
+```
+
+### Tips
+
+
+#### 1. Use dedicated models
+
+[`SeamlessM4TModel`] is transformers top level model to generate speech and text, but you can also use dedicated models that perform the task without additional components, thus reducing the memory footprint.
+For example, you can replace the audio-to-audio generation snippet with the model dedicated to the S2ST task, the rest is exactly the same code: 
+
+```python
+>>> from transformers import SeamlessM4TForSpeechToSpeech
+>>> model = SeamlessM4TForSpeechToSpeech.from_pretrained("facebook/hf-seamless-m4t-medium")
+```
+
+Or you can replace the text-to-text generation snippet with the model dedicated to the T2TT task, you only have to remove `generate_speech=False`.
+
+```python
+>>> from transformers import SeamlessM4TForTextToText
+>>> model = SeamlessM4TForTextToText.from_pretrained("facebook/hf-seamless-m4t-medium")
+```
+
+Feel free to try out [`SeamlessM4TForSpeechToText`] and [`SeamlessM4TForTextToSpeech`] as well.
+
+#### 2. Change the speaker identity
+
+You have the possibility to change the speaker used for speech synthesis with the `spkr_id` argument. Some `spkr_id` works better than other for some languages!
+
+#### 3. Change the generation strategy
+
+You can use different [generation strategies](./generation_strategies) for speech and text generation, e.g `.generate(input_ids=input_ids, text_num_beams=4, speech_do_sample=True)` which will successively perform beam-search decoding on the text model, and multinomial sampling on the speech model.
+
+#### 4. Generate speech and text at the same time
+
+Use `return_intermediate_token_ids=True` with [`SeamlessM4TModel`] to return both speech and text !
+
+## Model architecture
+
+
+SeamlessM4T features a versatile architecture that smoothly handles the sequential generation of text and speech. This setup comprises two sequence-to-sequence (seq2seq) models. The first model translates the input modality into translated text, while the second model generates speech tokens, known as "unit tokens," from the translated text.
+
+Each modality has its own dedicated encoder with a unique architecture. Additionally, for speech output, a vocoder inspired by the [HiFi-GAN](https://arxiv.org/abs/2010.05646) architecture is placed on top of the second seq2seq model.
+
+Here's how the generation process works:
+
+- Input text or speech is processed through its specific encoder.
+- A decoder creates text tokens in the desired language.
+- If speech generation is required, the second seq2seq model, following a standard encoder-decoder structure, generates unit tokens.
+- These unit tokens are then passed through the final vocoder to produce the actual speech.
+
+
+This model was contributed by [ylacombe](https://huggingface.co/ylacombe). The original code can be found [here](https://github.com/facebookresearch/seamless_communication).
+
+## SeamlessM4TModel
+
+[[autodoc]] SeamlessM4TModel
+    - generate
+
+
+## SeamlessM4TForTextToSpeech
+
+[[autodoc]] SeamlessM4TForTextToSpeech
+    - generate
+
+
+## SeamlessM4TForSpeechToSpeech
+
+[[autodoc]] SeamlessM4TForSpeechToSpeech
+    - generate
+
+
+## SeamlessM4TForTextToText
+
+[[autodoc]] transformers.SeamlessM4TForTextToText
+    - forward
+    - generate
+
+## SeamlessM4TForSpeechToText
+
+[[autodoc]] transformers.SeamlessM4TForSpeechToText
+    - forward
+    - generate
+
+## SeamlessM4TConfig
+
+[[autodoc]] SeamlessM4TConfig
+
+
+## SeamlessM4TTokenizer
+
+[[autodoc]] SeamlessM4TTokenizer
+    - __call__
+    - build_inputs_with_special_tokens
+    - get_special_tokens_mask
+    - create_token_type_ids_from_sequences
+    - save_vocabulary
+
+
+## SeamlessM4TTokenizerFast
+
+[[autodoc]] SeamlessM4TTokenizerFast
+    - __call__
+
+## SeamlessM4TFeatureExtractor
+
+[[autodoc]] SeamlessM4TFeatureExtractor
+    - __call__
+
+## SeamlessM4TProcessor
+
+[[autodoc]] SeamlessM4TProcessor
+    - __call__
+
+## SeamlessM4TCodeHifiGan
+
+[[autodoc]] SeamlessM4TCodeHifiGan
+
+
+## SeamlessM4THifiGan
+
+[[autodoc]] SeamlessM4THifiGan
+
+## SeamlessM4TTextToUnitModel
+
+[[autodoc]] SeamlessM4TTextToUnitModel
+
+## SeamlessM4TTextToUnitForConditionalGeneration
+
+[[autodoc]] SeamlessM4TTextToUnitForConditionalGeneration
+
+
--- a/docs/source/en/model_doc/whisper.md
+++ b/docs/source/en/model_doc/whisper.md
@ -88,6 +88,11 @@ The original code can be found [here](https://github.com/openai/whisper).
    - forward
    - generate

+## WhisperForCausalLM
+
+[[autodoc]] WhisperForCausalLM
+    - forward
+
 ## WhisperForAudioClassification

 [[autodoc]] WhisperForAudioClassification
--- a/docs/source/en/perf_infer_cpu.md
+++ b/docs/source/en/perf_infer_cpu.md
@ -13,46 +13,48 @@ rendered properly in your Markdown viewer.

 -->

-# Efficient Inference on CPU
+# CPU inference

-This guide focuses on inferencing large models efficiently on CPU.
+With some optimizations, it is possible to efficiently run large model inference on a CPU. One of these optimization techniques involves compiling the PyTorch code into an intermediate format for high-performance environments like C++. The other technique fuses multiple operations into one kernel to reduce the overhead of running each operation separately.

-## `BetterTransformer` for faster inference
+You'll learn how to use [BetterTransformer](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/) for faster inference, and how to convert your PyTorch code to [TorchScript](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html). If you're using an Intel CPU, you can also use [graph optimizations](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features.html#graph-optimization) from [Intel Extension for PyTorch](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/index.html) to boost inference speed even more. Finally, learn how to use 🤗 Optimum to accelerate inference with ONNX Runtime or OpenVINO (if you're using an Intel CPU).

-We have recently integrated `BetterTransformer` for faster inference on CPU for text, image and audio models. Check the documentation about this integration [here](https://huggingface.co/docs/optimum/bettertransformer/overview) for more details.
+## BetterTransformer

-## PyTorch JIT-mode (TorchScript)
-TorchScript is a way to create serializable and optimizable models from PyTorch code. Any TorchScript program can be saved from a Python process and loaded in a process where there is no Python dependency.
-Comparing to default eager mode, jit mode in PyTorch normally yields better performance for model inference from optimization methodologies like operator fusion.
+BetterTransformer accelerates inference with its fastpath (native PyTorch specialized implementation of Transformer functions) execution. The two optimizations in the fastpath execution are:

-For a gentle introduction to TorchScript, see the Introduction to [PyTorch TorchScript tutorial](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html#tracing-modules).
+1. fusion, which combines multiple sequential operations into a single "kernel" to reduce the number of computation steps
+2. skipping the inherent sparsity of padding tokens to avoid unnecessary computation with nested tensors

-### IPEX Graph Optimization with JIT-mode
-Intel® Extension for PyTorch provides further optimizations in jit mode for Transformers series models. It is highly recommended for users to take advantage of Intel® Extension for PyTorch with jit mode. Some frequently used operator patterns from Transformers models are already supported in Intel® Extension for PyTorch with jit mode fusions. Those fusion patterns like Multi-head-attention fusion, Concat Linear, Linear+Add, Linear+Gelu, Add+LayerNorm fusion and etc. are enabled and perform well. The benefit of the fusion is delivered to users in a transparent fashion. According to the analysis, ~70% of most popular NLP tasks in question-answering, text-classification, and token-classification can get performance benefits with these fusion patterns for both Float32 precision and BFloat16 Mixed precision.
+BetterTransformer also converts all attention operations to use the more memory-efficient [scaled dot product attention](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention).

-Check more detailed information for [IPEX Graph Optimization](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/graph_optimization.html).
+<Tip>

-#### IPEX installation:
-
-IPEX release is following PyTorch, check the approaches for [IPEX installation](https://intel.github.io/intel-extension-for-pytorch/).
-
-### Usage of JIT-mode
-To enable JIT-mode in Trainer for evaluaion or prediction, users should add `jit_mode_eval` in Trainer command arguments.
-
-<Tip warning={true}>
-
-for PyTorch >= 1.14.0. JIT-mode could benefit any models for prediction and evaluaion since dict input is supported in jit.trace
-
-for PyTorch < 1.14.0. JIT-mode could benefit models whose forward parameter order matches the tuple input order in jit.trace, like question-answering model
-In the case where the forward parameter order does not match the tuple input order in jit.trace, like text-classification models, jit.trace will fail and we are capturing this with the exception here to make it fallback. Logging is used to notify users.
+BetterTransformer is not supported for all models. Check this [list](https://huggingface.co/docs/optimum/bettertransformer/overview#supported-models) to see if a model supports BetterTransformer.

 </Tip>

-Take an example of the use cases on [Transformers question-answering](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering)
+Before you start, make sure you have 🤗 Optimum [installed](https://huggingface.co/docs/optimum/installation).

+Enable BetterTransformer with the [`PreTrainedModel.to_bettertransformer`] method:

- Inference using jit mode on CPU:
-<pre>python run_qa.py \
+```py
+from transformers import AutoModelForCausalLM
+
+model = AutoModelForCausalLM.from_pretrained("bigcode/starcoder")
+model.to_bettertransformer()
+```
+
+## TorchScript
+
+TorchScript is an intermediate PyTorch model representation that can be run in production environments where performance is important. You can train a model in PyTorch and then export it to TorchScript to free the model from Python performance constraints. PyTorch [traces](https://pytorch.org/docs/stable/generated/torch.jit.trace.html) a model to return a [`ScriptFunction`] that is optimized with just-in-time compilation (JIT). Compared to the default eager mode, JIT mode in PyTorch typically yields better performance for inference using optimization techniques like operator fusion.
+
+For a gentle introduction to TorchScript, see the [Introduction to PyTorch TorchScript](https://pytorch.org/tutorials/beginner/Intro_to_TorchScript_tutorial.html) tutorial.
+
+With the [`Trainer`] class, you can enable JIT mode for CPU inference by setting the `--jit_mode_eval` flag:
+
+```bash
+python run_qa.py \
 --model_name_or_path csarron/bert-base-uncased-squad-v1 \
 --dataset_name squad \
 --do_eval \
@ -60,10 +62,31 @@ Take an example of the use cases on [Transformers question-answering](https://gi
 --doc_stride 128 \
 --output_dir /tmp/ \
 --no_cuda \
-<b>--jit_mode_eval </b></pre> 
+--jit_mode_eval
+```

- Inference with IPEX using jit mode on CPU:
-<pre>python run_qa.py \
+<Tip warning={true}>
+
+For PyTorch >= 1.14.0, JIT-mode could benefit any model for prediction and evaluaion since the dict input is supported in `jit.trace`.
+
+For PyTorch < 1.14.0, JIT-mode could benefit a model if its forward parameter order matches the tuple input order in `jit.trace`, such as a question-answering model. If the forward parameter order does not match the tuple input order in `jit.trace`, like a text classification model, `jit.trace` will fail and we are capturing this with the exception here to make it fallback. Logging is used to notify users.
+
+</Tip>
+
+## IPEX graph optimization
+
+Intel® Extension for PyTorch (IPEX) provides further optimizations in JIT mode for Intel CPUs, and we recommend combining it with TorchScript for even faster performance. The IPEX [graph optimization](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/features/graph_optimization.html) fuses operations like Multi-head attention, Concat Linear, Linear + Add, Linear + Gelu, Add + LayerNorm, and more.
+
+To take advantage of these graph optimizations, make sure you have IPEX [installed](https://intel.github.io/intel-extension-for-pytorch/cpu/latest/tutorials/installation.html):
+
+```bash
+pip install intel_extension_for_pytorch
+```
+
+Set the `--use_ipex` and `--jit_mode_eval` flags in the [`Trainer`] class to enable JIT mode with the graph optimizations:
+
+```bash
+python run_qa.py \
 --model_name_or_path csarron/bert-base-uncased-squad-v1 \
 --dataset_name squad \
 --do_eval \
@ -71,5 +94,34 @@ Take an example of the use cases on [Transformers question-answering](https://gi
 --doc_stride 128 \
 --output_dir /tmp/ \
 --no_cuda \
-<b>--use_ipex \</b>
-<b>--jit_mode_eval</b></pre> 
+--use_ipex \
+--jit_mode_eval
+```
+
+## 🤗 Optimum
+
+<Tip>
+
+Learn more details about using ORT with 🤗 Optimum in the [Optimum Inference with ONNX Runtime](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/models) guide. This section only provides a brief and simple example.
+
+</Tip>
+
+ONNX Runtime (ORT) is a model accelerator that runs inference on CPUs by default. ORT is supported by 🤗 Optimum which can be used in 🤗 Transformers, without making too many changes to your code. You only need to replace the 🤗 Transformers `AutoClass` with its equivalent [`~optimum.onnxruntime.ORTModel`] for the task you're solving, and load a checkpoint in the ONNX format.
+
+For example, if you're running inference on a question answering task, load the [optimum/roberta-base-squad2](https://huggingface.co/optimum/roberta-base-squad2) checkpoint which contains a `model.onnx` file:
+
+```py
+from transformers import AutoTokenizer, pipeline
+from optimum.onnxruntime import ORTModelForQuestionAnswering
+
+model = ORTModelForQuestionAnswering.from_pretrained("optimum/roberta-base-squad2")
+tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
+
+onnx_qa = pipeline("question-answering", model=model, tokenizer=tokenizer)
+
+question = "What's my name?"
+context = "My name is Philipp and I live in Nuremberg."
+pred = onnx_qa(question, context)
+```
+
+If you have an Intel CPU, take a look at 🤗 [Optimum Intel](https://huggingface.co/docs/optimum/intel/index) which supports a variety of compression techniques (quantization, pruning, knowledge distillation) and tools for converting models to the [OpenVINO](https://huggingface.co/docs/optimum/intel/inference) format for higher performance inference.
--- a/docs/source/en/perf_infer_gpu_many.md
+++ b/docs/source/en/perf_infer_gpu_many.md
@ -1,124 +0,0 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
-
-# Efficient Inference on a Multiple GPUs
-
-This document contains information on how to efficiently infer on a multiple GPUs. 
-<Tip>
-
-Note: A multi GPU setup can use the majority of the strategies described in the [single GPU section](./perf_infer_gpu_one). You must be aware of simple techniques, though, that can be used for a better usage.
-
-</Tip>
-
-## Flash Attention 2
-
-Flash Attention 2 integration also works in a multi-GPU setup, check out the appropriate section in the [single GPU section](./perf_infer_gpu_one#Flash-Attention-2)
-
-## BetterTransformer
-
-[BetterTransformer](https://huggingface.co/docs/optimum/bettertransformer/overview) converts 🤗 Transformers models to use the PyTorch-native fastpath execution, which calls optimized kernels like Flash Attention under the hood.  
-
-BetterTransformer is also supported for faster inference on single and multi-GPU for text, image, and audio models.
-
-<Tip>
-
-Flash Attention can only be used for models using fp16 or bf16 dtype. Make sure to cast your model to the appropriate dtype before using BetterTransformer.
-  
-</Tip>
-
-### Decoder models
-
-For text models, especially decoder-based models (GPT, T5, Llama, etc.), the BetterTransformer API converts all attention operations to use the [`torch.nn.functional.scaled_dot_product_attention` operator](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention) (SDPA) that is only available in PyTorch 2.0 and onwards. 
-
-To convert a model to BetterTransformer:
-
-```python
-from transformers import AutoModelForCausalLM
-
-model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
-# convert the model to BetterTransformer
-model.to_bettertransformer()
-
-# Use it for training or inference
-```
-
-SDPA can also call [Flash Attention](https://arxiv.org/abs/2205.14135) kernels under the hood. To enable Flash Attention or to check that it is available in a given setting (hardware, problem size), use [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager:
-
-
-```diff
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
-model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m").to("cuda")
-# convert the model to BetterTransformer
-model.to_bettertransformer()
-
-input_text = "Hello my dog is cute and"
-inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
-
-+ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
-    outputs = model.generate(**inputs)
-
-print(tokenizer.decode(outputs[0], skip_special_tokens=True))
-```
-
-If you see a bug with a traceback saying 
-
-```bash
-RuntimeError: No available kernel.  Aborting execution.
-```
-
-try using the PyTorch nightly version, which may have a broader coverage for Flash Attention:
-
-```bash
-pip3 install -U --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118
-```
-
-Have a look at this [blog post](https://pytorch.org/blog/out-of-the-box-acceleration/) to learn more about what is possible with the BetterTransformer + SDPA API.
-
-### Encoder models
-
-For encoder models during inference, BetterTransformer dispatches the forward call of encoder layers to an equivalent of [`torch.nn.TransformerEncoderLayer`](https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoderLayer.html) that will execute the fastpath implementation of the encoder layers.
-
-Because `torch.nn.TransformerEncoderLayer` fastpath does not support training, it is dispatched to `torch.nn.functional.scaled_dot_product_attention` instead, which does not leverage nested tensors but can use Flash Attention or Memory-Efficient Attention fused kernels.
-
-More details about BetterTransformer performance can be found in this [blog post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2), and you can learn more about BetterTransformer for encoder models in this [blog](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/).
-
-
-## Advanced usage: mixing FP4 (or Int8) and BetterTransformer
-
-You can combine the different methods described above to get the best performance for your model. For example, you can use BetterTransformer with FP4 mixed-precision inference + flash attention:
-
-```py
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_compute_dtype=torch.float16
-)
-
-tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
-model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", quantization_config=quantization_config)
-
-input_text = "Hello my dog is cute and"
-inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
-
-with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
-    outputs = model.generate(**inputs)
-
-print(tokenizer.decode(outputs[0], skip_special_tokens=True))
-```
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@ -13,39 +13,38 @@ rendered properly in your Markdown viewer.

 -->

-# Efficient Inference on a Single GPU
+# GPU inference

-In addition to this guide, relevant information can be found as well in [the guide for training on a single GPU](perf_train_gpu_one) and [the guide for inference on CPUs](perf_infer_cpu).
-
-## Flash Attention 2
+GPUs are the standard choice of hardware for machine learning, unlike CPUs, because they are optimized for memory bandwidth and parallelism. To keep up with the larger sizes of modern models or to run these large models on existing and older hardware, there are several optimizations you can use to speed up GPU inference. In this guide, you'll learn how to use FlashAttention-2 (a more memory-efficient attention mechanism), BetterTransformer (a PyTorch native fastpath execution), and bitsandbytes to quantize your model to a lower precision. Finally, learn how to use 🤗 Optimum to accelerate inference with ONNX Runtime on Nvidia GPUs.

 <Tip>

-Note that this feature is experimental and might considerably change in future versions. For instance, the Flash Attention 2 API might migrate to `BetterTransformer` API in the near future.
+The majority of the optimizations described here also apply to multi-GPU setups!

 </Tip>

-Flash Attention 2 can considerably speed up transformer-based models' training and inference speed. Flash Attention 2 has been introduced in the [official Flash Attention repository](https://github.com/Dao-AILab/flash-attention) by Tri Dao et al. The scientific paper on Flash Attention can be found [here](https://arxiv.org/abs/2205.14135).
-
-Make sure to follow the installation guide on the repository mentioned above to properly install Flash Attention 2. Once that package is installed, you can benefit from this feature.
-
-We natively support Flash Attention 2 for the following models:
-
- Llama
- Mistral
- Falcon
-
-You can request to add Flash Attention 2 support for more models by opening an issue on GitHub, and even open a Pull Request to integrate the changes. The supported models can be used for inference and training, including training with padding tokens - *which is currently not supported for `BetterTransformer` API below.*
+## FlashAttention-2

 <Tip>

-Flash Attention 2 can only be used when the models' dtype is  `fp16` or `bf16` and runs only on NVIDIA-GPU devices. Make sure to cast your model to the appropriate dtype and load them on a supported device before using that feature.
-  
+FlashAttention-2 is experimental and may change considerably in future versions.
+
 </Tip>

-### Quick usage
+[FlashAttention-2](https://huggingface.co/papers/2205.14135) is a faster and more efficient implementation of the standard attention mechanism that can significantly speedup inference by:

-To enable Flash Attention 2 in your model, add `use_flash_attention_2` in the `from_pretrained` arguments:
+1. additionally parallelizing the attention computation over sequence length
+2. partitioning the work between GPU threads to reduce communication and shared memory reads/writes between them
+
+FlashAttention-2 supports inference with Llama, Mistral, and Falcon models. You can request to add FlashAttention-2 support for another model by opening a GitHub Issue or Pull Request.
+
+Before you begin, make sure you have FlashAttention-2 installed (see the [installation](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features) guide for more details about prerequisites):
+
+```bash
+pip install flash-attn --no-build-isolation
+```
+
+To enable FlashAttention-2, add the `use_flash_attention_2` parameter to [`~AutoModelForCausalLM.from_pretrained`]:

 ```python
 import torch
@ -61,74 +60,29 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 ```

-And use it for generation or fine-tuning.
+<Tip>

-### Expected speedups
+FlashAttention-2 can only be used when the model's dtype is  `fp16` or `bf16`, and it only runs on Nvidia GPUs. Make sure to cast your model to the appropriate dtype and load them on a supported device before using FlashAttention-2.
+  
+</Tip>

-You can benefit from considerable speedups for fine-tuning and inference, especially for long sequences. However, since Flash Attention does not support computing attention scores with padding tokens under the hood, we must manually pad / unpad the attention scores for batched inference when the sequence contains padding tokens. This leads to a significant slowdown for batched generations with padding tokens. 
+FlashAttention-2 can be combined with other optimization techniques like quantization to further speedup inference. For example, you can combine FlashAttention-2 with 8-bit or 4-bit quantization:

-To overcome this, one should use Flash Attention without padding tokens in the sequence for training (e.g., by packing a dataset, i.e., concatenating sequences until reaching the maximum sequence length. An example is provided [here](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py#L516).
-
-Below is the expected speedup you can get for a simple forward pass on [tiiuae/falcon-7b](https://hf.co/tiiuae/falcon-7b) with a sequence length of 4096 and various batch sizes, without padding tokens:
-
-<div style="text-align: center">
-<img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/falcon-7b-inference-large-seqlen.png">
-</div>
-
-Below is the expected speedup you can get for a simple forward pass on [`meta-llama/Llama-7b-hf`](https://hf.co/meta-llama/Llama-7b-hf) with a sequence length of 4096 and various batch sizes, without padding tokens:
-
-<div style="text-align: center">
-<img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/llama-7b-inference-large-seqlen.png">
-</div>
-
-For sequences with padding tokens (training with padding tokens or generating with padding tokens), we need to unpad / pad the input sequences to compute correctly the attention scores. For relatively small sequence length, on pure forward pass, this creates an overhead leading to a small speedup (below 30% of the input has been filled with padding tokens). 
-
-<div style="text-align: center">
-<img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/llama-2-small-seqlen-padding.png">
-</div>
-
-But for large sequence length you can benefit from interesting speedup for pure inference (also training)
-
-Note that Flash Attention makes the attention computation more memory efficient, meaning you can train with much larger sequence lengths without facing CUDA OOM issues. It can lead up to memory reduction up to 20 for large sequence length. Check out [the official flash attention repository](https://github.com/Dao-AILab/flash-attention) for more details.
-
-<div style="text-align: center">
-<img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/llama-2-large-seqlen-padding.png">
-</div>
-
-
-### Advanced usage
-
-You can combine this feature with many exisiting feature for model optimization. Check out few examples below:
-
-### Combining Flash Attention 2 and 8-bit models
-
-You can combine this feature together with 8-bit quantization:
-
-```python
+```py
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM

 model_id = "tiiuae/falcon-7b"
 tokenizer = AutoTokenizer.from_pretrained(model_id)

+# load in 8bit
 model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    load_in_8bit=True,
    use_flash_attention_2=True,
 )
-```
-
-### Combining Flash Attention 2 and 4-bit models
-
-You can combine this feature together with 4-bit quantization:
-
-```python
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM
-
-model_id = "tiiuae/falcon-7b"
-tokenizer = AutoTokenizer.from_pretrained(model_id)

+# load in 4bit
 model = AutoModelForCausalLM.from_pretrained(
    model_id, 
    load_in_4bit=True,
@ -136,85 +90,77 @@ model = AutoModelForCausalLM.from_pretrained(
 )
 ```

-### Combining Flash Attention 2 and PEFT
+### Expected speedups

-You can combine this feature together with PEFT for training adapters using Flash Attention 2 under the hood:
+You can benefit from considerable speedups for inference, especially for inputs with long sequences. However, since FlashAttention-2 does not support computing attention scores with padding tokens, you must manually pad/unpad the attention scores for batched inference when the sequence contains padding tokens. This leads to a significant slowdown for batched generations with padding tokens.

-```python
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer, LlamaForCausalLM
-from peft import LoraConfig
+To overcome this, you should use FlashAttention-2 without padding tokens in the sequence during training (by packing a dataset or [concatenating sequences](https://github.com/huggingface/transformers/blob/main/examples/pytorch/language-modeling/run_clm.py#L516) until reaching the maximum sequence length).

-model_id = "tiiuae/falcon-7b"
-tokenizer = AutoTokenizer.from_pretrained(model_id)
+For a single forward pass on [tiiuae/falcon-7b](https://hf.co/tiiuae/falcon-7b) with a sequence length of 4096 and various batch sizes without padding tokens, the expected speedup is:

-model = AutoModelForCausalLM.from_pretrained(
-    model_id, 
-    load_in_4bit=True,
-    use_flash_attention_2=True,
-)
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/falcon-7b-inference-large-seqlen.png">
+</div>

-lora_config = LoraConfig(
-    r=8,
-    task_type="CAUSAL_LM"
-)
+For a single forward pass on [meta-llama/Llama-7b-hf](https://hf.co/meta-llama/Llama-7b-hf) with a sequence length of 4096 and various batch sizes without padding tokens, the expected speedup is:

-model.add_adapter(lora_config)
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/llama-7b-inference-large-seqlen.png">
+</div>

-... # train your model
-```
+For sequences with padding tokens (generating with padding tokens), you need to unpad/pad the input sequences to correctly compute the attention scores. With a relatively small sequence length, a single forward pass creates overhead leading to a small speedup (in the example below, 30% of the input is filled with padding tokens):

-## BetterTransformer
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/llama-2-small-seqlen-padding.png">
+</div>

-[BetterTransformer](https://huggingface.co/docs/optimum/bettertransformer/overview) converts 🤗 Transformers models to use the PyTorch-native fastpath execution, which calls optimized kernels like Flash Attention under the hood.  
-
-BetterTransformer is also supported for faster inference on single and multi-GPU for text, image, and audio models.
+But for larger sequence lengths, you can expect even more speedup benefits:

 <Tip>

-Flash Attention can only be used for models using fp16 or bf16 dtype. Make sure to cast your model to the appropriate dtype before using BetterTransformer.
-  
+FlashAttention is more memory efficient, meaning you can train on much larger sequence lengths without running into out-of-memory issues. You can potentially reduce memory usage up to 20x for larger sequence lengths. Take a look at the [flash-attention](https://github.com/Dao-AILab/flash-attention) repository for more details.
+
 </Tip>

-### Encoder models
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/llama-2-large-seqlen-padding.png">
+</div>

-PyTorch-native [`nn.MultiHeadAttention`](https://pytorch.org/blog/a-better-transformer-for-fast-transformer-encoder-inference/) attention fastpath, called BetterTransformer, can be used with Transformers through the integration in the [🤗 Optimum library](https://huggingface.co/docs/optimum/bettertransformer/overview).
+## BetterTransformer

-PyTorch's attention fastpath allows to speed up inference through kernel fusions and the use of [nested tensors](https://pytorch.org/docs/stable/nested.html). Detailed benchmarks can be found in [this blog post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2).
+<Tip>

-After installing the [`optimum`](https://github.com/huggingface/optimum) package, to use Better Transformer during inference, the relevant internal modules are replaced by calling [`~PreTrainedModel.to_bettertransformer`]:
+Check out our benchmarks with BetterTransformer and scaled dot product attention in the [Out of the box acceleration and memory savings of 🤗 decoder models with PyTorch 2.0](https://pytorch.org/blog/out-of-the-box-acceleration/) and learn more about the fastpath execution in the [BetterTransformer](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2) blog post.
+
+</Tip>
+
+BetterTransformer accelerates inference with its fastpath (native PyTorch specialized implementation of Transformer functions) execution. The two optimizations in the fastpath execution are:
+
+1. fusion, which combines multiple sequential operations into a single "kernel" to reduce the number of computation steps
+2. skipping the inherent sparsity of padding tokens to avoid unnecessary computation with nested tensors
+
+BetterTransformer also converts all attention operations to use the more memory-efficient [scaled dot product attention (SDPA)](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention), and it calls optimized kernels like [FlashAttention](https://huggingface.co/papers/2205.14135) under the hood.
+
+Before you start, make sure you have 🤗 Optimum [installed](https://huggingface.co/docs/optimum/installation).
+
+Then you can enable BetterTransformer with the [`PreTrainedModel.to_bettertransformer`] method:

 ```python
 model = model.to_bettertransformer()
 ```

-The method [`~PreTrainedModel.reverse_bettertransformer`] allows to go back to the original modeling, which should be used before saving the model in order to use the canonical transformers modeling:
+You can return the original Transformers model with the [`~PreTrainedModel.reverse_bettertransformer`] method. You should use this before saving your model to use the canonical Transformers modeling:

-```python
+```py
 model = model.reverse_bettertransformer()
 model.save_pretrained("saved_model")
 ```

-Have a look at this [blog post](https://medium.com/pytorch/bettertransformer-out-of-the-box-performance-for-huggingface-transformers-3fbe27d50ab2) to learn more about what is possible to do with `BetterTransformer` API for encoder models.
+### FlashAttention

-### Decoder models
-
-For text models, especially decoder-based models (GPT, T5, Llama, etc.), the BetterTransformer API converts all attention operations to use the [`torch.nn.functional.scaled_dot_product_attention` operator](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention) (SDPA) that is only available in PyTorch 2.0 and onwards. 
-
-To convert a model to BetterTransformer:
-
-```python
-from transformers import AutoModelForCausalLM
-
-model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
-# convert the model to BetterTransformer
-model.to_bettertransformer()
-
-# Use it for training or inference
-```
-
-SDPA can also call [Flash Attention](https://arxiv.org/abs/2205.14135) kernels under the hood. To enable Flash Attention or to check that it is available in a given setting (hardware, problem size), use [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager:
+SDPA can also call FlashAttention kernels under the hood. FlashAttention can only be used for models using the `fp16` or `bf16` dtype, so make sure to cast your model to the appropriate dtype before using it.

+To enable FlashAttention or to check whether it is available in a given setting (hardware, problem size), use [`torch.backends.cuda.sdp_kernel`](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager:

 ```diff
 import torch
@ -234,47 +180,32 @@ inputs = tokenizer(input_text, return_tensors="pt").to("cuda")
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 ```

-If you see a bug with a traceback saying 
+If you see a bug with the traceback below, try using nightly version of PyTorch which may have broader coverage for FlashAttention:

 ```bash
-RuntimeError: No available kernel.  Aborting execution.
-```
+RuntimeError: No available kernel. Aborting execution.

-try using the PyTorch nightly version, which may have a broader coverage for Flash Attention:
-
-```bash
+# install PyTorch nightly
 pip3 install -U --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu118
 ```

-Or make sure your model is correctly casted in float16 or bfloat16
+## bitsandbytes

+bitsandbytes is a quantization library that includes support for 4-bit and 8-bit quantization. Quantization reduces your model size compared to its native full precision version, making it easier to fit large models onto GPUs with limited memory.

-Have a look at [this detailed blogpost](https://pytorch.org/blog/out-of-the-box-acceleration/) to read more about what is possible to do with `BetterTransformer` + SDPA API.
+Make sure you have bitsnbytes and 🤗 Accelerate installed:

-## `bitsandbytes` integration for FP4 mixed-precision inference
+```bash
+# these versions support 8-bit and 4-bit
+pip install bitsandbytes>=0.39.0 accelerate>=0.20.0

-You can install `bitsandbytes` and benefit from easy model compression on GPUs. Using FP4 quantization you can expect to reduce up to 8x the model size compared to its native full precision version. Check out below how to get started.
+# install Transformers
+pip install transformers
+```

-<Tip>
+### 4-bit

-Note that this feature can also be used in a multi GPU setup.
-
-</Tip>
-
-### Requirements [[requirements-for-fp4-mixedprecision-inference]]
-
- Latest `bitsandbytes` library
-`pip install bitsandbytes>=0.39.0`
-
- Install latest `accelerate` from source
-`pip install git+https://github.com/huggingface/accelerate.git`
-
- Install latest `transformers` from source
-`pip install git+https://github.com/huggingface/transformers.git`
-
-### Running FP4 models - single GPU setup - Quickstart
-
-You can quickly run a FP4 model on a single GPU by running the following code:
+To load a model in 4-bit for inference, use the `load_in_4bit` parameter. The `device_map` parameter is optional, but we recommend setting it to `"auto"` to allow 🤗 Accelerate to automatically and efficiently allocate the model given the available resources in the environment.

 ```py
 from transformers import AutoModelForCausalLM
@ -282,16 +213,8 @@ from transformers import AutoModelForCausalLM
 model_name = "bigscience/bloom-2b5"
 model_4bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True)
 ```
-Note that `device_map` is optional but setting `device_map = 'auto'` is prefered for inference as it will dispatch efficiently the model on the available ressources.

-### Running FP4 models - multi GPU setup
-
-The way to load your mixed 4-bit model in multiple GPUs is as follows (same command as single GPU setup):
-```py
-model_name = "bigscience/bloom-2b5"
-model_4bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_4bit=True)
-```
-But you can control the GPU RAM you want to allocate on each GPU using `accelerate`. Use the `max_memory` argument as follows:
+To load a model in 4-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 600MB of memory to the first GPU and 1GB of memory to the second GPU:

 ```py
 max_memory_mapping = {0: "600MB", 1: "1GB"}
@ -300,44 +223,16 @@ model_4bit = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto", load_in_4bit=True, max_memory=max_memory_mapping
 )
 ```
-In this example, the first GPU will use 600MB of memory and the second 1GB.

-### Advanced usage
-
-For more advanced usage of this method, please have a look at the [quantization](main_classes/quantization) documentation page.
-
-## `bitsandbytes` integration for Int8 mixed-precision matrix decomposition
+### 8-bit

 <Tip>

-Note that this feature can also be used in a multi GPU setup.
+If you're curious and interested in learning more about the concepts underlying 8-bit quantization, read the [Gentle Introduction to 8-bit Matrix Multiplication for transformers at scale using Hugging Face Transformers, Accelerate and bitsandbytes](https://huggingface.co/blog/hf-bitsandbytes-integration) blog post.

 </Tip>

-From the paper [`LLM.int8() : 8-bit Matrix Multiplication for Transformers at Scale`](https://arxiv.org/abs/2208.07339), we support Hugging Face integration for all models in the Hub with a few lines of code.
-The method reduces `nn.Linear` size by 2 for `float16` and `bfloat16` weights and by 4 for `float32` weights, with close to no impact to the quality by operating on the outliers in half-precision.
-
-![HFxbitsandbytes.png](https://cdn-uploads.huggingface.co/production/uploads/1659861207959-62441d1d9fdefb55a0b7d12c.png)
-
-Int8 mixed-precision matrix decomposition works by separating a matrix multiplication into two streams: (1) a systematic feature outlier stream matrix multiplied in fp16 (0.01%), (2) a regular stream of int8 matrix multiplication (99.9%). With this method, int8 inference with no predictive degradation is possible for very large models.
-For more details regarding the method, check out the [paper](https://arxiv.org/abs/2208.07339) or our [blogpost about the integration](https://huggingface.co/blog/hf-bitsandbytes-integration).
-
-![MixedInt8.gif](https://cdn-uploads.huggingface.co/production/uploads/1660567469965-62441d1d9fdefb55a0b7d12c.gif)
-
-Note, that you would require a GPU to run mixed-8bit models as the kernels have been compiled for GPUs only. Make sure that you have enough GPU memory to store the quarter (or half if your model weights are in half precision) of the model before using this feature.
-Below are some notes to help you use this module, or follow the demos on [Google colab](#colab-demos).
-
-### Requirements [[requirements-for-int8-mixedprecision-matrix-decomposition]]
-
- If you have `bitsandbytes<0.37.0`, make sure you run on NVIDIA GPUs that support 8-bit tensor cores (Turing, Ampere or newer architectures - e.g. T4, RTX20s RTX30s, A40-A100). For `bitsandbytes>=0.37.0`, all GPUs should be supported.
- Install the correct version of `bitsandbytes` by running:
-`pip install bitsandbytes>=0.31.5`
- Install `accelerate`
-`pip install accelerate>=0.12.0`
-
-### Running mixed-Int8 models - single GPU setup
-
-After installing the required libraries, the way to load your mixed 8-bit model is as follows:
+To load a model in 8-bit for inference, use the `load_in_8bit` parameter. The `device_map` parameter is optional, but we recommend setting it to `"auto"` to allow 🤗 Accelerate to automatically and efficiently allocate the model given the available resources in the environment:

 ```py
 from transformers import AutoModelForCausalLM
@ -346,12 +241,7 @@ model_name = "bigscience/bloom-2b5"
 model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
 ```

-For text generation, we recommend:
-
-* using the model's `generate()` method instead of the `pipeline()` function. Although inference is possible with the `pipeline()` function, it is not optimized for mixed-8bit models, and will be slower than using the `generate()` method. Moreover, some sampling strategies are like nucleaus sampling are not supported by the `pipeline()` function for mixed-8bit models.
-* placing all inputs on the same device as the model.
-
-Here is a simple example:
+If you're loading a model in 8-bit for text generation, you should use the [`~transformers.GenerationMixin.generate`] method instead of the [`Pipeline`] function which is not optimized for 8-bit models and will be slower. Some sampling strategies, like nucleus sampling, are also not supported by the [`Pipeline`] for 8-bit models. You should also place all inputs on the same device as the model:

 ```py
 from transformers import AutoModelForCausalLM, AutoTokenizer
@ -366,15 +256,7 @@ generated_ids = model.generate(**inputs)
 outputs = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
 ```

-
-### Running mixed-int8 models - multi GPU setup
-
-The way to load your mixed 8-bit model in multiple GPUs is as follows (same command as single GPU setup):
-```py
-model_name = "bigscience/bloom-2b5"
-model_8bit = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", load_in_8bit=True)
-```
-But you can control the GPU RAM you want to allocate on each GPU using `accelerate`. Use the `max_memory` argument as follows:
+To load a model in 4-bit for inference with multiple GPUs, you can control how much GPU RAM you want to allocate to each GPU. For example, to distribute 1GB of memory to the first GPU and 2GB of memory to the second GPU:

 ```py
 max_memory_mapping = {0: "1GB", 1: "2GB"}
@ -383,27 +265,56 @@ model_8bit = AutoModelForCausalLM.from_pretrained(
    model_name, device_map="auto", load_in_8bit=True, max_memory=max_memory_mapping
 )
 ```
-In this example, the first GPU will use 1GB of memory and the second 2GB.

-### Colab demos
+<Tip>

-With this method you can infer on models that were not possible to infer on a Google Colab before.
-Check out the demo for running T5-11b (42GB in fp32)! Using 8-bit quantization on Google Colab:
+Feel free to try running a 11 billion parameter [T5 model](https://colab.research.google.com/drive/1YORPWx4okIHXnjW7MSAidXN29mPVNT7F?usp=sharing) or the 3 billion parameter [BLOOM model](https://colab.research.google.com/drive/1qOjXfQIAULfKvZqwCen8-MoWKGdSatZ4?usp=sharing) for inference on Google Colab's free tier GPUs!

-[![Open In Colab: T5-11b demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1YORPWx4okIHXnjW7MSAidXN29mPVNT7F?usp=sharing)
+</Tip>

-Or this demo for BLOOM-3B:
+## 🤗 Optimum

-[![Open In Colab: BLOOM-3b demo](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1qOjXfQIAULfKvZqwCen8-MoWKGdSatZ4?usp=sharing)
+<Tip>

-## Advanced usage: mixing FP4 (or Int8) and BetterTransformer
+Learn more details about using ORT with 🤗 Optimum in the [Accelerated inference on NVIDIA GPUs](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/gpu#accelerated-inference-on-nvidia-gpus) guide. This section only provides a brief and simple example.

-You can combine the different methods described above to get the best performance for your model. For example, you can use BetterTransformer with FP4 mixed-precision inference + flash attention:
+</Tip>
+
+ONNX Runtime (ORT) is a model accelerator that supports accelerated inference on Nvidia GPUs. ORT uses optimization techniques like fusing common operations into a single node and constant folding to reduce the number of computations performed and speedup inference. ORT also places the most computationally intensive operations on the GPU and the rest on the CPU to intelligently distribute the workload between the two devices.
+
+ORT is supported by 🤗 Optimum which can be used in 🤗 Transformers. You'll need to use an [`~optimum.onnxruntime.ORTModel`] for the task you're solving, and specify the `provider` parameter which can be set to either [`CUDAExecutionProvider`](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/gpu#cudaexecutionprovider) or [`TensorrtExecutionProvider`](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/gpu#tensorrtexecutionprovider). If you want to load a model that was not yet exported to ONNX, you can set `export=True` to convert your model on-the-fly to the ONNX format :
+
+```py
+from optimum.onnxruntime import ORTModelForSequenceClassification
+
+ort_model = ORTModelForSequenceClassification.from_pretrained(
+  "distilbert-base-uncased-finetuned-sst-2-english",
+  export=True,
+  provider="CUDAExecutionProvider",
+)
+```
+
+Now you're free to use the model for inference:
+
+```py
+from optimum.pipelines import pipeline
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-finetuned-sst-2-english")
+
+pipeline = pipeline(task="text-classification", model=ort_model, tokenizer=tokenizer, device="cuda:0")
+result = pipeline("Both the music and visual were astounding, not to mention the actors performance.")
+```
+
+## Combine optimizations
+
+It is often possible to combine several of the optimization techniques described above to get the best inference performance possible for your model. For example, you can load a model in 4-bit, and then enable BetterTransformer with FlashAttention:

 ```py
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

+# load model in 4-bit
 quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16
@ -412,9 +323,13 @@ quantization_config = BitsAndBytesConfig(
 tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
 model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", quantization_config=quantization_config)

+# enable BetterTransformer
+model = model.to_bettertransformer()
+
 input_text = "Hello my dog is cute and"
 inputs = tokenizer(input_text, return_tensors="pt").to("cuda")

+# enable FlashAttention
 with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
    outputs = model.generate(**inputs)

--- a/docs/source/en/perf_train_gpu_many.md
+++ b/docs/source/en/perf_train_gpu_many.md
@ -15,143 +15,154 @@ rendered properly in your Markdown viewer.

 # Efficient Training on Multiple GPUs

-When training on a single GPU is too slow or the model weights don't fit in a single GPUs memory we use a multi-GPU setup. Switching from a single GPU to multiple requires some form of parallelism as the work needs to be distributed. There are several techniques to achieve parallism such as data, tensor, or pipeline parallism. However, there is no one solution to fit them all and which settings works best depends on the hardware you are running on. While the main concepts most likely will apply to any other framework, this article is focused on PyTorch-based implementations.
+If training a model on a single GPU is too slow or if the model's weights do not fit in a single GPU's memory, transitioning 
+to a multi-GPU setup may be a viable option. Prior to making this transition, thoroughly explore all the strategies covered 
+in the [Methods and tools for efficient training on a single GPU](perf_train_gpu_one) as they are universally applicable 
+to model training on any number of GPUs. Once you have employed those strategies and found them insufficient for your 
+case on a single GPU, consider moving to multiple GPUs.
+
+Transitioning from a single GPU to multiple GPUs requires the introduction of some form of parallelism, as the workload 
+must be distributed across the resources. Multiple techniques can be employed to achieve parallelism, such as data 
+parallelism, tensor parallelism, and pipeline parallelism. It's important to note that there isn't a one-size-fits-all 
+solution, and the optimal settings depend on the specific hardware configuration you are using. 
+
+This guide offers an in-depth overview of individual types of parallelism, as well as guidance on ways to combine   
+techniques and choosing an appropriate approach. For step-by-step tutorials on distributed training, please refer to
+the [🤗 Accelerate documentation](https://huggingface.co/docs/accelerate/index). 

 <Tip>

- Note: Most of the strategies introduced in the [single GPU section](perf_train_gpu_one) (such as mixed precision training or gradient accumulation) are generic and apply to training models in general so make sure to have a look at it before diving into the following sections such as multi-GPU or CPU training.
+While the main concepts discussed in this guide are likely applicable across frameworks, here we focus on 
+PyTorch-based implementations.

 </Tip>

-We will first discuss in depth various 1D parallelism techniques and their pros and cons and then look at how they can be combined into 2D and 3D parallelism to enable an even faster training and to support even bigger models. Various other powerful alternative approaches will be presented.
+Before diving deeper into the specifics of each technique, let's go over the rough decision process when training 
+large models on a large infrastructure.

-## Concepts
+## Scalability strategy

-The following is the brief description of the main concepts that will be described later in depth in this document.
+Begin by estimating how much vRAM is required to train your model. For models hosted on the 🤗 Hub, use our 
+[Model Memory Calculator](https://huggingface.co/spaces/hf-accelerate/model-memory-usage), which gives you 
+accurate calculations within a few percent margin.  

-1. **DataParallel (DP)** - the same setup is replicated multiple times, and each being fed a slice of the data. The processing is done in parallel and all setups are synchronized at the end of each training step.
-2. **TensorParallel (TP)** - each tensor is split up into multiple chunks, so instead of having the whole tensor reside on a single gpu, each shard of the tensor resides on its designated gpu. During processing each shard gets processed separately and in parallel on different GPUs and the results are synced at the end of the step. This is what one may call horizontal parallelism, as the splitting happens on horizontal level.
-3. **PipelineParallel (PP)** - the model is split up vertically (layer-level) across multiple GPUs, so that only one or several layers of the model are places on a single gpu. Each gpu processes in parallel different stages of the pipeline and working on a small chunk of the batch.
-4. **Zero Redundancy Optimizer (ZeRO)** - Also performs sharding of the tensors somewhat similar to TP, except the whole tensor gets reconstructed in time for a forward or backward computation, therefore the model doesn't need to be modified. It also supports various offloading techniques to compensate for limited GPU memory.
-5. **Sharded DDP** - is another name for the foundational ZeRO concept as used by various other implementations of ZeRO.
+**Parallelization strategy for a single Node / multi-GPU setup**

-Before diving deeper into the specifics of each concept we first have a look at the rough decision process when training large models on a large infrastructure.
+When training a model on a single node with multiple GPUs, your choice of parallelization strategy can significantly 
+impact performance. Here's a breakdown of your options:

-## Scalability Strategy
+**Case 1: Your model fits onto a single GPU**

-**⇨ Single Node / Multi-GPU**
-* Model fits onto a single GPU:
+If your model can comfortably fit onto a single GPU, you have two primary options:

-    1. DDP - Distributed DP
-    2. ZeRO - may or may not be faster depending on the situation and configuration used
+1. DDP - Distributed DataParallel
+2. ZeRO - depending on the situation and configuration used, this method may or may not be faster, however, it's worth experimenting with it.

-* Model doesn't fit onto a single GPU:
+**Case 2: Your model doesn't fit onto a single GPU:**

-    1. PP
-    2. ZeRO
-    3. TP
+If your model is too large for a single GPU, you have several alternatives to consider:

-    With very fast intra-node connectivity of NVLINK or NVSwitch all three should be mostly on par, without these PP will be faster than TP or ZeRO. The degree of TP may also make a difference. Best to experiment to find the winner on your particular setup.
+1. PipelineParallel (PP)
+2. ZeRO
+3. TensorParallel (TP)

-    TP is almost always used within a single node. That is TP size <= gpus per node.
+With very fast inter-node connectivity (e.g., NVLINK or NVSwitch) all three strategies (PP, ZeRO, TP) should result in 
+similar performance. However, without these, PP will be faster than TP or ZeRO. The degree of TP may also 
+make a difference. It's best to experiment with your specific setup to determine the most suitable strategy.

-* Largest Layer not fitting into a single GPU:
+TP is almost always used within a single node. That is TP size <= GPUs per node.

-    1. If not using ZeRO - must use TP, as PP alone won't be able to fit.
-    2. With ZeRO see the same entry for "Single GPU" above
+**Case 3: Largest layer of your model does not fit onto a single GPU**

+1. If you are not using ZeRO, you have to use TensorParallel (TP), because PipelineParallel (PP) alone won't be sufficient to accommodate the large layer.
+2. If you are using ZeRO, additionally adopt techniques from the [Methods and tools for efficient training on a single GPU](perf_train_gpu_one).

-**⇨ Multi-Node / Multi-GPU**
+**Parallelization strategy for a multi-Node / multi-GPU setup**

-* When you have fast inter-node connectivity:
+* When you have fast inter-node connectivity (e.g., NVLINK or NVSwitch) consider using one of these options:

    1. ZeRO - as it requires close to no modifications to the model
-    2. PP+TP+DP - less communications, but requires massive changes to the model
+    2. A combination of PipelineParallel(PP) with TensorParallel(TP) and DataParallel(DP) - this approach will result in fewer communications, but requires significant changes to the model

-* when you have slow inter-node connectivity and still low on GPU memory:
-
-    1. DP+PP+TP+ZeRO-1
+* When you have slow inter-node connectivity and still low on GPU memory:

+    1. Employ a combination of DataParallel(DP) with PipelineParallel(PP), TensorParallel(TP), and ZeRO.

+In the following sections of this guide we dig deeper into how these different parallelism methods work.

 ## Data Parallelism

-Most users with just 2 GPUs already enjoy the increased training speed up thanks to `DataParallel` (DP) and `DistributedDataParallel` (DDP) that are almost trivial to use. This is a built-in feature of Pytorch. Note that in general it is advised to use DDP as it is better maintained and works for all models while DP might fail for some models. [PyTorch documentation](https://pytorch.org/docs/master/generated/torch.nn.DataParallel.html) itself recommends the use of DDP.
+Even with only 2 GPUs, you can readily leverage the accelerated training capabilities offered by PyTorch's built-in features, 
+such as `DataParallel` (DP) and `DistributedDataParallel` (DDP). Note that 
+[PyTorch documentation](https://pytorch.org/docs/master/generated/torch.nn.DataParallel.html) recommends to prefer 
+`DistributedDataParallel` (DDP) over `DataParallel` (DP) for multi-GPU training as it works for all models.
+Let's take a look at how these two methods work and what makes them different.

-### DP vs DDP
+### DataParallel vs DistributedDataParallel

-`DistributedDataParallel` (DDP) is typically faster than `DataParallel` (DP), but it is not always the case:
-* while DP is python threads-based, DDP is multiprocess-based - and as such it has no python threads limitations, such as GIL
-* on the other hand a slow inter-connectivity between the GPU cards could lead to an actual slower outcome with DDP
-
-Here are the main differences in the inter-GPU communication overhead between the two modes:
+To understand the key differences in inter-GPU communication overhead between the two methods, let's review the processes per batch:

 [DDP](https://pytorch.org/docs/master/notes/ddp.html):

- At the start time the main process replicates the model once from gpu 0 to the rest of gpus
+- At the start time the main process replicates the model once from GPU 0 to the rest of GPUs
 - Then for each batch:
-   1. each gpu consumes each own mini-batch of data directly
-   2. during `backward`, once the local gradients are ready, they are then averaged across all processes
+   1. Each GPU directly consumes its mini-batch of data.
+   2. During `backward`, once the local gradients are ready, they are averaged across all processes.

 [DP](https://pytorch.org/docs/master/generated/torch.nn.DataParallel.html):

 For each batch:
-   1. gpu 0 reads the batch of data and then sends a mini-batch to each gpu
-   2. replicates the up-to-date model from gpu 0 to each gpu
-   3. runs `forward` and sends output from each gpu to gpu 0, computes loss
-   4. scatters loss from gpu 0 to all gpus, runs `backward`
-   5. sends gradients from each gpu to gpu 0 and averages those
+   1. GPU 0 reads the batch of data and then sends a mini-batch to each GPU.
+   2. The up-to-date model is replicated from GPU 0 to each GPU. 
+   3. `forward` is executed, and output from each GPU is sent to GPU 0 to compute the loss.
+   4. The loss is distributed from GPU 0 to all GPUs, and `backward` is run. 
+   5. Gradients from each GPU are sent to GPU 0 and averaged. 

-The only communication DDP performs per batch is sending gradients, whereas DP does 5 different data exchanges per batch.
+Key differences include:
+1. DDP performs only a single communication per batch - sending gradients, while DP performs five different data exchanges per batch.
+DDP copies data using [torch.distributed](https://pytorch.org/docs/master/distributed.html), while DP copies data within 
+the process via Python threads (which introduces limitations associated with GIL). As a result, **`DistributedDataParallel` (DDP) is generally faster than `DataParallel` (DP)** unless you have slow GPU card inter-connectivity.
+2. Under DP, GPU 0 performs significantly more work than other GPUs, resulting in GPU under-utilization. 
+3. DDP supports distributed training across multiple machines, whereas DP does not.

-DP copies data within the process via python threads, whereas DDP copies data via [torch.distributed](https://pytorch.org/docs/master/distributed.html).
+This is not an exhaustive list of differences between DP and DDP, however, other nuances are out of scope of this guide.
+You can get a deeper understanding of these methods by reading this [article](https://www.telesens.co/2019/04/04/distributed-data-parallel-training-using-pytorch-on-aws/).

-Under DP gpu 0 performs a lot more work than the rest of the gpus, thus resulting in under-utilization of gpus.
+Let's illustrate the differences between DP and DDP with an experiment. We'll benchmark the differences between DP and 
+DDP with an added context of NVLink presence:  

-You can use DDP across multiple machines, but this is not the case with DP.
+* Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`).
+* Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0`.

-There are other differences between DP and DDP but they aren't relevant to this discussion.
+To disable the NVLink feature on one of the benchmarks, we use `NCCL_P2P_DISABLE=1`. 

-If you want to go really deep into understanding these 2 modes, this [article](https://www.telesens.co/2019/04/04/distributed-data-parallel-training-using-pytorch-on-aws/) is highly recommended, as it has great diagrams, includes multiple benchmarks and profiler outputs on various hardware, explains all the nuances that you may need to know.
+Here is the benchmarking code and outputs:

-Let's look at an actual benchmark:
-
-| Type   | NVlink | Time |
-| :----- | -----  | ---: |
-| 2:DP   | Y      | 110s |
-| 2:DDP  | Y      | 101s |
-| 2:DDP  | N      | 131s |
-
-
-Analysis:
-
-Here DP is ~10% slower than DDP w/ NVlink, but ~15% faster than DDP w/o NVlink
-
-The real difference will depend on how much data each GPU needs to sync with the others - the more there is to sync, the more a slow link will slow down the total runtime.
-
-Here is the full benchmark code and outputs:
-
-`NCCL_P2P_DISABLE=1` was used to disable the NVLink feature on the corresponding benchmark.
+**DP**

 ```
-
-# DP
 rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \
 python examples/pytorch/language-modeling/run_clm.py \
 --model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
 --do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200

 {'train_runtime': 110.5948, 'train_samples_per_second': 1.808, 'epoch': 0.69}
+```

-# DDP w/ NVlink
+**DDP w/ NVlink**
+
+```
 rm -r /tmp/test-clm; CUDA_VISIBLE_DEVICES=0,1 \
 python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
 --model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
 --do_train --output_dir /tmp/test-clm --per_device_train_batch_size 4 --max_steps 200

 {'train_runtime': 101.9003, 'train_samples_per_second': 1.963, 'epoch': 0.69}
+```

-# DDP w/o NVlink
+**DDP w/o NVlink**
+
+```
 rm -r /tmp/test-clm; NCCL_P2P_DISABLE=1 CUDA_VISIBLE_DEVICES=0,1 \
 python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-modeling/run_clm.py \
 --model_name_or_path gpt2 --dataset_name wikitext --dataset_config_name wikitext-2-raw-v1 \
@ -160,17 +171,34 @@ python -m torch.distributed.launch --nproc_per_node 2 examples/pytorch/language-
 {'train_runtime': 131.4367, 'train_samples_per_second': 1.522, 'epoch': 0.69}
 ```

-Hardware: 2x TITAN RTX 24GB each + NVlink with 2 NVLinks (`NV2` in `nvidia-smi topo -m`)
-Software: `pytorch-1.8-to-be` + `cuda-11.0` / `transformers==4.3.0.dev0`
+Here are the same benchmarking results gathered in a table for convenience:
+
+| Type   | NVlink | Time |
+| :----- | -----  | ---: |
+| 2:DP   | Y      | 110s |
+| 2:DDP  | Y      | 101s |
+| 2:DDP  | N      | 131s |
+
+As you can see, in this case DP is ~10% slower than DDP with NVlink, but ~15% faster than DDP without NVlink.
+The real difference will depend on how much data each GPU needs to sync with the others - the more there is to sync, 
+the more a slow link will impede the overall runtime.

 ## ZeRO Data Parallelism

-ZeRO-powered data parallelism (ZeRO-DP) is described on the following diagram from this [blog post](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/)
-![DeepSpeed-Image-1](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-zero.png)
+ZeRO-powered data parallelism (ZeRO-DP) is illustrated in the following diagram from this [blog post](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/).

-It can be difficult to wrap one's head around it, but in reality the concept is quite simple. This is just the usual `DataParallel` (DP), except, instead of replicating the full model params, gradients and optimizer states, each GPU stores only a slice of it.  And then at run-time when the full layer params are needed just for the given layer, all GPUs synchronize to give each other parts that they miss - this is it.
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-zero.png" alt="DeepSpeed-Image-1"/>
+ </div>
+
+While it may appear complex, it is a very similar concept to `DataParallel` (DP). The difference is that instead of 
+replicating the full model parameters, gradients and optimizer states, each GPU stores only a slice of it. Then, at 
+run-time when the full layer parameters are needed just for the given layer, all GPUs synchronize to give each other 
+parts that they miss.
+
+To illustrate this idea, consider a simple model with 3 layers (La, Lb, and Lc), where each layer has 3 parameters. 
+Layer La, for example, has weights a0, a1 and a2:

-Consider this simple model with 3 layers, where each layer has 3 params:
 ```
 La | Lb | Lc
 ---|----|---
@ -178,9 +206,8 @@ a0 | b0 | c0
 a1 | b1 | c1
 a2 | b2 | c2
 ```
-Layer La has weights a0, a1 and a2.

-If we have 3 GPUs, the Sharded DDP (= Zero-DP) splits the model onto 3 GPUs like so:
+If we have 3 GPUs, ZeRO-DP splits the model onto 3 GPUs like so:

 ```
 GPU0:
@ -199,165 +226,213 @@ La | Lb | Lc
 a2 | b2 | c2
 ```

-In a way this is the same horizontal slicing, as tensor parallelism, if you imagine the typical DNN diagram. Vertical slicing is where one puts whole layer-groups on different GPUs. But it's just the starting point.
+In a way, this is the same horizontal slicing as tensor parallelism, as opposed to Vertical 
+slicing, where one puts whole layer-groups on different GPUs. Now let's see how this works: 
+
+Each of these GPUs will get the usual mini-batch as it works in DP:

-Now each of these GPUs will get the usual mini-batch as it works in DP:
 ```
 x0 => GPU0
 x1 => GPU1
 x2 => GPU2
 ```

-The inputs are unmodified - they think they are going to be processed by the normal model.
+The inputs are passed without modifications as if they would be processed by the original model.

-First, the inputs hit the layer La.
+First, the inputs get to the layer `La`. What happens at this point?

-Let's focus just on GPU0: x0 needs a0, a1, a2 params to do its forward path, but GPU0 has only a0 - it gets sent a1 from GPU1 and a2 from GPU2, bringing all pieces of the model together.
+On GPU0: the x0 mini-batch requires the a0, a1, a2 parameters to do its forward path through the layer, but the GPU0 has only a0. 
+It will get a1 from GPU1 and a2 from GPU2, bringing all the pieces of the model together.

-In parallel, GPU1 gets mini-batch x1 and it only has a1, but needs a0 and a2 params, so it gets those from GPU0 and GPU2.
+In parallel, GPU1 gets another mini-batch - x1. GPU1 has the a1 parameter, but needs a0 and a2, so it gets those from GPU0 and GPU2.
+Same happens to GPU2 that gets the mini-batch x2. It gets a0 and a1 from GPU0 and GPU1.

-Same happens to GPU2 that gets input x2. It gets a0 and a1 from GPU0 and GPU1, and with its a2 it reconstructs the full tensor.
+This way each of the 3 GPUs gets the full tensors reconstructed and makes a forward pass with its own mini-batch.
+As soon as the calculation is done, the data that is no longer needed gets dropped - it's only used during the calculation. 
+The reconstruction is done efficiently via a pre-fetch.

-All 3 GPUs get the full tensors reconstructed and a forward happens.
+Then the whole process is repeated for layer Lb, then Lc forward-wise, and then backward Lc -> Lb -> La.

-As soon as the calculation is done, the data that is no longer needed gets dropped - it's only used during the calculation. The reconstruction is done efficiently via a pre-fetch.
+<Tip>

-And the whole process is repeated for layer Lb, then Lc forward-wise, and then backward Lc -> Lb -> La.
+This mechanism is similar to an efficient group backpacking strategy: person A carries the tent, person B carries the stove,
+and person C carries the axe. Each night they all share what they have with others and get from others what they don't have, 
+and in the morning they pack up their allocated type of gear and continue on their way. This is what ZeRO DP/Sharded DDP is.
+Compare this strategy to the simple one where each person has to carry their own tent, stove and axe (similar to 
+DataParallel (DP and DDP) in PyTorch), which would be far more inefficient. 

-To me this sounds like an efficient group backpacking weight distribution strategy:
-
-1. person A carries the tent
-2. person B carries the stove
-3. person C carries the axe
-
-Now each night they all share what they have with others and get from others what they don't have, and in the morning they pack up their allocated type of gear and continue on their way. This is Sharded DDP / Zero DP.
-
-Compare this strategy to the simple one where each person has to carry their own tent, stove and axe, which would be far more inefficient. This is DataParallel (DP and DDP) in Pytorch.
+</Tip>

 While reading the literature on this topic you may encounter the following synonyms: Sharded, Partitioned.
-
-If you pay close attention the way ZeRO partitions the model's weights - it looks very similar to tensor parallelism which will be discussed later. This is because it partitions/shards each layer's weights, unlike vertical model parallelism which is discussed next.
+If you pay close attention the way ZeRO partitions the model's weights - it looks very similar to tensor parallelism 
+which will be discussed later. This is because it partitions/shards each layer's weights, unlike vertical model parallelism 
+which is discussed next.

 Implementations:

 - [DeepSpeed](https://www.deepspeed.ai/features/#the-zero-redundancy-optimizer) ZeRO-DP stages 1+2+3
+- [`Accelerate` integration](https://huggingface.co/docs/accelerate/en/usage_guides/deepspeed) 
 - [`transformers` integration](main_classes/trainer#trainer-integrations)

-## Naive Model Parallelism (Vertical) and Pipeline Parallelism
+## From Naive Model Parallelism to Pipeline Parallelism

-Naive Model Parallelism (MP) is where one spreads groups of model layers across multiple GPUs. The mechanism is relatively simple - switch the desired layers `.to()` the desired devices and now whenever the data goes in and out those layers switch the data to the same device as the layer and leave the rest unmodified.
+To explain Pipeline parallelism, we'll first look into Naive Model Parallelism (MP), also known as Vertical MP. This approach
+involves distributing groups of model layers across multiple GPUs by assigning specific layers to specific GPUs with `.to()`. 
+As data flows through these layers, it is moved to the same GPU as the layer, while the other layers remain untouched.

-We refer to it as Vertical MP, because if you remember how most models are drawn, we slice the layers vertically. For example, if the following diagram shows an 8-layer model:
+We refer to this Model parallelism as "Vertical" because of how models are typically visualized. For example, the 
+following diagram shows an 8-layer model split vertically into two slices, placing layers 0-3 onto 
+GPU0 and 4-7 to GPU1:

 ```
 ===================  ===================
 |  0 | 1 | 2 | 3  |  |  4 | 5 | 6 | 7  |
 ===================  ===================
-        gpu0                 gpu1
+        GPU0                 GPU1
 ```
-we just sliced it in 2 vertically, placing layers 0-3 onto GPU0 and 4-7 to GPU1.

-Now while data travels from layer 0 to 1, 1 to 2 and 2 to 3 this is just the normal model. But when data needs to pass from layer 3 to layer 4 it needs to travel from GPU0 to GPU1 which introduces a communication overhead. If the participating GPUs are on the same compute node (e.g. same physical machine) this copying is pretty fast, but if the GPUs are located on different compute nodes (e.g. multiple machines) the communication overhead could be significantly larger.
+In this example, when data moves from layer 0 to 3, it's no different from regular forward pass. However, passing data 
+from layer 3 to 4 requires moving it from GPU0 to GPU1, introducing a communication overhead. If the participating 
+GPUs are on the same compute node (e.g. same physical machine) this copying is fast, but if the GPUs are distributed 
+across different compute nodes (e.g. multiple machines), the communication overhead could be substantially greater.

-Then layers 4 to 5 to 6 to 7 are as a normal model would have and when the 7th layer completes we often need to send the data back to layer 0 where the labels are (or alternatively send the labels to the last layer). Now the loss can be computed and the optimizer can do its work.
+Following that, layers 4 to 7 work as they would in the original model. Upon completion of the 7th layer, there is often 
+a need to send the data back to layer 0 where the labels are (or alternatively send the labels to the last layer). Now the loss can be 
+computed and the optimizer can do its work.

-Problems:
- the main deficiency and why this one is called "naive" MP, is that all but one GPU is idle at any given moment. So if 4 GPUs are used, it's almost identical to quadrupling the amount of memory of a single GPU, and ignoring the rest of the hardware. Plus there is the overhead of copying the data between devices. So 4x 6GB cards will be able to accommodate the same size as 1x 24GB card using naive MP, except the latter will complete the training faster, since it doesn't have the data copying overhead. But, say, if you have 40GB cards and need to fit a 45GB model you can with 4x 40GB cards (but barely because of the gradient and optimizer states)
- shared embeddings may need to get copied back and forth between GPUs.
+Naive Model Parallelism comes several shortcomings:
+- **All but one GPU are idle at any given moment**: if 4 GPUs are used, it's nearly identical to quadrupling the amount of memory of a single GPU, and ignoring the rest of the hardware. 
+- **Overhead in data transfer between devices**:  E.g. 4x 6GB cards will be able to accommodate the same size as 1x 24GB card using naive MP, but a single 24GB card will complete the training faster, because it doesn't have the data copying overhead. But, say, if you have 40GB cards and need to fit a 45GB model you can with 4x 40GB cards (but barely because of the gradient and optimizer states)
+- **Copying shared embeddings**: Shared embeddings may need to get copied back and forth between GPUs.

-Pipeline Parallelism (PP) is almost identical to a naive MP, but it solves the GPU idling problem, by chunking the incoming batch into micro-batches and artificially creating a pipeline, which allows different GPUs to concurrently participate in the computation process.
+Now that you are familiar with how the naive approach to model parallelism works and its shortcomings, let's look at Pipeline Parallelism (PP).
+PP is almost identical to a naive MP, but it solves the GPU idling problem by chunking the incoming batch into micro-batches 
+and artificially creating a pipeline, which allows different GPUs to concurrently participate in the computation process.

-The following illustration from the [GPipe paper](https://ai.googleblog.com/2019/03/introducing-gpipe-open-source-library.html) shows the naive MP on the top, and PP on the bottom:
+The following illustration from the [GPipe paper](https://ai.googleblog.com/2019/03/introducing-gpipe-open-source-library.html) 
+shows the naive MP on the top, and PP on the bottom:

-![mp-pp](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-gpipe-bubble.png)
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-gpipe-bubble.png" alt="MP vs PP"/>
+</div>

-It's easy to see from the bottom diagram how PP has less dead zones, where GPUs are idle. The idle parts are referred to as the "bubble".
+At the bottom of the diagram, you can observe that the Pipeline Parallelism (PP) approach minimizes the number of idle 
+GPU zones, referred to as 'bubbles'. Both parts of the diagram show a parallelism level of degree 4, meaning that 4 GPUs 
+are involved in the pipeline. You can see that there's a forward path of 4 pipe stages (F0, F1, F2 and F3) followed by 
+a backward path in reverse order (B3, B2, B1, and B0).

-Both parts of the diagram show a parallelism that is of degree 4. That is 4 GPUs are participating in the pipeline. So there is the forward path of 4 pipe stages F0, F1, F2 and F3 and then the return reverse order backward path of B3, B2, B1 and B0.
+PP introduces a new hyperparameter to tune - `chunks`, which determines how many data chunks are sent in a sequence 
+through the same pipe stage. For example, in the bottom diagram you can see `chunks=4`. GPU0 performs the same 
+forward path on chunk 0, 1, 2 and 3 (F0,0, F0,1, F0,2, F0,3) and then it waits for other GPUs to do complete their work. 
+Only when the other GPUs begin to complete their work, GPU0 starts to work again doing the backward path for chunks 
+3, 2, 1 and 0 (B0,3, B0,2, B0,1, B0,0).

-PP introduces a new hyper-parameter to tune and it's `chunks` which defines how many chunks of data are sent in a sequence through the same pipe stage. For example, in the bottom diagram you can see that `chunks=4`. GPU0 performs the same forward path on chunk 0, 1, 2 and 3 (F0,0, F0,1, F0,2, F0,3) and then it waits for other GPUs to do their work and only when their work is starting to be complete, GPU0 starts to work again doing the backward path for chunks 3, 2, 1 and 0 (B0,3, B0,2, B0,1, B0,0).
+Note that this is the same concept as gradient accumulation steps. PyTorch uses `chunks`, while DeepSpeed refers 
+to the same hyperparameter as gradient accumulation steps.

-Note that conceptually this is the same concept as gradient accumulation steps (GAS). Pytorch uses `chunks`, whereas DeepSpeed refers to the same hyper-parameter as GAS.
+Because of the chunks, PP introduces the notion of micro-batches (MBS). DP splits the global data batch size into 
+mini-batches, so if you have a DP degree of 4, a global batch size of 1024 gets split up into 4 mini-batches of 
+256 each (1024/4). And if the number of `chunks` (or GAS) is 32 we end up with a micro-batch size of 8 (256/32). Each 
+Pipeline stage works with a single micro-batch at a time. To calculate the global batch size of the DP + PP setup, 
+use the formula: `mbs * chunks * dp_degree` (`8 * 32 * 4 = 1024`).
+With `chunks=1` you end up with the naive MP, which is inefficient. With a large `chunks` value you end up with 
+tiny micro-batch sizes which is also inefficient. For this reason, we encourage to experiment with the `chunks` value to 
+find the one that leads to the most efficient GPUs utilization.

-Because of the chunks, PP introduces the concept of micro-batches (MBS). DP splits the global data batch size into mini-batches, so if you have a DP degree of 4, a global batch size of 1024 gets split up into 4 mini-batches of 256 each (1024/4). And if the number of `chunks` (or GAS) is 32 we end up with a micro-batch size of 8 (256/32). Each Pipeline stage works with a single micro-batch at a time.
+You may notice a bubble of "dead" time on the diagram that can't be parallelized because the last `forward` stage 
+has to wait for `backward` to complete the pipeline. The purpose of finding the best value for `chunks` is to enable a high 
+concurrent GPU utilization across all participating GPUs which translates to minimizing the size of the bubble.

-To calculate the global batch size of the DP + PP setup we then do: `mbs*chunks*dp_degree` (`8*32*4=1024`).
-
-Let's go back to the diagram.
-
-With `chunks=1` you end up with the naive MP, which is very inefficient. With a very large `chunks` value you end up with tiny micro-batch sizes which could be not every efficient either. So one has to experiment to find the value that leads to the highest efficient utilization of the gpus.
-
-While the diagram shows that there is a bubble of "dead" time that can't be parallelized because the last `forward` stage has to wait for `backward` to complete the pipeline, the purpose of finding the best value for `chunks` is to enable a high concurrent GPU utilization across all participating GPUs which translates to minimizing the size of the bubble.
-
-There are 2 groups of solutions - the traditional Pipeline API and the more modern solutions that make things much easier for the end user.
-
-Traditional Pipeline API solutions:
+Pipeline API solutions have been implemented in:
 - PyTorch
 - DeepSpeed
 - Megatron-LM

-Modern solutions:
+These come with some shortcomings:
+- They have to modify the model quite heavily, because Pipeline requires one to rewrite the normal flow of modules into a `nn.Sequential` sequence of the same, which may require changes to the design of the model.
+- Currently the Pipeline API is very restricted. If you had a bunch of Python variables being passed in the very first stage of the Pipeline, you will have to find a way around it. Currently, the pipeline interface requires either a single Tensor or a tuple of Tensors as the only input and output. These tensors must have a batch size as the very first dimension, since pipeline is going to chunk the mini batch into micro-batches. Possible improvements are being discussed here https://github.com/pytorch/pytorch/pull/50693
+- Conditional control flow at the level of pipe stages is not possible - e.g., Encoder-Decoder models like T5 require special workarounds to handle a conditional encoder stage.
+- They have to arrange each layer so that the output of one layer becomes an input to the other layer.
+
+More recent solutions include:
 - Varuna
 - Sagemaker

-Problems with traditional Pipeline API solutions:
- have to modify the model quite heavily, because Pipeline requires one to rewrite the normal flow of modules into a `nn.Sequential` sequence of the same, which may require changes to the design of the model.
- currently the Pipeline API is very restricted. If you had a bunch of python variables being passed in the very first stage of the Pipeline, you will have to find a way around it. Currently, the pipeline interface requires either a single Tensor or a tuple of Tensors as the only input and output. These tensors must have a batch size as the very first dimension, since pipeline is going to chunk the mini batch into micro-batches. Possible improvements are being discussed here https://github.com/pytorch/pytorch/pull/50693
- conditional control flow at the level of pipe stages is not possible - e.g., Encoder-Decoder models like T5 require special workarounds to handle a conditional encoder stage.
- have to arrange each layer so that the output of one model becomes an input to the other model.
-
-We are yet to experiment with Varuna and SageMaker but their papers report that they have overcome the list of problems mentioned above and that they require much smaller changes to the user's model.
+We have not experimented with Varuna and SageMaker but their papers report that they have overcome the list of problems 
+mentioned above and that they require smaller changes to the user's model.

 Implementations:
- [Pytorch](https://pytorch.org/docs/stable/pipeline.html) (initial support in pytorch-1.8, and progressively getting improved in 1.9 and more so in 1.10). Some [examples](https://github.com/pytorch/pytorch/blob/master/benchmarks/distributed/pipeline/pipe.py)
+- [PyTorch](https://pytorch.org/docs/stable/pipeline.html) (initial support in pytorch-1.8, and progressively getting improved in 1.9 and more so in 1.10). Some [examples](https://github.com/pytorch/pytorch/blob/master/benchmarks/distributed/pipeline/pipe.py)
 - [DeepSpeed](https://www.deepspeed.ai/tutorials/pipeline/)
 - [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation - no API.
 - [Varuna](https://github.com/microsoft/varuna)
 - [SageMaker](https://arxiv.org/abs/2111.05972) - this is a proprietary solution that can only be used on AWS.
 - [OSLO](https://github.com/tunib-ai/oslo) - this is implemented based on the Hugging Face Transformers.

-🤗 Transformers status: as of this writing none of the models supports full-PP. GPT2 and T5 models have naive MP support. The main obstacle is being unable to convert the models to `nn.Sequential` and have all the inputs to be Tensors. This is because currently the models include many features that make the conversion very complicated, and will need to be removed to accomplish that.
+🤗 Transformers status: as of this writing none of the models supports full-PP. GPT2 and T5 models have naive MP support. 
+The main obstacle is being unable to convert the models to `nn.Sequential` and have all the inputs to be Tensors. This 
+is because currently the models include many features that make the conversion very complicated, and will need to be removed to accomplish that.
+
+DeepSpeed and Megatron-LM integrations are available in [🤗 Accelerate](https://huggingface.co/docs/accelerate/main/en/usage_guides/deepspeed)

 Other approaches:

 DeepSpeed, Varuna and SageMaker use the concept of an [Interleaved Pipeline](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-core-features.html)
-![interleaved-pipeline-execution](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-sagemaker-interleaved-pipeline.png)

-Here the bubble (idle time) is further minimized by prioritizing backward passes.
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-sagemaker-interleaved-pipeline.png" alt="Interleaved pipeline execution"/>
+</div>

-Varuna further tries to improve the schedule by using simulations to discover the most efficient scheduling.
+Here the bubble (idle time) is further minimized by prioritizing backward passes. Varuna further attempts to improve the 
+schedule by using simulations to discover the most efficient scheduling.

-OSLO has pipeline parallelism implementation based on the Transformers without `nn.Sequential` converting.
+OSLO has pipeline parallelism implementation based on the Transformers without `nn.Sequential` conversion.

 ## Tensor Parallelism

-In Tensor Parallelism each GPU processes only a slice of a tensor and only aggregates the full tensor for operations that require the whole thing.
-
-In this section we use concepts and diagrams from the [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) paper: [Efficient Large-Scale Language Model Training on GPU Clusters](https://arxiv.org/abs/2104.04473).
+In Tensor Parallelism, each GPU processes a slice of a tensor and only aggregates the full tensor for operations requiring it.
+To describe this method, this section of the guide relies on the concepts and diagrams from the [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) 
+paper: [Efficient Large-Scale Language Model Training on GPU Clusters](https://arxiv.org/abs/2104.04473).

 The main building block of any transformer is a fully connected `nn.Linear` followed by a nonlinear activation `GeLU`.
+The dot dot-product part of it, following the Megatron's paper notation, can be written as `Y = GeLU(XA)`, where `X` is 
+an input vector, `Y` is the output vector, and `A` is the weight matrix.

-Following the Megatron's paper notation, we can write the dot-product part of it as `Y = GeLU(XA)`, where `X` and `Y` are the input and output vectors, and `A` is the weight matrix.
+If we look at the computation in matrix form, you can see how the matrix multiplication can be split between multiple GPUs:

-If we look at the computation in matrix form, it's easy to see how the matrix multiplication can be split between multiple GPUs:
-![Parallel GEMM](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-parallel_gemm.png)
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-parallel_gemm.png" alt="Parallel GEMM"/>
+</div>

-If we split the weight matrix `A` column-wise across `N` GPUs and perform matrix multiplications `XA_1` through `XA_n` in parallel, then we will end up with `N` output vectors `Y_1, Y_2, ..., Y_n` which can be fed into `GeLU` independently:
-![independent GeLU](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-independent-gelu.png)
+If we split the weight matrix `A` column-wise across `N` GPUs and perform matrix multiplications `XA_1` through `XA_n` in parallel, 
+then we will end up with `N` output vectors `Y_1, Y_2, ..., Y_n` which can be fed into `GeLU` independently:

-Using this principle, we can update an MLP of arbitrary depth, without the need for any synchronization between GPUs until the very end, where we need to reconstruct the output vector from shards. The Megatron-LM paper authors provide a helpful illustration for that:
-![parallel shard processing](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-parallel_shard_processing.png)
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-independent-gelu.png" alt="Independent GeLU"/>
+</div>

-Parallelizing the multi-headed attention layers is even simpler, since they are already inherently parallel, due to having multiple independent heads!
-![parallel self-attention](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-parallel_self_attention.png)
+Using this principle, we can update a multi-layer perceptron of arbitrary depth, without the need for any synchronization 
+between GPUs until the very end, where we need to reconstruct the output vector from shards. The Megatron-LM paper authors 
+provide a helpful illustration for that:

-Special considerations: TP requires very fast network, and therefore it's not advisable to do TP across more than one node. Practically, if a node has 4 GPUs, the highest TP degree is therefore 4. If you need a TP degree of 8, you need to use nodes that have at least 8 GPUs.
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-parallel_shard_processing.png" alt="Parallel shard processing"/>
+</div>
+
+Parallelizing the multi-headed attention layers is even simpler, since they are already inherently parallel, due to having 
+multiple independent heads!
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-tp-parallel_self_attention.png" alt="Parallel self-attention"/>
+</div>
+
+Special considerations: TP requires very fast network, and therefore it's not advisable to do TP across more than one node. 
+Practically, if a node has 4 GPUs, the highest TP degree is therefore 4. If you need a TP degree of 8, you need to use
+nodes that have at least 8 GPUs.

 This section is based on the original much more [detailed TP overview](https://github.com/huggingface/transformers/issues/10321#issuecomment-783543530).
 by [@anton-l](https://github.com/anton-l).

-SageMaker combines TP with DP for a more efficient processing.
-
 Alternative names:
 - DeepSpeed calls it [tensor slicing](https://www.deepspeed.ai/features/#model-parallelism)

@ -367,18 +442,27 @@ Implementations:
 - [SageMaker](https://arxiv.org/abs/2111.05972) - this is a proprietary solution that can only be used on AWS.
 - [OSLO](https://github.com/tunib-ai/oslo) has the tensor parallelism implementation based on the Transformers.

+SageMaker combines TP with DP for a more efficient processing.
+
 🤗 Transformers status:
 - core: not yet implemented in the core
 - but if you want inference [parallelformers](https://github.com/tunib-ai/parallelformers) provides this support for most of our models. So until this is implemented in the core you can use theirs. And hopefully training mode will be supported too.
 - Deepspeed-Inference also supports our BERT, GPT-2, and GPT-Neo models in their super-fast CUDA-kernel-based inference mode, see more [here](https://www.deepspeed.ai/tutorials/inference-tutorial/)

-## DP+PP
+🤗 Accelerate integrates with [TP from Megatron-LM](https://huggingface.co/docs/accelerate/v0.23.0/en/usage_guides/megatron_lm).

-The following diagram from the DeepSpeed [pipeline tutorial](https://www.deepspeed.ai/tutorials/pipeline/) demonstrates how one combines DP with PP.
+## Data Parallelism + Pipeline Parallelism

-![dp-pp-2d](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-zero-dp-pp.png)
+The following diagram from the DeepSpeed [pipeline tutorial](https://www.deepspeed.ai/tutorials/pipeline/) demonstrates 
+how one can combine DP with PP.

-Here it's important to see how DP rank 0 doesn't see GPU2 and DP rank 1 doesn't see GPU3. To DP there is just GPUs 0 and 1 where it feeds data as if there were just 2 GPUs. GPU0 "secretly" offloads some of its load to GPU2 using PP. And GPU1 does the same by enlisting GPU3 to its aid.
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-zero-dp-pp.png" alt="DP + PP-2d"/>
+</div>
+
+Here it's important to see how DP rank 0 doesn't see GPU2 and DP rank 1 doesn't see GPU3. To DP there is just GPUs 0 
+and 1 where it feeds data as if there were just 2 GPUs. GPU0 "secretly" offloads some of its load to GPU2 using PP. 
+And GPU1 does the same by enlisting GPU3 to its aid.

 Since each dimension requires at least 2 GPUs, here you'd need at least 4 GPUs.

@ -391,11 +475,13 @@ Implementations:

 🤗 Transformers status: not yet implemented

-## DP+PP+TP
+## Data Parallelism + Pipeline Parallelism + Tensor Parallelism

 To get an even more efficient training a 3D parallelism is used where PP is combined with TP and DP. This can be seen in the following diagram.

-![dp-pp-tp-3d](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-deepspeed-3d.png)
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-deepspeed-3d.png" alt="dp-pp-tp-3d"/>
+</div>

 This diagram is from a blog post [3D parallelism: Scaling to trillion-parameter models](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/), which is a good read as well.

@ -410,15 +496,22 @@ Implementations:

 🤗 Transformers status: not yet implemented, since we have no PP and TP.

-## ZeRO DP+PP+TP
+## ZeRO Data Parallelism + Pipeline Parallelism + Tensor Parallelism

-One of the main features of DeepSpeed is ZeRO, which is a super-scalable extension of DP. It has already been discussed in [ZeRO Data Parallelism](#zero-data-parallelism). Normally it's a standalone feature that doesn't require PP or TP. But it can be combined with PP and TP.
+One of the main features of DeepSpeed is ZeRO, which is a super-scalable extension of DP. It has already been 
+discussed in [ZeRO Data Parallelism](#zero-data-parallelism). Normally it's a standalone feature that doesn't require PP or TP. 
+But it can be combined with PP and TP.

 When ZeRO-DP is combined with PP (and optionally TP) it typically enables only ZeRO stage 1 (optimizer sharding).

-While it's theoretically possible to use ZeRO stage 2 (gradient sharding) with Pipeline Parallelism, it will have bad performance impacts. There would need to be an additional reduce-scatter collective for every micro-batch to aggregate the gradients before sharding, which adds a potentially significant communication overhead. By nature of Pipeline Parallelism, small micro-batches are used and instead the focus is on trying to balance arithmetic intensity (micro-batch size) with minimizing the Pipeline bubble (number of micro-batches). Therefore those communication costs are going to hurt.
+While it's theoretically possible to use ZeRO stage 2 (gradient sharding) with Pipeline Parallelism, it will have negative 
+performance impacts. There would need to be an additional reduce-scatter collective for every micro-batch to aggregate 
+the gradients before sharding, which adds a potentially significant communication overhead. By nature of Pipeline Parallelism, 
+small micro-batches are used and instead the focus is on trying to balance arithmetic intensity (micro-batch size) with
+minimizing the Pipeline bubble (number of micro-batches). Therefore those communication costs are going to impact the performance.

-In addition, There are already fewer layers than normal due to PP and so the memory savings won't be huge. PP already reduces gradient size by ``1/PP``, and so gradient sharding savings on top of that are less significant than pure DP.
+In addition, there are already fewer layers than normal due to PP and so the memory savings won't be huge. PP already 
+reduces gradient size by ``1/PP``, and so gradient sharding savings on top of that are less significant than pure DP.

 ZeRO stage 3 is not a good choice either for the same reason - more inter-node communications required.

@ -455,7 +548,9 @@ Let's take 10 batches of sequence length 512. If we parallelize them by sample d

 * Operator

-If we perform layer normalization, we compute std first and mean second, and then we can normalize data. Operator parallelism allows computing std and mean in parallel. So if we parallelize them by operator dimension into 2 devices (cuda:0, cuda:1), first we copy input data into both devices, and cuda:0 computes std, cuda:1 computes mean at the same time.
+If we perform layer normalization, we compute std first and mean second, and then we can normalize data. 
+Operator parallelism allows computing std and mean in parallel. So if we parallelize them by operator dimension into 2 
+devices (cuda:0, cuda:1), first we copy input data into both devices, and cuda:0 computes std, cuda:1 computes mean at the same time.

 * Attribute

@ -465,66 +560,20 @@ We have 10 batches of 512 length. If we parallelize them by attribute dimension

 It is similar with tensor model parallelism or naive layer-wise model parallelism.

-![flex-flow-soap](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-flexflow.jpeg)
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/parallelism-flexflow.jpeg" alt="flex-flow-soap"/>
+</div>

-The significance of this framework is that it takes resources like (1) GPU/TPU/CPU vs. (2) RAM/DRAM vs. (3) fast-intra-connect/slow-inter-connect and it automatically optimizes all these  algorithmically deciding which parallelisation to use where.
+The significance of this framework is that it takes resources like (1) GPU/TPU/CPU vs. (2) RAM/DRAM vs. (3) 
+fast-intra-connect/slow-inter-connect and it automatically optimizes all these algorithmically deciding which 
+parallelisation to use where.

-One very important aspect is that FlexFlow is designed for optimizing DNN parallelizations for models with static and fixed workloads, since models with dynamic behavior may prefer different parallelization strategies across iterations.
+One very important aspect is that FlexFlow is designed for optimizing DNN parallelizations for models with static and 
+fixed workloads, since models with dynamic behavior may prefer different parallelization strategies across iterations.

-So the promise is very attractive - it runs a 30min simulation on the cluster of choice and it comes up with the best strategy to utilise this specific environment. If you add/remove/replace any parts it'll run and re-optimize the plan for that. And then you can train. A different setup will have its own custom optimization.
+So the promise is very attractive - it runs a 30min simulation on the cluster of choice and it comes up with the best 
+strategy to utilise this specific environment. If you add/remove/replace any parts it'll run and re-optimize the plan 
+for that. And then you can train. A different setup will have its own custom optimization.

-🤗 Transformers status: not yet integrated. We already have our models FX-trace-able via [transformers.utils.fx](https://github.com/huggingface/transformers/blob/master/src/transformers/utils/fx.py), which is a prerequisite for FlexFlow, so someone needs to figure out what needs to be done to make FlexFlow work with our models.
-
-
-## Which Strategy To Use When
-
-Here is a very rough outline at which parallelism strategy to use when. The first on each list is typically faster.
-
-**⇨ Single GPU**
-
-* Model fits onto a single GPU:
-
-    1. Normal use
-
-* Model doesn't fit onto a single GPU:
-
-    1. ZeRO + Offload CPU and optionally NVMe
-    2. as above plus Memory Centric Tiling (see below for details) if the largest layer can't fit into a single GPU
-
-* Largest Layer not fitting into a single GPU:
-
-1. ZeRO - Enable [Memory Centric Tiling](https://deepspeed.readthedocs.io/en/latest/zero3.html#memory-centric-tiling) (MCT). It allows you to run arbitrarily large layers by automatically splitting them and executing them sequentially. MCT reduces the number of parameters that are live on a GPU, but it does not affect the activation memory. As this need is very rare as of this writing a manual override of `torch.nn.Linear` needs to be done by the user.
-
-**⇨ Single Node / Multi-GPU**
-
-* Model fits onto a single GPU:
-
-    1. DDP - Distributed DP
-    2. ZeRO - may or may not be faster depending on the situation and configuration used
-
-* Model doesn't fit onto a single GPU:
-
-    1. PP
-    2. ZeRO
-    3. TP
-
-    With very fast intra-node connectivity of NVLINK or NVSwitch all three should be mostly on par, without these PP will be faster than TP or ZeRO. The degree of TP may also make a difference. Best to experiment to find the winner on your particular setup.
-
-    TP is almost always used within a single node. That is TP size <= gpus per node.
-
-* Largest Layer not fitting into a single GPU:
-
-    1. If not using ZeRO - must use TP, as PP alone won't be able to fit.
-    2. With ZeRO see the same entry for "Single GPU" above
-
-
-**⇨ Multi-Node / Multi-GPU**
-
-* When you have fast inter-node connectivity:
-
-    1. ZeRO - as it requires close to no modifications to the model
-    2. PP+TP+DP - less communications, but requires massive changes to the model
-
-* when you have slow inter-node connectivity and still low on GPU memory:
-
-    1. DP+PP+TP+ZeRO-1
+🤗 Transformers status: Transformers models are FX-trace-able via [transformers.utils.fx](https://github.com/huggingface/transformers/blob/master/src/transformers/utils/fx.py), 
+which is a prerequisite for FlexFlow, however, changes are required on the FlexFlow side to make it work with Transformers models.
--- a/docs/source/en/performance.md
+++ b/docs/source/en/performance.md
@ -53,7 +53,7 @@ sections we go through the steps to run inference on CPU and single/multi-GPU se

 * [Inference on a single CPU](perf_infer_cpu)
 * [Inference on a single GPU](perf_infer_gpu_one)
-* [Multi-GPU inference](perf_infer_gpu_many)
+* [Multi-GPU inference](perf_infer_gpu_one)
 * [XLA Integration for TensorFlow Models](tf_xla)


--- a/docs/source/en/tasks/language_modeling.md
+++ b/docs/source/en/tasks/language_modeling.md
@ -37,7 +37,7 @@ You can finetune other architectures for causal language modeling following the
 Choose one of the following architectures:

 <!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
-[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeLlama](../model_doc/code_llama), [CodeGen](../model_doc/codegen), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [Falcon](../model_doc/falcon), [Fuyu](../model_doc/fuyu), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [MPT](../model_doc/mpt), [MusicGen](../model_doc/musicgen), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [Persimmon](../model_doc/persimmon), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod)
+[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeLlama](../model_doc/code_llama), [CodeGen](../model_doc/codegen), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [Falcon](../model_doc/falcon), [Fuyu](../model_doc/fuyu), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [MPT](../model_doc/mpt), [MusicGen](../model_doc/musicgen), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [Persimmon](../model_doc/persimmon), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [Whisper](../model_doc/whisper), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod)



--- a/docs/source/en/tasks/summarization.md
+++ b/docs/source/en/tasks/summarization.md
@ -35,7 +35,7 @@ The task illustrated in this tutorial is supported by the following model archit

 <!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->

-[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)
+[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SeamlessM4T](../model_doc/seamless_m4t), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)

 <!--End of the generated tip-->

--- a/docs/source/en/tasks/translation.md
+++ b/docs/source/en/tasks/translation.md
@ -32,7 +32,7 @@ The task illustrated in this tutorial is supported by the following model archit

 <!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->

-[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)
+[BART](../model_doc/bart), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [Encoder decoder](../model_doc/encoder-decoder), [FairSeq Machine-Translation](../model_doc/fsmt), [GPTSAN-japanese](../model_doc/gptsan-japanese), [LED](../model_doc/led), [LongT5](../model_doc/longt5), [M2M100](../model_doc/m2m_100), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MT5](../model_doc/mt5), [MVP](../model_doc/mvp), [NLLB](../model_doc/nllb), [NLLB-MOE](../model_doc/nllb-moe), [Pegasus](../model_doc/pegasus), [PEGASUS-X](../model_doc/pegasus_x), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [SeamlessM4T](../model_doc/seamless_m4t), [SwitchTransformers](../model_doc/switch_transformers), [T5](../model_doc/t5), [UMT5](../model_doc/umt5), [XLM-ProphetNet](../model_doc/xlm-prophetnet)

 <!--End of the generated tip-->

--- a/docs/source/en/testing.md
+++ b/docs/source/en/testing.md
@ -525,6 +525,25 @@ Certain devices will require an additional import after importing `torch` for th
 ```bash
 TRANSFORMERS_TEST_BACKEND="torch_npu" pytest tests/utils/test_logging.py
 ```
+Alternative backends may also require the replacement of device-specific functions. For example `torch.cuda.manual_seed` may need to be replaced with a device-specific seed setter like `torch.npu.manual_seed` to correctly set a random seed on the device. To specify a new backend with backend-specific device functions when running the test suite, create a Python device specification file in the format:
+
+```
+import torch
+import torch_npu
+# !! Further additional imports can be added here !!
+
+# Specify the device name (eg. 'cuda', 'cpu', 'npu')
+DEVICE_NAME = 'npu'
+
+# Specify device-specific backends to dispatch to.
+# If not specified, will fallback to 'default' in 'testing_utils.py`
+MANUAL_SEED_FN = torch.npu.manual_seed
+EMPTY_CACHE_FN = torch.npu.empty_cache
+DEVICE_COUNT_FN = torch.npu.device_count
+```
+This format also allows for specification of any additional imports required. To use this file to replace equivalent methods in the test suite, set the environment variable `TRANSFORMERS_TEST_DEVICE_SPEC` to the path of the spec file.
+
+Currently, only `MANUAL_SEED_FN`, `EMPTY_CACHE_FN` and `DEVICE_COUNT_FN` are supported for device-specific dispatch.


 ### Distributed training
--- a/docs/source/hi/_toctree.yml
+++ b/docs/source/hi/_toctree.yml
@ -0,0 +1,3 @@
+- sections:
+  - local: pipeline_tutorial
+    title: पाइपलाइनों के साथ अनुमान चलाएँ
--- a/docs/source/hi/pipeline_tutorial.md
+++ b/docs/source/hi/pipeline_tutorial.md
@ -0,0 +1,317 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# अनुमान के लिए पाइपलाइन
+
+[`pipeline`] किसी भी भाषा, कंप्यूटर दृष्टि, भाषण और मल्टीमॉडल कार्यों पर अनुमान लगाने के लिए [Hub] (https://huggingface.co/models) से किसी भी मॉडल का उपयोग करना आसान बनाता है। भले ही आपके पास किसी विशिष्ट तौर-तरीके का अनुभव न हो या आप मॉडलों के पीछे अंतर्निहित कोड से परिचित न हों, फिर भी आप [`pipeline`] के अनुमान के लिए उनका उपयोग कर सकते हैं! यह ट्यूटोरियल आपको ये सिखाएगा:
+
+* अनुमान के लिए [`pipeline`] का उपयोग करें।
+* एक विशिष्ट टोकननाइज़र या मॉडल का उपयोग करें।
+* ऑडियो, विज़न और मल्टीमॉडल कार्यों के लिए [`pipeline`] का उपयोग करें।
+
+<Tip>
+
+समर्थित कार्यों और उपलब्ध मापदंडों की पूरी सूची के लिए [`pipeline`] दस्तावेज़ पर एक नज़र डालें।
+
+</Tip>
+
+## पाइपलाइन का उपयोग
+
+जबकि प्रत्येक कार्य में एक संबद्ध [`pipeline`] होता है, सामान्य [`pipeline`] अमूर्त का उपयोग करना आसान होता है जिसमें शामिल होता है
+सभी कार्य-विशिष्ट पाइपलाइनें। [`pipeline`] स्वचालित रूप से एक डिफ़ॉल्ट मॉडल और सक्षम प्रीप्रोसेसिंग क्लास लोड करता है
+आपके कार्य के लिए अनुमान का. आइए स्वचालित वाक् पहचान (एएसआर) के लिए [`pipeline`] का उपयोग करने का उदाहरण लें, या
+वाक्-से-पाठ.
+
+
+1. एक [`pipeline`] बनाकर प्रारंभ करें और अनुमान कार्य निर्दिष्ट करें:
+
+```py
+>>> from transformers import pipeline
+
+>>> transcriber = pipeline(task="automatic-speech-recognition")
+```
+
+2. अपना इनपुट [`pipeline`] पर भेजें। वाक् पहचान के मामले में, यह एक ऑडियो इनपुट फ़ाइल है:
+
+```py
+>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
+{'text': 'I HAVE A DREAM BUT ONE DAY THIS NATION WILL RISE UP LIVE UP THE TRUE MEANING OF ITS TREES'}
+```
+
+क्या वह परिणाम नहीं जो आपके मन में था? कुछ [सबसे अधिक डाउनलोड किए गए स्वचालित वाक् पहचान मॉडल](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending) देखें
+यह देखने के लिए हब पर जाएं कि क्या आपको बेहतर ट्रांस्क्रिप्शन मिल सकता है।
+
+आइए OpenAI से [व्हिस्पर लार्ज-v2](https://huggingface.co/openai/whisper-large) मॉडल आज़माएं। व्हिस्पर जारी किया गया
+Wav2Vec2 की तुलना में 2 साल बाद, और लगभग 10 गुना अधिक डेटा पर प्रशिक्षित किया गया था। इस प्रकार, यह अधिकांश डाउनस्ट्रीम पर Wav2Vec2 को मात देता है
+बेंचमार्क. इसमें विराम चिह्न और आवरण की भविष्यवाणी करने का अतिरिक्त लाभ भी है, जिनमें से कोई भी संभव नहीं है
+Wav2Vec2.
+
+आइए इसे यहां आज़माकर देखें कि यह कैसा प्रदर्शन करता है:
+
+```py
+>>> transcriber = pipeline(model="openai/whisper-large-v2")
+>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
+{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}
+```
+
+अब यह परिणाम अधिक सटीक दिखता है! Wav2Vec2 बनाम व्हिस्पर पर गहन तुलना के लिए, [ऑडियो ट्रांसफॉर्मर्स कोर्स] (https://huggingface.co/learn/audio-course/chapter5/asr_models) देखें।
+हम वास्तव में आपको विभिन्न भाषाओं में मॉडल, आपके क्षेत्र में विशेषीकृत मॉडल और बहुत कुछ के लिए हब की जांच करने के लिए प्रोत्साहित करते हैं।
+आप हब पर सीधे अपने ब्राउज़र से मॉडल परिणामों की जांच और तुलना कर सकते हैं कि यह फिट बैठता है या नहीं
+अन्य मामलों की तुलना में कोने के मामलों को बेहतर ढंग से संभालता है।
+और यदि आपको अपने उपयोग के मामले के लिए कोई मॉडल नहीं मिलता है, तो आप हमेशा अपना खुद का [प्रशिक्षण](training) शुरू कर सकते हैं!
+
+यदि आपके पास कई इनपुट हैं, तो आप अपने इनपुट को एक सूची के रूप में पास कर सकते हैं:
+
+```py
+transcriber(
+    [
+        "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac",
+        "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac",
+    ]
+)
+```
+
+पाइपलाइनें प्रयोग के लिए बहुत अच्छी हैं क्योंकि एक मॉडल से दूसरे मॉडल पर स्विच करना मामूली काम है; हालाँकि, प्रयोग की तुलना में बड़े कार्यभार के लिए उन्हें अनुकूलित करने के कुछ तरीके हैं। संपूर्ण डेटासेट पर पुनरावृत्ति करने या वेबसर्वर में पाइपलाइनों का उपयोग करने के बारे में निम्नलिखित मार्गदर्शिकाएँ देखें:
+दस्तावेज़ों में से:
+* [डेटासेट पर पाइपलाइनों का उपयोग करना](#using-pipelines-on-a-dataset)
+* [वेबसर्वर के लिए पाइपलाइनों का उपयोग करना](./pipeline_webserver)
+
+## प्राचल
+
+[`pipeline`] कई मापदंडों का समर्थन करता है; कुछ कार्य विशिष्ट हैं, और कुछ सभी पाइपलाइनों के लिए सामान्य हैं।
+सामान्य तौर पर, आप अपनी इच्छानुसार कहीं भी पैरामीटर निर्दिष्ट कर सकते हैं:
+
+```py
+transcriber = pipeline(model="openai/whisper-large-v2", my_parameter=1)
+
+out = transcriber(...)  # This will use `my_parameter=1`.
+out = transcriber(..., my_parameter=2)  # This will override and use `my_parameter=2`.
+out = transcriber(...)  # This will go back to using `my_parameter=1`.
+```
+
+आइए 3 महत्वपूर्ण बातों पर गौर करें:
+
+### उपकरण
+
+यदि आप `device=0` का उपयोग करते हैं, तो पाइपलाइन स्वचालित रूप से मॉडल को निर्दिष्ट डिवाइस पर डाल देती है।
+यह इस पर ध्यान दिए बिना काम करेगा कि आप PyTorch या Tensorflow का उपयोग कर रहे हैं या नहीं।
+
+```py
+transcriber = pipeline(model="openai/whisper-large-v2", device=0)
+```
+
+यदि मॉडल एकल GPU के लिए बहुत बड़ा है और आप PyTorch का उपयोग कर रहे हैं, तो आप `device_map="auto"` को स्वचालित रूप से सेट कर सकते हैं
+निर्धारित करें कि मॉडल वज़न को कैसे लोड और संग्रहीत किया जाए। `device_map` तर्क का उपयोग करने के लिए 🤗 [Accelerate] (https://huggingface.co/docs/accelerate) की आवश्यकता होती है
+पैकेट:
+
+```bash
+pip install --upgrade accelerate
+```
+
+निम्नलिखित कोड स्वचालित रूप से सभी डिवाइसों में मॉडल भार को लोड और संग्रहीत करता है:
+
+```py
+transcriber = pipeline(model="openai/whisper-large-v2", device_map="auto")
+```
+
+ध्यान दें कि यदि `device_map='auto'` पारित हो गया है, तो अपनी `pipeline` को चालू करते समय `device=device` तर्क जोड़ने की कोई आवश्यकता नहीं है क्योंकि आपको कुछ अप्रत्याशित व्यवहार का सामना करना पड़ सकता है!
+
+### बैच का आकार
+
+डिफ़ॉल्ट रूप से, पाइपलाइनें [यहां] (https://huggingface.co/docs/transformers/main_classes/pipelines#pipeline-batching) विस्तार से बताए गए कारणों के लिए बैच अनुमान नहीं लगाएंगी। इसका कारण यह है कि बैचिंग आवश्यक रूप से तेज़ नहीं है, और वास्तव में कुछ मामलों में काफी धीमी हो सकती है।
+
+लेकिन अगर यह आपके उपयोग के मामले में काम करता है, तो आप इसका उपयोग कर सकते हैं:
+
+```py
+transcriber = pipeline(model="openai/whisper-large-v2", device=0, batch_size=2)
+audio_filenames = [f"https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/{i}.flac" for i in range(1, 5)]
+texts = transcriber(audio_filenames)
+```
+
+यह प्रदान की गई 4 ऑडियो फाइलों पर पाइपलाइन चलाता है, लेकिन यह उन्हें 2 के बैच में पास करेगा
+आपसे किसी और कोड की आवश्यकता के बिना मॉडल (जो एक जीपीयू पर है, जहां बैचिंग से मदद मिलने की अधिक संभावना है) पर जाएं।
+आउटपुट हमेशा उसी से मेल खाना चाहिए जो आपको बैचिंग के बिना प्राप्त हुआ होगा। इसका उद्देश्य केवल पाइपलाइन से अधिक गति प्राप्त करने में आपकी सहायता करना है।
+
+पाइपलाइनें बैचिंग की कुछ जटिलताओं को भी कम कर सकती हैं क्योंकि, कुछ पाइपलाइनों के लिए, एक एकल आइटम (जैसे एक लंबी ऑडियो फ़ाइल) को एक मॉडल द्वारा संसाधित करने के लिए कई भागों में विभाजित करने की आवश्यकता होती है। पाइपलाइन आपके लिए यह [*chunk batching*](./main_classes/pipelines#pipeline-chunk-batching) करती है।
+
+### कार्य विशिष्ट प्राचल
+
+सभी कार्य कार्य विशिष्ट प्राचल प्रदान करते हैं जो आपको अपना काम पूरा करने में मदद करने के लिए अतिरिक्त लचीलेपन और विकल्पों की अनुमति देते हैं।
+उदाहरण के लिए, [`transformers.AutomaticSpeechRecognitionPipeline.__call__`] विधि में एक `return_timestamps` प्राचल है जो वीडियो उपशीर्षक के लिए आशाजनक लगता है:
+
+
+```py
+>>> transcriber = pipeline(model="openai/whisper-large-v2", return_timestamps=True)
+>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
+{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.', 'chunks': [{'timestamp': (0.0, 11.88), 'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its'}, {'timestamp': (11.88, 12.38), 'text': ' creed.'}]}
+```
+
+जैसा कि आप देख सकते हैं, मॉडल ने पाठ का अनुमान लगाया और **when** विभिन्न वाक्यों का उच्चारण किया गया तो आउटपुट भी दिया।
+
+प्रत्येक कार्य के लिए कई प्राचल उपलब्ध हैं, इसलिए यह देखने के लिए कि आप किसके साथ छेड़छाड़ कर सकते हैं, प्रत्येक कार्य का API संदर्भ देखें!
+उदाहरण के लिए, [`~transformers.AutomaticSpeechRecognitionPipeline`] में एक `chunk_length_s` प्राचल है जो सहायक है
+वास्तव में लंबी ऑडियो फ़ाइलों पर काम करने के लिए (उदाहरण के लिए, संपूर्ण फिल्मों या घंटे-लंबे वीडियो को उपशीर्षक देना) जो आमतौर पर एक मॉडल होता है
+अपने आप संभाल नहीं सकता:
+
+```python
+>>> transcriber = pipeline(model="openai/whisper-large-v2", chunk_length_s=30, return_timestamps=True)
+>>> transcriber("https://huggingface.co/datasets/sanchit-gandhi/librispeech_long/resolve/main/audio.wav")
+{'text': " Chapter 16. I might have told you of the beginning of this liaison in a few lines, but I wanted you to see every step by which we came.  I, too, agree to whatever Marguerite wished, Marguerite to be unable to live apart from me. It was the day after the evening...
+```
+
+यदि आपको कोई ऐसा पैरामीटर नहीं मिल रहा है जो वास्तव में आपकी मदद करेगा, तो बेझिझक [अनुरोध करें](https://github.com/huggingface/transformers/issues/new?assignees=&labels=feature&template=feature-request.yml)!
+
+
+## डेटासेट पर पाइपलाइनों का उपयोग करना
+
+पाइपलाइन बड़े डेटासेट पर भी अनुमान चला सकती है। ऐसा करने का सबसे आसान तरीका हम एक पुनरावर्तक का उपयोग करने की सलाह देते हैं:
+
+```py
+def data():
+    for i in range(1000):
+        yield f"My example {i}"
+
+
+pipe = pipeline(model="gpt2", device=0)
+generated_characters = 0
+for out in pipe(data()):
+    generated_characters += len(out[0]["generated_text"])
+```
+
+पुनरावर्तक `data()` प्रत्येक परिणाम और पाइपलाइन स्वचालित रूप से उत्पन्न करता है
+पहचानता है कि इनपुट पुनरावर्तनीय है और डेटा प्राप्त करना शुरू कर देगा
+यह इसे GPU पर प्रोसेस करना जारी रखता है (यह हुड के तहत [DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader) का उपयोग करता है)।
+यह महत्वपूर्ण है क्योंकि आपको संपूर्ण डेटासेट के लिए मेमोरी आवंटित करने की आवश्यकता नहीं है
+और आप जितनी जल्दी हो सके GPU को फीड कर सकते हैं।
+
+चूंकि बैचिंग से चीज़ें तेज़ हो सकती हैं, इसलिए यहां `batch_size` प्राचल को ट्यून करने का प्रयास करना उपयोगी हो सकता है।
+
+किसी डेटासेट पर पुनरावृति करने का सबसे सरल तरीका बस एक को 🤗 [Dataset](https://github.com/huggingface/datasets/) से लोड करना है:
+
+```py
+# KeyDataset is a util that will just output the item we're interested in.
+from transformers.pipelines.pt_utils import KeyDataset
+from datasets import load_dataset
+
+pipe = pipeline(model="hf-internal-testing/tiny-random-wav2vec2", device=0)
+dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:10]")
+
+for out in pipe(KeyDataset(dataset, "audio")):
+    print(out)
+```
+
+
+## वेबसर्वर के लिए पाइपलाइनों का उपयोग करना
+
+<Tip>
+एक अनुमान इंजन बनाना एक जटिल विषय है जो अपने आप में उपयुक्त है
+पृष्ठ।
+</Tip>
+
+[Link](./pipeline_webserver)
+
+## विज़न पाइपलाइन
+
+दृष्टि कार्यों के लिए [`pipeline`] का उपयोग करना व्यावहारिक रूप से समान है।
+
+अपना कार्य निर्दिष्ट करें और अपनी छवि क्लासिफायरियर को भेजें। छवि एक लिंक, एक स्थानीय पथ या बेस64-एन्कोडेड छवि हो सकती है। उदाहरण के लिए, बिल्ली की कौन सी प्रजाति नीचे दिखाई गई है?
+
+![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg)
+
+```py
+>>> from transformers import pipeline
+
+>>> vision_classifier = pipeline(model="google/vit-base-patch16-224")
+>>> preds = vision_classifier(
+...     images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+... )
+>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
+>>> preds
+[{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}]
+```
+
+## पाठ पाइपलाइन
+
+NLP कार्यों के लिए [`pipeline`] का उपयोग करना व्यावहारिक रूप से समान है।
+
+```py
+>>> from transformers import pipeline
+
+>>> # This model is a `zero-shot-classification` model.
+>>> # It will classify text, except you are free to choose any label you might imagine
+>>> classifier = pipeline(model="facebook/bart-large-mnli")
+>>> classifier(
+...     "I have a problem with my iphone that needs to be resolved asap!!",
+...     candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"],
+... )
+{'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'], 'scores': [0.504, 0.479, 0.013, 0.003, 0.002]}
+```
+
+## बहुविध पाइपलाइन
+
+[`pipeline`] एक से अधिक तौर-तरीकों का समर्थन करती है। उदाहरण के लिए, एक दृश्य प्रश्न उत्तर (VQA) कार्य पाठ और छवि को जोड़ता है। अपनी पसंद के किसी भी छवि लिंक और छवि के बारे में कोई प्रश्न पूछने के लिए स्वतंत्र महसूस करें। छवि एक URL या छवि का स्थानीय पथ हो सकती है।
+
+उदाहरण के लिए, यदि आप इस [invoice image](https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png) का उपयोग करते हैं:
+
+```py
+>>> from transformers import pipeline
+
+>>> vqa = pipeline(model="impira/layoutlm-document-qa")
+>>> vqa(
+...     image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png",
+...     question="What is the invoice number?",
+... )
+[{'score': 0.42515, 'answer': 'us-001', 'start': 16, 'end': 16}]
+```
+
+<Tip>
+
+ऊपर दिए गए उदाहरण को चलाने के लिए आपको 🤗 ट्रांसफॉर्मर के अलावा [`pytesseract`](https://pypi.org/project/pytesseract/) इंस्टॉल करना होगा:
+
+```bash
+sudo apt install -y tesseract-ocr
+pip install pytesseract
+```
+
+</Tip>
+
+## 🤗 `त्वरण` के साथ बड़े मॉडलों पर `pipeline` का उपयोग करना:
+
+आप 🤗 `accelerate` का उपयोग करके बड़े मॉडलों पर आसानी से `pipeline` चला सकते हैं! पहले सुनिश्चित करें कि आपने `accelerate` को `pip install accelerate` के साथ इंस्टॉल किया है।
+
+सबसे पहले `device_map='auto'` का उपयोग करके अपना मॉडल लोड करें! हम अपने उदाहरण के लिए `facebook/opt-1.3b` का उपयोग करेंगे।
+
+```py
+# pip install accelerate
+import torch
+from transformers import pipeline
+
+pipe = pipeline(model="facebook/opt-1.3b", torch_dtype=torch.bfloat16, device_map="auto")
+output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
+```
+
+यदि आप `bitsandbytes` इंस्टॉल करते हैं और `load_in_8bit=True` तर्क जोड़ते हैं तो आप 8-बिट लोडेड मॉडल भी पास कर सकते हैं
+
+```py
+# pip install accelerate bitsandbytes
+import torch
+from transformers import pipeline
+
+pipe = pipeline(model="facebook/opt-1.3b", device_map="auto", model_kwargs={"load_in_8bit": True})
+output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
+```
+
+ध्यान दें कि आप चेकपॉइंट को किसी भी हगिंग फेस मॉडल से बदल सकते हैं जो BLOOM जैसे बड़े मॉडल लोडिंग का समर्थन करता है!
--- a/docs/source/ja/_toctree.yml
+++ b/docs/source/ja/_toctree.yml
@ -29,9 +29,77 @@
    title: LLM を使用した生成
  title: Tutorials
 - sections:
-  - local: generation_strategies
-    title: 生成戦略をカスタマイズする
-  title: Generation
+  - isExpanded: false
+  #   sections:
+  #   - local: tasks/sequence_classification
+  #     title: Text classification
+  #   - local: tasks/token_classification
+  #     title: Token classification
+  #   - local: tasks/question_answering
+  #     title: Question answering
+  #   - local: tasks/language_modeling
+  #     title: Causal language modeling
+  #   - local: tasks/masked_language_modeling
+  #     title: Masked language modeling
+  #   - local: tasks/translation
+  #     title: Translation
+  #   - local: tasks/summarization
+  #     title: Summarization
+  #   - local: tasks/multiple_choice
+  #     title: Multiple choice
+  #   title: Natural Language Processing
+  # - isExpanded: false
+  #   sections:
+  #   - local: tasks/audio_classification
+  #     title: Audio classification
+  #   - local: tasks/asr
+  #     title: Automatic speech recognition
+  #   title: Audio
+  # - isExpanded: false
+  #   sections:
+  #   - local: tasks/image_classification
+  #     title: Image classification
+  #   - local: tasks/semantic_segmentation
+  #     title: Semantic segmentation
+  #   - local: tasks/video_classification
+  #     title: Video classification
+  #   - local: tasks/object_detection
+  #     title: Object detection
+  #   - local: tasks/zero_shot_object_detection
+  #     title: Zero-shot object detection
+  #   - local: tasks/zero_shot_image_classification
+  #     title: Zero-shot image classification
+  #   - local: tasks/monocular_depth_estimation
+  #     title: Depth estimation
+  #   - local: tasks/image_to_image
+  #     title: Image-to-Image
+  #   - local: tasks/knowledge_distillation_for_image_classification
+  #     title: Knowledge Distillation for Computer Vision
+  #   title: Computer Vision
+  # - isExpanded: false
+  #   sections:
+  #   - local: tasks/image_captioning
+  #     title: Image captioning
+  #   - local: tasks/document_question_answering
+  #     title: Document Question Answering
+  #   - local: tasks/visual_question_answering
+  #     title: Visual Question Answering
+  #   - local: tasks/text-to-speech
+  #     title: Text to speech
+  #   title: Multimodal
+  - isExpanded: false
+    sections:
+    - local: generation_strategies
+      title: 生成戦略をカスタマイズする
+    title: Generation
+  # - isExpanded: false
+  #   sections:
+  #   - local: tasks/idefics
+  #     title: Image tasks with IDEFICS
+  #   - local: tasks/prompting
+  #     title: LLM prompting guide
+  #   title: Prompting
+    title: Task Guides
 - sections:
  - local: fast_tokenizers
    title: 🤗 トークナイザーの高速トークナイザーを使用する
@ -135,23 +203,66 @@
    title: モデルトレーニングの解剖学
  title: コンセプチュアルガイド
 - sections:
-  - local: internal/modeling_utils
-    title: カスタムレイヤーとユーティリティ
-  - local: internal/pipelines_utils
-    title: パイプライン用のユーティリティ
-  - local: internal/tokenization_utils
-    title: ト=ークナイザー用のユーティリティ
-  - local: internal/trainer_utils
-    title: トレーナー用ユーティリティ
-  - local: internal/generation_utils
-    title: 発電用ユーティリティ
-  - local: internal/image_processing_utils
-    title: 画像プロセッサ用ユーティリティ
-  - local: internal/audio_utils
-    title: オーディオ処理用のユーティリティ
-  - local: internal/file_utils
-    title: 一般公共事業
-  - local: internal/time_series_utils
-    title: 時系列用のユーティリティ
-  title: 内部ヘルパー
+  - sections:
+    - local: main_classes/agent
+      title: エージェントとツール
+    # - local: model_doc/auto
+    #   title: Auto Classes
+    - local: main_classes/callback
+      title: コールバック
+    - local: main_classes/configuration
+      title: 構成
+    - local: main_classes/data_collator
+      title: データ照合者
+    - local: main_classes/keras_callbacks
+      title: Keras コールバック
+    - local: main_classes/logging
+      title: ロギング
+    - local: main_classes/model
+      title: モデル
+    - local: main_classes/text_generation
+      title: テキストの生成
+    - local: main_classes/onnx
+      title: ONNX
+    - local: main_classes/optimizer_schedules
+      title: 最適化
+    - local: main_classes/output
+      title: モデルの出力
+    - local: main_classes/pipelines
+      title: パイプライン
+    - local: main_classes/processors
+      title: プロセッサー
+    - local: main_classes/quantization
+      title: 量子化
+    - local: main_classes/tokenizer
+      title: トークナイザー
+    - local: main_classes/trainer
+      title: トレーナー
+    - local: main_classes/deepspeed
+      title: ディープスピードの統合
+    - local: main_classes/feature_extractor
+      title: 特徴抽出器
+    - local: main_classes/image_processor
+      title: 画像処理プロセッサ
+    title: 主要なクラス
+  - sections:
+    - local: internal/modeling_utils
+      title: カスタムレイヤーとユーティリティ
+    - local: internal/pipelines_utils
+      title: パイプライン用のユーティリティ
+    - local: internal/tokenization_utils
+      title: ト=ークナイザー用のユーティリティ
+    - local: internal/trainer_utils
+      title: トレーナー用ユーティリティ
+    - local: internal/generation_utils
+      title: 発電用ユーティリティ
+    - local: internal/image_processing_utils
+      title: 画像プロセッサ用ユーティリティ
+    - local: internal/audio_utils
+      title: オーディオ処理用のユーティリティ
+    - local: internal/file_utils
+      title: 一般公共事業
+    - local: internal/time_series_utils
+      title: 時系列用のユーティリティ
+    title: 内部ヘルパー
  title: API
--- a/docs/source/ja/main_classes/agent.md
+++ b/docs/source/ja/main_classes/agent.md
@ -0,0 +1,105 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# エージェントとツール
+
+<Tip warning={true}>
+
+Transformers Agents は実験的な API であり、いつでも変更される可能性があります。エージェントから返される結果
+API または基礎となるモデルは変更される傾向があるため、変更される可能性があります。
+
+</Tip>
+
+エージェントとツールの詳細については、[入門ガイド](../transformers_agents) を必ずお読みください。このページ
+基礎となるクラスの API ドキュメントが含まれています。
+
+## エージェント
+
+私たちは 3 種類のエージェントを提供します。[`HfAgent`] はオープンソース モデルの推論エンドポイントを使用し、[`LocalAgent`] は選択したモデルをローカルで使用し、[`OpenAiAgent`] は OpenAI クローズド モデルを使用します。
+
+### HfAgent
+
+[[autodoc]] HfAgent
+
+### LocalAgent
+
+[[autodoc]] LocalAgent
+
+### OpenAiAgent
+
+[[autodoc]] OpenAiAgent
+
+### AzureOpenAiAgent
+
+[[autodoc]] AzureOpenAiAgent
+
+### Agent
+
+[[autodoc]] Agent
+    - chat
+    - run
+    - prepare_for_new_chat
+
+## Tools
+
+### load_tool
+
+[[autodoc]] load_tool
+
+### Tool
+
+[[autodoc]] Tool
+
+### PipelineTool
+
+[[autodoc]] PipelineTool
+
+### RemoteTool
+
+[[autodoc]] RemoteTool
+
+### launch_gradio_demo
+
+[[autodoc]] launch_gradio_demo
+
+## エージェントの種類
+
+エージェントはツール間であらゆる種類のオブジェクトを処理できます。ツールは完全にマルチモーダルであるため、受け取りと返品が可能です
+テキスト、画像、オーディオ、ビデオなどのタイプ。ツール間の互換性を高めるためだけでなく、
+これらの戻り値を ipython (jupyter、colab、ipython ノートブックなど) で正しくレンダリングするには、ラッパー クラスを実装します。
+このタイプの周り。
+
+ラップされたオブジェクトは最初と同じように動作し続けるはずです。テキストオブジェクトは依然として文字列または画像として動作する必要があります
+オブジェクトは依然として `PIL.Image` として動作するはずです。
+
+これらのタイプには、次の 3 つの特定の目的があります。
+
+- 型に対して `to_raw` を呼び出すと、基になるオブジェクトが返されるはずです
+- 型に対して `to_string` を呼び出すと、オブジェクトを文字列として返す必要があります。`AgentText` の場合は文字列になる可能性があります。
+  ただし、他のインスタンスのオブジェクトのシリアル化されたバージョンのパスになります。
+- ipython カーネルで表示すると、オブジェクトが正しく表示されるはずです
+
+### AgentText
+
+[[autodoc]] transformers.tools.agent_types.AgentText
+
+### AgentImage
+
+[[autodoc]] transformers.tools.agent_types.AgentImage
+
+### AgentAudio
+
+[[autodoc]] transformers.tools.agent_types.AgentAudio
--- a/docs/source/ja/main_classes/callback.md
+++ b/docs/source/ja/main_classes/callback.md
@ -0,0 +1,131 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+
+# コールバック数
+
+コールバックは、PyTorch のトレーニング ループの動作をカスタマイズできるオブジェクトです。
+トレーニング ループを検査できる [`Trainer`] (この機能は TensorFlow にはまだ実装されていません)
+状態を確認し (進捗レポート、TensorBoard または他の ML プラットフォームへのログ記録など)、決定を下します (初期段階など)。
+停止中）。
+
+コールバックは、返される [`TrainerControl`] オブジェクトを除けば、「読み取り専用」のコード部分です。
+トレーニング ループ内では何も変更できません。トレーニング ループの変更が必要なカスタマイズの場合は、次のことを行う必要があります。
+[`Trainer`] をサブクラス化し、必要なメソッドをオーバーライドします (例については、[trainer](trainer) を参照してください)。
+
+デフォルトでは、`TrainingArguments.report_to` は `"all"` に設定されているため、[`Trainer`] は次のコールバックを使用します。
+
+- [`DefaultFlowCallback`] は、ログ記録、保存、評価のデフォルトの動作を処理します。
+- [`PrinterCallback`] または [`ProgressCallback`] で進行状況を表示し、
+  ログ (最初のログは、[`TrainingArguments`] を通じて tqdm を非アクティブ化する場合に使用され、そうでない場合に使用されます)
+  2番目です)。
+- [`~integrations.TensorBoardCallback`] (PyTorch >= 1.4 を介して) tensorboard にアクセスできる場合
+  またはテンソルボードX）。
+- [`~integrations.WandbCallback`] [wandb](https://www.wandb.com/) がインストールされている場合。
+- [`~integrations.CometCallback`] [comet_ml](https://www.comet.ml/site/) がインストールされている場合。
+- [mlflow](https://www.mlflow.org/) がインストールされている場合は [`~integrations.MLflowCallback`]。
+- [`~integrations.NeptuneCallback`] [neptune](https://neptune.ai/) がインストールされている場合。
+- [`~integrations.AzureMLCallback`] [azureml-sdk](https://pypi.org/project/azureml-sdk/) の場合
+  インストールされています。
+- [`~integrations.CodeCarbonCallback`] [codecarbon](https://pypi.org/project/codecarbon/) の場合
+  インストールされています。
+- [`~integrations.ClearMLCallback`] [clearml](https://github.com/allegroai/clearml) がインストールされている場合。
+- [`~integrations.DagsHubCallback`] [dagshub](https://dagshub.com/) がインストールされている場合。
+- [`~integrations.FlyteCallback`] [flyte](https://flyte.org/) がインストールされている場合。
+
+パッケージがインストールされているが、付随する統合を使用したくない場合は、`TrainingArguments.report_to` を、使用したい統合のみのリストに変更できます (例: `["azure_ml", "wandb"]`) 。
+
+コールバックを実装するメインクラスは [`TrainerCallback`] です。それは、
+[`TrainingArguments`] は [`Trainer`] をインスタンス化するために使用され、それにアクセスできます。
+[`TrainerState`] を介してトレーナーの内部状態を取得し、トレーニング ループ上でいくつかのアクションを実行できます。
+[`TrainerControl`]。
+
+## 利用可能なコールバック
+
+ライブラリで利用可能な [`TrainerCallback`] のリストは次のとおりです。
+
+[[autodoc]] integrations.CometCallback
+    - setup
+
+[[autodoc]] DefaultFlowCallback
+
+[[autodoc]] PrinterCallback
+
+[[autodoc]] ProgressCallback
+
+[[autodoc]] EarlyStoppingCallback
+
+[[autodoc]] integrations.TensorBoardCallback
+
+[[autodoc]] integrations.WandbCallback
+    - setup
+
+[[autodoc]] integrations.MLflowCallback
+    - setup
+
+[[autodoc]] integrations.AzureMLCallback
+
+[[autodoc]] integrations.CodeCarbonCallback
+
+[[autodoc]] integrations.NeptuneCallback
+
+[[autodoc]] integrations.ClearMLCallback
+
+[[autodoc]] integrations.DagsHubCallback
+
+[[autodoc]] integrations.FlyteCallback
+
+## TrainerCallback
+
+[[autodoc]] TrainerCallback
+
+以下は、カスタム コールバックを PyTorch [`Trainer`] に登録する方法の例です。
+
+```python
+class MyCallback(TrainerCallback):
+    "A callback that prints a message at the beginning of training"
+
+    def on_train_begin(self, args, state, control, **kwargs):
+        print("Starting training")
+
+
+trainer = Trainer(
+    model,
+    args,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset,
+    callbacks=[MyCallback],  # We can either pass the callback class this way or an instance of it (MyCallback())
+)
+```
+
+コールバックを登録する別の方法は、次のように `trainer.add_callback()` を呼び出すことです。
+
+```python
+trainer = Trainer(...)
+trainer.add_callback(MyCallback)
+# Alternatively, we can pass an instance of the callback class
+trainer.add_callback(MyCallback())
+```
+
+## TrainerState
+
+[[autodoc]] TrainerState
+
+## TrainerControl
+
+[[autodoc]] TrainerControl
+
+
--- a/docs/source/ja/main_classes/configuration.md
+++ b/docs/source/ja/main_classes/configuration.md
@ -0,0 +1,31 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+＃ 構成
+
+基本クラス [`PretrainedConfig`] は、設定をロード/保存するための一般的なメソッドを実装します。
+ローカル ファイルまたはディレクトリから、またはライブラリ (ダウンロードされた) によって提供される事前トレーニング済みモデル構成から
+HuggingFace の AWS S3 リポジトリから)。
+
+各派生構成クラスはモデル固有の属性を実装します。すべての構成クラスに存在する共通の属性は次のとおりです。
+`hidden_size`、`num_attention_heads`、および `num_hidden_layers`。テキスト モデルはさらに以下を実装します。
+`vocab_size`。
+
+## PretrainedConfig
+
+[[autodoc]] PretrainedConfig
+    - push_to_hub
+    - all
--- a/docs/source/ja/main_classes/data_collator.md
+++ b/docs/source/ja/main_classes/data_collator.md
@ -0,0 +1,67 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# データ照合者
+
+データ照合器は、データセット要素のリストを入力として使用してバッチを形成するオブジェクトです。これらの要素は、
+`train_dataset` または `eval_dataset` の要素と同じ型。
+
+バッチを構築できるようにするために、データ照合者は何らかの処理 (パディングなど) を適用する場合があります。そのうちのいくつかは（
+[`DataCollatorForLanguageModeling`]) ランダムなデータ拡張 (ランダム マスキングなど) も適用します
+形成されたバッチ上で。
+
+使用例は、[サンプル スクリプト](../examples) または [サンプル ノートブック](../notebooks) にあります。
+
+## Default data collator
+
+[[autodoc]] data.data_collator.default_data_collator
+
+## DefaultDataCollator
+
+[[autodoc]] data.data_collator.DefaultDataCollator
+
+## DataCollatorWithPadding
+
+[[autodoc]] data.data_collator.DataCollatorWithPadding
+
+## DataCollatorForTokenClassification
+
+[[autodoc]] data.data_collator.DataCollatorForTokenClassification
+
+## DataCollatorForSeq2Seq
+
+[[autodoc]] data.data_collator.DataCollatorForSeq2Seq
+
+## DataCollatorForLanguageModeling
+
+[[autodoc]] data.data_collator.DataCollatorForLanguageModeling
+    - numpy_mask_tokens
+    - tf_mask_tokens
+    - torch_mask_tokens
+
+## DataCollatorForWholeWordMask
+
+[[autodoc]] data.data_collator.DataCollatorForWholeWordMask
+    - numpy_mask_tokens
+    - tf_mask_tokens
+    - torch_mask_tokens
+
+## DataCollatorForPermutationLanguageModeling
+
+[[autodoc]] data.data_collator.DataCollatorForPermutationLanguageModeling
+    - numpy_mask_tokens
+    - tf_mask_tokens
+    - torch_mask_tokens
--- a/docs/source/ja/main_classes/deepspeed.md
+++ b/docs/source/ja/main_classes/deepspeed.md
--- a/docs/source/ja/main_classes/feature_extractor.md
+++ b/docs/source/ja/main_classes/feature_extractor.md
@ -0,0 +1,41 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+
+-->
+
+# Feature Extractor
+
+
+フィーチャーエクストラクタは、オーディオまたはビジョンモデルのための入力フィーチャーの準備を担当しています。これには、シーケンスからのフィーチャー抽出（例：オーディオファイルの前処理からLog-Melスペクトログラムフィーチャーへの変換）、画像からのフィーチャー抽出（例：画像ファイルのクロッピング）、またパディング、正規化、そしてNumpy、PyTorch、TensorFlowテンソルへの変換も含まれます。
+
+
+## FeatureExtractionMixin
+
+[[autodoc]] feature_extraction_utils.FeatureExtractionMixin
+    - from_pretrained
+    - save_pretrained
+
+## SequenceFeatureExtractor
+
+[[autodoc]] SequenceFeatureExtractor
+    - pad
+
+## BatchFeature
+
+[[autodoc]] BatchFeature
+
+## ImageFeatureExtractionMixin
+
+[[autodoc]] image_utils.ImageFeatureExtractionMixin
--- a/docs/source/ja/main_classes/image_processor.md
+++ b/docs/source/ja/main_classes/image_processor.md
@ -0,0 +1,33 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Image Processor
+
+画像プロセッサは、ビジョン モデルの入力特徴の準備とその出力の後処理を担当します。これには、サイズ変更、正規化、PyTorch、TensorFlow、Flax、Numpy テンソルへの変換などの変換が含まれます。ロジットをセグメンテーション マスクに変換するなど、モデル固有の後処理も含まれる場合があります。
+
+## ImageProcessingMixin
+
+[[autodoc]] image_processing_utils.ImageProcessingMixin
+    - from_pretrained
+    - save_pretrained
+
+## BatchFeature
+
+[[autodoc]] BatchFeature
+
+## BaseImageProcessor
+
+[[autodoc]] image_processing_utils.BaseImageProcessor
--- a/docs/source/ja/main_classes/keras_callbacks.md
+++ b/docs/source/ja/main_classes/keras_callbacks.md
@ -1,4 +1,4 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@ -7,12 +7,22 @@ http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
 an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.

 ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
 rendered properly in your Markdown viewer.

 -->

-# Inference on Specialized Hardware
+# Keras callbacks

-This document will be completed soon with information on how to infer on specialized hardware. In the meantime you can check out [the guide for inference on CPUs](perf_infer_cpu).
+Keras を使用して Transformers モデルをトレーニングする場合、一般的な処理を自動化するために使用できるライブラリ固有のコールバックがいくつかあります。
+タスク:
+
+## KerasMetricCallback
+
+[[autodoc]] KerasMetricCallback
+
+## PushToHubCallback
+
+[[autodoc]] PushToHubCallback
--- a/docs/source/ja/main_classes/logging.md
+++ b/docs/source/ja/main_classes/logging.md
@ -0,0 +1,121 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Logging
+
+🤗 Transformersには、ライブラリの詳細度を簡単に設定できる中央集中型のロギングシステムがあります。
+
+現在、ライブラリのデフォルトの詳細度は「WARNING」です。
+
+詳細度を変更するには、直接設定メソッドの1つを使用するだけです。例えば、詳細度をINFOレベルに変更する方法は以下の通りです。
+
+
+```python
+import transformers
+
+transformers.logging.set_verbosity_info()
+```
+
+
+環境変数 `TRANSFORMERS_VERBOSITY` を使用して、デフォルトの冗長性をオーバーライドすることもできます。設定できます
+`debug`、`info`、`warning`、`error`、`critical` のいずれかに変更します。例えば：
+
+```bash
+TRANSFORMERS_VERBOSITY=error ./myprogram.py
+```
+
+
+さらに、一部の「警告」は環境変数を設定することで無効にできます。
+`TRANSFORMERS_NO_ADVISORY_WARNINGS` を *1* などの true 値に設定します。これにより、次を使用してログに記録される警告が無効になります。
+[`logger.warning_advice`]。例えば：
+
+```bash
+TRANSFORMERS_NO_ADVISORY_WARNINGS=1 ./myprogram.py
+```
+
+
+以下は、独自のモジュールまたはスクリプトでライブラリと同じロガーを使用する方法の例です。
+
+```python
+from transformers.utils import logging
+
+logging.set_verbosity_info()
+logger = logging.get_logger("transformers")
+logger.info("INFO")
+logger.warning("WARN")
+```
+
+このロギング モジュールのすべてのメソッドは以下に文書化されています。主なメソッドは次のとおりです。
+[`logging.get_verbosity`] ロガーの現在の冗長レベルを取得します。
+[`logging.set_verbosity`] を使用して、冗長性を選択したレベルに設定します。順番に（少ないものから）
+冗長から最も冗長まで)、それらのレベル (括弧内は対応する int 値) は次のとおりです。
+
+- `transformers.logging.CRITICAL` または `transformers.logging.FATAL` (int 値、50): 最も多いもののみをレポートします。
+  重大なエラー。
+- `transformers.logging.ERROR` (int 値、40): エラーのみを報告します。
+- `transformers.logging.WARNING` または `transformers.logging.WARN` (int 値、30): エラーと
+  警告。これはライブラリで使用されるデフォルトのレベルです。
+- `transformers.logging.INFO` (int 値、20): エラー、警告、および基本情報をレポートします。
+- `transformers.logging.DEBUG` (int 値、10): すべての情報をレポートします。
+
+デフォルトでは、モデルのダウンロード中に「tqdm」進行状況バーが表示されます。 [`logging.disable_progress_bar`] および [`logging.enable_progress_bar`] を使用して、この動作を抑制または抑制解除できます。
+
+## `logging` vs `warnings`
+
+Python には、よく組み合わせて使用される 2 つのロギング システムがあります。上で説明した `logging` と `warnings` です。
+これにより、特定のバケット内の警告をさらに分類できます (例: 機能またはパスの`FutureWarning`)
+これはすでに非推奨になっており、`DeprecationWarning`は今後の非推奨を示します。
+
+両方とも`transformers`ライブラリで使用します。 `logging`の`captureWarning`メソッドを活用して適応させて、
+これらの警告メッセージは、上記の冗長設定ツールによって管理されます。
+
+それはライブラリの開発者にとって何を意味しますか?次のヒューリスティックを尊重する必要があります。
+- `warnings`は、ライブラリおよび`transformers`に依存するライブラリの開発者に優先されるべきです。
+- `logging`は、日常のプロジェクトでライブラリを使用するライブラリのエンドユーザーに使用する必要があります。
+
+以下の`captureWarnings`メソッドのリファレンスを参照してください。
+
+[[autodoc]] logging.captureWarnings
+
+## Base setters
+
+[[autodoc]] logging.set_verbosity_error
+
+[[autodoc]] logging.set_verbosity_warning
+
+[[autodoc]] logging.set_verbosity_info
+
+[[autodoc]] logging.set_verbosity_debug
+
+## Other functions
+
+[[autodoc]] logging.get_verbosity
+
+[[autodoc]] logging.set_verbosity
+
+[[autodoc]] logging.get_logger
+
+[[autodoc]] logging.enable_default_handler
+
+[[autodoc]] logging.disable_default_handler
+
+[[autodoc]] logging.enable_explicit_format
+
+[[autodoc]] logging.reset_format
+
+[[autodoc]] logging.enable_progress_bar
+
+[[autodoc]] logging.disable_progress_bar
--- a/docs/source/ja/main_classes/model.md
+++ b/docs/source/ja/main_classes/model.md
@ -0,0 +1,160 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Models
+
+ベースクラスである [`PreTrainedModel`]、[`TFPreTrainedModel`]、[`FlaxPreTrainedModel`] は、モデルの読み込みと保存に関する共通のメソッドを実装しており、これはローカルのファイルやディレクトリから、またはライブラリが提供する事前学習モデル構成（HuggingFaceのAWS S3リポジトリからダウンロード）からモデルを読み込むために使用できます。
+
+[`PreTrainedModel`] と [`TFPreTrainedModel`] は、次の共通のメソッドも実装しています：
+
+- 語彙に新しいトークンが追加された場合に、入力トークン埋め込みのリサイズを行う
+- モデルのアテンションヘッドを刈り込む
+
+各モデルに共通するその他のメソッドは、[`~modeling_utils.ModuleUtilsMixin`]（PyTorchモデル用）および[`~modeling_tf_utils.TFModuleUtilsMixin`]（TensorFlowモデル用）で定義されており、テキスト生成の場合、[`~generation.GenerationMixin`]（PyTorchモデル用）、[`~generation.TFGenerationMixin`]（TensorFlowモデル用）、および[`~generation.FlaxGenerationMixin`]（Flax/JAXモデル用）もあります。
+
+
+## PreTrainedModel
+
+[[autodoc]] PreTrainedModel
+    - push_to_hub
+    - all
+
+<a id='from_pretrained-torch-dtype'></a>
+
+
+### 大規模モデルの読み込み
+
+Transformers 4.20.0では、[`~PreTrainedModel.from_pretrained`] メソッドが再設計され、[Accelerate](https://huggingface.co/docs/accelerate/big_modeling) を使用して大規模モデルを扱うことが可能になりました。これには Accelerate >= 0.9.0 と PyTorch >= 1.9.0 が必要です。以前の方法でフルモデルを作成し、その後事前学習の重みを読み込む代わりに（これにはメモリ内のモデルサイズが2倍必要で、ランダムに初期化されたモデル用と重み用の2つが必要でした）、モデルを空の外殻として作成し、事前学習の重みが読み込まれるときにパラメーターを実体化するオプションが追加されました。
+
+このオプションは `low_cpu_mem_usage=True` で有効にできます。モデルはまず空の重みを持つメタデバイス上に作成され、その後状態辞書が内部に読み込まれます（シャードされたチェックポイントの場合、シャードごとに読み込まれます）。この方法で使用される最大RAMは、モデルの完全なサイズだけです。
+
+
+```py
+from transformers import AutoModelForSeq2SeqLM
+
+t0pp = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0pp", low_cpu_mem_usage=True)
+```
+
+さらに、モデルが完全にRAMに収まらない場合（現時点では推論のみ有効）、異なるデバイスにモデルを直接配置できます。`device_map="auto"` を使用すると、Accelerateは各レイヤーをどのデバイスに配置するかを決定し、最速のデバイス（GPU）を最大限に活用し、残りの部分をCPU、あるいはGPU RAMが不足している場合はハードドライブにオフロードします。モデルが複数のデバイスに分割されていても、通常どおり実行されます。
+
+`device_map` を渡す際、`low_cpu_mem_usage` は自動的に `True` に設定されるため、それを指定する必要はありません。
+
+
+```py
+from transformers import AutoModelForSeq2SeqLM
+
+t0pp = AutoModelForSeq2SeqLM.from_pretrained("bigscience/T0pp", device_map="auto")
+```
+
+モデルがデバイス間でどのように分割されたかは、その `hf_device_map` 属性を見ることで確認できます:
+
+```py
+t0pp.hf_device_map
+```
+
+```python out
+{'shared': 0,
+ 'decoder.embed_tokens': 0,
+ 'encoder': 0,
+ 'decoder.block.0': 0,
+ 'decoder.block.1': 1,
+ 'decoder.block.2': 1,
+ 'decoder.block.3': 1,
+ 'decoder.block.4': 1,
+ 'decoder.block.5': 1,
+ 'decoder.block.6': 1,
+ 'decoder.block.7': 1,
+ 'decoder.block.8': 1,
+ 'decoder.block.9': 1,
+ 'decoder.block.10': 1,
+ 'decoder.block.11': 1,
+ 'decoder.block.12': 1,
+ 'decoder.block.13': 1,
+ 'decoder.block.14': 1,
+ 'decoder.block.15': 1,
+ 'decoder.block.16': 1,
+ 'decoder.block.17': 1,
+ 'decoder.block.18': 1,
+ 'decoder.block.19': 1,
+ 'decoder.block.20': 1,
+ 'decoder.block.21': 1,
+ 'decoder.block.22': 'cpu',
+ 'decoder.block.23': 'cpu',
+ 'decoder.final_layer_norm': 'cpu',
+ 'decoder.dropout': 'cpu',
+ 'lm_head': 'cpu'}
+```
+
+同じフォーマットに従って、独自のデバイスマップを作成することもできます（レイヤー名からデバイスへの辞書です）。モデルのすべてのパラメータを指定されたデバイスにマップする必要がありますが、1つのレイヤーが完全に同じデバイスにある場合、そのレイヤーのサブモジュールのすべてがどこに行くかの詳細を示す必要はありません。例えば、次のデバイスマップはT0ppに適しています（GPUメモリがある場合）:
+
+```python
+device_map = {"shared": 0, "encoder": 0, "decoder": 1, "lm_head": 1}
+```
+
+モデルのメモリへの影響を最小限に抑えるもう 1 つの方法は、低精度の dtype (`torch.float16` など) でモデルをインスタンス化するか、以下で説明する直接量子化手法を使用することです。
+
+### Model Instantiation dtype
+
+Pytorch では、モデルは通常 `torch.float32` 形式でインスタンス化されます。これは、しようとすると問題になる可能性があります
+重みが fp16 にあるモデルをロードすると、2 倍のメモリが必要になるためです。この制限を克服するには、次のことができます。
+`torch_dtype` 引数を使用して、目的の `dtype` を明示的に渡します。
+
+```python
+model = T5ForConditionalGeneration.from_pretrained("t5", torch_dtype=torch.float16)
+```
+または、モデルを常に最適なメモリ パターンでロードしたい場合は、特別な値 `"auto"` を使用できます。
+そして、`dtype` はモデルの重みから自動的に導出されます。
+
+```python
+model = T5ForConditionalGeneration.from_pretrained("t5", torch_dtype="auto")
+```
+
+スクラッチからインスタンス化されたモデルには、どの `dtype` を使用するかを指示することもできます。
+
+```python
+config = T5Config.from_pretrained("t5")
+model = AutoModel.from_config(config)
+```
+
+Pytorch の設計により、この機能は浮動小数点 dtype でのみ使用できます。
+
+## ModuleUtilsMixin
+
+[[autodoc]] modeling_utils.ModuleUtilsMixin
+
+## TFPreTrainedModel
+
+[[autodoc]] TFPreTrainedModel
+    - push_to_hub
+    - all
+
+## TFModelUtilsMixin
+
+[[autodoc]] modeling_tf_utils.TFModelUtilsMixin
+
+## FlaxPreTrainedModel
+
+[[autodoc]] FlaxPreTrainedModel
+    - push_to_hub
+    - all
+
+## Pushing to the Hub
+
+[[autodoc]] utils.PushToHubMixin
+
+## Sharded checkpoints
+
+[[autodoc]] modeling_utils.load_sharded_checkpoint
--- a/docs/source/ja/main_classes/onnx.md
+++ b/docs/source/ja/main_classes/onnx.md
@ -0,0 +1,55 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Exporting 🤗 Transformers models to ONNX
+
+🤗 Transformers は `transformers.onnx` パッケージを提供します。
+設定オブジェクトを利用することで、モデルのチェックポイントをONNXグラフに変換することができます。
+
+詳細は[ガイド](../serialization) を参照してください。
+を参照してください。
+
+## ONNX Configurations
+
+以下の3つの抽象クラスを提供しています。
+エクスポートしたいモデルアーキテクチャのタイプに応じて、継承すべき3つの抽象クラスを提供します：
+
+* エンコーダーベースのモデルは [`~onnx.config.OnnxConfig`] を継承します。
+* デコーダーベースのモデルは [`~onnx.config.OnnxConfigWithPast`] を継承します。
+* エンコーダー・デコーダーモデルは [`~onnx.config.OnnxSeq2SeqConfigWithPast`] を継承しています。
+
+
+### OnnxConfig
+
+[[autodoc]] onnx.config.OnnxConfig
+
+### OnnxConfigWithPast
+
+[[autodoc]] onnx.config.OnnxConfigWithPast
+
+### OnnxSeq2SeqConfigWithPast
+
+[[autodoc]] onnx.config.OnnxSeq2SeqConfigWithPast
+
+## ONNX Features
+
+各 ONNX 構成は、次のことを可能にする一連の _機能_ に関連付けられています。
+さまざまなタイプのトポロジまたはタスクのモデルをエクスポートします。
+
+### FeaturesManager
+
+[[autodoc]] onnx.features.FeaturesManager
+
--- a/docs/source/ja/main_classes/optimizer_schedules.md
+++ b/docs/source/ja/main_classes/optimizer_schedules.md
@ -0,0 +1,77 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Optimization
+
+`.optimization` モジュールは以下を提供します。
+
+- モデルの微調整に使用できる重み減衰が修正されたオプティマイザー、および
+- `_LRSchedule` から継承するスケジュール オブジェクトの形式のいくつかのスケジュール:
+- 複数のバッチの勾配を累積するための勾配累積クラス
+
+## AdamW (PyTorch)
+
+[[autodoc]] AdamW
+
+## AdaFactor (PyTorch)
+
+[[autodoc]] Adafactor
+
+## AdamWeightDecay (TensorFlow)
+
+[[autodoc]] AdamWeightDecay
+
+[[autodoc]] create_optimizer
+
+## Schedules
+
+### Learning Rate Schedules (Pytorch)
+
+[[autodoc]] SchedulerType
+
+[[autodoc]] get_scheduler
+
+[[autodoc]] get_constant_schedule
+
+[[autodoc]] get_constant_schedule_with_warmup
+
+<img alt="" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_constant_schedule.png"/>
+
+[[autodoc]] get_cosine_schedule_with_warmup
+
+<img alt="" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_cosine_schedule.png"/>
+
+[[autodoc]] get_cosine_with_hard_restarts_schedule_with_warmup
+
+<img alt="" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_cosine_hard_restarts_schedule.png"/>
+
+[[autodoc]] get_linear_schedule_with_warmup
+
+<img alt="" src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/warmup_linear_schedule.png"/>
+
+[[autodoc]] get_polynomial_decay_schedule_with_warmup
+
+[[autodoc]] get_inverse_sqrt_schedule
+
+### Warmup (TensorFlow)
+
+[[autodoc]] WarmUp
+
+## Gradient Strategies
+
+### GradientAccumulator (TensorFlow)
+
+[[autodoc]] GradientAccumulator
--- a/docs/source/ja/main_classes/output.md
+++ b/docs/source/ja/main_classes/output.md
@ -0,0 +1,321 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Model outputs
+
+すべてのモデルには、[`~utils.ModelOutput`] のサブクラスのインスタンスである出力があります。それらは
+モデルによって返されるすべての情報を含むデータ構造ですが、タプルまたは
+辞書。
+
+これがどのようになるかを例で見てみましょう。
+
+```python
+from transformers import BertTokenizer, BertForSequenceClassification
+import torch
+
+tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+model = BertForSequenceClassification.from_pretrained("bert-base-uncased")
+
+inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+outputs = model(**inputs, labels=labels)
+```
+
+`outputs`オブジェクトは[`~modeling_outputs.SequenceClassifierOutput`]である。
+これは、オプションで `loss`、`logits`、オプションで `hidden_states`、オプションで `attentions` 属性を持つことを意味します。
+オプションの `attentions` 属性を持つことを意味する。ここでは、`labels`を渡したので`loss`があるが、`hidden_states`と`attentions`はない。
+`output_hidden_states=True`や`output_attentions=True`を渡していないので、`hidden_states`と`attentions`はない。
+`output_attentions=True`を渡さなかったからだ。
+
+<Tip>
+
+`output_hidden_states=True`を渡すと、`outputs.hidden_states[-1]`が `outputs.last_hidden_states` と正確に一致することを期待するかもしれない。
+しかし、必ずしもそうなるとは限りません。モデルによっては、最後に隠された状態が返されたときに、正規化やその後の処理を適用するものもあります。
+
+</Tip>
+
+
+通常と同じように各属性にアクセスできます。その属性がモデルから返されなかった場合は、
+は `None`を取得します。ここで、たとえば`outputs.loss`はモデルによって計算された損失であり、`outputs.attentions`は
+`None`。
+
+`outputs`オブジェクトをタプルとして考える場合、`None`値を持たない属性のみが考慮されます。
+たとえば、ここには 2 つの要素、`loss`、次に`logits`があります。
+
+```python
+outputs[:2]
+```
+
+たとえば、タプル `(outputs.loss, Outputs.logits)` を返します。
+
+`outputs`オブジェクトを辞書として考慮する場合、「None」を持たない属性のみが考慮されます。
+価値観。たとえば、ここには`loss` と `logits`という 2 つのキーがあります。
+
+ここでは、複数のモデル タイプで使用される汎用モデルの出力を文書化します。具体的な出力タイプは次のとおりです。
+対応するモデルのページに記載されています。
+
+## ModelOutput
+
+[[autodoc]] utils.ModelOutput
+    - to_tuple
+
+## BaseModelOutput
+
+[[autodoc]] modeling_outputs.BaseModelOutput
+
+## BaseModelOutputWithPooling
+
+[[autodoc]] modeling_outputs.BaseModelOutputWithPooling
+
+## BaseModelOutputWithCrossAttentions
+
+[[autodoc]] modeling_outputs.BaseModelOutputWithCrossAttentions
+
+## BaseModelOutputWithPoolingAndCrossAttentions
+
+[[autodoc]] modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions
+
+## BaseModelOutputWithPast
+
+[[autodoc]] modeling_outputs.BaseModelOutputWithPast
+
+## BaseModelOutputWithPastAndCrossAttentions
+
+[[autodoc]] modeling_outputs.BaseModelOutputWithPastAndCrossAttentions
+
+## Seq2SeqModelOutput
+
+[[autodoc]] modeling_outputs.Seq2SeqModelOutput
+
+## CausalLMOutput
+
+[[autodoc]] modeling_outputs.CausalLMOutput
+
+## CausalLMOutputWithCrossAttentions
+
+[[autodoc]] modeling_outputs.CausalLMOutputWithCrossAttentions
+
+## CausalLMOutputWithPast
+
+[[autodoc]] modeling_outputs.CausalLMOutputWithPast
+
+## MaskedLMOutput
+
+[[autodoc]] modeling_outputs.MaskedLMOutput
+
+## Seq2SeqLMOutput
+
+[[autodoc]] modeling_outputs.Seq2SeqLMOutput
+
+## NextSentencePredictorOutput
+
+[[autodoc]] modeling_outputs.NextSentencePredictorOutput
+
+## SequenceClassifierOutput
+
+[[autodoc]] modeling_outputs.SequenceClassifierOutput
+
+## Seq2SeqSequenceClassifierOutput
+
+[[autodoc]] modeling_outputs.Seq2SeqSequenceClassifierOutput
+
+## MultipleChoiceModelOutput
+
+[[autodoc]] modeling_outputs.MultipleChoiceModelOutput
+
+## TokenClassifierOutput
+
+[[autodoc]] modeling_outputs.TokenClassifierOutput
+
+## QuestionAnsweringModelOutput
+
+[[autodoc]] modeling_outputs.QuestionAnsweringModelOutput
+
+## Seq2SeqQuestionAnsweringModelOutput
+
+[[autodoc]] modeling_outputs.Seq2SeqQuestionAnsweringModelOutput
+
+## Seq2SeqSpectrogramOutput
+
+[[autodoc]] modeling_outputs.Seq2SeqSpectrogramOutput
+
+## SemanticSegmenterOutput
+
+[[autodoc]] modeling_outputs.SemanticSegmenterOutput
+
+## ImageClassifierOutput
+
+[[autodoc]] modeling_outputs.ImageClassifierOutput
+
+## ImageClassifierOutputWithNoAttention
+
+[[autodoc]] modeling_outputs.ImageClassifierOutputWithNoAttention
+
+## DepthEstimatorOutput
+
+[[autodoc]] modeling_outputs.DepthEstimatorOutput
+
+## Wav2Vec2BaseModelOutput
+
+[[autodoc]] modeling_outputs.Wav2Vec2BaseModelOutput
+
+## XVectorOutput
+
+[[autodoc]] modeling_outputs.XVectorOutput
+
+## Seq2SeqTSModelOutput
+
+[[autodoc]] modeling_outputs.Seq2SeqTSModelOutput
+
+## Seq2SeqTSPredictionOutput
+
+[[autodoc]] modeling_outputs.Seq2SeqTSPredictionOutput
+
+## SampleTSPredictionOutput
+
+[[autodoc]] modeling_outputs.SampleTSPredictionOutput
+
+## TFBaseModelOutput
+
+[[autodoc]] modeling_tf_outputs.TFBaseModelOutput
+
+## TFBaseModelOutputWithPooling
+
+[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPooling
+
+## TFBaseModelOutputWithPoolingAndCrossAttentions
+
+[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPoolingAndCrossAttentions
+
+## TFBaseModelOutputWithPast
+
+[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPast
+
+## TFBaseModelOutputWithPastAndCrossAttentions
+
+[[autodoc]] modeling_tf_outputs.TFBaseModelOutputWithPastAndCrossAttentions
+
+## TFSeq2SeqModelOutput
+
+[[autodoc]] modeling_tf_outputs.TFSeq2SeqModelOutput
+
+## TFCausalLMOutput
+
+[[autodoc]] modeling_tf_outputs.TFCausalLMOutput
+
+## TFCausalLMOutputWithCrossAttentions
+
+[[autodoc]] modeling_tf_outputs.TFCausalLMOutputWithCrossAttentions
+
+## TFCausalLMOutputWithPast
+
+[[autodoc]] modeling_tf_outputs.TFCausalLMOutputWithPast
+
+## TFMaskedLMOutput
+
+[[autodoc]] modeling_tf_outputs.TFMaskedLMOutput
+
+## TFSeq2SeqLMOutput
+
+[[autodoc]] modeling_tf_outputs.TFSeq2SeqLMOutput
+
+## TFNextSentencePredictorOutput
+
+[[autodoc]] modeling_tf_outputs.TFNextSentencePredictorOutput
+
+## TFSequenceClassifierOutput
+
+[[autodoc]] modeling_tf_outputs.TFSequenceClassifierOutput
+
+## TFSeq2SeqSequenceClassifierOutput
+
+[[autodoc]] modeling_tf_outputs.TFSeq2SeqSequenceClassifierOutput
+
+## TFMultipleChoiceModelOutput
+
+[[autodoc]] modeling_tf_outputs.TFMultipleChoiceModelOutput
+
+## TFTokenClassifierOutput
+
+[[autodoc]] modeling_tf_outputs.TFTokenClassifierOutput
+
+## TFQuestionAnsweringModelOutput
+
+[[autodoc]] modeling_tf_outputs.TFQuestionAnsweringModelOutput
+
+## TFSeq2SeqQuestionAnsweringModelOutput
+
+[[autodoc]] modeling_tf_outputs.TFSeq2SeqQuestionAnsweringModelOutput
+
+## FlaxBaseModelOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutput
+
+## FlaxBaseModelOutputWithPast
+
+[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutputWithPast
+
+## FlaxBaseModelOutputWithPooling
+
+[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutputWithPooling
+
+## FlaxBaseModelOutputWithPastAndCrossAttentions
+
+[[autodoc]] modeling_flax_outputs.FlaxBaseModelOutputWithPastAndCrossAttentions
+
+## FlaxSeq2SeqModelOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqModelOutput
+
+## FlaxCausalLMOutputWithCrossAttentions
+
+[[autodoc]] modeling_flax_outputs.FlaxCausalLMOutputWithCrossAttentions
+
+## FlaxMaskedLMOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxMaskedLMOutput
+
+## FlaxSeq2SeqLMOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqLMOutput
+
+## FlaxNextSentencePredictorOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxNextSentencePredictorOutput
+
+## FlaxSequenceClassifierOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxSequenceClassifierOutput
+
+## FlaxSeq2SeqSequenceClassifierOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqSequenceClassifierOutput
+
+## FlaxMultipleChoiceModelOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxMultipleChoiceModelOutput
+
+## FlaxTokenClassifierOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxTokenClassifierOutput
+
+## FlaxQuestionAnsweringModelOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxQuestionAnsweringModelOutput
+
+## FlaxSeq2SeqQuestionAnsweringModelOutput
+
+[[autodoc]] modeling_flax_outputs.FlaxSeq2SeqQuestionAnsweringModelOutput
--- a/docs/source/ja/main_classes/pipelines.md
+++ b/docs/source/ja/main_classes/pipelines.md
@ -0,0 +1,494 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Pipelines
+
+パイプラインは、推論にモデルを使うための簡単で優れた方法である。パイプラインは、複雑なコードのほとんどを抽象化したオブジェクトです。
+パイプラインは、ライブラリから複雑なコードのほとんどを抽象化したオブジェクトで、名前付き固有表現認識、マスク言語モデリング、感情分析、特徴抽出、質問応答などのタスクに特化したシンプルなAPIを提供します。
+Recognition、Masked Language Modeling、Sentiment Analysis、Feature Extraction、Question Answeringなどのタスクに特化したシンプルなAPIを提供します。以下を参照のこと。
+[タスク概要](../task_summary)を参照してください。
+
+
+パイプラインの抽象化には2つのカテゴリーがある：
+
+- [`pipeline`] は、他のすべてのパイプラインをカプセル化する最も強力なオブジェクトです。
+- タスク固有のパイプラインは、[オーディオ](#audio)、[コンピューター ビジョン](#computer-vision)、[自然言語処理](#natural-language-processing)、および [マルチモーダル](#multimodal) タスクで使用できます。 
+
+## The pipeline abstraction
+
+*パイプライン* 抽象化は、他のすべての利用可能なパイプラインのラッパーです。他のものと同様にインスタンス化されます
+パイプラインですが、さらなる生活の質を提供できます。
+
+1 つの項目に対する単純な呼び出し:
+
+```python
+>>> pipe = pipeline("text-classification")
+>>> pipe("This restaurant is awesome")
+[{'label': 'POSITIVE', 'score': 0.9998743534088135}]
+```
+
+[ハブ](https://huggingface.co) の特定のモデルを使用したい場合は、モデルがオンになっている場合はタスクを無視できます。
+ハブはすでにそれを定義しています。
+
+```python
+>>> pipe = pipeline(model="roberta-large-mnli")
+>>> pipe("This restaurant is awesome")
+[{'label': 'NEUTRAL', 'score': 0.7313136458396912}]
+```
+
+多くの項目に対してパイプラインを呼び出すには、*list* を使用してパイプラインを呼び出すことができます。
+
+```python
+>>> pipe = pipeline("text-classification")
+>>> pipe(["This restaurant is awesome", "This restaurant is awful"])
+[{'label': 'POSITIVE', 'score': 0.9998743534088135},
+ {'label': 'NEGATIVE', 'score': 0.9996669292449951}]
+```
+
+完全なデータセットを反復するには、`Dataset`を直接使用することをお勧めします。これは、割り当てる必要がないことを意味します
+データセット全体を一度に処理することも、自分でバッチ処理を行う必要もありません。これはカスタムループと同じくらい速く動作するはずです。
+GPU。それが問題でない場合は、ためらわずに問題を作成してください。
+
+```python
+import datasets
+from transformers import pipeline
+from transformers.pipelines.pt_utils import KeyDataset
+from tqdm.auto import tqdm
+
+pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=0)
+dataset = datasets.load_dataset("superb", name="asr", split="test")
+
+# KeyDataset (only *pt*) will simply return the item in the dict returned by the dataset item
+# as we're not interested in the *target* part of the dataset. For sentence pair use KeyPairDataset
+for out in tqdm(pipe(KeyDataset(dataset, "file"))):
+    print(out)
+    # {"text": "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND"}
+    # {"text": ....}
+    # ....
+```
+
+使いやすくするために、ジェネレーターを使用することもできます。
+
+```python
+from transformers import pipeline
+
+pipe = pipeline("text-classification")
+
+
+def data():
+    while True:
+        # This could come from a dataset, a database, a queue or HTTP request
+        # in a server
+        # Caveat: because this is iterative, you cannot use `num_workers > 1` variable
+        # to use multiple threads to preprocess data. You can still have 1 thread that
+        # does the preprocessing while the main runs the big inference
+        yield "This is a test"
+
+
+for out in pipe(data()):
+    print(out)
+    # {"text": "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND"}
+    # {"text": ....}
+    # ....
+```
+
+[[autodoc]] pipeline
+
+
+## Pipeline batching
+
+
+すべてのパイプラインでバッチ処理を使用できます。これはうまくいきます
+パイプラインがストリーミング機能を使用するときは常に (つまり、リスト、`dataset`、または `generator`を渡すとき)。
+
+```python
+from transformers import pipeline
+from transformers.pipelines.pt_utils import KeyDataset
+import datasets
+
+dataset = datasets.load_dataset("imdb", name="plain_text", split="unsupervised")
+pipe = pipeline("text-classification", device=0)
+for out in pipe(KeyDataset(dataset, "text"), batch_size=8, truncation="only_first"):
+    print(out)
+    # [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
+    # Exactly the same output as before, but the content are passed
+    # as batches to the model
+```
+
+<Tip warning={true}>
+
+
+ただし、これによってパフォーマンスが自動的に向上するわけではありません。状況に応じて、10 倍の高速化または 5 倍の低速化のいずれかになります。
+ハードウェア、データ、使用されている実際のモデルについて。
+
+主に高速化である例:
+
+
+</Tip>
+
+```python
+from transformers import pipeline
+from torch.utils.data import Dataset
+from tqdm.auto import tqdm
+
+pipe = pipeline("text-classification", device=0)
+
+
+class MyDataset(Dataset):
+    def __len__(self):
+        return 5000
+
+    def __getitem__(self, i):
+        return "This is a test"
+
+
+dataset = MyDataset()
+
+for batch_size in [1, 8, 64, 256]:
+    print("-" * 30)
+    print(f"Streaming batch_size={batch_size}")
+    for out in tqdm(pipe(dataset, batch_size=batch_size), total=len(dataset)):
+        pass
+```
+
+```
+# On GTX 970
+------------------------------
+Streaming no batching
+100%|██████████████████████████████████████████████████████████████████████| 5000/5000 [00:26<00:00, 187.52it/s]
+------------------------------
+Streaming batch_size=8
+100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:04<00:00, 1205.95it/s]
+------------------------------
+Streaming batch_size=64
+100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:02<00:00, 2478.24it/s]
+------------------------------
+Streaming batch_size=256
+100%|█████████████████████████████████████████████████████████████████████| 5000/5000 [00:01<00:00, 2554.43it/s]
+(diminishing returns, saturated the GPU)
+```
+
+最も速度が低下する例:
+
+
+```python
+class MyDataset(Dataset):
+    def __len__(self):
+        return 5000
+
+    def __getitem__(self, i):
+        if i % 64 == 0:
+            n = 100
+        else:
+            n = 1
+        return "This is a test" * n
+```
+
+これは、他の文に比べて非常に長い文が時折あります。その場合、**全体**のバッチは 400 である必要があります。
+トークンが長いため、バッチ全体が [64, 4] ではなく [64, 400] になり、速度が大幅に低下します。さらに悪いことに、
+バッチが大きくなると、プログラムは単純にクラッシュします。
+
+```
+------------------------------
+Streaming no batching
+100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:05<00:00, 183.69it/s]
+------------------------------
+Streaming batch_size=8
+100%|█████████████████████████████████████████████████████████████████████| 1000/1000 [00:03<00:00, 265.74it/s]
+------------------------------
+Streaming batch_size=64
+100%|██████████████████████████████████████████████████████████████████████| 1000/1000 [00:26<00:00, 37.80it/s]
+------------------------------
+Streaming batch_size=256
+  0%|                                                                                 | 0/1000 [00:00<?, ?it/s]
+Traceback (most recent call last):
+  File "/home/nicolas/src/transformers/test.py", line 42, in <module>
+    for out in tqdm(pipe(dataset, batch_size=256), total=len(dataset)):
+....
+    q = q / math.sqrt(dim_per_head)  # (bs, n_heads, q_length, dim_per_head)
+RuntimeError: CUDA out of memory. Tried to allocate 376.00 MiB (GPU 0; 3.95 GiB total capacity; 1.72 GiB already allocated; 354.88 MiB free; 2.46 GiB reserved in total by PyTorch)
+```
+
+この問題に対する適切な (一般的な) 解決策はなく、使用できる距離はユースケースによって異なる場合があります。のルール
+親指：
+
+ユーザーにとっての経験則は次のとおりです。
+
+- **ハードウェアを使用して、負荷に対するパフォーマンスを測定します。測って、測って、測り続ける。実数というのは、
+  進むべき唯一の方法。**
+- レイテンシに制約がある場合 (実際の製品が推論を実行している場合)、バッチ処理を行わないでください。
+- CPU を使用している場合は、バッチ処理を行わないでください。
+- GPU でスループットを使用している場合 (大量の静的データでモデルを実行したい場合)、次のようにします。
+
+  - sequence_length (「自然な」データ) のサイズについてまったくわからない場合は、デフォルトではバッチ処理や測定を行わず、
+    暫定的に追加してみます。失敗した場合に回復するために OOM チェックを追加します (失敗した場合は、ある時点で回復します)。
+    sequence_length を制御します。)
+  - sequence_length が非常に規則的である場合、バッチ処理は非常に興味深いものとなる可能性が高く、測定してプッシュしてください。
+    OOM が発生するまで続けます。
+  - GPU が大きいほど、バッチ処理がより興味深いものになる可能性が高くなります。
+- バッチ処理を有効にしたらすぐに、OOM を適切に処理できることを確認してください。
+
+
+## Pipeline chunk batching
+
+`zero-shot-classification` と `question-answering` は、単一の入力で結果が得られる可能性があるという意味で、少し特殊です。
+モデルの複数の前方パス。通常の状況では、これにより `batch_size` 引数に関する問題が発生します。
+
+この問題を回避するために、これらのパイプラインはどちらも少し特殊になっており、代わりに `ChunkPipeline` になっています。
+通常の `Pipeline`。要するに：
+
+```python
+preprocessed = pipe.preprocess(inputs)
+model_outputs = pipe.forward(preprocessed)
+outputs = pipe.postprocess(model_outputs)
+```
+
+今は次のようになります:
+
+```python
+all_model_outputs = []
+for preprocessed in pipe.preprocess(inputs):
+    model_outputs = pipe.forward(preprocessed)
+    all_model_outputs.append(model_outputs)
+outputs = pipe.postprocess(all_model_outputs)
+```
+
+パイプラインは以下で使用されるため、これはコードに対して非常に透過的である必要があります。
+同じ方法。
+
+パイプラインはバッチを自動的に処理できるため、これは簡略化されたビューです。気にする必要はないという意味です
+入力が実際にトリガーする前方パスの数については、`batch_size` を最適化できます。
+入力とは独立して。前のセクションの注意事項が引き続き適用されます。
+
+## Pipeline custom code
+
+特定のパイプラインをオーバーライドする場合。
+
+目の前のタスクに関する問題を作成することを躊躇しないでください。パイプラインの目標は、使いやすく、ほとんどのユーザーをサポートすることです。
+したがって、`transformers`があなたのユースケースをサポートする可能性があります。
+
+
+単純に試してみたい場合は、次のことができます。
+
+- 選択したパイプラインをサブクラス化します
+
+```python
+class MyPipeline(TextClassificationPipeline):
+    def postprocess():
+        # Your code goes here
+        scores = scores * 100
+        # And here
+
+
+my_pipeline = MyPipeline(model=model, tokenizer=tokenizer, ...)
+# or if you use *pipeline* function, then:
+my_pipeline = pipeline(model="xxxx", pipeline_class=MyPipeline)
+```
+
+これにより、必要なカスタム コードをすべて実行できるようになります。
+
+## Implementing a pipeline
+
+[Implementing a new pipeline](../add_new_pipeline)
+
+## Audio
+
+オーディオ タスクに使用できるパイプラインには次のものがあります。
+
+### AudioClassificationPipeline
+
+[[autodoc]] AudioClassificationPipeline
+    - __call__
+    - all
+
+### AutomaticSpeechRecognitionPipeline
+
+[[autodoc]] AutomaticSpeechRecognitionPipeline
+    - __call__
+    - all
+
+### TextToAudioPipeline
+
+[[autodoc]] TextToAudioPipeline
+    - __call__
+    - all
+
+
+### ZeroShotAudioClassificationPipeline
+
+[[autodoc]] ZeroShotAudioClassificationPipeline
+    - __call__
+    - all
+
+## Computer vision
+
+コンピューター ビジョン タスクに使用できるパイプラインには次のものがあります。
+
+### DepthEstimationPipeline
+[[autodoc]] DepthEstimationPipeline
+    - __call__
+    - all
+
+### ImageClassificationPipeline
+
+[[autodoc]] ImageClassificationPipeline
+    - __call__
+    - all
+
+### ImageSegmentationPipeline
+
+[[autodoc]] ImageSegmentationPipeline
+    - __call__
+    - all
+
+### ImageToImagePipeline
+
+[[autodoc]] ImageToImagePipeline
+    - __call__
+    - all
+
+### ObjectDetectionPipeline
+
+[[autodoc]] ObjectDetectionPipeline
+    - __call__
+    - all
+
+### VideoClassificationPipeline
+
+[[autodoc]] VideoClassificationPipeline
+    - __call__
+    - all
+
+### ZeroShotImageClassificationPipeline
+
+[[autodoc]] ZeroShotImageClassificationPipeline
+    - __call__
+    - all
+
+### ZeroShotObjectDetectionPipeline
+
+[[autodoc]] ZeroShotObjectDetectionPipeline
+    - __call__
+    - all
+
+## Natural Language Processing
+
+自然言語処理タスクに使用できるパイプラインには次のものがあります。
+
+### ConversationalPipeline
+
+[[autodoc]] Conversation
+
+[[autodoc]] ConversationalPipeline
+    - __call__
+    - all
+
+### FillMaskPipeline
+
+[[autodoc]] FillMaskPipeline
+    - __call__
+    - all
+
+### NerPipeline
+
+[[autodoc]] NerPipeline
+
+詳細については、[`TokenClassificationPipeline`] を参照してください。
+
+### QuestionAnsweringPipeline
+
+[[autodoc]] QuestionAnsweringPipeline
+    - __call__
+    - all
+
+### SummarizationPipeline
+
+[[autodoc]] SummarizationPipeline
+    - __call__
+    - all
+
+### TableQuestionAnsweringPipeline
+
+[[autodoc]] TableQuestionAnsweringPipeline
+    - __call__
+
+### TextClassificationPipeline
+
+[[autodoc]] TextClassificationPipeline
+    - __call__
+    - all
+
+### TextGenerationPipeline
+
+[[autodoc]] TextGenerationPipeline
+    - __call__
+    - all
+
+### Text2TextGenerationPipeline
+
+[[autodoc]] Text2TextGenerationPipeline
+    - __call__
+    - all
+
+### TokenClassificationPipeline
+
+[[autodoc]] TokenClassificationPipeline
+    - __call__
+    - all
+
+### TranslationPipeline
+
+[[autodoc]] TranslationPipeline
+    - __call__
+    - all
+
+### ZeroShotClassificationPipeline
+
+[[autodoc]] ZeroShotClassificationPipeline
+    - __call__
+    - all
+
+## Multimodal
+
+マルチモーダル タスクに使用できるパイプラインには次のものがあります。
+
+### DocumentQuestionAnsweringPipeline
+
+[[autodoc]] DocumentQuestionAnsweringPipeline
+    - __call__
+    - all
+
+### FeatureExtractionPipeline
+
+[[autodoc]] FeatureExtractionPipeline
+    - __call__
+    - all
+
+### ImageToTextPipeline
+
+[[autodoc]] ImageToTextPipeline
+    - __call__
+    - all
+
+### VisualQuestionAnsweringPipeline
+
+[[autodoc]] VisualQuestionAnsweringPipeline
+    - __call__
+    - all
+
+## Parent class: `Pipeline`
+
+[[autodoc]] Pipeline
--- a/docs/source/ja/main_classes/processors.md
+++ b/docs/source/ja/main_classes/processors.md
@ -0,0 +1,160 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Processors
+
+Transformers ライブラリでは、プロセッサは 2 つの異なる意味を持ちます。
+- [Wav2Vec2](../model_doc/wav2vec2) などのマルチモーダル モデルの入力を前処理するオブジェクト (音声とテキスト)
+  または [CLIP](../model_doc/clip) (テキストとビジョン)
+- 古いバージョンのライブラリで GLUE または SQUAD のデータを前処理するために使用されていたオブジェクトは非推奨になりました。
+
+## Multi-modal processors
+
+マルチモーダル モデルでは、オブジェクトが複数のモダリティ (テキスト、
+視覚と音声）。これは、2 つ以上の処理オブジェクトをグループ化するプロセッサーと呼ばれるオブジェクトによって処理されます。
+トークナイザー (テキスト モダリティ用)、画像プロセッサー (視覚用)、特徴抽出器 (オーディオ用) など。
+
+これらのプロセッサは、保存およびロード機能を実装する次の基本クラスを継承します。
+
+[[autodoc]] ProcessorMixin
+
+## Deprecated processors
+
+すべてのプロセッサは、同じアーキテクチャに従っています。
+[`~data.processors.utils.DataProcessor`]。プロセッサは次のリストを返します。
+[`~data.processors.utils.InputExample`]。これら
+[`~data.processors.utils.InputExample`] は次のように変換できます。
+[`~data.processors.utils.Input features`] をモデルにフィードします。
+
+[[autodoc]] data.processors.utils.DataProcessor
+
+[[autodoc]] data.processors.utils.InputExample
+
+[[autodoc]] data.processors.utils.InputFeatures
+
+## GLUE
+
+[一般言語理解評価 (GLUE)](https://gluebenchmark.com/) は、
+既存の NLU タスクの多様なセットにわたるモデルのパフォーマンス。紙と同時発売された [GLUE: A
+自然言語理解のためのマルチタスクベンチマークおよび分析プラットフォーム](https://openreview.net/pdf?id=rJ4km2R5t7)
+
+このライブラリは、MRPC、MNLI、MNLI (不一致)、CoLA、SST2、STSB、
+QQP、QNLI、RTE、WNLI。
+
+それらのプロセッサは次のとおりです。
+
+- [`~data.processors.utils.MrpcProcessor`]
+- [`~data.processors.utils.MnliProcessor`]
+- [`~data.processors.utils.MnliMismatchedProcessor`]
+- [`~data.processors.utils.Sst2Processor`]
+- [`~data.processors.utils.StsbProcessor`]
+- [`~data.processors.utils.QqpProcessor`]
+- [`~data.processors.utils.QnliProcessor`]
+- [`~data.processors.utils.RteProcessor`]
+- [`~data.processors.utils.WnliProcessor`]
+
+
+さらに、次のメソッドを使用して、データ ファイルから値をロードし、それらをリストに変換することができます。
+[`~data.processors.utils.InputExample`]。
+
+[[autodoc]] data.processors.glue.glue_convert_examples_to_features
+
+## XNLI
+
+[クロスリンガル NLI コーパス (XNLI)](https://www.nyu.edu/projects/bowman/xnli/) は、
+言語を超えたテキスト表現の品質。 XNLI は、[*MultiNLI*](http://www.nyu.edu/projects/bowman/multinli/) に基づくクラウドソースのデータセットです。テキストのペアには、15 個のテキスト含意アノテーションがラベル付けされています。
+さまざまな言語 (英語などの高リソース言語とスワヒリ語などの低リソース言語の両方を含む)。
+
+論文 [XNLI: Evaluating Cross-lingual Sentence Representations](https://arxiv.org/abs/1809.05053) と同時にリリースされました。
+
+このライブラリは、XNLI データをロードするプロセッサをホストします。
+
+- [`~data.processors.utils.XnliProcessor`]
+
+テストセットにはゴールドラベルが付いているため、評価はテストセットで行われますのでご了承ください。
+
+これらのプロセッサを使用する例は、[run_xnli.py](https://github.com/huggingface/transformers/tree/main/examples/legacy/text-classification/run_xnli.py) スクリプトに示されています。
+
+## SQuAD
+
+[The Stanford Question Answering Dataset (SQuAD)](https://rajpurkar.github.io/SQuAD-explorer//) は、次のベンチマークです。
+質問応答に関するモデルのパフォーマンスを評価します。 v1.1 と v2.0 の 2 つのバージョンが利用可能です。最初のバージョン
+(v1.1) は、論文 [SQuAD: 100,000+ question for Machine Comprehension of Text](https://arxiv.org/abs/1606.05250) とともにリリースされました。 2 番目のバージョン (v2.0) は、論文 [Know What You Don't と同時にリリースされました。
+知っておくべき: SQuAD の答えられない質問](https://arxiv.org/abs/1806.03822)。
+
+このライブラリは、次の 2 つのバージョンのそれぞれのプロセッサをホストします。
+
+### Processors
+
+それらのプロセッサは次のとおりです。
+
+- [`~data.processors.utils.SquadV1Processor`]
+- [`~data.processors.utils.SquadV2Processor`]
+
+どちらも抽象クラス [`~data.processors.utils.SquadProcessor`] を継承しています。
+
+[[autodoc]] data.processors.squad.SquadProcessor
+    - all
+
+さらに、次のメソッドを使用して、SQuAD の例を次の形式に変換できます。
+モデルの入力として使用できる [`~data.processors.utils.SquadFeatures`]。
+
+[[autodoc]] data.processors.squad.squad_convert_examples_to_features
+
+これらのプロセッサと前述の方法は、データを含むファイルだけでなく、
+*tensorflow_datasets* パッケージ。以下に例を示します。
+
+### Example usage
+
+以下にプロセッサを使用した例と、データ ファイルを使用した変換方法を示します。
+
+```python
+# Loading a V2 processor
+processor = SquadV2Processor()
+examples = processor.get_dev_examples(squad_v2_data_dir)
+
+# Loading a V1 processor
+processor = SquadV1Processor()
+examples = processor.get_dev_examples(squad_v1_data_dir)
+
+features = squad_convert_examples_to_features(
+    examples=examples,
+    tokenizer=tokenizer,
+    max_seq_length=max_seq_length,
+    doc_stride=args.doc_stride,
+    max_query_length=max_query_length,
+    is_training=not evaluate,
+)
+```
+
+*tensorflow_datasets* の使用は、データ ファイルを使用するのと同じくらい簡単です。
+
+```python
+# tensorflow_datasets only handle Squad V1.
+tfds_examples = tfds.load("squad")
+examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
+
+features = squad_convert_examples_to_features(
+    examples=examples,
+    tokenizer=tokenizer,
+    max_seq_length=max_seq_length,
+    doc_stride=args.doc_stride,
+    max_query_length=max_query_length,
+    is_training=not evaluate,
+)
+```
+
+これらのプロセッサを使用する別の例は、[run_squad.py](https://github.com/huggingface/transformers/tree/main/examples/legacy/question-answering/run_squad.py) スクリプトに示されています。
--- a/docs/source/ja/main_classes/quantization.md
+++ b/docs/source/ja/main_classes/quantization.md
@ -0,0 +1,447 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Quantize 🤗 Transformers models
+
+## `AutoGPTQ` Integration
+
+
+🤗 Transformers には、言語モデルで GPTQ 量子化を実行するための `optimum` API が統合されています。パフォーマンスを大幅に低下させることなく、推論速度を高速化することなく、モデルを 8、4、3、さらには 2 ビットでロードおよび量子化できます。これは、ほとんどの GPU ハードウェアでサポートされています。
+
+量子化モデルの詳細については、以下を確認してください。
+- [GPTQ](https://arxiv.org/pdf/2210.17323.pdf) 論文
+- GPTQ 量子化に関する `optimum` [ガイド](https://huggingface.co/docs/optimum/llm_quantization/usage_guides/quantization)
+- バックエンドとして使用される [`AutoGPTQ`](https://github.com/PanQiWei/AutoGPTQ) ライブラリ
+
+### Requirements
+
+以下のコードを実行するには、以下の要件がインストールされている必要があります： 
+
+- 最新の `AutoGPTQ` ライブラリをインストールする。
+`pip install auto-gptq` をインストールする。
+
+- 最新の `optimum` をソースからインストールする。
+`git+https://github.com/huggingface/optimum.git` をインストールする。
+
+- 最新の `transformers` をソースからインストールする。
+最新の `transformers` をソースからインストールする `pip install git+https://github.com/huggingface/transformers.git`
+
+- 最新の `accelerate` ライブラリをインストールする。
+`pip install --upgrade accelerate` を実行する。
+
+GPTQ統合は今のところテキストモデルのみをサポートしているので、視覚、音声、マルチモーダルモデルでは予期せぬ挙動に遭遇するかもしれないことに注意してください。
+
+### Load and quantize a model
+
+GPTQ は、量子化モデルを使用する前に重みのキャリブレーションを必要とする量子化方法です。トランスフォーマー モデルを最初から量子化する場合は、量子化モデルを作成するまでに時間がかかることがあります (`facebook/opt-350m`モデルの Google colab では約 5 分)。
+
+したがって、GPTQ 量子化モデルを使用するシナリオは 2 つあります。最初の使用例は、ハブで利用可能な他のユーザーによってすでに量子化されたモデルをロードすることです。2 番目の使用例は、モデルを最初から量子化し、保存するかハブにプッシュして、他のユーザーが使用できるようにすることです。それも使ってください。
+
+#### GPTQ Configuration
+
+モデルをロードして量子化するには、[`GPTQConfig`] を作成する必要があります。データセットを準備するには、`bits`の数、量子化を調整するための`dataset`、およびモデルの`Tokenizer`を渡す必要があります。
+
+```python 
+model_id = "facebook/opt-125m"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+gptq_config = GPTQConfig(bits=4, dataset = "c4", tokenizer=tokenizer)
+```
+
+独自のデータセットを文字列のリストとして渡すことができることに注意してください。ただし、GPTQ 論文のデータセットを使用することを強くお勧めします。
+
+```python
+dataset = ["auto-gptq is an easy-to-use model quantization library with user-friendly apis, based on GPTQ algorithm."]
+quantization = GPTQConfig(bits=4, dataset = dataset, tokenizer=tokenizer)
+```
+
+#### Quantization
+
+`from_pretrained` を使用し、`quantization_config` を設定することでモデルを量子化できます。
+
+```python
+from transformers import AutoModelForCausalLM
+model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=gptq_config)
+```
+
+モデルを量子化するには GPU が必要であることに注意してください。モデルを CPU に配置し、量子化するためにモジュールを GPU に前後に移動させます。
+
+CPU オフロードの使用中に GPU の使用量を最大化したい場合は、`device_map = "auto"` を設定できます。
+
+```python
+from transformers import AutoModelForCausalLM
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", quantization_config=gptq_config)
+```
+
+ディスク オフロードはサポートされていないことに注意してください。さらに、データセットが原因でメモリが不足している場合は、`from_pretained` で `max_memory` を渡す必要がある場合があります。 `device_map`と`max_memory`の詳細については、この [ガイド](https://huggingface.co/docs/accelerate/usage_guides/big_modeling#designing-a-device-map) を参照してください。
+
+<Tip warning={true}>
+GPTQ 量子化は、現時点ではテキスト モデルでのみ機能します。さらに、量子化プロセスはハードウェアによっては長時間かかる場合があります (NVIDIA A100 を使用した場合、175B モデル = 4 gpu 時間)。モデルの GPTQ 量子化バージョンが存在しない場合は、ハブで確認してください。そうでない場合は、github で要求を送信できます。
+</Tip>
+
+### Push quantized model to 🤗 Hub
+
+他の 🤗 モデルと同様に、`push_to_hub` を使用して量子化モデルをハブにプッシュできます。量子化構成は保存され、モデルに沿ってプッシュされます。
+
+```python
+quantized_model.push_to_hub("opt-125m-gptq")
+tokenizer.push_to_hub("opt-125m-gptq")
+```
+
+量子化されたモデルをローカル マシンに保存したい場合は、`save_pretrained` を使用して行うこともできます。
+
+
+```python
+quantized_model.save_pretrained("opt-125m-gptq")
+tokenizer.save_pretrained("opt-125m-gptq")
+```
+
+`device_map` を使用してモデルを量子化した場合は、保存する前にモデル全体を GPU または `cpu` のいずれかに移動してください。
+
+```python
+quantized_model.to("cpu")
+quantized_model.save_pretrained("opt-125m-gptq")
+```
+
+### Load a quantized model from the 🤗 Hub
+
+`from_pretrained`を使用して、量子化されたモデルをハブからロードできます。
+属性 `quantization_config` がモデル設定オブジェクトに存在することを確認して、プッシュされた重みが量子化されていることを確認します。
+
+```python
+from transformers import AutoModelForCausalLM
+model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq")
+```
+
+必要以上のメモリを割り当てずにモデルをより速くロードしたい場合は、`device_map` 引数は量子化モデルでも機能します。 `accelerate`ライブラリがインストールされていることを確認してください。
+
+```python
+from transformers import AutoModelForCausalLM
+model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto")
+```
+
+### Exllama kernels for faster inference
+
+4 ビット モデルの場合、推論速度を高めるために exllama カーネルを使用できます。デフォルトで有効になっています。 [`GPTQConfig`] で `disable_exllama` を渡すことで、その動作を変更できます。これにより、設定に保存されている量子化設定が上書きされます。カーネルに関連する属性のみを上書きできることに注意してください。さらに、exllama カーネルを使用したい場合は、モデル全体を GPU 上に置く必要があります。
+
+
+```py
+import torch
+gptq_config = GPTQConfig(bits=4, disable_exllama=False)
+model = AutoModelForCausalLM.from_pretrained("{your_username}/opt-125m-gptq", device_map="auto", quantization_config = gptq_config)
+```
+
+現時点では 4 ビット モデルのみがサポートされていることに注意してください。さらに、peft を使用して量子化モデルを微調整している場合は、exllama カーネルを非アクティブ化することをお勧めします。
+
+#### Fine-tune a quantized model 
+
+Hugging Face エコシステムのアダプターの公式サポートにより、GPTQ で量子化されたモデルを微調整できます。
+詳細については、[`peft`](https://github.com/huggingface/peft) ライブラリをご覧ください。
+
+### Example demo
+
+GPTQ を使用してモデルを量子化する方法と、peft を使用して量子化されたモデルを微調整する方法については、Google Colab [ノートブック](https://colab.research.google.com/drive/1_TIrmuKOFhuRRiTWN94iLKUFu6ZX4ceb?usp=sharing) を参照してください。
+
+### GPTQConfig
+
+[[autodoc]] GPTQConfig
+
+## `bitsandbytes` Integration
+
+🤗 Transformers は、`bitsandbytes` で最もよく使用されるモジュールと緊密に統合されています。数行のコードでモデルを 8 ビット精度でロードできます。
+これは、`bitsandbytes`の `0.37.0`リリース以降、ほとんどの GPU ハードウェアでサポートされています。
+
+量子化方法の詳細については、[LLM.int8()](https://arxiv.org/abs/2208.07339) 論文、または [ブログ投稿](https://huggingface.co/blog/hf-bitsandbytes-) をご覧ください。統合）コラボレーションについて。
+
+`0.39.0`リリース以降、FP4 データ型を活用し、4 ビット量子化を使用して`device_map`をサポートする任意のモデルをロードできます。
+
+独自の pytorch モデルを量子化したい場合は、🤗 Accelerate ライブラリの [ドキュメント](https://huggingface.co/docs/accelerate/main/en/usage_guides/quantization) をチェックしてください。
+
+`bitsandbytes`統合を使用してできることは次のとおりです
+
+### General usage
+
+モデルが 🤗 Accelerate による読み込みをサポートし、`torch.nn.Linear` レイヤーが含まれている限り、 [`~PreTrainedModel.from_pretrained`] メソッドを呼び出すときに `load_in_8bit` または `load_in_4bit` 引数を使用してモデルを量子化できます。これはどのようなモダリティでも同様に機能するはずです。
+
+```python
+from transformers import AutoModelForCausalLM
+
+model_8bit = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", load_in_8bit=True)
+model_4bit = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", load_in_4bit=True)
+```
+
+デフォルトでは、他のすべてのモジュール (例: `torch.nn.LayerNorm`) は `torch.float16` に変換されますが、その `dtype` を変更したい場合は、`torch_dtype` 引数を上書きできます。
+
+```python
+>>> import torch
+>>> from transformers import AutoModelForCausalLM
+
+>>> model_8bit = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", load_in_8bit=True, torch_dtype=torch.float32)
+>>> model_8bit.model.decoder.layers[-1].final_layer_norm.weight.dtype
+torch.float32
+```
+
+### FP4 quantization 
+
+#### Requirements
+
+以下のコード スニペットを実行する前に、以下の要件がインストールされていることを確認してください。
+
+- 最新の`bitsandbytes`ライブラリ
+`pip install bitsandbytes>=0.39.0`
+
+- 最新の`accelerate`をインストールする
+`pip install --upgrade accelerate`
+
+- 最新の `transformers` をインストールする
+`pip install --upgrade transformers`
+
+#### Tips and best practices
+
+- **高度な使用法:** 可能なすべてのオプションを使用した 4 ビット量子化の高度な使用法については、[この Google Colab ノートブック](https://colab.research.google.com/drive/1ge2F1QSK8Q7h0hn3YKuBCOAS0bK8E0wf) を参照してください。
+
+- **`batch_size=1` による高速推論 :** bitsandbytes の `0.40.0` リリース以降、`batch_size=1` では高速推論の恩恵を受けることができます。 [これらのリリース ノート](https://github.com/TimDettmers/bitsandbytes/releases/tag/0.40.0) を確認し、この機能を活用するには`0.40.0`以降のバージョンを使用していることを確認してください。箱の。
+
+- **トレーニング:** [QLoRA 論文](https://arxiv.org/abs/2305.14314) によると、4 ビット基本モデルをトレーニングする場合 (例: LoRA アダプターを使用)、`bnb_4bit_quant_type='nf4'` を使用する必要があります。 。
+
+- **推論:** 推論の場合、`bnb_4bit_quant_type` はパフォーマンスに大きな影響を与えません。ただし、モデルの重みとの一貫性を保つために、必ず同じ `bnb_4bit_compute_dtype` および `torch_dtype` 引数を使用してください。
+
+
+#### Load a large model in 4bit
+
+`.from_pretrained` メソッドを呼び出すときに `load_in_4bit=True` を使用すると、メモリ使用量を (おおよそ) 4 で割ることができます。
+
+```python
+# pip install transformers accelerate bitsandbytes
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "bigscience/bloom-1b7"
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_4bit=True)
+```
+
+<Tip warning={true}>
+
+モデルが 4 ビットでロードされると、現時点では量子化された重みをハブにプッシュすることはできないことに注意してください。 4 ビットの重みはまだサポートされていないため、トレーニングできないことにも注意してください。ただし、4 ビット モデルを使用して追加のパラメーターをトレーニングすることもできます。これについては次のセクションで説明します。
+
+</Tip>
+
+### Load a large model in 8bit
+
+`.from_pretrained` メソッドを呼び出すときに `load_in_8bit=True` 引数を使用すると、メモリ要件をおよそ半分にしてモデルをロードできます。
+
+```python
+# pip install transformers accelerate bitsandbytes
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "bigscience/bloom-1b7"
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", load_in_8bit=True)
+```
+
+次に、通常 [`PreTrainedModel`] を使用するのと同じようにモデルを使用します。
+
+`get_memory_footprint` メソッドを使用して、モデルのメモリ フットプリントを確認できます。
+
+```python
+print(model.get_memory_footprint())
+```
+
+この統合により、大きなモデルを小さなデバイスにロードし、問題なく実行できるようになりました。
+
+<Tip warning={true}>
+モデルが 8 ビットでロードされると、最新の `transformers`と`bitsandbytes`を使用する場合を除き、量子化された重みをハブにプッシュすることは現在不可能であることに注意してください。 8 ビットの重みはまだサポートされていないため、トレーニングできないことにも注意してください。ただし、8 ビット モデルを使用して追加のパラメーターをトレーニングすることもできます。これについては次のセクションで説明します。
+また、`device_map` はオプションですが、利用可能なリソース上でモデルを効率的にディスパッチするため、推論には `device_map = 'auto'` を設定することが推奨されます。
+
+</Tip>
+
+#### Advanced use cases
+
+ここでは、FP4 量子化を使用して実行できるいくつかの高度な使用例について説明します。
+
+##### Change the compute dtype
+
+compute dtype は、計算中に使用される dtype を変更するために使用されます。たとえば、隠し状態は`float32`にありますが、高速化のために計算を bf16 に設定できます。デフォルトでは、compute dtype は `float32` に設定されます。
+
+```python
+import torch
+from transformers import BitsAndBytesConfig
+
+quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)
+```
+
+##### Using NF4 (Normal Float 4) data type 
+
+NF4 データ型を使用することもできます。これは、正規分布を使用して初期化された重みに適合した新しい 4 ビット データ型です。その実行のために:
+
+```python
+from transformers import BitsAndBytesConfig
+
+nf4_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+)
+
+model_nf4 = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=nf4_config)
+```
+
+##### Use nested quantization for more memory efficient inference
+
+また、ネストされた量子化手法を使用することをお勧めします。これにより、パフォーマンスを追加することなく、より多くのメモリが節約されます。経験的な観察から、これにより、NVIDIA-T4 16GB 上でシーケンス長 1024、バッチ サイズ 1、勾配累積ステップ 4 の llama-13b モデルを微調整することが可能になります。
+
+```python
+from transformers import BitsAndBytesConfig
+
+double_quant_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_use_double_quant=True,
+)
+
+model_double_quant = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=double_quant_config)
+```
+
+
+### Push quantized models on the 🤗 Hub
+
+`push_to_hub`メソッドを単純に使用することで、量子化されたモデルをハブにプッシュできます。これにより、最初に量子化構成ファイルがプッシュされ、次に量子化されたモデルの重みがプッシュされます。
+この機能を使用できるようにするには、必ず `bitsandbytes>0.37.2` を使用してください (この記事の執筆時点では、`bitsandbytes==0.38.0.post1` でテストしました)。
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m", device_map="auto", load_in_8bit=True)
+tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
+
+model.push_to_hub("bloom-560m-8bit")
+```
+
+<Tip warning={true}>
+
+大規模なモデルでは、ハブ上で 8 ビット モデルをプッシュすることが強く推奨されます。これにより、コミュニティはメモリ フットプリントの削減と、たとえば Google Colab での大規模なモデルの読み込みによる恩恵を受けることができます。
+
+</Tip>
+
+### Load a quantized model from the 🤗 Hub
+
+`from_pretrained`メソッドを使用して、ハブから量子化モデルをロードできます。属性 `quantization_config` がモデル設定オブジェクトに存在することを確認して、プッシュされた重みが量子化されていることを確認します。
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model = AutoModelForCausalLM.from_pretrained("{your_username}/bloom-560m-8bit", device_map="auto")
+```
+
+この場合、引数 `load_in_8bit=True` を指定する必要はありませんが、`bitsandbytes` と `accelerate` がインストールされていることを確認する必要があることに注意してください。
+また、`device_map` はオプションですが、利用可能なリソース上でモデルを効率的にディスパッチするため、推論には `device_map = 'auto'` を設定することが推奨されます。
+
+### Advanced use cases
+
+このセクションは、8 ビット モデルのロードと実行以外に何ができるかを探求したい上級ユーザーを対象としています。
+
+#### Offload between `cpu` and `gpu`
+
+この高度な使用例の 1 つは、モデルをロードし、`CPU`と`GPU`の間で重みをディスパッチできることです。 CPU 上でディスパッチされる重みは **8 ビットに変換されない**ため、`float32`に保持されることに注意してください。この機能は、非常に大規模なモデルを適合させ、そのモデルを GPU と CPU の間でディスパッチしたいユーザーを対象としています。
+
+まず、`transformers` から [`BitsAndBytesConfig`] をロードし、属性 `llm_int8_enable_fp32_cpu_offload` を `True` に設定します。
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+quantization_config = BitsAndBytesConfig(llm_int8_enable_fp32_cpu_offload=True)
+```
+
+`bigscience/bloom-1b7`モデルをロードする必要があり、`lm_head`を除くモデル全体に適合するのに十分な GPU RAM があるとします。したがって、次のようにカスタム device_map を作成します。
+
+```python
+device_map = {
+    "transformer.word_embeddings": 0,
+    "transformer.word_embeddings_layernorm": 0,
+    "lm_head": "cpu",
+    "transformer.h": 0,
+    "transformer.ln_f": 0,
+}
+```
+
+そして、次のようにモデルをロードします。
+```python
+model_8bit = AutoModelForCausalLM.from_pretrained(
+    "bigscience/bloom-1b7",
+    device_map=device_map,
+    quantization_config=quantization_config,
+)
+```
+
+以上です！モデルを楽しんでください！
+
+#### Play with `llm_int8_threshold`
+
+`llm_int8_threshold` 引数を操作して、外れ値のしきい値を変更できます。 外れ値 とは、特定のしきい値より大きい隠れた状態の値です。
+これは、`LLM.int8()`論文で説明されている外れ値検出の外れ値しきい値に対応します。このしきい値を超える隠し状態の値は外れ値とみなされ、それらの値に対する操作は fp16 で実行されます。通常、値は正規分布します。つまり、ほとんどの値は [-3.5, 3.5] の範囲内にありますが、大規模なモデルでは大きく異なる分布を示す例外的な系統的外れ値がいくつかあります。これらの外れ値は、多くの場合 [-60, -6] または [6, 60] の範囲内にあります。 Int8 量子化は、大きさが 5 程度までの値ではうまく機能しますが、それを超えると、パフォーマンスが大幅に低下します。適切なデフォルトのしきい値は 6 ですが、より不安定なモデル (小規模なモデル、微調整) では、より低いしきい値が必要になる場合があります。
+この引数は、モデルの推論速度に影響を与える可能性があります。このパラメータを試してみて、ユースケースに最適なパラメータを見つけることをお勧めします。
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+model_id = "bigscience/bloom-1b7"
+
+quantization_config = BitsAndBytesConfig(
+    llm_int8_threshold=10,
+)
+
+model_8bit = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map=device_map,
+    quantization_config=quantization_config,
+)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+```
+
+#### Skip the conversion of some modules
+
+一部のモデルには、安定性を確保するために 8 ビットに変換する必要がないモジュールがいくつかあります。たとえば、ジュークボックス モデルには、スキップする必要があるいくつかの `lm_head` モジュールがあります。 `llm_int8_skip_modules` で遊んでみる
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+model_id = "bigscience/bloom-1b7"
+
+quantization_config = BitsAndBytesConfig(
+    llm_int8_skip_modules=["lm_head"],
+)
+
+model_8bit = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map=device_map,
+    quantization_config=quantization_config,
+)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+```
+
+#### Fine-tune a model that has been loaded in 8-bit
+
+Hugging Face エコシステムのアダプターの公式サポートにより、8 ビットでロードされたモデルを微調整できます。
+これにより、単一の Google Colab で`flan-t5-large`や`facebook/opt-6.7b`などの大規模モデルを微調整することができます。詳細については、[`peft`](https://github.com/huggingface/peft) ライブラリをご覧ください。
+
+トレーニング用のモデルをロードするときに `device_map` を渡す必要がないことに注意してください。モデルが GPU に自動的にロードされます。必要に応じて、デバイス マップを特定のデバイスに設定することもできます (例: `cuda:0`、`0`、`torch.device('cuda:0')`)。 `device_map=auto`は推論のみに使用する必要があることに注意してください。
+
+### BitsAndBytesConfig
+
+[[autodoc]] BitsAndBytesConfig
+
+## Quantization with 🤗 `optimum` 
+
+`optimum`でサポートされている量子化方法の詳細については、[Optimum ドキュメント](https://huggingface.co/docs/optimum/index) を参照し、これらが自分のユースケースに適用できるかどうかを確認してください。
--- a/docs/source/ja/main_classes/text_generation.md
+++ b/docs/source/ja/main_classes/text_generation.md
@ -0,0 +1,63 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Generation
+
+各フレームワークには、それぞれの `GenerationMixin` クラスに実装されたテキスト生成のための Generate メソッドがあります。
+
+- PyTorch [`~generation.GenerationMixin.generate`] は [`~generation.GenerationMixin`] に実装されています。
+- TensorFlow [`~generation.TFGenerationMixin.generate`] は [`~generation.TFGenerationMixin`] に実装されています。
+- Flax/JAX [`~generation.FlaxGenerationMixin.generate`] は [`~generation.FlaxGenerationMixin`] に実装されています。
+
+選択したフレームワークに関係なく、[`~generation.GenerationConfig`] を使用して生成メソッドをパラメータ化できます。
+クラスインスタンス。動作を制御する生成パラメータの完全なリストについては、このクラスを参照してください。
+生成方法のこと。
+
+モデルの生成構成を検査する方法、デフォルトとは何か、パラメーターをアドホックに変更する方法を学習するには、
+カスタマイズされた生成構成を作成して保存する方法については、「
+[テキスト生成戦略ガイド](../generation_strategies)。このガイドでは、関連機能の使用方法についても説明しています。
+トークンストリーミングのような。
+
+## GenerationConfig
+
+[[autodoc]] generation.GenerationConfig
+	- from_pretrained
+	- from_model_config
+	- save_pretrained
+
+## GenerationMixin
+
+[[autodoc]] generation.GenerationMixin
+	- generate
+	- compute_transition_scores
+	- greedy_search
+	- sample
+	- beam_search
+	- beam_sample
+	- contrastive_search
+	- group_beam_search
+	- constrained_beam_search
+
+## TFGenerationMixin
+
+[[autodoc]] generation.TFGenerationMixin
+	- generate
+	- compute_transition_scores
+
+## FlaxGenerationMixin
+
+[[autodoc]] generation.FlaxGenerationMixin
+	- generate
--- a/docs/source/ja/main_classes/tokenizer.md
+++ b/docs/source/ja/main_classes/tokenizer.md
@ -0,0 +1,80 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Tokenizer
+
+トークナイザーは、モデルの入力の準備を担当します。ライブラリには、すべてのモデルのトークナイザーが含まれています。ほとんど
+トークナイザーの一部は、完全な Python 実装と、
+Rust ライブラリ [🤗 Tokenizers](https://github.com/huggingface/tokenizers)。 「高速」実装では次のことが可能になります。
+
+1. 特にバッチトークン化を行う場合の大幅なスピードアップと
+2. 元の文字列 (文字と単語) とトークン空間の間でマッピングする追加のメソッド (例:
+   特定の文字を含むトークンのインデックス、または特定のトークンに対応する文字の範囲）。
+
+基本クラス [`PreTrainedTokenizer`] および [`PreTrainedTokenizerFast`]
+モデル入力の文字列入力をエンコードし (以下を参照)、Python をインスタンス化/保存するための一般的なメソッドを実装します。
+ローカル ファイルまたはディレクトリ、またはライブラリによって提供される事前トレーニング済みトークナイザーからの「高速」トークナイザー
+(HuggingFace の AWS S3 リポジトリからダウンロード)。二人とも頼りにしているのは、
+共通メソッドを含む [`~tokenization_utils_base.PreTrainedTokenizerBase`]
+[`~tokenization_utils_base.SpecialTokensMixin`]。
+
+したがって、[`PreTrainedTokenizer`] と [`PreTrainedTokenizerFast`] はメインを実装します。
+すべてのトークナイザーを使用するためのメソッド:
+
+- トークン化 (文字列をサブワード トークン文字列に分割)、トークン文字列を ID に変換したり、その逆の変換を行ったりします。
+  エンコード/デコード (つまり、トークン化と整数への変換)。
+- 基礎となる構造 (BPE、SentencePiece...) から独立した方法で、語彙に新しいトークンを追加します。
+- 特別なトークン (マスク、文の始まりなど) の管理: トークンの追加、属性への割り当て。
+  トークナイザーにより、簡単にアクセスでき、トークン化中に分割されないようにすることができます。
+
+[`BatchEncoding`] は、
+[`~tokenization_utils_base.PreTrainedTokenizerBase`] のエンコード メソッド (`__call__`、
+`encode_plus` および `batch_encode_plus`) であり、Python 辞書から派生しています。トークナイザーが純粋な Python の場合
+tokenizer の場合、このクラスは標準の Python 辞書と同じように動作し、によって計算されたさまざまなモデル入力を保持します。
+これらのメソッド (`input_ids`、`attention_mask`...)。トークナイザーが「高速」トークナイザーである場合 (つまり、
+HuggingFace [トークナイザー ライブラリ](https://github.com/huggingface/tokenizers))、このクラスはさらに提供します
+元の文字列 (文字と単語) と
+トークンスペース (例: 指定された文字または対応する文字の範囲を構成するトークンのインデックスの取得)
+与えられたトークンに）。
+
+## PreTrainedTokenizer
+
+[[autodoc]] PreTrainedTokenizer
+    - __call__
+    - apply_chat_template
+    - batch_decode
+    - decode
+    - encode
+    - push_to_hub
+    - all
+
+## PreTrainedTokenizerFast
+
+[`PreTrainedTokenizerFast`] は [tokenizers](https://huggingface.co/docs/tokenizers) ライブラリに依存します。 🤗 トークナイザー ライブラリから取得したトークナイザーは、
+🤗 トランスに非常に簡単にロードされます。これがどのように行われるかを理解するには、[🤗 tokenizers からの tokenizers を使用する](../fast_tokenizers) ページを参照してください。
+
+[[autodoc]] PreTrainedTokenizerFast
+    - __call__
+    - apply_chat_template
+    - batch_decode
+    - decode
+    - encode
+    - push_to_hub
+    - all
+
+## BatchEncoding
+
+[[autodoc]] BatchEncoding
--- a/docs/source/ja/main_classes/trainer.md
+++ b/docs/source/ja/main_classes/trainer.md
@ -0,0 +1,728 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Trainer
+
+[`Trainer`] クラスは、ほとんどの標準的なユースケースに対して、PyTorch で機能を完全にトレーニングするための API を提供します。これは、[サンプル スクリプト](https://github.com/huggingface/transformers/tree/main/examples) のほとんどで使用されています。
+
+[`Trainer`] をインスタンス化する前に、トレーニング中にカスタマイズのすべてのポイントにアクセスするために [`TrainingArguments`] を作成します。
+
+この API は、複数の GPU/TPU での分散トレーニング、[NVIDIA Apex](https://github.com/NVIDIA/apex) および PyTorch のネイティブ AMP による混合精度をサポートします。
+
+[`Trainer`] には、上記の機能をサポートする基本的なトレーニング ループが含まれています。カスタム動作を挿入するには、それらをサブクラス化し、次のメソッドをオーバーライドします。
+
+- **get_train_dataloader** -- トレーニング データローダーを作成します。
+- **get_eval_dataloader** -- 評価用データローダーを作成します。
+- **get_test_dataloader** -- テスト データローダーを作成します。
+- **log** -- トレーニングを監視しているさまざまなオブジェクトに関する情報をログに記録します。
+- **create_optimizer_and_scheduler** -- オプティマイザと学習率スケジューラが渡されなかった場合にセットアップします。
+  初期化。 `create_optimizer`メソッドと`create_scheduler`メソッドをサブクラス化またはオーバーライドすることもできることに注意してください。
+  別々に。
+- **create_optimizer** -- init で渡されなかった場合にオプティマイザーをセットアップします。
+- **create_scheduler** -- init で渡されなかった場合、学習率スケジューラを設定します。
+- **compute_loss** - トレーニング入力のバッチの損失を計算します。
+- **training_step** -- トレーニング ステップを実行します。
+- **prediction_step** -- 評価/テスト ステップを実行します。
+- **evaluate** -- 評価ループを実行し、メトリクスを返します。
+- **predict** -- テスト セットの予測 (ラベルが使用可能な場合はメトリクスも含む) を返します。
+
+<Tip warning={true}>
+
+[`Trainer`] クラスは 🤗 Transformers モデル用に最適化されており、驚くべき動作をする可能性があります
+他の機種で使用する場合。独自のモデルで使用する場合は、次の点を確認してください。
+
+- モデルは常に [`~utils.ModelOutput`] のタプルまたはサブクラスを返します。
+- `labels` 引数が指定され、その損失が最初の値として返される場合、モデルは損失を計算できます。
+  タプルの要素 (モデルがタプルを返す場合)
+- モデルは複数のラベル引数を受け入れることができます ([`TrainingArguments`] で `label_names` を使用して、その名前を [`Trainer`] に示します) が、それらのいずれにも `"label"` という名前を付ける必要はありません。
+
+</Tip>
+
+以下は、加重損失を使用するように [`Trainer`] をカスタマイズする方法の例です (不均衡なトレーニング セットがある場合に役立ちます)。
+
+```python
+from torch import nn
+from transformers import Trainer
+
+
+class CustomTrainer(Trainer):
+    def compute_loss(self, model, inputs, return_outputs=False):
+        labels = inputs.pop("labels")
+        # forward pass
+        outputs = model(**inputs)
+        logits = outputs.get("logits")
+        # compute custom loss (suppose one has 3 labels with different weights)
+        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0], device=model.device))
+        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
+        return (loss, outputs) if return_outputs else loss
+```
+
+PyTorch [`Trainer`] のトレーニング ループの動作をカスタマイズするもう 1 つの方法は、トレーニング ループの状態を検査できる [callbacks](コールバック) を使用することです (進行状況レポート、TensorBoard または他の ML プラットフォームでのログ記録など)。決定（早期停止など）。
+
+## Trainer
+
+[[autodoc]] Trainer
+    - all
+
+## Seq2SeqTrainer
+
+[[autodoc]] Seq2SeqTrainer
+    - evaluate
+    - predict
+
+## TrainingArguments
+
+[[autodoc]] TrainingArguments
+    - all
+
+## Seq2SeqTrainingArguments
+
+[[autodoc]] Seq2SeqTrainingArguments
+    - all
+
+## Checkpoints
+
+デフォルトでは、[`Trainer`] はすべてのチェックポイントを、
+[`TrainingArguments`] を使用しています。これらは、xxx を含む`checkpoint-xxx`という名前のサブフォルダーに保存されます。
+それはトレーニングの段階でした。
+
+チェックポイントからトレーニングを再開するには、次のいずれかを使用して [`Trainer.train`] を呼び出します。
+
+- `resume_from_checkpoint=True` は最新のチェックポイントからトレーニングを再開します
+- `resume_from_checkpoint=checkpoint_dir` ディレクトリ内の特定のチェックポイントからトレーニングを再開します
+  合格した。
+
+さらに、`push_to_hub=True` を使用すると、モデル ハブにチェックポイントを簡単に保存できます。デフォルトでは、すべて
+中間チェックポイントに保存されたモデルは別のコミットに保存されますが、オプティマイザーの状態は保存されません。適応できます
+[`TrainingArguments`] の `hub-strategy` 値を次のいずれかにします。
+
+- `"checkpoint"`: 最新のチェックポイントも last-checkpoint という名前のサブフォルダーにプッシュされます。
+  `trainer.train(resume_from_checkpoint="output_dir/last-checkpoint")` を使用してトレーニングを簡単に再開します。
+- `"all_checkpoints"`: すべてのチェックポイントは、出力フォルダーに表示されるようにプッシュされます (したがって、1 つのチェックポイントが得られます)
+  最終リポジトリ内のフォルダーごとのチェックポイント フォルダー)
+
+## Logging
+
+デフォルトでは、[`Trainer`] はメインプロセスに `logging.INFO` を使用し、レプリカがある場合には `logging.WARNING` を使用します。
+
+これらのデフォルトは、[`TrainingArguments`] の 5 つの `logging` レベルのいずれかを使用するようにオーバーライドできます。
+引数:
+
+- `log_level` - メインプロセス用
+- `log_level_replica` - レプリカ用
+
+さらに、[`TrainingArguments`] の `log_on_each_node` が `False` に設定されている場合、メイン ノードのみが
+メイン プロセスのログ レベル設定を使用すると、他のすべてのノードはレプリカのログ レベル設定を使用します。
+
+[`Trainer`] は、`transformers` のログ レベルをノードごとに個別に設定することに注意してください。
+[`Trainer.__init__`]。したがって、他の機能を利用する場合は、これをより早く設定することをお勧めします (次の例を参照)。
+[`Trainer`] オブジェクトを作成する前の `transformers` 機能。
+
+これをアプリケーションで使用する方法の例を次に示します。
+
+```python
+[...]
+logger = logging.getLogger(__name__)
+
+# Setup logging
+logging.basicConfig(
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
+    datefmt="%m/%d/%Y %H:%M:%S",
+    handlers=[logging.StreamHandler(sys.stdout)],
+)
+
+# set the main code and the modules it uses to the same log-level according to the node
+log_level = training_args.get_process_log_level()
+logger.setLevel(log_level)
+datasets.utils.logging.set_verbosity(log_level)
+transformers.utils.logging.set_verbosity(log_level)
+
+trainer = Trainer(...)
+```
+
+そして、メイン ノードと他のすべてのノードで重複する可能性が高いものを出力しないように警告するだけを表示したい場合は、
+警告: 次のように実行できます。
+
+```bash
+my_app.py ... --log_level warning --log_level_replica error
+```
+
+マルチノード環境で、各ノードのメインプロセスのログを繰り返したくない場合は、次のようにします。
+上記を次のように変更します。
+
+```bash
+my_app.py ... --log_level warning --log_level_replica error --log_on_each_node 0
+```
+
+その後、最初のノードのメイン プロセスのみが「警告」レベルでログに記録され、メイン ノード上の他のすべてのプロセスはログに記録されます。
+ノードと他のノード上のすべてのプロセスは「エラー」レベルでログに記録されます。
+
+アプリケーションをできるだけ静かにする必要がある場合は、次のようにします。
+
+```bash
+my_app.py ... --log_level error --log_level_replica error --log_on_each_node 0
+```
+
+(マルチノード環境の場合は `--log_on_each_node 0` を追加します)
+
+## Randomness
+
+[`Trainer`] によって生成されたチェックポイントから再開する場合、すべての努力がその状態を復元するために行われます。
+_python_、_numpy_、および _pytorch_ の RNG 状態は、そのチェックポイントを保存した時点と同じ状態になります。
+これにより、「停止して再開」というスタイルのトレーニングが、ノンストップトレーニングに可能な限り近づけられるはずです。
+
+ただし、さまざまなデフォルトの非決定的な pytorch 設定により、これは完全に機能しない可能性があります。フルをご希望の場合は
+決定論については、[ランダム性のソースの制御](https://pytorch.org/docs/stable/notes/randomness) を参照してください。ドキュメントで説明されているように、これらの設定の一部は
+物事を決定論的にするもの (例: `torch.backends.cudnn.deterministic`) は物事を遅くする可能性があるため、これは
+デフォルトでは実行できませんが、必要に応じて自分で有効にすることができます。
+
+## Specific GPUs Selection
+
+どの GPU をどのような順序で使用するかをプログラムに指示する方法について説明します。
+
+[`DistributedDataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.Parallel.DistributedDataParallel.html) を使用して GPU のサブセットのみを使用する場合、使用する GPU の数を指定するだけです。 。たとえば、GPU が 4 つあるが、最初の 2 つを使用したい場合は、次のようにします。
+
+```bash
+python -m torch.distributed.launch --nproc_per_node=2  trainer-program.py ...
+```
+
+[`accelerate`](https://github.com/huggingface/accelerate) または [`deepspeed`](https://github.com/microsoft/DeepSpeed) がインストールされている場合は、次を使用して同じことを達成することもできます。の一つ：
+
+```bash
+accelerate launch --num_processes 2 trainer-program.py ...
+```
+
+```bash
+deepspeed --num_gpus 2 trainer-program.py ...
+```
+
+これらのランチャーを使用するために、Accelerate または [Deepspeed 統合](Deepspeed) 機能を使用する必要はありません。
+
+
+これまでは、プログラムに使用する GPU の数を指示できました。次に、特定の GPU を選択し、その順序を制御する方法について説明します。
+
+次の環境変数は、使用する GPU とその順序を制御するのに役立ちます。
+
+**`CUDA_VISIBLE_DEVICES`**
+
+複数の GPU があり、そのうちの 1 つまたはいくつかの GPU だけを使用したい場合は、環境変数 `CUDA_VISIBLE_DEVICES` を使用する GPU のリストに設定します。
+
+たとえば、4 つの GPU (0、1、2、3) があるとします。物理 GPU 0 と 2 のみで実行するには、次のようにします。
+
+```bash
+CUDA_VISIBLE_DEVICES=0,2 python -m torch.distributed.launch trainer-program.py ...
+```
+
+したがって、pytorch は 2 つの GPU のみを認識し、物理 GPU 0 と 2 はそれぞれ `cuda:0` と `cuda:1` にマッピングされます。
+
+順序を変更することもできます。
+
+```bash
+CUDA_VISIBLE_DEVICES=2,0 python -m torch.distributed.launch trainer-program.py ...
+```
+
+ここでは、物理 GPU 0 と 2 がそれぞれ`cuda:1`と`cuda:0`にマッピングされています。
+
+上記の例はすべて `DistributedDataParallel` 使用パターンのものですが、同じ方法が [`DataParallel`](https://pytorch.org/docs/stable/generated/torch.nn.DataParallel.html) でも機能します。
+
+
+```bash
+CUDA_VISIBLE_DEVICES=2,0 python trainer-program.py ...
+```
+
+GPU のない環境をエミュレートするには、次のようにこの環境変数を空の値に設定するだけです。
+
+```bash
+CUDA_VISIBLE_DEVICES= python trainer-program.py ...
+```
+
+他の環境変数と同様に、これらをコマンド ラインに追加する代わりに、次のようにエクスポートすることもできます。
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,2
+python -m torch.distributed.launch trainer-program.py ...
+```
+
+ただし、この方法では、以前に環境変数を設定したことを忘れて、なぜ間違った GPU が使用されているのか理解できない可能性があるため、混乱を招く可能性があります。したがって、このセクションのほとんどの例で示されているように、同じコマンド ラインで特定の実行に対してのみ環境変数を設定するのが一般的です。
+
+**`CUDA_DEVICE_ORDER`**
+
+物理デバイスの順序を制御する追加の環境変数 `CUDA_DEVICE_ORDER` があります。選択肢は次の 2 つです。
+
+1. PCIe バス ID 順 (`nvidia-smi` の順序と一致) - これがデフォルトです。
+
+```bash
+export CUDA_DEVICE_ORDER=PCI_BUS_ID
+```
+
+2. GPU コンピューティング能力順に並べる
+
+```bash
+export CUDA_DEVICE_ORDER=FASTEST_FIRST
+```
+
+ほとんどの場合、この環境変数を気にする必要はありませんが、古い GPU と新しい GPU が物理的に挿入されているため、遅い古いカードが遅くなっているように見えるような偏ったセットアップを行っている場合には、非常に役立ちます。初め。これを解決する 1 つの方法は、カードを交換することです。ただし、カードを交換できない場合 (デバイスの冷却が影響を受けた場合など)、`CUDA_DEVICE_ORDER=FASTEST_FIRST`を設定すると、常に新しい高速カードが最初に配置されます。ただし、`nvidia-smi`は依然として PCIe の順序でレポートするため、多少混乱するでしょう。
+
+順序を入れ替えるもう 1 つの解決策は、以下を使用することです。
+
+```bash
+export CUDA_VISIBLE_DEVICES=1,0
+```
+
+この例では 2 つの GPU だけを使用していますが、もちろん、コンピューターに搭載されている数の GPU にも同じことが当てはまります。
+
+また、この環境変数を設定する場合は、`~/.bashrc` ファイルまたはその他の起動設定ファイルに設定して、忘れるのが最善です。
+
+## Trainer Integrations
+
+[`Trainer`] は、トレーニングを劇的に改善する可能性のあるライブラリをサポートするように拡張されました。
+時間とはるかに大きなモデルに適合します。
+
+現在、サードパーティのソリューション [DeepSpeed](https://github.com/microsoft/DeepSpeed) および [PyTorch FSDP](https://pytorch.org/docs/stable/fsdp.html) をサポートしています。論文 [ZeRO: メモリの最適化]
+兆パラメータ モデルのトレーニングに向けて、Samyam Rajbhandari、Jeff Rasley、Olatunji Ruwase、Yuxiong He 著](https://arxiv.org/abs/1910.02054)。
+
+この提供されるサポートは、この記事の執筆時点では新しくて実験的なものです。 DeepSpeed と PyTorch FSDP のサポートはアクティブであり、それに関する問題は歓迎しますが、FairScale 統合は PyTorch メインに統合されているため、もうサポートしていません ([PyTorch FSDP 統合](#pytorch-fully-sharded-data-parallel))
+
+<a id='zero-install-notes'></a>
+
+### CUDA Extension Installation Notes
+
+この記事の執筆時点では、Deepspeed を使用するには、CUDA C++ コードをコンパイルする必要があります。
+
+すべてのインストールの問題は、[Deepspeed](https://github.com/microsoft/DeepSpeed/issues) の対応する GitHub の問題を通じて対処する必要がありますが、ビルド中に発生する可能性のある一般的な問題がいくつかあります。
+CUDA 拡張機能を構築する必要がある PyTorch 拡張機能。
+
+したがって、次の操作を実行中に CUDA 関連のビルドの問題が発生した場合は、次のとおりです。
+
+```bash
+pip install deepspeed
+```
+
+まず次の注意事項をお読みください。
+
+これらのノートでは、`pytorch` が CUDA `10.2` でビルドされた場合に何をすべきかの例を示します。あなたの状況が次のような場合
+異なる場合は、バージョン番号を目的のバージョンに調整することを忘れないでください。
+
+#### Possible problem #1
+
+Pytorch には独自の CUDA ツールキットが付属していますが、これら 2 つのプロジェクトをビルドするには、同一バージョンの CUDA が必要です。
+システム全体にインストールされます。
+
+たとえば、Python 環境に `cudatoolkit==10.2` を指定して `pytorch` をインストールした場合は、次のものも必要です。
+CUDA `10.2` がシステム全体にインストールされました。
+
+正確な場所はシステムによって異なる場合がありますが、多くのシステムでは`/usr/local/cuda-10.2`が最も一般的な場所です。
+Unix システム。 CUDA が正しく設定され、`PATH`環境変数に追加されると、
+次のようにしてインストール場所を指定します。
+
+
+```bash
+which nvcc
+```
+
+CUDA がシステム全体にインストールされていない場合は、最初にインストールしてください。お気に入りを使用して手順を見つけることができます
+検索エンジン。たとえば、Ubuntu を使用している場合は、[ubuntu cuda 10.2 install](https://www.google.com/search?q=ubuntu+cuda+10.2+install) を検索するとよいでしょう。
+
+#### Possible problem #2
+
+もう 1 つの考えられる一般的な問題は、システム全体に複数の CUDA ツールキットがインストールされている可能性があることです。たとえばあなた
+がある可能性があり：
+
+```bash
+/usr/local/cuda-10.2
+/usr/local/cuda-11.0
+```
+
+この状況では、`PATH` および `LD_LIBRARY_PATH` 環境変数に以下が含まれていることを確認する必要があります。
+目的の CUDA バージョンへの正しいパス。通常、パッケージ インストーラーは、これらに、
+最後のバージョンがインストールされました。適切なパッケージが見つからないためにパッケージのビルドが失敗するという問題が発生した場合は、
+CUDA バージョンがシステム全体にインストールされているにもかかわらず、前述の 2 つを調整する必要があることを意味します
+環境変数。
+
+まず、その内容を見てみましょう。
+
+```bash
+echo $PATH
+echo $LD_LIBRARY_PATH
+```
+
+それで、中に何が入っているかがわかります。
+
+`LD_LIBRARY_PATH` が空である可能性があります。
+
+`PATH` は実行可能ファイルが存在する場所をリストし、`LD_LIBRARY_PATH` は共有ライブラリの場所を示します。
+探すことです。どちらの場合も、前のエントリが後のエントリより優先されます。 `:` は複数を区切るために使用されます
+エントリ。
+
+ここで、ビルド プログラムに特定の CUDA ツールキットの場所を指示するには、最初にリストされる希望のパスを挿入します。
+やっていること：
+
+```bash
+export PATH=/usr/local/cuda-10.2/bin:$PATH
+export LD_LIBRARY_PATH=/usr/local/cuda-10.2/lib64:$LD_LIBRARY_PATH
+```
+
+既存の値を上書きするのではなく、先頭に追加することに注意してください。
+
+もちろん、必要に応じてバージョン番号やフルパスを調整します。割り当てたディレクトリが実際に機能することを確認してください
+存在する。 `lib64` サブディレクトリは、`libcudart.so` などのさまざまな CUDA `.so` オブジェクトが存在する場所です。
+システムでは別の名前が付けられますが、現実を反映するように調整してください。
+
+#### Possible problem #3
+
+一部の古い CUDA バージョンは、新しいコンパイラでのビルドを拒否する場合があります。たとえば、あなたは`gcc-9`を持っていますが、それが必要です
+`gcc-7`。
+
+それにはさまざまな方法があります。
+
+最新の CUDA ツールキットをインストールできる場合は、通常、新しいコンパイラがサポートされているはずです。
+
+あるいは、既に所有しているコンパイラに加えて、下位バージョンのコンパイラをインストールすることもできます。
+すでに存在しますが、デフォルトではないため、ビルドシステムはそれを認識できません。 「gcc-7」がインストールされているが、
+ビルドシステムが見つからないというメッセージを表示する場合は、次の方法で解決できる可能性があります。
+
+```bash
+sudo ln -s /usr/bin/gcc-7  /usr/local/cuda-10.2/bin/gcc
+sudo ln -s /usr/bin/g++-7  /usr/local/cuda-10.2/bin/g++
+```
+
+ここでは、`/usr/local/cuda-10.2/bin/gcc` から `gcc-7` へのシンボリックリンクを作成しています。
+`/usr/local/cuda-10.2/bin/` は `PATH` 環境変数内にある必要があります (前の問題の解決策を参照)。
+`gcc-7` (および `g++7`) が見つかるはずで、ビルドは成功します。
+
+いつものように、状況に合わせて例のパスを編集してください。
+
+### PyTorch Fully Sharded Data parallel
+
+より大きなバッチ サイズで巨大なモデルのトレーニングを高速化するには、完全にシャード化されたデータ並列モデルを使用できます。
+このタイプのデータ並列パラダイムでは、オプティマイザーの状態、勾配、パラメーターをシャーディングすることで、より多くのデータと大規模なモデルをフィッティングできます。
+この機能とその利点の詳細については、[完全シャーディング データ並列ブログ](https://pytorch.org/blog/introducing-pytorch-full-sharded-data-Parallel-api/) をご覧ください。
+最新の PyTorch の Fully Sharded Data Parallel (FSDP) トレーニング機能を統合しました。
+必要なのは、設定を通じて有効にすることだけです。
+
+**FSDP サポートに必要な PyTorch バージョン**: PyTorch Nightly (リリース後にこれを読んだ場合は 1.12.0)
+FSDP を有効にしたモデルの保存は、最近の修正でのみ利用できるためです。
+
+**使用法**：
+
+- 配布されたランチャーが追加されていることを確認してください
+まだ使用していない場合は、`-m torch.distributed.launch --nproc_per_node=NUMBER_OF_GPUS_YOU_HAVE`を使用します。
+
+- **シャーディング戦略**:
+  - FULL_SHARD : データ並列ワーカー/GPU にわたるシャード オプティマイザーの状態 + 勾配 + モデル パラメーター。
+    このためには、コマンドライン引数に`--fsdp full_shard`を追加します。
+  - SHARD_GRAD_OP : シャード オプティマイザーの状態 + データ並列ワーカー/GPU 全体の勾配。
+    このためには、コマンドライン引数に`--fsdp shard_grad_op`を追加します。
+  - NO_SHARD : シャーディングなし。このためには、コマンドライン引数に`--fsdp no_shard`を追加します。
+- パラメータと勾配を CPU にオフロードするには、
+  コマンドライン引数に`--fsdp "full_shard offload"`または`--fsdp "shard_grad_op offload"`を追加します。
+- `default_auto_wrap_policy` を使用して FSDP でレイヤーを自動的に再帰的にラップするには、
+  コマンドライン引数に`--fsdp "full_shard auto_wrap"`または`--fsdp "shard_grad_op auto_wrap"`を追加します。
+- CPU オフロードと自動ラッピングの両方を有効にするには、
+  コマンドライン引数に`--fsdp "full_shard offload auto_wrap"`または`--fsdp "shard_grad_op offload auto_wrap"`を追加します。
+- 残りの FSDP 構成は、`--fsdp_config <path_to_fsdp_config.json>`を介して渡されます。それは、次のいずれかの場所です。
+  FSDP json 構成ファイル (例: `fsdp_config.json`)、またはすでにロードされている json ファイルを `dict` として使用します。
+  - 自動ラッピングが有効な場合は、トランスベースの自動ラップ ポリシーまたはサイズ ベースの自動ラップ ポリシーを使用できます。
+    - トランスフォーマーベースの自動ラップポリシーの場合、構成ファイルで `fsdp_transformer_layer_cls_to_wrap` を指定することをお勧めします。指定しない場合、使用可能な場合、デフォルト値は `model._no_split_modules` になります。
+      これは、ラップするトランスフォーマー層クラス名のリスト (大文字と小文字を区別) を指定します (例: [`BertLayer`]、[`GPTJBlock`]、[`T5Block`] ...)。
+      重みを共有するサブモジュール (埋め込み層など) が異なる FSDP ラップされたユニットにならないようにする必要があるため、これは重要です。
+      このポリシーを使用すると、マルチヘッド アテンションとそれに続くいくつかの MLP レイヤーを含むブロックごとにラッピングが発生します。
+      共有埋め込みを含む残りの層は、同じ最も外側の FSDP ユニットにラップされるのが便利です。
+      したがって、トランスベースのモデルにはこれを使用してください。
+    - サイズベースの自動ラップポリシーの場合は、設定ファイルに`fsdp_min_num_params`を追加してください。
+      自動ラッピングのための FSDP のパラメータの最小数を指定します。
+  - 設定ファイルで `fsdp_backward_prefetch` を指定できるようになりました。次のパラメータのセットをいつプリフェッチするかを制御します。
+    `backward_pre` と `backward_pos` が利用可能なオプションです。
+    詳細については、`torch.distributed.fsdp.full_sharded_data_Parallel.BackwardPrefetch`を参照してください。
+  - 設定ファイルで `fsdp_forward_prefetch` を指定できるようになりました。次のパラメータのセットをいつプリフェッチするかを制御します。
+    `True`の場合、FSDP はフォワード パスでの実行中に、次に来るオールギャザーを明示的にプリフェッチします。
+  - 設定ファイルで `limit_all_gathers` を指定できるようになりました。
+    `True`の場合、FSDP は CPU スレッドを明示的に同期して、実行中のオールギャザが多すぎるのを防ぎます。
+  - `activation_checkpointing`を設定ファイルで指定できるようになりました。
+    `True`の場合、FSDP アクティベーション チェックポイントは、FSDP のアクティベーションをクリアすることでメモリ使用量を削減する手法です。
+    特定のレイヤーを処理し、バックワード パス中にそれらを再計算します。事実上、これは余分な計算時間を犠牲にします
+    メモリ使用量を削減します。
+
+**注意すべき注意点がいくつかあります**
+- これは `generate` と互換性がないため、 `--predict_with_generate` とも互換性がありません
+  すべての seq2seq/clm スクリプト (翻訳/要約/clm など)。
+  問題 [#21667](https://github.com/huggingface/transformers/issues/21667) を参照してください。
+
+### PyTorch/XLA Fully Sharded Data parallel
+
+TPU ユーザーの皆様に朗報です。 PyTorch/XLA は FSDP をサポートするようになりました。
+最新の Fully Sharded Data Parallel (FSDP) トレーニングがすべてサポートされています。
+詳細については、[FSDP を使用した Cloud TPU での PyTorch モデルのスケーリング](https://pytorch.org/blog/scaling-pytorch-models-on-cloud-tpus-with-fsdp/) および [PyTorch/XLA 実装 を参照してください。 FSDP の](https://github.com/pytorch/xla/tree/master/torch_xla/distributed/fsdp)
+必要なのは、設定を通じて有効にすることだけです。
+
+**FSDP サポートに必要な PyTorch/XLA バージョン**: >=2.0
+
+**使用法**：
+
+`--fsdp "full shard"` を、`--fsdp_config <path_to_fsdp_config.json>` に加えられる次の変更とともに渡します。
+- PyTorch/XLA FSDP を有効にするには、`xla`を`True`に設定する必要があります。
+- `xla_fsdp_settings` 値は、XLA FSDP ラッピング パラメータを格納する辞書です。
+  オプションの完全なリストについては、[こちら](
+  https://github.com/pytorch/xla/blob/master/torch_xla/distributed/fsdp/xla_full_sharded_data_Parallel.py)。
+- `xla_fsdp_grad_ckpt`。 `True`の場合、ネストされた XLA FSDP でラップされた各レイヤー上で勾配チェックポイントを使用します。
+  この設定は、xla フラグが true に設定されており、自動ラッピング ポリシーが指定されている場合にのみ使用できます。
+  `fsdp_min_num_params` または `fsdp_transformer_layer_cls_to_wrap`。
+- トランスフォーマー ベースの自動ラップ ポリシーまたはサイズ ベースの自動ラップ ポリシーのいずれかを使用できます。
+  - トランスフォーマーベースの自動ラップポリシーの場合、構成ファイルで `fsdp_transformer_layer_cls_to_wrap` を指定することをお勧めします。指定しない場合、使用可能な場合、デフォルト値は `model._no_split_modules` になります。
+    これは、ラップするトランスフォーマー層クラス名のリスト (大文字と小文字を区別) を指定します (例: [`BertLayer`]、[`GPTJBlock`]、[`T5Block`] ...)。
+    重みを共有するサブモジュール (埋め込み層など) が異なる FSDP ラップされたユニットにならないようにする必要があるため、これは重要です。
+    このポリシーを使用すると、マルチヘッド アテンションとそれに続くいくつかの MLP レイヤーを含むブロックごとにラッピングが発生します。
+    共有埋め込みを含む残りの層は、同じ最も外側の FSDP ユニットにラップされるのが便利です。
+    したがって、トランスベースのモデルにはこれを使用してください。
+  - サイズベースの自動ラップポリシーの場合は、設定ファイルに`fsdp_min_num_params`を追加してください。
+    自動ラッピングのための FSDP のパラメータの最小数を指定します。
+
+### Using Trainer for accelerated PyTorch Training on Mac 
+
+PyTorch v1.12 リリースにより、開発者と研究者は Apple シリコン GPU を利用してモデル トレーニングを大幅に高速化できます。
+これにより、プロトタイピングや微調整などの機械学習ワークフローを Mac 上でローカルで実行できるようになります。
+PyTorch のバックエンドとしての Apple の Metal Performance Shaders (MPS) はこれを可能にし、新しい `"mps"` デバイス経由で使用できます。
+これにより、計算グラフとプリミティブが MPS Graph フレームワークと MPS によって提供される調整されたカーネルにマッピングされます。
+詳細については、公式ドキュメント [Mac での Accelerated PyTorch Training の紹介](https://pytorch.org/blog/introducing-accelerated-pytorch-training-on-mac/) を参照してください。
+および [MPS バックエンド](https://pytorch.org/docs/stable/notes/mps.html)。
+
+<Tip warning={false}>
+
+MacOS マシンに PyTorch >= 1.13 (執筆時点ではナイトリー バージョン) をインストールすることを強くお勧めします。
+トランスベースのモデルのモデルの正確性とパフォーマンスの向上に関連する主要な修正が行われています。
+詳細については、https://github.com/pytorch/pytorch/issues/82707 を参照してください。
+
+</Tip>
+
+**Apple Silicon チップを使用したトレーニングと推論の利点**
+
+1. ユーザーがローカルで大規模なネットワークやバッチ サイズをトレーニングできるようにします
+2. ユニファイド メモリ アーキテクチャにより、データ取得の遅延が短縮され、GPU がメモリ ストア全体に直接アクセスできるようになります。
+したがって、エンドツーエンドのパフォーマンスが向上します。
+3. クラウドベースの開発に関連するコストや追加のローカル GPU の必要性を削減します。
+
+**前提条件**: mps サポートを備えたトーチをインストールするには、
+この素晴らしいメディア記事 [GPU アクセラレーションが M1 Mac の PyTorch に登場](https://medium.com/towards-data-science/gpu-acceleration-comes-to-pytorch-on-m1-macs-195c399efcc1) に従ってください。 。
+
+**使用法**：
+`mps` デバイスは、`cuda` デバイスが使用される方法と同様に利用可能な場合、デフォルトで使用されます。
+したがって、ユーザーによるアクションは必要ありません。
+たとえば、以下のコマンドを使用して、Apple Silicon GPU を使用して公式の Glue テキスト分類タスクを (ルート フォルダーから) 実行できます。
+
+```bash
+export TASK_NAME=mrpc
+
+python examples/pytorch/text-classification/run_glue.py \
+  --model_name_or_path bert-base-cased \
+  --task_name $TASK_NAME \
+  --do_train \
+  --do_eval \
+  --max_seq_length 128 \
+  --per_device_train_batch_size 32 \
+  --learning_rate 2e-5 \
+  --num_train_epochs 3 \
+  --output_dir /tmp/$TASK_NAME/ \
+  --overwrite_output_dir
+```
+
+**注意すべきいくつかの注意事項**
+
+1. 一部の PyTorch 操作は mps に実装されていないため、エラーがスローされます。
+これを回避する 1 つの方法は、環境変数 `PYTORCH_ENABLE_MPS_FALLBACK=1` を設定することです。
+これらの操作では CPU にフォールバックします。ただし、それでも UserWarning がスローされます。
+2. 分散セットアップ`gloo`および`nccl`は、`mps`デバイスでは動作しません。
+これは、現在「mps」デバイス タイプの単一 GPU のみを使用できることを意味します。
+
+最後に、覚えておいてください。 🤗 `Trainer` は MPS バックエンドのみを統合するため、
+MPS バックエンドの使用に関して問題や質問がある場合は、
+[PyTorch GitHub](https://github.com/pytorch/pytorch/issues) に問題を提出してください。
+
+## Using Accelerate Launcher with Trainer
+
+加速してトレーナーにパワーを与えましょう。ユーザーが期待することに関しては、次のとおりです。
+- トレーナー引数に対して FSDP、DeepSpeed などのトレーナー インテレーションを変更せずに使用し続けることができます。
+- トレーナーで Accelerate Launcher を使用できるようになりました (推奨)。
+
+トレーナーで Accelerate Launcher を使用する手順:
+1. 🤗 Accelerate がインストールされていることを確認してください。Accelerate がないと `Trainer` を使用することはできません。そうでない場合は、`pip install accelerate`してください。 Accelerate のバージョンを更新する必要がある場合もあります: `pip install activate --upgrade`
+2. `accelerate config`を実行し、アンケートに記入します。以下は加速設定の例です。
+  ａ． DDP マルチノード マルチ GPU 構成:
+    ```yaml
+    compute_environment: LOCAL_MACHINE                                                                                             
+    distributed_type: MULTI_GPU                                                                                                    
+    downcast_bf16: 'no'
+    gpu_ids: all
+    machine_rank: 0 #change rank as per the node
+    main_process_ip: 192.168.20.1
+    main_process_port: 9898
+    main_training_function: main
+    mixed_precision: fp16
+    num_machines: 2
+    num_processes: 8
+    rdzv_backend: static
+    same_network: true
+    tpu_env: []
+    tpu_use_cluster: false
+    tpu_use_sudo: false
+    use_cpu: false
+    ```
+
+  b. FSDP config:
+    ```yaml
+    compute_environment: LOCAL_MACHINE
+    distributed_type: FSDP
+    downcast_bf16: 'no'
+    fsdp_config:
+      fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
+      fsdp_backward_prefetch_policy: BACKWARD_PRE
+      fsdp_forward_prefetch: true
+      fsdp_offload_params: false
+      fsdp_sharding_strategy: 1
+      fsdp_state_dict_type: FULL_STATE_DICT
+      fsdp_sync_module_states: true
+      fsdp_transformer_layer_cls_to_wrap: BertLayer
+      fsdp_use_orig_params: true
+    machine_rank: 0
+    main_training_function: main
+    mixed_precision: bf16
+    num_machines: 1
+    num_processes: 2
+    rdzv_backend: static
+    same_network: true
+    tpu_env: []
+    tpu_use_cluster: false
+    tpu_use_sudo: false
+    use_cpu: false
+    ```
+  c.ファイルを指す DeepSpeed 構成:
+    ```yaml
+    compute_environment: LOCAL_MACHINE
+    deepspeed_config:
+      deepspeed_config_file: /home/user/configs/ds_zero3_config.json
+      zero3_init_flag: true
+    distributed_type: DEEPSPEED
+    downcast_bf16: 'no'
+    machine_rank: 0
+    main_training_function: main
+    num_machines: 1
+    num_processes: 4
+    rdzv_backend: static
+    same_network: true
+    tpu_env: []
+    tpu_use_cluster: false
+    tpu_use_sudo: false
+    use_cpu: false
+    ``` 
+
+  d.加速プラグインを使用した DeepSpeed 構成:
+
+    ```yaml
+    compute_environment: LOCAL_MACHINE                                                                                             
+    deepspeed_config:                                                                                                              
+      gradient_accumulation_steps: 1
+      gradient_clipping: 0.7
+      offload_optimizer_device: cpu
+      offload_param_device: cpu
+      zero3_init_flag: true
+      zero_stage: 2
+    distributed_type: DEEPSPEED
+    downcast_bf16: 'no'
+    machine_rank: 0
+    main_training_function: main
+    mixed_precision: bf16
+    num_machines: 1
+    num_processes: 4
+    rdzv_backend: static
+    same_network: true
+    tpu_env: []
+    tpu_use_cluster: false
+    tpu_use_sudo: false
+    use_cpu: false
+    ```
+
+3. 加速設定またはランチャー引数によって上記で処理された引数以外の引数を使用して、トレーナー スクリプトを実行します。
+以下は、上記の FSDP 構成で`accelerate launcher`を使用して`run_glue.py`を実行する例です。 
+
+```bash
+cd transformers
+
+accelerate launch \
+./examples/pytorch/text-classification/run_glue.py \
+--model_name_or_path bert-base-cased \
+--task_name $TASK_NAME \
+--do_train \
+--do_eval \
+--max_seq_length 128 \
+--per_device_train_batch_size 16 \
+--learning_rate 5e-5 \
+--num_train_epochs 3 \
+--output_dir /tmp/$TASK_NAME/ \
+--overwrite_output_dir
+```
+
+4. `accelerate launch`するための cmd 引数を直接使用することもできます。上の例は次のようにマッピングされます。
+
+```bash
+cd transformers
+
+accelerate launch --num_processes=2 \
+--use_fsdp \
+--mixed_precision=bf16 \
+--fsdp_auto_wrap_policy=TRANSFORMER_BASED_WRAP  \
+--fsdp_transformer_layer_cls_to_wrap="BertLayer" \
+--fsdp_sharding_strategy=1 \
+--fsdp_state_dict_type=FULL_STATE_DICT \
+./examples/pytorch/text-classification/run_glue.py
+--model_name_or_path bert-base-cased \
+--task_name $TASK_NAME \
+--do_train \
+--do_eval \
+--max_seq_length 128 \
+--per_device_train_batch_size 16 \
+--learning_rate 5e-5 \
+--num_train_epochs 3 \
+--output_dir /tmp/$TASK_NAME/ \
+--overwrite_output_dir
+```
+
+詳細については、🤗 Accelerate CLI ガイドを参照してください: [🤗 Accelerate スクリプトの起動](https://huggingface.co/docs/accelerate/basic_tutorials/launch)。
+
+移動されたセクション:
+
+[ <a href="./deepspeed#deepspeed-trainer-integration">DeepSpeed</a><a id="deepspeed"></a>
+| <a href="./deepspeed#deepspeed-installation">Installation</a><a id="installation"></a>
+| <a href="./deepspeed#deepspeed-multi-gpu">Deployment with multiple GPUs</a><a id="deployment-with-multiple-gpus"></a>
+| <a href="./deepspeed#deepspeed-one-gpu">Deployment with one GPU</a><a id="deployment-with-one-gpu"></a>
+| <a href="./deepspeed#deepspeed-notebook">Deployment in Notebooks</a><a id="deployment-in-notebooks"></a>
+| <a href="./deepspeed#deepspeed-config">Configuration</a><a id="configuration"></a>
+| <a href="./deepspeed#deepspeed-config-passing">Passing Configuration</a><a id="passing-configuration"></a>
+| <a href="./deepspeed#deepspeed-config-shared">Shared Configuration</a><a id="shared-configuration"></a>
+| <a href="./deepspeed#deepspeed-zero">ZeRO</a><a id="zero"></a>
+| <a href="./deepspeed#deepspeed-zero2-config">ZeRO-2 Config</a><a id="zero-2-config"></a>
+| <a href="./deepspeed#deepspeed-zero3-config">ZeRO-3 Config</a><a id="zero-3-config"></a>
+| <a href="./deepspeed#deepspeed-nvme">NVMe Support</a><a id="nvme-support"></a>
+| <a href="./deepspeed#deepspeed-zero2-zero3-performance">ZeRO-2 vs ZeRO-3 Performance</a><a id="zero-2-vs-zero-3-performance"></a>
+| <a href="./deepspeed#deepspeed-zero2-example">ZeRO-2 Example</a><a id="zero-2-example"></a>
+| <a href="./deepspeed#deepspeed-zero3-example">ZeRO-3 Example</a><a id="zero-3-example"></a>
+| <a href="./deepspeed#deepspeed-optimizer">Optimizer</a><a id="optimizer"></a>
+| <a href="./deepspeed#deepspeed-scheduler">Scheduler</a><a id="scheduler"></a>
+| <a href="./deepspeed#deepspeed-fp32">fp32 Precision</a><a id="fp32-precision"></a>
+| <a href="./deepspeed#deepspeed-amp">Automatic Mixed Precision</a><a id="automatic-mixed-precision"></a>
+| <a href="./deepspeed#deepspeed-bs">Batch Size</a><a id="batch-size"></a>
+| <a href="./deepspeed#deepspeed-grad-acc">Gradient Accumulation</a><a id="gradient-accumulation"></a>
+| <a href="./deepspeed#deepspeed-grad-clip">Gradient Clipping</a><a id="gradient-clipping"></a>
+| <a href="./deepspeed#deepspeed-weight-extraction">Getting The Model Weights Out</a><a id="getting-the-model-weights-out"></a>
+]
--- a/docs/source/ms/index.md
+++ b/docs/source/ms/index.md
@ -180,7 +180,7 @@ Dokumentasi disusun kepada lima bahagian:
 1. **[NLLB-MOE](model_doc/nllb-moe)** (from Meta) released with the paper [No Language Left Behind: Scaling Human-Centered Machine Translation](https://arxiv.org/abs/2207.04672) by the NLLB team.
 1. **[Nyströmformer](model_doc/nystromformer)** (from the University of Wisconsin - Madison) released with the paper [Nyströmformer: A Nyström-Based Algorithm for Approximating Self-Attention](https://arxiv.org/abs/2102.03902) by Yunyang Xiong, Zhanpeng Zeng, Rudrasis Chakraborty, Mingxing Tan, Glenn Fung, Yin Li, Vikas Singh.
 1. **[OneFormer](model_doc/oneformer)** (from SHI Labs) released with the paper [OneFormer: One Transformer to Rule Universal Image Segmentation](https://arxiv.org/abs/2211.06220) by Jitesh Jain, Jiachen Li, MangTik Chiu, Ali Hassani, Nikita Orlov, Humphrey Shi.
-1. **[OpenLlama](model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released in [Open-Llama](https://github.com/s-JoL/Open-Llama). 
+1. **[OpenLlama](model_doc/open-llama)** (from [s-JoL](https://huggingface.co/s-JoL)) released on GitHub (now removed).
 1. **[OPT](master/model_doc/opt)** (from Meta AI) released with the paper [OPT: Open Pre-trained Transformer Language Models](https://arxiv.org/abs/2205.01068) by Susan Zhang, Stephen Roller, Naman Goyal, Mikel Artetxe, Moya Chen, Shuohui Chen et al.
 1. **[OWL-ViT](model_doc/owlvit)** (from Google AI) released with the paper [Simple Open-Vocabulary Object Detection with Vision Transformers](https://arxiv.org/abs/2205.06230) by Matthias Minderer, Alexey Gritsenko, Austin Stone, Maxim Neumann, Dirk Weissenborn, Alexey Dosovitskiy, Aravindh Mahendran, Anurag Arnab, Mostafa Dehghani, Zhuoran Shen, Xiao Wang, Xiaohua Zhai, Thomas Kipf, and Neil Houlsby.
 1. **[Pegasus](model_doc/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
--- a/docs/source/zh/_toctree.yml
+++ b/docs/source/zh/_toctree.yml
@ -7,10 +7,32 @@
    title: 安装
  title: 开始使用
 - sections:
+  - local: pipeline_tutorial
+    title: 使用pipelines进行推理
+  - local: preprocessing
+    title: 预处理数据
+  - local: training
+    title: 微调预训练模型
  - local: accelerate
-    title: 加速分布式训练
+    title: 使用🤗Accelerate进行分布式训练
+  - local: transformers_agents
+    title: agents教程
  title: 教程
 - sections:
  - local: fast_tokenizers
    title: 使用 🤗 Tokenizers 中的分词器
-  title: 开发者指南
+  - local: multilingual
+    title: 使用多语言模型进行推理
+  - local: create_a_model
+    title: 使用特定于模型的 API
+  - local: custom_models
+    title: 共享自定义模型
+  - local: serialization
+    title: 导出为 ONNX
+  - local: tflite
+    title: 导出为 TFLite
+  title: 开发者指南
+- sections:
+  - local: task_summary
+    title: 🤗Transformers能做什么
+  title: 概念指南
--- a/docs/source/zh/create_a_model.md
+++ b/docs/source/zh/create_a_model.md
@ -0,0 +1,389 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 创建自定义架构
+
+[`AutoClass`](model_doc/auto) 自动推断模型架构并下载预训练的配置和权重。一般来说，我们建议使用 `AutoClass` 生成与检查点（checkpoint）无关的代码。希望对特定模型参数有更多控制的用户，可以仅从几个基类创建自定义的 🤗 Transformers 模型。这对于任何有兴趣学习、训练或试验 🤗 Transformers 模型的人可能特别有用。通过本指南，深入了解如何不通过 `AutoClass` 创建自定义模型。了解如何：
+
+- 加载并自定义模型配置。
+- 创建模型架构。
+- 为文本创建慢速和快速分词器。
+- 为视觉任务创建图像处理器。
+- 为音频任务创建特征提取器。
+- 为多模态任务创建处理器。
+
+## 配置
+
+[配置](main_classes/configuration) 涉及到模型的具体属性。每个模型配置都有不同的属性；例如，所有 NLP 模型都共享 `hidden_size`、`num_attention_heads`、 `num_hidden_layers` 和 `vocab_size` 属性。这些属性用于指定构建模型时的注意力头数量或隐藏层层数。
+
+访问 [`DistilBertConfig`] 以更近一步了解 [DistilBERT](model_doc/distilbert)，检查它的属性：
+
+```py
+>>> from transformers import DistilBertConfig
+
+>>> config = DistilBertConfig()
+>>> print(config)
+DistilBertConfig {
+  "activation": "gelu",
+  "attention_dropout": 0.1,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "transformers_version": "4.16.2",
+  "vocab_size": 30522
+}
+```
+
+[`DistilBertConfig`] 显示了构建基础 [`DistilBertModel`] 所使用的所有默认属性。所有属性都可以进行自定义，为实验创造了空间。例如，您可以将默认模型自定义为：
+
+- 使用 `activation` 参数尝试不同的激活函数。
+- 使用 `attention_dropout` 参数为 attention probabilities 使用更高的 dropout ratio。
+
+```py
+>>> my_config = DistilBertConfig(activation="relu", attention_dropout=0.4)
+>>> print(my_config)
+DistilBertConfig {
+  "activation": "relu",
+  "attention_dropout": 0.4,
+  "dim": 768,
+  "dropout": 0.1,
+  "hidden_dim": 3072,
+  "initializer_range": 0.02,
+  "max_position_embeddings": 512,
+  "model_type": "distilbert",
+  "n_heads": 12,
+  "n_layers": 6,
+  "pad_token_id": 0,
+  "qa_dropout": 0.1,
+  "seq_classif_dropout": 0.2,
+  "sinusoidal_pos_embds": false,
+  "transformers_version": "4.16.2",
+  "vocab_size": 30522
+}
+```
+
+预训练模型的属性可以在 [`~PretrainedConfig.from_pretrained`] 函数中进行修改：
+
+```py
+>>> my_config = DistilBertConfig.from_pretrained("distilbert-base-uncased", activation="relu", attention_dropout=0.4)
+```
+
+当你对模型配置满意时，可以使用 [`~PretrainedConfig.save_pretrained`] 来保存配置。你的配置文件将以 JSON 文件的形式存储在指定的保存目录中：
+
+```py
+>>> my_config.save_pretrained(save_directory="./your_model_save_path")
+```
+
+要重用配置文件，请使用 [`~PretrainedConfig.from_pretrained`] 进行加载：
+
+```py
+>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json")
+```
+
+<Tip>
+
+你还可以将配置文件保存为字典，甚至只保存自定义配置属性与默认配置属性之间的差异！有关更多详细信息，请参阅 [配置](main_classes/configuration) 文档。
+
+</Tip>
+
+## 模型
+
+接下来，创建一个[模型](main_classes/models)。模型，也可泛指架构，定义了每一层网络的行为以及进行的操作。配置中的 `num_hidden_layers` 等属性用于定义架构。每个模型都共享基类 [`PreTrainedModel`] 和一些常用方法，例如调整输入嵌入的大小和修剪自注意力头。此外，所有模型都是 [`torch.nn.Module`](https://pytorch.org/docs/stable/generated/torch.nn.Module.html)、[`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) 或 [`flax.linen.Module`](https://flax.readthedocs.io/en/latest/flax.linen.html#module) 的子类。这意味着模型与各自框架的用法兼容。
+
+<frameworkcontent>
+<pt>
+将自定义配置属性加载到模型中：
+
+```py
+>>> from transformers import DistilBertModel
+
+>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/config.json")
+>>> model = DistilBertModel(my_config)
+```
+
+这段代码创建了一个具有随机参数而不是预训练权重的模型。在训练该模型之前，您还无法将该模型用于任何用途。训练是一项昂贵且耗时的过程。通常来说，最好使用预训练模型来更快地获得更好的结果，同时仅使用训练所需资源的一小部分。
+
+使用 [`~PreTrainedModel.from_pretrained`] 创建预训练模型：
+
+```py
+>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased")
+```
+
+当加载预训练权重时，如果模型是由 🤗 Transformers 提供的，将自动加载默认模型配置。然而，如果你愿意，仍然可以将默认模型配置的某些或者所有属性替换成你自己的配置：
+
+```py
+>>> model = DistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config)
+```
+</pt>
+<tf>
+将自定义配置属性加载到模型中：
+
+```py
+>>> from transformers import TFDistilBertModel
+
+>>> my_config = DistilBertConfig.from_pretrained("./your_model_save_path/my_config.json")
+>>> tf_model = TFDistilBertModel(my_config)
+```
+
+这段代码创建了一个具有随机参数而不是预训练权重的模型。在训练该模型之前，您还无法将该模型用于任何用途。训练是一项昂贵且耗时的过程。通常来说，最好使用预训练模型来更快地获得更好的结果，同时仅使用训练所需资源的一小部分。
+
+使用 [`~TFPreTrainedModel.from_pretrained`] 创建预训练模型：
+
+```py
+>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")
+```
+
+当加载预训练权重时，如果模型是由 🤗 Transformers 提供的，将自动加载默认模型配置。然而，如果你愿意，仍然可以将默认模型配置的某些或者所有属性替换成自己的配置：
+
+```py
+>>> tf_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased", config=my_config)
+```
+</tf>
+</frameworkcontent>
+
+### 模型头（Model heads）
+
+此时，你已经有了一个输出*隐藏状态*的基础 DistilBERT 模型。隐藏状态作为输入传递到模型头以生成最终输出。🤗 Transformers 为每个任务提供不同的模型头，只要模型支持该任务（即，您不能使用 DistilBERT 来执行像翻译这样的序列到序列任务）。
+
+<frameworkcontent>
+<pt>
+例如，[`DistilBertForSequenceClassification`] 是一个带有序列分类头（sequence classification head）的基础 DistilBERT 模型。序列分类头是池化输出之上的线性层。
+
+```py
+>>> from transformers import DistilBertForSequenceClassification
+
+>>> model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
+```
+
+通过切换到不同的模型头，可以轻松地将此检查点重复用于其他任务。对于问答任务，你可以使用 [`DistilBertForQuestionAnswering`] 模型头。问答头（question answering head）与序列分类头类似，不同点在于它是隐藏状态输出之上的线性层。
+
+```py
+>>> from transformers import DistilBertForQuestionAnswering
+
+>>> model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+```
+</pt>
+<tf>
+例如，[`TFDistilBertForSequenceClassification`] 是一个带有序列分类头（sequence classification head）的基础 DistilBERT 模型。序列分类头是池化输出之上的线性层。
+
+```py
+>>> from transformers import TFDistilBertForSequenceClassification
+
+>>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
+```
+
+通过切换到不同的模型头,可以轻松地将此检查点重复用于其他任务。对于问答任务，你可以使用 [`TFDistilBertForQuestionAnswering`] 模型头。问答头（question answering head）与序列分类头类似，不同点在于它是隐藏状态输出之上的线性层。
+
+```py
+>>> from transformers import TFDistilBertForQuestionAnswering
+
+>>> tf_model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+```
+</tf>
+</frameworkcontent>
+
+## 分词器
+
+在将模型用于文本数据之前，你需要的最后一个基类是 [tokenizer](main_classes/tokenizer)，它用于将原始文本转换为张量。🤗 Transformers 支持两种类型的分词器：
+
+- [`PreTrainedTokenizer`]：分词器的Python实现
+- [`PreTrainedTokenizerFast`]：来自我们基于 Rust 的 [🤗 Tokenizer](https://huggingface.co/docs/tokenizers/python/latest/) 库的分词器。因为其使用了 Rust 实现，这种分词器类型的速度要快得多，尤其是在批量分词（batch tokenization）的时候。快速分词器还提供其他的方法，例如*偏移映射（offset mapping）*，它将标记（token）映射到其原始单词或字符。
+
+这两种分词器都支持常用的方法，如编码和解码、添加新标记以及管理特殊标记。
+
+<Tip warning={true}>
+
+并非每个模型都支持快速分词器。参照这张 [表格](index#supported-frameworks) 查看模型是否支持快速分词器。
+
+</Tip>
+
+如果您训练了自己的分词器，则可以从*词表*文件创建一个分词器：
+
+```py
+>>> from transformers import DistilBertTokenizer
+
+>>> my_tokenizer = DistilBertTokenizer(vocab_file="my_vocab_file.txt", do_lower_case=False, padding_side="left")
+```
+
+请务必记住，自定义分词器生成的词表与预训练模型分词器生成的词表是不同的。如果使用预训练模型，则需要使用预训练模型的词表，否则输入将没有意义。 使用 [`DistilBertTokenizer`] 类创建具有预训练模型词表的分词器：
+
+```py
+>>> from transformers import DistilBertTokenizer
+
+>>> slow_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
+```
+
+使用 [`DistilBertTokenizerFast`] 类创建快速分词器：
+
+```py
+>>> from transformers import DistilBertTokenizerFast
+
+>>> fast_tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
+```
+
+<Tip>
+
+默认情况下，[`AutoTokenizer`] 将尝试加载快速标记生成器。你可以通过在 `from_pretrained` 中设置 `use_fast=False` 以禁用此行为。
+
+</Tip>
+
+## 图像处理器
+
+图像处理器用于处理视觉输入。它继承自 [`~image_processing_utils.ImageProcessingMixin`] 基类。
+
+要使用它，需要创建一个与你使用的模型关联的图像处理器。例如，如果你使用 [ViT](model_doc/vit) 进行图像分类，可以创建一个默认的 [`ViTImageProcessor`]：
+
+```py
+>>> from transformers import ViTImageProcessor
+
+>>> vit_extractor = ViTImageProcessor()
+>>> print(vit_extractor)
+ViTImageProcessor {
+  "do_normalize": true,
+  "do_resize": true,
+  "image_processor_type": "ViTImageProcessor",
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "resample": 2,
+  "size": 224
+}
+```
+
+<Tip>
+
+如果您不需要进行任何自定义，只需使用 `from_pretrained` 方法加载模型的默认图像处理器参数。
+
+</Tip>
+
+修改任何 [`ViTImageProcessor`] 参数以创建自定义图像处理器：
+
+```py
+>>> from transformers import ViTImageProcessor
+
+>>> my_vit_extractor = ViTImageProcessor(resample="PIL.Image.BOX", do_normalize=False, image_mean=[0.3, 0.3, 0.3])
+>>> print(my_vit_extractor)
+ViTImageProcessor {
+  "do_normalize": false,
+  "do_resize": true,
+  "image_processor_type": "ViTImageProcessor",
+  "image_mean": [
+    0.3,
+    0.3,
+    0.3
+  ],
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "resample": "PIL.Image.BOX",
+  "size": 224
+}
+```
+
+## 特征提取器
+
+特征提取器用于处理音频输入。它继承自 [`~feature_extraction_utils.FeatureExtractionMixin`] 基类，亦可继承 [`SequenceFeatureExtractor`] 类来处理音频输入。
+
+要使用它，创建一个与你使用的模型关联的特征提取器。例如，如果你使用 [Wav2Vec2](model_doc/wav2vec2) 进行音频分类，可以创建一个默认的 [`Wav2Vec2FeatureExtractor`]：
+
+```py
+>>> from transformers import Wav2Vec2FeatureExtractor
+
+>>> w2v2_extractor = Wav2Vec2FeatureExtractor()
+>>> print(w2v2_extractor)
+Wav2Vec2FeatureExtractor {
+  "do_normalize": true,
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+  "feature_size": 1,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "return_attention_mask": false,
+  "sampling_rate": 16000
+}
+```
+
+<Tip>
+
+如果您不需要进行任何自定义，只需使用 `from_pretrained` 方法加载模型的默认特征提取器参数。
+
+</Tip>
+
+修改任何 [`Wav2Vec2FeatureExtractor`] 参数以创建自定义特征提取器：
+
+```py
+>>> from transformers import Wav2Vec2FeatureExtractor
+
+>>> w2v2_extractor = Wav2Vec2FeatureExtractor(sampling_rate=8000, do_normalize=False)
+>>> print(w2v2_extractor)
+Wav2Vec2FeatureExtractor {
+  "do_normalize": false,
+  "feature_extractor_type": "Wav2Vec2FeatureExtractor",
+  "feature_size": 1,
+  "padding_side": "right",
+  "padding_value": 0.0,
+  "return_attention_mask": false,
+  "sampling_rate": 8000
+}
+```
+
+
+## 处理器
+
+对于支持多模式任务的模型，🤗 Transformers 提供了一个处理器类，可以方便地将特征提取器和分词器等处理类包装到单个对象中。例如，让我们使用 [`Wav2Vec2Processor`] 来执行自动语音识别任务 (ASR)。 ASR 将音频转录为文本，因此您将需要一个特征提取器和一个分词器。
+
+创建一个特征提取器来处理音频输入：
+
+```py
+>>> from transformers import Wav2Vec2FeatureExtractor
+
+>>> feature_extractor = Wav2Vec2FeatureExtractor(padding_value=1.0, do_normalize=True)
+```
+
+创建一个分词器来处理文本输入：
+
+```py
+>>> from transformers import Wav2Vec2CTCTokenizer
+
+>>> tokenizer = Wav2Vec2CTCTokenizer(vocab_file="my_vocab_file.txt")
+```
+
+将特征提取器和分词器合并到 [`Wav2Vec2Processor`] 中：
+
+```py
+>>> from transformers import Wav2Vec2Processor
+
+>>> processor = Wav2Vec2Processor(feature_extractor=feature_extractor, tokenizer=tokenizer)
+```
+
+通过两个基类 - 配置类和模型类 - 以及一个附加的预处理类（分词器、图像处理器、特征提取器或处理器），你可以创建 🤗 Transformers 支持的任何模型。 每个基类都是可配置的，允许你使用所需的特定属性。 你可以轻松设置模型进行训练或修改现有的预训练模型进行微调。
--- a/docs/source/zh/custom_models.md
+++ b/docs/source/zh/custom_models.md
@ -0,0 +1,305 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 共享自定义模型
+
+🤗 Transformers 库设计得易于扩展。每个模型的代码都在仓库给定的子文件夹中，没有进行抽象，因此你可以轻松复制模型代码文件并根据需要进行调整。
+
+如果你要编写全新的模型，从头开始可能更容易。在本教程中，我们将向你展示如何编写自定义模型及其配置，以便可以在 Transformers 中使用它；以及如何与社区共享它（及其依赖的代码），以便任何人都可以使用，即使它不在 🤗 Transformers 库中。
+
+我们将以 ResNet 模型为例，通过将 [timm 库](https://github.com/rwightman/pytorch-image-models) 的 ResNet 类封装到 [`PreTrainedModel`] 中来进行说明。
+
+## 编写自定义配置
+
+在深入研究模型之前，让我们首先编写其配置。模型的配置是一个对象，其中包含构建模型所需的所有信息。我们将在下一节中看到，模型只能接受一个 `config` 来进行初始化，因此我们很需要使该对象尽可能完整。
+
+我们将采用一些我们可能想要调整的 ResNet 类的参数举例。不同的配置将为我们提供不同类型可能的 ResNet 模型。在确认其中一些参数的有效性后，我们只需存储这些参数。
+
+```python
+from transformers import PretrainedConfig
+from typing import List
+
+
+class ResnetConfig(PretrainedConfig):
+    model_type = "resnet"
+
+    def __init__(
+        self,
+        block_type="bottleneck",
+        layers: List[int] = [3, 4, 6, 3],
+        num_classes: int = 1000,
+        input_channels: int = 3,
+        cardinality: int = 1,
+        base_width: int = 64,
+        stem_width: int = 64,
+        stem_type: str = "",
+        avg_down: bool = False,
+        **kwargs,
+    ):
+        if block_type not in ["basic", "bottleneck"]:
+            raise ValueError(f"`block_type` must be 'basic' or bottleneck', got {block_type}.")
+        if stem_type not in ["", "deep", "deep-tiered"]:
+            raise ValueError(f"`stem_type` must be '', 'deep' or 'deep-tiered', got {stem_type}.")
+
+        self.block_type = block_type
+        self.layers = layers
+        self.num_classes = num_classes
+        self.input_channels = input_channels
+        self.cardinality = cardinality
+        self.base_width = base_width
+        self.stem_width = stem_width
+        self.stem_type = stem_type
+        self.avg_down = avg_down
+        super().__init__(**kwargs)
+```
+
+编写自定义配置时需要记住的三个重要事项如下：
+- 必须继承自 `PretrainedConfig`，
+- `PretrainedConfig` 的 `__init__` 方法必须接受任何 kwargs，
+- 这些 `kwargs` 需要传递给超类的 `__init__` 方法。
+
+继承是为了确保你获得来自 🤗 Transformers 库的所有功能，而另外两个约束源于 `PretrainedConfig` 的字段比你设置的字段多。在使用 `from_pretrained` 方法重新加载配置时，这些字段需要被你的配置接受，然后传递给超类。
+
+为你的配置定义 `model_type`（此处为 `model_type="resnet"`）不是必须的，除非你想使用自动类注册你的模型（请参阅最后一节）。
+
+做完这些以后，就可以像使用库里任何其他模型配置一样，轻松地创建和保存配置。以下代码展示了如何创建并保存 resnet50d 配置：
+
+```py
+resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True)
+resnet50d_config.save_pretrained("custom-resnet")
+```
+
+这行代码将在 `custom-resnet` 文件夹内保存一个名为 `config.json` 的文件。然后，你可以使用 `from_pretrained` 方法重新加载配置：
+
+```py
+resnet50d_config = ResnetConfig.from_pretrained("custom-resnet")
+```
+
+你还可以使用 [`PretrainedConfig`] 类的任何其他方法，例如 [`~PretrainedConfig.push_to_hub`]，直接将配置上传到 Hub。
+
+## 编写自定义模型
+
+有了 ResNet 配置后，就可以继续编写模型了。实际上，我们将编写两个模型：一个模型用于从一批图像中提取隐藏特征（类似于 [`BertModel`]），另一个模型适用于图像分类（类似于 [`BertForSequenceClassification`]）。
+
+正如之前提到的，我们只会编写一个松散的模型包装，以使示例保持简洁。在编写此类之前，只需要建立起块类型（block types）与实际块类（block classes）之间的映射。然后，通过将所有内容传递给ResNet类，从配置中定义模型：
+
+```py
+from transformers import PreTrainedModel
+from timm.models.resnet import BasicBlock, Bottleneck, ResNet
+from .configuration_resnet import ResnetConfig
+
+
+BLOCK_MAPPING = {"basic": BasicBlock, "bottleneck": Bottleneck}
+
+
+class ResnetModel(PreTrainedModel):
+    config_class = ResnetConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        block_layer = BLOCK_MAPPING[config.block_type]
+        self.model = ResNet(
+            block_layer,
+            config.layers,
+            num_classes=config.num_classes,
+            in_chans=config.input_channels,
+            cardinality=config.cardinality,
+            base_width=config.base_width,
+            stem_width=config.stem_width,
+            stem_type=config.stem_type,
+            avg_down=config.avg_down,
+        )
+
+    def forward(self, tensor):
+        return self.model.forward_features(tensor)
+```
+
+对用于进行图像分类的模型，我们只需更改前向方法：
+
+```py
+import torch
+
+
+class ResnetModelForImageClassification(PreTrainedModel):
+    config_class = ResnetConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        block_layer = BLOCK_MAPPING[config.block_type]
+        self.model = ResNet(
+            block_layer,
+            config.layers,
+            num_classes=config.num_classes,
+            in_chans=config.input_channels,
+            cardinality=config.cardinality,
+            base_width=config.base_width,
+            stem_width=config.stem_width,
+            stem_type=config.stem_type,
+            avg_down=config.avg_down,
+        )
+
+    def forward(self, tensor, labels=None):
+        logits = self.model(tensor)
+        if labels is not None:
+            loss = torch.nn.cross_entropy(logits, labels)
+            return {"loss": loss, "logits": logits}
+        return {"logits": logits}
+```
+
+在这两种情况下，请注意我们如何继承 `PreTrainedModel` 并使用 `config` 调用了超类的初始化（有点像编写常规的torch.nn.Module）。设置 `config_class` 的那行代码不是必须的，除非你想使用自动类注册你的模型（请参阅最后一节）。
+
+<Tip>
+
+如果你的模型与库中的某个模型非常相似，你可以重用与该模型相同的配置。
+
+</Tip>
+
+你可以让模型返回任何你想要的内容，但是像我们为 `ResnetModelForImageClassification` 做的那样返回一个字典，并在传递标签时包含loss，可以使你的模型能够在 [`Trainer`] 类中直接使用。只要你计划使用自己的训练循环或其他库进行训练，也可以使用其他输出格式。
+
+现在我们已经有了模型类，让我们创建一个：
+
+```py
+resnet50d = ResnetModelForImageClassification(resnet50d_config)
+```
+
+同样的，你可以使用 [`PreTrainedModel`] 的任何方法，比如 [`~PreTrainedModel.save_pretrained`] 或者 [`~PreTrainedModel.push_to_hub`]。我们将在下一节中使用第二种方法，并了解如何如何使用我们的模型的代码推送模型权重。但首先，让我们在模型内加载一些预训练权重。
+
+在你自己的用例中，你可能会在自己的数据上训练自定义模型。为了快速完成本教程，我们将使用 resnet50d 的预训练版本。由于我们的模型只是它的包装，转移这些权重将会很容易：
+
+```py
+import timm
+
+pretrained_model = timm.create_model("resnet50d", pretrained=True)
+resnet50d.model.load_state_dict(pretrained_model.state_dict())
+```
+
+现在让我们看看，如何确保在执行 [`~PreTrainedModel.save_pretrained`] 或 [`~PreTrainedModel.push_to_hub`] 时，模型的代码被保存。
+
+## 将代码发送到 Hub
+
+<Tip warning={true}>
+
+此 API 是实验性的，未来的发布中可能会有一些轻微的不兼容更改。
+
+</Tip>
+
+首先，确保你的模型在一个 `.py` 文件中完全定义。只要所有文件都位于同一目录中，它就可以依赖于某些其他文件的相对导入（目前我们还不为子模块支持此功能）。对于我们的示例，我们将在当前工作目录中名为 `resnet_model` 的文件夹中定义一个 `modeling_resnet.py` 文件和一个 `configuration_resnet.py` 文件。 配置文件包含 `ResnetConfig` 的代码，模型文件包含 `ResnetModel` 和 `ResnetModelForImageClassification` 的代码。
+
+```
+.
+└── resnet_model
+    ├── __init__.py
+    ├── configuration_resnet.py
+    └── modeling_resnet.py
+```
+
+`__init__.py` 可以为空，它的存在只是为了让 Python 检测到 `resnet_model` 可以用作模块。
+
+<Tip warning={true}>
+
+如果从库中复制模型文件，你需要将文件顶部的所有相对导入替换为从 `transformers` 包中的导入。
+
+</Tip>
+
+请注意，你可以重用（或子类化）现有的配置/模型。
+
+要与社区共享您的模型，请参照以下步骤：首先从新创建的文件中导入ResNet模型和配置：
+
+```py
+from resnet_model.configuration_resnet import ResnetConfig
+from resnet_model.modeling_resnet import ResnetModel, ResnetModelForImageClassification
+```
+
+接下来，你需要告诉库，当使用 `save_pretrained` 方法时，你希望复制这些对象的代码文件，并将它们正确注册到给定的 Auto 类（特别是对于模型），只需要运行以下代码：
+
+```py
+ResnetConfig.register_for_auto_class()
+ResnetModel.register_for_auto_class("AutoModel")
+ResnetModelForImageClassification.register_for_auto_class("AutoModelForImageClassification")
+```
+
+请注意，对于配置（只有一个自动类 [`AutoConfig`]），不需要指定自动类，但对于模型来说情况不同。 你的自定义模型可能适用于许多不同的任务，因此你必须指定哪一个自动类适合你的模型。
+
+接下来，让我们像之前一样创建配置和模型：
+
+```py
+resnet50d_config = ResnetConfig(block_type="bottleneck", stem_width=32, stem_type="deep", avg_down=True)
+resnet50d = ResnetModelForImageClassification(resnet50d_config)
+
+pretrained_model = timm.create_model("resnet50d", pretrained=True)
+resnet50d.model.load_state_dict(pretrained_model.state_dict())
+```
+
+现在要将模型推送到集线器，请确保你已登录。你看可以在终端中运行以下命令：
+
+```bash
+huggingface-cli login
+```
+
+或者在笔记本中运行以下代码：
+
+```py
+from huggingface_hub import notebook_login
+
+notebook_login()
+```
+
+然后，可以这样将模型推送到自己的命名空间（或你所属的组织）：
+
+```py
+resnet50d.push_to_hub("custom-resnet50d")
+```
+
+除了模型权重和 JSON 格式的配置外，这行代码也会复制 `custom-resnet50d` 文件夹内的模型以及配置的 `.py` 文件并将结果上传至 Hub。你可以在此[模型仓库](https://huggingface.co/sgugger/custom-resnet50d)中查看结果。
+
+有关推推送至 Hub 方法的更多信息，请参阅[共享教程](model_sharing)。
+
+## 使用带有自定义代码的模型
+
+可以使用自动类（auto-classes）和 `from_pretrained` 方法，使用模型仓库里带有自定义代码的配置、模型或分词器文件。所有上传到 Hub 的文件和代码都会进行恶意软件扫描（有关更多信息，请参阅 [Hub 安全](https://huggingface.co/docs/hub/security#malware-scanning) 文档）, 但你仍应查看模型代码和作者，以避免在你的计算机上执行恶意代码。 设置 `trust_remote_code=True` 以使用带有自定义代码的模型：
+
+```py
+from transformers import AutoModelForImageClassification
+
+model = AutoModelForImageClassification.from_pretrained("sgugger/custom-resnet50d", trust_remote_code=True)
+```
+
+我们强烈建议为 `revision` 参数传递提交哈希（commit hash），以确保模型的作者没有使用一些恶意的代码行更新了代码（除非您完全信任模型的作者）。
+
+```py
+commit_hash = "ed94a7c6247d8aedce4647f00f20de6875b5b292"
+model = AutoModelForImageClassification.from_pretrained(
+    "sgugger/custom-resnet50d", trust_remote_code=True, revision=commit_hash
+)
+```
+
+在 Hub 上浏览模型仓库的提交历史时，有一个按钮可以轻松复制任何提交的提交哈希。
+
+## 将自定义代码的模型注册到自动类
+
+如果你在编写一个扩展 🤗 Transformers 的库，你可能想要扩展自动类以包含您自己的模型。这与将代码推送到 Hub 不同，因为用户需要导入你的库才能获取自定义模型（与从 Hub 自动下载模型代码相反）。
+
+只要你的配置 `model_type` 属性与现有模型类型不同，并且你的模型类有正确的 `config_class` 属性，你可以像这样将它们添加到自动类中：
+
+```py
+from transformers import AutoConfig, AutoModel, AutoModelForImageClassification
+
+AutoConfig.register("resnet", ResnetConfig)
+AutoModel.register(ResnetConfig, ResnetModel)
+AutoModelForImageClassification.register(ResnetConfig, ResnetModelForImageClassification)
+```
+
+请注意，将自定义配置注册到 [`AutoConfig`] 时，使用的第一个参数需要与自定义配置的 `model_type` 匹配；而将自定义模型注册到任何自动模型类时，使用的第一个参数需要与 `config_class` 匹配。
--- a/docs/source/zh/multilingual.md
+++ b/docs/source/zh/multilingual.md
@ -0,0 +1,178 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 用于推理的多语言模型
+
+[[open-in-colab]]
+
+🤗 Transformers 中有多种多语言模型，它们的推理用法与单语言模型不同。但是，并非*所有*的多语言模型用法都不同。一些模型，例如 [bert-base-multilingual-uncased](https://huggingface.co/bert-base-multilingual-uncased) 就可以像单语言模型一样使用。本指南将向您展示如何使用不同用途的多语言模型进行推理。
+
+## XLM
+
+XLM 有十个不同的检查点，其中只有一个是单语言的。剩下的九个检查点可以归为两类：使用语言嵌入的检查点和不使用语言嵌入的检查点。
+
+### 带有语言嵌入的 XLM
+
+以下 XLM 模型使用语言嵌入来指定推理中使用的语言：
+
+- `xlm-mlm-ende-1024` （掩码语言建模，英语-德语）
+- `xlm-mlm-enfr-1024` （掩码语言建模，英语-法语）
+- `xlm-mlm-enro-1024` （掩码语言建模，英语-罗马尼亚语）
+- `xlm-mlm-xnli15-1024` （掩码语言建模，XNLI 数据集语言）
+- `xlm-mlm-tlm-xnli15-1024` （掩码语言建模+翻译，XNLI 数据集语言）
+- `xlm-clm-enfr-1024` （因果语言建模，英语-法语）
+- `xlm-clm-ende-1024` （因果语言建模，英语-德语）
+
+语言嵌入被表示一个张量，其形状与传递给模型的 `input_ids` 相同。这些张量中的值取决于所使用的语言，并由分词器的 `lang2id` 和 `id2lang`  属性识别。
+
+在此示例中，加载 `xlm-clm-enfr-1024` 检查点（因果语言建模，英语-法语）：
+
+```py
+>>> import torch
+>>> from transformers import XLMTokenizer, XLMWithLMHeadModel
+
+>>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
+>>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")
+```
+
+分词器的 `lang2id` 属性显示了该模型的语言及其对应的id：
+
+```py
+>>> print(tokenizer.lang2id)
+{'en': 0, 'fr': 1}
+```
+
+接下来，创建一个示例输入：
+
+```py
+>>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")])  # batch size 为 1
+```
+
+将语言 id 设置为 `"en"` 并用其定义语言嵌入。语言嵌入是一个用 `0` 填充的张量，这个张量应该与 `input_ids` 大小相同。
+
+```py
+>>> language_id = tokenizer.lang2id["en"]  # 0
+>>> langs = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])
+
+>>> # 我们将其 reshape 为 (batch_size, sequence_length) 大小
+>>> langs = langs.view(1, -1)  # 现在的形状是 [1, sequence_length] (我们的 batch size 为 1)
+```
+
+现在，你可以将 `input_ids` 和语言嵌入传递给模型：
+
+```py
+>>> outputs = model(input_ids, langs=langs)
+```
+
+[run_generation.py](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-generation/run_generation.py) 脚本可以使用 `xlm-clm` 检查点生成带有语言嵌入的文本。
+
+### 不带语言嵌入的 XLM
+
+以下 XLM 模型在推理时不需要语言嵌入：
+
+- `xlm-mlm-17-1280` （掩码语言建模，支持 17 种语言）
+- `xlm-mlm-100-1280` （掩码语言建模，支持 100 种语言）
+
+与之前的 XLM 检查点不同，这些模型用于通用句子表示。
+
+## BERT
+
+以下 BERT 模型可用于多语言任务：
+
+- `bert-base-multilingual-uncased` （掩码语言建模 + 下一句预测，支持 102 种语言）
+- `bert-base-multilingual-cased` （掩码语言建模 + 下一句预测，支持 104 种语言）
+
+这些模型在推理时不需要语言嵌入。它们应该能够从上下文中识别语言并进行相应的推理。
+
+## XLM-RoBERTa
+
+以下 XLM-RoBERTa 模型可用于多语言任务：
+
+- `xlm-roberta-base` （掩码语言建模，支持 100 种语言）
+- `xlm-roberta-large` （掩码语言建模，支持 100 种语言）
+
+XLM-RoBERTa 使用 100 种语言的 2.5TB 新创建和清理的 CommonCrawl 数据进行了训练。与之前发布的 mBERT 或 XLM 等多语言模型相比，它在分类、序列标记和问答等下游任务上提供了更强大的优势。
+
+## M2M100
+
+以下 M2M100 模型可用于多语言翻译：
+
+- `facebook/m2m100_418M` （翻译）
+- `facebook/m2m100_1.2B` （翻译）
+
+在此示例中，加载 `facebook/m2m100_418M` 检查点以将中文翻译为英文。你可以在分词器中设置源语言：
+
+```py
+>>> from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
+
+>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger."
+>>> chinese_text = "不要插手巫師的事務, 因為他們是微妙的, 很快就會發怒."
+
+>>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M", src_lang="zh")
+>>> model = M2M100ForConditionalGeneration.from_pretrained("facebook/m2m100_418M")
+```
+
+对文本进行分词：
+
+```py
+>>> encoded_zh = tokenizer(chinese_text, return_tensors="pt")
+```
+
+M2M100 强制将目标语言 id 作为第一个生成的标记，以进行到目标语言的翻译。在 `generate` 方法中将 `forced_bos_token_id` 设置为 `en` 以翻译成英语：
+
+```py
+>>> generated_tokens = model.generate(**encoded_zh, forced_bos_token_id=tokenizer.get_lang_id("en"))
+>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
+'Do not interfere with the matters of the witches, because they are delicate and will soon be angry.'
+```
+
+## MBart
+
+以下 MBart 模型可用于多语言翻译：
+
+- `facebook/mbart-large-50-one-to-many-mmt` （一对多多语言机器翻译，支持 50 种语言）
+- `facebook/mbart-large-50-many-to-many-mmt` （多对多多语言机器翻译，支持 50 种语言）
+- `facebook/mbart-large-50-many-to-one-mmt` （多对一多语言机器翻译，支持 50 种语言）
+- `facebook/mbart-large-50` （多语言翻译，支持 50 种语言）
+- `facebook/mbart-large-cc25`
+
+在此示例中，加载  `facebook/mbart-large-50-many-to-many-mmt` 检查点以将芬兰语翻译为英语。 你可以在分词器中设置源语言：
+
+```py
+>>> from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
+
+>>> en_text = "Do not meddle in the affairs of wizards, for they are subtle and quick to anger."
+>>> fi_text = "Älä sekaannu velhojen asioihin, sillä ne ovat hienovaraisia ja nopeasti vihaisia."
+
+>>> tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", src_lang="fi_FI")
+>>> model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
+```
+
+对文本进行分词：
+
+```py
+>>> encoded_en = tokenizer(en_text, return_tensors="pt")
+```
+
+MBart 强制将目标语言 id 作为第一个生成的标记，以进行到目标语言的翻译。在 `generate` 方法中将 `forced_bos_token_id` 设置为 `en` 以翻译成英语：
+
+```py
+>>> generated_tokens = model.generate(**encoded_en, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
+>>> tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
+"Don't interfere with the wizard's affairs, because they are subtle, will soon get angry."
+```
+
+如果你使用的是 `facebook/mbart-large-50-many-to-one-mmt` 检查点，则无需强制目标语言 id 作为第一个生成的令牌，否则用法是相同的。
--- a/docs/source/zh/pipeline_tutorial.md
+++ b/docs/source/zh/pipeline_tutorial.md
@ -0,0 +1,308 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 推理pipeline
+
+[`pipeline`] 让使用[Hub](https://huggingface.co/models)上的任何模型进行任何语言、计算机视觉、语音以及多模态任务的推理变得非常简单。即使您对特定的模态没有经验，或者不熟悉模型的源码，您仍然可以使用[`pipeline`]进行推理！本教程将教您：
+
+- 如何使用[`pipeline`] 进行推理。
+- 如何使用特定的`tokenizer`(分词器)或模型。
+- 如何使用[`pipeline`] 进行音频、视觉和多模态任务的推理。
+
+<Tip>
+
+请查看[`pipeline`]文档以获取已支持的任务和可用参数的完整列表。
+
+</Tip>
+
+## Pipeline使用
+
+虽然每个任务都有一个关联的[`pipeline`]，但使用通用的抽象的[`pipeline`]更加简单，其中包含所有特定任务的`pipelines`。[`pipeline`]会自动加载一个默认模型和一个能够进行任务推理的预处理类。让我们以使用[`pipeline`]进行自动语音识别（ASR）或语音转文本为例。
+
+1. 首先，创建一个[`pipeline`]并指定推理任务：
+
+```py
+>>> from transformers import pipeline
+
+>>> transcriber = pipeline(task="automatic-speech-recognition")
+```
+
+2. 将您的输入传递给[`pipeline`]。对于语音识别，这通常是一个音频输入文件：
+
+
+```py
+>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
+{'text': 'I HAVE A DREAM BUT ONE DAY THIS NATION WILL RISE UP LIVE UP THE TRUE MEANING OF ITS TREES'}
+```
+
+您没有得到您期望的结果？可以在Hub上查看一些[最受欢迎的自动语音识别模型](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending) 
+，看看是否可以获得更好的转录。
+
+让我们尝试来自 OpenAI 的[Whisper large-v2](https://huggingface.co/openai/whisper-large) 模型。Whisperb比Wav2Vec2晚2年发布，使用接近10倍的数据进行了训练。因此，它在大多数下游基准测试上击败了Wav2Vec2。
+它还具有预测标点和大小写的附加优势，而Wav2Vec2则无法实现这些功能。
+
+让我们在这里尝试一下，看看它的表现如何：
+
+
+```py
+>>> transcriber = pipeline(model="openai/whisper-large-v2")
+>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
+{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}
+```
+
+现在这个结果看起来更准确了！要进行深入的Wav2Vec2与Whisper比较，请参阅[音频变换器课程](https://huggingface.co/learn/audio-course/chapter5/asr_models)。
+我们鼓励您在 Hub 上查看不同语言的模型，以及专业领域的模型等。您可以在Hub上直接查看并比较模型的结果，以确定是否适合或处理边缘情况是否比其他模型更好。如果您没有找到适用于您的用例的模型，您始终可以[训练](training)自己的模型！
+
+如果您有多个输入，您可以将输入作为列表传递：
+
+
+```py
+transcriber(
+    [
+        "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac",
+        "https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/1.flac",
+    ]
+)
+```
+
+`Pipelines`非常适合用于测试，因为从一个模型切换到另一个模型非常琐碎；但是，还有一些方法可以将它们优化后用于大型工作负载而不仅仅是测试。请查看以下指南，深入探讨如何迭代整个数据集或在Web服务器中使用`Pipelines`：
+* [在数据集上使用流水线](#using-pipelines-on-a-dataset)
+* [在Web服务器中使用流水线](./pipeline_webserver)
+
+
+## 参数
+
+[`pipeline`] 支持许多参数；有些是适用于特定任务的，而有些适用于所有`pipeline`。通常情况下，您可以在任何地方指定对应参数：
+
+
+```py
+transcriber = pipeline(model="openai/whisper-large-v2", my_parameter=1)
+
+out = transcriber(...)  # This will use `my_parameter=1`.
+out = transcriber(..., my_parameter=2)  # This will override and use `my_parameter=2`.
+out = transcriber(...)  # This will go back to using `my_parameter=1`.
+```
+
+让我们查看其中的三个重要参数：
+
+
+### 设备
+
+如果您使用 `device=n`，`pipeline`会自动将模型放在指定的设备上。无论您使用PyTorch还是Tensorflow，这都可以工作。
+
+
+```py
+transcriber = pipeline(model="openai/whisper-large-v2", device=0)
+```
+
+如果模型对于单个GPU来说过于庞大，并且您正在使用PyTorch，您可以设置 `device_map="auto"` 以自动确定如何加载和存储模型权重。使用 `device_map` 参数需要安装🤗 [Accelerate](https://huggingface.co/docs/accelerate) 软件包：
+
+
+```bash
+pip install --upgrade accelerate
+```
+
+以下代码会自动在各个设备上加载和存储模型权重：
+
+
+```py
+transcriber = pipeline(model="openai/whisper-large-v2", device_map="auto")
+```
+
+请注意，如果传递了 `device_map="auto"`，在实例化您的 `pipeline` 时不需要添加 `device=device` 参数，否则可能会遇到一些意外的状况！
+
+### 批量大小
+
+默认情况下，`pipelines`不会进行批量推理，原因在[这里](https://huggingface.co/docs/transformers/main_classes/pipelines#pipeline-batching)详细解释。因为批处理不一定更快，实际上在某些情况下可能会更慢。
+
+但如果在您的用例中起作用，您可以使用：
+
+
+```py
+transcriber = pipeline(model="openai/whisper-large-v2", device=0, batch_size=2)
+audio_filenames = [f"https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/{i}.flac" for i in range(1, 5)]
+texts = transcriber(audio_filenames)
+```
+
+以上代码会在提供的4个音频文件上运行`pipeline`，它会将它们以2个一组的批次传递给模型（模型在GPU上，此时批处理更有可能有所帮助），而您无需编写额外的代码。输出应始终与没有批处理时收到的结果相一致。它只是一种帮助您更快地使用`pipeline`的方式。
+
+`pipeline`也可以减轻一些批处理的复杂性，因为对于某些`pipeline`，需要将单个项目（如长音频文件）分成多个部分以供模型处理。`pipeline`为您执行这种[*chunk batching*](./main_classes/pipelines#pipeline-chunk-batching)。
+
+### 任务特定参数
+
+所有任务都提供了特定于任务的参数，这些参数提供额外的灵活性和选择，以帮助您完成工作。
+例如，[`transformers.AutomaticSpeechRecognitionPipeline.__call__`] 方法具有一个 `return_timestamps` 参数，对于字幕视频似乎很有帮助：
+
+```py
+>>> transcriber = pipeline(model="openai/whisper-large-v2", return_timestamps=True)
+>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
+{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.', 'chunks': [{'timestamp': (0.0, 11.88), 'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its'}, {'timestamp': (11.88, 12.38), 'text': ' creed.'}]}
+```
+
+正如您所看到的，模型推断出了文本，还输出了各个句子发音的**时间**。
+
+每个任务都有许多可用的参数，因此请查看每个任务的API参考，以了解您可以进行哪些调整！例如，[`~transformers.AutomaticSpeechRecognitionPipeline`] 具有 `chunk_length_s` 参数，对于处理非常长的音频文件（例如，为整部电影或长达一小时的视频配字幕）非常有帮助，这通常是模型无法单独处理的：
+
+```python
+>>> transcriber = pipeline(model="openai/whisper-large-v2", chunk_length_s=30, return_timestamps=True)
+>>> transcriber("https://huggingface.co/datasets/sanchit-gandhi/librispeech_long/resolve/main/audio.wav")
+{'text': " Chapter 16. I might have told you of the beginning of this liaison in a few lines, but I wanted you to see every step by which we came.  I, too, agree to whatever Marguerite wished, Marguerite to be unable to live apart from me. It was the day after the evening...
+```
+
+如果您找不到一个真正有帮助的参数，欢迎[提出请求](https://github.com/huggingface/transformers/issues/new?assignees=&labels=feature&template=feature-request.yml)！
+
+## 在数据集上使用pipelines
+
+`pipelines`也可以对大型数据集进行推理。我们建议使用迭代器来完成这一任务，这是最简单的方法：
+
+
+```py
+def data():
+    for i in range(1000):
+        yield f"My example {i}"
+
+
+pipe = pipeline(model="gpt2", device=0)
+generated_characters = 0
+for out in pipe(data()):
+    generated_characters += len(out[0]["generated_text"])
+```
+
+迭代器 `data()` 会产生每个结果，`pipelines`会自动识别输入为可迭代对象，并在GPU上处理数据的同时开始获取数据（在底层使用[DataLoader](https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader)）。这一点非常重要，因为您不必为整个数据集分配内存，可以尽可能快地将数据传送到GPU。
+
+由于批处理可以加速处理，因此在这里尝试调整 `batch_size` 参数可能会很有用。
+
+迭代数据集的最简单方法就是从🤗 [Datasets](https://github.com/huggingface/datasets/) 中加载数据集：
+
+
+```py
+# KeyDataset is a util that will just output the item we're interested in.
+from transformers.pipelines.pt_utils import KeyDataset
+from datasets import load_dataset
+
+pipe = pipeline(model="hf-internal-testing/tiny-random-wav2vec2", device=0)
+dataset = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation[:10]")
+
+for out in pipe(KeyDataset(dataset, "audio")):
+    print(out)
+```
+
+
+## 在Web服务器上使用pipelines
+
+<Tip>
+创建推理引擎是一个复杂的主题，值得有自己的页面。
+</Tip>
+
+[链接](./pipeline_webserver)
+
+## 视觉流水线
+
+对于视觉任务，使用[`pipeline`] 几乎是相同的。
+
+指定您的任务并将图像传递给分类器。图像可以是链接、本地路径或base64编码的图像。例如，下面显示的是哪种品种的猫？
+
+![pipeline-cat-chonk](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg)
+```py
+>>> from transformers import pipeline
+
+>>> vision_classifier = pipeline(model="google/vit-base-patch16-224")
+>>> preds = vision_classifier(
+...     images="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+... )
+>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
+>>> preds
+[{'score': 0.4335, 'label': 'lynx, catamount'}, {'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}, {'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}, {'score': 0.0239, 'label': 'Egyptian cat'}, {'score': 0.0229, 'label': 'tiger cat'}]
+```
+
+## 文本流水线
+
+对于NLP任务，使用[`pipeline`] 几乎是相同的。
+
+
+```py
+>>> from transformers import pipeline
+
+>>> # This model is a `zero-shot-classification` model.
+>>> # It will classify text, except you are free to choose any label you might imagine
+>>> classifier = pipeline(model="facebook/bart-large-mnli")
+>>> classifier(
+...     "I have a problem with my iphone that needs to be resolved asap!!",
+...     candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"],
+... )
+{'sequence': 'I have a problem with my iphone that needs to be resolved asap!!', 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'], 'scores': [0.504, 0.479, 0.013, 0.003, 0.002]}
+```
+
+## 多模态流水线
+
+[`pipeline`] 支持多个模态。例如，视觉问题回答（VQA）任务结合了文本和图像。请随意使用您喜欢的任何图像链接和您想要问关于该图像的问题。图像可以是URL或图像的本地路径。
+
+例如，如果您使用这个[invoice image](https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png)：
+
+
+```py
+>>> from transformers import pipeline
+
+>>> vqa = pipeline(model="impira/layoutlm-document-qa")
+>>> vqa(
+...     image="https://huggingface.co/spaces/impira/docquery/resolve/2359223c1837a7587402bda0f2643382a6eefeab/invoice.png",
+...     question="What is the invoice number?",
+... )
+[{'score': 0.42515, 'answer': 'us-001', 'start': 16, 'end': 16}]
+```
+
+<Tip>
+
+要运行上面的示例，除了🤗 Transformers之外，您需要安装[`pytesseract`](https://pypi.org/project/pytesseract/)。
+
+
+```bash
+sudo apt install -y tesseract-ocr
+pip install pytesseract
+```
+
+</Tip>
+
+## 在大模型上使用🤗 `accelerate`和`pipeline`：
+
+您可以轻松地使用🤗 `accelerate`在大模型上运行 `pipeline`！首先确保您已经使用 `pip install accelerate` 安装了 `accelerate`。
+
+首先使用 `device_map="auto"` 加载您的模型！我们将在示例中使用 `facebook/opt-1.3b`。
+
+
+```py
+# pip install accelerate
+import torch
+from transformers import pipeline
+
+pipe = pipeline(model="facebook/opt-1.3b", torch_dtype=torch.bfloat16, device_map="auto")
+output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
+```
+
+如果安装 `bitsandbytes` 并添加参数 `load_in_8bit=True`，您还可以传递8位加载的模型。
+
+
+```py
+# pip install accelerate bitsandbytes
+import torch
+from transformers import pipeline
+
+pipe = pipeline(model="facebook/opt-1.3b", device_map="auto", model_kwargs={"load_in_8bit": True})
+output = pipe("This is a cool example!", do_sample=True, top_p=0.95)
+```
+
+请注意，您可以将`checkpoint `替换为任何支持大模型加载的Hugging Face模型，比如BLOOM！
+
--- a/docs/source/zh/preprocessing.md
+++ b/docs/source/zh/preprocessing.md
@ -0,0 +1,541 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 预处理
+
+[[open-in-colab]]
+
+在您可以在数据集上训练模型之前，数据需要被预处理为期望的模型输入格式。无论您的数据是文本、图像还是音频，它们都需要被转换并组合成批量的张量。🤗 Transformers 提供了一组预处理类来帮助准备数据以供模型使用。在本教程中，您将了解以下内容：
+
+* 对于文本，使用[分词器](./main_classes/tokenizer)(`Tokenizer`)将文本转换为一系列标记(`tokens`)，并创建`tokens`的数字表示，将它们组合成张量。
+* 对于语音和音频，使用[特征提取器](./main_classes/feature_extractor)(`Feature extractor`)从音频波形中提取顺序特征并将其转换为张量。
+* 图像输入使用[图像处理器](./main_classes/image)(`ImageProcessor`)将图像转换为张量。
+* 多模态输入，使用[处理器](./main_classes/processors)(`Processor`)结合了`Tokenizer`和`ImageProcessor`或`Processor`。
+
+<Tip>
+
+`AutoProcessor` **始终**有效的自动选择适用于您使用的模型的正确`class`，无论您使用的是`Tokenizer`、`ImageProcessor`、`Feature extractor`还是`Processor`。
+
+</Tip>
+
+在开始之前，请安装🤗 Datasets，以便您可以加载一些数据集来进行实验：
+
+
+```bash
+pip install datasets
+```
+
+## 自然语言处理
+
+<Youtube id="Yffk5aydLzg"/>
+
+处理文本数据的主要工具是[Tokenizer](main_classes/tokenizer)。`Tokenizer`根据一组规则将文本拆分为`tokens`。然后将这些`tokens`转换为数字，然后转换为张量，成为模型的输入。模型所需的任何附加输入都由`Tokenizer`添加。
+
+<Tip>
+
+如果您计划使用预训练模型，重要的是使用与之关联的预训练`Tokenizer`。这确保文本的拆分方式与预训练语料库相同，并在预训练期间使用相同的标记-索引的对应关系（通常称为*词汇表*-`vocab`）。
+
+</Tip>
+
+开始使用[`AutoTokenizer.from_pretrained`]方法加载一个预训练`tokenizer`。这将下载模型预训练的`vocab`：
+
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+```
+
+然后将您的文本传递给`tokenizer`：
+
+
+```py
+>>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.")
+>>> print(encoded_input)
+{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102],
+ 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+ 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
+```
+
+`tokenizer`返回一个包含三个重要对象的字典：
+
+* [input_ids](glossary#input-ids) 是与句子中每个`token`对应的索引。
+* [attention_mask](glossary#attention-mask) 指示是否应该关注一个`toekn`。
+* [token_type_ids](glossary#token-type-ids) 在存在多个序列时标识一个`token`属于哪个序列。
+
+通过解码 `input_ids` 来返回您的输入：
+
+
+```py
+>>> tokenizer.decode(encoded_input["input_ids"])
+'[CLS] Do not meddle in the affairs of wizards, for they are subtle and quick to anger. [SEP]'
+```
+
+如您所见，`tokenizer`向句子中添加了两个特殊`token` - `CLS` 和 `SEP`（分类器和分隔符）。并非所有模型都需要特殊`token`，但如果需要，`tokenizer`会自动为您添加。
+
+如果有多个句子需要预处理，将它们作为列表传递给`tokenizer`：
+
+
+```py
+>>> batch_sentences = [
+...     "But what about second breakfast?",
+...     "Don't think he knows about second breakfast, Pip.",
+...     "What about elevensies?",
+... ]
+>>> encoded_inputs = tokenizer(batch_sentences)
+>>> print(encoded_inputs)
+{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102],
+               [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102],
+               [101, 1327, 1164, 5450, 23434, 136, 102]],
+ 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0]],
+ 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1],
+                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                    [1, 1, 1, 1, 1, 1, 1]]}
+```
+
+### 填充
+
+句子的长度并不总是相同，这可能会成为一个问题，因为模型输入的张量需要具有统一的形状。填充是一种策略，通过在较短的句子中添加一个特殊的`padding token`，以确保张量是矩形的。
+
+将 `padding` 参数设置为 `True`，以使批次中较短的序列填充到与最长序列相匹配的长度：
+
+```py
+>>> batch_sentences = [
+...     "But what about second breakfast?",
+...     "Don't think he knows about second breakfast, Pip.",
+...     "What about elevensies?",
+... ]
+>>> encoded_input = tokenizer(batch_sentences, padding=True)
+>>> print(encoded_input)
+{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0],
+               [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102],
+               [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]],
+ 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
+ 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
+                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                    [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]}
+```
+
+第一句和第三句因为较短，通过`0`进行填充，。
+
+### 截断
+
+另一方面，有时候一个序列可能对模型来说太长了。在这种情况下，您需要将序列截断为更短的长度。
+
+将 `truncation` 参数设置为 `True`，以将序列截断为模型接受的最大长度：
+
+
+```py
+>>> batch_sentences = [
+...     "But what about second breakfast?",
+...     "Don't think he knows about second breakfast, Pip.",
+...     "What about elevensies?",
+... ]
+>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True)
+>>> print(encoded_input)
+{'input_ids': [[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0],
+               [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102],
+               [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]],
+ 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]],
+ 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
+                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                    [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]]}
+```
+
+<Tip>
+
+查看[填充和截断](./pad_truncation)概念指南，了解更多有关填充和截断参数的信息。
+
+</Tip>
+
+### 构建张量
+
+最后，`tokenizer`可以返回实际输入到模型的张量。
+
+将 `return_tensors` 参数设置为 `pt`（对于PyTorch）或 `tf`（对于TensorFlow）：
+
+<frameworkcontent>
+<pt>
+
+
+```py
+>>> batch_sentences = [
+...     "But what about second breakfast?",
+...     "Don't think he knows about second breakfast, Pip.",
+...     "What about elevensies?",
+... ]
+>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
+>>> print(encoded_input)
+{'input_ids': tensor([[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0],
+                      [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102],
+                      [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]]),
+ 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+                           [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]),
+ 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
+                           [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+                           [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]])}
+```
+</pt>
+<tf>
+
+```py
+>>> batch_sentences = [
+...     "But what about second breakfast?",
+...     "Don't think he knows about second breakfast, Pip.",
+...     "What about elevensies?",
+... ]
+>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf")
+>>> print(encoded_input)
+{'input_ids': <tf.Tensor: shape=(2, 9), dtype=int32, numpy=
+array([[101, 1252, 1184, 1164, 1248, 6462, 136, 102, 0, 0, 0, 0, 0, 0, 0],
+       [101, 1790, 112, 189, 1341, 1119, 3520, 1164, 1248, 6462, 117, 21902, 1643, 119, 102],
+       [101, 1327, 1164, 5450, 23434, 136, 102, 0, 0, 0, 0, 0, 0, 0, 0]],
+      dtype=int32)>,
+ 'token_type_ids': <tf.Tensor: shape=(2, 9), dtype=int32, numpy=
+array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>,
+ 'attention_mask': <tf.Tensor: shape=(2, 9), dtype=int32, numpy=
+array([[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
+       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+       [1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0]], dtype=int32)>}
+```
+</tf>
+</frameworkcontent>
+
+## 音频
+
+对于音频任务，您需要[feature extractor](main_classes/feature_extractor)来准备您的数据集以供模型使用。`feature extractor`旨在从原始音频数据中提取特征，并将它们转换为张量。
+
+加载[MInDS-14](https://huggingface.co/datasets/PolyAI/minds14)数据集（有关如何加载数据集的更多详细信息，请参阅🤗 [Datasets教程](https://huggingface.co/docs/datasets/load_hub.html)）以了解如何在音频数据集中使用`feature extractor`：
+
+
+```py
+>>> from datasets import load_dataset, Audio
+
+>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
+```
+
+访问 `audio` 列的第一个元素以查看输入。调用 `audio` 列会自动加载和重新采样音频文件：
+
+```py
+>>> dataset[0]["audio"]
+{'array': array([ 0.        ,  0.00024414, -0.00024414, ..., -0.00024414,
+         0.        ,  0.        ], dtype=float32),
+ 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav',
+ 'sampling_rate': 8000}
+```
+
+这会返回三个对象：
+
+* `array` 是加载的语音信号 - 并在必要时重新采为`1D array`。
+* `path` 指向音频文件的位置。
+* `sampling_rate` 是每秒测量的语音信号数据点数量。
+
+对于本教程，您将使用[Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base)模型。查看模型卡片，您将了解到Wav2Vec2是在16kHz采样的语音音频数据上预训练的。重要的是，您的音频数据的采样率要与用于预训练模型的数据集的采样率匹配。如果您的数据的采样率不同，那么您需要对数据进行重新采样。
+
+1. 使用🤗 Datasets的[`~datasets.Dataset.cast_column`]方法将采样率提升到16kHz：
+
+```py
+>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16_000))
+```
+
+2. 再次调用 `audio` 列以重新采样音频文件：
+
+
+```py
+>>> dataset[0]["audio"]
+{'array': array([ 2.3443763e-05,  2.1729663e-04,  2.2145823e-04, ...,
+         3.8356509e-05, -7.3497440e-06, -2.1754686e-05], dtype=float32),
+ 'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~JOINT_ACCOUNT/602ba55abb1e6d0fbce92065.wav',
+ 'sampling_rate': 16000}
+```
+
+接下来，加载一个`feature extractor`以对输入进行标准化和填充。当填充文本数据时，会为较短的序列添加 `0`。相同的理念适用于音频数据。`feature extractor`添加 `0` - 被解释为静音 - 到`array` 。
+
+使用 [`AutoFeatureExtractor.from_pretrained`] 加载`feature extractor`：
+
+
+```py
+>>> from transformers import AutoFeatureExtractor
+
+>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
+```
+
+将音频 `array` 传递给`feature extractor`。我们还建议在`feature extractor`中添加 `sampling_rate` 参数，以更好地调试可能发生的静音错误：
+
+
+```py
+>>> audio_input = [dataset[0]["audio"]["array"]]
+>>> feature_extractor(audio_input, sampling_rate=16000)
+{'input_values': [array([ 3.8106556e-04,  2.7506407e-03,  2.8015103e-03, ...,
+        5.6335266e-04,  4.6588284e-06, -1.7142107e-04], dtype=float32)]}
+```
+
+就像`tokenizer`一样，您可以应用填充或截断来处理批次中的可变序列。请查看这两个音频样本的序列长度：
+
+
+```py
+>>> dataset[0]["audio"]["array"].shape
+(173398,)
+
+>>> dataset[1]["audio"]["array"].shape
+(106496,)
+```
+
+创建一个函数来预处理数据集，以使音频样本具有相同的长度。通过指定最大样本长度，`feature extractor`将填充或截断序列以使其匹配：
+
+
+```py
+>>> def preprocess_function(examples):
+...     audio_arrays = [x["array"] for x in examples["audio"]]
+...     inputs = feature_extractor(
+...         audio_arrays,
+...         sampling_rate=16000,
+...         padding=True,
+...         max_length=100000,
+...         truncation=True,
+...     )
+...     return inputs
+```
+
+将`preprocess_function`应用于数据集中的前几个示例：
+
+
+```py
+>>> processed_dataset = preprocess_function(dataset[:5])
+```
+
+现在样本长度是相同的，并且与指定的最大长度匹配。您现在可以将经过处理的数据集传递给模型了！
+
+
+```py
+>>> processed_dataset["input_values"][0].shape
+(100000,)
+
+>>> processed_dataset["input_values"][1].shape
+(100000,)
+```
+
+## 计算机视觉
+
+对于计算机视觉任务，您需要一个[ image processor](main_classes/image_processor)来准备数据集以供模型使用。图像预处理包括多个步骤将图像转换为模型期望输入的格式。这些步骤包括但不限于调整大小、标准化、颜色通道校正以及将图像转换为张量。
+
+<Tip>
+
+图像预处理通常遵循某种形式的图像增强。图像预处理和图像增强都会改变图像数据，但它们有不同的目的：
+
+* 图像增强可以帮助防止过拟合并增加模型的鲁棒性。您可以在数据增强方面充分发挥创造性 - 调整亮度和颜色、裁剪、旋转、调整大小、缩放等。但要注意不要改变图像的含义。
+* 图像预处理确保图像与模型预期的输入格式匹配。在微调计算机视觉模型时，必须对图像进行与模型训练时相同的预处理。
+
+您可以使用任何您喜欢的图像增强库。对于图像预处理，请使用与模型相关联的`ImageProcessor`。
+
+</Tip>
+
+加载[food101](https://huggingface.co/datasets/food101)数据集（有关如何加载数据集的更多详细信息，请参阅🤗 [Datasets教程](https://huggingface.co/docs/datasets/load_hub.html)）以了解如何在计算机视觉数据集中使用图像处理器：
+
+<Tip>
+
+因为数据集相当大，请使用🤗 Datasets的`split`参数加载训练集中的少量样本！
+
+</Tip>
+
+
+```py
+>>> from datasets import load_dataset
+
+>>> dataset = load_dataset("food101", split="train[:100]")
+```
+
+接下来，使用🤗 Datasets的[`Image`](https://huggingface.co/docs/datasets/package_reference/main_classes.html?highlight=image#datasets.Image)功能查看图像：
+
+
+```py
+>>> dataset[0]["image"]
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/vision-preprocess-tutorial.png"/>
+</div>
+
+使用 [`AutoImageProcessor.from_pretrained`] 加载`image processor`：
+
+```py
+>>> from transformers import AutoImageProcessor
+
+>>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
+```
+
+首先，让我们进行图像增强。您可以使用任何您喜欢的库，但在本教程中，我们将使用torchvision的[`transforms`](https://pytorch.org/vision/stable/transforms.html)模块。如果您有兴趣使用其他数据增强库，请参阅[Albumentations](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_albumentations.ipynb)或[Kornia notebooks](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification_kornia.ipynb)中的示例。
+
+1. 在这里，我们使用[`Compose`](https://pytorch.org/vision/master/generated/torchvision.transforms.Compose.html)将[`RandomResizedCrop`](https://pytorch.org/vision/main/generated/torchvision.transforms.RandomResizedCrop.html)和 [`ColorJitter`](https://pytorch.org/vision/main/generated/torchvision.transforms.ColorJitter.html)变换连接在一起。请注意，对于调整大小，我们可以从`image_processor`中获取图像尺寸要求。对于一些模型，精确的高度和宽度需要被定义，对于其他模型只需定义`shortest_edge`。
+
+
+```py
+>>> from torchvision.transforms import RandomResizedCrop, ColorJitter, Compose
+
+>>> size = (
+...     image_processor.size["shortest_edge"]
+...     if "shortest_edge" in image_processor.size
+...     else (image_processor.size["height"], image_processor.size["width"])
+... )
+
+>>> _transforms = Compose([RandomResizedCrop(size), ColorJitter(brightness=0.5, hue=0.5)])
+```
+
+2. 模型接受 [`pixel_values`](model_doc/visionencoderdecoder#transformers.VisionEncoderDecoderModel.forward.pixel_values) 作为输入。`ImageProcessor` 可以进行图像的标准化，并生成适当的张量。创建一个函数，将图像增强和图像预处理步骤组合起来处理批量图像，并生成 `pixel_values`：
+
+
+```py
+>>> def transforms(examples):
+...     images = [_transforms(img.convert("RGB")) for img in examples["image"]]
+...     examples["pixel_values"] = image_processor(images, do_resize=False, return_tensors="pt")["pixel_values"]
+...     return examples
+```
+
+<Tip>
+
+在上面的示例中，我们设置`do_resize=False`，因为我们已经在图像增强转换中调整了图像的大小，并利用了适当的`image_processor`的`size`属性。如果您在图像增强期间不调整图像的大小，请将此参数排除在外。默认情况下`ImageProcessor`将处理调整大小。
+
+如果希望将图像标准化步骤为图像增强的一部分，请使用`image_processor.image_mean`和`image_processor.image_std`。
+
+</Tip>
+
+3. 然后使用🤗 Datasets的[`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform)在运行时应用这些变换：
+
+
+```py
+>>> dataset.set_transform(transforms)
+```
+
+4. 现在，当您访问图像时，您将注意到`image processor`已添加了 `pixel_values`。您现在可以将经过处理的数据集传递给模型了！
+
+
+```py
+>>> dataset[0].keys()
+```
+
+这是在应用变换后的图像样子。图像已被随机裁剪，并其颜色属性发生了变化。
+
+
+```py
+>>> import numpy as np
+>>> import matplotlib.pyplot as plt
+
+>>> img = dataset[0]["pixel_values"]
+>>> plt.imshow(img.permute(1, 2, 0))
+```
+
+<div class="flex justify-center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/preprocessed_image.png"/>
+</div>
+
+<Tip>
+
+对于诸如目标检测、语义分割、实例分割和全景分割等任务，`ImageProcessor`提供了训练后处理方法。这些方法将模型的原始输出转换为有意义的预测，如边界框或分割地图。
+
+</Tip>
+
+### 填充
+
+在某些情况下，例如，在微调[DETR](./model_doc/detr)时，模型在训练时应用了尺度增强。这可能导致批处理中的图像大小不同。您可以使用[`DetrImageProcessor.pad`]来指定自定义的`collate_fn`将图像批处理在一起。
+
+```py
+>>> def collate_fn(batch):
+...     pixel_values = [item["pixel_values"] for item in batch]
+...     encoding = image_processor.pad(pixel_values, return_tensors="pt")
+...     labels = [item["labels"] for item in batch]
+...     batch = {}
+...     batch["pixel_values"] = encoding["pixel_values"]
+...     batch["pixel_mask"] = encoding["pixel_mask"]
+...     batch["labels"] = labels
+...     return batch
+```
+
+## 多模态
+
+对于涉及多模态输入的任务，您需要[processor](main_classes/processors)来为模型准备数据集。`processor`将两个处理对象-例如`tokenizer`和`feature extractor`-组合在一起。
+
+加载[LJ Speech](https://huggingface.co/datasets/lj_speech)数据集（有关如何加载数据集的更多详细信息，请参阅🤗 [Datasets 教程](https://huggingface.co/docs/datasets/load_hub.html)）以了解如何使用`processor`进行自动语音识别（ASR）：
+
+
+```py
+>>> from datasets import load_dataset
+
+>>> lj_speech = load_dataset("lj_speech", split="train")
+```
+
+对于ASR（自动语音识别），主要关注`audio`和`text`，因此可以删除其他列：
+
+
+```py
+>>> lj_speech = lj_speech.map(remove_columns=["file", "id", "normalized_text"])
+```
+
+现在查看`audio`和`text`列：
+
+```py
+>>> lj_speech[0]["audio"]
+{'array': array([-7.3242188e-04, -7.6293945e-04, -6.4086914e-04, ...,
+         7.3242188e-04,  2.1362305e-04,  6.1035156e-05], dtype=float32),
+ 'path': '/root/.cache/huggingface/datasets/downloads/extracted/917ece08c95cf0c4115e45294e3cd0dee724a1165b7fc11798369308a465bd26/LJSpeech-1.1/wavs/LJ001-0001.wav',
+ 'sampling_rate': 22050}
+
+>>> lj_speech[0]["text"]
+'Printing, in the only sense with which we are at present concerned, differs from most if not from all the arts and crafts represented in the Exhibition'
+```
+
+请记住，您应始终[重新采样](preprocessing#audio)音频数据集的采样率，以匹配用于预训练模型数据集的采样率！
+
+
+```py
+>>> lj_speech = lj_speech.cast_column("audio", Audio(sampling_rate=16_000))
+```
+
+使用[`AutoProcessor.from_pretrained`]加载一个`processor`：
+
+
+```py
+>>> from transformers import AutoProcessor
+
+>>> processor = AutoProcessor.from_pretrained("facebook/wav2vec2-base-960h")
+```
+
+1. 创建一个函数，用于将包含在 `array` 中的音频数据处理为 `input_values`，并将 `text` 标记为 `labels`。这些将是输入模型的数据：
+
+```py
+>>> def prepare_dataset(example):
+...     audio = example["audio"]
+
+...     example.update(processor(audio=audio["array"], text=example["text"], sampling_rate=16000))
+
+...     return example
+```
+
+2. 将 `prepare_dataset` 函数应用于一个示例：
+
+```py
+>>> prepare_dataset(lj_speech[0])
+```
+
+`processor`现在已经添加了 `input_values` 和 `labels`，并且采样率也正确降低为为16kHz。现在可以将处理后的数据集传递给模型！
--- a/docs/source/zh/serialization.md
+++ b/docs/source/zh/serialization.md
@ -0,0 +1,181 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 导出为 ONNX
+
+在生产环境中部署 🤗 Transformers 模型通常需要或者能够受益于，将模型导出为可在专门的运行时和硬件上加载和执行的序列化格式。
+
+🤗 Optimum 是 Transformers 的扩展，可以通过其 `exporters` 模块将模型从 PyTorch 或 TensorFlow 导出为 ONNX 及 TFLite 等序列化格式。🤗 Optimum 还提供了一套性能优化工具，可以在目标硬件上以最高效率训练和运行模型。
+
+本指南演示了如何使用 🤗 Optimum 将 🤗 Transformers 模型导出为 ONNX。有关将模型导出为 TFLite 的指南，请参考 [导出为 TFLite 页面](tflite)。
+
+## 导出为 ONNX
+
+[ONNX (Open Neural Network eXchange 开放神经网络交换)](http://onnx.ai) 是一个开放的标准，它定义了一组通用的运算符和一种通用的文件格式，用于表示包括 PyTorch 和 TensorFlow 在内的各种框架中的深度学习模型。当一个模型被导出为 ONNX时，这些运算符被用于构建计算图（通常被称为*中间表示*），该图表示数据在神经网络中的流动。
+
+通过公开具有标准化运算符和数据类型的图，ONNX使得模型能够轻松在不同深度学习框架间切换。例如，在 PyTorch 中训练的模型可以被导出为 ONNX，然后再导入到 TensorFlow（反之亦然）。
+
+导出为 ONNX 后，模型可以：
+- 通过 [图优化（graph optimization）](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/optimization) 和 [量化（quantization）](https://huggingface.co/docs/optimum/onnxruntime/usage_guides/quantization) 等技术进行推理优化。 
+- 通过 [`ORTModelForXXX` 类](https://huggingface.co/docs/optimum/onnxruntime/package_reference/modeling_ort) 使用 ONNX Runtime 运行，它同样遵循你熟悉的 Transformers 中的 `AutoModel` API。
+- 使用 [优化推理流水线（pipeline）](https://huggingface.co/docs/optimum/main/en/onnxruntime/usage_guides/pipelines) 运行，其 API 与 🤗 Transformers 中的 [`pipeline`] 函数相同。
+
+🤗 Optimum 通过利用配置对象提供对 ONNX 导出的支持。多种模型架构已经有现成的配置对象，并且配置对象也被设计得易于扩展以适用于其他架构。
+
+现有的配置列表请参考 [🤗 Optimum 文档](https://huggingface.co/docs/optimum/exporters/onnx/overview)。
+
+有两种方式可以将 🤗 Transformers 模型导出为 ONNX，这里我们展示这两种方法：
+
+- 使用 🤗 Optimum 的 CLI（命令行）导出。
+- 使用 🤗 Optimum 的 `optimum.onnxruntime` 模块导出。
+
+### 使用 CLI 将 🤗 Transformers 模型导出为 ONNX
+
+要将 🤗 Transformers 模型导出为 ONNX，首先需要安装额外的依赖项：
+
+```bash
+pip install optimum[exporters]
+```
+
+请参阅 [🤗 Optimum 文档](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/export_a_model#exporting-a-model-to-onnx-using-the-cli) 以查看所有可用参数，或者在命令行中查看帮助：
+
+```bash
+optimum-cli export onnx --help
+```
+
+运行以下命令，以从 🤗 Hub 导出模型的检查点（checkpoint），以 `distilbert-base-uncased-distilled-squad` 为例：
+
+```bash
+optimum-cli export onnx --model distilbert-base-uncased-distilled-squad distilbert_base_uncased_squad_onnx/
+```
+
+你应该能在日志中看到导出进度以及生成的 `model.onnx` 文件的保存位置，如下所示：
+
+```bash
+Validating ONNX model distilbert_base_uncased_squad_onnx/model.onnx...
+	-[✓] ONNX model output names match reference model (start_logits, end_logits)
+	- Validating ONNX Model output "start_logits":
+		-[✓] (2, 16) matches (2, 16)
+		-[✓] all values close (atol: 0.0001)
+	- Validating ONNX Model output "end_logits":
+		-[✓] (2, 16) matches (2, 16)
+		-[✓] all values close (atol: 0.0001)
+The ONNX export succeeded and the exported model was saved at: distilbert_base_uncased_squad_onnx
+```
+
+上面的示例说明了从 🤗 Hub 导出检查点的过程。导出本地模型时，首先需要确保将模型的权重和分词器文件保存在同一目录（`local_path`）中。在使用 CLI 时，将 `local_path` 传递给 `model` 参数，而不是 🤗 Hub 上的检查点名称，并提供 `--task` 参数。你可以在 [🤗 Optimum 文档](https://huggingface.co/docs/optimum/exporters/task_manager)中查看支持的任务列表。如果未提供 `task` 参数，将默认导出不带特定任务头的模型架构。
+
+```bash
+optimum-cli export onnx --model local_path --task question-answering distilbert_base_uncased_squad_onnx/
+```
+
+生成的 `model.onnx` 文件可以在支持 ONNX 标准的 [许多加速引擎（accelerators）](https://onnx.ai/supported-tools.html#deployModel) 之一上运行。例如，我们可以使用 [ONNX Runtime](https://onnxruntime.ai/) 加载和运行模型，如下所示：
+
+```python
+>>> from transformers import AutoTokenizer
+>>> from optimum.onnxruntime import ORTModelForQuestionAnswering
+
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert_base_uncased_squad_onnx")
+>>> model = ORTModelForQuestionAnswering.from_pretrained("distilbert_base_uncased_squad_onnx")
+>>> inputs = tokenizer("What am I using?", "Using DistilBERT with ONNX Runtime!", return_tensors="pt")
+>>> outputs = model(**inputs)
+```
+
+从 Hub 导出 TensorFlow 检查点的过程也一样。例如，以下是从 [Keras 组织](https://huggingface.co/keras-io) 导出纯 TensorFlow 检查点的命令：
+
+```bash
+optimum-cli export onnx --model keras-io/transformers-qa distilbert_base_cased_squad_onnx/
+```
+
+### 使用 `optimum.onnxruntime` 将 🤗 Transformers 模型导出为 ONNX
+
+除了 CLI 之外，你还可以使用代码将 🤗 Transformers 模型导出为 ONNX，如下所示：
+
+```python
+>>> from optimum.onnxruntime import ORTModelForSequenceClassification
+>>> from transformers import AutoTokenizer
+
+>>> model_checkpoint = "distilbert_base_uncased_squad"
+>>> save_directory = "onnx/"
+
+>>> # 从 transformers 加载模型并将其导出为 ONNX
+>>> ort_model = ORTModelForSequenceClassification.from_pretrained(model_checkpoint, export=True)
+>>> tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
+
+>>> # 保存 onnx 模型以及分词器
+>>> ort_model.save_pretrained(save_directory)
+>>> tokenizer.save_pretrained(save_directory)
+```
+
+### 导出尚未支持的架构的模型
+
+如果你想要为当前无法导出的模型添加支持，请先检查 [`optimum.exporters.onnx`](https://huggingface.co/docs/optimum/exporters/onnx/overview) 是否支持该模型，如果不支持，你可以 [直接为 🤗 Optimum 贡献代码](https://huggingface.co/docs/optimum/exporters/onnx/usage_guides/contribute)。
+
+### 使用 `transformers.onnx` 导出模型
+
+<Tip warning={true}>
+
+`tranformers.onnx` 不再进行维护，请如上所述，使用 🤗 Optimum 导出模型。这部分内容将在未来版本中删除。
+
+</Tip>
+
+要使用 `tranformers.onnx` 将 🤗 Transformers 模型导出为 ONNX，请安装额外的依赖项：
+
+```bash
+pip install transformers[onnx]
+```
+
+将 `transformers.onnx` 包作为 Python 模块使用，以使用现成的配置导出检查点：
+
+```bash
+python -m transformers.onnx --model=distilbert-base-uncased onnx/
+```
+
+以上代码将导出由 `--model` 参数定义的检查点的 ONNX 图。传入任何 🤗 Hub 上或者存储与本地的检查点。生成的 `model.onnx` 文件可以在支持 ONNX 标准的众多加速引擎上运行。例如，使用 ONNX Runtime 加载并运行模型，如下所示：
+
+```python
+>>> from transformers import AutoTokenizer
+>>> from onnxruntime import InferenceSession
+
+>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
+>>> session = InferenceSession("onnx/model.onnx")
+>>> # ONNX Runtime expects NumPy arrays as input
+>>> inputs = tokenizer("Using DistilBERT with ONNX Runtime!", return_tensors="np")
+>>> outputs = session.run(output_names=["last_hidden_state"], input_feed=dict(inputs))
+```
+
+可以通过查看每个模型的 ONNX 配置来获取所需的输出名（例如 `["last_hidden_state"]`）。例如，对于 DistilBERT，可以用以下代码获取输出名称：
+
+```python
+>>> from transformers.models.distilbert import DistilBertConfig, DistilBertOnnxConfig
+
+>>> config = DistilBertConfig()
+>>> onnx_config = DistilBertOnnxConfig(config)
+>>> print(list(onnx_config.outputs.keys()))
+["last_hidden_state"]
+```
+
+从 Hub 导出 TensorFlow 检查点的过程也一样。导出纯 TensorFlow 检查点的示例代码如下：
+
+```bash
+python -m transformers.onnx --model=keras-io/transformers-qa onnx/
+```
+
+要导出本地存储的模型，请将模型的权重和分词器文件保存在同一目录中（例如 `local-pt-checkpoint`），然后通过将 `transformers.onnx` 包的 `--model` 参数指向该目录，将其导出为 ONNX：
+
+```bash
+python -m transformers.onnx --model=local-pt-checkpoint onnx/
+```
--- a/docs/source/zh/task_summary.md
+++ b/docs/source/zh/task_summary.md
@ -0,0 +1,347 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 🤗 Transformers 能做什么
+
+🤗 Transformers是一个用于自然语言处理（NLP）、计算机视觉和音频和语音处理任务的预训练模型库。该库不仅包含Transformer模型，还包括用于计算机视觉任务的现代卷积网络等非Transformer模型。如果您看看今天最受欢迎的一些消费产品，比如智能手机、应用程序和电视，很可能背后都有某种深度学习技术的支持。想要从您智能手机拍摄的照片中删除背景对象吗？这里是一个全景分割任务的例子（如果您还不了解这是什么意思，我们将在以下部分进行描述！）。
+
+本页面提供了使用🤗 Transformers库仅用三行代码解决不同的语音和音频、计算机视觉和NLP任务的概述！
+
+
+## 音频
+音频和语音处理任务与其他模态略有不同，主要是因为音频作为输入是一个连续的信号。与文本不同，原始音频波形不能像句子可以被划分为单词那样被整齐地分割成离散的块。为了解决这个问题，通常在固定的时间间隔内对原始音频信号进行采样。如果在每个时间间隔内采样更多样本，采样率就会更高，音频更接近原始音频源。
+
+以前的方法是预处理音频以从中提取有用的特征。现在更常见的做法是直接将原始音频波形输入到特征编码器中，以提取音频表示。这样可以简化预处理步骤，并允许模型学习最重要的特征。
+
+### 音频分类
+
+音频分类是一项将音频数据从预定义的类别集合中进行标记的任务。这是一个广泛的类别，具有许多具体的应用，其中一些包括：
+
+* 声学场景分类：使用场景标签（"办公室"、"海滩"、"体育场"）对音频进行标记。
+* 声学事件检测：使用声音事件标签（"汽车喇叭声"、"鲸鱼叫声"、"玻璃破碎声"）对音频进行标记。
+* 标记：对包含多种声音的音频进行标记（鸟鸣、会议中的说话人识别）。
+* 音乐分类：使用流派标签（"金属"、"嘻哈"、"乡村"）对音乐进行标记。
+
+```py
+>>> from transformers import pipeline
+
+>>> classifier = pipeline(task="audio-classification", model="superb/hubert-base-superb-er")
+>>> preds = classifier("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
+>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
+>>> preds
+[{'score': 0.4532, 'label': 'hap'},
+ {'score': 0.3622, 'label': 'sad'},
+ {'score': 0.0943, 'label': 'neu'},
+ {'score': 0.0903, 'label': 'ang'}]
+```
+
+### 自动语音识别
+
+自动语音识别（ASR）将语音转录为文本。这是最常见的音频任务之一，部分原因是因为语音是人类交流的自然形式。如今，ASR系统嵌入在智能技术产品中，如扬声器、电话和汽车。我们可以要求虚拟助手播放音乐、设置提醒和告诉我们天气。
+
+但是，Transformer架构帮助解决的一个关键挑战是低资源语言。通过在大量语音数据上进行预训练，仅在一个低资源语言的一小时标记语音数据上进行微调，仍然可以产生与以前在100倍更多标记数据上训练的ASR系统相比高质量的结果。
+
+```py
+>>> from transformers import pipeline
+
+>>> transcriber = pipeline(task="automatic-speech-recognition", model="openai/whisper-small")
+>>> transcriber("https://huggingface.co/datasets/Narsil/asr_dummy/resolve/main/mlk.flac")
+{'text': ' I have a dream that one day this nation will rise up and live out the true meaning of its creed.'}
+```
+
+## 计算机视觉
+
+计算机视觉任务中最早成功之一是使用卷积神经网络（[CNN](glossary#convolution)）识别邮政编码数字图像。图像由像素组成，每个像素都有一个数值。这使得将图像表示为像素值矩阵变得容易。每个像素值组合描述了图像的颜色。
+
+计算机视觉任务可以通过以下两种通用方式解决：
+
+1. 使用卷积来学习图像的层次特征，从低级特征到高级抽象特征。
+2. 将图像分成块，并使用Transformer逐步学习每个图像块如何相互关联以形成图像。与CNN偏好的自底向上方法不同，这种方法有点像从一个模糊的图像开始，然后逐渐将其聚焦清晰。
+
+### 图像分类
+
+图像分类将整个图像从预定义的类别集合中进行标记。像大多数分类任务一样，图像分类有许多实际用例，其中一些包括：
+
+* 医疗保健：标记医学图像以检测疾病或监测患者健康状况
+* 环境：标记卫星图像以监测森林砍伐、提供野外管理信息或检测野火
+* 农业：标记农作物图像以监测植物健康或用于土地使用监测的卫星图像
+* 生态学：标记动物或植物物种的图像以监测野生动物种群或跟踪濒危物种
+
+```py
+>>> from transformers import pipeline
+
+>>> classifier = pipeline(task="image-classification")
+>>> preds = classifier(
+...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+... )
+>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
+>>> print(*preds, sep="\n")
+{'score': 0.4335, 'label': 'lynx, catamount'}
+{'score': 0.0348, 'label': 'cougar, puma, catamount, mountain lion, painter, panther, Felis concolor'}
+{'score': 0.0324, 'label': 'snow leopard, ounce, Panthera uncia'}
+{'score': 0.0239, 'label': 'Egyptian cat'}
+{'score': 0.0229, 'label': 'tiger cat'}
+```
+
+### 目标检测
+
+与图像分类不同，目标检测在图像中识别多个对象以及这些对象在图像中的位置（由边界框定义）。目标检测的一些示例应用包括：
+
+* 自动驾驶车辆：检测日常交通对象，如其他车辆、行人和红绿灯
+* 遥感：灾害监测、城市规划和天气预报
+* 缺陷检测：检测建筑物中的裂缝或结构损坏，以及制造业产品缺陷
+
+
+```py
+>>> from transformers import pipeline
+
+>>> detector = pipeline(task="object-detection")
+>>> preds = detector(
+...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+... )
+>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"], "box": pred["box"]} for pred in preds]
+>>> preds
+[{'score': 0.9865,
+  'label': 'cat',
+  'box': {'xmin': 178, 'ymin': 154, 'xmax': 882, 'ymax': 598}}]
+```
+
+### 图像分割
+
+图像分割是一项像素级任务，将图像中的每个像素分配给一个类别。它与使用边界框标记和预测图像中的对象的目标检测不同，因为分割更加精细。分割可以在像素级别检测对象。有几种类型的图像分割：
+
+* 实例分割：除了标记对象的类别外，还标记每个对象的不同实例（“dog-1”，“dog-2”）
+* 全景分割：语义分割和实例分割的组合； 它使用语义类为每个像素标记并标记每个对象的不同实例
+
+分割任务对于自动驾驶车辆很有帮助，可以创建周围世界的像素级地图，以便它们可以在行人和其他车辆周围安全导航。它还适用于医学成像，其中任务的更精细粒度可以帮助识别异常细胞或器官特征。图像分割也可以用于电子商务，通过您的相机在现实世界中覆盖物体来虚拟试穿衣服或创建增强现实体验。
+
+```py
+>>> from transformers import pipeline
+
+>>> segmenter = pipeline(task="image-segmentation")
+>>> preds = segmenter(
+...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+... )
+>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
+>>> print(*preds, sep="\n")
+{'score': 0.9879, 'label': 'LABEL_184'}
+{'score': 0.9973, 'label': 'snow'}
+{'score': 0.9972, 'label': 'cat'}
+```
+
+### 深度估计
+
+深度估计预测图像中每个像素到相机的距离。这个计算机视觉任务对于场景理解和重建尤为重要。例如，在自动驾驶汽车中，车辆需要了解行人、交通标志和其他车辆等物体的距离，以避免障碍物和碰撞。深度信息还有助于从2D图像构建3D表示，并可用于创建生物结构或建筑物的高质量3D表示。
+
+有两种方法可以进行深度估计：
+
+* stereo（立体）：通过比较同一图像的两个略微不同角度的图像来估计深度
+* monocular（单目）：从单个图像中估计深度
+
+
+```py
+>>> from transformers import pipeline
+
+>>> depth_estimator = pipeline(task="depth-estimation")
+>>> preds = depth_estimator(
+...     "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+... )
+```
+
+## 自然语言处理
+
+NLP任务是最常见的类型之一，因为文本是我们进行交流的自然方式。为了让文本变成模型识别的格式，需要对其进行分词。这意味着将一段文本分成单独的单词或子词（`tokens`），然后将这些`tokens`转换为数字。因此，可以将一段文本表示为一系列数字，一旦有了一系列的数字，就可以将其输入到模型中以解决各种NLP任务！
+
+### 文本分类
+
+像任何模态的分类任务一样，文本分类将一段文本（可以是句子级别、段落或文档）从预定义的类别集合中进行标记。文本分类有许多实际应用，其中一些包括：
+
+* 情感分析：根据某些极性（如`积极`或`消极`）对文本进行标记，可以支持政治、金融和营销等领域的决策制定
+* 内容分类：根据某些主题对文本进行标记，有助于组织和过滤新闻和社交媒体提要中的信息（`天气`、`体育`、`金融`等）
+
+
+```py
+>>> from transformers import pipeline
+
+>>> classifier = pipeline(task="sentiment-analysis")
+>>> preds = classifier("Hugging Face is the best thing since sliced bread!")
+>>> preds = [{"score": round(pred["score"], 4), "label": pred["label"]} for pred in preds]
+>>> preds
+[{'score': 0.9991, 'label': 'POSITIVE'}]
+```
+
+### Token分类
+
+在任何NLP任务中，文本都经过预处理，将文本序列分成单个单词或子词。这些被称为[tokens](/glossary#token)。Token分类将每个`token`分配一个来自预定义类别集的标签。
+
+两种常见的Token分类是：
+
+* 命名实体识别（NER）：根据实体类别（如组织、人员、位置或日期）对`token`进行标记。NER在生物医学设置中特别受欢迎，可以标记基因、蛋白质和药物名称。
+* 词性标注（POS）：根据其词性（如名词、动词或形容词）对标记进行标记。POS对于帮助翻译系统了解两个相同的单词如何在语法上不同很有用（作为名词的银行与作为动词的银行）。
+
+```py
+>>> from transformers import pipeline
+
+>>> classifier = pipeline(task="ner")
+>>> preds = classifier("Hugging Face is a French company based in New York City.")
+>>> preds = [
+...     {
+...         "entity": pred["entity"],
+...         "score": round(pred["score"], 4),
+...         "index": pred["index"],
+...         "word": pred["word"],
+...         "start": pred["start"],
+...         "end": pred["end"],
+...     }
+...     for pred in preds
+... ]
+>>> print(*preds, sep="\n")
+{'entity': 'I-ORG', 'score': 0.9968, 'index': 1, 'word': 'Hu', 'start': 0, 'end': 2}
+{'entity': 'I-ORG', 'score': 0.9293, 'index': 2, 'word': '##gging', 'start': 2, 'end': 7}
+{'entity': 'I-ORG', 'score': 0.9763, 'index': 3, 'word': 'Face', 'start': 8, 'end': 12}
+{'entity': 'I-MISC', 'score': 0.9983, 'index': 6, 'word': 'French', 'start': 18, 'end': 24}
+{'entity': 'I-LOC', 'score': 0.999, 'index': 10, 'word': 'New', 'start': 42, 'end': 45}
+{'entity': 'I-LOC', 'score': 0.9987, 'index': 11, 'word': 'York', 'start': 46, 'end': 50}
+{'entity': 'I-LOC', 'score': 0.9992, 'index': 12, 'word': 'City', 'start': 51, 'end': 55}
+```
+
+### 问答
+
+问答是另一个`token-level`的任务，返回一个问题的答案，有时带有上下文（开放领域），有时不带上下文（封闭领域）。每当我们向虚拟助手提出问题时，例如询问一家餐厅是否营业，就会发生这种情况。它还可以提供客户或技术支持，并帮助搜索引擎检索您要求的相关信息。
+
+有两种常见的问答类型：
+
+* 提取式：给定一个问题和一些上下文，答案是从模型必须提取的上下文中的一段文本跨度。
+* 抽象式：给定一个问题和一些上下文，答案从上下文中生成；这种方法由[`Text2TextGenerationPipeline`]处理，而不是下面显示的[`QuestionAnsweringPipeline`]。
+
+
+```py
+>>> from transformers import pipeline
+
+>>> question_answerer = pipeline(task="question-answering")
+>>> preds = question_answerer(
+...     question="What is the name of the repository?",
+...     context="The name of the repository is huggingface/transformers",
+... )
+>>> print(
+...     f"score: {round(preds['score'], 4)}, start: {preds['start']}, end: {preds['end']}, answer: {preds['answer']}"
+... )
+score: 0.9327, start: 30, end: 54, answer: huggingface/transformers
+```
+
+### 摘要
+
+摘要从较长的文本中创建一个较短的版本，同时尽可能保留原始文档的大部分含义。摘要是一个序列到序列的任务；它输出比输入更短的文本序列。有许多长篇文档可以进行摘要，以帮助读者快速了解主要要点。法案、法律和财务文件、专利和科学论文等文档可以摘要，以节省读者的时间并作为阅读辅助工具。
+
+像问答一样，摘要有两种类型：
+
+* 提取式：从原始文本中识别和提取最重要的句子
+* 抽象式：从原始文本生成目标摘要（可能包括不在输入文档中的新单词）；[`SummarizationPipeline`]使用抽象方法。
+
+
+```py
+>>> from transformers import pipeline
+
+>>> summarizer = pipeline(task="summarization")
+>>> summarizer(
+...     "In this work, we presented the Transformer, the first sequence transduction model based entirely on attention, replacing the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention. For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers. On both WMT 2014 English-to-German and WMT 2014 English-to-French translation tasks, we achieve a new state of the art. In the former task our best model outperforms even all previously reported ensembles."
+... )
+[{'summary_text': ' The Transformer is the first sequence transduction model based entirely on attention . It replaces the recurrent layers most commonly used in encoder-decoder architectures with multi-headed self-attention . For translation tasks, the Transformer can be trained significantly faster than architectures based on recurrent or convolutional layers .'}]
+```
+
+### 翻译
+
+翻译将一种语言的文本序列转换为另一种语言。它对于帮助来自不同背景的人们相互交流、帮助翻译内容以吸引更广泛的受众，甚至成为学习工具以帮助人们学习一门新语言都非常重要。除了摘要之外，翻译也是一个序列到序列的任务，意味着模型接收输入序列并返回目标输出序列。
+
+在早期，翻译模型大多是单语的，但最近，越来越多的人对可以在多种语言之间进行翻译的多语言模型感兴趣。
+
+```py
+>>> from transformers import pipeline
+
+>>> text = "translate English to French: Hugging Face is a community-based open-source platform for machine learning."
+>>> translator = pipeline(task="translation", model="t5-small")
+>>> translator(text)
+[{'translation_text': "Hugging Face est une tribune communautaire de l'apprentissage des machines."}]
+```
+
+### 语言模型
+
+语言模型是一种预测文本序列中单词的任务。它已成为一种非常流行的NLP任务，因为预训练的语言模型可以微调用于许多其他下游任务。最近，人们对大型语言模型（LLMs）表现出了极大的兴趣，这些模型展示了`zero learning`或`few-shot learning`的能力。这意味着模型可以解决它未被明确训练过的任务！语言模型可用于生成流畅和令人信服的文本，但需要小心，因为文本可能并不总是准确的。
+
+有两种类型的话语模型：
+
+* causal：模型的目标是预测序列中的下一个`token`，而未来的`tokens`被遮盖。
+  
+
+    ```py
+    >>> from transformers import pipeline
+
+    >>> prompt = "Hugging Face is a community-based open-source platform for machine learning."
+    >>> generator = pipeline(task="text-generation")
+    >>> generator(prompt)  # doctest: +SKIP
+    ```
+
+*  masked：模型的目标是预测序列中被遮蔽的`token`，同时具有对序列中所有`tokens`的完全访问权限。
+
+    
+    ```py
+    >>> text = "Hugging Face is a community-based open-source <mask> for machine learning."
+    >>> fill_mask = pipeline(task="fill-mask")
+    >>> preds = fill_mask(text, top_k=1)
+    >>> preds = [
+    ...     {
+    ...         "score": round(pred["score"], 4),
+    ...         "token": pred["token"],
+    ...         "token_str": pred["token_str"],
+    ...         "sequence": pred["sequence"],
+    ...     }
+    ...     for pred in preds
+    ... ]
+    >>> preds
+    [{'score': 0.2236,
+      'token': 1761,
+      'token_str': ' platform',
+      'sequence': 'Hugging Face is a community-based open-source platform for machine learning.'}]
+    ```
+
+## 多模态
+
+多模态任务要求模型处理多种数据模态（文本、图像、音频、视频）以解决特定问题。图像描述是一个多模态任务的例子，其中模型将图像作为输入并输出描述图像或图像某些属性的文本序列。
+
+虽然多模态模型处理不同的数据类型或模态，但内部预处理步骤帮助模型将所有数据类型转换为`embeddings`（向量或数字列表，包含有关数据的有意义信息）。对于像图像描述这样的任务，模型学习图像嵌入和文本嵌入之间的关系。
+
+### 文档问答
+
+文档问答是从文档中回答自然语言问题的任务。与`token-level`问答任务不同，文档问答将包含问题的文档的图像作为输入，并返回答案。文档问答可用于解析结构化文档并从中提取关键信息。在下面的例子中，可以从收据中提取总金额和找零金额。
+
+```py
+>>> from transformers import pipeline
+>>> from PIL import Image
+>>> import requests
+
+>>> url = "https://datasets-server.huggingface.co/assets/hf-internal-testing/example-documents/--/hf-internal-testing--example-documents/test/2/image/image.jpg"
+>>> image = Image.open(requests.get(url, stream=True).raw)
+
+>>> doc_question_answerer = pipeline("document-question-answering", model="magorshunov/layoutlm-invoices")
+>>> preds = doc_question_answerer(
+...     question="What is the total amount?",
+...     image=image,
+... )
+>>> preds
+[{'score': 0.8531, 'answer': '17,000', 'start': 4, 'end': 4}]
+```
+
+希望这个页面为您提供了一些有关每种模态中所有类型任务的背景信息以及每个任务的实际重要性。在[下一节](tasks_explained)中，您将了解Transformers如何解决这些任务。
--- a/docs/source/zh/tflite.md
+++ b/docs/source/zh/tflite.md
@ -0,0 +1,54 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 导出为 TFLite
+
+[TensorFlow Lite](https://www.tensorflow.org/lite/guide) 是一个轻量级框架，用于资源受限的设备上，如手机、嵌入式系统和物联网（IoT）设备，部署机器学习模型。TFLite 旨在在计算能力、内存和功耗有限的设备上优化和高效运行模型。模型以一种特殊的高效可移植格式表示，其文件扩展名为 `.tflite`。
+
+🤗 Optimum 通过 `exporters.tflite` 模块提供将 🤗 Transformers 模型导出至 TFLite 格式的功能。请参考 [🤗 Optimum 文档](https://huggingface.co/docs/optimum/exporters/tflite/overview) 以获取支持的模型架构列表。
+
+要将模型导出为 TFLite 格式，请安装所需的依赖项：
+
+```bash
+pip install optimum[exporters-tf]
+```
+
+请参阅 [🤗 Optimum 文档](https://huggingface.co/docs/optimum/main/en/exporters/tflite/usage_guides/export_a_model) 以查看所有可用参数，或者在命令行中查看帮助：
+
+```bash
+optimum-cli export tflite --help
+```
+
+运行以下命令，以从 🤗 Hub 导出模型的检查点（checkpoint），以 `bert-base-uncased` 为例：
+
+```bash
+optimum-cli export tflite --model bert-base-uncased --sequence_length 128 bert_tflite/
+```
+
+你应该能在日志中看到导出进度以及生成的 `model.tflite` 文件的保存位置，如下所示：
+
+```bash
+Validating TFLite model...
+	-[✓] TFLite model output names match reference model (logits)
+	- Validating TFLite Model output "logits":
+		-[✓] (1, 128, 30522) matches (1, 128, 30522)
+		-[x] values not close enough, max diff: 5.817413330078125e-05 (atol: 1e-05)
+The TensorFlow Lite export succeeded with the warning: The maximum absolute difference between the output of the reference model and the TFLite exported model is not within the set tolerance 1e-05:
+- logits: max diff = 5.817413330078125e-05.
+ The exported model was saved at: bert_tflite
+```
+
+上面的示例说明了从 🤗 Hub 导出检查点的过程。导出本地模型时，首先需要确保将模型的权重和分词器文件保存在同一目录（`local_path`）中。在使用 CLI（命令行）时，将 `local_path` 传递给 `model` 参数，而不是 🤗 Hub 上的检查点名称。
--- a/docs/source/zh/training.md
+++ b/docs/source/zh/training.md
@ -0,0 +1,407 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 微调预训练模型
+
+[[open-in-colab]]
+
+使用预训练模型有许多显著的好处。它降低了计算成本，减少了碳排放，同时允许您使用最先进的模型，而无需从头开始训练一个。🤗 Transformers 提供了涉及各种任务的成千上万的预训练模型。当您使用预训练模型时，您需要在与任务相关的数据集上训练该模型。这种操作被称为微调，是一种非常强大的训练技术。在本教程中，您将使用您选择的深度学习框架来微调一个预训练模型：
+
+* 使用 🤗 Transformers 的 [`Trainer`] 来微调预训练模型。
+* 在 TensorFlow 中使用 Keras 来微调预训练模型。
+* 在原生 PyTorch 中微调预训练模型。
+
+<a id='data-processing'></a>
+
+## 准备数据集
+
+<Youtube id="_BZearw7f0w"/>
+
+在您进行预训练模型微调之前，需要下载一个数据集并为训练做好准备。之前的教程向您展示了如何处理训练数据，现在您有机会将这些技能付诸实践！
+
+首先，加载[Yelp评论](https://huggingface.co/datasets/yelp_review_full)数据集：
+
+```py
+>>> from datasets import load_dataset
+
+>>> dataset = load_dataset("yelp_review_full")
+>>> dataset["train"][100]
+{'label': 0,
+ 'text': 'My expectations for McDonalds are t rarely high. But for one to still fail so spectacularly...that takes something special!\\nThe cashier took my friends\'s order, then promptly ignored me. I had to force myself in front of a cashier who opened his register to wait on the person BEHIND me. I waited over five minutes for a gigantic order that included precisely one kid\'s meal. After watching two people who ordered after me be handed their food, I asked where mine was. The manager started yelling at the cashiers for \\"serving off their orders\\" when they didn\'t have their food. But neither cashier was anywhere near those controls, and the manager was the one serving food to customers and clearing the boards.\\nThe manager was rude when giving me my order. She didn\'t make sure that I had everything ON MY RECEIPT, and never even had the decency to apologize that I felt I was getting poor service.\\nI\'ve eaten at various McDonalds restaurants for over 30 years. I\'ve worked at more than one location. I expect bad days, bad moods, and the occasional mistake. But I have yet to have a decent experience at this store. It will remain a place I avoid unless someone in my party needs to avoid illness from low blood sugar. Perhaps I should go back to the racially biased service of Steak n Shake instead!'}
+```
+
+正如您现在所知，您需要一个`tokenizer`来处理文本，包括填充和截断操作以处理可变的序列长度。如果要一次性处理您的数据集，可以使用 🤗 Datasets 的 [`map`](https://huggingface.co/docs/datasets/process.html#map) 方法，将预处理函数应用于整个数据集：
+
+```py
+>>> from transformers import AutoTokenizer
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+
+
+>>> def tokenize_function(examples):
+...     return tokenizer(examples["text"], padding="max_length", truncation=True)
+
+
+>>> tokenized_datasets = dataset.map(tokenize_function, batched=True)
+```
+如果愿意的话，您可以从完整数据集提取一个较小子集来进行微调，以减少训练所需的时间：
+
+```py
+>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
+>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
+```
+
+<a id='trainer'></a>
+
+## 训练
+
+此时，您应该根据您训练所用的框架来选择对应的教程章节。您可以使用右侧的链接跳转到您想要的章节 - 如果您想隐藏某个框架对应的所有教程内容，只需使用右上角的按钮！
+
+
+<frameworkcontent>
+<pt>
+<Youtube id="nvBXf7s7vTI"/>
+
+## 使用 PyTorch Trainer 进行训练
+
+🤗 Transformers 提供了一个专为训练 🤗 Transformers 模型而优化的 [`Trainer`] 类，使您无需手动编写自己的训练循环步骤而更轻松地开始训练模型。[`Trainer`] API 支持各种训练选项和功能，如日志记录、梯度累积和混合精度。
+
+首先加载您的模型并指定期望的标签数量。根据 Yelp Review [数据集卡片](https://huggingface.co/datasets/yelp_review_full#data-fields)，您知道有五个标签：
+
+
+```py
+>>> from transformers import AutoModelForSequenceClassification
+
+>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+```
+
+<Tip>
+
+您将会看到一个警告，提到一些预训练权重未被使用，以及一些权重被随机初始化。不用担心，这是完全正常的！BERT 模型的预训练`head`被丢弃，并替换为一个随机初始化的分类`head`。您将在您的序列分类任务上微调这个新模型`head`，将预训练模型的知识转移给它。
+
+</Tip>
+
+### 训练超参数
+
+接下来，创建一个 [`TrainingArguments`] 类，其中包含您可以调整的所有超参数以及用于激活不同训练选项的标志。对于本教程，您可以从默认的训练[超参数](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments)开始，但随时可以尝试不同的设置以找到最佳设置。
+
+指定保存训练检查点的位置：
+
+```py
+>>> from transformers import TrainingArguments
+
+>>> training_args = TrainingArguments(output_dir="test_trainer")
+```
+
+### 评估
+
+[`Trainer`] 在训练过程中不会自动评估模型性能。您需要向 [`Trainer`] 传递一个函数来计算和展示指标。[🤗 Evaluate](https://huggingface.co/docs/evaluate/index) 库提供了一个简单的 [`accuracy`](https://huggingface.co/spaces/evaluate-metric/accuracy) 函数，您可以使用 [`evaluate.load`] 函数加载它（有关更多信息，请参阅此[快速入门](https://huggingface.co/docs/evaluate/a_quick_tour)）：
+
+```py
+>>> import numpy as np
+>>> import evaluate
+
+>>> metric = evaluate.load("accuracy")
+```
+在 `metric` 上调用 [`~evaluate.compute`] 来计算您的预测的准确性。在将预测传递给 `compute` 之前，您需要将预测转换为`logits`（请记住，所有 🤗 Transformers 模型都返回对`logits`）：
+
+```py
+>>> def compute_metrics(eval_pred):
+...     logits, labels = eval_pred
+...     predictions = np.argmax(logits, axis=-1)
+...     return metric.compute(predictions=predictions, references=labels)
+```
+
+如果您希望在微调过程中监视评估指标，请在您的训练参数中指定 `evaluation_strategy` 参数，以在每个`epoch`结束时展示评估指标：
+
+```py
+>>> from transformers import TrainingArguments, Trainer
+
+>>> training_args = TrainingArguments(output_dir="test_trainer", evaluation_strategy="epoch")
+```
+
+### 训练器
+
+创建一个包含您的模型、训练参数、训练和测试数据集以及评估函数的 [`Trainer`] 对象：
+
+
+```py
+>>> trainer = Trainer(
+...     model=model,
+...     args=training_args,
+...     train_dataset=small_train_dataset,
+...     eval_dataset=small_eval_dataset,
+...     compute_metrics=compute_metrics,
+... )
+```
+然后调用[`~transformers.Trainer.train`]以微调模型：
+
+```py
+>>> trainer.train()
+```
+</pt>
+<tf>
+<a id='keras'></a>
+
+<Youtube id="rnTGBy2ax1c"/>
+
+## 使用keras训练TensorFlow模型
+
+您也可以使用 Keras API 在 TensorFlow 中训练 🤗 Transformers 模型！
+
+### 加载用于 Keras 的数据
+
+当您希望使用 Keras API 训练 🤗 Transformers 模型时，您需要将您的数据集转换为 Keras 可理解的格式。如果您的数据集很小，您可以将整个数据集转换为NumPy数组并传递给 Keras。在进行更复杂的操作之前，让我们先尝试这种方法。
+
+首先，加载一个数据集。我们将使用 [GLUE benchmark](https://huggingface.co/datasets/glue) 中的 CoLA 数据集，因为它是一个简单的二元文本分类任务。现在只使用训练数据集。
+
+
+```py
+from datasets import load_dataset
+
+dataset = load_dataset("glue", "cola")
+dataset = dataset["train"]  # Just take the training split for now
+```
+接下来，加载一个`tokenizer`并将数据标记为 NumPy 数组。请注意，标签已经是由 0 和 1 组成的`list`，因此我们可以直接将其转换为 NumPy 数组而无需进行分词处理！
+
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+tokenized_data = tokenizer(dataset["sentence"], return_tensors="np", padding=True)
+# Tokenizer returns a BatchEncoding, but we convert that to a dict for Keras
+tokenized_data = dict(tokenized_data)
+
+labels = np.array(dataset["label"])  # Label is already an array of 0 and 1
+```
+最后，加载、[`compile`](https://keras.io/api/models/model_training_apis/#compile-method) 和 [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) 模型。请注意，Transformers 模型都有一个默认的与任务相关的损失函数，因此除非您希望自定义，否则无需指定一个损失函数：
+
+```py
+from transformers import TFAutoModelForSequenceClassification
+from tensorflow.keras.optimizers import Adam
+
+# Load and compile our model
+model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")
+# Lower learning rates are often better for fine-tuning transformers
+model.compile(optimizer=Adam(3e-5))  # No loss argument!
+
+model.fit(tokenized_data, labels)
+```
+
+<Tip>
+
+当您使用 `compile()` 编译模型时，无需传递损失参数！如果不指定损失参数，Hugging Face 模型会自动选择适合其任务和模型架构的损失函数。如果需要，您始终可以自己指定损失函数以覆盖默认配置。
+
+</Tip>
+
+这种方法对于较小的数据集效果很好，但对于较大的数据集，您可能会发现它开始变得有问题。为什么呢？因为分词后的数组和标签必须完全加载到内存中，而且由于 NumPy 无法处理“不规则”数组，因此每个分词后的样本长度都必须被填充到数据集中最长样本的长度。这将使您的数组变得更大，而所有这些`padding tokens`也会减慢训练速度！
+
+
+### 将数据加载为 tf.data.Dataset
+
+如果您想避免训练速度减慢，可以将数据加载为 `tf.data.Dataset`。虽然您可以自己编写自己的 `tf.data` 流水线，但我们有两种方便的方法来实现这一点：
+
+- [`~TFPreTrainedModel.prepare_tf_dataset`]：这是我们在大多数情况下推荐的方法。因为它是模型上的一个方法，它可以检查模型以自动确定哪些列可用作模型输入，并丢弃其他列以创建一个更简单、性能更好的数据集。
+- [`~datasets.Dataset.to_tf_dataset`]：这个方法更低级，但当您希望完全控制数据集的创建方式时非常有用，可以通过指定要包括的确切 `columns` 和 `label_cols` 来实现。
+
+在使用 [`~TFPreTrainedModel.prepare_tf_dataset`] 之前，您需要将`tokenizer`的输出添加到数据集作为列，如下面的代码示例所示：
+
+```py
+def tokenize_dataset(data):
+    # Keys of the returned dictionary will be added to the dataset as columns
+    return tokenizer(data["text"])
+
+
+dataset = dataset.map(tokenize_dataset)
+```
+请记住，默认情况下，Hugging Face 数据集存储在硬盘上，因此这不会增加您的内存使用！一旦列已经添加，您可以从数据集中流式的传输批次数据，并为每个批次添加`padding tokens`，这与为整个数据集添加`padding tokens`相比，大大减少了`padding tokens`的数量。
+
+```py
+>>> tf_dataset = model.prepare_tf_dataset(dataset["train"], batch_size=16, shuffle=True, tokenizer=tokenizer)
+```
+请注意，在上面的代码示例中，您需要将`tokenizer`传递给`prepare_tf_dataset`，以便它可以在加载批次时正确填充它们。如果数据集中的所有样本都具有相同的长度而且不需要填充，您可以跳过此参数。如果需要执行比填充样本更复杂的操作（例如，用于掩码语言模型的`tokens` 替换），则可以使用 `collate_fn` 参数，而不是传递一个函数来将样本列表转换为批次并应用任何所需的预处理。请查看我们的[示例](https://github.com/huggingface/transformers/tree/main/examples)或[笔记](https://huggingface.co/docs/transformers/notebooks)以了解此方法的实际操作。
+
+一旦创建了 `tf.data.Dataset`，您可以像以前一样编译和训练模型：
+
+```py
+model.compile(optimizer=Adam(3e-5))  # No loss argument!
+
+model.fit(tf_dataset)
+```
+
+</tf>
+</frameworkcontent>
+
+<a id='pytorch_native'></a>
+
+## 在原生 PyTorch 中训练
+
+<frameworkcontent>
+<pt>
+<Youtube id="Dh9CL8fyG80"/>
+
+[`Trainer`] 负责训练循环，允许您在一行代码中微调模型。对于喜欢编写自己训练循环的用户，您也可以在原生 PyTorch 中微调 🤗 Transformers 模型。
+
+现在，您可能需要重新启动您的`notebook`，或执行以下代码以释放一些内存：
+
+```py
+del model
+del trainer
+torch.cuda.empty_cache()
+```
+
+接下来，手动处理 `tokenized_dataset` 以准备进行训练。
+
+1. 移除 text 列，因为模型不接受原始文本作为输入：
+
+    ```py
+    >>> tokenized_datasets = tokenized_datasets.remove_columns(["text"])
+    ```
+
+2. 将 label 列重命名为 labels，因为模型期望参数的名称为 labels：
+
+    ```py
+    >>> tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
+    ```
+
+3. 设置数据集的格式以返回 PyTorch 张量而不是`lists`：
+
+    ```py
+    >>> tokenized_datasets.set_format("torch")
+    ```
+
+接着，创建一个先前展示的数据集的较小子集，以加速微调过程
+
+```py
+>>> small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
+>>> small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))
+```
+
+### DataLoader
+
+您的训练和测试数据集创建一个`DataLoader`类，以便可以迭代处理数据批次
+
+```py
+>>> from torch.utils.data import DataLoader
+
+>>> train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8)
+>>> eval_dataloader = DataLoader(small_eval_dataset, batch_size=8)
+```
+
+加载您的模型，并指定期望的标签数量：
+
+```py
+>>> from transformers import AutoModelForSequenceClassification
+
+>>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)
+```
+
+### Optimizer and learning rate scheduler
+
+创建一个`optimizer`和`learning rate scheduler`以进行模型微调。让我们使用 PyTorch 中的 [AdamW](https://pytorch.org/docs/stable/generated/torch.optim.AdamW.html) 优化器：
+
+```py
+>>> from torch.optim import AdamW
+
+>>> optimizer = AdamW(model.parameters(), lr=5e-5)
+```
+
+创建来自 [`Trainer`] 的默认`learning rate scheduler`：
+
+
+```py
+>>> from transformers import get_scheduler
+
+>>> num_epochs = 3
+>>> num_training_steps = num_epochs * len(train_dataloader)
+>>> lr_scheduler = get_scheduler(
+...     name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
+... )
+```
+
+最后，指定 `device` 以使用 GPU（如果有的话）。否则，使用 CPU 进行训练可能需要几个小时，而不是几分钟。
+
+
+```py
+>>> import torch
+
+>>> device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+>>> model.to(device)
+```
+
+<Tip>
+
+如果没有 GPU，可以通过notebook平台如 [Colaboratory](https://colab.research.google.com/) 或 [SageMaker StudioLab](https://studiolab.sagemaker.aws/) 来免费获得云端GPU使用。
+
+</Tip>
+
+现在您已经准备好训练了！🥳
+
+### 训练循环
+
+为了跟踪训练进度，使用 [tqdm](https://tqdm.github.io/) 库来添加一个进度条，显示训练步数的进展：
+
+```py
+>>> from tqdm.auto import tqdm
+
+>>> progress_bar = tqdm(range(num_training_steps))
+
+>>> model.train()
+>>> for epoch in range(num_epochs):
+...     for batch in train_dataloader:
+...         batch = {k: v.to(device) for k, v in batch.items()}
+...         outputs = model(**batch)
+...         loss = outputs.loss
+...         loss.backward()
+
+...         optimizer.step()
+...         lr_scheduler.step()
+...         optimizer.zero_grad()
+...         progress_bar.update(1)
+```
+
+### 评估
+
+就像您在 [`Trainer`] 中添加了一个评估函数一样，当您编写自己的训练循环时，您需要做同样的事情。但与在每个`epoch`结束时计算和展示指标不同，这一次您将使用 [`~evaluate.add_batch`] 累积所有批次，并在最后计算指标。
+
+```py
+>>> import evaluate
+
+>>> metric = evaluate.load("accuracy")
+>>> model.eval()
+>>> for batch in eval_dataloader:
+...     batch = {k: v.to(device) for k, v in batch.items()}
+...     with torch.no_grad():
+...         outputs = model(**batch)
+
+...     logits = outputs.logits
+...     predictions = torch.argmax(logits, dim=-1)
+...     metric.add_batch(predictions=predictions, references=batch["labels"])
+
+>>> metric.compute()
+```
+</pt>
+</frameworkcontent>
+
+<a id='additional-resources'></a>
+
+## 附加资源
+
+更多微调例子可参考如下链接：
+
+- [🤗 Transformers 示例](https://github.com/huggingface/transformers/tree/main/examples) 包含用于在 PyTorch 和 TensorFlow 中训练常见自然语言处理任务的脚本。
+
+- [🤗 Transformers 笔记](notebooks) 包含针对特定任务在 PyTorch 和 TensorFlow 中微调模型的各种`notebook`。
--- a/docs/source/zh/transformers_agents.md
+++ b/docs/source/zh/transformers_agents.md
@ -0,0 +1,285 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Transformers Agents
+
+<Tip warning={true}>
+
+`Transformers Agents`是一个实验性的随时可能发生变化的API。由于API或底层模型可能发生变化，`agents`返回的结果也会有所不同。
+
+</Tip>
+
+Transformers版本`v4.29.0`基于`tools`和`agents`概念构建。您可以在[此Colab链接](https://colab.research.google.com/drive/1c7MHD-T1forUPGcC_jlwsIptOzpG3hSj)中进行测试。
+
+简而言之，它在`Transformers`之上提供了一个自然语言API：我们定义了一组经过筛选的`tools`，并设计了一个`agents`来解读自然语言并使用这些工具。它具有很强的可扩展性；我们筛选了一些相关的`tools`，但我们将向您展示如何通过社区开发的`tool`轻松地扩展系统。
+
+让我们从一些可以通过这个新API实现的示例开始。在处理多模态任务时它尤其强大，因此让我们快速试着生成图像并大声朗读文本。
+
+
+```py
+agent.run("Caption the following image", image=image)
+```
+
+| **输入**                                                                                                                      | **输出**                            |
+|-----------------------------------------------------------------------------------------------------------------------------|-----------------------------------|
+| <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/beaver.png" width=200> | A beaver is swimming in the water |
+
+---
+
+```py
+agent.run("Read the following text out loud", text=text)
+```
+| **输入**                            | **输出**                                                                                                                                                                                                               |
+|-----------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| A beaver is swimming in the water | <audio controls><source src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tts_example.wav" type="audio/wav"> your browser does not support the audio element. </audio> 
+
+---
+
+```py
+agent.run(
+    "In the following `document`, where will the TRRF Scientific Advisory Council Meeting take place?",
+    document=document,
+)
+```
+| **输入**                                                                                                                   | **输出**     |
+|-----------------------------------------------------------------------------------------------------------------------------|----------------|
+| <img src="https://datasets-server.huggingface.co/assets/hf-internal-testing/example-documents/--/hf-internal-testing--example-documents/test/0/image/image.jpg" width=200> | ballroom foyer |
+
+## 快速入门
+
+要使用 `agent.run`，您需要实例化一个`agent`，它是一个大型语言模型（LLM）。我们支持OpenAI模型以及来自BigCode和OpenAssistant的开源替代方案。OpenAI模型性能更好（但需要您拥有OpenAI API密钥，因此无法免费使用），Hugging Face为BigCode和OpenAssistant模型提供了免费访问端点。
+
+一开始请安装`agents`附加模块，以安装所有默认依赖项。
+
+```bash
+pip install transformers[agents]
+```
+
+要使用OpenAI模型，您可以在安装`openai`依赖项后实例化一个`OpenAiAgent`：
+
+```bash
+pip install openai
+```
+
+
+```py
+from transformers import OpenAiAgent
+
+agent = OpenAiAgent(model="text-davinci-003", api_key="<your_api_key>")
+```
+
+要使用BigCode或OpenAssistant，请首先登录以访问Inference API：
+
+```py
+from huggingface_hub import login
+
+login("<YOUR_TOKEN>")
+```
+
+然后，实例化`agent`：
+
+```py
+from transformers import HfAgent
+
+# Starcoder
+agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoder")
+# StarcoderBase
+# agent = HfAgent("https://api-inference.huggingface.co/models/bigcode/starcoderbase")
+# OpenAssistant
+# agent = HfAgent(url_endpoint="https://api-inference.huggingface.co/models/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5")
+```
+
+此示例使用了目前Hugging Face免费提供的推理API。如果你有自己的推理端点用于此模型（或其他模型），你可以用你的URL替换上面的URL。
+
+<Tip>
+
+StarCoder和OpenAssistant可以免费使用，并且在简单任务上表现出色。然而，当处理更复杂的提示时就不再有效。如果你遇到这样的问题，我们建议尝试使用OpenAI模型，尽管遗憾的是它不是开源的，但它在目前情况下表现更好。
+
+</Tip>
+
+现在，您已经可以开始使用了！让我们深入了解您现在可以使用的两个API。
+
+### 单次执行(run)
+
+单次执行方法是使用`agent`的 `~Agent.run`：
+
+```py
+agent.run("Draw me a picture of rivers and lakes.")
+```
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200>
+
+它会自动选择适合您要执行的任务的`tool`（或`tools`），并以适当的方式运行它们。它可以在同一指令中执行一个或多个任务（尽管您的指令越复杂，`agent`失败的可能性就越大）。
+
+
+```py
+agent.run("Draw me a picture of the sea then transform the picture to add an island")
+```
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/sea_and_island.png" width=200>
+
+<br/>
+
+每个 [`~Agent.run`] 操作都是独立的，因此您可以多次连续运行 [`~Agent.run`]并执行不同的任务。
+
+请注意，您的 `agent` 只是一个大型语言模型，因此您略有变化的提示可能会产生完全不同的结果。重要的是尽可能清晰地解释您要执行的任务。我们在[这里](../en/custom_tools#writing-good-user-inputs)更深入地讨论了如何编写良好的提示。
+
+如果您想在多次执行之间保持同一状态或向`agent`传递非文本对象，可以通过指定`agent`要使用的变量来实现。例如，您可以生成有关河流和湖泊的第一幅图像，并要求模型通过执行以下操作向该图片添加一个岛屿：
+
+```python
+picture = agent.run("Generate a picture of rivers and lakes.")
+updated_picture = agent.run("Transform the image in `picture` to add an island to it.", picture=picture)
+```
+
+<Tip>
+
+当模型无法理解您的请求和库中的工具时，这可能会有所帮助。例如：
+
+```py
+agent.run("Draw me the picture of a capybara swimming in the sea")
+```
+
+在这种情况下，模型可以以两种方式理解您的请求：
+- 使用`text-to-image` 生成在大海中游泳的大水獭
+- 或者，使用`text-to-image`生成大水獭，然后使用`image-transformation`工具使其在大海中游泳
+
+如果您想强制使用第一种情景，可以通过将提示作为参数传递给它来实现：
+
+
+```py
+agent.run("Draw me a picture of the `prompt`", prompt="a capybara swimming in the sea")
+```
+
+</Tip>
+
+
+### 基于交流的执行 (chat)
+
+基于交流的执行（chat）方式是使用 [`~Agent.chat`]：
+
+```py
+agent.chat("Generate a picture of rivers and lakes")
+```
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes.png" width=200> 
+
+```py
+agent.chat("Transform the picture so that there is a rock in there")
+```
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/rivers_and_lakes_and_beaver.png" width=200>
+
+<br/>
+
+当您希望在不同指令之间保持同一状态时，这会是一个有趣的方法。它更适合用于单个指令，而不是复杂的多步指令（`~Agent.run` 方法更适合处理这种情况）。
+
+这种方法也可以接受参数，以便您可以传递非文本类型或特定提示。
+
+### ⚠️ 远程执行
+
+出于演示目的以便适用于所有设置，我们为发布版本的少数默认工具创建了远程执行器。这些工具是使用推理终端（inference endpoints）创建的。
+
+目前我们已将其关闭，但为了了解如何自行设置远程执行器工具，我们建议阅读[自定义工具指南](./custom_tools)。
+
+### 这里发生了什么？什么是`tools`，什么是`agents`？
+
+
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/diagram.png">
+
+
+#### Agents
+
+这里的`Agents`是一个大型语言模型，我们通过提示它以访问特定的工具集。
+
+大型语言模型在生成小代码示例方面表现出色，因此这个API利用这一特点，通过提示LLM生成一个使用`tools`集合的小代码示例。然后，根据您给`Agents`的任务和`tools`的描述来完成此提示。这种方式让它能够访问工具的文档，特别是它们的期望输入和输出，以生成相关的代码。
+
+#### Tools
+
+`Tools`非常简单：它们是有名称和描述的单个函数。然后，我们使用这些`tools`的描述来提示代理。通过提示，我们向`agent`展示如何使用`tool`来执行查询语言中请求的操作。
+
+这是使用全新`tools`而不是`pipelines`，因为`agent`编写的代码更好，具有非常原子化的`tools`。`pipelines`经常被重构，并且通常将多个任务合并为一个。`tools`旨在专注于一个非常简单的任务。
+
+#### 代码执行？
+
+然后，这段代码基于`tools`的输入被我们的小型Python解释器执行。我们听到你在后面大声呼喊“任意代码执行！”，但让我们解释为什么情况并非如此。
+
+只能您提供的`tools`和打印函数可以被执行，因此您已经受到了执行的限制。如果仅限于 Hugging Face 工具，那么您应该是安全的。
+
+然后，我们不允许任何属性查找或导入（无论如何都不需要将输入/输出传递给一小组函数），因此所有最明显的攻击（并且您需要提示LLM无论如何输出它们）不应该是一个问题。如果你想超级安全，你可以使用附加参数 return_code=True 执行 run() 方法，在这种情况下，`agent`将只返回要执行的代码，你可以决定是否执行。
+
+如果`agent`生成的代码存在任何尝试执行非法操作的行为，或者代码中出现了常规Python错误，执行将停止。
+
+
+### 一组经过精心筛选的`tools`
+
+我们确定了一组可以赋予这些`agent`强大能力的`tools`。以下是我们在`transformers`中集成的`tools`的更新列表：
+
+- **文档问答**：给定一个图像格式的文档（例如PDF），回答该文档上的问题（[Donut](../en/model_doc/donut)）
+- **文本问答**：给定一段长文本和一个问题，回答文本中的问题（[Flan-T5](../en/model_doc/flan-t5)）
+- **无条件图像字幕**：为图像添加字幕！（[BLIP](../en/model_doc/blip)）
+- **图像问答**：给定一张图像，回答该图像上的问题（[VILT](../en/model_doc/vilt)）
+- **图像分割**：给定一张图像和一个提示，输出该提示的分割掩模（[CLIPSeg](../en/model_doc/clipseg)）
+- **语音转文本**：给定一个人说话的音频录音，将演讲内容转录为文本（[Whisper](../en/model_doc/whisper)）
+- **文本转语音**：将文本转换为语音（[SpeechT5](../en/model_doc/speecht5)）
+- **Zero-Shot文本分类**：给定一个文本和一个标签列表，确定文本最符合哪个标签（[BART](../en/model_doc/bart)）
+- **文本摘要**：总结长文本为一两句话（[BART](../en/model_doc/bart)）
+- **翻译**：将文本翻译为指定语言（[NLLB](../en/model_doc/nllb)）
+
+这些`tools`已在transformers中集成，并且也可以手动使用，例如：
+
+```py
+from transformers import load_tool
+
+tool = load_tool("text-to-speech")
+audio = tool("This is a text to speech tool")
+```
+
+### 自定义工具
+
+尽管我们确定了一组经过筛选的`tools`，但我们坚信，此实现提供的主要价值在于能够快速创建和共享自定义`tool`。
+
+通过将工具的代码上传到Hugging Face空间或模型repository，您可以直接通过`agent`使用`tools`。我们已经添加了一些**与transformers无关**的`tools`到[`huggingface-tools`组织](https://huggingface.co/huggingface-tools)中：
+
+- **文本下载器**：从Web URL下载文本
+- **文本到图像**：根据提示生成图像，利用`stable diffusion`
+- **图像转换**：根据初始图像和提示修改图像，利用`instruct pix2pix stable diffusion`
+- **文本到视频**：根据提示生成小视频，利用`damo-vilab`
+
+从一开始就一直在使用的文本到图像`tool`是一个远程`tool `，位于[*huggingface-tools/text-to-image*](https://huggingface.co/spaces/huggingface-tools/text-to-image)！我们将继续在此组织和其他组织上发布此类`tool`，以进一步增强此实现。
+
+`agents`默认可以访问存储在[`huggingface-tools`](https://huggingface.co/huggingface-tools)上的`tools`。我们将在后续指南中解释如何编写和共享自定义`tools`，以及如何利用Hub上存在的任何自定义`tools`。
+
+### 代码生成
+
+到目前为止，我们已经展示了如何使用`agents`来为您执行操作。但是，`agents`仅使用非常受限Python解释器执行的代码。如果您希望在不同的环境中使用生成的代码，可以提示`agents`返回代码，以及`tools`的定义和准确的导入信息。
+
+例如，以下指令
+
+```python
+agent.run("Draw me a picture of rivers and lakes", return_code=True)
+```
+
+返回以下代码
+
+```python
+from transformers import load_tool
+
+image_generator = load_tool("huggingface-tools/text-to-image")
+
+image = image_generator(prompt="rivers and lakes")
+```
+
+然后你就可以调整并执行代码
--- a/examples/flax/image-captioning/run_image_captioning_flax.py
+++ b/examples/flax/image-captioning/run_image_captioning_flax.py
@ -195,7 +195,7 @@ class ModelArguments:
    use_auth_token: bool = field(
        default=None,
        metadata={
-            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )
    trust_remote_code: bool = field(
@ -407,7 +407,10 @@ def main():
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if model_args.use_auth_token is not None:
-        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        warnings.warn(
+            "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
+            FutureWarning,
+        )
        if model_args.token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        model_args.token = model_args.use_auth_token
--- a/examples/flax/language-modeling/run_bart_dlm_flax.py
+++ b/examples/flax/language-modeling/run_bart_dlm_flax.py
@ -181,7 +181,7 @@ class ModelArguments:
    use_auth_token: bool = field(
        default=None,
        metadata={
-            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )

@ -471,7 +471,10 @@ def main():
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if model_args.use_auth_token is not None:
-        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        warnings.warn(
+            "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
+            FutureWarning,
+        )
        if model_args.token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        model_args.token = model_args.use_auth_token
--- a/examples/flax/language-modeling/run_clm_flax.py
+++ b/examples/flax/language-modeling/run_clm_flax.py
@ -182,7 +182,7 @@ class ModelArguments:
    use_auth_token: bool = field(
        default=None,
        metadata={
-            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )
    trust_remote_code: bool = field(
@ -352,7 +352,10 @@ def main():
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if model_args.use_auth_token is not None:
-        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        warnings.warn(
+            "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
+            FutureWarning,
+        )
        if model_args.token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        model_args.token = model_args.use_auth_token
--- a/examples/flax/language-modeling/run_mlm_flax.py
+++ b/examples/flax/language-modeling/run_mlm_flax.py
@ -187,7 +187,7 @@ class ModelArguments:
    use_auth_token: bool = field(
        default=None,
        metadata={
-            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )
    trust_remote_code: bool = field(
@ -395,7 +395,10 @@ def main():
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if model_args.use_auth_token is not None:
-        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        warnings.warn(
+            "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
+            FutureWarning,
+        )
        if model_args.token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        model_args.token = model_args.use_auth_token
--- a/examples/flax/language-modeling/run_t5_mlm_flax.py
+++ b/examples/flax/language-modeling/run_t5_mlm_flax.py
@ -181,7 +181,7 @@ class ModelArguments:
    use_auth_token: bool = field(
        default=None,
        metadata={
-            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )

@ -512,7 +512,10 @@ def main():
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if model_args.use_auth_token is not None:
-        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        warnings.warn(
+            "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
+            FutureWarning,
+        )
        if model_args.token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        model_args.token = model_args.use_auth_token
--- a/examples/flax/question-answering/run_qa.py
+++ b/examples/flax/question-answering/run_qa.py
@ -62,7 +62,7 @@ from transformers.utils import check_min_version, send_example_telemetry
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.35.0.dev0")
+check_min_version("4.35.0")

 Array = Any
 Dataset = datasets.arrow_dataset.Dataset
@ -168,7 +168,7 @@ class ModelArguments:
    use_auth_token: bool = field(
        default=None,
        metadata={
-            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )
    trust_remote_code: bool = field(
@ -456,7 +456,10 @@ def main():
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if model_args.use_auth_token is not None:
-        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        warnings.warn(
+            "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
+            FutureWarning,
+        )
        if model_args.token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        model_args.token = model_args.use_auth_token
--- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py
+++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py
@ -60,7 +60,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risk.
-check_min_version("4.35.0.dev0")
+check_min_version("4.35.0")

 require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recogintion/requirements.txt")

--- a/examples/flax/summarization/run_summarization_flax.py
+++ b/examples/flax/summarization/run_summarization_flax.py
@ -201,7 +201,7 @@ class ModelArguments:
    use_auth_token: bool = field(
        default=None,
        metadata={
-            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )
    trust_remote_code: bool = field(
@ -435,7 +435,10 @@ def main():
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if model_args.use_auth_token is not None:
-        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        warnings.warn(
+            "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
+            FutureWarning,
+        )
        if model_args.token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        model_args.token = model_args.use_auth_token
--- a/examples/flax/text-classification/run_flax_glue.py
+++ b/examples/flax/text-classification/run_flax_glue.py
@ -55,7 +55,7 @@ from transformers.utils import check_min_version, send_example_telemetry

 logger = logging.getLogger(__name__)
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.35.0.dev0")
+check_min_version("4.35.0")

 Array = Any
 Dataset = datasets.arrow_dataset.Dataset
@ -114,7 +114,7 @@ class ModelArguments:
    use_auth_token: bool = field(
        default=None,
        metadata={
-            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )
    trust_remote_code: bool = field(
@ -339,7 +339,10 @@ def main():
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if model_args.use_auth_token is not None:
-        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        warnings.warn(
+            "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
+            FutureWarning,
+        )
        if model_args.token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        model_args.token = model_args.use_auth_token
--- a/examples/flax/token-classification/run_flax_ner.py
+++ b/examples/flax/token-classification/run_flax_ner.py
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version

 logger = logging.getLogger(__name__)
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.35.0.dev0")
+check_min_version("4.35.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

@ -162,7 +162,7 @@ class ModelArguments:
    use_auth_token: bool = field(
        default=None,
        metadata={
-            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )
    trust_remote_code: bool = field(
@ -395,7 +395,10 @@ def main():
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if model_args.use_auth_token is not None:
-        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        warnings.warn(
+            "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
+            FutureWarning,
+        )
        if model_args.token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        model_args.token = model_args.use_auth_token
--- a/examples/flax/vision/run_image_classification.py
+++ b/examples/flax/vision/run_image_classification.py
@ -172,7 +172,7 @@ class ModelArguments:
    use_auth_token: bool = field(
        default=None,
        metadata={
-            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )
    trust_remote_code: bool = field(
@ -275,7 +275,10 @@ def main():
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if model_args.use_auth_token is not None:
-        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        warnings.warn(
+            "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
+            FutureWarning,
+        )
        if model_args.token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        model_args.token = model_args.use_auth_token
--- a/examples/pytorch/README.md
+++ b/examples/pytorch/README.md
@ -223,7 +223,7 @@ import wandb
 wandb.login()
 ```

-To enable logging to W&B, include `"wandb"` in the `report_to` of your `TrainingArguments` or script. Or just pass along `--report_to all` if you have `wandb` installed.
+To enable logging to W&B, include `"wandb"` in the `report_to` of your `TrainingArguments` or script. Or just pass along `--report_to_all` if you have `wandb` installed.

 Whenever you use `Trainer` or `TFTrainer` classes, your losses, evaluation metrics, model topology and gradients (for `Trainer` only) will automatically be logged.

--- a/examples/pytorch/audio-classification/run_audio_classification.py
+++ b/examples/pytorch/audio-classification/run_audio_classification.py
@ -45,7 +45,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.35.0.dev0")
+check_min_version("4.35.0")

 require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt")

@ -164,7 +164,7 @@ class ModelArguments:
    use_auth_token: bool = field(
        default=None,
        metadata={
-            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )
    trust_remote_code: bool = field(
@ -215,7 +215,10 @@ def main():
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if model_args.use_auth_token is not None:
-        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        warnings.warn(
+            "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
+            FutureWarning,
+        )
        if model_args.token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        model_args.token = model_args.use_auth_token
@ -243,7 +246,7 @@ def main():

    # Log on each process the small summary:
    logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu} "
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")
--- a/examples/pytorch/contrastive-image-text/run_clip.py
+++ b/examples/pytorch/contrastive-image-text/run_clip.py
@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.35.0.dev0")
+check_min_version("4.35.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt")

@ -99,7 +99,7 @@ class ModelArguments:
    use_auth_token: bool = field(
        default=None,
        metadata={
-            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )
    trust_remote_code: bool = field(
@ -253,7 +253,10 @@ def main():
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if model_args.use_auth_token is not None:
-        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        warnings.warn(
+            "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
+            FutureWarning,
+        )
        if model_args.token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        model_args.token = model_args.use_auth_token
@ -281,7 +284,7 @@ def main():

    # Log on each process the small summary:
    logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")
--- a/examples/pytorch/image-classification/run_image_classification.py
+++ b/examples/pytorch/image-classification/run_image_classification.py
@ -28,6 +28,7 @@ from PIL import Image
 from torchvision.transforms import (
    CenterCrop,
    Compose,
+    Lambda,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
@ -56,7 +57,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.35.0.dev0")
+check_min_version("4.35.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")

@ -155,7 +156,7 @@ class ModelArguments:
    use_auth_token: bool = field(
        default=None,
        metadata={
-            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )
    trust_remote_code: bool = field(
@ -194,7 +195,10 @@ def main():
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if model_args.use_auth_token is not None:
-        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        warnings.warn(
+            "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
+            FutureWarning,
+        )
        if model_args.token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        model_args.token = model_args.use_auth_token
@ -222,7 +226,7 @@ def main():

    # Log on each process the small summary:
    logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")
@ -325,7 +329,11 @@ def main():
        size = image_processor.size["shortest_edge"]
    else:
        size = (image_processor.size["height"], image_processor.size["width"])
-    normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
+    normalize = (
+        Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
+        if hasattr(image_processor, "image_mean") and hasattr(image_processor, "image_std")
+        else Lambda(lambda x: x)
+    )
    _train_transforms = Compose(
        [
            RandomResizedCrop(size),
--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@ -32,6 +32,7 @@ from torch.utils.data import DataLoader
 from torchvision.transforms import (
    CenterCrop,
    Compose,
+    Lambda,
    Normalize,
    RandomHorizontalFlip,
    RandomResizedCrop,
@ -47,7 +48,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.35.0.dev0")
+check_min_version("4.35.0")

 logger = get_logger(__name__)

@ -331,7 +332,11 @@ def main():
        size = image_processor.size["shortest_edge"]
    else:
        size = (image_processor.size["height"], image_processor.size["width"])
-    normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
+    normalize = (
+        Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
+        if hasattr(image_processor, "image_mean") and hasattr(image_processor, "image_std")
+        else Lambda(lambda x: x)
+    )
    train_transforms = Compose(
        [
            RandomResizedCrop(size),
--- a/examples/pytorch/image-pretraining/run_mae.py
+++ b/examples/pytorch/image-pretraining/run_mae.py
@ -44,7 +44,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.35.0.dev0")
+check_min_version("4.35.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

@ -146,7 +146,7 @@ class ModelArguments:
    use_auth_token: bool = field(
        default=None,
        metadata={
-            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )
    mask_ratio: float = field(
@ -183,7 +183,10 @@ def main():
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if model_args.use_auth_token is not None:
-        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        warnings.warn(
+            "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
+            FutureWarning,
+        )
        if model_args.token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        model_args.token = model_args.use_auth_token
@ -211,7 +214,7 @@ def main():

    # Log on each process the small summary:
    logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")
--- a/examples/pytorch/image-pretraining/run_mim.py
+++ b/examples/pytorch/image-pretraining/run_mim.py
@ -49,7 +49,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.35.0.dev0")
+check_min_version("4.35.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

@ -166,7 +166,7 @@ class ModelArguments:
    use_auth_token: bool = field(
        default=None,
        metadata={
-            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )
    trust_remote_code: bool = field(
@ -257,7 +257,10 @@ def main():
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if model_args.use_auth_token is not None:
-        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        warnings.warn(
+            "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
+            FutureWarning,
+        )
        if model_args.token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        model_args.token = model_args.use_auth_token
@ -285,7 +288,7 @@ def main():

    # Log on each process the small summary:
    logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")
--- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py
+++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py
@ -54,7 +54,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.35.0.dev0")
+check_min_version("4.35.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

@ -200,7 +200,7 @@ def parse_args():
        "--use_auth_token",
        type=bool,
        default=None,
-        help="The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`.",
+        help="The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
    )
    parser.add_argument(
        "--trust_remote_code",
@ -385,7 +385,10 @@ def main():
    args = parse_args()

    if args.use_auth_token is not None:
-        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        warnings.warn(
+            "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
+            FutureWarning,
+        )
        if args.token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        args.token = args.use_auth_token
--- a/examples/pytorch/language-modeling/run_clm.py
+++ b/examples/pytorch/language-modeling/run_clm.py
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.35.0.dev0")
+check_min_version("4.35.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

@ -124,7 +124,7 @@ class ModelArguments:
    use_auth_token: bool = field(
        default=None,
        metadata={
-            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token`."
+            "help": "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead."
        },
    )
    trust_remote_code: bool = field(
@ -256,7 +256,10 @@ def main():
        model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    if model_args.use_auth_token is not None:
-        warnings.warn("The `use_auth_token` argument is deprecated and will be removed in v4.34.", FutureWarning)
+        warnings.warn(
+            "The `use_auth_token` argument is deprecated and will be removed in v4.34. Please use `token` instead.",
+            FutureWarning,
+        )
        if model_args.token is not None:
            raise ValueError("`token` and `use_auth_token` are both specified. Please set only the argument `token`.")
        model_args.token = model_args.use_auth_token
@ -285,7 +288,7 @@ def main():

    # Log on each process the small summary:
    logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
+        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}, "
        + f"distributed training: {training_args.parallel_mode.value == 'distributed'}, 16-bits training: {training_args.fp16}"
    )
    logger.info(f"Training/evaluation parameters {training_args}")
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.35.0.dev0")
+check_min_version("4.35.0")

 logger = get_logger(__name__)

--- a/Show More
+++ b/Show More