Compare commits

..

4 Commits

Author SHA1 Message Date
54acb92c7d run separate reports 2023-10-27 10:21:46 +02:00
32a3ba4670 run separate reports 2023-10-27 10:11:31 +02:00
6da8d0bf24 separate reports 2023-10-27 10:04:49 +02:00
289095ca61 fix slack report for doctest 2023-10-24 17:07:30 +02:00
174 changed files with 545 additions and 6193 deletions

View File

@ -127,7 +127,6 @@ class CircleCIJob:
},
]
steps.extend([{"run": l} for l in self.install_steps])
steps.extend([{"run": "pip install pytest-subtests"}])
steps.append(
{
"save_cache": {

View File

@ -15,7 +15,7 @@ jobs:
commit_sha: ${{ github.sha }}
package: transformers
notebook_folder: transformers_doc
languages: de en es fr it ko pt zh ja te
languages: de en es fr it ko pt zh ja
secrets:
token: ${{ secrets.HUGGINGFACE_PUSH }}
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}

View File

@ -14,4 +14,4 @@ jobs:
commit_sha: ${{ github.event.pull_request.head.sha }}
pr_number: ${{ github.event.number }}
package: transformers
languages: de en es fr it ko pt zh ja te
languages: de en es fr it ko pt zh ja

View File

@ -3,7 +3,7 @@ name: Doctests
on:
push:
branches:
- doctest*
- run_truncate
repository_dispatch:
schedule:
- cron: "17 2 * * *"
@ -20,7 +20,7 @@ env:
jobs:
run_doctests:
runs-on: [single-gpu, nvidia-gpu, t4, doctest-ci]
runs-on: [single-gpu, nvidia-gpu, doctest-ci, yih-dar-shieh-debug-doctest]
container:
image: huggingface/transformers-all-latest-gpu
options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -49,7 +49,7 @@ jobs:
- name: Run doctests
run: |
python3 -m pytest -v --make-reports doc_tests_gpu --doctest-modules $(cat doc_tests.txt) -sv --doctest-continue-on-failure --doctest-glob="*.md"
python3 -m pytest -v --make-reports doc_tests_gpu --doctest-modules docs/source/en/model_doc/code_llama.md docs/source/en/generation_strategies.md src/transformers/generation/configuration_utils.py src/transformers/generation/logits_process.py::transformers.generation.logits_process.ExponentialDecayLengthPenalty -sv --doctest-continue-on-failure --doctest-glob="*.md"
- name: Failure short reports
if: ${{ failure() }}

View File

@ -1,25 +0,0 @@
name: Self-hosted runner (AMD mi210 CI caller)
on:
workflow_run:
workflows: ["Self-hosted runner (push-caller)"]
branches: ["main"]
types: [completed]
push:
branches:
- run_amd_push_ci_caller*
paths:
- "src/**"
- "tests/**"
- ".github/**"
- "templates/**"
- "utils/**"
jobs:
run_amd_ci:
name: AMD mi210
if: (cancelled() != true) && ((github.event_name != 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
uses: ./.github/workflows/self-push-amd.yml
with:
gpu_flavor: mi210
secrets: inherit

View File

@ -1,25 +0,0 @@
name: Self-hosted runner (AMD mi250 CI caller)
on:
workflow_run:
workflows: ["Self-hosted runner (push-caller)"]
branches: ["main"]
types: [completed]
push:
branches:
- run_amd_push_ci_caller*
paths:
- "src/**"
- "tests/**"
- ".github/**"
- "templates/**"
- "utils/**"
jobs:
run_amd_ci:
name: AMD mi250
if: (cancelled() != true) && ((github.event_name != 'schedule') || ((github.event_name == 'push') && startsWith(github.ref_name, 'run_amd_push_ci_caller')))
uses: ./.github/workflows/self-push-amd.yml
with:
gpu_flavor: mi250
secrets: inherit

View File

@ -1,11 +1,21 @@
name: Self-hosted runner AMD GPU (push)
on:
workflow_call:
inputs:
gpu_flavor:
required: true
type: string
workflow_run:
workflows: ["Self-hosted runner (push-caller)"]
branches: ["main"]
types: [completed]
push:
branches:
- ci_*
- ci-*
paths:
- "src/**"
- "tests/**"
- ".github/**"
- "templates/**"
- "utils/**"
repository_dispatch:
env:
HF_HOME: /mnt/cache
@ -35,7 +45,8 @@ jobs:
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
gpu_flavor: [mi210]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
container:
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -54,7 +65,8 @@ jobs:
strategy:
matrix:
machine_type: [single-gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
gpu_flavor: [mi210]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
container:
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -152,7 +164,8 @@ jobs:
matrix:
folders: ${{ fromJson(needs.setup_gpu.outputs.matrix) }}
machine_type: [single-gpu, multi-gpu]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ inputs.gpu_flavor }}']
gpu_flavor: [mi210]
runs-on: [self-hosted, docker-gpu, amd-gpu, '${{ matrix.machine_type }}', '${{ matrix.gpu_flavor }}']
container:
image: huggingface/transformers-pytorch-amd-gpu-push-ci # <--- We test only for PyTorch for now
options: --device /dev/kfd --device /dev/dri --env HIP_VISIBLE_DEVICES --env ROCR_VISIBLE_DEVICES --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
@ -308,7 +321,7 @@ jobs:
CI_SLACK_CHANNEL_DUMMY_TESTS: ${{ secrets.CI_SLACK_CHANNEL_DUMMY_TESTS }}
CI_SLACK_REPORT_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID_AMD }}
ACCESS_REPO_INFO_TOKEN: ${{ secrets.ACCESS_REPO_INFO_TOKEN }}
CI_EVENT: Push CI (AMD) - ${{ inputs.gpu_flavor }}
CI_EVENT: push
CI_TITLE_PUSH: ${{ github.event.head_commit.message }}
CI_TITLE_WORKFLOW_RUN: ${{ github.event.workflow_run.head_commit.message }}
CI_SHA: ${{ env.CI_SHA }}

View File

@ -363,7 +363,6 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar. Released with the paper [blog post](https://www.adept.ai/blog/fuyu-8b)
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://openai.com/research/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.

View File

@ -338,7 +338,6 @@ Número actual de puntos de control: ![](https://img.shields.io/endpoint?url=htt
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar. Released with the paper [blog post](https://www.adept.ai/blog/fuyu-8b)
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.

View File

@ -85,13 +85,13 @@ checkpoint: जाँच बिंदु
🤗 Transformers 100 से अधिक भाषाओं में पाठ वर्गीकरण, सूचना निष्कर्षण, प्रश्न उत्तर, सारांशीकरण, अनुवाद, पाठ निर्माण का समर्थन करने के लिए हजारों पूर्व-प्रशिक्षित मॉडल प्रदान करता है। इसका उद्देश्य सबसे उन्नत एनएलपी तकनीक को सभी के लिए सुलभ बनाना है।
🤗 Transformers त्वरित डाउनलोड और उपयोग के लिए एक एपीआई प्रदान करता है, जिससे आप किसी दिए गए पाठ पर एक पूर्व-प्रशिक्षित मॉडल ले सकते हैं, इसे अपने डेटासेट पर ठीक कर सकते हैं और इसे [मॉडल हब](https://huggingface.co/models) के माध्यम से समुदाय के साथ साझा कर सकते हैं। इसी समय, प्रत्येक परिभाषित पायथन मॉड्यूल पूरी तरह से स्वतंत्र है, जो संशोधन और तेजी से अनुसंधान प्रयोगों के लिए सुविधाजनक है।
🤗 Transformers त्वरित डाउनलोड और उपयोग के लिए एक एपीआई प्रदान करता है, जिससे आप किसी दिए गए पाठ पर एक पूर्व-प्रशिक्षित मॉडल ले सकते हैं, इसे अपने डेटासेट पर ठीक कर सकते हैं और इसे [मॉडल हब] (https://huggingface.co/models) के माध्यम से समुदाय के साथ साझा कर सकते हैं। ) . इसी समय, प्रत्येक परिभाषित पायथन मॉड्यूल पूरी तरह से स्वतंत्र है, जो संशोधन और तेजी से अनुसंधान प्रयोगों के लिए सुविधाजनक है।
🤗 Transformers तीन सबसे लोकप्रिय गहन शिक्षण पुस्तकालयों का समर्थन करता है: [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/) — और इसके साथ निर्बाध रूप से एकीकृत होता है। आप अपने मॉडल को सीधे एक ढांचे के साथ प्रशिक्षित कर सकते हैं और दूसरे के साथ लोड और अनुमान लगा सकते हैं।
## ऑनलाइन डेमो
आप सबसे सीधे मॉडल पृष्ठ पर परीक्षण कर सकते हैं [model hub](https://huggingface.co/models) मॉडल पर। हम [निजी मॉडल होस्टिंग, मॉडल संस्करण, और अनुमान एपीआई](https://huggingface.co/pricing) भी प्रदान करते हैं।
आप सबसे सीधे मॉडल पृष्ठ पर परीक्षण कर सकते हैं [model hub](https://huggingface.co/models) मॉडल पर। हम [निजी मॉडल होस्टिंग, मॉडल संस्करण, और अनुमान एपीआई] भी प्रदान करते हैं।(https://huggingface.co/pricing)
यहाँ कुछ उदाहरण हैं:
- [शब्द को भरने के लिए मास्क के रूप में BERT का प्रयोग करें](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
@ -165,7 +165,7 @@ checkpoint: जाँच बिंदु
टोकननाइज़र सभी पूर्व-प्रशिक्षित मॉडलों के लिए प्रीप्रोसेसिंग प्रदान करता है और इसे सीधे एक स्ट्रिंग (जैसे ऊपर दिए गए उदाहरण) या किसी सूची पर बुलाया जा सकता है। यह एक डिक्शनरी (तानाशाही) को आउटपुट करता है जिसे आप डाउनस्ट्रीम कोड में उपयोग कर सकते हैं या `**` अनपैकिंग एक्सप्रेशन के माध्यम से सीधे मॉडल को पास कर सकते हैं।
मॉडल स्वयं एक नियमित [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) या [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (आपके बैकएंड के आधार पर), जो हो सकता है सामान्य तरीके से उपयोग किया जाता है। [यह ट्यूटोरियल](https://huggingface.co/transformers/training.html) बताता है कि इस तरह के मॉडल को क्लासिक PyTorch या TensorFlow प्रशिक्षण लूप में कैसे एकीकृत किया जाए, या हमारे `ट्रेनर` एपीआई का उपयोग कैसे करें ताकि इसे जल्दी से फ़ाइन ट्यून किया जा सके।एक नया डेटासेट पे।
मॉडल स्वयं एक नियमित [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) या [TensorFlow `tf.keras.Model`](https ://pytorch.org/docs/stable/nn.html#torch.nn.Module) ://www.tensorflow.org/api_docs/python/tf/keras/Model) (आपके बैकएंड के आधार पर), जो हो सकता है सामान्य तरीके से उपयोग किया जाता है। [यह ट्यूटोरियल](https://huggingface.co/transformers/training.html) बताता है कि इस तरह के मॉडल को क्लासिक PyTorch या TensorFlow प्रशिक्षण लूप में कैसे एकीकृत किया जाए, या हमारे `ट्रेनर` एपीआई का उपयोग कैसे करें ताकि इसे जल्दी से फ़ाइन ट्यून किया जा सके।एक नया डेटासेट पे।
## ट्रांसफार्मर का उपयोग क्यों करें?
@ -194,7 +194,7 @@ checkpoint: जाँच बिंदु
- यह लाइब्रेरी मॉड्यूलर न्यूरल नेटवर्क टूलबॉक्स नहीं है। मॉडल फ़ाइल में कोड जानबूझकर अल्पविकसित है, बिना अतिरिक्त सार इनकैप्सुलेशन के, ताकि शोधकर्ता अमूर्तता और फ़ाइल जंपिंग में शामिल हुए जल्दी से पुनरावृति कर सकें।
- `ट्रेनर` एपीआई किसी भी मॉडल के साथ संगत नहीं है, यह केवल इस पुस्तकालय के मॉडल के लिए अनुकूलित है। यदि आप सामान्य मशीन लर्निंग के लिए उपयुक्त प्रशिक्षण लूप कार्यान्वयन की तलाश में हैं, तो कहीं और देखें।
- हमारे सर्वोत्तम प्रयासों के बावजूद, [उदाहरण निर्देशिका](https://github.com/huggingface/transformers/tree/main/examples) में स्क्रिप्ट केवल उपयोग के मामले हैं। आपकी विशिष्ट समस्या के लिए, वे जरूरी नहीं कि बॉक्स से बाहर काम करें, और आपको कोड की कुछ पंक्तियों को सूट करने की आवश्यकता हो सकती है।
- हमारे सर्वोत्तम प्रयासों के बावजूद, [उदाहरण निर्देशिका] (https://github.com/huggingface/transformers/tree/main/examples) में स्क्रिप्ट केवल उपयोग के मामले हैं। आपकी विशिष्ट समस्या के लिए, वे जरूरी नहीं कि बॉक्स से बाहर काम करें, और आपको कोड की कुछ पंक्तियों को सूट करने की आवश्यकता हो सकती है।
## स्थापित करना
@ -202,13 +202,11 @@ checkpoint: जाँच बिंदु
इस रिपॉजिटरी का परीक्षण Python 3.8+, Flax 0.4.1+, PyTorch 1.10+ और TensorFlow 2.6+ के तहत किया गया है।
आप [वर्चुअल एनवायरनमेंट](https://docs.python.org/3/library/venv.html) में 🤗 ट्रांसफॉर्मर इंस्टॉल कर सकते हैं। यदि आप अभी तक पायथन के वर्चुअल एनवायरनमेंट से परिचित नहीं हैं, तो कृपया इसे [उपयोगकर्ता निर्देश](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) पढ़ें।
आप [वर्चुअल एनवायरनमेंट] (https://docs.python.org/3/library/venv.html) में 🤗 ट्रांसफॉर्मर इंस्टॉल कर सकते हैं। यदि आप अभी तक पायथन के वर्चुअल एनवायरनमेंट से परिचित नहीं हैं, तो कृपया इसे [उपयोगकर्ता निर्देश] (https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/) पढ़ें।
सबसे पहले, पायथन के उस संस्करण के साथ एक आभासी वातावरण बनाएं जिसका आप उपयोग करने और उसे सक्रिय करने की योजना बना रहे हैं।
फिर, आपको Flax, PyTorch या TensorFlow में से किसी एक को स्थापित करने की आवश्यकता है। अपने प्लेटफ़ॉर्म पर इन फ़्रेमवर्क को स्थापित करने के लिए, [TensorFlow स्थापना पृष्ठ](https://www.tensorflow.org/install/), [PyTorch स्थापना पृष्ठ](https://pytorch.org/get-started/locally)
देखें start-locally या [Flax स्थापना पृष्ठ](https://github.com/google/flax#quick-install).
फिर, आपको Flax, PyTorch या TensorFlow में से किसी एक को स्थापित करने की आवश्यकता है। अपने प्लेटफ़ॉर्म पर इन फ़्रेमवर्क को स्थापित करने के लिए, [TensorFlow स्थापना पृष्ठ](https://www.tensorflow.org/install/), [PyTorch स्थापना पृष्ठ](https://pytorch.org/get-started /locally/# देखें) start-locally) या [Flax स्थापना पृष्ठ](https://github.com/google/flax#quick-install).
जब इनमें से कोई एक बैकएंड सफलतापूर्वक स्थापित हो जाता है, तो ट्रांसफॉर्मर निम्नानुसार स्थापित किए जा सकते हैं:
@ -216,7 +214,7 @@ checkpoint: जाँच बिंदु
pip install transformers
```
यदि आप उपयोग के मामलों को आज़माना चाहते हैं या आधिकारिक रिलीज़ से पहले नवीनतम इन-डेवलपमेंट कोड का उपयोग करना चाहते हैं, तो आपको [सोर्स से इंस्टॉल करना होगा](https://huggingface.co/docs/transformers/installation#installing-from-) स्रोत।
यदि आप उपयोग के मामलों को आज़माना चाहते हैं या आधिकारिक रिलीज़ से पहले नवीनतम इन-डेवलपमेंट कोड का उपयोग करना चाहते हैं, तो आपको [सोर्स से इंस्टॉल करना होगा](https://huggingface.co/docs/transformers/installation#installing-from- स्रोत)
### कोंडा का उपयोग करना
@ -231,7 +229,7 @@ conda install -c huggingface transformers
कोंडा के माध्यम से Flax, PyTorch, या TensorFlow में से किसी एक को स्थापित करने के लिए, निर्देशों के लिए उनके संबंधित स्थापना पृष्ठ देखें।
## मॉडल आर्किटेक्चर
[उपयोगकर्ता](https://huggingface.co/users) और [organization](https://huggingface.co) द्वारा ट्रांसफॉर्मर समर्थित [**सभी मॉडल चौकियों**](https://huggingface.co/models/users) हगिंगफेस.को/ऑर्गनाइजेशन), सभी को बिना किसी बाधा के हगिंगफेस.को [मॉडल हब](https://huggingface.co) के साथ एकीकृत किया गया है।
[उपयोगकर्ता](https://huggingface.co/users) और [organization](https://huggingface.co) द्वारा ट्रांसफॉर्मर समर्थित [**सभी मॉडल चौकियों**](https://huggingface.co/models) /users) हगिंगफेस.को/ऑर्गनाइजेशन), सभी को बिना किसी बाधा के हगिंगफेस.को [मॉडल हब](https://huggingface.co) के साथ एकीकृत किया गया है।
चौकियों की वर्तमान संख्या: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
@ -243,13 +241,13 @@ conda install -c huggingface transformers
1. **[Audio Spectrogram Transformer](https://huggingface.co/docs/transformers/model_doc/audio-spectrogram-transformer)** (from MIT) released with the paper [AST: Audio Spectrogram Transformer](https://arxiv.org/abs/2104.01778) by Yuan Gong, Yu-An Chung, James Glass.
1. **[Autoformer](https://huggingface.co/docs/transformers/model_doc/autoformer)** (from Tsinghua University) released with the paper [Autoformer: Decomposition Transformers with Auto-Correlation for Long-Term Series Forecasting](https://arxiv.org/abs/2106.13008) by Haixu Wu, Jiehui Xu, Jianmin Wang, Mingsheng Long.
1. **[Bark](https://huggingface.co/docs/transformers/model_doc/bark)** (from Suno) released in the repository [suno-ai/bark](https://github.com/suno-ai/bark) by Suno AI team.
1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (फेसबुक) साथ थीसिस [बार्ट: प्राकृतिक भाषा निर्माण, अनुवाद के लिए अनुक्रम-से-अनुक्रम पूर्व प्रशिक्षण , और समझ](https://arxiv.org/pdf/1910.13461.pdf) पर निर्भर माइक लुईस, यिनहान लियू, नमन गोयल, मार्जन ग़ज़विनिनेजाद, अब्देलरहमान मोहम्मद, ओमर लेवी, वेस स्टोयानोव और ल्यूक ज़ेटलमॉयर
1. **[BART](https://huggingface.co/docs/transformers/model_doc/bart)** (फेसबुक) साथ थीसिस [बार्ट: प्राकृतिक भाषा निर्माण, अनुवाद के लिए अनुक्रम-से-अनुक्रम पूर्व प्रशिक्षण , और समझ] (https://arxiv.org/pdf/1910.13461.pdf) पर निर्भर माइक लुईस, यिनहान लियू, नमन गोयल, मार्जन ग़ज़विनिनेजाद, अब्देलरहमान मोहम्मद, ओमर लेवी, वेस स्टोयानोव और ल्यूक ज़ेटलमॉयर
1. **[BARThez](https://huggingface.co/docs/transformers/model_doc/barthez)** (से École polytechnique) साथ थीसिस [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) पर निर्भर Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis रिहाई।
1. **[BARTpho](https://huggingface.co/docs/transformers/model_doc/bartpho)** (VinAI Research से) साथ में पेपर [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701)गुयेन लुओंग ट्रान, डुओंग मिन्ह ले और डाट क्वोक गुयेन द्वारा पोस्ट किया गया।
1. **[BEiT](https://huggingface.co/docs/transformers/model_doc/beit)** (Microsoft से) साथ में कागज [BEiT: BERT इमेज ट्रांसफॉर्मर्स का प्री-ट्रेनिंग](https://arxiv.org/abs/2106.08254) Hangbo Bao, Li Dong, Furu Wei द्वारा।
1. **[BERT](https://huggingface.co/docs/transformers/model_doc/bert)** (गूगल से) साथ वाला पेपर [बीईआरटी: प्री-ट्रेनिंग ऑफ डीप बिडायरेक्शनल ट्रांसफॉर्मर्स फॉर लैंग्वेज अंडरस्टैंडिंग](https://arxiv.org/abs/1810.04805) जैकब डेवलिन, मिंग-वेई चांग, ​​केंटन ली और क्रिस्टीना टौटानोवा द्वारा प्रकाशित किया गया था। .
1. **[BERT For Sequence Generation](https://huggingface.co/docs/transformers/model_doc/bert-generation)** (गूगल से) साथ देने वाला पेपर [सीक्वेंस जेनरेशन टास्क के लिए प्री-ट्रेंड चेकपॉइंट का इस्तेमाल करना](https ://arxiv.org/abs/1907.12461) साशा रोठे, शशि नारायण, अलियाक्सि सेवेरिन द्वारा।
1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (VinAI Research से) साथ में पेपर [BERTweet: अंग्रेजी ट्वीट्स के लिए एक पूर्व-प्रशिक्षित भाषा मॉडल](https://aclanthology.org/2020.emnlp-demos.2/) डाट क्वोक गुयेन, थान वु और अन्ह तुआन गुयेन द्वारा प्रकाशित।
1. **[BERTweet](https://huggingface.co/docs/transformers/model_doc/bertweet)** (VinAI Research से) साथ में पेपर [BERTweet: अंग्रेजी ट्वीट्स के लिए एक पूर्व-प्रशिक्षित भाषा मॉडल] (https://aclanthology.org/2020.emnlp-demos.2/) डाट क्वोक गुयेन, थान वु और अन्ह तुआन गुयेन द्वारा प्रकाशित।
1. **[BigBird-Pegasus](https://huggingface.co/docs/transformers/model_doc/bigbird_pegasus)** (गूगल रिसर्च से) साथ वाला पेपर [बिग बर्ड: ट्रांसफॉर्मर्स फॉर लॉन्गर सीक्वेंस](https://arxiv .org/abs/2007.14062) मंज़िल ज़हीर, गुरु गुरुगणेश, अविनावा दुबे, जोशुआ आइंस्ली, क्रिस अल्बर्टी, सैंटियागो ओंटानोन, फिलिप फाम, अनिरुद्ध रावुला, किफ़ान वांग, ली यांग, अमर अहमद द्वारा।
1. **[BigBird-RoBERTa](https://huggingface.co/docs/transformers/model_doc/big_bird)** (गूगल रिसर्च से) साथ में पेपर [बिग बर्ड: ट्रांसफॉर्मर्स फॉर लॉन्गर सीक्वेंस](https://arxiv.org/abs/2007.14062) मंज़िल ज़हीर, गुरु गुरुगणेश, अविनावा दुबे, जोशुआ आइंस्ली, क्रिस अल्बर्टी, सैंटियागो ओंटानन, फिलिप फाम द्वारा , अनिरुद्ध रावुला, किफ़ान वांग, ली यांग, अमर अहमद द्वारा पोस्ट किया गया।
1. **[BioGpt](https://huggingface.co/docs/transformers/model_doc/biogpt)** (from Microsoft Research AI4Science) released with the paper [BioGPT: generative pre-trained transformer for biomedical text generation and mining](https://academic.oup.com/bib/advance-article/doi/10.1093/bib/bbac409/6713511?guestAccessKey=a66d9b5d-4f83-4017-bb52-405815c907b9) by Renqian Luo, Liai Sun, Yingce Xia, Tao Qin, Sheng Zhang, Hoifung Poon and Tie-Yan Liu.
@ -312,7 +310,6 @@ conda install -c huggingface transformers
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (गूगल रिसर्च से) साथ वाला पेपर [FNet: मिक्सिंग टोकन विद फूरियर ट्रांसफॉर्म्स](https://arxiv.org /abs/2105.03824) जेम्स ली-थॉर्प, जोशुआ आइंस्ली, इल्या एकस्टीन, सैंटियागो ओंटानन द्वारा।
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (Microsoft Research से) Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. द्वाराअनुसंधान पत्र [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) के साथ जारी किया गया
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (सीएमयू/गूगल ब्रेन से) साथ में कागज [फ़नल-ट्रांसफॉर्मर: कुशल भाषा प्रसंस्करण के लिए अनुक्रमिक अतिरेक को छानना](https://arxiv.org/abs/2006.03236) जिहांग दाई, गुओकुन लाई, यिमिंग यांग, क्वोक वी. ले ​​द्वारा रिहाई।
1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (ADEPT से) रोहन बाविशी, एरिच एलसेन, कर्टिस हॉथोर्न, मैक्सवेल नी, ऑगस्टस ओडेना, अरुशी सोमानी, सागनाक तासिरलार [blog post](https://www.adept.ai/blog/fuyu-8b)
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (KAIST से) साथ वाला पेपर [वर्टिकल कटडेप्थ के साथ मोनोकुलर डेप्थ एस्टीमेशन के लिए ग्लोबल-लोकल पाथ नेटवर्क्स](https:/ /arxiv.org/abs/2201.07436) डोयोन किम, वूंगह्युन गा, प्युंगवान आह, डोंगग्यू जू, सेहवान चुन, जुनमो किम द्वारा।
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (OpenAI से) साथ में दिया गया पेपर [जेनरेटिव प्री-ट्रेनिंग द्वारा भाषा की समझ में सुधार](https://blog .openai.com/language-unsupervised/) एलेक रैडफोर्ड, कार्तिक नरसिम्हन, टिम सालिमन्स और इल्या सुत्स्केवर द्वारा।
@ -450,12 +447,12 @@ conda install -c huggingface transformers
1. **[ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn)** (मेटा एआई से) साथ में कागज [लेबल-कुशल सीखने के लिए मास्क्ड स्याम देश के नेटवर्क](https://arxiv. org/abs/2204.07141) महमूद असरान, मथिल्डे कैरन, ईशान मिश्रा, पियोट्र बोजानोवस्की, फ्लोरियन बोर्डेस, पास्कल विंसेंट, आर्मंड जौलिन, माइकल रब्बत, निकोलस बल्लास द्वारा।
1. **[VITS](https://huggingface.co/docs/transformers/model_doc/vits)** (Kakao Enterprise से) Jaehyeon Kim, Jungil Kong, Juhee Son. द्वाराअनुसंधान पत्र [Conditional Variational Autoencoder with Adversarial Learning for End-to-End Text-to-Speech](https://arxiv.org/abs/2106.06103) के साथ जारी किया गया
1. **[ViViT](https://huggingface.co/docs/transformers/model_doc/vivit)** (from Google Research) released with the paper [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) by Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid.
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (फेसबुक एआई से) साथ में पेपर [wav2vec 2.0: ए फ्रेमवर्क फॉर सेल्फ-सुपरवाइज्ड लर्निंग ऑफ स्पीच रिप्रेजेंटेशन](https://arxiv.org/abs/2006.11477) एलेक्सी बेवस्की, हेनरी झोउ, अब्देलरहमान मोहम्मद, माइकल औली द्वारा।
1. **[Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2)** (फेसबुक एआई से) साथ में पेपर [wav2vec 2.0: ए फ्रेमवर्क फॉर सेल्फ-सुपरवाइज्ड लर्निंग ऑफ स्पीच रिप्रेजेंटेशन] (https://arxiv.org/abs/2006.11477) एलेक्सी बेवस्की, हेनरी झोउ, अब्देलरहमान मोहम्मद, माइकल औली द्वारा।
1. **[Wav2Vec2-Conformer](https://huggingface.co/docs/transformers/model_doc/wav2vec2-conformer)** (Facebook AI से) साथ वाला पेपर [FAIRSEQ S2T: FAIRSEQ के साथ फास्ट स्पीच-टू-टेक्स्ट मॉडलिंग ](https://arxiv.org/abs/2010.05171) चांगहान वांग, यूं तांग, जुताई मा, ऐनी वू, सरव्या पोपुरी, दिमित्रो ओखोनको, जुआन पिनो द्वारा पोस्ट किया गया।
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (Facebook AI से) साथ वाला पेपर [सरल और प्रभावी जीरो-शॉट क्रॉस-लिंगुअल फोनेम रिकॉग्निशन](https://arxiv.org/abs/2109.11680) कियानटोंग जू, एलेक्सी बाएव्स्की, माइकल औली द्वारा।
1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (माइक्रोसॉफ्ट रिसर्च से) पेपर के साथ जारी किया गया [WavLM: फुल स्टैक के लिए बड़े पैमाने पर स्व-पर्यवेक्षित पूर्व-प्रशिक्षण स्पीच प्रोसेसिंग](https://arxiv.org/abs/2110.13900) सानयुआन चेन, चेंगयी वांग, झेंगयांग चेन, यू वू, शुजी लियू, ज़ुओ चेन, जिन्यु ली, नाओयुकी कांडा, ताकुया योशियोका, ज़िओंग जिओ, जियान वू, लॉन्ग झोउ, शुओ रेन, यानमिन कियान, याओ कियान, जियान वू, माइकल ज़ेंग, फुरु वेई।
1. **[Wav2Vec2Phoneme](https://huggingface.co/docs/transformers/model_doc/wav2vec2_phoneme)** (Facebook AI से) साथ वाला पेपर [सरल और प्रभावी जीरो-शॉट क्रॉस-लिंगुअल फोनेम रिकॉग्निशन](https:/ /arxiv.org/abs/2109.11680) कियानटोंग जू, एलेक्सी बाएव्स्की, माइकल औली द्वारा।
1. **[WavLM](https://huggingface.co/docs/transformers/model_doc/wavlm)** (माइक्रोसॉफ्ट रिसर्च से) पेपर के साथ जारी किया गया [WavLM: फुल स्टैक के लिए बड़े पैमाने पर स्व-पर्यवेक्षित पूर्व-प्रशिक्षण स्पीच प्रोसेसिंग] (https://arxiv.org/abs/2110.13900) सानयुआन चेन, चेंगयी वांग, झेंगयांग चेन, यू वू, शुजी लियू, ज़ुओ चेन, जिन्यु ली, नाओयुकी कांडा, ताकुया योशियोका, ज़िओंग जिओ, जियान वू, लॉन्ग झोउ, शुओ रेन, यानमिन कियान, याओ कियान, जियान वू, माइकल ज़ेंग, फुरु वेई।
1. **[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper)** (OpenAI से) साथ में कागज [बड़े पैमाने पर कमजोर पर्यवेक्षण के माध्यम से मजबूत भाषण पहचान](https://cdn. openai.com/papers/whisper.pdf) एलेक रैडफोर्ड, जोंग वूक किम, ताओ जू, ग्रेग ब्रॉकमैन, क्रिस्टीन मैकलीवे, इल्या सुत्स्केवर द्वारा।
1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (माइक्रोसॉफ्ट रिसर्च से) कागज के साथ [एक्सपैंडिंग लैंग्वेज-इमेज प्रीट्रेन्ड मॉडल फॉर जनरल वीडियो रिकग्निशन](https://arxiv.org/abs/2208.02816) बोलिन नी, होउवेन पेंग, मिंगाओ चेन, सोंगयांग झांग, गाओफेंग मेंग, जियानलोंग फू, शिमिंग जियांग, हैबिन लिंग द्वारा।
1. **[X-CLIP](https://huggingface.co/docs/transformers/model_doc/xclip)** (माइक्रोसॉफ्ट रिसर्च से) कागज के साथ [एक्सपैंडिंग लैंग्वेज-इमेज प्रीट्रेन्ड मॉडल फॉर जनरल वीडियो रिकग्निशन](https: //arxiv.org/abs/2208.02816) बोलिन नी, होउवेन पेंग, मिंगाओ चेन, सोंगयांग झांग, गाओफेंग मेंग, जियानलोंग फू, शिमिंग जियांग, हैबिन लिंग द्वारा।
1. **[X-MOD](https://huggingface.co/docs/transformers/model_doc/xmod)** (Meta AI से) Jonas Pfeiffer, Naman Goyal, Xi Lin, Xian Li, James Cross, Sebastian Riedel, Mikel Artetxe. द्वाराअनुसंधान पत्र [Lifting the Curse of Multilinguality by Pre-training Modular Transformers](http://dx.doi.org/10.18653/v1/2022.naacl-main.255) के साथ जारी किया गया
1. **[XGLM](https://huggingface.co/docs/transformers/model_doc/xglm)** (From Facebook AI) released with the paper [Few-shot Learning with Multilingual Language Models](https://arxiv.org/abs/2112.10668) by Xi Victoria Lin, Todor Mihaylov, Mikel Artetxe, Tianlu Wang, Shuohui Chen, Daniel Simig, Myle Ott, Naman Goyal, Shruti Bhosale, Jingfei Du, Ramakanth Pasunuru, Sam Shleifer, Punit Singh Koura, Vishrav Chaudhary, Brian O'Horo, Jeff Wang, Luke Zettlemoyer, Zornitsa Kozareva, Mona Diab, Veselin Stoyanov, Xian Li.
1. **[XLM](https://huggingface.co/docs/transformers/model_doc/xlm)** (फेसबुक से) साथ में पेपर [क्रॉस-लिंगुअल लैंग्वेज मॉडल प्रीट्रेनिंग] (https://arxiv.org/abs/1901.07291) गिलाउम लैम्पल और एलेक्सिस कोनो द्वारा।
@ -468,7 +465,7 @@ conda install -c huggingface transformers
1. **[XLSR-Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/xlsr_wav2vec2)** (फेसबुक एआई से) साथ में पेपर [अनसुपरवाइज्ड क्रॉस-लिंगुअल रिप्रेजेंटेशन लर्निंग फॉर स्पीच रिकग्निशन] (https://arxiv.org/abs/2006.13979) एलेक्सिस कोन्यू, एलेक्सी बेवस्की, रोनन कोलोबर्ट, अब्देलरहमान मोहम्मद, माइकल औली द्वारा।
1. **[YOLOS](https://huggingface.co/docs/transformers/model_doc/yolos)** (हुआझोंग यूनिवर्सिटी ऑफ साइंस एंड टेक्नोलॉजी से) साथ में पेपर [यू ओनली लुक एट वन सीक्वेंस: रीथिंकिंग ट्रांसफॉर्मर इन विज़न थ्रू ऑब्जेक्ट डिटेक्शन](https://arxiv.org/abs/2106.00666) युक्सिन फेंग, बेनचेंग लियाओ, जिंगगैंग वांग, जेमिन फेंग, जियांग क्यूई, रुई वू, जियानवेई नीयू, वेन्यू लियू द्वारा पोस्ट किया गया।
1. **[YOSO](https://huggingface.co/docs/transformers/model_doc/yoso)** (विस्कॉन्सिन विश्वविद्यालय - मैडिसन से) साथ में पेपर [यू ओनली सैंपल (लगभग) ज़ानपेंग ज़ेंग, युनयांग ज़िओंग द्वारा , सत्य एन. रवि, शैलेश आचार्य, ग्लेन फंग, विकास सिंह द्वारा पोस्ट किया गया।
1. एक नए मॉडल में योगदान देना चाहते हैं? नए मॉडल जोड़ने में आपका मार्गदर्शन करने के लिए हमारे पास एक **विस्तृत मार्गदर्शिका और टेम्प्लेट** है। आप उन्हें [`टेम्पलेट्स`](./templates) निर्देशिका में पा सकते हैं। पीआर शुरू करने से पहले [योगदान दिशानिर्देश](./CONTRIBUTING.md) देखना और अनुरक्षकों से संपर्क करना या प्रतिक्रिया प्राप्त करने के लिए एक नया मुद्दा खोलना याद रखें।
1. एक नए मॉडल में योगदान देना चाहते हैं? नए मॉडल जोड़ने में आपका मार्गदर्शन करने के लिए हमारे पास एक **विस्तृत मार्गदर्शिका और टेम्प्लेट** है। आप उन्हें [`टेम्पलेट्स`](./templates) निर्देशिका में पा सकते हैं। पीआर शुरू करने से पहले [योगदान दिशानिर्देश] (./CONTRIBUTING.md) देखना और अनुरक्षकों से संपर्क करना या प्रतिक्रिया प्राप्त करने के लिए एक नया मुद्दा खोलना याद रखें।
यह जांचने के लिए कि क्या किसी मॉडल में पहले से ही Flax, PyTorch या TensorFlow का कार्यान्वयन है, या यदि उसके पास Tokenizers लाइब्रेरी में संबंधित टोकन है, तो [यह तालिका](https://huggingface.co/docs/transformers/index#supported) देखें। -फ्रेमवर्क)।

View File

@ -372,7 +372,6 @@ Flax、PyTorch、TensorFlowをcondaでインストールする方法は、それ
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (Google Research から) James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon から公開された研究論文: [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824)
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (Microsoft Research から) Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao. から公開された研究論文 [Focal Modulation Networks](https://arxiv.org/abs/2203.11926)
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (CMU/Google Brain から) Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le から公開された研究論文: [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236)
1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (ADEPT から) Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar. から公開された研究論文 [blog post](https://www.adept.ai/blog/fuyu-8b)
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (Microsoft Research から) Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang. から公開された研究論文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100)
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (KAIST から) Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim から公開された研究論文: [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436)
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (OpenAI から) Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever から公開された研究論文: [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/)

View File

@ -287,7 +287,6 @@ Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar. 논문과 함께 공개 [blog post](https://www.adept.ai/blog/fuyu-8b)
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.

View File

@ -66,11 +66,11 @@ limitations under the License.
🤗 Transformers предоставляет тысячи предварительно обученных моделей для выполнения различных задач, таких как текст, зрение и аудио.
Эти модели могут быть применены к:
Эти модели могут быть применены на:
* 📝 Тексту для таких задач, как классификация текстов, извлечение информации, ответы на вопросы, обобщение, перевод, генерация текстов на более чем 100 языках.
* 🖼️ Изображениям для задач классификации изображений, обнаружения объектов и сегментации.
* 🗣️ Аудио для задач распознавания речи и классификации аудио.
* 📝 Текст, для таких задач, как классификация текстов, извлечение информации, ответы на вопросы, обобщение, перевод, генерация текстов, на более чем 100 языках.
* 🖼️ Изображения - для задач классификации изображений, обнаружения объектов и сегментации.
* 🗣️ Аудио - для задач распознавания речи и классификации аудио.
Модели transformers также могут выполнять несколько задач, такие как ответы на табличные вопросы, распознавание оптических символов, извлечение информации из отсканированных документов, классификация видео и ответы на визуальные вопросы.
@ -361,7 +361,6 @@ conda install -c huggingface transformers
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar. Released with the paper [blog post](https://www.adept.ai/blog/fuyu-8b)
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.

View File

@ -311,7 +311,6 @@ conda install -c huggingface transformers
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (来自 Google Research) 伴随论文 [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) 由 James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon 发布。
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (来自 Microsoft Research) 伴随论文 [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) 由 Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao 发布。
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (来自 CMU/Google Brain) 伴随论文 [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) 由 Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le 发布。
1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (来自 ADEPT) 伴随论文 [blog post](https://www.adept.ai/blog/fuyu-8b 由 Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar 发布。)
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (来自 Microsoft Research) 伴随论文 [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) 由 Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang 发布。
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (来自 KAIST) 伴随论文 [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) 由 Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim 发布。
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (来自 OpenAI) 伴随论文 [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) 由 Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever 发布。

View File

@ -323,7 +323,6 @@ conda install -c huggingface transformers
1. **[FNet](https://huggingface.co/docs/transformers/model_doc/fnet)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
1. **[FocalNet](https://huggingface.co/docs/transformers/model_doc/focalnet)** (from Microsoft Research) released with the paper [Focal Modulation Networks](https://arxiv.org/abs/2203.11926) by Jianwei Yang, Chunyuan Li, Xiyang Dai, Lu Yuan, Jianfeng Gao.
1. **[Funnel Transformer](https://huggingface.co/docs/transformers/model_doc/funnel)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
1. **[Fuyu](https://huggingface.co/docs/transformers/model_doc/fuyu)** (from ADEPT) Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar. Released with the paper [blog post](https://www.adept.ai/blog/fuyu-8b)
1. **[GIT](https://huggingface.co/docs/transformers/model_doc/git)** (from Microsoft Research) released with the paper [GIT: A Generative Image-to-text Transformer for Vision and Language](https://arxiv.org/abs/2205.14100) by Jianfeng Wang, Zhengyuan Yang, Xiaowei Hu, Linjie Li, Kevin Lin, Zhe Gan, Zicheng Liu, Ce Liu, Lijuan Wang.
1. **[GLPN](https://huggingface.co/docs/transformers/model_doc/glpn)** (from KAIST) released with the paper [Global-Local Path Networks for Monocular Depth Estimation with Vertical CutDepth](https://arxiv.org/abs/2201.07436) by Doyeon Kim, Woonghyun Ga, Pyungwhan Ahn, Donggyu Joo, Sehwan Chun, Junmo Kim.
1. **[GPT](https://huggingface.co/docs/transformers/model_doc/openai-gpt)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.

View File

@ -73,8 +73,6 @@
title: Depth estimation
- local: tasks/image_to_image
title: Image-to-Image
- local: tasks/knowledge_distillation_for_image_classification
title: Knowledge Distillation for Computer Vision
title: Computer Vision
- isExpanded: false
sections:
@ -211,8 +209,6 @@
title: Pipelines for webserver inference
- local: model_memory_anatomy
title: Model training anatomy
- local: llm_tutorial_optimization
title: Getting the most out of LLMs
title: Conceptual guides
- sections:
- sections:
@ -342,8 +338,6 @@
title: FSMT
- local: model_doc/funnel
title: Funnel Transformer
- local: model_doc/fuyu
title: Fuyu
- local: model_doc/openai-gpt
title: GPT
- local: model_doc/gpt_neo

View File

@ -138,7 +138,6 @@ Flax), PyTorch, and/or TensorFlow.
| [FNet](model_doc/fnet) | ✅ | ❌ | ❌ |
| [FocalNet](model_doc/focalnet) | ✅ | ❌ | ❌ |
| [Funnel Transformer](model_doc/funnel) | ✅ | ✅ | ❌ |
| [Fuyu](model_doc/fuyu) | ✅ | ❌ | ❌ |
| [GIT](model_doc/git) | ✅ | ❌ | ❌ |
| [GLPN](model_doc/glpn) | ✅ | ❌ | ❌ |
| [GPT Neo](model_doc/gpt_neo) | ✅ | ❌ | ✅ |

View File

@ -74,13 +74,14 @@ If you're interested in basic LLM usage, our high-level [`Pipeline`](pipeline_tu
</Tip>
<!-- TODO: update example to llama 2 (or a newer popular baseline) when it becomes ungated -->
First, you need to load the model.
```py
>>> from transformers import AutoModelForCausalLM
>>> model = AutoModelForCausalLM.from_pretrained(
... "mistralai/Mistral-7B-v0.1", device_map="auto", load_in_4bit=True
... "openlm-research/open_llama_7b", device_map="auto", load_in_4bit=True
... )
```
@ -96,31 +97,18 @@ Next, you need to preprocess your text input with a [tokenizer](tokenizer_summar
```py
>>> from transformers import AutoTokenizer
>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left")
>>> tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b")
>>> model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda")
```
The `model_inputs` variable holds the tokenized text input, as well as the attention mask. While [`~generation.GenerationMixin.generate`] does its best effort to infer the attention mask when it is not passed, we recommend passing it whenever possible for optimal results.
After tokenizing the inputs, you can call the [`~generation.GenerationMixin.generate`] method to return the generated tokens. The generated tokens should then be converted to text before printing.
Finally, call the [`~generation.GenerationMixin.generate`] method to return the generated tokens, which should be converted to text before printing.
```py
>>> generated_ids = model.generate(**model_inputs)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
'A list of colors: red, blue, green, yellow, orange, purple, pink,'
```
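In line with the recommendation above, you can also pass the attention mask explicitly instead of relying on `generate` to infer it. A minimal sketch (not part of the original diff), reusing the `model` and `tokenizer` objects loaded earlier; the `max_new_tokens` value is just an illustrative choice:
```py
>>> # Sketch: pass input_ids and attention_mask explicitly rather than unpacking model_inputs
>>> model_inputs = tokenizer(["A list of colors: red, blue"], return_tensors="pt").to("cuda")
>>> generated_ids = model.generate(
...     input_ids=model_inputs.input_ids,
...     attention_mask=model_inputs.attention_mask,  # explicit mask, as recommended above
...     max_new_tokens=20,
... )
>>> text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
```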
Finally, you don't need to do it one sequence at a time! You can batch your inputs, which will greatly improve the throughput at a small latency and memory cost. All you need to do is to make sure you pad your inputs properly (more on that below).
```py
>>> tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default
>>> model_inputs = tokenizer(
... ["A list of colors: red, blue", "Portugal is"], return_tensors="pt", padding=True
... ).to("cuda")
>>> generated_ids = model.generate(**model_inputs)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
['A list of colors: red, blue, green, yellow, orange, purple, pink,',
'Portugal is a country in southwestern Europe, on the Iber']
'A list of colors: red, blue, green, yellow, black, white, and brown'
```
And that's it! In a few lines of code, you can harness the power of an LLM.
@ -133,10 +121,10 @@ There are many [generation strategies](generation_strategies), and sometimes the
```py
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
>>> tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default
>>> tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b")
>>> tokenizer.pad_token = tokenizer.eos_token # Llama has no pad token by default
>>> model = AutoModelForCausalLM.from_pretrained(
... "mistralai/Mistral-7B-v0.1", device_map="auto", load_in_4bit=True
... "openlm-research/open_llama_7b", device_map="auto", load_in_4bit=True
... )
```
@ -166,7 +154,7 @@ By default, and unless specified in the [`~generation.GenerationConfig`] file, `
```py
>>> # Set seed for reproducibility -- you don't need this unless you want full reproducibility
>>> from transformers import set_seed
>>> set_seed(42)
>>> set_seed(0)
>>> model_inputs = tokenizer(["I am a cat."], return_tensors="pt").to("cuda")
@ -178,7 +166,7 @@ By default, and unless specified in the [`~generation.GenerationConfig`] file, `
>>> # With sampling, the output becomes more creative!
>>> generated_ids = model.generate(**model_inputs, do_sample=True)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
'I am a cat. Specifically, I am an indoor-only cat. I'
'I am a cat.\nI just need to be. I am always.\nEvery time'
```
### Wrong padding side
@ -187,17 +175,17 @@ LLMs are [decoder-only](https://huggingface.co/learn/nlp-course/chapter1/6?fw=pt
```py
>>> # The tokenizer initialized above has right-padding active by default: the 1st sequence,
>>> # which is shorter, has padding on the right side. Generation fails to capture the logic.
>>> # which is shorter, has padding on the right side. Generation fails.
>>> model_inputs = tokenizer(
... ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
... ).to("cuda")
>>> generated_ids = model.generate(**model_inputs)
>>> tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
'1, 2, 33333333333'
>>> tokenizer.batch_decode(generated_ids[0], skip_special_tokens=True)[0]
''
>>> # With left-padding, it works as expected!
>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1", padding_side="left")
>>> tokenizer.pad_token = tokenizer.eos_token # Most LLMs don't have a pad token by default
>>> tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b", padding_side="left")
>>> tokenizer.pad_token = tokenizer.eos_token # Llama has no pad token by default
>>> model_inputs = tokenizer(
... ["1, 2, 3", "A, B, C, D, E"], padding=True, return_tensors="pt"
... ).to("cuda")
@ -206,61 +194,26 @@ LLMs are [decoder-only](https://huggingface.co/learn/nlp-course/chapter1/6?fw=pt
'1, 2, 3, 4, 5, 6,'
```
### Wrong prompt
Some models and tasks expect a certain input prompt format to work properly. When this format is not applied, you will get a silent performance degradation: the model sort of works, but not as well as if you were following the expected prompt. More information about prompting, including which models and tasks need a specific format, is available in this [guide](tasks/prompting). Let's see an example with a chat LLM, which makes use of [chat templating](chat_templating):
```python
>>> tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-alpha")
>>> model = AutoModelForCausalLM.from_pretrained(
... "HuggingFaceH4/zephyr-7b-alpha", device_map="auto", load_in_4bit=True
... )
>>> set_seed(0)
>>> prompt = """How many helicopters can a human eat in one sitting? Reply as a thug."""
>>> model_inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
>>> input_length = model_inputs.input_ids.shape[1]
>>> generated_ids = model.generate(**model_inputs, max_new_tokens=20)
>>> print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])
"I'm not a thug, but i can tell you that a human cannot eat"
>>> # Oh no, it did not follow our instruction to reply as a thug! Let's see what happens when we write
>>> # a better prompt and use the right template for this model (through `tokenizer.apply_chat_template`)
>>> set_seed(0)
>>> messages = [
... {
... "role": "system",
... "content": "You are a friendly chatbot who always responds in the style of a thug",
... },
... {"role": "user", "content": "How many helicopters can a human eat in one sitting?"},
... ]
>>> model_inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to("cuda")
>>> input_length = model_inputs.shape[1]
>>> generated_ids = model.generate(model_inputs, do_sample=True, max_new_tokens=20)
>>> print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=True)[0])
'None, you thug. How bout you try to focus on more useful questions?'
>>> # As we can see, it followed a proper thug style 😎
```
<!-- TODO: when the prompting guide is ready, mention the importance of setting the right prompt in this section -->
## Further resources
While the autoregressive generation process is relatively straightforward, making the most out of your LLM can be a challenging endeavor because there are many moving parts. For your next steps to help you dive deeper into LLM usage and understanding:
<!-- TODO: complete with new guides -->
### Advanced generate usage
1. [Guide](generation_strategies) on how to control different generation methods, how to set up the generation configuration file, and how to stream the output;
2. [Guide](chat_templating) on the prompt template for chat LLMs;
3. [Guide](tasks/prompting) on how to get the most out of prompt design;
4. API reference on [`~generation.GenerationConfig`], [`~generation.GenerationMixin.generate`], and [generate-related classes](internal/generation_utils).
2. API reference on [`~generation.GenerationConfig`], [`~generation.GenerationMixin.generate`], and [generate-related classes](internal/generation_utils).
### LLM leaderboards
1. [Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard), which focuses on the quality of the open-source models;
2. [Open LLM-Perf Leaderboard](https://huggingface.co/spaces/optimum/llm-perf-leaderboard), which focuses on LLM throughput.
### Latency, throughput and memory utilization
### Latency and throughput
1. [Guide](llm_tutorial_optimization) on how to optimize LLMs for speed and memory;
2. [Guide](main_classes/quantization) on quantization such as bitsandbytes and autogptq, which shows you how to drastically reduce your memory requirements.
1. [Guide](main_classes/quantization) on dynamic quantization, which shows you how to drastically reduce your memory requirements.
### Related libraries

View File

@ -1,739 +0,0 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Optimizing LLMs for Speed and Memory
[[open-in-colab]]
Large Language Models (LLMs) such as GPT3/4, [Falcon](https://huggingface.co/tiiuae/falcon-40b), and [Llama](https://huggingface.co/meta-llama/Llama-2-70b-hf) are rapidly advancing in their ability to tackle human-centric tasks, establishing themselves as essential tools in modern knowledge-based industries.
Deploying these models in real-world tasks remains challenging, however:
- To exhibit near-human text understanding and generation capabilities, LLMs currently need to consist of billions of parameters (see [Kaplan et al](https://arxiv.org/abs/2001.08361), [Wei et. al](https://arxiv.org/abs/2206.07682)). This consequently amplifies the memory demands for inference.
- In many real-world tasks, LLMs need to be given extensive contextual information. This necessitates the model's capability to manage very long input sequences during inference.
The crux of these challenges lies in augmenting the computational and memory capabilities of LLMs, especially when handling expansive input sequences.
In this guide, we will go over the effective techniques for efficient LLM deployment:
1. **Lower Precision**: Research has shown that operating at reduced numerical precision, namely [8-bit and 4-bit](./main_classes/quantization.md), can achieve computational advantages without a considerable decline in model performance.
2. **Flash Attention:** Flash Attention is a variation of the attention algorithm that not only provides a more memory-efficient approach but also realizes increased efficiency due to optimized GPU memory utilization.
3. **Architectural Innovations:** Considering that LLMs are always deployed in the same way during inference, namely autoregressive text generation with a long input context, specialized model architectures have been proposed that allow for more efficient inference. The most important advancements in model architectures here are [Alibi](https://arxiv.org/abs/2108.12409), [Rotary embeddings](https://arxiv.org/abs/2104.09864), [Multi-Query Attention (MQA)](https://arxiv.org/abs/1911.02150) and [Grouped-Query-Attention (GQA)](https://arxiv.org/abs/2305.13245).
Throughout this guide, we will offer an analysis of auto-regressive generation from a tensor's perspective. We delve into the pros and cons of adopting lower precision, provide a comprehensive exploration of the latest attention algorithms, and discuss improved LLM architectures. While doing so, we run practical examples showcasing each of the feature improvements.
## 1. Lower Precision
Memory requirements of LLMs can be best understood by seeing the LLM as a set of weight matrices and vectors and the text inputs as a sequence of vectors. In the following, the definition *weights* will be used to signify all model weight matrices and vectors.
At the time of writing this guide, LLMs consist of at least a couple billion parameters. Each parameter thereby is made of a decimal number, e.g. `4.5689` which is usually stored in either [float32](https://en.wikipedia.org/wiki/Single-precision_floating-point_format), [bfloat16](https://en.wikipedia.org/wiki/Bfloat16_floating-point_format), or [float16](https://en.wikipedia.org/wiki/Half-precision_floating-point_format) format. This allows us to easily compute the memory requirement to load the LLM into memory:
> *Loading the weights of a model having X billion parameters requires roughly 4 * X GB of VRAM in float32 precision*
Nowadays, models are however rarely trained in full float32 precision, but usually in bfloat16 precision or less frequently in float16 precision. Therefore the rule of thumb becomes:
> *Loading the weights of a model having X billion parameters requires roughly 2 * X GB of VRAM in bfloat16/float16 precision*
For shorter text inputs (less than 1024 tokens), the memory requirement for inference is very much dominated by the memory requirement to load the weights. Therefore, for now, let's assume that the memory requirement for inference is equal to the memory requirement to load the model into the GPU VRAM.
To give some examples of how much VRAM it roughly takes to load a model in bfloat16:
- **GPT3** requires 2 \* 175 GB = **350 GB** VRAM
- [**Bloom**](https://huggingface.co/bigscience/bloom) requires 2 \* 176 GB = **352 GB** VRAM
- [**Llama-2-70b**](https://huggingface.co/meta-llama/Llama-2-70b-hf) requires 2 \* 70 GB = **140 GB** VRAM
- [**Falcon-40b**](https://huggingface.co/tiiuae/falcon-40b) requires 2 \* 40 GB = **80 GB** VRAM
- [**MPT-30b**](https://huggingface.co/mosaicml/mpt-30b) requires 2 \* 30 GB = **60 GB** VRAM
- [**bigcode/starcoder**](https://huggingface.co/bigcode/starcoder) requires 2 \* 15.5 = **31 GB** VRAM
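If you want this rule of thumb as a tiny helper, a minimal sketch (the byte sizes per dtype are the only assumption) could look like this:
```python
def weight_memory_gb(n_params_in_billions: float, dtype: str = "bfloat16") -> float:
    """Rough VRAM needed just to hold the weights, ignoring activations and the key-value cache."""
    bytes_per_param = {"float32": 4, "bfloat16": 2, "float16": 2, "int8": 1, "int4": 0.5}[dtype]
    return n_params_in_billions * bytes_per_param  # 1B params in 1-byte precision ≈ 1 GB

print(weight_memory_gb(70, "bfloat16"))  # Llama-2-70b -> 140.0
print(weight_memory_gb(175, "float32"))  # GPT3 in float32 -> 700.0
```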
As of writing this document, the largest GPU chips on the market are the A100 and H100, offering 80GB of VRAM. Most of the models listed before require more than 80GB just to be loaded and therefore necessarily require [tensor parallelism](https://huggingface.co/docs/transformers/perf_train_gpu_many#tensor-parallelism) and/or [pipeline parallelism](https://huggingface.co/docs/transformers/perf_train_gpu_many#naive-model-parallelism-vertical-and-pipeline-parallelism).
🤗 Transformers does not support tensor parallelism out of the box as it requires the model architecture to be written in a specific way. If you're interested in writing models in a tensor-parallelism-friendly way, feel free to have a look at [the text-generation-inference library](https://github.com/huggingface/text-generation-inference/tree/main/server/text_generation_server/models/custom_modeling).
Naive pipeline parallelism is supported out of the box. For this, simply load the model with `device_map="auto"` which will automatically place the different layers on the available GPUs as explained [here](https://huggingface.co/docs/accelerate/v0.22.0/en/concept_guides/big_model_inference).
Note, however, that while very effective, this naive pipeline parallelism does not tackle the issues of GPU idling. For this, more advanced pipeline parallelism is required as explained [here](https://huggingface.co/docs/transformers/v4.34.0/en/perf_train_gpu_many#naive-model-parallelism-vertical-and-pipeline-parallelism).
If you have access to an 8 x 80GB A100 node, you could load BLOOM as follows
```bash
!pip install transformers accelerate bitsandbytes optimum
```
```python
from transformers import AutoModelForCausalLM
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom", device_map="auto", pad_token_id=0)
```
By using `device_map="auto"` the attention layers would be equally distributed over all available GPUs.
In this guide, we will use [bigcode/octocoder](https://huggingface.co/bigcode/octocoder) as it can be run on a single 40 GB A100 GPU device chip. Note that all memory and speed optimizations that we will apply going forward, are equally applicable to models that require model or tensor parallelism.
Since the model is loaded in bfloat16 precision, using our rule of thumb above, we would expect the memory requirement to run inference with `bigcode/octocoder` to be around 31 GB VRAM. Let's give it a try.
We first load the model and tokenizer and then pass both to Transformers' [pipeline](https://huggingface.co/docs/transformers/main_classes/pipelines) object.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", torch_dtype=torch.bfloat16, device_map="auto", pad_token_id=0)
tokenizer = AutoTokenizer.from_pretrained("bigcode/octocoder")
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
```
```python
prompt = "Question: Please write a function in Python that transforms bytes to Giga bytes.\n\nAnswer:"
result = pipe(prompt, max_new_tokens=60)[0]["generated_text"][len(prompt):]
result
```
**Output**:
```
Here is a Python function that transforms bytes to Giga bytes:\n\n```python\ndef bytes_to_giga_bytes(bytes):\n return bytes / 1024 / 1024 / 1024\n```\n\nThis function takes a single
```
Nice, we can now directly use the result to convert bytes into Gigabytes.
```python
def bytes_to_giga_bytes(bytes):
return bytes / 1024 / 1024 / 1024
```
Let's call [`torch.cuda.max_memory_allocated`](https://pytorch.org/docs/stable/generated/torch.cuda.max_memory_allocated.html) to measure the peak GPU memory allocation.
```python
bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
```
**Output**:
```bash
29.0260648727417
```
Close enough to our back-of-the-envelope computation! We can see the number is not exactly correct as going from bytes to kilobytes requires a multiplication of 1024 instead of 1000. Therefore the back-of-the-envelope formula can also be understood as an "at most X GB" computation.
Note that if we had tried to run the model in full float32 precision, a whopping 64 GB of VRAM would have been required.
> Almost all models are trained in bfloat16 nowadays, there is no reason to run the model in full float32 precision if [your GPU supports bfloat16](https://discuss.pytorch.org/t/bfloat16-native-support/117155/5). Float32 won't give better inference results than the precision that was used to train the model.
If you are unsure in which format the model weights are stored on the Hub, you can always look into the checkpoint's config under `"torch_dtype"`, *e.g.* [here](https://huggingface.co/meta-llama/Llama-2-7b-hf/blob/6fdf2e60f86ff2481f2241aaee459f85b5b0bbb9/config.json#L21). It is recommended to set the model to the same precision type as written in the config when loading with `from_pretrained(..., torch_dtype=...)` except when the original type is float32, in which case one can use either `float16` or `bfloat16` for inference.
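If you prefer to check this programmatically, a minimal sketch using `AutoConfig` is shown below (the Llama 2 repository is gated, so this assumes you have been granted access to the checkpoint):
```python
from transformers import AutoConfig

config = AutoConfig.from_pretrained("meta-llama/Llama-2-7b-hf")
print(config.torch_dtype)  # the dtype stored in the checkpoint's config.json, e.g. torch.float16
```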
Let's define a `flush(...)` function to free all allocated memory so that we can accurately measure the peak allocated GPU memory.
```python
del pipe
del model
import gc
import torch
def flush():
gc.collect()
torch.cuda.empty_cache()
torch.cuda.reset_peak_memory_stats()
```
Let's call it now for the next experiment.
```python
flush()
```
In recent versions of the Accelerate library, you can also use a utility method called `release_memory()`
```python
from accelerate.utils import release_memory
# ...
release_memory(model)
```
Now what if your GPU does not have 32 GB of VRAM? It has been found that model weights can be quantized to 8-bit or 4-bits without a significant loss in performance (see [Dettmers et al.](https://arxiv.org/abs/2208.07339)).
Models can even be quantized to 3 or 2 bits with an acceptable loss in performance as shown in the recent [GPTQ paper](https://arxiv.org/abs/2210.17323) 🤯.
Without going into too many details, quantization schemes aim at reducing the precision of weights while trying to keep the model's inference results as accurate as possible (*a.k.a* as close as possible to bfloat16).
Note that quantization works especially well for text generation since all we care about is choosing the *set of most likely next tokens* and don't really care about the exact values of the next token *logit* distribution.
All that matters is that the next token *logit* distribution stays roughly the same so that an `argmax` or `topk` operation gives the same results.
There are various quantization techniques, which we won't discuss in detail here, but in general, all quantization techniques work as follows:
- 1. Quantize all weights to the target precision
- 2. Load the quantized weights, and pass the input sequence of vectors in bfloat16 precision
- 3. Dynamically dequantize weights to bfloat16 to perform the computation with their input vectors in bfloat16 precision
In a nutshell, this means that *inputs-weight matrix* multiplications, with \\( X \\) being the *inputs*, \\( W \\) being a weight matrix and \\( Y \\) being the output:
$$ Y = X * W $$
are changed to
$$ Y = X * \text{dequantize}(W) $$
for every matrix multiplication. Dequantization and re-quantization is performed sequentially for all weight matrices as the inputs run through the network graph.
Therefore, inference time is often **not** reduced when using quantized weights, but rather increases.
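To make the scheme concrete, here is a purely illustrative toy example of per-tensor absmax 8-bit quantization and a dequantized matmul. This is a didactic sketch, not the actual bitsandbytes kernels (which use vector-wise quantization, outlier handling and fused CUDA code):
```python
import torch

def quantize_absmax_int8(w: torch.Tensor):
    # store the weights as int8 plus one float scale per tensor
    scale = w.abs().max() / 127
    return torch.round(w / scale).to(torch.int8), scale

def dequantized_matmul(x: torch.Tensor, w_int8: torch.Tensor, scale: torch.Tensor):
    # dequantize back to the compute dtype right before the matmul: Y = X * dequantize(W)
    return x @ (w_int8.to(x.dtype) * scale)

w = torch.randn(1024, 1024)
w_int8, scale = quantize_absmax_int8(w)
x = torch.randn(1, 1024, dtype=torch.bfloat16)
y = dequantized_matmul(x, w_int8, scale.to(torch.bfloat16))
```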
Enough theory, let's give it a try! To quantize the weights with Transformers, you need to make sure that
the [`bitsandbytes`](https://github.com/TimDettmers/bitsandbytes) library is installed.
```bash
!pip install bitsandbytes
```
We can then load models in 8-bit quantization by simply adding a `load_in_8bit=True` flag to `from_pretrained`.
```python
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_8bit=True, pad_token_id=0)
```
Now, let's run our example again and measure the memory usage.
```python
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
result = pipe(prompt, max_new_tokens=60)[0]["generated_text"][len(prompt):]
result
```
**Output**:
```
Here is a Python function that transforms bytes to Giga bytes:\n\n```python\ndef bytes_to_giga_bytes(bytes):\n return bytes / 1024 / 1024 / 1024\n```\n\nThis function takes a single
```
Nice, we're getting the same result as before, so no loss in accuracy! Let's look at how much memory was used this time.
```python
bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
```
**Output**:
```
15.219234466552734
```
Significantly less! We're down to just a bit over 15 GBs and could therefore run this model on consumer GPUs like the 4090.
We're seeing a very nice gain in memory efficiency and more or less no degradation to the model's output. However, we can also notice a slight slow-down during inference.
We delete the models and flush the memory again.
```python
del model
del pipe
```
```python
flush()
```
Let's see what peak GPU memory consumption 4-bit quantization gives. Quantizing the model to 4-bit can be done with the same API as before - this time by passing `load_in_4bit=True` instead of `load_in_8bit=True`.
```python
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", load_in_4bit=True, low_cpu_mem_usage=True, pad_token_id=0)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
result = pipe(prompt, max_new_tokens=60)[0]["generated_text"][len(prompt):]
result
```
**Output**:
```
Here is a Python function that transforms bytes to Giga bytes:\n\n```\ndef bytes_to_gigabytes(bytes):\n return bytes / 1024 / 1024 / 1024\n```\n\nThis function takes a single argument
```
We're almost seeing the same output text as before - only the `python` is missing right before the code snippet. Let's see how much memory was required.
```python
bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
```
**Output**:
```
9.543574333190918
```
Just 9.5GB! That's really not a lot for a >15 billion parameter model.
While we see very little degradation in accuracy for our model here, 4-bit quantization can in practice often lead to different results compared to 8-bit quantization or full `bfloat16` inference. It is up to the user to try it out.
Also note that inference here was again a bit slower compared to 8-bit quantization which is due to the more aggressive quantization method used for 4-bit quantization leading to \\( \text{quantize} \\) and \\( \text{dequantize} \\) taking longer during inference.
```python
del model
del pipe
```
```python
flush()
```
Overall, we saw that running OctoCoder in 8-bit precision reduced the required GPU VRAM from roughly 32 GB to only 15 GB, and running the model in 4-bit precision further reduces the required GPU VRAM to just a bit over 9 GB.
4-bit quantization allows the model to be run on GPUs such as RTX3090, V100, and T4 which are quite accessible for most people.
For more information on quantization and to see how one can quantize models to require even less GPU VRAM memory than 4-bit, we recommend looking into the [`AutoGPTQ`](https://huggingface.co/docs/transformers/main/en/main_classes/quantization#autogptq-integration) implementation.
> As a conclusion, it is important to remember that model quantization trades improved memory efficiency against accuracy and in some cases inference time.
If GPU memory is not a constraint for your use case, there is often no need to look into quantization. However, many GPUs simply can't run LLMs without quantization methods and in this case, 4-bit and 8-bit quantization schemes are extremely useful tools.
For more in-detail usage information, we strongly recommend taking a look at the [Transformers Quantization Docs](https://huggingface.co/docs/transformers/main_classes/quantization#general-usage).
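For finer-grained control over the 4-bit scheme, recent Transformers releases also expose a `BitsAndBytesConfig`. A hedged sketch is shown below; the argument names assume a recent `transformers`/`bitsandbytes` version and may differ in older releases:
```python
import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",              # normal-float 4-bit data type
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for the dequantized matmuls
)
model = AutoModelForCausalLM.from_pretrained(
    "bigcode/octocoder", quantization_config=quantization_config, device_map="auto"
)
```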
Next, let's look into how we can improve computational and memory efficiency by using better algorithms and an improved model architecture.
# 2. Flash Attention
Today's top-performing LLMs share more or less the same fundamental architecture that consists of feed-forward layers, activation layers, layer normalization layers, and most crucially, self-attention layers.
Self-attention layers are central to Large Language Models (LLMs) in that they enable the model to understand the contextual relationships between input tokens.
However, the peak GPU memory consumption for self-attention layers grows *quadratically* both in compute and memory complexity with the number of input tokens (also called *sequence length*) that we denote in the following by \\( N \\) .
While this is not really noticeable for shorter input sequences (of up to 1000 input tokens), it becomes a serious problem for longer input sequences (at around 16000 input tokens).
Let's take a closer look. The formula to compute the output \\( \mathbf{O} \\) of a self-attention layer for an input \\( \mathbf{X} \\) of length \\( N \\) is:
$$ \textbf{O} = \text{Attn}(\mathbf{X}) = \mathbf{V} \times \text{Softmax}(\mathbf{QK}^T) \text{ with } \mathbf{Q} = \mathbf{W}_q \mathbf{X}, \mathbf{V} = \mathbf{W}_v \mathbf{X}, \mathbf{K} = \mathbf{W}_k \mathbf{X} $$
\\( \mathbf{X} = (\mathbf{x}_1, ... \mathbf{x}_{N}) \\) is thereby the input sequence to the attention layer. The projections \\( \mathbf{Q} \\) and \\( \mathbf{K} \\) will each consist of \\( N \\) vectors resulting in the \\( \mathbf{QK}^T \\) being of size \\( N^2 \\) .
LLMs usually have multiple attention heads, thus doing multiple self-attention computations in parallel.
Assuming the LLM has 40 attention heads and runs in bfloat16 precision, we can calculate the memory requirement to store the \\( \mathbf{QK^T} \\) matrices to be \\( 40 * 2 * N^2 \\) bytes. For \\( N=1000 \\) only around 75 MB of VRAM are needed, however, for \\( N=16000 \\) we would need 19 GB of VRAM, and for \\( N=100,000 \\) we would need almost 1TB just to store the \\( \mathbf{QK}^T \\) matrices.
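These numbers follow directly from the \\( 40 * 2 * N^2 \\) formula; a quick sanity check in Python:
```python
def qk_t_memory_gib(n_heads: int, seq_len: int, bytes_per_value: int = 2) -> float:
    # memory needed to store the QK^T score matrices of all heads in bfloat16
    return n_heads * bytes_per_value * seq_len**2 / 1024**3

for n in (1_000, 16_000, 100_000):
    print(f"N={n}: {qk_t_memory_gib(40, n):.2f} GiB")
# N=1000: 0.07 GiB   (~75 MB)
# N=16000: 19.07 GiB
# N=100000: 745.06 GiB (almost 1 TB)
```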
Long story short, the default self-attention algorithm quickly becomes prohibitively memory-expensive for large input contexts.
As LLMs improve in text comprehension and generation, they are applied to increasingly complex tasks. While models once handled the translation or summarization of a few sentences, they now manage entire pages, demanding the capability to process extensive input lengths.
How can we get rid of the exorbitant memory requirements for large input lengths? We need a new way to compute the self-attention mechanism that gets rid of the \\( QK^T \\) matrix. [Tri Dao et al.](https://arxiv.org/abs/2205.14135) developed exactly such a new algorithm and called it **Flash Attention**.
In a nutshell, Flash Attention breaks the \\( \mathbf{V} \times \text{Softmax}(\mathbf{QK}^T) \\) computation apart and instead computes smaller chunks of the output by iterating over multiple softmax computation steps:
$$ \textbf{O}_i \leftarrow s^a_{ij} * \textbf{O}_i + s^b_{ij} * \mathbf{V}_{j} \times \text{Softmax}(\mathbf{QK}^T_{i,j}) \text{ for multiple } i, j \text{ iterations} $$
with \\( s^a_{ij} \\) and \\( s^b_{ij} \\) being some softmax normalization statistics that need to be recomputed for every \\( i \\) and \\( j \\) .
Please note that the whole Flash Attention is a bit more complex and is greatly simplified here as going in too much depth is out of scope for this guide. The reader is invited to take a look at the well-written [Flash Attention paper](https://arxiv.org/abs/2205.14135) for more details.
The main takeaway here is:
> By keeping track of softmax normalization statistics and by using some smart mathematics, Flash Attention gives **numerically identical** outputs compared to the default self-attention layer at a memory cost that only increases linearly with \\( N \\) .
Looking at the formula, one would intuitively say that Flash Attention must be much slower compared to the default self-attention formula as more computation needs to be done. Indeed, Flash Attention requires more FLOPs compared to normal attention as the softmax normalization statistics have to constantly be recomputed (see the [paper](https://arxiv.org/abs/2205.14135) for more details if interested).
> However, Flash Attention is much faster in inference compared to default attention which comes from its ability to significantly reduce the demands on the slower, high-bandwidth memory of the GPU (VRAM), focusing instead on the faster on-chip memory (SRAM).
Essentially, Flash Attention makes sure that all intermediate write and read operations can be done using the fast *on-chip* SRAM memory instead of having to access the slower VRAM memory to compute the output vector \\( \mathbf{O} \\) .
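To make the idea of iterating over chunks concrete, below is a minimal, purely illustrative sketch of the online-softmax trick for a single query vector. It is not the real Flash Attention kernel, which is a fused CUDA implementation that also tiles over queries and keeps the chunk statistics in SRAM; the \\( 1/\sqrt{d} \\) scaling is omitted to match the simplified formula above:
```python
import torch

def attention_row_chunked(q, K, V, chunk_size=128):
    """Attention output for one query q (d,) over keys/values K, V (N, d), processed chunk by chunk."""
    m = torch.tensor(float("-inf"))   # running max of the scores seen so far
    l = torch.tensor(0.0)             # running softmax denominator
    o = torch.zeros_like(q)           # running (still unnormalized) output
    for start in range(0, K.shape[0], chunk_size):
        k_blk, v_blk = K[start:start + chunk_size], V[start:start + chunk_size]
        s = k_blk @ q                           # scores of this chunk
        m_new = torch.maximum(m, s.max())
        alpha = torch.exp(m - m_new)            # rescale previous statistics to the new max
        p = torch.exp(s - m_new)
        o = alpha * o + p @ v_blk
        l = alpha * l + p.sum()
        m = m_new
    return o / l

q, K, V = torch.randn(64), torch.randn(1024, 64), torch.randn(1024, 64)
assert torch.allclose(attention_row_chunked(q, K, V), torch.softmax(K @ q, dim=0) @ V, atol=1e-4)
```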
In practice, there is currently absolutely no reason to **not** use Flash Attention if available. The algorithm gives mathematically the same outputs, and is both faster and more memory-efficient.
Let's look at a practical example.
Our OctoCoder model now gets a significantly longer input prompt which includes a so-called *system prompt*. System prompts are used to steer the LLM into a better assistant that is tailored to the users' task.
In the following, we use a system prompt that will make OctoCoder a better coding assistant.
```python
system_prompt = """Below are a series of dialogues between various people and an AI technical assistant.
The assistant tries to be helpful, polite, honest, sophisticated, emotionally aware, and humble but knowledgeable.
The assistant is happy to help with code questions and will do their best to understand exactly what is needed.
It also tries to avoid giving false or misleading information, and it caveats when it isn't entirely sure about the right answer.
That said, the assistant is practical really does its best, and doesn't let caution get too much in the way of being useful.
The Starcoder models are a series of 15.5B parameter models trained on 80+ programming languages from The Stack (v1.2) (excluding opt-out requests).
The model uses Multi Query Attention, was trained using the Fill-in-the-Middle objective, and with 8,192 tokens context window for a trillion tokens of heavily deduplicated data.
-----
Question: Write a function that takes two lists and returns a list that has alternating elements from each input list.
Answer: Sure. Here is a function that does that.
def alternating(list1, list2):
results = []
for i in range(len(list1)):
results.append(list1[i])
results.append(list2[i])
return results
Question: Can you write some test cases for this function?
Answer: Sure, here are some tests.
assert alternating([10, 20, 30], [1, 2, 3]) == [10, 1, 20, 2, 30, 3]
assert alternating([True, False], [4, 5]) == [True, 4, False, 5]
assert alternating([], []) == []
Question: Modify the function so that it returns all input elements when the lists have uneven length. The elements from the longer list should be at the end.
Answer: Here is the modified function.
def alternating(list1, list2):
results = []
for i in range(min(len(list1), len(list2))):
results.append(list1[i])
results.append(list2[i])
if len(list1) > len(list2):
results.extend(list1[i+1:])
else:
results.extend(list2[i+1:])
return results
-----
"""
```
For demonstration purposes, we duplicate the system prompt ten times so that the input length is long enough to observe Flash Attention's memory savings.
We append the original text prompt `"Question: Please write a function in Python that transforms bytes to Giga bytes.\n\nAnswer: Here"`
```python
long_prompt = 10 * system_prompt + prompt
```
We instantiate our model again in bfloat16 precision.
```python
model = AutoModelForCausalLM.from_pretrained("bigcode/octocoder", torch_dtype=torch.bfloat16, device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("bigcode/octocoder")
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
```
Let's now run the model just like before *without Flash Attention* and measure the peak GPU memory requirement and inference time.
```python
import time
start_time = time.time()
result = pipe(long_prompt, max_new_tokens=60)[0]["generated_text"][len(long_prompt):]
print(f"Generated in {time.time() - start_time} seconds.")
result
```
**Output**:
```
Generated in 10.96854019165039 seconds.
Sure. Here is a function that does that.\n\ndef bytes_to_giga(bytes):\n return bytes / 1024 / 1024 / 1024\n\nAnswer: Sure. Here is a function that does that.\n\ndef
```
We're getting the same output as before, however this time, the model repeats the answer multiple times until it hits the 60-token cut-off. This is not surprising as we've repeated the system prompt ten times for demonstration purposes, thereby cueing the model to repeat itself.
**Note** that the system prompt should not be repeated ten times in real-world applications - one time is enough!
Let's measure the peak GPU memory requirement.
```python
bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
```
**Output**:
```bash
37.668193340301514
```
As we can see the peak GPU memory requirement is now significantly higher than in the beginning, which is largely due to the longer input sequence. The generation itself also takes almost 11 seconds now.
We call `flush()` to free GPU memory for our next experiment.
```python
flush()
```
For comparison, let's run the same function, but enable Flash Attention instead.
To do so, we convert the model to [BetterTransformer](https://huggingface.co/docs/optimum/bettertransformer/overview), thereby enabling PyTorch's [SDPA self-attention](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention), which in turn is based on Flash Attention.
```python
model.to_bettertransformer()
```
Now we run the exact same code snippet as before and under the hood Transformers will make use of Flash Attention.
```py
start_time = time.time()
with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
result = pipe(long_prompt, max_new_tokens=60)[0]["generated_text"][len(long_prompt):]
print(f"Generated in {time.time() - start_time} seconds.")
result
```
**Output**:
```
Generated in 3.0211617946624756 seconds.
Sure. Here is a function that does that.\n\ndef bytes_to_giga(bytes):\n return bytes / 1024 / 1024 / 1024\n\nAnswer: Sure. Here is a function that does that.\n\ndef
```
We're getting the exact same result as before, but can observe a very significant speed-up thanks to Flash Attention.
Let's measure the memory consumption one last time.
```python
bytes_to_giga_bytes(torch.cuda.max_memory_allocated())
```
**Output**:
```
32.617331981658936
```
With Flash Attention we're much closer to the original 29GB peak GPU memory from the beginning: passing the very long input sequence now only requires roughly 3.6 GB more GPU memory than passing the short input sequence did at the start.
```py
flush()
```
For more information on how to use Flash Attention, please have a look at [this doc page](https://huggingface.co/docs/transformers/v4.34.0/en/perf_infer_gpu_one#flash-attention-2).
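In newer Transformers releases than the one used for this guide, Flash Attention 2 can also be requested directly at load time. A hedged sketch is shown below; it assumes a recent `transformers` version, the `flash-attn` package being installed and a supported GPU:
```python
model = AutoModelForCausalLM.from_pretrained(
    "bigcode/octocoder",
    torch_dtype=torch.bfloat16,
    device_map="auto",
    attn_implementation="flash_attention_2",
)
```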
## 3. Architectural Innovations
So far we have looked into improving computational and memory efficiency by:
- Casting the weights to a lower precision format
- Replacing the self-attention algorithm with a more memory- and compute efficient version
Let's now look into how we can change the architecture of an LLM so that it is most effective and efficient for tasks that require long text inputs, *e.g.*:
- Retrieval-augmented question answering,
- Summarization,
- Chat
Note that *chat* not only requires the LLM to handle long text inputs, but it also necessitates that the LLM is able to efficiently handle the back-and-forth dialogue between user and assistant (such as ChatGPT).
Once trained, the fundamental LLM architecture is difficult to change, so it is important to make considerations about the LLM's tasks beforehand and accordingly optimize the model's architecture.
There are two important components of the model architecture that quickly become memory and/or performance bottlenecks for large input sequences.
- The positional embeddings
- The key-value cache
Let's go over each component in more detail
### 3.1 Improving positional embeddings of LLMs
Self-attention puts each token in relation to all other tokens.
As an example, the \\( \text{Softmax}(\mathbf{QK}^T) \\) matrix of the text input sequence *"Hello", "I", "love", "you"* could look as follows:
![](/blog/assets/163_optimize_llm/self_attn_tokens.png)
Each word token is assigned a probability mass with which it attends to all other word tokens and is therefore put into relation with all of them. E.g. the word *"love"* attends to the word *"Hello"* with 5%, to *"I"* with 30%, and to itself with 65%.
An LLM based on self-attention, but without position embeddings, would have great difficulty understanding the positions of the text inputs relative to each other.
This is because the probability score computed by \\( \mathbf{QK}^T \\) relates each word token to each other word token in \\( O(1) \\) computations regardless of their relative positional distance to each other.
Therefore, for the LLM without position embeddings each token appears to have the same distance to all other tokens, *e.g.* differentiating between *"Hello I love you"* and *"You love I hello"* would be very challenging.
For the LLM to understand sentence order, an additional *cue* is needed and is usually applied in the form of *positional encodings* (or also called *positional embeddings*).
Positional encodings encode the position of each token into a numerical representation that the LLM can leverage to better understand sentence order.
The authors of the [*Attention Is All You Need*](https://arxiv.org/abs/1706.03762) paper introduced sinusoidal positional embeddings \\( \mathbf{P} = \mathbf{p}_1, \ldots, \mathbf{p}_N \\), where each vector \\( \mathbf{p}_i \\) is computed as a sinusoidal function of its position \\( i \\) .
The positional encodings are then simply added to the input sequence vectors \\( \mathbf{\hat{X}} = \mathbf{\hat{x}}_1, \ldots, \mathbf{\hat{x}}_N \\) = \\( \mathbf{x}_1 + \mathbf{p}_1, \ldots, \mathbf{x}_N + \mathbf{p}_N \\) thereby cueing the model to better learn sentence order.
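A minimal sketch of such sinusoidal encodings (following the formulation of the original paper, and assuming an even hidden dimension) could look like this:
```python
import torch

def sinusoidal_positional_encodings(seq_len: int, dim: int) -> torch.Tensor:
    """P of shape (seq_len, dim): sin on even dimensions, cos on odd ones."""
    positions = torch.arange(seq_len, dtype=torch.float32).unsqueeze(1)                 # (seq_len, 1)
    inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))    # (dim / 2,)
    pe = torch.zeros(seq_len, dim)
    pe[:, 0::2] = torch.sin(positions * inv_freq)
    pe[:, 1::2] = torch.cos(positions * inv_freq)
    return pe

# the encodings are simply added to the embedded inputs:
# hidden_states = input_embeddings + sinusoidal_positional_encodings(seq_len, hidden_dim)
```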
Instead of using fixed position embeddings, others (such as [Devlin et al.](https://arxiv.org/abs/1810.04805)) used learned positional encodings for which the positional embeddings
\\( \mathbf{P} \\) are learned during training.
Sinusoidal and learned position embeddings used to be the predominant methods to encode sentence order into LLMs, but a couple of problems related to these positional encodings were found:
1. Sinusoidal and learned position embeddings are both absolute positional embeddings, *i.e.* encoding a unique embedding for each position id: \\( 0, \ldots, N \\) . As shown by [Huang et al.](https://arxiv.org/abs/2009.13658) and [Su et al.](https://arxiv.org/abs/2104.09864), absolute positional embeddings lead to poor LLM performance for long text inputs. For long text inputs, it is advantageous if the model learns the relative positional distance input tokens have to each other instead of their absolute position.
2. When using learned position embeddings, the LLM has to be trained on a fixed input length \\( N \\), which makes it difficult to extrapolate to an input length longer than what it was trained on.
Recently, relative positional embeddings that can tackle the above mentioned problems have become more popular, most notably:
- [Rotary Position Embedding (RoPE)](https://arxiv.org/abs/2104.09864)
- [ALiBi](https://arxiv.org/abs/2108.12409)
Both *RoPE* and *ALiBi* argue that it's best to cue the LLM about sentence order directly in the self-attention algorithm as it's there that word tokens are put into relation with each other. More specifically, sentence order should be cued by modifying the \\( \mathbf{QK}^T \\) computation.
Without going into too many details, *RoPE* notes that positional information can be encoded into query-key pairs, *e.g.* \\( \mathbf{q}_i \\) and \\( \mathbf{x}_j \\) by rotating each vector by an angle \\( \theta * i \\) and \\( \theta * j \\) respectively with \\( i, j \\) describing each vector's sentence position:
$$ \mathbf{\hat{q}}_i^T \mathbf{\hat{x}}_j = \mathbf{{q}}_i^T \mathbf{R}_{\theta, i -j} \mathbf{{x}}_j. $$
\\( \mathbf{R}_{\theta, i - j} \\) thereby represents a rotational matrix. \\( \theta \\) is *not* learned during training, but instead set to a pre-defined value that depends on the maximum input sequence length during training.
> By doing so, the probability score between \\( \mathbf{q}_i \\) and \\( \mathbf{q}_j \\) is only affected if \\( i \ne j \\) and solely depends on the relative distance \\( i - j \\) regardless of each vector's specific positions \\( i \\) and \\( j \\) .
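A heavily simplified sketch of the rotation is shown below; it acts on consecutive 2-dimensional slices of a vector, and the actual implementations in e.g. Llama differ in layout details:
```python
import torch

def rope_rotate(x: torch.Tensor, position: int, base: float = 10000.0) -> torch.Tensor:
    """Rotate consecutive pairs of dimensions of x (dim,) by position-dependent angles."""
    dim = x.shape[-1]
    theta = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32) / dim))  # one frequency per 2D pair
    angles = position * theta
    cos, sin = torch.cos(angles), torch.sin(angles)
    x1, x2 = x[0::2], x[1::2]
    rotated = torch.empty_like(x)
    rotated[0::2] = x1 * cos - x2 * sin
    rotated[1::2] = x1 * sin + x2 * cos
    return rotated

# the score between a rotated query at position i and a rotated key at position j
# only depends on the relative distance i - j:
q_i, k_j = torch.randn(64), torch.randn(64)
score = rope_rotate(q_i, 5) @ rope_rotate(k_j, 2)  # same value as for positions (8, 5), (103, 100), ...
```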
*RoPE* is used in multiple of today's most important LLMs, such as:
- [**Falcon**](https://huggingface.co/tiiuae/falcon-40b)
- [**Llama**](https://arxiv.org/abs/2302.13971)
- [**PaLM**](https://arxiv.org/abs/2204.02311)
As an alternative, *ALiBi* proposes a much simpler relative position encoding scheme. The relative distance that input tokens have to each other is added as a negative integer scaled by a pre-defined value `m` to each query-key entry of the \\( \mathbf{QK}^T \\) matrix right before the softmax computation.
![](/blog/assets/163_optimize_llm/alibi.png)
As shown in the [ALiBi](https://arxiv.org/abs/2108.12409) paper, this simple relative positional encoding allows the model to retain a high performance even at very long text input sequences; a simplified sketch of the bias matrix follows below.
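Here is a simplified sketch of the ALiBi bias for a single attention head; in the paper the per-head slopes `m` are a fixed geometric sequence rather than the arbitrary value used here:
```python
import torch

def alibi_bias(seq_len: int, m: float) -> torch.Tensor:
    """Lower-triangular bias that is added to QK^T before the softmax."""
    positions = torch.arange(seq_len)
    distance = positions.unsqueeze(0) - positions.unsqueeze(1)  # distance[i, j] = j - i
    return m * distance.clamp(max=0).float()                    # 0 on the diagonal, more negative further back

print(alibi_bias(4, m=0.5))
# the further a key lies in the past, the larger the negative bias:
# [[ 0.0,  0.0,  0.0,  0.0],
#  [-0.5,  0.0,  0.0,  0.0],
#  [-1.0, -0.5,  0.0,  0.0],
#  [-1.5, -1.0, -0.5,  0.0]]
```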
*ALiBi* is used in multiple of today's most important LLMs, such as:
- [**MPT**](https://huggingface.co/mosaicml/mpt-30b)
- [**BLOOM**](https://huggingface.co/bigscience/bloom)
Both *RoPE* and *ALiBi* position encodings can extrapolate to input lengths not seen during training, though it has been shown that extrapolation works much better out-of-the-box for *ALiBi* than for *RoPE*.
For ALiBi, one simply increases the values of the lower triangular position matrix to match the length of the input sequence.
For *RoPE*, keeping the same \\( \theta \\) that was used during training leads to poor results when passing text inputs much longer than those seen during training, *c.f* [Press et al.](https://arxiv.org/abs/2108.12409). However, the community has found a couple of effective tricks that adapt \\( \theta \\), thereby allowing *RoPE* position embeddings to work well for extrapolated text input sequences (see [here](https://github.com/huggingface/transformers/pull/24653)).
> Both RoPE and ALiBi are relative positional embeddings that are *not* learned during training, but instead are based on the following intuitions:
- Positional cues about the text inputs should be given directly to the \\( QK^T \\) matrix of the self-attention layer
- The LLM should be incentivized to learn positional cues that depend only on the *relative* distance between tokens rather than on their absolute positions
- The further text input tokens are from each other, the lower their query-key probability should be. Both RoPE and ALiBi lower the query-key probability of tokens far away from each other: RoPE by increasing the angle between the query-key vectors and thereby decreasing their vector product, ALiBi by adding large negative numbers to the vector product
In conclusion, LLMs that are intended to be deployed in tasks that require handling large text inputs are better trained with relative positional embeddings, such as RoPE and ALiBi. Also note that even if an LLM with RoPE and ALiBi has been trained only on a fixed length of say \\( N_1 = 2048 \\) it can still be used in practice with text inputs much larger than \\( N_1 \\), like \\( N_2 = 8192 > N_1 \\) by extrapolating the positional embeddings.
### 3.2 The key-value cache
Auto-regressive text generation with LLMs works by iteratively putting in an input sequence, sampling the next token, appending the next token to the input sequence, and continuing to do so until the LLM produces a token that signifies that the generation has finished.
Please have a look at [Transformer's Generate Text Tutorial](https://huggingface.co/docs/transformers/llm_tutorial#generate-text) to get a more visual explanation of how auto-regressive generation works.
Let's run a quick code snippet to show how auto-regressive generation works in practice. We will simply take the most likely next token via `torch.argmax`.
```python
input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda")
for _ in range(5):
next_logits = model(input_ids)["logits"][:, -1:]
next_token_id = torch.argmax(next_logits,dim=-1)
input_ids = torch.cat([input_ids, next_token_id], dim=-1)
print("shape of input_ids", input_ids.shape)
generated_text = tokenizer.batch_decode(input_ids[:, -5:])
generated_text
```
**Output**:
```
shape of input_ids torch.Size([1, 21])
shape of input_ids torch.Size([1, 22])
shape of input_ids torch.Size([1, 23])
shape of input_ids torch.Size([1, 24])
shape of input_ids torch.Size([1, 25])
[' Here is a Python function']
```
As we can see, every iteration increases the text input by the just-sampled token.
With very few exceptions, LLMs are trained using the [causal language modeling objective](https://huggingface.co/docs/transformers/tasks/language_modeling#causal-language-modeling) and therefore mask the upper triangle matrix of the attention score - this is why in the two diagrams above the attention scores are left blank (*a.k.a* have 0 probability). For a quick recap on causal language modeling you can refer to the [*Illustrated Self Attention blog*](https://jalammar.github.io/illustrated-gpt2/#part-2-illustrated-self-attention).
As a consequence, tokens *never* depend on later tokens, more specifically the \\( \mathbf{q}_i \\) vector is never put in relation with any key, values vectors \\( \mathbf{k}_j, \mathbf{v}_j \\) if \\( j > i \\) . Instead \\( \mathbf{q}_i \\) only attends to previous key-value vectors \\( \mathbf{k}_{m < i}, \mathbf{v}_{m < i} \text{ , for } m \in \{0, \ldots i - 1\} \\). In order to reduce unnecessary computation, one can therefore cache each layer's key-value vectors for all previous timesteps.
In the following, we will tell the LLM to make use of the key-value cache by retrieving and forwarding it for each forward pass.
In Transformers, we can retrieve the key-value cache by passing the `use_cache` flag to the `forward` call and can then pass it with the current token.
```python
past_key_values = None # past_key_values is the key-value cache
generated_tokens = []
next_token_id = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda")
for _ in range(5):
next_logits, past_key_values = model(next_token_id, past_key_values=past_key_values, use_cache=True).to_tuple()
next_logits = next_logits[:, -1:]
next_token_id = torch.argmax(next_logits, dim=-1)
print("shape of input_ids", next_token_id.shape)
print("length of key-value cache", len(past_key_values[0][0])) # past_key_values are of shape [num_layers, 0 for k, 1 for v, batch_size, length, hidden_dim]
generated_tokens.append(next_token_id.item())
generated_text = tokenizer.batch_decode(generated_tokens)
generated_text
```
**Output**:
```
shape of input_ids torch.Size([1, 1])
length of key-value cache 20
shape of input_ids torch.Size([1, 1])
length of key-value cache 21
shape of input_ids torch.Size([1, 1])
length of key-value cache 22
shape of input_ids torch.Size([1, 1])
length of key-value cache 23
shape of input_ids torch.Size([1, 1])
length of key-value cache 24
[' Here', ' is', ' a', ' Python', ' function']
```
As one can see, when using the key-value cache the text input tokens are *not* increased in length, but remain a single input vector. The length of the key-value cache on the other hand is increased by one at every decoding step.
> Making use of the key-value cache means that the \\( \mathbf{QK}^T \\) is essentially reduced to \\( \mathbf{q}_c\mathbf{K}^T \\) with \\( \mathbf{q}_c \\) being the query projection of the currently passed input token which is *always* just a single vector.
Using the key-value cache has two advantages:
- Significant increase in computational efficiency as fewer computations are performed compared to computing the full \\( \mathbf{QK}^T \\) matrix. This leads to an increase in inference speed
- The maximum required memory is not increased quadratically with the number of generated tokens, but only increases linearly.
> One should *always* make use of the key-value cache as it leads to identical results and a significant speed-up for longer input sequences. Transformers has the key-value cache enabled by default when making use of the text pipeline or the [`generate` method](https://huggingface.co/docs/transformers/main_classes/text_generation).
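If you want to see the effect yourself, a quick way is to time `generate` with and without the cache; this is a hedged sketch, and the measured speed-up will of course depend on hardware and sequence length:
```python
import time

input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"].to("cuda")

for use_cache in (True, False):
    start = time.time()
    model.generate(input_ids, max_new_tokens=100, use_cache=use_cache)
    print(f"use_cache={use_cache}: {time.time() - start:.2f} seconds")
```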
Note that the key-value cache is especially useful for applications such as chat where multiple passes of auto-regressive decoding are required. Let's look at an example.
```
User: How many people live in France?
Assistant: Roughly 75 million people live in France
User: And how many are in Germany?
Assistant: Germany has ca. 81 million inhabitants
```
In this chat, the LLM runs auto-regressive decoding twice:
- 1. The first time, the key-value cache is empty and the input prompt is `"User: How many people live in France?"` and the model auto-regressively generates the text `"Roughly 75 million people live in France"` while increasing the key-value cache at every decoding step.
- 2. The second time the input prompt is `"User: How many people live in France? \n Assistant: Roughly 75 million people live in France \n User: And how many in Germany?"`. Thanks to the cache, all key-value vectors for the first two sentences are already computed. Therefore the input prompt only consists of `"User: And how many in Germany?"`. While processing the shortened input prompt, its computed key-value vectors are concatenated to the key-value cache of the first decoding. The second Assistant's answer `"Germany has ca. 81 million inhabitants"` is then auto-regressively generated with the key-value cache consisting of encoded key-value vectors of `"User: How many people live in France? \n Assistant: Roughly 75 million people live in France \n User: And how many are in Germany?"`.
Two things should be noted here:
1. Keeping all the context is crucial for LLMs deployed in chat so that the LLM understands all the previous context of the conversation. E.g. for the example above the LLM needs to understand that the user refers to the population when asking `"And how many are in Germany"`.
2. The key-value cache is extremely useful for chat as it allows us to continuously grow the encoded chat history instead of having to re-encode the chat history again from scratch (as e.g. would be the case when using an encoder-decoder architecture).
There is however one catch. While the required peak memory for the \\( \mathbf{QK}^T \\) matrix is significantly reduced, holding the key-value cache in memory can become very memory expensive for long input sequences or multi-turn chat. Remember that the key-value cache needs to store the key-value vectors for all previous input vectors \\( \mathbf{x}_i \text{, for } i \in \{1, \ldots, c - 1\} \\) for all self-attention layers and for all attention heads.
Let's compute the number of float values that need to be stored in the key-value cache for the LLM `bigcode/octocoder` that we used before.
The number of float values amounts to two times the sequence length times the number of attention heads times the attention head dimension and times the number of layers.
Computing this for our LLM at a hypothetical input sequence length of 16000 gives:
```python
config = model.config
2 * 16_000 * config.n_layer * config.n_head * config.n_embd // config.n_head
```
**Output**:
```
7864320000
```
Roughly 8 billion float values! Storing 8 billion float values in `float16` precision requires around 15 GB of RAM which is circa half as much as the model weights themselves!
Researchers have proposed two methods that allow to significantly reduce the memory cost of storing the key-value cache:
1. [Multi-Query-Attention (MQA)](https://arxiv.org/abs/1911.02150)
Multi-Query-Attention was proposed in Noam Shazeer's *Fast Transformer Decoding: One Write-Head is All You Need* paper. As the title says, Noam found out that instead of using `n_head` key-value projection weights, one can use a single key-value projection weight pair that is shared across all attention heads without the model's performance significantly degrading.
> By using a single key-value projection weight pair, the key value vectors \\( \mathbf{k}_i, \mathbf{v}_i \\) have to be identical across all attention heads which in turn means that we only need to store 1 key-value projection pair in the cache instead of `n_head` ones.
As most LLMs use between 20 and 100 attention heads, MQA significantly reduces the memory consumption of the key-value cache. For the LLM used in this notebook we could therefore reduce the required memory consumption from 15 GB to less than 400 MB at an input sequence length of 16000.
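Given that `bigcode/octocoder` shares a single key-value head across all of its attention heads (as noted further below), the earlier cache computation shrinks roughly by a factor of `n_head`. Re-running the arithmetic with the same config attributes as in the previous snippet:
```python
config = model.config
mqa_cache_floats = 2 * 16_000 * config.n_layer * (config.n_embd // config.n_head)
print(f"{mqa_cache_floats} floats ≈ {2 * mqa_cache_floats / 1e6:.0f} MB in float16")
# 163840000 floats ≈ 328 MB in float16
```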
In addition to memory savings, MQA also leads to improved computational efficiency as explained in the following.
In auto-regressive decoding, large key-value vectors need to be reloaded, concatenated with the current key-value vector pair to be then fed into the \\( \mathbf{q}_c\mathbf{K}^T \\) computation at every step. For auto-regressive decoding, the required memory bandwidth for the constant reloading can become a serious time bottleneck. By reducing the size of the key-value vectors less memory needs to be accessed, thus reducing the memory bandwidth bottleneck. For more detail, please have a look at [Noam's paper](https://arxiv.org/abs/1911.02150).
The important part to understand here is that reducing the number of key-value attention heads to 1 only makes sense if a key-value cache is used. The peak memory consumption of the model for a single forward pass without key-value cache stays unchanged as every attention head still has a unique query vector so that each attention head still has a different \\( \mathbf{QK}^T \\) matrix.
MQA has seen wide adoption by the community and is now used by many of the most popular LLMs:
- [**Falcon**](https://huggingface.co/tiiuae/falcon-40b)
- [**PaLM**](https://arxiv.org/abs/2204.02311)
- [**MPT**](https://huggingface.co/mosaicml/mpt-30b)
- [**BLOOM**](https://huggingface.co/bigscience/bloom)
Also, the checkpoint used in this notebook - `bigcode/octocoder` - makes use of MQA.
2. [Grouped-Query-Attention (GQA)](https://arxiv.org/abs/2305.13245)
Grouped-Query-Attention, as proposed by Ainslie et al. from Google, found that using MQA can often lead to quality degradation compared to using vanilla multi-key-value head projections. The paper argues that more model performance can be kept by less drastically reducing the number of key-value head projection weights. Instead of using just a single key-value projection weight, `n < n_head` key-value projection weights should be used. By setting `n` to a value significantly smaller than `n_head`, such as 2, 4 or 8, almost all of the memory and speed gains from MQA can be kept while sacrificing less model capacity and thus arguably less performance.
Moreover, the authors of GQA found out that existing model checkpoints can be *uptrained* to have a GQA architecture with as little as 5% of the original pre-training compute. While 5% of the original pre-training compute can still be a massive amount, GQA *uptraining* allows existing checkpoints to be useful for longer input sequences.
GQA was only recently proposed, which is why there is less adoption at the time of writing this notebook.
The most notable application of GQA is [Llama-v2](https://huggingface.co/meta-llama/Llama-2-70b-hf).
> As a conclusion, it is strongly recommended to make use of either GQA or MQA if the LLM is deployed with auto-regressive decoding and is required to handle large input sequences as is the case for example for chat.
## Conclusion
The research community is constantly coming up with new, nifty ways to speed up inference time for ever-larger LLMs. As an example, one such promising research direction is [speculative decoding](https://arxiv.org/abs/2211.17192) where "easy tokens" are generated by smaller, faster language models and only "hard tokens" are generated by the LLM itself. Going into more detail is out of the scope of this notebook, but can be read upon in this [nice blog post](https://huggingface.co/blog/assisted-generation).
The reason massive LLMs such as GPT3/4, Llama-2-70b, Claude, PaLM can run so quickly in chat-interfaces such as [Hugging Face Chat](https://huggingface.co/chat/) or ChatGPT is in large part thanks to the above-mentioned improvements in precision, algorithms, and architecture.
Going forward, accelerators such as GPUs, TPUs, etc... will only get faster and allow for more memory, but one should nevertheless always make sure to use the best available algorithms and architectures to get the most bang for your buck 🤗

View File

@ -64,7 +64,7 @@ model.enable_cpu_offload()
Note that 🤗 Accelerate must be installed before using this feature. [Here's how to install it.](https://huggingface.co/docs/accelerate/basic_tutorials/install)
#### Combining optimization techniques
You can combine optimization techniques, and use CPU offload, half-precision and 🤗 Better Transformer all at once.

View File

@ -19,10 +19,10 @@ rendered properly in your Markdown viewer.
## Overview
Flan-UL2 is an encoder decoder model based on the T5 architecture. It uses the same configuration as the [UL2](ul2) model released earlier last year.
It was fine tuned using the "Flan" prompt tuning and dataset collection. Similar to `Flan-T5`, one can directly use FLAN-UL2 weights without finetuning the model:
According to the original blog here are the notable improvements:
- The original UL2 model was only trained with receptive field of 512, which made it non-ideal for N-shot prompting where N is large.
- The Flan-UL2 checkpoint uses a receptive field of 2048 which makes it more usable for few-shot in-context learning.
@ -53,4 +53,4 @@ The model is pretty heavy (~40GB in half precision) so if you just want to run t
## Inference
The inference protocol is exactly the same as any `T5` model, please have a look at the [T5's documentation page](t5) for more details.

View File

@ -1,115 +0,0 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Fuyu
## Overview
The Fuyu model was created by [ADEPT](https://www.adept.ai/blog/fuyu-8b), and authored by Rohan Bavishi, Erich Elsen, Curtis Hawthorne, Maxwell Nye, Augustus Odena, Arushi Somani, Sağnak Taşırlar.
The authors introduced Fuyu-8B, a decoder-only multimodal model based on the classic transformers architecture, with query and key normalization. A linear encoder is added to create multimodal embeddings from image inputs.
By treating image tokens like text tokens and using a special image-newline character, the model knows when an image line ends. Image positional embeddings are removed. This avoids the need for different training phases for various image resolutions. With 8 billion parameters and licensed under CC-BY-NC, Fuyu-8B is notable for its ability to handle both text and images, its impressive context size of 16K, and its overall performance.
<Tip warning={true}>
The `Fuyu` models were trained using `bfloat16`, but the original inference uses `float16`. The checkpoints uploaded on the hub use `torch_dtype = 'float16'` which will be
used by the `AutoModel` API to cast the checkpoints from `torch.float32` to `torch.float16`.
The `dtype` of the online weights is mostly irrelevant, unless you are using `torch_dtype="auto"` when initializing a model using `model = AutoModelForCausalLM.from_pretrained("path", torch_dtype = "auto")`. The reason is that the model will first be downloaded (using the `dtype` of the checkpoints online), then it will be cast to the default `dtype` of `torch` (becoming `torch.float32`). Users should specify the `torch_dtype` they want, and if they don't it will be `torch.float32`.
Finetuning the model in `float16` is not recommended and known to produce `nan`, as such the model should be fine-tuned in `bfloat16`.
</Tip>
Tips:
- To convert the model, you need to clone the original repository using `git clone https://github.com/persimmon-ai-labs/adept-inference`, then get the checkpoints:
```bash
git clone https://github.com/persimmon-ai-labs/adept-inference
wget path/to/fuyu-8b-model-weights.tar
tar -xvf fuyu-8b-model-weights.tar
python src/transformers/models/fuyu/convert_fuyu_weights_to_hf.py --input_dir /path/to/downloaded/fuyu/weights/ --output_dir /output/path \
--pt_model_path /path/to/fuyu_8b_release/iter_0001251/mp_rank_00/model_optim_rng.pt
--ada_lib_path /path/to/adept-inference
```
For the chat model:
```bash
wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_chat_model_release.tar
tar -xvf 8b_chat_model_release.tar
```
Then, the model can be loaded via:
```py
from transformers import FuyuConfig, FuyuForCausalLM
model_config = FuyuConfig()
model = FuyuForCausalLM.from_pretrained('/output/path', config=model_config)
```
Inputs need to be passed through a specific Processor to have the correct formats.
A processor requires an image_processor and a tokenizer. Hence, inputs can be loaded via:
```py
import io
import requests
from PIL import Image
from transformers import AutoTokenizer
from transformers.models.fuyu.processing_fuyu import FuyuProcessor
from transformers.models.fuyu.image_processing_fuyu import FuyuImageProcessor
tokenizer = AutoTokenizer.from_pretrained('adept-hf-collab/fuyu-8b')
image_processor = FuyuImageProcessor()
processor = FuyuProcessor(image_processor=image_processor, tokenizer=tokenizer)
text_prompt = "Generate a coco-style caption.\\n"
bus_image_url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
bus_image_pil = Image.open(io.BytesIO(requests.get(bus_image_url).content))
inputs_to_model = processor(text=text_prompt, images=bus_image_pil)
```
This model was contributed by [Molbap](https://huggingface.co/Molbap).
The original code can be found [here](https://github.com/persimmon-ai-labs/adept-inference).
- Fuyu uses a `sentencepiece` based tokenizer, with a `Unigram` model. It supports bytefallback, which is only available in `tokenizers==0.14.0` for the fast tokenizer.
The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece.
- The authors suggest to use the following prompt for image captioning: `f"Generate a coco-style caption.\\n"`
## FuyuConfig
[[autodoc]] FuyuConfig
## FuyuForCausalLM
[[autodoc]] FuyuForCausalLM
- forward
## FuyuImageProcessor
[[autodoc]] FuyuImageProcessor
- __call__
## FuyuProcessor
[[autodoc]] FuyuProcessor
- __call__

View File

@ -28,7 +28,7 @@ The abstract from the paper is the following:
As shown on the following figure, Jukebox is made of 3 `priors` which are decoder only models. They follow the architecture described in [Generating Long Sequences with Sparse Transformers](https://arxiv.org/abs/1904.10509), modified to support longer context length.
First, an autoencoder is used to encode the text lyrics. Next, the first (also called `top_prior`) prior attends to the last hidden states extracted from the lyrics encoder. The priors are linked to the previous priors respectively via an `AudioConditioner` module. The `AudioConditioner` upsamples the outputs of the previous prior to raw tokens at a certain audio frame per second resolution.
The metadata such as *artist, genre and timing* are passed to each prior, in the form of a start token and positional embedding for the timing data. The hidden states are mapped to the closest codebook vector from the VQVAE in order to convert them to raw audio.
![JukeboxModel](https://gist.githubusercontent.com/ArthurZucker/92c1acaae62ebf1b6a951710bdd8b6af/raw/c9c517bf4eff61393f6c7dec9366ef02bdd059a3/jukebox.svg)
@ -36,7 +36,7 @@ Tips:
- This model only supports inference. This is for a few reasons, mostly because it requires a crazy amount of memory to train. Feel free to open a PR and add what's missing to have a full integration with the Hugging Face trainer!
- This model is very slow, and takes 8h to generate a minute long audio using the 5b top prior on a V100 GPU. In order to automatically handle the device on which the model should execute, use `accelerate`.
- Contrary to the paper, the order of the priors goes from `0` to `1` as it felt more intuitive: we sample starting from `0`.
- Primed sampling (conditioning the sampling on raw audio) requires more memory than ancestral sampling and should be used with `fp16` set to `True`.
This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ).
The original code can be found [here](https://github.com/openai/jukebox).

View File

@ -55,7 +55,7 @@ These ready-to-use checkpoints can be downloaded and used via the HuggingFace Hu
>>> generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
>>> tokenizer.batch_decode(generated_ids)[0]
"The expected output"
"The expected outupt"
```
Raw weights for `Mistral-7B-v0.1` and `Mistral-7B-Instruct-v0.1` can be downloaded from:
@ -76,7 +76,7 @@ python src/transformers/models/mistral/convert_mistral_weights_to_hf.py \
You can then load the converted model from the `output/path`:
```python
from transformers import MistralForCausalLM, LlamaTokenizer
from transformers import MistralForCausalLM, LlamaTokenizer
tokenizer = LlamaTokenizer.from_pretrained("/output/path")
model = MistralForCausalLM.from_pretrained("/output/path")
@ -109,7 +109,7 @@ To load and run a model using Flash Attention 2, refer to the snippet below:
>>> generated_ids = model.generate(**model_inputs, max_new_tokens=100, do_sample=True)
>>> tokenizer.batch_decode(generated_ids)[0]
"The expected output"
"The expected outupt"
```
### Expected speedups

View File

@ -22,7 +22,7 @@ The MRA model was proposed in [Multi Resolution Analysis (MRA) for Approximate S
The abstract from the paper is the following:
*Transformers have emerged as a preferred model for many tasks in natural language processing and vision. Recent efforts on training and deploying Transformers more efficiently have identified many strategies to approximate the self-attention matrix, a key module in a Transformer architecture. Effective ideas include various prespecified sparsity patterns, low-rank basis expansions and combinations thereof. In this paper, we revisit classical Multiresolution Analysis (MRA) concepts such as Wavelets, whose potential value in this setting remains underexplored thus far. We show that simple approximations based on empirical feedback and design choices informed by modern hardware and implementation challenges, eventually yield a MRA-based approach for self-attention with an excellent performance profile across most criteria of interest. We undertake an extensive set of experiments and demonstrate that this multi-resolution scheme outperforms most efficient self-attention proposals and is favorable for both short and long sequences. Code is available at https://github.com/mlpen/mra-attention.*
*Transformers have emerged as a preferred model for many tasks in natural language processing and vision. Recent efforts on training and deploying Transformers more efficiently have identified many strategies to approximate the self-attention matrix, a key module in a Transformer architecture. Effective ideas include various prespecified sparsity patterns, low-rank basis expansions and combinations thereof. In this paper, we revisit classical Multiresolution Analysis (MRA) concepts such as Wavelets, whose potential value in this setting remains underexplored thus far. We show that simple approximations based on empirical feedback and design choices informed by modern hardware and implementation challenges, eventually yield a MRA-based approach for self-attention with an excellent performance profile across most criteria of interest. We undertake an extensive set of experiments and demonstrate that this multi-resolution scheme outperforms most efficient self-attention proposals and is favorable for both short and long sequences. Code is available at https://github.com/mlpen/mra-attention.*
This model was contributed by [novice03](https://huggingface.co/novice03).
The original code can be found [here](https://github.com/mlpen/mra-attention).

View File

@ -53,7 +53,7 @@ which means that tokens have less probability of being forwarded. Moreover, if a
states (kind of like a residual connection) while they are masked in `NLLB`'s top-2 routing mechanism.
## Generating with NLLB-MoE
The available checkpoints require around 350GB of storage. Make sure to use `accelerate` if you do not have enough RAM on your machine.
The available checkpoints require around 350GB of storage. Make sure to use `accelerate` if you do not have enough RAM on your machine.
While generating the target text, set the `forced_bos_token_id` to the target language id. The following
example shows how to translate English to French using the *facebook/nllb-200-distilled-600M* model.
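A minimal sketch of that example (using the `facebook/nllb-200-distilled-600M` checkpoint named above; the input sentence is illustrative) could look like:
```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M", src_lang="eng_Latn")
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")

inputs = tokenizer("Life is like a box of chocolates.", return_tensors="pt")

# Force the decoder to start with the French language token
translated_tokens = model.generate(
    **inputs,
    forced_bos_token_id=tokenizer.convert_tokens_to_ids("fra_Latn"),
    max_new_tokens=50,
)
print(tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0])
```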

View File

@ -29,17 +29,12 @@ Tips:
- The architecture of OWLv2 is identical to [OWL-ViT](owlvit), however the object detection head now also includes an objectness classifier, which predicts the (query-agnostic) likelihood that a predicted box contains an object (as opposed to background). The objectness score can be used to rank or filter predictions independently of text queries.
- Usage of OWLv2 is identical to [OWL-ViT](owlvit) with a new, updated image processor ([`Owlv2ImageProcessor`]).
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/owlv2_overview.png"
alt="drawing" width="600"/>
<small> OWLv2 high-level overview. Taken from the <a href="https://arxiv.org/abs/2306.09683">original paper</a>. </small>
This model was contributed by [nielsr](https://huggingface.co/nielsr).
The original code can be found [here](https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit).
## Usage
OWLv2 is, just like its predecessor [OWL-ViT](owlvit), a zero-shot text-conditioned object detection model. OWL-ViT uses [CLIP](clip) as its multi-modal backbone, with a ViT-like Transformer to get visual features and a causal language model to get the text features. To use CLIP for detection, OWL-ViT removes the final token pooling layer of the vision model and attaches a lightweight classification and box head to each transformer output token. Open-vocabulary classification is enabled by replacing the fixed classification layer weights with the class-name embeddings obtained from the text model. The authors first train CLIP from scratch and fine-tune it end-to-end with the classification and box heads on standard detection datasets using a bipartite matching loss. One or multiple text queries per image can be used to perform zero-shot text-conditioned object detection.
OWL-ViT is a zero-shot text-conditioned object detection model. OWL-ViT uses [CLIP](clip) as its multi-modal backbone, with a ViT-like Transformer to get visual features and a causal language model to get the text features. To use CLIP for detection, OWL-ViT removes the final token pooling layer of the vision model and attaches a lightweight classification and box head to each transformer output token. Open-vocabulary classification is enabled by replacing the fixed classification layer weights with the class-name embeddings obtained from the text model. The authors first train CLIP from scratch and fine-tune it end-to-end with the classification and box heads on standard detection datasets using a bipartite matching loss. One or multiple text queries per image can be used to perform zero-shot text-conditioned object detection.
[`Owlv2ImageProcessor`] can be used to resize (or rescale) and normalize images for the model and [`CLIPTokenizer`] is used to encode the text. [`Owlv2Processor`] wraps [`Owlv2ImageProcessor`] and [`CLIPTokenizer`] into a single instance to both encode the text and prepare the images. The following example shows how to perform object detection using [`Owlv2Processor`] and [`Owlv2ForObjectDetection`].
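A sketch of such an example (assuming the `google/owlv2-base-patch16-ensemble` checkpoint; the image URL and text queries are illustrative) might look like:
```python
import requests
import torch
from PIL import Image
from transformers import Owlv2Processor, Owlv2ForObjectDetection

processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
texts = [["a photo of a cat", "a photo of a dog"]]
inputs = processor(text=texts, images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)

# Convert raw outputs (logits and boxes) into (xmin, ymin, xmax, ymax) detections
target_sizes = torch.tensor([image.size[::-1]])
results = processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=0.1)

boxes, scores, labels = results[0]["boxes"], results[0]["scores"], results[0]["labels"]
for box, score, label in zip(boxes, scores, labels):
    box = [round(v, 2) for v in box.tolist()]
    print(f"Detected {texts[0][label]} with confidence {round(score.item(), 3)} at location {box}")
```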
@ -74,10 +69,6 @@ Detected a photo of a cat with confidence 0.614 at location [341.67, 17.54, 642.
Detected a photo of a cat with confidence 0.665 at location [6.75, 38.97, 326.62, 354.85]
```
## Resources
A demo notebook on using OWLv2 for zero- and one-shot (image-guided) object detection can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/OWLv2).
## Owlv2Config
[[autodoc]] Owlv2Config

View File

@ -24,13 +24,6 @@ The abstract from the paper is the following:
*Combining simple architectures with large-scale pre-training has led to massive improvements in image classification. For object detection, pre-training and scaling approaches are less well established, especially in the long-tailed and open-vocabulary setting, where training data is relatively scarce. In this paper, we propose a strong recipe for transferring image-text models to open-vocabulary object detection. We use a standard Vision Transformer architecture with minimal modifications, contrastive image-text pre-training, and end-to-end detection fine-tuning. Our analysis of the scaling properties of this setup shows that increasing image-level pre-training and model size yield consistent improvements on the downstream detection task. We provide the adaptation strategies and regularizations needed to attain very strong performance on zero-shot text-conditioned and one-shot image-conditioned object detection. Code and models are available on GitHub.*
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/owlvit_architecture.jpg"
alt="drawing" width="600"/>
<small> OWL-ViT architecture. Taken from the <a href="https://arxiv.org/abs/2205.06230">original paper</a>. </small>
This model was contributed by [adirik](https://huggingface.co/adirik). The original code can be found [here](https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit).
## Usage
OWL-ViT is a zero-shot text-conditioned object detection model. OWL-ViT uses [CLIP](clip) as its multi-modal backbone, with a ViT-like Transformer to get visual features and a causal language model to get the text features. To use CLIP for detection, OWL-ViT removes the final token pooling layer of the vision model and attaches a lightweight classification and box head to each transformer output token. Open-vocabulary classification is enabled by replacing the fixed classification layer weights with the class-name embeddings obtained from the text model. The authors first train CLIP from scratch and fine-tune it end-to-end with the classification and box heads on standard detection datasets using a bipartite matching loss. One or multiple text queries per image can be used to perform zero-shot text-conditioned object detection.
@ -68,9 +61,7 @@ Detected a photo of a cat with confidence 0.707 at location [324.97, 20.44, 640.
Detected a photo of a cat with confidence 0.717 at location [1.46, 55.26, 315.55, 472.17]
```
## Resources
A demo notebook on using OWL-ViT for zero- and one-shot (image-guided) object detection can be found [here](https://github.com/huggingface/notebooks/blob/main/examples/zeroshot_object_detection_with_owlvit.ipynb).
This model was contributed by [adirik](https://huggingface.co/adirik). The original code can be found [here](https://github.com/google-research/scenic/tree/main/scenic/projects/owl_vit).
## OwlViTConfig

View File

@ -48,8 +48,6 @@ The original code can be found [here](https://github.com/openai/whisper).
- get_special_tokens_mask
- create_token_type_ids_from_sequences
- save_vocabulary
- batch_decode
- decode
## WhisperTokenizerFast
@ -59,8 +57,6 @@ The original code can be found [here](https://github.com/openai/whisper).
- get_special_tokens_mask
- create_token_type_ids_from_sequences
- save_vocabulary
- batch_decode
- decode
## WhisperFeatureExtractor
@ -86,7 +82,6 @@ The original code can be found [here](https://github.com/openai/whisper).
[[autodoc]] WhisperForConditionalGeneration
- forward
- generate
## WhisperForAudioClassification

View File

@ -412,7 +412,8 @@ If you wish to normalize images as a part of the augmentation transformation, us
and `image_processor.image_std` values.
</Tip>
3. Then use 🤗 Datasets [`~datasets.Dataset.set_transform`] to apply the transforms on the fly:
3. Then use 🤗 Datasets [`set_transform`](https://huggingface.co/docs/datasets/process.html#format-transform) to apply the transforms on the fly:
```py
>>> dataset.set_transform(transforms)
```
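For reference, one possible definition of such a `transforms` function (a torchvision-based sketch that applies the normalization tip above; it assumes an `image_processor` loaded earlier in the guide and an `image` column in the dataset) is:
```py
from torchvision.transforms import Compose, Normalize, RandomResizedCrop, ToTensor

# Normalize with the image processor's statistics, as suggested in the tip above
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
size = (
    image_processor.size["shortest_edge"]
    if "shortest_edge" in image_processor.size
    else (image_processor.size["height"], image_processor.size["width"])
)
_pipeline = Compose([RandomResizedCrop(size), ToTensor(), normalize])


def transforms(examples):
    examples["pixel_values"] = [_pipeline(img.convert("RGB")) for img in examples["image"]]
    del examples["image"]
    return examples
```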

View File

@ -1,186 +0,0 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Knowledge Distillation for Computer Vision
[[open-in-colab]]
Knowledge distillation is a technique used to transfer knowledge from a larger, more complex model (teacher) to a smaller, simpler model (student). To distill knowledge from one model to another, we take a pre-trained teacher model trained on a certain task (image classification in this case) and randomly initialize a student model to be trained on image classification. Next, we train the student model to minimize the difference between its outputs and the teacher's outputs, thus making it mimic the teacher's behavior. It was first introduced in [Distilling the Knowledge in a Neural Network by Hinton et al](https://arxiv.org/abs/1503.02531). In this guide, we will do task-specific knowledge distillation. We will use the [beans dataset](https://huggingface.co/datasets/beans) for this.
This guide demonstrates how you can distill a [fine-tuned ViT model](https://huggingface.co/merve/vit-mobilenet-beans-224) (teacher model) to a [MobileNet](https://huggingface.co/google/mobilenet_v2_1.4_224) (student model) using the [Trainer API](https://huggingface.co/docs/transformers/en/main_classes/trainer#trainer) of 🤗 Transformers.
Let's install the libraries needed for distillation and evaluating the process.
```bash
pip install transformers datasets accelerate tensorboard evaluate --upgrade
```
In this example, we are using the `merve/beans-vit-224` model as the teacher model. It's an image classification model based on `google/vit-base-patch16-224-in21k`, fine-tuned on the beans dataset. We will distill this model into a randomly initialized MobileNetV2.
We will now load the dataset.
```python
from datasets import load_dataset
dataset = load_dataset("beans")
```
We can use an image processor from either of the models, as in this case they return the same output with the same resolution. We will use the `map()` method of `dataset` to apply the preprocessing to every split of the dataset.
```python
from transformers import AutoImageProcessor
teacher_processor = AutoImageProcessor.from_pretrained("merve/beans-vit-224")
def process(examples):
    processed_inputs = teacher_processor(examples["image"])
    return processed_inputs
processed_datasets = dataset.map(process, batched=True)
```
Essentially, we want the student model (a randomly initialized MobileNet) to mimic the teacher model (a fine-tuned vision transformer). To achieve this, we first get the logits output from the teacher and the student. Then, we divide each of them by the parameter `temperature`, which controls the importance of each soft target. A parameter called `lambda` weighs the importance of the distillation loss. In this example, we will use `temperature=5` and `lambda=0.5`. We will use the Kullback-Leibler divergence loss to compute the divergence between the student and the teacher. Given two distributions P and Q, KL divergence measures how much extra information we need to represent P using Q. If the two are identical, their KL divergence is zero, as there is no extra information needed to explain P from Q. Thus, in the context of knowledge distillation, KL divergence is useful.
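Written out, the combined training objective sketched here (with student logits \\(z_s\\), teacher logits \\(z_t\\), true labels \\(y\\), temperature \\(T\\) and weight \\(\lambda\\)) is:

$$
\mathcal{L} = (1 - \lambda)\,\mathrm{CE}(z_s, y) + \lambda\, T^{2}\, D_{\mathrm{KL}}\big(\mathrm{softmax}(z_t / T)\,\big\|\,\mathrm{softmax}(z_s / T)\big)
$$

The \\(T^2\\) factor keeps the magnitude of the soft-target term comparable across temperatures, following Hinton et al.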
```python
from transformers import TrainingArguments, Trainer
import torch
import torch.nn as nn
import torch.nn.functional as F
class ImageDistilTrainer(Trainer):
    def __init__(self, teacher_model=None, student_model=None, temperature=None, lambda_param=None, *args, **kwargs):
        super().__init__(model=student_model, *args, **kwargs)
        self.teacher = teacher_model
        self.student = student_model
        self.loss_function = nn.KLDivLoss(reduction="batchmean")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.teacher.to(device)
        self.teacher.eval()
        self.temperature = temperature
        self.lambda_param = lambda_param

    def compute_loss(self, student, inputs, return_outputs=False):
        student_output = self.student(**inputs)

        with torch.no_grad():
            teacher_output = self.teacher(**inputs)

        # Compute soft targets for teacher and student
        soft_teacher = F.softmax(teacher_output.logits / self.temperature, dim=-1)
        soft_student = F.log_softmax(student_output.logits / self.temperature, dim=-1)

        # Compute the distillation loss, scaled by temperature**2 as in Hinton et al.
        distillation_loss = self.loss_function(soft_student, soft_teacher) * (self.temperature ** 2)

        # Compute the true label (cross-entropy) loss returned by the student model
        student_target_loss = student_output.loss

        # Calculate the final weighted loss
        loss = (1. - self.lambda_param) * student_target_loss + self.lambda_param * distillation_loss
        return (loss, student_output) if return_outputs else loss
```
We will now log in to the Hugging Face Hub so we can push our model to the Hub through the `Trainer`.
```python
from huggingface_hub import notebook_login
notebook_login()
```
Let's set the `TrainingArguments`, the teacher model and the student model.
```python
from transformers import AutoModelForImageClassification, MobileNetV2Config, MobileNetV2ForImageClassification
training_args = TrainingArguments(
    output_dir="my-awesome-model",
    num_train_epochs=30,
    fp16=True,
    logging_dir=f"{repo_name}/logs",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="tensorboard",
    push_to_hub=True,
    hub_strategy="every_save",
    hub_model_id=repo_name,
)
num_labels = len(processed_datasets["train"].features["labels"].names)
# initialize models
teacher_model = AutoModelForImageClassification.from_pretrained(
    "merve/beans-vit-224",
    num_labels=num_labels,
    ignore_mismatched_sizes=True
)
# training MobileNetV2 from scratch
student_config = MobileNetV2Config()
student_config.num_labels = num_labels
student_model = MobileNetV2ForImageClassification(student_config)
```
We can use the `compute_metrics` function to evaluate our model on the test set. This function will be used during the training process to compute the `accuracy` of our model.
```python
import evaluate
import numpy as np
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    acc = accuracy.compute(references=labels, predictions=np.argmax(predictions, axis=1))
    return {"accuracy": acc["accuracy"]}
```
Let's initialize the `Trainer` with the training arguments we defined. We will also initialize our data collator.
```python
from transformers import DefaultDataCollator
data_collator = DefaultDataCollator()
trainer = ImageDistilTrainer(
    student_model=student_model,
    teacher_model=teacher_model,
    args=training_args,
    train_dataset=processed_datasets["train"],
    eval_dataset=processed_datasets["validation"],
    data_collator=data_collator,
    tokenizer=teacher_processor,
    compute_metrics=compute_metrics,
    temperature=5,
    lambda_param=0.5
)
```
We can now train our model.
```python
trainer.train()
```
We can evaluate the model on the test set.
```python
trainer.evaluate(processed_datasets["test"])
```
On the test set, our model reaches 72 percent accuracy. As a sanity check on the efficiency of distillation, we also trained MobileNet on the beans dataset from scratch with the same hyperparameters and observed 63 percent accuracy on the test set. We invite readers to try different pre-trained teacher models, student architectures, and distillation parameters and report their findings. The training logs and checkpoints for the distilled model can be found in [this repository](https://huggingface.co/merve/vit-mobilenet-beans-224), and the MobileNetV2 trained from scratch can be found in this [repository](https://huggingface.co/merve/resnet-mobilenet-beans-5).

View File

@ -37,7 +37,7 @@ You can finetune other architectures for causal language modeling following the
Choose one of the following architectures:
<!--This tip is automatically generated by `make fix-copies`, do not fill manually!-->
[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeLlama](../model_doc/code_llama), [CodeGen](../model_doc/codegen), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [Falcon](../model_doc/falcon), [Fuyu](../model_doc/fuyu), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [MPT](../model_doc/mpt), [MusicGen](../model_doc/musicgen), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [Persimmon](../model_doc/persimmon), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod)
[BART](../model_doc/bart), [BERT](../model_doc/bert), [Bert Generation](../model_doc/bert-generation), [BigBird](../model_doc/big_bird), [BigBird-Pegasus](../model_doc/bigbird_pegasus), [BioGpt](../model_doc/biogpt), [Blenderbot](../model_doc/blenderbot), [BlenderbotSmall](../model_doc/blenderbot-small), [BLOOM](../model_doc/bloom), [CamemBERT](../model_doc/camembert), [CodeLlama](../model_doc/code_llama), [CodeGen](../model_doc/codegen), [CPM-Ant](../model_doc/cpmant), [CTRL](../model_doc/ctrl), [Data2VecText](../model_doc/data2vec-text), [ELECTRA](../model_doc/electra), [ERNIE](../model_doc/ernie), [Falcon](../model_doc/falcon), [GIT](../model_doc/git), [GPT-Sw3](../model_doc/gpt-sw3), [OpenAI GPT-2](../model_doc/gpt2), [GPTBigCode](../model_doc/gpt_bigcode), [GPT Neo](../model_doc/gpt_neo), [GPT NeoX](../model_doc/gpt_neox), [GPT NeoX Japanese](../model_doc/gpt_neox_japanese), [GPT-J](../model_doc/gptj), [LLaMA](../model_doc/llama), [Marian](../model_doc/marian), [mBART](../model_doc/mbart), [MEGA](../model_doc/mega), [Megatron-BERT](../model_doc/megatron-bert), [Mistral](../model_doc/mistral), [MPT](../model_doc/mpt), [MusicGen](../model_doc/musicgen), [MVP](../model_doc/mvp), [OpenLlama](../model_doc/open-llama), [OpenAI GPT](../model_doc/openai-gpt), [OPT](../model_doc/opt), [Pegasus](../model_doc/pegasus), [Persimmon](../model_doc/persimmon), [PLBart](../model_doc/plbart), [ProphetNet](../model_doc/prophetnet), [QDQBert](../model_doc/qdqbert), [Reformer](../model_doc/reformer), [RemBERT](../model_doc/rembert), [RoBERTa](../model_doc/roberta), [RoBERTa-PreLayerNorm](../model_doc/roberta-prelayernorm), [RoCBert](../model_doc/roc_bert), [RoFormer](../model_doc/roformer), [RWKV](../model_doc/rwkv), [Speech2Text2](../model_doc/speech_to_text_2), [Transformer-XL](../model_doc/transfo-xl), [TrOCR](../model_doc/trocr), [XGLM](../model_doc/xglm), [XLM](../model_doc/xlm), [XLM-ProphetNet](../model_doc/xlm-prophetnet), [XLM-RoBERTa](../model_doc/xlm-roberta), [XLM-RoBERTa-XL](../model_doc/xlm-roberta-xl), [XLNet](../model_doc/xlnet), [X-MOD](../model_doc/xmod)

View File

@ -134,24 +134,3 @@
- local: model_memory_anatomy
title: モデルトレーニングの解剖学
title: コンセプチュアルガイド
- sections:
- local: internal/modeling_utils
title: カスタムレイヤーとユーティリティ
- local: internal/pipelines_utils
title: パイプライン用のユーティリティ
- local: internal/tokenization_utils
title: トークナイザー用のユーティリティ
- local: internal/trainer_utils
title: トレーナー用ユーティリティ
- local: internal/generation_utils
title: 生成用ユーティリティ
- local: internal/image_processing_utils
title: 画像プロセッサ用ユーティリティ
- local: internal/audio_utils
title: オーディオ処理用のユーティリティ
- local: internal/file_utils
title: 一般的なユーティリティ
- local: internal/time_series_utils
title: 時系列用のユーティリティ
title: 内部ヘルパー
title: API

View File

@ -1,39 +0,0 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# `FeatureExtractor` 用のユーティリティ
このページには、*短時間フーリエ変換* や *ログ メル スペクトログラム* などの一般的なアルゴリズムを使用して生のオーディオから特別な特徴を計算するために、オーディオ [`FeatureExtractor`] で使用できるすべてのユーティリティ関数がリストされています。
これらのほとんどは、ライブラリ内のオーディオ プロセッサのコードを学習する場合にのみ役に立ちます。
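たとえば、これらの関数を組み合わせてログメルスペクトログラムを計算する仮のスケッチは次のとおりです(入力とパラメータは説明用の値です)。
```python
import numpy as np
from transformers.audio_utils import mel_filter_bank, spectrogram, window_function

# 1 秒分のダミー音声 (16 kHz、説明用)
raw_audio = np.random.randn(16000).astype(np.float32)

# 80 個のメルフィルタバンク (num_frequency_bins は fft_length // 2 + 1)
filters = mel_filter_bank(
    num_frequency_bins=201,
    num_mel_filters=80,
    min_frequency=0.0,
    max_frequency=8000.0,
    sampling_rate=16000,
    norm="slaney",
    mel_scale="slaney",
)

# 短時間フーリエ変換からログメルスペクトログラムを計算
log_mel = spectrogram(
    raw_audio,
    window_function(400, "hann"),
    frame_length=400,
    hop_length=160,
    fft_length=400,
    power=2.0,
    mel_filters=filters,
    log_mel="log10",
)
print(log_mel.shape)  # (80, フレーム数)
```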
## オーディオ変換
[[autodoc]] audio_utils.hertz_to_mel
[[autodoc]] audio_utils.mel_to_hertz
[[autodoc]] audio_utils.mel_filter_bank
[[autodoc]] audio_utils.optimal_fft_length
[[autodoc]] audio_utils.window_function
[[autodoc]] audio_utils.spectrogram
[[autodoc]] audio_utils.power_to_db
[[autodoc]] audio_utils.amplitude_to_db

View File

@ -1,49 +0,0 @@
<!--Copyright 2021 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# 一般的なユーティリティ
このページには、ファイル `utils.py` にある Transformers の一般的なユーティリティ関数がすべてリストされています。
これらのほとんどは、ライブラリで一般的なコードを学習する場合にのみ役に立ちます。
## 列挙型と名前付きタプル
[[autodoc]] utils.ExplicitEnum
[[autodoc]] utils.PaddingStrategy
[[autodoc]] utils.TensorType
## 特別なデコレーター
[[autodoc]] utils.add_start_docstrings
[[autodoc]] utils.add_start_docstrings_to_model_forward
[[autodoc]] utils.add_end_docstrings
[[autodoc]] utils.add_code_sample_docstrings
[[autodoc]] utils.replace_return_docstrings
## 特殊なプロパティ
[[autodoc]] utils.cached_property
## その他のユーティリティ
[[autodoc]] utils._LazyModule

View File

@ -1,369 +0,0 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# 生成用ユーティリティ
このページには、[`~generation.GenerationMixin.generate`] で使用されるすべてのユーティリティ関数がリストされています。
[`~generation.GenerationMixin.greedy_search`],
[`~generation.GenerationMixin.contrastive_search`],
[`~generation.GenerationMixin.sample`],
[`~generation.GenerationMixin.beam_search`],
[`~generation.GenerationMixin.beam_sample`],
[`~generation.GenerationMixin.group_beam_search`]、および
[`~generation.GenerationMixin.constrained_beam_search`]。
これらのほとんどは、ライブラリ内の生成メソッドのコードを学習する場合にのみ役に立ちます。
## 出力を生成する
[`~generation.GenerationMixin.generate`] の出力は、[`~utils.ModelOutput`] のサブクラスのインスタンスです。
この出力は、[`~generation.GenerationMixin.generate`] によって返されるすべての情報を含むデータ構造ですが、タプルまたは辞書としても使用できます。
以下に例を示します。
```python
from transformers import GPT2Tokenizer, GPT2LMHeadModel
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt")
generation_output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
```
`generation_output` オブジェクトは [`~generation.GreedySearchDecoderOnlyOutput`] です。
以下のそのクラスのドキュメントからわかるように、これは次の属性を持つことを意味します。
- `sequences`: 生成されたトークンのシーケンス
- `scores` (オプション): 各生成ステップの言語モデリング ヘッドの予測スコア
- `hidden_states` (オプション): 生成ステップごとのモデルの隠れた状態
- `attentions` (オプション): 生成ステップごとのモデルのアテンションの重み
ここでは、`output_scores=True` を渡したので `scores` はありますが、`output_hidden_states=True` や `output_attentions=True` を渡さなかったため、`hidden_states` と `attentions` はありません。
通常と同じように各属性にアクセスできます。その属性がモデルから返されなかった場合は `None` が返されます。
ここで、たとえば `generation_output.scores` は言語モデリングヘッドによって生成されたすべての予測スコアであり、`generation_output.attentions` は `None` です。
`generation_output` オブジェクトをタプルとして使用する場合、`None` 値を持たない属性のみが保持されます。
たとえば、ここには `sequences`、次に `scores` という 2 つの要素があります。
```python
generation_output[:2]
```
たとえば、タプル `(generation_output.sequences,generation_output.scores)` を返します。
`generation_output` オブジェクトを辞書として使用する場合、`None` を持たない属性のみが保持されます。
ここでは、たとえば、`sequences` と `scores` という 2 つのキーがあります。
ここではすべての出力タイプを文書化します。
### PyTorch
[[autodoc]] generation.GreedySearchEncoderDecoderOutput
[[autodoc]] generation.GreedySearchDecoderOnlyOutput
[[autodoc]] generation.SampleEncoderDecoderOutput
[[autodoc]] generation.SampleDecoderOnlyOutput
[[autodoc]] generation.BeamSearchEncoderDecoderOutput
[[autodoc]] generation.BeamSearchDecoderOnlyOutput
[[autodoc]] generation.BeamSampleEncoderDecoderOutput
[[autodoc]] generation.BeamSampleDecoderOnlyOutput
[[autodoc]] generation.ContrastiveSearchEncoderDecoderOutput
[[autodoc]] generation.ContrastiveSearchDecoderOnlyOutput
### TensorFlow
[[autodoc]] generation.TFGreedySearchEncoderDecoderOutput
[[autodoc]] generation.TFGreedySearchDecoderOnlyOutput
[[autodoc]] generation.TFSampleEncoderDecoderOutput
[[autodoc]] generation.TFSampleDecoderOnlyOutput
[[autodoc]] generation.TFBeamSearchEncoderDecoderOutput
[[autodoc]] generation.TFBeamSearchDecoderOnlyOutput
[[autodoc]] generation.TFBeamSampleEncoderDecoderOutput
[[autodoc]] generation.TFBeamSampleDecoderOnlyOutput
[[autodoc]] generation.TFContrastiveSearchEncoderDecoderOutput
[[autodoc]] generation.TFContrastiveSearchDecoderOnlyOutput
### FLAX
[[autodoc]] generation.FlaxSampleOutput
[[autodoc]] generation.FlaxGreedySearchOutput
[[autodoc]] generation.FlaxBeamSearchOutput
## LogitsProcessor
[`LogitsProcessor`] を使用して、生成時に言語モデルヘッドの予測スコアを変更できます。
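たとえば、`generate` に `repetition_penalty` を渡すと、内部で [`RepetitionPenaltyLogitsProcessor`] が適用されます。以下は仮の最小例です(モデル名や値は説明用です)。
```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt")

# repetition_penalty を指定すると RepetitionPenaltyLogitsProcessor が内部で使われます
outputs = model.generate(**inputs, max_new_tokens=20, repetition_penalty=1.2, do_sample=False)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```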
### PyTorch
[[autodoc]] AlternatingCodebooksLogitsProcessor
- __call__
[[autodoc]] ClassifierFreeGuidanceLogitsProcessor
- __call__
[[autodoc]] EncoderNoRepeatNGramLogitsProcessor
- __call__
[[autodoc]] EncoderRepetitionPenaltyLogitsProcessor
- __call__
[[autodoc]] EpsilonLogitsWarper
- __call__
[[autodoc]] EtaLogitsWarper
- __call__
[[autodoc]] ExponentialDecayLengthPenalty
- __call__
[[autodoc]] ForcedBOSTokenLogitsProcessor
- __call__
[[autodoc]] ForcedEOSTokenLogitsProcessor
- __call__
[[autodoc]] ForceTokensLogitsProcessor
- __call__
[[autodoc]] HammingDiversityLogitsProcessor
- __call__
[[autodoc]] InfNanRemoveLogitsProcessor
- __call__
[[autodoc]] LogitNormalization
- __call__
[[autodoc]] LogitsProcessor
- __call__
[[autodoc]] LogitsProcessorList
- __call__
[[autodoc]] LogitsWarper
- __call__
[[autodoc]] MinLengthLogitsProcessor
- __call__
[[autodoc]] MinNewTokensLengthLogitsProcessor
- __call__
[[autodoc]] NoBadWordsLogitsProcessor
- __call__
[[autodoc]] NoRepeatNGramLogitsProcessor
- __call__
[[autodoc]] PrefixConstrainedLogitsProcessor
- __call__
[[autodoc]] RepetitionPenaltyLogitsProcessor
- __call__
[[autodoc]] SequenceBiasLogitsProcessor
- __call__
[[autodoc]] SuppressTokensAtBeginLogitsProcessor
- __call__
[[autodoc]] SuppressTokensLogitsProcessor
- __call__
[[autodoc]] TemperatureLogitsWarper
- __call__
[[autodoc]] TopKLogitsWarper
- __call__
[[autodoc]] TopPLogitsWarper
- __call__
[[autodoc]] TypicalLogitsWarper
- __call__
[[autodoc]] UnbatchedClassifierFreeGuidanceLogitsProcessor
- __call__
[[autodoc]] WhisperTimeStampLogitsProcessor
- __call__
### TensorFlow
[[autodoc]] TFForcedBOSTokenLogitsProcessor
- __call__
[[autodoc]] TFForcedEOSTokenLogitsProcessor
- __call__
[[autodoc]] TFForceTokensLogitsProcessor
- __call__
[[autodoc]] TFLogitsProcessor
- __call__
[[autodoc]] TFLogitsProcessorList
- __call__
[[autodoc]] TFLogitsWarper
- __call__
[[autodoc]] TFMinLengthLogitsProcessor
- __call__
[[autodoc]] TFNoBadWordsLogitsProcessor
- __call__
[[autodoc]] TFNoRepeatNGramLogitsProcessor
- __call__
[[autodoc]] TFRepetitionPenaltyLogitsProcessor
- __call__
[[autodoc]] TFSuppressTokensAtBeginLogitsProcessor
- __call__
[[autodoc]] TFSuppressTokensLogitsProcessor
- __call__
[[autodoc]] TFTemperatureLogitsWarper
- __call__
[[autodoc]] TFTopKLogitsWarper
- __call__
[[autodoc]] TFTopPLogitsWarper
- __call__
### FLAX
[[autodoc]] FlaxForcedBOSTokenLogitsProcessor
- __call__
[[autodoc]] FlaxForcedEOSTokenLogitsProcessor
- __call__
[[autodoc]] FlaxForceTokensLogitsProcessor
- __call__
[[autodoc]] FlaxLogitsProcessor
- __call__
[[autodoc]] FlaxLogitsProcessorList
- __call__
[[autodoc]] FlaxLogitsWarper
- __call__
[[autodoc]] FlaxMinLengthLogitsProcessor
- __call__
[[autodoc]] FlaxSuppressTokensAtBeginLogitsProcessor
- __call__
[[autodoc]] FlaxSuppressTokensLogitsProcessor
- __call__
[[autodoc]] FlaxTemperatureLogitsWarper
- __call__
[[autodoc]] FlaxTopKLogitsWarper
- __call__
[[autodoc]] FlaxTopPLogitsWarper
- __call__
[[autodoc]] FlaxWhisperTimeStampLogitsProcessor
- __call__
## StoppingCriteria
[`StoppingCriteria`] を使用して、(EOS トークン以外の) 生成を停止するタイミングを変更できます。これは PyTorch 実装でのみ利用可能であることに注意してください。
[[autodoc]] StoppingCriteria
- __call__
[[autodoc]] StoppingCriteriaList
- __call__
[[autodoc]] MaxLengthCriteria
- __call__
[[autodoc]] MaxTimeCriteria
- __call__
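たとえば、`MaxTimeCriteria` を使って生成時間を制限する仮の最小例は次のとおりです(モデル名や時間は説明用です)。
```python
from transformers import GPT2LMHeadModel, GPT2Tokenizer, MaxTimeCriteria, StoppingCriteriaList

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")
inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt")

# 生成開始から 5 秒を超えたら生成を停止する
stopping_criteria = StoppingCriteriaList([MaxTimeCriteria(max_time=5.0)])
outputs = model.generate(**inputs, max_new_tokens=200, stopping_criteria=stopping_criteria)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```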
## Constraints
[`Constraint`] を使用すると、生成時に出力に特定のトークンまたはシーケンスが含まれるように強制できます。これは PyTorch 実装でのみ利用可能であることに注意してください。
[[autodoc]] Constraint
[[autodoc]] PhrasalConstraint
[[autodoc]] DisjunctiveConstraint
[[autodoc]] ConstraintListState
## BeamSearch
[[autodoc]] BeamScorer
- process
- finalize
[[autodoc]] BeamSearchScorer
- process
- finalize
[[autodoc]] ConstrainedBeamSearchScorer
- process
- finalize
## Utilities
[[autodoc]] top_k_top_p_filtering
[[autodoc]] tf_top_k_top_p_filtering
## Streamers
[[autodoc]] TextStreamer
[[autodoc]] TextIteratorStreamer

View File

@ -1,48 +0,0 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# 画像プロセッサ用ユーティリティ
このページには、画像プロセッサーで使用されるすべてのユーティリティ関数がリストされています。主に、画像を処理するために使用される関数型の変換です。
これらのほとんどは、ライブラリ内の画像プロセッサのコードを学習する場合にのみ役に立ちます。
## Image Transformations
[[autodoc]] image_transforms.center_crop
[[autodoc]] image_transforms.center_to_corners_format
[[autodoc]] image_transforms.corners_to_center_format
[[autodoc]] image_transforms.id_to_rgb
[[autodoc]] image_transforms.normalize
[[autodoc]] image_transforms.pad
[[autodoc]] image_transforms.rgb_to_id
[[autodoc]] image_transforms.rescale
[[autodoc]] image_transforms.resize
[[autodoc]] image_transforms.to_pil_image
## ImageProcessingMixin
[[autodoc]] image_processing_utils.ImageProcessingMixin

View File

@ -1,83 +0,0 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# カスタムレイヤーとユーティリティ
このページには、ライブラリで使用されるすべてのカスタム レイヤーと、モデリングに提供されるユーティリティ関数がリストされます。
これらのほとんどは、ライブラリ内のモデルのコードを研究する場合にのみ役に立ちます。
## Pytorch custom modules
[[autodoc]] pytorch_utils.Conv1D
[[autodoc]] modeling_utils.PoolerStartLogits
- forward
[[autodoc]] modeling_utils.PoolerEndLogits
- forward
[[autodoc]] modeling_utils.PoolerAnswerClass
- forward
[[autodoc]] modeling_utils.SquadHeadOutput
[[autodoc]] modeling_utils.SQuADHead
- forward
[[autodoc]] modeling_utils.SequenceSummary
- forward
## PyTorch Helper Functions
[[autodoc]] pytorch_utils.apply_chunking_to_forward
[[autodoc]] pytorch_utils.find_pruneable_heads_and_indices
[[autodoc]] pytorch_utils.prune_layer
[[autodoc]] pytorch_utils.prune_conv1d_layer
[[autodoc]] pytorch_utils.prune_linear_layer
## TensorFlow custom layers
[[autodoc]] modeling_tf_utils.TFConv1D
[[autodoc]] modeling_tf_utils.TFSequenceSummary
## TensorFlow loss functions
[[autodoc]] modeling_tf_utils.TFCausalLanguageModelingLoss
[[autodoc]] modeling_tf_utils.TFMaskedLanguageModelingLoss
[[autodoc]] modeling_tf_utils.TFMultipleChoiceLoss
[[autodoc]] modeling_tf_utils.TFQuestionAnsweringLoss
[[autodoc]] modeling_tf_utils.TFSequenceClassificationLoss
[[autodoc]] modeling_tf_utils.TFTokenClassificationLoss
## TensorFlow Helper Functions
[[autodoc]] modeling_tf_utils.get_initializer
[[autodoc]] modeling_tf_utils.keras_serializable
[[autodoc]] modeling_tf_utils.shape_list

View File

@ -1,44 +0,0 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# パイプライン用のユーティリティ
このページには、ライブラリがパイプラインに提供するすべてのユーティリティ関数がリストされます。
これらのほとんどは、ライブラリ内のモデルのコードを研究する場合にのみ役に立ちます。
## Argument handling
[[autodoc]] pipelines.ArgumentHandler
[[autodoc]] pipelines.ZeroShotClassificationArgumentHandler
[[autodoc]] pipelines.QuestionAnsweringArgumentHandler
## Data format
[[autodoc]] pipelines.PipelineDataFormat
[[autodoc]] pipelines.CsvPipelineDataFormat
[[autodoc]] pipelines.JsonPipelineDataFormat
[[autodoc]] pipelines.PipedPipelineDataFormat
## Utilities
[[autodoc]] pipelines.PipelineException

View File

@ -1,29 +0,0 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# 時系列ユーティリティ
このページには、時系列ベースのモデルに使用できるすべてのユーティリティ関数とクラスがリストされます。
これらのほとんどは、時系列モデルのコードを研究している場合、または分散出力クラスのコレクションに追加したい場合にのみ役立ちます。
## Distributional Output
[[autodoc]] time_series_utils.NormalOutput
[[autodoc]] time_series_utils.StudentTOutput
[[autodoc]] time_series_utils.NegativeBinomialOutput

View File

@ -1,42 +0,0 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Utilities for Tokenizers
このページには、トークナイザーによって使用されるすべてのユーティリティ関数 (主にクラス) がリストされます。
主なものは、[`PreTrainedTokenizer`] と [`PreTrainedTokenizerFast`] の間の共通メソッドを実装するクラス
[`~tokenization_utils_base.PreTrainedTokenizerBase`] と、ミックスインの
[`~tokenization_utils_base.SpecialTokensMixin`] です。
これらのほとんどは、ライブラリ内のトークナイザーのコードを学習する場合にのみ役に立ちます。
## PreTrainedTokenizerBase
[[autodoc]] tokenization_utils_base.PreTrainedTokenizerBase
- __call__
- all
## SpecialTokensMixin
[[autodoc]] tokenization_utils_base.SpecialTokensMixin
## Enums and namedtuples
[[autodoc]] tokenization_utils_base.TruncationStrategy
[[autodoc]] tokenization_utils_base.CharSpan
[[autodoc]] tokenization_utils_base.TokenSpan

View File

@ -1,49 +0,0 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# トレーナー用ユーティリティ
このページには、[`Trainer`] で使用されるすべてのユーティリティ関数がリストされています。
これらのほとんどは、ライブラリ内のトレーナーのコードを学習する場合にのみ役に立ちます。
## Utilities
[[autodoc]] EvalPrediction
[[autodoc]] IntervalStrategy
[[autodoc]] enable_full_determinism
[[autodoc]] set_seed
[[autodoc]] torch_distributed_zero_first
## Callbacks internals
[[autodoc]] trainer_callback.CallbackHandler
## Distributed Evaluation
[[autodoc]] trainer_pt_utils.DistributedTensorGatherer
## Trainer Argument Parser
[[autodoc]] HfArgumentParser
## Debug Utilities
[[autodoc]] debug_utils.DebugUnderflowOverflow

View File

@ -64,8 +64,8 @@ pip install datasets
次に、テキストをトークナイザに渡します:
```py
>>> encoded_input = tokenizer("Do not meddle in the affairs of wizards, for they are subtle and quick to anger.")
```python
>>> encoded_input = tokenizer("魔法使いの事には干渉しないでください、彼らは微妙で怒りっぽいです。")
>>> print(encoded_input)
{'input_ids': [101, 2079, 2025, 19960, 10362, 1999, 1996, 3821, 1997, 16657, 1010, 2005, 2027, 2024, 11259, 1998, 4248, 2000, 4963, 1012, 102],
'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
@ -90,11 +90,11 @@ pip install datasets
複数の文章を前処理する場合、トークナイザにリストとして渡してください:
```py
```python
>>> batch_sentences = [
... "But what about second breakfast?",
... "Don't think he knows about second breakfast, Pip.",
... "What about elevensies?",
... "でも、セカンドブレックファーストはどうなるの?",
... "ピップ、セカンドブレックファーストのことを知っているかどうかはわからないと思うよ。",
... "イレブンジーズはどうなの?",
... ]
>>> encoded_inputs = tokenizer(batch_sentences)
>>> print(encoded_inputs)
@ -116,11 +116,11 @@ pip install datasets
バッチ内の短いシーケンスを最長のシーケンスに合わせるために、`padding`パラメータを`True`に設定します:
```py
```python
>>> batch_sentences = [
... "But what about second breakfast?",
... "Don't think he knows about second breakfast, Pip.",
... "What about elevensies?",
... "でもセカンドブレックファーストはどうなるの?",
... "セカンドブレックファーストについては知らないと思う、ピップ。",
... "イレブンジーズはどうなの?",
... ]
>>> encoded_input = tokenizer(batch_sentences, padding=True)
>>> print(encoded_input)
@ -143,11 +143,11 @@ pip install datasets
モデルが受け入れる最大の長さにシーケンスを切り詰めるには、`truncation`パラメータを`True`に設定します:
```py
```python
>>> batch_sentences = [
... "But what about second breakfast?",
... "Don't think he knows about second breakfast, Pip.",
... "What about elevensies?",
... "でも、セカンドブレックファーストはどうなるの?",
... "セカンドブレックファーストについては知らないと思う、ピップ。",
... "イレブンジーズはどうなの?",
... ]
>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True)
>>> print(encoded_input)
@ -177,11 +177,11 @@ pip install datasets
<frameworkcontent>
<pt>
```py
```python
>>> batch_sentences = [
... "But what about second breakfast?",
... "Don't think he knows about second breakfast, Pip.",
... "What about elevensies?",
... "でも、セカンドブレックファーストはどうなるの?",
... "ピップ、セカンドブレックファーストについては知っていないと思うよ。",
... "イレブンジーズはどうなの?",
... ]
>>> encoded_input = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
>>> print(encoded_input)

View File

@ -1,6 +0,0 @@
- sections:
- local: index
title: 🤗 Transformers
- local: quicktour
title: త్వరిత పర్యటన
title: ప్రారంభించడానికి

View File

@ -1,298 +0,0 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
[పైటోర్చ్](https://pytorch.org/), [టెన్సర్‌ఫ్లో](https://www.tensorflow.org/), మరియు [జాక్స్](https://jax.readthedocs.io/en/latest/) కోసం స్థితి-కలాన యంత్ర అభ్యాసం.
🤗 ట్రాన్స్ఫార్మర్స్ అభివృద్ధిస్తున్నది API మరియు ఉపకరణాలు, పూర్వ-చేతన మోడల్లను సులభంగా డౌన్లోడ్ మరియు శిక్షణ చేయడానికి అవసరమైన సమయం, వనరులు, మరియు వస్తువులను నుంచి మోడల్ను శీర్షికం నుంచి ప్రశిక్షించడం వరకు దేవాయనం చేస్తుంది. ఈ మోడల్లు విభిన్న మోడాలిటీలలో సాధారణ పనులకు మద్దతు చేస్తాయి, వంటివి:
📝 **ప్రాకృతిక భాష ప్రక్రియ**: వచన వర్గీకరణ, పేరుల యొక్క యెంటిటీ గుర్తువు, ప్రశ్న సంవాద, భాషా రచన, సంక్షేపణ, అనువాదం, అనేక ప్రకారాలు, మరియు వచన సృష్టి.<br>
🖼️ **కంప్యూటర్ విషయం**: చిత్రం వర్గీకరణ, వస్త్రం గుర్తువు, మరియు విభజన.<br>
🗣️ **ఆడియో**: స్వయంచలన ప్రసంగాన్ని గుర్తుచేసేందుకు, ఆడియో వర్గీకరణ.<br>
🐙 **బహుమూలిక**: పట్టి ప్రశ్న సంవాద, ఆప్టికల్ సిఫర్ గుర్తువు, డాక్యుమెంట్లు స్క్యాన్ చేసినంతగా సమాచార పొందడం, వీడియో వర్గీకరణ, మరియు దృశ్య ప్రశ్న సంవాద.
🤗 ట్రాన్స్ఫార్మర్స్ పైన మద్దతు చేస్తుంది పైన తొలగించడానికి పైన పైన పైన ప్రోగ్రామ్లో మోడల్ను శిక్షించండి, మరియు అన్ని ప్రాథమిక యొక్కడా ఇన్‌ఫరెన్స్ కోసం లోడ్ చేయండి. మోడల్లు కూడా ప్రొడక్షన్ వాతావరణాలలో వాడుకోవడానికి ONNX మరియు TorchScript వంటి ఆకృతులకు ఎగుమతి చేయవచ్చు.
ఈరువులకు [హబ్](https://huggingface.co/models), [ఫోరం](https://discuss.huggingface.co/), లేదా [డిస్కార్డ్](https://discord.com/invite/JfAtkvEtRb) లో ఈ పెద్ద సముదాయంలో చేరండి!
## మీరు హగ్గింగ్ ఫేస్ టీమ్ నుండి అనుకూల మద్దతు కోసం చూస్తున్నట్లయితే
<a target="_blank" href="https://huggingface.co/support">
<img alt="HuggingFace Expert Acceleration Program" src="https://cdn-media.huggingface.co/marketing/transformers/new-support-improved.png" style="width: 100%; max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
</a>
## విషయాలు
డాక్యుమెంటేషన్ ఐదు విభాగాలుగా నిర్వహించబడింది:
- **ప్రారంభించండి** లైబ్రరీ యొక్క శీఘ్ర పర్యటన మరియు రన్నింగ్ కోసం ఇన్‌స్టాలేషన్ సూచనలను అందిస్తుంది.
- **ట్యుటోరియల్స్** మీరు అనుభవశూన్యుడు అయితే ప్రారంభించడానికి గొప్ప ప్రదేశం. మీరు లైబ్రరీని ఉపయోగించడం ప్రారంభించడానికి అవసరమైన ప్రాథమిక నైపుణ్యాలను పొందడానికి ఈ విభాగం మీకు సహాయం చేస్తుంది.
- **హౌ-టు-గైడ్‌లు** లాంగ్వేజ్ మోడలింగ్ కోసం ప్రిట్రైన్డ్ మోడల్‌ని ఫైన్‌ట్యూన్ చేయడం లేదా కస్టమ్ మోడల్‌ను ఎలా వ్రాయాలి మరియు షేర్ చేయాలి వంటి నిర్దిష్ట లక్ష్యాన్ని ఎలా సాధించాలో మీకు చూపుతాయి.
- **కాన్సెప్చువల్ గైడ్స్** మోడల్‌లు, టాస్క్‌లు మరియు 🤗 ట్రాన్స్‌ఫార్మర్ల డిజైన్ ఫిలాసఫీ వెనుక ఉన్న అంతర్లీన భావనలు మరియు ఆలోచనల గురించి మరింత చర్చ మరియు వివరణను అందిస్తుంది.
- **API** అన్ని తరగతులు మరియు విధులను వివరిస్తుంది:
- **ప్రధాన తరగతులు** కాన్ఫిగరేషన్, మోడల్, టోకెనైజర్ మరియు పైప్‌లైన్ వంటి అత్యంత ముఖ్యమైన తరగతులను వివరిస్తుంది.
- **మోడల్స్** లైబ్రరీలో అమలు చేయబడిన ప్రతి మోడల్‌కు సంబంధించిన తరగతులు మరియు విధులను వివరిస్తుంది.
- **అంతర్గత సహాయకులు** అంతర్గతంగా ఉపయోగించే యుటిలిటీ క్లాస్‌లు మరియు ఫంక్షన్‌ల వివరాలు.
## మద్దతు ఉన్న నమూనాలు మరియు ఫ్రేమ్‌వర్క్‌లు
దిగువన ఉన్న పట్టిక ఆ ప్రతి మోడల్‌కు పైథాన్ కలిగి ఉన్నా లైబ్రరీలో ప్రస్తుత మద్దతును సూచిస్తుంది
టోకెనైజర్ ("నెమ్మదిగా" అని పిలుస్తారు). Jax (ద్వారా
ఫ్లాక్స్), పైటార్చ్ మరియు/లేదా టెన్సర్‌ఫ్లో.
<!--This table is updated automatically from the auto modules with _make fix-copies_. Do not update manually!-->
| Model | PyTorch support | TensorFlow support | Flax Support |
|:------------------------------------------------------------------------:|:---------------:|:------------------:|:------------:|
| [ALBERT](model_doc/albert) | ✅ | ✅ | ✅ |
| [ALIGN](model_doc/align) | ✅ | ❌ | ❌ |
| [AltCLIP](model_doc/altclip) | ✅ | ❌ | ❌ |
| [Audio Spectrogram Transformer](model_doc/audio-spectrogram-transformer) | ✅ | ❌ | ❌ |
| [Autoformer](model_doc/autoformer) | ✅ | ❌ | ❌ |
| [Bark](model_doc/bark) | ✅ | ❌ | ❌ |
| [BART](model_doc/bart) | ✅ | ✅ | ✅ |
| [BARThez](model_doc/barthez) | ✅ | ✅ | ✅ |
| [BARTpho](model_doc/bartpho) | ✅ | ✅ | ✅ |
| [BEiT](model_doc/beit) | ✅ | ❌ | ✅ |
| [BERT](model_doc/bert) | ✅ | ✅ | ✅ |
| [Bert Generation](model_doc/bert-generation) | ✅ | ❌ | ❌ |
| [BertJapanese](model_doc/bert-japanese) | ✅ | ✅ | ✅ |
| [BERTweet](model_doc/bertweet) | ✅ | ✅ | ✅ |
| [BigBird](model_doc/big_bird) | ✅ | ❌ | ✅ |
| [BigBird-Pegasus](model_doc/bigbird_pegasus) | ✅ | ❌ | ❌ |
| [BioGpt](model_doc/biogpt) | ✅ | ❌ | ❌ |
| [BiT](model_doc/bit) | ✅ | ❌ | ❌ |
| [Blenderbot](model_doc/blenderbot) | ✅ | ✅ | ✅ |
| [BlenderbotSmall](model_doc/blenderbot-small) | ✅ | ✅ | ✅ |
| [BLIP](model_doc/blip) | ✅ | ✅ | ❌ |
| [BLIP-2](model_doc/blip-2) | ✅ | ❌ | ❌ |
| [BLOOM](model_doc/bloom) | ✅ | ❌ | ✅ |
| [BORT](model_doc/bort) | ✅ | ✅ | ✅ |
| [BridgeTower](model_doc/bridgetower) | ✅ | ❌ | ❌ |
| [BROS](model_doc/bros) | ✅ | ❌ | ❌ |
| [ByT5](model_doc/byt5) | ✅ | ✅ | ✅ |
| [CamemBERT](model_doc/camembert) | ✅ | ✅ | ❌ |
| [CANINE](model_doc/canine) | ✅ | ❌ | ❌ |
| [Chinese-CLIP](model_doc/chinese_clip) | ✅ | ❌ | ❌ |
| [CLAP](model_doc/clap) | ✅ | ❌ | ❌ |
| [CLIP](model_doc/clip) | ✅ | ✅ | ✅ |
| [CLIPSeg](model_doc/clipseg) | ✅ | ❌ | ❌ |
| [CodeGen](model_doc/codegen) | ✅ | ❌ | ❌ |
| [CodeLlama](model_doc/code_llama) | ✅ | ❌ | ❌ |
| [Conditional DETR](model_doc/conditional_detr) | ✅ | ❌ | ❌ |
| [ConvBERT](model_doc/convbert) | ✅ | ✅ | ❌ |
| [ConvNeXT](model_doc/convnext) | ✅ | ✅ | ❌ |
| [ConvNeXTV2](model_doc/convnextv2) | ✅ | ❌ | ❌ |
| [CPM](model_doc/cpm) | ✅ | ✅ | ✅ |
| [CPM-Ant](model_doc/cpmant) | ✅ | ❌ | ❌ |
| [CTRL](model_doc/ctrl) | ✅ | ✅ | ❌ |
| [CvT](model_doc/cvt) | ✅ | ✅ | ❌ |
| [Data2VecAudio](model_doc/data2vec) | ✅ | ❌ | ❌ |
| [Data2VecText](model_doc/data2vec) | ✅ | ❌ | ❌ |
| [Data2VecVision](model_doc/data2vec) | ✅ | ✅ | ❌ |
| [DeBERTa](model_doc/deberta) | ✅ | ✅ | ❌ |
| [DeBERTa-v2](model_doc/deberta-v2) | ✅ | ✅ | ❌ |
| [Decision Transformer](model_doc/decision_transformer) | ✅ | ❌ | ❌ |
| [Deformable DETR](model_doc/deformable_detr) | ✅ | ❌ | ❌ |
| [DeiT](model_doc/deit) | ✅ | ✅ | ❌ |
| [DePlot](model_doc/deplot) | ✅ | ❌ | ❌ |
| [DETA](model_doc/deta) | ✅ | ❌ | ❌ |
| [DETR](model_doc/detr) | ✅ | ❌ | ❌ |
| [DialoGPT](model_doc/dialogpt) | ✅ | ✅ | ✅ |
| [DiNAT](model_doc/dinat) | ✅ | ❌ | ❌ |
| [DINOv2](model_doc/dinov2) | ✅ | ❌ | ❌ |
| [DistilBERT](model_doc/distilbert) | ✅ | ✅ | ✅ |
| [DiT](model_doc/dit) | ✅ | ❌ | ✅ |
| [DonutSwin](model_doc/donut) | ✅ | ❌ | ❌ |
| [DPR](model_doc/dpr) | ✅ | ✅ | ❌ |
| [DPT](model_doc/dpt) | ✅ | ❌ | ❌ |
| [EfficientFormer](model_doc/efficientformer) | ✅ | ✅ | ❌ |
| [EfficientNet](model_doc/efficientnet) | ✅ | ❌ | ❌ |
| [ELECTRA](model_doc/electra) | ✅ | ✅ | ✅ |
| [EnCodec](model_doc/encodec) | ✅ | ❌ | ❌ |
| [Encoder decoder](model_doc/encoder-decoder) | ✅ | ✅ | ✅ |
| [ERNIE](model_doc/ernie) | ✅ | ❌ | ❌ |
| [ErnieM](model_doc/ernie_m) | ✅ | ❌ | ❌ |
| [ESM](model_doc/esm) | ✅ | ✅ | ❌ |
| [FairSeq Machine-Translation](model_doc/fsmt) | ✅ | ❌ | ❌ |
| [Falcon](model_doc/falcon) | ✅ | ❌ | ❌ |
| [FLAN-T5](model_doc/flan-t5) | ✅ | ✅ | ✅ |
| [FLAN-UL2](model_doc/flan-ul2) | ✅ | ✅ | ✅ |
| [FlauBERT](model_doc/flaubert) | ✅ | ✅ | ❌ |
| [FLAVA](model_doc/flava) | ✅ | ❌ | ❌ |
| [FNet](model_doc/fnet) | ✅ | ❌ | ❌ |
| [FocalNet](model_doc/focalnet) | ✅ | ❌ | ❌ |
| [Funnel Transformer](model_doc/funnel) | ✅ | ✅ | ❌ |
| [GIT](model_doc/git) | ✅ | ❌ | ❌ |
| [GLPN](model_doc/glpn) | ✅ | ❌ | ❌ |
| [GPT Neo](model_doc/gpt_neo) | ✅ | ❌ | ✅ |
| [GPT NeoX](model_doc/gpt_neox) | ✅ | ❌ | ❌ |
| [GPT NeoX Japanese](model_doc/gpt_neox_japanese) | ✅ | ❌ | ❌ |
| [GPT-J](model_doc/gptj) | ✅ | ✅ | ✅ |
| [GPT-Sw3](model_doc/gpt-sw3) | ✅ | ✅ | ✅ |
| [GPTBigCode](model_doc/gpt_bigcode) | ✅ | ❌ | ❌ |
| [GPTSAN-japanese](model_doc/gptsan-japanese) | ✅ | ❌ | ❌ |
| [Graphormer](model_doc/graphormer) | ✅ | ❌ | ❌ |
| [GroupViT](model_doc/groupvit) | ✅ | ✅ | ❌ |
| [HerBERT](model_doc/herbert) | ✅ | ✅ | ✅ |
| [Hubert](model_doc/hubert) | ✅ | ✅ | ❌ |
| [I-BERT](model_doc/ibert) | ✅ | ❌ | ❌ |
| [IDEFICS](model_doc/idefics) | ✅ | ❌ | ❌ |
| [ImageGPT](model_doc/imagegpt) | ✅ | ❌ | ❌ |
| [Informer](model_doc/informer) | ✅ | ❌ | ❌ |
| [InstructBLIP](model_doc/instructblip) | ✅ | ❌ | ❌ |
| [Jukebox](model_doc/jukebox) | ✅ | ❌ | ❌ |
| [LayoutLM](model_doc/layoutlm) | ✅ | ✅ | ❌ |
| [LayoutLMv2](model_doc/layoutlmv2) | ✅ | ❌ | ❌ |
| [LayoutLMv3](model_doc/layoutlmv3) | ✅ | ✅ | ❌ |
| [LayoutXLM](model_doc/layoutxlm) | ✅ | ❌ | ❌ |
| [LED](model_doc/led) | ✅ | ✅ | ❌ |
| [LeViT](model_doc/levit) | ✅ | ❌ | ❌ |
| [LiLT](model_doc/lilt) | ✅ | ❌ | ❌ |
| [LLaMA](model_doc/llama) | ✅ | ❌ | ❌ |
| [Llama2](model_doc/llama2) | ✅ | ❌ | ❌ |
| [Longformer](model_doc/longformer) | ✅ | ✅ | ❌ |
| [LongT5](model_doc/longt5) | ✅ | ❌ | ✅ |
| [LUKE](model_doc/luke) | ✅ | ❌ | ❌ |
| [LXMERT](model_doc/lxmert) | ✅ | ✅ | ❌ |
| [M-CTC-T](model_doc/mctct) | ✅ | ❌ | ❌ |
| [M2M100](model_doc/m2m_100) | ✅ | ❌ | ❌ |
| [Marian](model_doc/marian) | ✅ | ✅ | ✅ |
| [MarkupLM](model_doc/markuplm) | ✅ | ❌ | ❌ |
| [Mask2Former](model_doc/mask2former) | ✅ | ❌ | ❌ |
| [MaskFormer](model_doc/maskformer) | ✅ | ❌ | ❌ |
| [MatCha](model_doc/matcha) | ✅ | ❌ | ❌ |
| [mBART](model_doc/mbart) | ✅ | ✅ | ✅ |
| [mBART-50](model_doc/mbart50) | ✅ | ✅ | ✅ |
| [MEGA](model_doc/mega) | ✅ | ❌ | ❌ |
| [Megatron-BERT](model_doc/megatron-bert) | ✅ | ❌ | ❌ |
| [Megatron-GPT2](model_doc/megatron_gpt2) | ✅ | ✅ | ✅ |
| [MGP-STR](model_doc/mgp-str) | ✅ | ❌ | ❌ |
| [Mistral](model_doc/mistral) | ✅ | ❌ | ❌ |
| [mLUKE](model_doc/mluke) | ✅ | ❌ | ❌ |
| [MMS](model_doc/mms) | ✅ | ✅ | ✅ |
| [MobileBERT](model_doc/mobilebert) | ✅ | ✅ | ❌ |
| [MobileNetV1](model_doc/mobilenet_v1) | ✅ | ❌ | ❌ |
| [MobileNetV2](model_doc/mobilenet_v2) | ✅ | ❌ | ❌ |
| [MobileViT](model_doc/mobilevit) | ✅ | ✅ | ❌ |
| [MobileViTV2](model_doc/mobilevitv2) | ✅ | ❌ | ❌ |
| [MPNet](model_doc/mpnet) | ✅ | ✅ | ❌ |
| [MPT](model_doc/mpt) | ✅ | ❌ | ❌ |
| [MRA](model_doc/mra) | ✅ | ❌ | ❌ |
| [MT5](model_doc/mt5) | ✅ | ✅ | ✅ |
| [MusicGen](model_doc/musicgen) | ✅ | ❌ | ❌ |
| [MVP](model_doc/mvp) | ✅ | ❌ | ❌ |
| [NAT](model_doc/nat) | ✅ | ❌ | ❌ |
| [Nezha](model_doc/nezha) | ✅ | ❌ | ❌ |
| [NLLB](model_doc/nllb) | ✅ | ❌ | ❌ |
| [NLLB-MOE](model_doc/nllb-moe) | ✅ | ❌ | ❌ |
| [Nougat](model_doc/nougat) | ✅ | ✅ | ✅ |
| [Nyströmformer](model_doc/nystromformer) | ✅ | ❌ | ❌ |
| [OneFormer](model_doc/oneformer) | ✅ | ❌ | ❌ |
| [OpenAI GPT](model_doc/openai-gpt) | ✅ | ✅ | ❌ |
| [OpenAI GPT-2](model_doc/gpt2) | ✅ | ✅ | ✅ |
| [OpenLlama](model_doc/open-llama) | ✅ | ❌ | ❌ |
| [OPT](model_doc/opt) | ✅ | ✅ | ✅ |
| [OWL-ViT](model_doc/owlvit) | ✅ | ❌ | ❌ |
| [Pegasus](model_doc/pegasus) | ✅ | ✅ | ✅ |
| [PEGASUS-X](model_doc/pegasus_x) | ✅ | ❌ | ❌ |
| [Perceiver](model_doc/perceiver) | ✅ | ❌ | ❌ |
| [Persimmon](model_doc/persimmon) | ✅ | ❌ | ❌ |
| [PhoBERT](model_doc/phobert) | ✅ | ✅ | ✅ |
| [Pix2Struct](model_doc/pix2struct) | ✅ | ❌ | ❌ |
| [PLBart](model_doc/plbart) | ✅ | ❌ | ❌ |
| [PoolFormer](model_doc/poolformer) | ✅ | ❌ | ❌ |
| [Pop2Piano](model_doc/pop2piano) | ✅ | ❌ | ❌ |
| [ProphetNet](model_doc/prophetnet) | ✅ | ❌ | ❌ |
| [PVT](model_doc/pvt) | ✅ | ❌ | ❌ |
| [QDQBert](model_doc/qdqbert) | ✅ | ❌ | ❌ |
| [RAG](model_doc/rag) | ✅ | ✅ | ❌ |
| [REALM](model_doc/realm) | ✅ | ❌ | ❌ |
| [Reformer](model_doc/reformer) | ✅ | ❌ | ❌ |
| [RegNet](model_doc/regnet) | ✅ | ✅ | ✅ |
| [RemBERT](model_doc/rembert) | ✅ | ✅ | ❌ |
| [ResNet](model_doc/resnet) | ✅ | ✅ | ✅ |
| [RetriBERT](model_doc/retribert) | ✅ | ❌ | ❌ |
| [RoBERTa](model_doc/roberta) | ✅ | ✅ | ✅ |
| [RoBERTa-PreLayerNorm](model_doc/roberta-prelayernorm) | ✅ | ✅ | ✅ |
| [RoCBert](model_doc/roc_bert) | ✅ | ❌ | ❌ |
| [RoFormer](model_doc/roformer) | ✅ | ✅ | ✅ |
| [RWKV](model_doc/rwkv) | ✅ | ❌ | ❌ |
| [SAM](model_doc/sam) | ✅ | ✅ | ❌ |
| [SegFormer](model_doc/segformer) | ✅ | ✅ | ❌ |
| [SEW](model_doc/sew) | ✅ | ❌ | ❌ |
| [SEW-D](model_doc/sew-d) | ✅ | ❌ | ❌ |
| [Speech Encoder decoder](model_doc/speech-encoder-decoder) | ✅ | ❌ | ✅ |
| [Speech2Text](model_doc/speech_to_text) | ✅ | ✅ | ❌ |
| [SpeechT5](model_doc/speecht5) | ✅ | ❌ | ❌ |
| [Splinter](model_doc/splinter) | ✅ | ❌ | ❌ |
| [SqueezeBERT](model_doc/squeezebert) | ✅ | ❌ | ❌ |
| [SwiftFormer](model_doc/swiftformer) | ✅ | ❌ | ❌ |
| [Swin Transformer](model_doc/swin) | ✅ | ✅ | ❌ |
| [Swin Transformer V2](model_doc/swinv2) | ✅ | ❌ | ❌ |
| [Swin2SR](model_doc/swin2sr) | ✅ | ❌ | ❌ |
| [SwitchTransformers](model_doc/switch_transformers) | ✅ | ❌ | ❌ |
| [T5](model_doc/t5) | ✅ | ✅ | ✅ |
| [T5v1.1](model_doc/t5v1.1) | ✅ | ✅ | ✅ |
| [Table Transformer](model_doc/table-transformer) | ✅ | ❌ | ❌ |
| [TAPAS](model_doc/tapas) | ✅ | ✅ | ❌ |
| [TAPEX](model_doc/tapex) | ✅ | ✅ | ✅ |
| [Time Series Transformer](model_doc/time_series_transformer) | ✅ | ❌ | ❌ |
| [TimeSformer](model_doc/timesformer) | ✅ | ❌ | ❌ |
| [Trajectory Transformer](model_doc/trajectory_transformer) | ✅ | ❌ | ❌ |
| [Transformer-XL](model_doc/transfo-xl) | ✅ | ✅ | ❌ |
| [TrOCR](model_doc/trocr) | ✅ | ❌ | ❌ |
| [TVLT](model_doc/tvlt) | ✅ | ❌ | ❌ |
| [UL2](model_doc/ul2) | ✅ | ✅ | ✅ |
| [UMT5](model_doc/umt5) | ✅ | ❌ | ❌ |
| [UniSpeech](model_doc/unispeech) | ✅ | ❌ | ❌ |
| [UniSpeechSat](model_doc/unispeech-sat) | ✅ | ❌ | ❌ |
| [UPerNet](model_doc/upernet) | ✅ | ❌ | ❌ |
| [VAN](model_doc/van) | ✅ | ❌ | ❌ |
| [VideoMAE](model_doc/videomae) | ✅ | ❌ | ❌ |
| [ViLT](model_doc/vilt) | ✅ | ❌ | ❌ |
| [Vision Encoder decoder](model_doc/vision-encoder-decoder) | ✅ | ✅ | ✅ |
| [VisionTextDualEncoder](model_doc/vision-text-dual-encoder) | ✅ | ✅ | ✅ |
| [VisualBERT](model_doc/visual_bert) | ✅ | ❌ | ❌ |
| [ViT](model_doc/vit) | ✅ | ✅ | ✅ |
| [ViT Hybrid](model_doc/vit_hybrid) | ✅ | ❌ | ❌ |
| [VitDet](model_doc/vitdet) | ✅ | ❌ | ❌ |
| [ViTMAE](model_doc/vit_mae) | ✅ | ✅ | ❌ |
| [ViTMatte](model_doc/vitmatte) | ✅ | ❌ | ❌ |
| [ViTMSN](model_doc/vit_msn) | ✅ | ❌ | ❌ |
| [VITS](model_doc/vits) | ✅ | ❌ | ❌ |
| [ViViT](model_doc/vivit) | ✅ | ❌ | ❌ |
| [Wav2Vec2](model_doc/wav2vec2) | ✅ | ✅ | ✅ |
| [Wav2Vec2-Conformer](model_doc/wav2vec2-conformer) | ✅ | ❌ | ❌ |
| [Wav2Vec2Phoneme](model_doc/wav2vec2_phoneme) | ✅ | ✅ | ✅ |
| [WavLM](model_doc/wavlm) | ✅ | ❌ | ❌ |
| [Whisper](model_doc/whisper) | ✅ | ✅ | ✅ |
| [X-CLIP](model_doc/xclip) | ✅ | ❌ | ❌ |
| [X-MOD](model_doc/xmod) | ✅ | ❌ | ❌ |
| [XGLM](model_doc/xglm) | ✅ | ✅ | ✅ |
| [XLM](model_doc/xlm) | ✅ | ✅ | ❌ |
| [XLM-ProphetNet](model_doc/xlm-prophetnet) | ✅ | ❌ | ❌ |
| [XLM-RoBERTa](model_doc/xlm-roberta) | ✅ | ✅ | ✅ |
| [XLM-RoBERTa-XL](model_doc/xlm-roberta-xl) | ✅ | ❌ | ❌ |
| [XLM-V](model_doc/xlm-v) | ✅ | ✅ | ✅ |
| [XLNet](model_doc/xlnet) | ✅ | ✅ | ❌ |
| [XLS-R](model_doc/xls_r) | ✅ | ✅ | ✅ |
| [XLSR-Wav2Vec2](model_doc/xlsr_wav2vec2) | ✅ | ✅ | ✅ |
| [YOLOS](model_doc/yolos) | ✅ | ❌ | ❌ |
| [YOSO](model_doc/yoso) | ✅ | ❌ | ❌ |
<!-- End table-->

View File

@ -1,557 +0,0 @@
<!--Copyright 2022 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Quick tour
[[open-in-colab]]
Get up and running with 🤗 Transformers! Whether you're a developer or an everyday user, this quick tour will help you get started and show you how to use the [`pipeline`] for inference, load a pretrained model and preprocessor with an [AutoClass](./model_doc/auto), and quickly train a model with PyTorch or TensorFlow. If you're a beginner, we recommend checking out our tutorials or [course](https://huggingface.co/course/chapter1/1) for more in-depth explanations of the concepts introduced here.
Before you begin, make sure you have all the necessary libraries installed:
```bash
!pip install transformers datasets
```
You'll also need to install your preferred machine learning framework:
<frameworkcontent>
<pt>
```bash
pip install torch
```
</pt>
<tf>
```bash
pip install tensorflow
```
</tf>
</frameworkcontent>
## Pipeline
<Youtube id="tiZFewofSLM"/>
The [`pipeline`] is the easiest and fastest way to use a pretrained model for inference. You can use the [`pipeline`] out-of-the-box for many tasks across different modalities, some of which are shown in the table below:
<Tip>
For a complete list of available tasks, check out the [pipeline API reference](./main_classes/pipelines).
</Tip>
| **Task**                     | **Description**                                                                                              | **Modality**    | **Pipeline identifier**                       |
|------------------------------|--------------------------------------------------------------------------------------------------------------|-----------------|-----------------------------------------------|
| Text classification          | assign a label to a given sequence of text                                                                     | NLP             | pipeline(task="sentiment-analysis")           |
| Text generation              | generate text given a prompt                                                                                   | NLP             | pipeline(task="text-generation")              |
| Summarization                | generate a summary of a sequence of text or a document                                                         | NLP             | pipeline(task="summarization")                |
| Image classification         | assign a label to an image                                                                                     | Computer vision | pipeline(task="image-classification")         |
| Image segmentation           | assign a label to each individual pixel of an image (supports semantic, panoptic, and instance segmentation)   | Computer vision | pipeline(task="image-segmentation")           |
| Object detection             | predict the bounding boxes and classes of objects in an image                                                  | Computer vision | pipeline(task="object-detection")             |
| Audio classification         | assign a label to some audio data                                                                              | Audio           | pipeline(task="audio-classification")         |
| Automatic speech recognition | transcribe speech into text                                                                                    | Audio           | pipeline(task="automatic-speech-recognition") |
| Visual question answering    | answer a question about an image, given the image and a question                                               | Multimodal      | pipeline(task="vqa")                          |
| Document question answering  | answer a question about a document, given the document and a question                                          | Multimodal      | pipeline(task="document-question-answering")  |
| Image captioning             | generate a caption for a given image                                                                           | Multimodal      | pipeline(task="image-to-text")                |
Start by creating an instance of [`pipeline`] and specifying the task you want to use it for. In this guide, you'll use the [`pipeline`] for sentiment analysis as an example:
```py
>>> from transformers import pipeline
>>> classifier = pipeline("sentiment-analysis")
```
The [`pipeline`] downloads and caches a default [pretrained model](https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english) and tokenizer for sentiment analysis. Now you can use the `classifier` on your target text:
```py
>>> classifier("We are very happy to show you the 🤗 Transformers library.")
[{'label': 'POSITIVE', 'score': 0.9998}]
```
If you have more than one input, pass your inputs as a list to the [`pipeline`] to return a list of dictionaries:
```py
>>> results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."])
>>> for result in results:
... print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
label: POSITIVE, with score: 0.9998
label: NEGATIVE, with score: 0.5309
```
The [`pipeline`] can also iterate over an entire dataset for any task you like. For this example, let's choose automatic speech recognition as our task:
```py
>>> import torch
>>> from transformers import pipeline
>>> speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")
```
Load an audio dataset you'd like to iterate over (see the 🤗 Datasets [Quick Start](https://huggingface.co/docs/datasets/quickstart#audio) for more details). For example, load the [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) dataset:
```py
>>> from datasets import load_dataset, Audio
>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train") # doctest: +IGNORE_RESULT
```
You need to make sure the sampling rate of the dataset matches the sampling
rate [`facebook/wav2vec2-base-960h`](https://huggingface.co/facebook/wav2vec2-base-960h) was trained on:
```py
>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate))
```
The audio files are automatically loaded and resampled when calling the `"audio"` column.
Extract the raw waveform arrays from the first 4 samples and pass them as a list to the pipeline:
```py
>>> result = speech_recognizer(dataset[:4]["audio"])
>>> print([d["text"] for d in result])
['I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT', "FONDERING HOW I'D SET UP A JOIN TO HELL T WITH MY WIFE AND WHERE THE AP MIGHT BE", "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE APSO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AN I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS", 'HOW DO I FURN A JOINA COUT']
```
For larger datasets where the inputs are big (as in speech or vision), you'll want to pass a generator instead of a list so you don't load every input into memory at once. Take a look at the [pipeline API reference](./main_classes/pipelines) for more information; a minimal sketch of the generator approach follows.
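As a rough sketch (reusing the `speech_recognizer` pipeline and `dataset` objects from above; the helper name `audio_generator` is just for illustration), you could stream inputs like this:
```py
>>> def audio_generator():
...     # yield one raw waveform at a time instead of materializing a list
...     for sample in dataset:
...         yield sample["audio"]["array"]

>>> for prediction in speech_recognizer(audio_generator()):  # doctest: +SKIP
...     print(prediction["text"])
```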
### Use another model and tokenizer in the pipeline
The [`pipeline`] can accommodate any model from the [Hub](https://huggingface.co/models), making it easy to adapt the [`pipeline`] for other use-cases. For example, if you'd like a model capable of handling French text, use the tags on the Hub to filter for an appropriate model. The top filtered result returns a multilingual [BERT model](https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment) finetuned for sentiment analysis that you can use for French text:
```py
>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
```
<frameworkcontent>
<pt>
Use [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and its associated tokenizer (more on an `AutoClass` in the next section):
```py
>>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
>>> model = AutoModelForSequenceClassification.from_pretrained(model_name)
>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
```
</pt>
<tf>
Use [`TFAutoModelForSequenceClassification`] and [`AutoTokenizer`] to load the pretrained model and its associated tokenizer (more on a `TFAutoClass` in the next section):
```py
>>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
>>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
```
</tf>
</frameworkcontent>
Specify the model and tokenizer in the [`pipeline`], and now you can apply the `classifier` on French text:
```py
>>> classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
>>> classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.")
[{'label': '5 stars', 'score': 0.7273}]
```
If you can't find a model for your use-case, you'll need to finetune a pretrained model on your data. Take a look at our [finetuning tutorial](./training) to learn how. Finally, after you've finetuned your pretrained model, please consider [sharing](./model_sharing) the model with the community on the Hub to democratize machine learning for everyone! 🤗
## AutoClass
<Youtube id="AhChOFRegn4"/>
Under the hood, the [`AutoModelForSequenceClassification`] and [`AutoTokenizer`] classes work together to power the [`pipeline`] you used above. An [AutoClass](./model_doc/auto) is a shortcut that automatically retrieves the architecture of a pretrained model from its name or path. You only need to select the appropriate `AutoClass` for your task and its associated preprocessing class.
Let's return to the example from the previous section and see how you can use an `AutoClass` to replicate the results of the [`pipeline`].
### AutoTokenizer
A tokenizer is responsible for preprocessing text into an array of numbers as inputs to a model. There are multiple rules that govern the tokenization process, including how to split a word and at what level words should be split (learn more about tokenization in the [tokenizer summary](./tokenizer_summary)). The most important thing to remember is that you need to instantiate a tokenizer with the same model name to ensure you're using the same tokenization rules the model was pretrained with.
Load a tokenizer with [`AutoTokenizer`]:
```py
>>> from transformers import AutoTokenizer
>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
>>> tokenizer = AutoTokenizer.from_pretrained(model_name)
```
Pass your text to the tokenizer:
```py
>>> encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.")
>>> print(encoding)
{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102],
'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
```
The tokenizer returns a dictionary containing:
* [input_ids](./glossary#input-ids): numerical representations of your tokens.
* [attention_mask](./glossary#attention-mask): indicates which tokens should be attended to.
A tokenizer can also accept a list of inputs, and pad and truncate the text to return a batch with uniform length:
<frameworkcontent>
<pt>
```py
>>> pt_batch = tokenizer(
... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
... padding=True,
... truncation=True,
... max_length=512,
... return_tensors="pt",
... )
```
</pt>
<tf>
```py
>>> tf_batch = tokenizer(
... ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
... padding=True,
... truncation=True,
... max_length=512,
... return_tensors="tf",
... )
```
</tf>
</frameworkcontent>
<Tip>
Check out the [preprocess](./preprocessing) tutorial for more details about tokenization, and how to use an [`AutoImageProcessor`], [`AutoFeatureExtractor`] and [`AutoProcessor`] to preprocess image, audio, and multimodal inputs; a minimal example follows below.
</Tip>
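For instance, loading an image processor looks much like loading a tokenizer (the checkpoint name below is just an illustrative choice, not one used elsewhere in this guide):
```py
>>> from transformers import AutoImageProcessor

>>> image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")  # doctest: +SKIP
```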
### AutoModel
<frameworkcontent>
<pt>
🤗 Transformers provides a simple and unified way to load pretrained instances. This means you can load an [`AutoModel`] like you would load an [`AutoTokenizer`]. The only difference is selecting the correct [`AutoModel`] for the task. For text (or sequence) classification, you should load [`AutoModelForSequenceClassification`]:
```py
>>> from transformers import AutoModelForSequenceClassification
>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
```
<Tip>
See the [task summary](./task_summary) for tasks supported by an [`AutoModel`] class.
</Tip>
Now pass your preprocessed batch of inputs directly to the model. You just have to unpack the dictionary by adding `**`:
```py
>>> pt_outputs = pt_model(**pt_batch)
```
The model outputs the final activations in the `logits` attribute. Apply the softmax function to the `logits` to retrieve the probabilities:
```py
>>> from torch import nn
>>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
>>> print(pt_predictions)
tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
[0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=<SoftmaxBackward0>)
```
</pt>
<tf>
🤗 Transformers provides a simple and unified way to load pretrained instances. This means you can load a [`TFAutoModel`] like you would load an [`AutoTokenizer`]. The only difference is selecting the correct [`TFAutoModel`] for the task. For text (or sequence) classification, you should load [`TFAutoModelForSequenceClassification`]:
```py
>>> from transformers import TFAutoModelForSequenceClassification
>>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
```
<Tip>
See the [task summary](./task_summary) for tasks supported by an [`AutoModel`] class.
</Tip>
Now pass your preprocessed batch of inputs directly to the model. You can pass the tensors as-is:
```py
>>> tf_outputs = tf_model(tf_batch)
```
The model outputs the final activations in the `logits` attribute. Apply the softmax function to the `logits` to retrieve the probabilities:
```py
>>> import tensorflow as tf
>>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1)
>>> tf_predictions # doctest: +IGNORE_RESULT
```
</tf>
</frameworkcontent>
<Tip>
All 🤗 Transformers models (PyTorch or TensorFlow) output the tensors *before* the final activation
function (like softmax) because the final activation function is often fused with the loss. Model outputs are special dataclasses, so their attributes are autocompleted in an IDE. The model outputs behave like a tuple or a dictionary (you can index with an integer, a slice or a string), in which case attributes that are None are ignored; the short example below illustrates this.
</Tip>
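As a quick illustrative check (reusing `pt_outputs` from above; the exact shape depends on your inputs, so the output shown here is only indicative), attribute access and tuple-style indexing return the same tensor:
```py
>>> pt_outputs.logits.shape  # attribute access  # doctest: +SKIP
torch.Size([2, 5])
>>> pt_outputs[0].shape  # tuple-style indexing returns the same tensor  # doctest: +SKIP
torch.Size([2, 5])
```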
### Save a model
<frameworkcontent>
<pt>
Once your model is fine-tuned, you can save it with its tokenizer using [`PreTrainedModel.save_pretrained`]:
```py
>>> pt_save_directory = "./pt_save_pretrained"
>>> tokenizer.save_pretrained(pt_save_directory) # doctest: +IGNORE_RESULT
>>> pt_model.save_pretrained(pt_save_directory)
```
When you are ready to use the model again, reload it with [`PreTrainedModel.from_pretrained`]:
```py
>>> pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained")
```
</pt>
<tf>
Once your model is fine-tuned, you can save it with its tokenizer using [`TFPreTrainedModel.save_pretrained`]:
```py
>>> tf_save_directory = "./tf_save_pretrained"
>>> tokenizer.save_pretrained(tf_save_directory) # doctest: +IGNORE_RESULT
>>> tf_model.save_pretrained(tf_save_directory)
```
When you are ready to use the model again, reload it with [`TFPreTrainedModel.from_pretrained`]:
```py
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained")
```
</tf>
</frameworkcontent>
One particularly cool 🤗 Transformers feature is the ability to save a model and reload it as either a PyTorch or TensorFlow model. The `from_pt` or `from_tf` parameter can convert the model from one framework to the other:
<frameworkcontent>
<pt>
```py
>>> from transformers import AutoModel
>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
```
</pt>
<tf>
```py
>>> from transformers import TFAutoModel
>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
```
</tf>
</frameworkcontent>
## Custom model builds
You can modify the model's configuration class to change how a model is built. The configuration specifies a model's attributes, such as the number of hidden layers or attention heads. You start from scratch when you initialize a model from a custom configuration class. The model attributes are randomly initialized, and you'll need to train the model before you can use it to get meaningful results.
Start by importing [`AutoConfig`], and then load the pretrained model you want to modify. Within [`AutoConfig.from_pretrained`], you can specify the attribute you want to change, such as the number of attention heads:
```py
>>> from transformers import AutoConfig
>>> my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12)
```
<frameworkcontent>
<pt>
Create a model from your custom configuration with [`AutoModel.from_config`]:
```py
>>> from transformers import AutoModel
>>> my_model = AutoModel.from_config(my_config)
```
</pt>
<tf>
Create a model from your custom configuration with [`TFAutoModel.from_config`]:
```py
>>> from transformers import TFAutoModel
>>> my_model = TFAutoModel.from_config(my_config)
```
</tf>
</frameworkcontent>
Take a look at the [Create a custom architecture](./create_a_model) guide for more information about building custom configurations.
## Trainer - a PyTorch optimized training loop
All models are a standard [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module), so you can use them in any typical training loop. While you can write your own training loop, 🤗 Transformers provides a [`Trainer`] class for PyTorch, which contains the basic training loop and adds additional functionality for features like distributed training, mixed precision, and more.
Depending on your task, you'll typically pass the following parameters to [`Trainer`]:
1. You'll start with a [`PreTrainedModel`] or a [`torch.nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module):
```py
>>> from transformers import AutoModelForSequenceClassification
>>> model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
```
2. [`TrainingArguments`] contains the model hyperparameters you can change, like the learning rate, batch size, and the number of epochs to train for. The default values are used if you don't specify any training arguments:
```py
>>> from transformers import TrainingArguments
>>> training_args = TrainingArguments(
... output_dir="path/to/save/folder/",
... learning_rate=2e-5,
... per_device_train_batch_size=8,
... per_device_eval_batch_size=8,
... num_train_epochs=2,
... )
```
3. Load a preprocessing class like a tokenizer, image processor, feature extractor, or processor:
```py
>>> from transformers import AutoTokenizer
>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
```
4. Load a dataset:
```py
>>> from datasets import load_dataset
>>> dataset = load_dataset("rotten_tomatoes") # doctest: +IGNORE_RESULT
```
5. Create a function to tokenize the dataset:
```py
>>> def tokenize_dataset(dataset):
... return tokenizer(dataset["text"])
```
Then apply it over the entire dataset with [`~datasets.Dataset.map`]:
```py
>>> dataset = dataset.map(tokenize_dataset, batched=True)
```
6. A [`DataCollatorWithPadding`] to create a batch of examples from your dataset:
```py
>>> from transformers import DataCollatorWithPadding
>>> data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
```
Now gather all these classes in [`Trainer`]:
```py
>>> from transformers import Trainer
>>> trainer = Trainer(
... model=model,
... args=training_args,
... train_dataset=dataset["train"],
... eval_dataset=dataset["test"],
... tokenizer=tokenizer,
... data_collator=data_collator,
... ) # doctest: +SKIP
```
When you're ready, call [`~Trainer.train`] to start training:
```py
>>> trainer.train() # doctest: +SKIP
```
<Tip>
For tasks that use a sequence-to-sequence model, like translation or summarization, use the [`Seq2SeqTrainer`] and [`Seq2SeqTrainingArguments`] classes instead.
</Tip>
You can customize the training loop behavior by subclassing the methods inside [`Trainer`]. This allows you to customize features such as the loss function, optimizer, and scheduler. Take a look at the [`Trainer`] reference for which methods can be subclassed.
The other way to customize the training loop is with [Callbacks](./main_classes/callbacks). You can use callbacks to integrate with other libraries and inspect the training loop to report on progress or stop the training early. Callbacks do not modify anything in the training loop itself. To customize something like the loss function, you need to subclass [`Trainer`] instead. A minimal callback sketch is shown below.
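As a rough sketch of the callback approach (reusing the objects defined above; the callback class name is made up for illustration):
```py
>>> from transformers import Trainer, TrainerCallback

>>> class PrintEpochCallback(TrainerCallback):
...     def on_epoch_end(self, args, state, control, **kwargs):
...         # called by the Trainer at the end of every epoch
...         print(f"finished epoch {int(state.epoch)}")

>>> trainer = Trainer(
...     model=model,
...     args=training_args,
...     train_dataset=dataset["train"],
...     eval_dataset=dataset["test"],
...     tokenizer=tokenizer,
...     data_collator=data_collator,
...     callbacks=[PrintEpochCallback()],
... )  # doctest: +SKIP
```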
## Train with TensorFlow
All models are a standard [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model), so they can be trained in TensorFlow with the [Keras](https://keras.io/) API. 🤗 Transformers provides the [`~TFPreTrainedModel.prepare_tf_dataset`] method to easily load your dataset as a `tf.data.Dataset`, so you can start training right away with Keras' [`compile`](https://keras.io/api/models/model_training_apis/#compile-method) and [`fit`](https://keras.io/api/models/model_training_apis/#fit-method) methods.
1. You'll start with a [`TFPreTrainedModel`] or a [`tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model):
```py
>>> from transformers import TFAutoModelForSequenceClassification
>>> model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")
```
2. Load a preprocessing class like a tokenizer, image processor, feature extractor, or processor:
```py
>>> from transformers import AutoTokenizer
>>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
```
3. Create a function to tokenize the dataset:
```py
>>> def tokenize_dataset(dataset):
... return tokenizer(dataset["text"]) # doctest: +SKIP
```
4. Apply the tokenizer over the entire dataset with [`~datasets.Dataset.map`], and then pass the dataset and tokenizer to [`~TFPreTrainedModel.prepare_tf_dataset`]. You can also change the batch size and shuffle the dataset here if you'd like:
```py
>>> dataset = dataset.map(tokenize_dataset) # doctest: +SKIP
>>> tf_dataset = model.prepare_tf_dataset(
... dataset["train"], batch_size=16, shuffle=True, tokenizer=tokenizer
... ) # doctest: +SKIP
```
5. When you're ready, you can call `compile` and `fit` to start training. Note that Transformers models all have a default task-relevant loss function, so you don't need to specify one unless you want to:
```py
>>> from tensorflow.keras.optimizers import Adam
>>> model.compile(optimizer=Adam(3e-5)) # No loss argument!
>>> model.fit(tf_dataset) # doctest: +SKIP
```
## What's next?
Now that you've completed the 🤗 Transformers quick tour, check out our guides and learn how to do more specific things like writing a custom model, fine-tuning a model for a task, and how to train a model with a script. If you're interested in learning more about 🤗 Transformers core concepts, grab a cup of coffee and take a look at our Conceptual Guides!

View File

@ -9,8 +9,4 @@
- sections:
- local: accelerate
title: 加速分布式训练
title: 教程
- sections:
- local: fast_tokenizers
title: 使用 🤗 Tokenizers 中的分词器
title: 开发者指南
title: 教程

View File

@ -1,67 +0,0 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# Use tokenizers from 🤗 Tokenizers
[`PreTrainedTokenizerFast`] depends on the [🤗 Tokenizers](https://huggingface.co/docs/tokenizers) library. Tokenizers obtained from the 🤗 Tokenizers library can be loaded very simply into 🤗 Transformers.
Before getting into the specifics, let's first create a dummy tokenizer in a few lines:
```python
>>> from tokenizers import Tokenizer
>>> from tokenizers.models import BPE
>>> from tokenizers.trainers import BpeTrainer
>>> from tokenizers.pre_tokenizers import Whitespace
>>> tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
>>> trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
>>> tokenizer.pre_tokenizer = Whitespace()
>>> files = [...]
>>> tokenizer.train(files, trainer)
```
We now have a tokenizer trained on the files we defined. We can either continue using it in this runtime, or save it to a JSON file for future re-use.
## Loading directly from the tokenizer object
Let's see how to leverage this tokenizer object in the 🤗 Transformers library. The [`PreTrainedTokenizerFast`] class allows for easy instantiation by accepting the instantiated *tokenizer* object as an argument:
```python
>>> from transformers import PreTrainedTokenizerFast
>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_object=tokenizer)
```
This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to [the tokenizer page](main_classes/tokenizer) for more information.
## Loading from a JSON file
In order to load a tokenizer from a JSON file, let's first start by saving our tokenizer:
```python
>>> tokenizer.save("tokenizer.json")
```
The path we saved this file to can be passed to the [`PreTrainedTokenizerFast`] initialization method with the `tokenizer_file` parameter:
```python
>>> from transformers import PreTrainedTokenizerFast
>>> fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
```
This object can now be used with all the methods shared by the 🤗 Transformers tokenizers! Head to [the tokenizer page](main_classes/tokenizer) for more information. A short usage sketch follows.
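For example, a minimal sketch of calling the wrapped tokenizer (the output ids depend entirely on the vocabulary trained above, so none are shown):
```python
>>> encoding = fast_tokenizer("Hello, world!")  # doctest: +SKIP
>>> print(encoding["input_ids"])  # doctest: +SKIP
```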

View File

@ -229,7 +229,7 @@ tzlocal==4.1
unidic==1.1.0
unidic-lite==1.0.8
uritemplate==4.1.1
urllib3==1.26.18
urllib3==1.26.17
wasabi==0.9.0
wcwidth==0.2.5
websocket-client==1.3.1

View File

@ -90,7 +90,7 @@ tornado==6.3.3
tqdm==4.48.2
traitlets
git+https://github.com/huggingface/transformers.git
urllib3==1.26.18
urllib3==1.26.17
wcwidth==0.2.5
webencodings==0.5.1
wget==3.2

View File

@ -1,5 +1,4 @@
tensorflow<2.15
keras<2.15
tensorboard
scikit-learn
seqeval

View File

@ -125,8 +125,6 @@ _deps = [
"jaxlib>=0.4.1,<=0.4.13",
"jieba",
"kenlm",
# Keras pin - this is to make sure Keras 3 doesn't destroy us. Remove or change when we have proper support.
"keras<2.15",
"keras-nlp>=0.3.1",
"librosa",
"nltk",

View File

@ -343,7 +343,6 @@ _import_structure = {
"models.focalnet": ["FOCALNET_PRETRAINED_CONFIG_ARCHIVE_MAP", "FocalNetConfig"],
"models.fsmt": ["FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP", "FSMTConfig", "FSMTTokenizer"],
"models.funnel": ["FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP", "FunnelConfig", "FunnelTokenizer"],
"models.fuyu": ["FUYU_PRETRAINED_CONFIG_ARCHIVE_MAP", "FuyuConfig", "FuyuProcessor"],
"models.git": ["GIT_PRETRAINED_CONFIG_ARCHIVE_MAP", "GitConfig", "GitProcessor", "GitVisionConfig"],
"models.glpn": ["GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP", "GLPNConfig"],
"models.gpt2": ["GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP", "GPT2Config", "GPT2Tokenizer"],
@ -973,7 +972,6 @@ else:
_import_structure["models.efficientformer"].append("EfficientFormerImageProcessor")
_import_structure["models.efficientnet"].append("EfficientNetImageProcessor")
_import_structure["models.flava"].extend(["FlavaFeatureExtractor", "FlavaImageProcessor", "FlavaProcessor"])
_import_structure["models.fuyu"].append("FuyuImageProcessor")
_import_structure["models.glpn"].extend(["GLPNFeatureExtractor", "GLPNImageProcessor"])
_import_structure["models.idefics"].extend(["IdeficsImageProcessor"])
_import_structure["models.imagegpt"].extend(["ImageGPTFeatureExtractor", "ImageGPTImageProcessor"])
@ -1866,7 +1864,6 @@ else:
"load_tf_weights_in_funnel",
]
)
_import_structure["models.fuyu"].extend(["FuyuForCausalLM", "FuyuPreTrainedModel"])
_import_structure["models.git"].extend(
[
"GIT_PRETRAINED_MODEL_ARCHIVE_LIST",
@ -4492,7 +4489,6 @@ if TYPE_CHECKING:
from .models.focalnet import FOCALNET_PRETRAINED_CONFIG_ARCHIVE_MAP, FocalNetConfig
from .models.fsmt import FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP, FSMTConfig, FSMTTokenizer
from .models.funnel import FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP, FunnelConfig, FunnelTokenizer
from .models.fuyu import FUYU_PRETRAINED_CONFIG_ARCHIVE_MAP, FuyuConfig, FuyuProcessor
from .models.git import GIT_PRETRAINED_CONFIG_ARCHIVE_MAP, GitConfig, GitProcessor, GitVisionConfig
from .models.glpn import GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP, GLPNConfig
from .models.gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2Config, GPT2Tokenizer
@ -5057,7 +5053,6 @@ if TYPE_CHECKING:
from .models.efficientformer import EfficientFormerImageProcessor
from .models.efficientnet import EfficientNetImageProcessor
from .models.flava import FlavaFeatureExtractor, FlavaImageProcessor, FlavaProcessor
from .models.fuyu import FuyuImageProcessor
from .models.glpn import GLPNFeatureExtractor, GLPNImageProcessor
from .models.idefics import IdeficsImageProcessor
from .models.imagegpt import ImageGPTFeatureExtractor, ImageGPTImageProcessor
@ -5812,10 +5807,6 @@ if TYPE_CHECKING:
FunnelPreTrainedModel,
load_tf_weights_in_funnel,
)
from .models.fuyu import (
FuyuForCausalLM,
FuyuPreTrainedModel,
)
from .models.git import (
GIT_PRETRAINED_MODEL_ARCHIVE_LIST,
GitForCausalLM,

View File

@ -854,9 +854,6 @@ class PretrainedConfig(PushToHubMixin):
else self.quantization_config
)
# pop the `_pre_quantization_dtype` as torch.dtypes are not serializable.
_ = serializable_config_dict.pop("_pre_quantization_dtype", None)
self.dict_torch_dtype_to_str(serializable_config_dict)
if "_flash_attn_2_enabled" in serializable_config_dict:
@ -899,9 +896,6 @@ class PretrainedConfig(PushToHubMixin):
else self.quantization_config
)
# pop the `_pre_quantization_dtype` as torch.dtypes are not serializable.
_ = output.pop("_pre_quantization_dtype", None)
self.dict_torch_dtype_to_str(output)
return output

View File

@ -1168,9 +1168,9 @@ class LlamaConverter(SpmConverter):
)
tokenizer.add_special_tokens(
[
AddedToken("<unk>", normalized=False, special=True),
AddedToken("<s>", normalized=False, special=True),
AddedToken("</s>", normalized=False, special=True),
AddedToken("<unk>"),
AddedToken("<s>"),
AddedToken("</s>"),
]
)
else:

View File

@ -32,7 +32,6 @@ deps = {
"jaxlib": "jaxlib>=0.4.1,<=0.4.13",
"jieba": "jieba",
"kenlm": "kenlm",
"keras": "keras<2.15",
"keras-nlp": "keras-nlp>=0.3.1",
"librosa": "librosa",
"nltk": "nltk",

View File

@ -276,14 +276,9 @@ class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
selected. The formula can be seen in the original [paper](https://arxiv.org/pdf/1909.05858.pdf). According to the
paper a penalty of around 1.2 yields a good balance between truthful generation and lack of repetition.
This technique can also be used to reward and thus encourage repetition in a similar manner. To penalize and reduce
repetition, use `penalty` values above 1.0, where a higher value penalizes more strongly. To reward and encourage
repetition, use `penalty` values between 0.0 and 1.0, where a lower value rewards more strongly.
Args:
penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. Above 1.0 penalizes previously generated
tokens. Between 0.0 and 1.0 rewards previously generated tokens. See [this
repetition_penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
Examples:
@ -318,7 +313,7 @@ class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
score = torch.gather(scores, 1, input_ids)
# if score < 0 then repetition penalty has to be multiplied to reduce the token probabilities
# if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
score = torch.where(score < 0, score * self.penalty, score / self.penalty)
scores.scatter_(1, input_ids, score)
@ -327,18 +322,11 @@ class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
class EncoderRepetitionPenaltyLogitsProcessor(LogitsProcessor):
r"""
[`LogitsProcessor`] that avoids hallucination by boosting the probabilities of tokens found within the original
input.
This technique can also be used to reward and thus encourage hallucination (or creativity) in a similar manner. To
penalize and reduce hallucination, use `penalty` values above 1.0, where a higher value penalizes more strongly. To
reward and encourage hallucination, use `penalty` values between 0.0 and 1.0, where a lower value rewards more
strongly.
[`LogitsProcessor`] enforcing an exponential penalty on tokens that are not in the original input.
Args:
penalty (`float`):
The parameter for hallucination penalty. 1.0 means no penalty. Above 1.0 penalizes hallucination. Between
0.0 and 1.0 rewards hallucination.
hallucination_penalty (`float`):
The parameter for hallucination penalty. 1.0 means no penalty.
encoder_input_ids (`torch.LongTensor`):
The encoder_input_ids that should be repeated within the decoder ids.
"""
@ -354,7 +342,7 @@ class EncoderRepetitionPenaltyLogitsProcessor(LogitsProcessor):
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
score = torch.gather(scores, 1, self.encoder_input_ids)
# if score < 0 then hallucination penalty has to be multiplied to increase the token probabilities
# if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
score = torch.where(score < 0, score * self.penalty, score / self.penalty)
scores.scatter_(1, self.encoder_input_ids, score)

View File

@ -23,8 +23,7 @@ STOPPING_CRITERIA_INPUTS_DOCSTRING = r"""
[What are input IDs?](../glossary#input-ids)
scores (`torch.FloatTensor` of shape `(batch_size, config.vocab_size)`):
Prediction scores of a language modeling head. These can be scores for each vocabulary token before SoftMax
or scores for each vocabulary token after SoftMax. If this stopping criteria depends on the `scores` input,
make sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`.
or scores for each vocabulary token after SoftMax.
kwargs (`Dict[str, Any]`, *optional*):
Additional stopping criteria specific kwargs.
@ -35,11 +34,7 @@ STOPPING_CRITERIA_INPUTS_DOCSTRING = r"""
class StoppingCriteria(ABC):
"""Abstract base class for all stopping criteria that can be applied during generation.
If your stopping criteria depends on the `scores` input, make sure you pass `return_dict_in_generate=True,
output_scores=True` to `generate`.
"""
"""Abstract base class for all stopping criteria that can be applied during generation."""
@add_start_docstrings(STOPPING_CRITERIA_INPUTS_DOCSTRING)
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:

View File

@ -1430,22 +1430,14 @@ class TFGenerationMixin:
# instantiate warpers list
warpers = TFLogitsProcessorList()
# In beam methods, we need to keep at least one non-eos token to explore continuations that might have a
# better score (i.e. keep len(generation_config.eos_token_id) + 1)
if generation_config.num_beams > 1:
if isinstance(generation_config.eos_token_id, list):
min_tokens_to_keep = len(generation_config.eos_token_id) + 1
else:
min_tokens_to_keep = 2
else:
min_tokens_to_keep = 1
# the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files
# all samplers can be found in `generation_utils_samplers.py`
if generation_config.temperature is not None and generation_config.temperature != 1.0:
warpers.append(TFTemperatureLogitsWarper(generation_config.temperature))
if generation_config.top_k is not None and generation_config.top_k != 0:
warpers.append(TFTopKLogitsWarper(top_k=generation_config.top_k, min_tokens_to_keep=min_tokens_to_keep))
warpers.append(TFTopKLogitsWarper(top_k=generation_config.top_k, min_tokens_to_keep=1))
if generation_config.top_p is not None and generation_config.top_p < 1.0:
warpers.append(TFTopPLogitsWarper(top_p=generation_config.top_p, min_tokens_to_keep=min_tokens_to_keep))
warpers.append(TFTopPLogitsWarper(top_p=generation_config.top_p, min_tokens_to_keep=1))
return warpers
def _get_logits_processor(
@ -2374,11 +2366,14 @@ class TFGenerationMixin:
log_probs = tf.nn.log_softmax(logits)
log_probs = logits_processor(flatten_beam_dim(running_sequences), flatten_beam_dim(log_probs), cur_len)
log_probs = unflatten_beam_dim(log_probs, num_beams)
if do_sample:
log_probs = logits_warper(flatten_beam_dim(running_sequences), flatten_beam_dim(log_probs), cur_len)
log_probs = unflatten_beam_dim(log_probs, num_beams)
log_probs_processed = log_probs
log_probs = log_probs + tf.expand_dims(running_scores, axis=2)
if do_sample:
# Note: logits warpers are intentionally applied after adding running beam scores. On some logits
# warpers (like top_p) this is indifferent, but on others (like temperature) it is not. For reference,
# see https://github.com/huggingface/transformers/pull/5420#discussion_r449779867
log_probs = logits_warper(flatten_beam_dim(running_sequences), flatten_beam_dim(log_probs), cur_len)
log_probs = unflatten_beam_dim(log_probs, num_beams)
vocab_size = log_probs.shape[2]
log_probs = tf.reshape(log_probs, (batch_size, num_beams * vocab_size))

View File

@ -820,20 +820,11 @@ class GenerationMixin:
# instantiate warpers list
warpers = LogitsProcessorList()
# In beam methods, we need to keep at least one non-eos token to explore continuations that might have a
# better score (i.e. keep len(list(generation_config.eos_token_id)) + 1)
if generation_config.num_beams > 1:
if isinstance(generation_config.eos_token_id, list):
min_tokens_to_keep = len(generation_config.eos_token_id) + 1
else:
min_tokens_to_keep = 2
else:
min_tokens_to_keep = 1
# the following idea is largely copied from this PR: https://github.com/huggingface/transformers/pull/5420/files
# all samplers can be found in `generation_utils_samplers.py`
if generation_config.temperature is not None and generation_config.temperature != 1.0:
warpers.append(TemperatureLogitsWarper(generation_config.temperature))
min_tokens_to_keep = 2 if generation_config.num_beams > 1 else 1
if generation_config.top_k is not None and generation_config.top_k != 0:
warpers.append(TopKLogitsWarper(top_k=generation_config.top_k, min_tokens_to_keep=min_tokens_to_keep))
if generation_config.top_p is not None and generation_config.top_p < 1.0:
@ -1397,9 +1388,7 @@ class GenerationMixin:
stopping_criteria (`StoppingCriteriaList`, *optional*):
Custom stopping criteria that complement the default stopping criteria built from arguments and a
generation config. If a stopping criteria is passed that is already created with the arguments or a
generation config an error is thrown. If your stopping criteria depends on the `scores` input, make
sure you pass `return_dict_in_generate=True, output_scores=True` to `generate`. This feature is
intended for advanced users.
generation config an error is thrown. This feature is intended for advanced users.
prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], List[int]]`, *optional*):
If provided, this function constraints the beam search to allowed tokens only at each step. If not
provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
@ -3417,15 +3406,18 @@ class GenerationMixin:
) # (batch_size * num_beams, vocab_size)
next_token_scores_processed = logits_processor(input_ids, next_token_scores)
next_token_scores_processed = logits_warper(input_ids, next_token_scores_processed)
next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(
next_token_scores_processed
)
# Note: logits warpers are intentionally applied after adding running beam scores. On some logits warpers
# (like top_p) this is indifferent, but on others (like temperature) it is not. For reference, see
# https://github.com/huggingface/transformers/pull/5420#discussion_r449779867
next_token_scores = logits_warper(input_ids, next_token_scores)
# Store scores, attentions and hidden_states when required
if return_dict_in_generate:
if output_scores:
scores += (next_token_scores_processed,)
scores += (logits_warper(input_ids, next_token_scores_processed),)
if output_attentions:
decoder_attentions += (
(outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)

View File

@ -2178,25 +2178,8 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
"`.to` is not supported for `4-bit` or `8-bit` bitsandbytes models. Please use the model as it is, since the"
" model has already been set to the correct devices and casted to the correct `dtype`."
)
elif getattr(self, "quantization_method", None) == QuantizationMethod.GPTQ:
# For GPTQ models, we prevent users from casting the model to another dtype to restrict unwanted behaviours.
# the correct API should be to load the model with the desired dtype directly through `from_pretrained`.
dtype_present_in_args = False
if "dtype" not in kwargs:
for arg in args:
if isinstance(arg, torch.dtype):
dtype_present_in_args = True
break
else:
dtype_present_in_args = True
if dtype_present_in_args:
raise ValueError(
"You cannot cast a GPTQ model in a new `dtype`. Make sure to load the model using `from_pretrained` using the desired"
" `dtype` by passing the correct `torch_dtype` argument."
)
return super().to(*args, **kwargs)
else:
return super().to(*args, **kwargs)
def half(self, *args):
# Checks if the model is quantized
@ -3182,12 +3165,6 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
if hasattr(model, "quantization_method"):
model.is_quantized = True
# We store the original dtype for quantized models as we cannot easily retrieve it
# once the weights have been quantized
# Note that once you have loaded a quantized model, you can't change its dtype so this will
# remain a single source of truth
config._pre_quantization_dtype = torch_dtype
if isinstance(device_map, str):
special_dtypes = {}
if load_in_8bit or load_in_4bit:

View File

@ -88,7 +88,6 @@ from . import (
focalnet,
fsmt,
funnel,
fuyu,
git,
glpn,
gpt2,

View File

@ -97,7 +97,6 @@ CONFIG_MAPPING_NAMES = OrderedDict(
("focalnet", "FocalNetConfig"),
("fsmt", "FSMTConfig"),
("funnel", "FunnelConfig"),
("fuyu", "FuyuConfig"),
("git", "GitConfig"),
("glpn", "GLPNConfig"),
("gpt-sw3", "GPT2Config"),
@ -311,7 +310,6 @@ CONFIG_ARCHIVE_MAP_MAPPING_NAMES = OrderedDict(
("focalnet", "FOCALNET_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("fsmt", "FSMT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("funnel", "FUNNEL_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("fuyu", "FUYU_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("git", "GIT_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("glpn", "GLPN_PRETRAINED_CONFIG_ARCHIVE_MAP"),
("gpt2", "GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP"),
@ -523,7 +521,6 @@ MODEL_NAMES_MAPPING = OrderedDict(
("focalnet", "FocalNet"),
("fsmt", "FairSeq Machine-Translation"),
("funnel", "Funnel Transformer"),
("fuyu", "Fuyu"),
("git", "GIT"),
("glpn", "GLPN"),
("gpt-sw3", "GPT-Sw3"),

View File

@ -64,7 +64,6 @@ IMAGE_PROCESSOR_MAPPING_NAMES = OrderedDict(
("efficientnet", "EfficientNetImageProcessor"),
("flava", "FlavaImageProcessor"),
("focalnet", "BitImageProcessor"),
("fuyu", "FuyuImageProcessor"),
("git", "CLIPImageProcessor"),
("glpn", "GLPNImageProcessor"),
("groupvit", "CLIPImageProcessor"),

View File

@ -400,7 +400,6 @@ MODEL_FOR_CAUSAL_LM_MAPPING_NAMES = OrderedDict(
("electra", "ElectraForCausalLM"),
("ernie", "ErnieForCausalLM"),
("falcon", "FalconForCausalLM"),
("fuyu", "FuyuForCausalLM"),
("git", "GitForCausalLM"),
("gpt-sw3", "GPT2LMHeadModel"),
("gpt2", "GPT2LMHeadModel"),

View File

@ -54,7 +54,6 @@ PROCESSOR_MAPPING_NAMES = OrderedDict(
("clip", "CLIPProcessor"),
("clipseg", "CLIPSegProcessor"),
("flava", "FlavaProcessor"),
("fuyu", "FuyuProcessor"),
("git", "GitProcessor"),
("groupvit", "CLIPProcessor"),
("hubert", "Wav2Vec2Processor"),

View File

@ -204,6 +204,8 @@ class BartTokenizer(PreTrainedTokenizer):
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
# Mask token behave like a normal word, i.e. include the space before it
# TODO seems like both slow and fast actually don't strip left and right soooooooo yeah. See `test_embeded_special_tokens`
# Also this not only will strip the spaces but any punctuation
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
with open(vocab_file, encoding="utf-8") as vocab_handle:

View File

@ -170,12 +170,7 @@ class BartTokenizerFast(PreTrainedTokenizerFast):
trim_offsets=True,
**kwargs,
):
# we have to specify that this tokens is special otherwise adding it will reset the normalized flag to `False` in `add_special_tokens`
mask_token = (
AddedToken(mask_token, lstrip=True, normalized=True, special=True)
if isinstance(mask_token, str)
else mask_token
)
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
super().__init__(
vocab_file,
merges_file,

View File

@ -136,8 +136,8 @@ class BarthezTokenizer(PreTrainedTokenizer):
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
# Mask token behave like a normal word, i.e. include the space before it. Will have normalized=False by default this way
mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

View File

@ -51,19 +51,15 @@ class BertGenerationTokenizer(PreTrainedTokenizer):
vocab_file (`str`):
[SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
contains the vocabulary necessary to instantiate a tokenizer.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The begin of sequence token.
eos_token (`str`, *optional*, defaults to `"</s>"`):
The end of sequence token.
bos_token (`str`, *optional*, defaults to `"<s>"`):
The begin of sequence token.
unk_token (`str`, *optional*, defaults to `"<unk>"`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
pad_token (`str`, *optional*, defaults to `"<pad>"`):
The token used for padding, for example when batching sequences of different lengths.
sep_token (`str`, *optional*, defaults to `"<::::>"`):
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
sequence classification or for a text and a question for question answering. It is also used as the last
token of a sequence built with special tokens.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,

View File

@ -149,10 +149,10 @@ class BertweetTokenizer(PreTrainedTokenizer):
self.merges_file = merges_file
self.encoder = {}
self.encoder[str(bos_token)] = 0
self.encoder[str(pad_token)] = 1
self.encoder[str(eos_token)] = 2
self.encoder[str(unk_token)] = 3
self.encoder[bos_token] = 0
self.encoder[pad_token] = 1
self.encoder[eos_token] = 2
self.encoder[unk_token] = 3
self.add_from_file(vocab_file)

View File

@ -423,12 +423,6 @@ class BlenderbotTokenizer(PreTrainedTokenizer):
"""
A very simple chat template that just adds whitespace between messages.
"""
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return (
"{% for message in messages %}"
"{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"

View File

@ -305,12 +305,6 @@ class BlenderbotTokenizerFast(PreTrainedTokenizerFast):
"""
A very simple chat template that just adds whitespace between messages.
"""
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return (
"{% for message in messages %}"
"{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"

View File

@ -242,12 +242,6 @@ class BlenderbotSmallTokenizer(PreTrainedTokenizer):
"""
A very simple chat template that just adds whitespace between messages.
"""
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return (
"{% for message in messages %}"
"{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"

View File

@ -124,12 +124,6 @@ class BlenderbotSmallTokenizerFast(PreTrainedTokenizerFast):
"""
A very simple chat template that just adds whitespace between messages.
"""
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return (
"{% for message in messages %}"
"{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}"

View File

@ -168,10 +168,4 @@ class BloomTokenizerFast(PreTrainedTokenizerFast):
"""
A simple chat template that ignores role information and just concatenates messages with EOS tokens.
"""
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"

View File

@ -89,7 +89,7 @@ class CamembertTokenizer(PreTrainedTokenizer):
mask_token (`str`, *optional*, defaults to `"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
additional_special_tokens (`List[str]`, *optional*, defaults to `['<s>NOTUSED', '</s>NOTUSED', '<unk>NOTUSED']`):
additional_special_tokens (`List[str]`, *optional*, defaults to `['<s>NOTUSED', '</s>NOTUSED']`):
Additional special tokens used by the tokenizer.
sp_model_kwargs (`dict`, *optional*):
Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
@ -127,16 +127,12 @@ class CamembertTokenizer(PreTrainedTokenizer):
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED", "<unk>NOTUSED"],
additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"],
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
# Mask token behave like a normal word, i.e. include the space before it
mask_token = (
AddedToken(mask_token, lstrip=True, rstrip=False, normalized=False, special=True)
if isinstance(mask_token, str)
else mask_token
)
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
@ -148,11 +144,11 @@ class CamembertTokenizer(PreTrainedTokenizer):
# sentencepiece vocabulary (this is the case for <s> and </s> and <unk>).
# In this case it is recommended to properly set the tokens by hand.
self._added_tokens_decoder = {
0: AddedToken("<s>NOTUSED", special=True),
1: AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token,
2: AddedToken("</s>NOTUSED", special=True),
3: AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token,
4: AddedToken("<unk>NOTUSED", special=True),
0: AddedToken("<s>NOTUSED"),
1: AddedToken(pad_token),
2: AddedToken("</s>NOTUSED"),
3: AddedToken(unk_token),
4: AddedToken("<unk>NOTUSED"),
}
self.fairseq_offset = 4 # 3 tokens are newly added, but the offset starts from 4
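The two constructor variants above differ in whether the sentinel tokens carry `special=True` and whether `<unk>NOTUSED` appears in `additional_special_tokens`; the id layout 0-4 is fixed either way. A minimal sketch using the same `AddedToken` signature as the hunk:

```python
from transformers import AddedToken  # re-exported from the `tokenizers` library

# Sketch of the reserved sentinel slots shown above; `special=True` keeps them
# out of decoded text when `skip_special_tokens=True`.
added_tokens_decoder = {
    0: AddedToken("<s>NOTUSED", special=True),
    1: AddedToken("<pad>", special=True),
    2: AddedToken("</s>NOTUSED", special=True),
    3: AddedToken("<unk>", special=True),
    4: AddedToken("<unk>NOTUSED", special=True),
}
```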

View File

@ -119,11 +119,12 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
unk_token="<unk>",
pad_token="<pad>",
mask_token="<mask>",
additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED", "<unk>NOTUSED"],
additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"],
**kwargs,
):
# Mask token behave like a normal word, i.e. include the space before it. Will have normalized = False
mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
super().__init__(
vocab_file,
tokenizer_file=tokenizer_file,

View File

@ -75,13 +75,8 @@ class ChineseCLIPTextConfig(PretrainedConfig):
The vocabulary size of the `token_type_ids` passed when calling [`ChineseCLIPModel`].
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1.0):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
layer_norm_eps (`float`, *optional*, defaults to 1e-12):
The epsilon used by the layer normalization layers.
pad_token_id (`int`, *optional*, defaults to 0):
Padding token id.
position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
@ -182,14 +177,10 @@ class ChineseCLIPVisionConfig(PretrainedConfig):
Dimensionality of the encoder layers and the pooler layer.
intermediate_size (`int`, *optional*, defaults to 3072):
Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
projection_dim (`int`, *optional*, defaults to 512):
Dimensionality of text and vision projection layers.
num_hidden_layers (`int`, *optional*, defaults to 12):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 12):
Number of attention heads for each attention layer in the Transformer encoder.
num_channels (`int`, *optional*, defaults to 3):
The number of input channels.
image_size (`int`, *optional*, defaults to 224):
The size (resolution) of each image.
patch_size (`int`, *optional*, defaults to 32):
@ -197,13 +188,13 @@ class ChineseCLIPVisionConfig(PretrainedConfig):
hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
`"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
layer_norm_eps (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio for the attention probabilities.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
initializer_factor (`float`, *optional*, defaults to 1.0):
initializer_factor (`float`, *optional*, defaults to 1):
A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
testing).
Example:
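(The hunk above ends at the docstring's own `Example:` marker.) As a separate, hedged illustration of the vision-config defaults it documents:

```python
from transformers import ChineseCLIPVisionConfig

# Minimal sketch: pass a few of the documented defaults explicitly; omitted
# arguments keep their default values.
config = ChineseCLIPVisionConfig(
    intermediate_size=3072,
    image_size=224,
    patch_size=32,
    layer_norm_eps=1e-5,
    initializer_factor=1.0,
)
print(config.hidden_act)  # "quick_gelu" by default, per the docstring above
```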

View File

@ -59,7 +59,7 @@ class ChineseCLIPImageProcessor(BaseImageProcessor):
Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
method.
resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
do_center_crop (`bool`, *optional*, defaults to `True`):
Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
@ -73,7 +73,7 @@ class ChineseCLIPImageProcessor(BaseImageProcessor):
rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
method.
do_normalize (`bool`, *optional*, defaults to `True`):
do_normalize:
Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
image_mean (`float` or `List[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
Mean to use if normalizing the image. This is a float or list of floats the length of the number of
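A minimal usage sketch for the processor documented above, assuming its defaults (bicubic resize of the shortest edge, center crop, 1/255 rescaling, normalization):

```python
import numpy as np
from transformers import ChineseCLIPImageProcessor

# Hedged sketch: run one random HxWxC uint8 image through the default pipeline
# and inspect the batched output.
image_processor = ChineseCLIPImageProcessor()
image = np.random.randint(0, 256, size=(480, 640, 3), dtype=np.uint8)
inputs = image_processor(images=image, return_tensors="np")
print(inputs["pixel_values"].shape)  # (1, 3, crop_height, crop_width)
```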

View File

@ -462,19 +462,10 @@ class CodeLlamaTokenizer(PreTrainedTokenizer):
The output should look something like:
<bos>[INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer <eos><bos>[INST] Prompt [/INST] Answer <eos>
<bos>[INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer <eos> <bos>[INST] Prompt [/INST] Answer <eos>
<bos>[INST] Prompt [/INST]
The reference for this chat template is [this code
snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362)
in the original repository.
"""
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
template = (
"{% if messages[0]['role'] == 'system' %}"
"{% set loop_messages = messages[1:] %}" # Extract system message if it's present

View File

@ -360,19 +360,10 @@ class CodeLlamaTokenizerFast(PreTrainedTokenizerFast):
The output should look something like:
<bos>[INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer <eos><bos>[INST] Prompt [/INST] Answer <eos>
<bos>[INST] B_SYS SystemPrompt E_SYS Prompt [/INST] Answer <eos> <bos>[INST] Prompt [/INST] Answer <eos>
<bos>[INST] Prompt [/INST]
The reference for this chat template is [this code
snippet](https://github.com/facebookresearch/llama/blob/556949fdfb72da27c2f4a40b7f0e4cf0b8153a28/llama/generation.py#L320-L362)
in the original repository.
"""
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
template = (
"{% if messages[0]['role'] == 'system' %}"
"{% set loop_messages = messages[1:] %}" # Extract system message if it's present

View File

@ -57,8 +57,6 @@ class CodeGenConfig(PretrainedConfig):
n_positions (`int`, *optional*, defaults to 2048):
The maximum sequence length that this model might ever be used with. Typically set this to something large
just in case (e.g., 512 or 1024 or 2048).
n_ctx (`int`, *optional*, defaults to 2048):
This attribute is used in `CodeGenModel.__init__` without any real effect.
n_embd (`int`, *optional*, defaults to 4096):
Dimensionality of the embeddings and hidden states.
n_layer (`int`, *optional*, defaults to 28):
@ -67,29 +65,22 @@ class CodeGenConfig(PretrainedConfig):
Number of attention heads for each attention layer in the Transformer encoder.
rotary_dim (`int`, *optional*, defaults to 64):
Number of dimensions in the embedding that Rotary Position Embedding is applied to.
n_inner (`int`, *optional*):
n_inner (`int`, *optional*, defaults to None):
Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
activation_function (`str`, *optional*, defaults to `"gelu_new"`):
Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
resid_pdrop (`float`, *optional*, defaults to 0.0):
resid_pdrop (`float`, *optional*, defaults to 0.1):
The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
embd_pdrop (`int`, *optional*, defaults to 0.0):
embd_pdrop (`int`, *optional*, defaults to 0.1):
The dropout ratio for the embeddings.
attn_pdrop (`float`, *optional*, defaults to 0.0):
attn_pdrop (`float`, *optional*, defaults to 0.1):
The dropout ratio for the attention.
layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
The epsilon to use in the layer normalization layers.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models).
bos_token_id (`int`, *optional*, defaults to 50256):
Beginning of stream token id.
eos_token_id (`int`, *optional*, defaults to 50256):
End of stream token id.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
model has an output word embedding layer.
Example:
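(Again, the hunk ends at the docstring's `Example:` marker.) A hedged sketch of the defaults documented above:

```python
from transformers import CodeGenConfig

# Minimal sketch: anything not passed keeps its documented default
# (e.g. activation_function="gelu_new", layer_norm_epsilon=1e-5).
config = CodeGenConfig(n_positions=2048, rotary_dim=64, n_inner=None)
print(config.n_embd, config.n_layer)  # 4096 28, per the documented defaults
```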

View File

@ -133,20 +133,16 @@ class CodeGenTokenizer(PreTrainedTokenizer):
errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
The beginning of sequence token.
eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
The end of sequence token.
pad_token (`str`, *optional*):
The token used for padding, for example when batching sequences of different lengths.
add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. (CodeGen tokenizer detects beginning of words by the preceding space).
add_bos_token (`bool`, *optional*, defaults to `False`):
Whether to add a beginning of sequence token at the start of sequences.
"""
vocab_files_names = VOCAB_FILES_NAMES
@ -167,10 +163,10 @@ class CodeGenTokenizer(PreTrainedTokenizer):
add_bos_token=False,
**kwargs,
):
bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
self.add_bos_token = add_bos_token
with open(vocab_file, encoding="utf-8") as vocab_handle:

View File

@ -92,23 +92,25 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast):
refer to this superclass for more information regarding those methods.
Args:
vocab_file (`str`, *optional*):
vocab_file (`str`):
Path to the vocabulary file.
merges_file (`str`, *optional*):
merges_file (`str`):
Path to the merges file.
tokenizer_file (`str`, *optional*):
Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
contains everything needed to load the tokenizer.
unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
errors (`str`, *optional*, defaults to `"replace"`):
Paradigm to follow when decoding bytes to UTF-8. See
[bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
unk_token (`str`, *optional*, defaults to `<|endoftext|>`):
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
token instead.
bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
bos_token (`str`, *optional*, defaults to `<|endoftext|>`):
The beginning of sequence token.
eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
eos_token (`str`, *optional*, defaults to `<|endoftext|>`):
The end of sequence token.
add_prefix_space (`bool`, *optional*, defaults to `False`):
Whether or not to add an initial space to the input. This allows to treat the leading word just as any
other word. (CodeGen tokenizer detects beginning of words by the preceding space).
trim_offsets (`bool`, *optional*, defaults to `True`):
Whether or not the post-processing step should trim offsets to avoid including whitespaces.
"""
vocab_files_names = VOCAB_FILES_NAMES

View File

@ -192,12 +192,12 @@ class DebertaTokenizer(PreTrainedTokenizer):
add_bos_token=False,
**kwargs,
):
bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
# Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

View File

@ -138,7 +138,7 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
self._tokenizer = SPMTokenizer(
vocab_file, None, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs
)
unk_token = AddedToken(unk_token, normalized=True, special=True) if isinstance(unk_token, str) else unk_token
unk_token = AddedToken(unk_token, normalized=True, lstrip=False, rstrip=False)
super().__init__(
do_lower_case=do_lower_case,
bos_token=bos_token,

View File

@ -606,28 +606,22 @@ class FalconFlashAttention2(FalconAttention):
if alibi is not None:
raise ValueError("`alibi` is not supported when `use_flash_attn` is True")
attn_dropout = self.config.attention_dropout if self.training else 0.0
attn_dropout = self.attention_dropout if self.training else 0.0
# In PEFT, usually we cast the layer norms in float32 for training stability reasons
# therefore the input hidden states gets silently casted in float32. Hence, we need
# cast them back in float16 just to be sure everything works as expected.
input_dtype = query_layer.dtype
if input_dtype == torch.float32:
# Handle the case where the model is quantized
if hasattr(self.config, "_pre_quantization_dtype"):
target_dtype = self.config._pre_quantization_dtype
else:
target_dtype = self.query_key_value.weight.dtype
logger.warning_once(
f"The input hidden states seems to be silently casted in float32, this might be related to"
f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
f" {target_dtype}."
"The input hidden states seems to be silently casted in float32, this might be related to"
" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
" float16."
)
query_layer = query_layer.to(target_dtype)
key_layer = key_layer.to(target_dtype)
value_layer = value_layer.to(target_dtype)
query_layer = query_layer.to(torch.float16)
key_layer = key_layer.to(torch.float16)
value_layer = value_layer.to(torch.float16)
attn_output = self._flash_attention_forward(
query_layer, key_layer, value_layer, padding_mask, query_length, dropout=attn_dropout
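The dtype handling changed above boils down to: prefer the pre-quantization dtype recorded on the config, otherwise fall back to the dtype of the fused QKV projection weights. A standalone sketch of that resolution step (names mirror the hunk; the arguments stand in for the attention module's attributes):

```python
import torch

def resolve_target_dtype(config, query_key_value: torch.nn.Linear) -> torch.dtype:
    # Quantized models record the dtype they had before quantization.
    if hasattr(config, "_pre_quantization_dtype"):
        return config._pre_quantization_dtype
    # Otherwise, match the dtype of the fused query/key/value projection weights.
    return query_key_value.weight.dtype
```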

View File

@ -116,10 +116,9 @@ class FNetTokenizer(PreTrainedTokenizer):
) -> None:
# Mask token behave like a normal word, i.e. include the space before it and
# is included in the raw text, there should be a match in a non-normalized sentence.
mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
cls_token = AddedToken(cls_token, special=True) if isinstance(cls_token, str) else cls_token
sep_token = AddedToken(sep_token, special=True) if isinstance(sep_token, str) else sep_token
mask_token = AddedToken(mask_token, special=True) if isinstance(mask_token, str) else mask_token
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.do_lower_case = do_lower_case

View File

@ -1,73 +0,0 @@
# Copyright 2023 AdeptAI and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import TYPE_CHECKING
from ...utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available, is_vision_available
_import_structure = {
"configuration_fuyu": ["FUYU_PRETRAINED_CONFIG_ARCHIVE_MAP", "FuyuConfig"],
}
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["image_processing_fuyu"] = ["FuyuImageProcessor"]
_import_structure["processing_fuyu"] = ["FuyuProcessor"]
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
_import_structure["modeling_fuyu"] = [
"FuyuForCausalLM",
"FuyuPreTrainedModel",
]
if TYPE_CHECKING:
from .configuration_fuyu import FUYU_PRETRAINED_CONFIG_ARCHIVE_MAP, FuyuConfig
try:
if not is_vision_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .image_processing_fuyu import FuyuImageProcessor
from .processing_fuyu import FuyuProcessor
try:
if not is_torch_available():
raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
pass
else:
from .modeling_fuyu import (
FuyuForCausalLM,
FuyuPreTrainedModel,
)
else:
import sys
sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
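For context, this deleted `__init__.py` follows the library's lazy-import pattern: public names are declared in `_import_structure` and only resolved from their submodule on first attribute access. A simplified sketch of the idea (not the actual `_LazyModule` implementation):

```python
import importlib
import types

class LazyModule(types.ModuleType):
    """Toy stand-in for the lazy-module pattern: resolve names on first access."""

    def __init__(self, name, import_structure):
        super().__init__(name)
        self._import_structure = import_structure  # {"submodule": ["PublicName", ...]}

    def __getattr__(self, item):
        for submodule, names in self._import_structure.items():
            if item in names:
                module = importlib.import_module(f".{submodule}", self.__name__)
                return getattr(module, item)
        raise AttributeError(f"module {self.__name__!r} has no attribute {item!r}")
```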

View File

@ -1,211 +0,0 @@
# coding=utf-8
# Copyright 2023 Adept AI and the HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Fuyu model configuration"""
from ...configuration_utils import PretrainedConfig
from ...utils import logging
from ..auto import CONFIG_MAPPING
logger = logging.get_logger(__name__)
FUYU_PRETRAINED_CONFIG_ARCHIVE_MAP = {
"adept/fuyu-8b-base": "https://huggingface.co/adept/fuyu-8b-base/resolve/main/config.json",
}
class FuyuConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`FuyuForCausalLM`]. It is used to instantiate an
Fuyu model according to the specified arguments, defining the model architecture. Instantiating a configuration
with the defaults will yield a similar configuration to that of the
[adept/fuyu-8b-base](https://huggingface.co/adept/fuyu-8b-base).
Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
documentation from [`PretrainedConfig`] for more information.
Args:
vocab_size (`int`, *optional*, defaults to 262144):
Vocabulary size of the Fuyu model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`FuyuForCausalLM`]
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the hidden representations.
intermediate_size (`int`, *optional*, defaults to 16384):
Dimension of the MLP representations.
num_hidden_layers (`int`, *optional*, defaults to 36):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 64):
Number of attention heads for each attention layer in the Transformer encoder.
hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
The non-linear activation function (function or string) in the decoder.
max_position_embeddings (`int`, *optional*, defaults to 16384):
The maximum sequence length that this model might ever be used with.
image_size (`int`, *optional*, defaults to 300):
The input image size.
patch_size (`int`, *optional*, defaults to 30):
The input vision transformer encoding patch size.
num_channels (`int`, *optional*, defaults to 3):
The input image number of channels.
initializer_range (`float`, *optional*, defaults to 0.02):
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
layer_norm_eps (`float`, *optional*, defaults to 1e-05):
The epsilon used by the rms normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether or not the model should return the last key/values attentions (not used by all models). Only
relevant if `config.is_decoder=True`.
tie_word_embeddings (`bool`, *optional*, defaults to `False`):
Whether to tie input and output embeddings.
rope_theta (`float`, *optional*, defaults to 25000.0):
The base period of the RoPE embeddings.
rope_scaling (`Dict`, *optional*):
Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format
is `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
`max_position_embeddings` to the expected new maximum. See the following thread for more information on how
these scaling strategies behave:
https://www.reddit.com/r/LocalFuyu/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
experimental feature, subject to breaking API changes in future versions.
qk_layernorm (`bool`, *optional*, defaults to `True`):
Whether or not to normalize the Queries and Keys after projecting the hidden states
hidden_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio after applying the MLP to the hidden states.
attention_dropout (`float`, *optional*, defaults to 0.0):
The dropout ratio after computing the attention scores.
partial_rotary_factor (`float`, *optional*, defaults to 0.5):
Percentage of the query and keys which will have rotary embedding.
pad_token_id (`int`, *optional*):
The id of the *padding* token.
bos_token_id (`int`, *optional*, defaults to 1):
The id of the *beginning-of-sequence* token.
eos_token_id (`Union[int, List[int]]`, *optional*, defaults to 2):
The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
text_config (`dict`, *optional*):
Dictionary of configuration options used to initialize the language model.
```python
>>> from transformers import FuyuConfig
>>> # Initializing a Fuyu fuyu-7b style configuration
>>> configuration = FuyuConfig()
```"""
model_type = "fuyu"
keys_to_ignore_at_inference = ["past_key_values"]
def __init__(
self,
vocab_size=262144,
hidden_size=4096,
intermediate_size=16384,
num_hidden_layers=36,
num_attention_heads=64,
hidden_act="relu2",
max_position_embeddings=16384,
image_size=300,
patch_size=30,
num_channels=3,
initializer_range=0.02,
layer_norm_eps=1e-5,
use_cache=True,
tie_word_embeddings=False,
rope_theta=25000.0,
rope_scaling=None,
qk_layernorm=True,
hidden_dropout=0.0,
attention_dropout=0.0,
partial_rotary_factor=0.5,
pad_token_id=None,
bos_token_id=1,
eos_token_id=2,
text_config=None,
**kwargs,
):
if text_config is None:
text_config = {
"vocab_size": vocab_size,
"max_position_embeddings": max_position_embeddings,
"hidden_size": hidden_size,
"intermediate_size": intermediate_size,
"num_hidden_layers": num_hidden_layers,
"num_attention_heads": num_attention_heads,
"hidden_act": hidden_act,
"initializer_range": initializer_range,
"layer_norm_eps": layer_norm_eps,
"use_cache": use_cache,
"rope_theta": rope_theta,
"rope_scaling": rope_scaling,
"qk_layernorm": qk_layernorm,
"hidden_dropout": hidden_dropout,
"attention_dropout": attention_dropout,
"partial_rotary_factor": partial_rotary_factor,
"pad_token_id": pad_token_id,
"bos_token_id": bos_token_id,
"eos_token_id": eos_token_id,
"tie_word_embeddings": tie_word_embeddings,
}
logger.info("text_config is None. initializing the text model with default values.")
text_model_type = text_config["model_type"] if "model_type" in text_config else "persimmon"
self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
self.vocab_size = vocab_size
self.max_position_embeddings = max_position_embeddings
self.image_size = image_size
self.patch_size = patch_size
self.num_channels = num_channels
self.hidden_size = hidden_size
self.intermediate_size = intermediate_size
self.num_hidden_layers = num_hidden_layers
self.num_attention_heads = num_attention_heads
self.hidden_act = hidden_act
self.initializer_range = initializer_range
self.layer_norm_eps = layer_norm_eps
self.use_cache = use_cache
self.rope_theta = rope_theta
self.rope_scaling = rope_scaling
self.qk_layernorm = qk_layernorm
self.hidden_dropout = hidden_dropout
self.attention_dropout = attention_dropout
self.partial_rotary_factor = partial_rotary_factor
self._rope_scaling_validation()
super().__init__(
pad_token_id=pad_token_id,
bos_token_id=bos_token_id,
eos_token_id=eos_token_id,
tie_word_embeddings=tie_word_embeddings,
**kwargs,
)
def _rope_scaling_validation(self):
"""
Validate the `rope_scaling` configuration.
"""
if self.rope_scaling is None:
return
if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
raise ValueError(
"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, "
f"got {self.rope_scaling}"
)
rope_scaling_type = self.rope_scaling.get("type", None)
rope_scaling_factor = self.rope_scaling.get("factor", None)
if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
raise ValueError(
f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
)
if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
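Per the validation above, a well-formed `rope_scaling` value has exactly two fields: a supported `type` and a float `factor` strictly greater than 1. For example:

```python
# Passes `_rope_scaling_validation` above: exactly two keys, a supported type,
# and a float factor > 1.
rope_scaling = {"type": "dynamic", "factor": 2.0}
```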

View File

@ -1,134 +0,0 @@
# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import sys
import warnings
import flatdict
import torch
from transformers import FuyuConfig, FuyuForCausalLM, LlamaTokenizer
try:
from transformers import LlamaTokenizerFast
tokenizer_class = LlamaTokenizerFast
except ImportError as e:
warnings.warn(e)
warnings.warn(
"The converted tokenizer will be the `slow` tokenizer. To use the fast, update your `tokenizers` library and re-run the tokenizer conversion"
)
tokenizer_class = LlamaTokenizer
"""
Sample usage: # TODO fix clone links from persimmon to fuyu
```
git clone https://github.com/adept-ai-labs/adept-inference
wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_base_model_release.tar
wget https://axtkn4xl5cip.objectstorage.us-phoenix-1.oci.customer-oci.com/n/axtkn4xl5cip/b/adept-public-data/o/8b_chat_model_release.tar
python src/transformers/models/fuyu/convert_fuyu_weights_to_hf.py --input_dir /path/to/downloaded/fuyu/weights/ --output_dir /output/path
```
Thereafter, models can be loaded via:
```py
from transformers import FuyuForCausalLM, FuyuTokenizer
model = FuyuForCausalLM.from_pretrained("/output/path")
tokenizer = FuyuTokenizer.from_pretrained("/output/path")
```
Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions
come in several checkpoints they each contain a part of each weight of the model, so we need to load them all in RAM).
"""
KEYS_TO_MODIFY_MAPPING = {
"self_attention": "self_attn",
"language_model.encoder": "language_model.model",
"word_embeddings_for_head": "language_model.lm_head",
"language_model.embedding.word_embeddings": "language_model.model.embed_tokens",
"vit_encoder.linear_encoder": "vision_embed_tokens",
}
KEYS_TO_REMOVE = {
"rotary_emb.inv_freq",
"image_patch_projection",
"image_patch_projection.weight",
"image_patch_projection.bias",
}
def rename_state_dict(state_dict):
model_state_dict = {}
for key, value in state_dict.items():
for key_to_modify, new_key in KEYS_TO_MODIFY_MAPPING.items():
if key_to_modify in key:
key = key.replace(key_to_modify, new_key)
# if KEYS_TO_REMOVE in key:
if key in KEYS_TO_REMOVE:
continue
model_state_dict[key] = value
return model_state_dict
def convert_fuyu_checkpoint(pytorch_dump_folder_path, ada_lib_path, pt_model_path, safe_serialization=False):
sys.path.insert(0, ada_lib_path)
model_state_dict_base = torch.load(pt_model_path, map_location="cpu")
state_dict = flatdict.FlatDict(model_state_dict_base["model"], ".")
state_dict = rename_state_dict(state_dict)
transformers_config = FuyuConfig()
model = FuyuForCausalLM(transformers_config).to(torch.bfloat16)
model.load_state_dict(state_dict)
model.save_pretrained(pytorch_dump_folder_path, safe_serialization=safe_serialization)
transformers_config.save_pretrained(pytorch_dump_folder_path)
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--input_dir",
help="Location of Fuyu weights, which contains tokenizer.model and model folders",
)
parser.add_argument(
"--pt_model_path",
help="Location of Fuyu `model_optim_rng.pt`",
)
parser.add_argument(
"--output_dir",
help="Location to write HF model and tokenizer",
)
parser.add_argument(
"--ada_lib_path",
help="Location of original source code from adept to deserialize .pt checkpoint",
)
parser.add_argument("--safe_serialization", type=bool, help="Whether or not to save using `safetensors`.")
args = parser.parse_args()
spm_path = os.path.join(args.input_dir, "adept_vocab.model")
convert_fuyu_checkpoint(
pytorch_dump_folder_path=args.output_dir,
pt_model_path=args.pt_model_path,
safe_serialization=args.safe_serialization,
ada_lib_path=args.ada_lib_path,
)
tokenizer = tokenizer_class(spm_path, bos_token="|ENDOFTEXT|", eos_token="|ENDOFTEXT|")
tokenizer.save_pretrained(args.output_dir)
if __name__ == "__main__":
main()

View File

@ -1,254 +0,0 @@
import math
from typing import List, Union
import numpy as np
from ...image_processing_utils import BaseImageProcessor
from ...image_transforms import (
normalize,
pad,
resize,
)
from ...image_utils import to_numpy_array
from ...utils import is_torch_available, is_vision_available, logging, requires_backends
if is_vision_available():
import PIL
if is_torch_available():
import torch
logger = logging.get_logger(__name__)
class FuyuImageProcessor(BaseImageProcessor):
"""
This class should handle the image processing part before the main FuyuForCausalLM. In particular, it should
handle:
- Processing Images:
Taking a batch of images as input. If the images are variable-sized, it resizes them based on the desired patch
dimensions. The image output is always img_h = 1080, img_w = 1920. Then, it patches up these images using the
patchify_image function.
- Creating Image Input IDs:
For each patch, a placeholder ID is given to identify where these patches belong in a token sequence. For
variable-sized images, each line of patches is terminated with a newline ID.
- Image Patch Indices:
For each image patch, the code maintains an index where these patches should be inserted in a token stream.
"""
model_input_names = [
"images",
"image_input_ids",
"image_patches",
"image_patch_indices_per_batch",
"image_patch_indices_per_subsequence",
]
def __init__(
self, target_height=1080, target_width=1920, padding_value=1.0, padding_mode: str = "constant", **kwargs
):
super().__init__(**kwargs)
self.target_width = target_width
self.target_height = target_height
self.padding_value = padding_value
self.padding_mode = padding_mode
def get_num_patches(self, img_h: int, img_w: int, patch_dim_h: int, patch_dim_w: int) -> int:
"""Calculate number of patches required to encode an image."""
if img_h % patch_dim_h != 0:
raise ValueError(f"{img_h=} must be divisible by {patch_dim_h=}")
if img_w % patch_dim_w != 0:
raise ValueError(f"{img_w=} must be divisible by {patch_dim_w=}")
num_patches_per_dim_h = img_h // patch_dim_h
num_patches_per_dim_w = img_w // patch_dim_w
num_patches = num_patches_per_dim_h * num_patches_per_dim_w
return num_patches
def patchify_image(self, image: "torch.Tensor", patch_dim_h: int, patch_dim_w: int) -> "torch.Tensor":
"""
Convert an image into a tensor of patches.
Args:
image: Image to convert. Shape: [batch, channels, height, width]
patch_dim_h: Height of each patch.
patch_dim_w: Width of each patch.
"""
requires_backends(self, ["torch"])
# TODO refer to https://github.com/ArthurZucker/transformers/blob/0f0a3fe5ca5697ee58faeb5b53f049af720b5e98/src/transformers/models/vit_mae/modeling_vit_mae.py#L871
# torch implementation is faster but does not handle non-squares
batch_size, channels, height, width = image.shape
unfolded_along_height = image.unfold(2, patch_dim_h, patch_dim_h)
patches = unfolded_along_height.unfold(3, patch_dim_w, patch_dim_w)
patches_reshaped = patches.contiguous().view(batch_size, channels, -1, patch_dim_h, patch_dim_w)
patches_final = patches_reshaped.permute(0, 2, 3, 4, 1).reshape(
batch_size, -1, channels * patch_dim_h * patch_dim_w
)
return patches_final
def process_images_for_model_input(
self,
image_input: "torch.Tensor",
image_present: "torch.Tensor",
image_unpadded_h: "torch.Tensor",
image_unpadded_w: "torch.Tensor",
image_patch_dim_h: int,
image_patch_dim_w: int,
image_placeholder_id: int,
image_newline_id: int,
variable_sized: bool,
) -> dict:
"""Process images for model input. In particular, variable-sized images are handled here.
Args:
image_input: [batch_size, 1, c, h, w] tensor of images padded to model input size.
image_present: [batch_size, 1] tensor of 1s and 0s indicating whether an image is present.
image_unpadded_h: [batch_size, 1] tensor of unpadded image heights.
image_unpadded_w: [batch_size, 1] tensor of unpadded image widths.
image_patch_dim_h: The height of the image patches.
image_patch_dim_w: The width of the image patches.
image_placeholder_id: The id of the image placeholder token.
image_newline_id: The id of the image newline token.
variable_sized: Whether to process images as variable-sized.
"""
requires_backends(self, ["torch"])
# Only images that are present.
images: List[List[torch.Tensor]] = []
image_patches: List[List[torch.Tensor]] = []
# Image input ids for every subsequence, including ones with no image present.
image_input_ids: List[List[torch.Tensor]] = []
for bi in range(image_input.shape[0]):
images.append([])
image_input_ids.append([])
image_patches.append([])
for si in range(image_input.shape[1]):
if image_present[bi, si]:
image = image_input[bi, si]
if variable_sized:
# The min() is required here due to floating point issues:
# math.ceil(torch.tensor(300).cuda() / 30) == 11
new_h = min(
image.shape[1], math.ceil(image_unpadded_h[bi, si] / image_patch_dim_h) * image_patch_dim_h
)
new_w = min(
image.shape[2], math.ceil(image_unpadded_w[bi, si] / image_patch_dim_w) * image_patch_dim_w
)
image = image[:, :new_h, :new_w]
images[bi].append(image)
num_patches = self.get_num_patches(
img_h=image.shape[1],
img_w=image.shape[2],
patch_dim_h=image_patch_dim_h,
patch_dim_w=image_patch_dim_w,
)
ids = torch.full([num_patches], image_placeholder_id, dtype=torch.int32, device=image_input.device)
patches = self.patchify_image(
image=image.unsqueeze(0), patch_dim_h=image_patch_dim_h, patch_dim_w=image_patch_dim_w
).squeeze(0)
if variable_sized:
# Now terminate each line with |NEWLINE|.
ids = ids.reshape(-1, new_w // image_patch_dim_w)
ids = torch.cat(
[
ids,
torch.full(
[ids.shape[0], 1], image_newline_id, dtype=torch.int32, device=image_input.device
),
],
dim=1,
)
ids = ids.reshape(-1)
image_input_ids[bi].append(ids)
image_patches[bi].append(patches)
else:
image_input_ids[bi].append(torch.tensor([], dtype=torch.int32, device=image_input.device))
# Create image_patch_input_indices, where non-negative values correspond to image patches to be inserted in
# the stream.
image_patch_indices_per_batch: List[List[torch.Tensor]] = []
image_patch_indices_per_subsequence: List[List[torch.Tensor]] = []
for bi in range(len(image_input_ids)):
image_patch_indices_per_batch.append([])
image_patch_indices_per_subsequence.append([])
index_offset = 0
for si in range(len(image_input_ids[bi])):
# Indices of image patches.
num_patches = torch.count_nonzero(image_input_ids[bi][si] == image_placeholder_id)
indices = torch.arange(
num_patches,
dtype=image_input_ids[bi][si].dtype,
device=image_input_ids[bi][si].device,
)
# Place those indices in the image input ids token stream, with -1 representing non-index tokens.
indices_in_stream_per_batch = torch.full_like(image_input_ids[bi][si], -1)
indices_in_stream_per_subsequence = torch.full_like(image_input_ids[bi][si], -1)
indices_in_stream_per_batch[
torch.nonzero(image_input_ids[bi][si] == image_placeholder_id, as_tuple=True)[0]
] = (indices + index_offset)
indices_in_stream_per_subsequence[
torch.nonzero(image_input_ids[bi][si] == image_placeholder_id, as_tuple=True)[0]
] = indices
image_patch_indices_per_batch[bi].append(indices_in_stream_per_batch)
image_patch_indices_per_subsequence[bi].append(indices_in_stream_per_subsequence)
index_offset += num_patches
return {
"images": images,
"image_input_ids": image_input_ids,
"image_patches": image_patches,
"image_patch_indices_per_batch": image_patch_indices_per_batch,
"image_patch_indices_per_subsequence": image_patch_indices_per_subsequence,
}
def _scale_to_target_aspect_ratio(self, image: np.ndarray) -> np.ndarray:
image_height, image_width, _ = image.shape
if image_width <= self.target_width and image_height <= self.target_height:
return image
height_scale_factor = self.target_height / image_height
width_scale_factor = self.target_width / image_width
optimal_scale_factor = min(height_scale_factor, width_scale_factor)
new_height = int(image_height * optimal_scale_factor)
new_width = int(image_width * optimal_scale_factor)
scaled_image = resize(image=image, size=(new_height, new_width))
return np.array(scaled_image)
def _pad_to_target_size(self, image: np.ndarray) -> np.ndarray:
image_height, image_width, _ = image.shape
padding_top = 0
padding_left = 0
padding_bottom = self.target_height - image_height
padding_right = self.target_width - image_width
padded_image = pad(
image,
((padding_top, padding_bottom), (padding_left, padding_right)),
mode=self.padding_mode,
constant_values=self.padding_value,
)
return padded_image
def apply_transformation(self, image: Union[np.ndarray, PIL.Image.Image]) -> np.ndarray:
if isinstance(image, PIL.Image.Image):
image = to_numpy_array(image)
scaled_image = self._scale_to_target_aspect_ratio(image)
padded_image = self._pad_to_target_size(scaled_image)
normalized_padded_image = normalize(padded_image, 0.5, 0.5)
return normalized_padded_image
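As a worked example of the patch arithmetic in `get_num_patches` above, using this processor's 1080x1920 target size and the 30x30 patch size from the (removed) Fuyu config:

```python
# 1080x1920 target image with 30x30 patches -> 36 rows x 64 columns of patches.
img_h, img_w, patch_dim_h, patch_dim_w = 1080, 1920, 30, 30
num_patches = (img_h // patch_dim_h) * (img_w // patch_dim_w)
assert num_patches == 36 * 64 == 2304
```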

View File

@ -1,323 +0,0 @@
# coding=utf-8
# Copyright 2023 HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" PyTorch Fuyu model."""
from typing import List, Optional, Tuple, Union
import torch
import torch.utils.checkpoint
from torch import nn
from ...modeling_outputs import BaseModelOutputWithPast
from ...modeling_utils import PreTrainedModel
from ...models.auto.modeling_auto import AutoModelForCausalLM
from ...utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging
from .configuration_fuyu import FuyuConfig
logger = logging.get_logger(__name__)
_CONFIG_FOR_DOC = "FuyuConfig"
FUYU_START_DOCSTRING = r"""
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
etc.)
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
and behavior.
Parameters:
config ([`FuyuConfig`]):
Model configuration class with all the parameters of the model. Initializing with a config file does not
load the weights associated with the model, only the configuration. Check out the
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
"""
@add_start_docstrings(
"The bare Fuyu Model outputting raw hidden-states without any specific head on top.",
FUYU_START_DOCSTRING,
)
class FuyuPreTrainedModel(PreTrainedModel):
config_class = FuyuConfig
base_model_prefix = "fuyu"
supports_gradient_checkpointing = True
_no_split_modules = []
_skip_keys_device_placement = "past_key_values"
def _init_weights(self, module):
std = self.config.initializer_range
if isinstance(module, nn.Linear):
module.weight.data.normal_(mean=0.0, std=std)
if module.bias is not None:
module.bias.data.zero_()
elif isinstance(module, nn.Embedding):
module.weight.data.normal_(mean=0.0, std=std)
if module.padding_idx is not None:
module.weight.data[module.padding_idx].zero_()
def _set_gradient_checkpointing(self, module, value=False):
if isinstance(module, FuyuForCausalLM):
module.gradient_checkpointing = value
FUYU_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
it.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
`past_key_values`).
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
information on the default strategy.
- 1 indicates the head is **not masked**,
- 0 indicates the head is **masked**.
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.n_positions - 1]`.
[What are position IDs?](../glossary#position-ids)
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
model's internal embedding lookup matrix.
use_cache (`bool`, *optional*):
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
`past_key_values`).
output_attentions (`bool`, *optional*):
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
tensors for more detail.
output_hidden_states (`bool`, *optional*):
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
more detail.
return_dict (`bool`, *optional*):
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
"""
@add_start_docstrings(
"The bare Fuyu Model outputting raw hidden-states without any specific head on top.",
FUYU_START_DOCSTRING,
)
class FuyuForCausalLM(FuyuPreTrainedModel):
"""
Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`FuyuDecoderLayer`]
Args:
config: FuyuConfig
"""
def __init__(self, config: FuyuConfig):
super().__init__(config)
self.padding_idx = config.pad_token_id
self.vocab_size = config.vocab_size
self.language_model = AutoModelForCausalLM.from_config(config.text_config)
self.vision_embed_tokens = nn.Linear(
config.patch_size * config.patch_size * config.num_channels, config.hidden_size
)
self.gradient_checkpointing = False
# Initialize weights and apply final processing
self.post_init()
def get_input_embeddings(self):
return self.language_model.get_input_embeddings()
def set_input_embeddings(self, value):
self.language_model.set_input_embeddings(value)
def gather_continuous_embeddings(
self,
word_embeddings: torch.Tensor,
continuous_embeddings: List[torch.Tensor],
image_patch_input_indices: torch.Tensor,
) -> torch.Tensor:
"""This function places the continuous_embeddings into the word_embeddings at the locations
indicated by image_patch_input_indices. Different batch elements can have different numbers of continuous
embeddings.
Args:
word_embeddings: Tensor of word embeddings. Shape: [b, s, h]
continuous_embeddings:
Tensor of continuous embeddings. The length of the list is the batch size. Each entry is
shape [num_image_embeddings, hidden], and num_image_embeddings needs to match the number of non-negative
indices in image_patch_input_indices for that batch element.
image_patch_input_indices: Tensor of indices of the image patches in the input_ids tensor. Shape: [b, s]
"""
if not (word_embeddings.shape[0] == len(continuous_embeddings)):
raise ValueError(
f"Batch sizes must match! Got {len(continuous_embeddings)=} and {word_embeddings.shape[0]=}"
)
output_embeddings = word_embeddings.clone()
for batch_idx in range(word_embeddings.shape[0]):
# First, find the positions of all the non-negative values in image_patch_input_indices, those are the
# positions in word_embeddings that we want to replace with content from continuous_embeddings.
dst_indices = torch.nonzero(image_patch_input_indices[batch_idx] >= 0, as_tuple=True)[0]
# Next look up those indices in image_patch_input_indices to find the indices in continuous_embeddings that we
# want to use to replace the values in word_embeddings.
src_indices = image_patch_input_indices[batch_idx][dst_indices]
# Check if we have more indices than embeddings. Note that we could have fewer indices if images got truncated.
if src_indices.shape[0] > continuous_embeddings[batch_idx].shape[0]:
raise ValueError(
f"Number of continuous embeddings {continuous_embeddings[batch_idx].shape=} does not match "
f"number of continuous token ids {src_indices.shape=} in batch element {batch_idx}."
)
output_embeddings[batch_idx, dst_indices] = continuous_embeddings[batch_idx][src_indices]
return output_embeddings
@add_start_docstrings_to_model_forward(FUYU_INPUTS_DOCSTRING)
def forward(
self,
input_ids: torch.LongTensor = None,
image_patches: torch.Tensor = None, # [batch_size, num_total_patches, patch_size_ x patch_size x num_channels ]
image_patches_indices: torch.Tensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
# retrieve input_ids and inputs_embeds
if input_ids is not None and inputs_embeds is not None:
raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
elif input_ids is not None:
batch_size, seq_length = input_ids.shape
elif inputs_embeds is not None:
batch_size, seq_length, _ = inputs_embeds.shape
else:
raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
seq_length_with_past = seq_length
past_key_values_length = 0
if past_key_values is not None:
past_key_values_length = past_key_values[0][0].shape[2]
seq_length_with_past = seq_length_with_past + past_key_values_length
if position_ids is None:
device = input_ids.device if input_ids is not None else inputs_embeds.device
position_ids = torch.arange(
past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
)
position_ids = position_ids.unsqueeze(0)
if inputs_embeds is None:
inputs_embeds = self.language_model.get_input_embeddings()(input_ids)
if image_patches is not None and past_key_values is None:
patch_embeddings = self.vision_embed_tokens(image_patches.to(self.vision_embed_tokens.weight.dtype))
inputs_embeds = self.gather_continuous_embeddings(
word_embeddings=inputs_embeds,
continuous_embeddings=patch_embeddings,
image_patch_input_indices=image_patches_indices,
)
outputs = self.language_model(
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=past_key_values,
output_attentions=output_attentions,
use_cache=use_cache,
)
if not return_dict:
return tuple(v for v in outputs if v is not None)
return outputs
def prepare_inputs_for_generation(
self,
input_ids,
past_key_values=None,
attention_mask=None,
inputs_embeds=None,
image_patches=None,
image_patches_indices=None,
**kwargs,
):
if past_key_values:
input_ids = input_ids[:, -1:]
position_ids = kwargs.get("position_ids", None)
if attention_mask is not None and position_ids is None:
# create position_ids on the fly for batch generation
position_ids = attention_mask.long().cumsum(-1) - 1
position_ids.masked_fill_(attention_mask == 0, 1)
if past_key_values:
position_ids = position_ids[:, -1].unsqueeze(-1)
# if `inputs_embeds` are passed, we only want to use them in the 1st generation step
if inputs_embeds is not None and past_key_values is None:
model_inputs = {"inputs_embeds": inputs_embeds}
else:
model_inputs = {"input_ids": input_ids}
if image_patches_indices is not None:
model_inputs["image_patches_indices"] = image_patches_indices
model_inputs.update(
{
"position_ids": position_ids,
"past_key_values": past_key_values,
"use_cache": kwargs.get("use_cache"),
"attention_mask": attention_mask,
"image_patches_indices": image_patches_indices if past_key_values is None else None,
"image_patches": image_patches if past_key_values is None else None,
}
)
return model_inputs
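
As context for the forward and generation paths above, a minimal usage sketch (the `adept/fuyu-8b` checkpoint and the local file `bus.png` are illustrative assumptions, not taken from this diff):

from PIL import Image
from transformers import FuyuForCausalLM, FuyuProcessor

processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")

image = Image.open("bus.png")  # hypothetical local file
inputs = processor(text="Generate a coco-style caption.\n", images=image)

# `image_patches` and `image_patches_indices` are consumed on the first forward pass only;
# `prepare_inputs_for_generation` above drops them once `past_key_values` is populated.
outputs = model.generate(**inputs, max_new_tokens=10)
print(processor.batch_decode(outputs[:, -10:], skip_special_tokens=True))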

View File

@ -1,562 +0,0 @@
import re
from typing import Any, Iterable, List, Optional, Tuple, Union
import numpy as np
from ...image_utils import (
ChannelDimension,
get_image_size,
infer_channel_dimension_format,
is_scaled_image,
to_numpy_array,
)
from ...processing_utils import ProcessorMixin
from ...utils import is_torch_available, is_vision_available, logging
if is_torch_available() and is_vision_available():
from .image_processing_fuyu import FuyuImageProcessor
logger = logging.get_logger(__name__)
if is_vision_available():
import PIL
if is_torch_available():
import torch
BBOX_OPEN_STRING = "<0x00>" # <bbox>
BBOX_CLOSE_STRING = "<0x01>" # </bbox>
POINT_OPEN_STRING = "<0x02>" # <point>
POINT_CLOSE_STRING = "<0x03>" # </point>
TEXT_REPR_BBOX_OPEN = "<box>"
TEXT_REPR_BBOX_CLOSE = "</box>"
TEXT_REPR_POINT_OPEN = "<point>"
TEXT_REPR_POINT_CLOSE = "</point>"
TOKEN_BBOX_OPEN_STRING = BBOX_OPEN_STRING  # <0x00> <bbox>
TOKEN_BBOX_CLOSE_STRING = BBOX_CLOSE_STRING  # <0x01> </bbox>
TOKEN_POINT_OPEN_STRING = POINT_OPEN_STRING  # <0x02> <point>
TOKEN_POINT_CLOSE_STRING = POINT_CLOSE_STRING  # <0x03> </point>
BEGINNING_OF_ANSWER_STRING = "<0x04>" # <boa>
def full_unpacked_stream_to_tensor(
all_bi_tokens_to_place: List[int],
full_unpacked_stream: List["torch.Tensor"],
fill_value: int,
batch_size: int,
new_seq_len: int,
offset: int,
) -> "torch.Tensor":
"""Takes an unpacked stream of tokens (i.e. a list of tensors, one for each item in the batch) and does
the required padding to create a single tensor for the batch of shape batch_size x new_seq_len.
"""
assert len(all_bi_tokens_to_place) == batch_size
assert len(full_unpacked_stream) == batch_size
# Create padded tensors for the full batch.
new_padded_tensor = torch.full(
[batch_size, new_seq_len],
fill_value=fill_value,
dtype=full_unpacked_stream[0].dtype,
device=full_unpacked_stream[0].device,
)
# Place each batch entry into the batch tensor.
for bi in range(batch_size):
tokens_to_place = all_bi_tokens_to_place[bi]
new_padded_tensor[bi, :tokens_to_place] = full_unpacked_stream[bi][offset : tokens_to_place + offset]
return new_padded_tensor
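

# Illustrative example of `full_unpacked_stream_to_tensor`, packing two variable-length streams
# into a padded [batch_size=2, new_seq_len=4] tensor (values are made up):
#
#   >>> streams = [torch.tensor([1, 2, 3]), torch.tensor([4, 5])]
#   >>> full_unpacked_stream_to_tensor(
#   ...     all_bi_tokens_to_place=[3, 2],
#   ...     full_unpacked_stream=streams,
#   ...     fill_value=0,
#   ...     batch_size=2,
#   ...     new_seq_len=4,
#   ...     offset=0,
#   ... )
#   tensor([[1, 2, 3, 0],
#           [4, 5, 0, 0]])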
def construct_full_unpacked_stream(
num_real_text_tokens: Union[List[List[int]], "torch.Tensor"],
input_stream: "torch.Tensor",
image_tokens: List[List["torch.Tensor"]],
batch_size: int,
num_sub_sequences: int,
) -> List["torch.Tensor"]:
"""Takes an input_stream tensor of shape B x S x ?. For each subsequence, adds any required
padding to account for images and then unpacks the subsequences to create a single sequence per item in the batch.
Returns a list of tensors, one for each item in the batch."""
all_bi_stream = []
for bi in range(batch_size):
all_si_stream = []
# First, construct full token stream (including image placeholder tokens) and loss mask for each subsequence
# and append to lists. We use lists rather than tensors because each subsequence is variable-sized.
for si in range(num_sub_sequences):
image_adjustment = image_tokens[bi][si]
si_stream = torch.cat([image_adjustment, input_stream[bi, si]], dim=0)
num_real_tokens = image_adjustment.shape[0] + num_real_text_tokens[bi][si]
all_si_stream.append(si_stream[:num_real_tokens])
# Combine all subsequences for this batch entry. Still using a list because each batch entry is variable-sized.
all_bi_stream.append(torch.cat(all_si_stream, dim=0))
return all_bi_stream
def _replace_string_repr_with_token_tags(prompt: str) -> str:
prompt = prompt.replace(TEXT_REPR_POINT_OPEN, TOKEN_POINT_OPEN_STRING)
prompt = prompt.replace(TEXT_REPR_POINT_CLOSE, TOKEN_POINT_CLOSE_STRING)
prompt = prompt.replace(TEXT_REPR_BBOX_OPEN, TOKEN_BBOX_OPEN_STRING)
prompt = prompt.replace(TEXT_REPR_BBOX_CLOSE, TOKEN_BBOX_CLOSE_STRING)
return prompt
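

# Illustrative example of the tag replacement above (token strings per the constants defined earlier):
#
#   >>> _replace_string_repr_with_token_tags("Click <point>10, 20</point> inside <box>1, 2, 3, 4</box>")
#   'Click <0x02>10, 20<0x03> inside <0x00>1, 2, 3, 4<0x01>'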
def _segment_prompt_into_text_token_conversions(prompt: str) -> List:
"""
Given a string prompt, converts the prompt into a list of TextTokenConversions.
"""
# Wherever, we notice the [TOKEN_OPEN_STRING, TOKEN_CLOSE_STRING], we split the prompt
prompt_text_list: List = []
regex_pattern = re.compile(
f"({TOKEN_BBOX_OPEN_STRING}|{TOKEN_BBOX_CLOSE_STRING}|{TOKEN_POINT_OPEN_STRING}|{TOKEN_POINT_CLOSE_STRING})"
)
# Split by the regex pattern
prompt_split = regex_pattern.split(prompt)
for i, elem in enumerate(prompt_split):
if len(elem) == 0 or elem in [
TOKEN_BBOX_OPEN_STRING,
TOKEN_BBOX_CLOSE_STRING,
TOKEN_POINT_OPEN_STRING,
TOKEN_POINT_CLOSE_STRING,
]:
continue
prompt_text_list.append(
(elem, i > 1 and prompt_split[i - 1] in [TOKEN_BBOX_OPEN_STRING, TOKEN_POINT_OPEN_STRING])
)
return prompt_text_list
def _transform_coordinates_and_tokenize(prompt: str, transformed_image, tokenizer) -> List[int]:
"""
This function transforms the prompt in the following fashion:
- <box> <point> and </box> </point> to their respective token mappings
- extract the coordinates from the tag
- transform the coordinates into the transformed image space
- return the prompt tokens with the transformed coordinates and new tags
    Bounding boxes and points MUST be in the following format: <box>y1, x1, y2, x2</box> or <point>x, y</point>.
    The spaces and punctuation shown above are NOT optional.
    """
    # Tokenize the prompt normally; when we hit a point/box open tag, tokenize the coordinates inside the tag with
    # `_transform_within_tags`, then resume normal tokenization after the matching close tag.
    # First, replace the point and box text tags with their respective token strings.
prompt = _replace_string_repr_with_token_tags(prompt)
# Tokenize the prompt
# Convert prompt into a list split
prompt_text_list = _segment_prompt_into_text_token_conversions(prompt)
transformed_prompt_tokens: List[int] = []
for elem in prompt_text_list:
if elem[1]:
# This is a location, we need to tokenize it
within_tag_tokenized = _transform_within_tags(elem[0], transformed_image, tokenizer)
# Surround the text with the open and close tags
transformed_prompt_tokens.extend(within_tag_tokenized)
else:
transformed_prompt_tokens.extend(tokenizer(elem[0], add_special_tokens=False).input_ids)
return transformed_prompt_tokens
def _transform_within_tags(text: str, transformed_image, tokenizer) -> List[int]:
"""
Given a bounding box of the fashion <box>1, 2, 3, 4</box> | <point>1, 2</point> This function is responsible for
converting 1, 2, 3, 4 into tokens of 1 2 3 4 without any commas.
"""
# Convert the text into a list of strings.
num_int_strs = text.split(",")
if len(num_int_strs) == 2:
        # Two coordinates -> a point; use the point open/close tokens.
token_space_open_string = tokenizer.vocab[TOKEN_POINT_OPEN_STRING]
token_space_close_string = tokenizer.vocab[TOKEN_POINT_CLOSE_STRING]
else:
token_space_open_string = tokenizer.vocab[TOKEN_BBOX_OPEN_STRING]
token_space_close_string = tokenizer.vocab[TOKEN_BBOX_CLOSE_STRING]
    # Strip whitespace and convert each coordinate string to a float.
num_ints = [float(num.strip()) for num in num_int_strs]
    # Scale the coordinates to the transformed image size.
if len(num_ints) == 2:
num_ints_translated = scale_point_to_transformed_image(
x=num_ints[0], y=num_ints[1], transformed_image=transformed_image
)
elif len(num_ints) == 4:
num_ints_translated = scale_bbox_to_transformed_image(
top=num_ints[0],
left=num_ints[1],
bottom=num_ints[2],
right=num_ints[3],
transformed_image=transformed_image,
)
else:
raise ValueError(f"Invalid number of ints: {len(num_ints)}")
    # Look up the token id of each scaled coordinate and wrap them with the open/close tokens.
tokens = [tokenizer.vocab[str(num)] for num in num_ints_translated]
return [token_space_open_string] + tokens + [token_space_close_string]
def _tokenize_prompts_with_image_and_batch(
tokenizer,
prompts: List[List[str]],
transformed_images: Optional[List[List["torch.Tensor"]]],
max_tokens_to_generate: int,
max_position_embeddings: int,
add_BOS: bool, # Same issue with types as above
add_beginning_of_answer_token: bool,
) -> Tuple["torch.Tensor", "torch.Tensor"]:
"""
Given a set of prompts and number of tokens to generate:
- tokenize prompts
- set the sequence length to be the max of length of prompts plus the number of tokens we would like to generate
- pad all the sequences to this length so we can convert them into a 3D tensor.
"""
    # If this is not tool use (i.e. transformed images are provided), transform the coordinates while tokenizing.
if transformed_images is not None:
transformed_prompt_tokens = []
for prompt_seq, transformed_image_seq in zip(prompts, transformed_images):
transformed_prompt_tokens.append(
[
_transform_coordinates_and_tokenize(prompt, transformed_image, tokenizer)
for prompt, transformed_image in zip(prompt_seq, transformed_image_seq)
]
)
else:
transformed_prompt_tokens = [[tokenizer.tokenize(prompt) for prompt in prompt_seq] for prompt_seq in prompts]
prompts_tokens = transformed_prompt_tokens
if add_BOS:
bos_token = tokenizer.vocab["<s>"]
else:
bos_token = tokenizer.vocab["|ENDOFTEXT|"]
prompts_tokens = [[[bos_token] + x for x in prompt_seq] for prompt_seq in prompts_tokens]
if add_beginning_of_answer_token:
boa = tokenizer.vocab[BEGINNING_OF_ANSWER_STRING]
        # Only add the beginning-of-answer token to the last subsequence since that is what will be completed.
for token_seq in prompts_tokens:
token_seq[-1].append(boa)
    # Now we have a list of lists of tokens, and each list has a different size. We want to extend
    # these lists to:
    # - incorporate the tokens that need to be generated
    # - make all the sequences equal length.
# Get the prompts length.
prompts_length = [[len(x) for x in prompts_tokens_seq] for prompts_tokens_seq in prompts_tokens]
# Get the max prompts length.
max_prompt_len: int = np.max(prompts_length)
    # Number of tokens in each sample of the batch.
samples_length = min(max_prompt_len + max_tokens_to_generate, max_position_embeddings)
    if max_prompt_len + max_tokens_to_generate > max_position_embeddings:
        logger.warning(
            f"Max subsequence prompt length of {max_prompt_len} + max tokens to generate {max_tokens_to_generate} "
            f"exceeds context length of {max_position_embeddings}. Will generate as many tokens as possible."
        )
# Now update the list of list to be of the same size: samples_length.
for prompt_tokens_seq, prompts_length_seq in zip(prompts_tokens, prompts_length):
for prompt_tokens, prompt_length in zip(prompt_tokens_seq, prompts_length_seq):
if len(prompt_tokens) > samples_length:
raise ValueError("Length of subsequence prompt exceeds sequence length.")
padding_size = samples_length - prompt_length
prompt_tokens.extend([tokenizer.vocab["|ENDOFTEXT|"]] * padding_size)
# Now we are in a structured format, we can convert to tensors.
prompts_tokens_tensor = torch.tensor(prompts_tokens, dtype=torch.int64)
prompts_length_tensor = torch.tensor(prompts_length, dtype=torch.int64)
return prompts_tokens_tensor, prompts_length_tensor
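

# Worked example of the length computation above (hypothetical numbers): with a longest subsequence of
# 100 prompt tokens, max_tokens_to_generate=10 and max_position_embeddings=16384, every subsequence is
# right-padded with |ENDOFTEXT| up to samples_length = min(100 + 10, 16384) = 110 tokens, so the returned
# prompts_tokens_tensor has shape [batch_size, num_subsequences, 110].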
def original_to_transformed_h_coords(transformed_image, original_coords):
    # `transformed_image` is expected to expose the crop/scale/padding attributes and helpers used below.
    # apply crop
    cropped_coords = (
        transformed_image._clamp_coords(
            original_coords, min_value=transformed_image.crop_top, max_value=transformed_image.crop_bottom
        )
        - transformed_image.crop_top
    )
    # apply scale
    scaled_coords = transformed_image._scale_coords(
        cropped_coords, scale=transformed_image.scaled_h / transformed_image.original_h
    )
    # apply pad
    return scaled_coords + transformed_image.padding_top


def original_to_transformed_w_coords(transformed_image, original_coords):
    # apply crop
    cropped_coords = (
        transformed_image._clamp_coords(
            original_coords, min_value=transformed_image.crop_left, max_value=transformed_image.crop_right
        )
        - transformed_image.crop_left
    )
    # apply scale
    scaled_coords = transformed_image._scale_coords(
        cropped_coords, scale=transformed_image.scaled_w / transformed_image.original_w
    )
    # apply pad
    return scaled_coords + transformed_image.padding_left


def scale_point_to_transformed_image(x: float, y: float, transformed_image) -> List[int]:
    x_scaled = original_to_transformed_w_coords(transformed_image, np.array([x / 2]))[0]
    y_scaled = original_to_transformed_h_coords(transformed_image, np.array([y / 2]))[0]
    return [x_scaled, y_scaled]


def scale_bbox_to_transformed_image(top: float, left: float, bottom: float, right: float, transformed_image) -> List[int]:
    top_scaled = original_to_transformed_w_coords(transformed_image, np.array([top / 2]))[0]
    left_scaled = original_to_transformed_h_coords(transformed_image, np.array([left / 2]))[0]
    bottom_scaled = original_to_transformed_w_coords(transformed_image, np.array([bottom / 2]))[0]
    right_scaled = original_to_transformed_h_coords(transformed_image, np.array([right / 2]))[0]
    return [top_scaled, left_scaled, bottom_scaled, right_scaled]
# Copied from transformers.models.detr.image_processing_detr.max_across_indices
def max_across_indices(values: Iterable[Any]) -> List[Any]:
"""
Return the maximum value across all indices of an iterable of values.
"""
return [max(values_i) for values_i in zip(*values)]
# Copied from transformers.models.detr.image_processing_detr.get_max_height_width
def get_max_height_width(
images: List[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> List[int]:
"""
Get the maximum height and width across all images in a batch.
"""
if input_data_format is None:
input_data_format = infer_channel_dimension_format(images[0])
if input_data_format == ChannelDimension.FIRST:
_, max_height, max_width = max_across_indices([img.shape for img in images])
elif input_data_format == ChannelDimension.LAST:
max_height, max_width, _ = max_across_indices([img.shape for img in images])
else:
raise ValueError(f"Invalid channel dimension format: {input_data_format}")
return (max_height, max_width)
# Copied from transformers.models.detr.image_processing_detr.make_pixel_mask
def make_pixel_mask(
image: np.ndarray, output_size: Tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
) -> np.ndarray:
"""
Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
Args:
image (`np.ndarray`):
Image to make the pixel mask for.
output_size (`Tuple[int, int]`):
Output size of the mask.
"""
input_height, input_width = get_image_size(image, channel_dim=input_data_format)
mask = np.zeros(output_size, dtype=np.int64)
mask[:input_height, :input_width] = 1
return mask
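

# Illustrative example of `make_pixel_mask`: a (height=2, width=3, channels=3) image padded into a (4, 4) canvas.
#
#   >>> make_pixel_mask(np.zeros((2, 3, 3)), output_size=(4, 4))
#   array([[1, 1, 1, 0],
#          [1, 1, 1, 0],
#          [0, 0, 0, 0],
#          [0, 0, 0, 0]])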
class FuyuProcessor(ProcessorMixin):
r"""
Constructs a Fuyu processor which wraps a Fuyu image processor and a Llama tokenizer into a single processor.
[`FuyuProcessor`] offers all the functionalities of [`FuyuImageProcessor`] and [`LlamaTokenizerFast`]. See the
[`~FuyuProcessor.__call__`] and [`~FuyuProcessor.decode`] for more information.
Args:
image_processor ([`FuyuImageProcessor`]):
The image processor is a required input.
tokenizer ([`LlamaTokenizerFast`]):
The tokenizer is a required input.
"""
attributes = ["image_processor", "tokenizer"]
image_processor_class = "FuyuImageProcessor"
tokenizer_class = "AutoTokenizer"
    def __init__(self, image_processor, tokenizer):
        super().__init__(image_processor=image_processor, tokenizer=tokenizer)
        self.image_processor = image_processor
        self.tokenizer = tokenizer
        self.max_tokens_to_generate = 10
        self.max_position_embeddings = 16384  # TODO Can't derive this from model files: where to set it?
def _process_images(self, images):
"""Utility function to preprocess the images and extract necessary information about original formats."""
batch_images = []
image_unpadded_heights = []
image_unpadded_widths = []
for image in images:
image = to_numpy_array(image)
if not is_scaled_image(image):
image = image / 255.0
channel_dimension = infer_channel_dimension_format(image, 3)
if channel_dimension == ChannelDimension.FIRST:
width_index = 2
height_index = 1
elif channel_dimension == ChannelDimension.LAST:
width_index = 1
height_index = 0
image_unpadded_widths.append([image.shape[width_index]])
image_unpadded_heights.append([image.shape[height_index]])
            # Reproduce the Adept padding sampler.
padded_image = self.image_processor.apply_transformation(image)
tensor_img = torch.Tensor(padded_image).permute(2, 0, 1)
batch_images.append([tensor_img])
return batch_images, torch.Tensor(image_unpadded_heights), torch.Tensor(image_unpadded_widths)
def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
"""
        Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to
        encode the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        FuyuImageProcessor's [`~FuyuImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.
Args:
text (`str`, `List[str]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
images (`PIL.Image.Image`, `List[PIL.Image.Image]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
number of channels, H and W are image height and width.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
- **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
- **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
`return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
if text is None and images is None:
raise ValueError("You have to specify either text or images. Both cannot be none.")
if text is not None and images is not None:
if isinstance(text, str):
prompts = [[text]]
elif isinstance(text, list):
prompts = [[text_seq] for text_seq in text]
batch_images = []
if isinstance(images, PIL.Image.Image):
images = [images]
if isinstance(images, list):
batch_images, image_unpadded_heights, image_unpadded_widths = self._process_images(images)
# image_unpadded_heights = image_unpadded_heights.unsqueeze(0)
# image_unpadded_widths = image_unpadded_widths.unsqueeze(0)
else:
raise ValueError("images must be a list of ndarrays or PIL Images to be processed.")
            # Note: the original Adept code handles image_unpadded_h and w, but that handling does not seem to hold
            # when a batch contains subsequences of several different sizes. The current implementation reflects
            # that limitation and should be documented.
self.subsequence_length = 1 # Each batch contains only one sequence.
self.batch_size = len(batch_images)
# FIXME max_tokens_to_generate is embedded into this processor's call.
prompt_tokens, prompts_length = _tokenize_prompts_with_image_and_batch(
tokenizer=self.tokenizer,
prompts=prompts,
transformed_images=batch_images,
max_tokens_to_generate=self.max_tokens_to_generate,
max_position_embeddings=self.max_position_embeddings,
add_BOS=True,
add_beginning_of_answer_token=True,
)
            # This is 1 if there is an image per subsequence, else 0. [batch, 1, presence]
            # The remainder of the current image processing logic assumes subsequence_size = 1, which is fine here
            # because the model cannot handle more than one subsequence. However, an image could be absent, and
            # image presence should really be inferred from the user's batch input; for now this code assumes
            # images are always present.
image_present = torch.ones(self.batch_size, 1, 1)
image_placeholder_id = self.tokenizer("|SPEAKER|", add_special_tokens=False)["input_ids"][1]
image_newline_id = self.tokenizer("|NEWLINE|", add_special_tokens=False)["input_ids"][1]
tensor_batch_images = torch.stack([img[0] for img in batch_images]).unsqueeze(1)
model_image_input = self.image_processor.process_images_for_model_input(
image_input=tensor_batch_images,
image_present=image_present,
image_unpadded_h=image_unpadded_heights,
image_unpadded_w=image_unpadded_widths,
image_patch_dim_h=30,
image_patch_dim_w=30,
image_placeholder_id=image_placeholder_id,
image_newline_id=image_newline_id,
variable_sized=True,
)
image_padded_unpacked_tokens = construct_full_unpacked_stream(
num_real_text_tokens=prompts_length,
input_stream=prompt_tokens,
image_tokens=model_image_input["image_input_ids"],
batch_size=self.batch_size,
num_sub_sequences=self.subsequence_length,
)
# Construct inputs for image patch indices.
unpacked_image_patch_indices_per_batch = construct_full_unpacked_stream(
num_real_text_tokens=prompts_length,
input_stream=torch.full_like(prompt_tokens, -1),
image_tokens=model_image_input["image_patch_indices_per_batch"],
batch_size=self.batch_size,
num_sub_sequences=self.subsequence_length,
)
max_prompt_length = max(x.shape[-1] for x in image_padded_unpacked_tokens)
max_seq_len_batch = min(max_prompt_length + self.max_tokens_to_generate, self.max_position_embeddings)
all_bi_tokens_to_place = []
for bi in range(self.batch_size):
tokens_to_place = min(max_seq_len_batch, max(0, image_padded_unpacked_tokens[bi].shape[0]))
all_bi_tokens_to_place.append(tokens_to_place)
# Use same packing logic for the image patch indices.
image_patch_input_indices = full_unpacked_stream_to_tensor(
all_bi_tokens_to_place=all_bi_tokens_to_place,
full_unpacked_stream=unpacked_image_patch_indices_per_batch,
fill_value=-1,
batch_size=self.batch_size,
new_seq_len=max_seq_len_batch,
offset=0,
)
image_patches_tensor = torch.stack([img[0] for img in model_image_input["image_patches"]]).unsqueeze(1)
return {
"input_ids": image_padded_unpacked_tokens[0].unsqueeze(0),
"image_patches": image_patches_tensor[0][0].unsqueeze(0),
"image_patches_indices": image_patch_input_indices,
}
def batch_decode(self, *args, **kwargs):
"""
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
refer to the docstring of this method for more information.
"""
return self.tokenizer.batch_decode(*args, **kwargs)
def decode(self, *args, **kwargs):
"""
        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
the docstring of this method for more information.
"""
return self.tokenizer.decode(*args, **kwargs)
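
As a rough usage sketch of the processor above (the checkpoint name and image path are illustrative assumptions):

from PIL import Image
from transformers import FuyuProcessor

processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
image = Image.open("bus.png")  # hypothetical local file

model_inputs = processor(text="What is in this image?\n", images=image)
print(sorted(model_inputs.keys()))
# ['image_patches', 'image_patches_indices', 'input_ids']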

View File

@ -363,10 +363,4 @@ class GPT2Tokenizer(PreTrainedTokenizer):
"""
A simple chat template that ignores role information and just concatenates messages with EOS tokens.
"""
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"

View File

@ -181,10 +181,4 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):
"""
A simple chat template that ignores role information and just concatenates messages with EOS tokens.
"""
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"

View File

@ -135,10 +135,4 @@ class GPTNeoXTokenizerFast(PreTrainedTokenizerFast):
"""
A simple chat template that ignores role information and just concatenates messages with EOS tokens.
"""
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return "{% for message in messages %}" "{{ message.content }}{{ eos_token }}" "{% endfor %}"

View File

@ -180,12 +180,6 @@ class GPTNeoXJapaneseTokenizer(PreTrainedTokenizer):
"""
A simple chat template that just adds BOS/EOS tokens around messages while discarding role information.
"""
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return (
"{% for message in messages %}"
"{{ bos_token + eos_token + message.content + eos_token }}"

View File

@ -321,12 +321,6 @@ class GPTSw3Tokenizer(PreTrainedTokenizer):
This chat template formats messages like an instant messenger chat log, with "User:" and "Bot:" strings
preceding messages. BOS tokens are added between all messages.
"""
logger.warning_once(
"\nNo chat template is defined for this tokenizer - using the default template "
f"for the {self.__class__.__name__} class. If the default is not appropriate for "
"your model, please set `tokenizer.chat_template` to an appropriate template. "
"See https://huggingface.co/docs/transformers/main/chat_templating for more information.\n"
)
return (
"{{ eos_token }}{{ bos_token }}"
"{% for message in messages %}"

Some files were not shown because too many files have changed in this diff.