Mirror of https://github.com/huggingface/transformers.git (synced 2025-11-06 13:34:37 +08:00)

Compare commits: reverse_te ... trigger_de (102 commits)
| SHA1 | Author | Date | |
|---|---|---|---|
| 62a8568d00 | |||
| 1d06379331 | |||
| 6a62a6d1b5 | |||
| f73f5e62e2 | |||
| e447185b1f | |||
| 186b8dc190 | |||
| 8814043c8c | |||
| 223855314f | |||
| 9f365fe0ac | |||
| 5779bac4c4 | |||
| 940a6bd343 | |||
| 3d99f1746e | |||
| a308d28d39 | |||
| 4c6e0c9252 | |||
| 1c5918d910 | |||
| d9989e0b9a | |||
| fe35073319 | |||
| e288616606 | |||
| 450b9cbfac | |||
| 6432ad8bb5 | |||
| dd267fca72 | |||
| 30c76d5b28 | |||
| 2112027d0c | |||
| b29c24ff1e | |||
| f0b3ef9e2e | |||
| 9643069465 | |||
| f0e640adfa | |||
| 05863817d6 | |||
| 65753d6065 | |||
| b0f0c61899 | |||
| e50bf61dec | |||
| c42b3223db | |||
| d9f733625c | |||
| 1fb575fcf0 | |||
| 343c8cb86f | |||
| 5ba85de7a4 | |||
| 049682a5a6 | |||
| 644d5287b2 | |||
| b03dc0a87e | |||
| 4b14aa1bcd | |||
| 688eeac81e | |||
| a65a6ce7fe | |||
| e7c3fa7f57 | |||
| 96f67c068b | |||
| eef6b0ba42 | |||
| c14ccbcd64 | |||
| 7a08a772cc | |||
| c31a6ff474 | |||
| 104599d7a8 | |||
| 51e395d13e | |||
| eb6a734995 | |||
| 84b17e03f1 | |||
| 681fc43713 | |||
| 93352e81f5 | |||
| b644178ed4 | |||
| 73d65e637b | |||
| 5077bc034f | |||
| 21d5025826 | |||
| 32590b5ecb | |||
| f701b98e4a | |||
| a4122813d1 | |||
| 24bdc94da5 | |||
| ca541bd4f4 | |||
| 816f442496 | |||
| e46e3bc173 | |||
| 6604764007 | |||
| e95ea479ee | |||
| 0437d6cd03 | |||
| 5a5b590d06 | |||
| b54109c746 | |||
| 6ba31a8a94 | |||
| 7a06d07e14 | |||
| c1c7e89620 | |||
| f51ac9e059 | |||
| 1d2c29f0b3 | |||
| 9470c00042 | |||
| 7f5088503f | |||
| f2846ad2b7 | |||
| b57c7bce21 | |||
| fce1fcfe71 | |||
| aa3e35ac67 | |||
| 6d2b203339 | |||
| 3f06f95ebe | |||
| 3a10c6192b | |||
| bd5dc10fd2 | |||
| cc7d8b87e1 | |||
| 98bad9c6d6 | |||
| 9ba021ea75 | |||
| d087165db0 | |||
| 9d6998c759 | |||
| 554ed5d1e0 | |||
| 8c33cf4eec | |||
| 67acb0b123 | |||
| 0f49deacbf | |||
| d00f1ca860 | |||
| 65442718c4 | |||
| d314ce70bf | |||
| 5ee9e786d1 | |||
| 4de1bdbf63 | |||
| 293e6271c6 | |||
| 23874f5948 | |||
| dd4216b766 |
.github/ISSUE_TEMPLATE/bug-report.yml (2 changes)

@@ -55,7 +55,7 @@ body:
        - deepspeed: HF Trainer/Accelerate: @muellerzr
        - ray/raytune: @richardliaw, @amogkam
        - Big Model Inference: @SunMarc
        - quantization (bitsandbytes, autogpt): @SunMarc
        - quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber

        Documentation: @stevhliu
.github/PULL_REQUEST_TEMPLATE.md (2 changes)

@@ -59,7 +59,7 @@ Integrations:
- deepspeed: HF Trainer/Accelerate: @muellerzr
- ray/raytune: @richardliaw, @amogkam
- Big Model Inference: @SunMarc
- quantization (bitsandbytes, autogpt): @SunMarc
- quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber

Documentation: @stevhliu
.github/workflows/benchmark.yml (12 changes)

@@ -18,21 +18,17 @@ jobs:
    name: Benchmark
    runs-on:
      group: aws-g5-4xlarge-cache
    if: |
      (github.event_name == 'pull_request' && contains( github.event.pull_request.labels.*.name, 'run-benchmark') )||
      (github.event_name == 'push' && github.ref == 'refs/heads/main')
    container:
      image: huggingface/transformers-pytorch-gpu
      options: --gpus all --privileged --ipc host
    steps:
      - name: Get repo
        if: github.event_name == 'pull_request'
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha }}

      - name: Get repo
        if: github.event_name == 'push'
        uses: actions/checkout@v4
        with:
          ref: ${{ github.sha }}
          ref: ${{ github.event.pull_request.head.sha || github.sha }}

      - name: Install libpq-dev & psql
        run: |
.github/workflows/check_failed_model_tests.yml (new file, 129 lines)

@@ -0,0 +1,129 @@
name: Process failed tests

on:
  workflow_call:
    inputs:
      docker:
        required: true
        type: string
      start_sha:
        required: true
        type: string


env:
  HF_HOME: /mnt/cache
  TRANSFORMERS_IS_CI: yes
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
  RUN_SLOW: yes
  # For gated repositories, we still need to agree to share information on the Hub repo. page in order to get access.
  # This token is created under the bot `hf-transformers-bot`.
  HF_HUB_READ_TOKEN: ${{ secrets.HF_HUB_READ_TOKEN }}
  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
  TF_FORCE_GPU_ALLOW_GROWTH: true
  RUN_PT_TF_CROSS_TESTS: 1
  CUDA_VISIBLE_DEVICES: 0,1


jobs:
  run_models_gpu:
    name: " "
    runs-on:
      group: aws-g4dn-2xlarge-cache
    container:
      image: ${{ inputs.docker }}
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
      - uses: actions/download-artifact@v4
        with:
          name: ci_results_run_models_gpu
          path: /transformers/ci_results_run_models_gpu

      - name: Update clone
        working-directory: /transformers
        run: git fetch && git checkout ${{ github.sha }}

      - name: Get target commit
        working-directory: /transformers/utils
        run: |
          echo "END_SHA=$(TOKEN=${{ secrets.ACCESS_REPO_INFO_TOKEN }} python3 -c 'import os; from get_previous_daily_ci import get_last_daily_ci_run_commit; commit=get_last_daily_ci_run_commit(token=os.environ["TOKEN"]); print(commit)')" >> $GITHUB_ENV

      - name: Checkout to `start_sha`
        working-directory: /transformers
        run: git fetch && git checkout ${{ inputs.start_sha }}

      - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
        working-directory: /transformers
        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e .

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

      - name: Environment
        working-directory: /transformers
        run: |
          python3 utils/print_env.py

      - name: Show installed libraries and their versions
        working-directory: /transformers
        run: pip freeze

      - name: Check failed tests
        working-directory: /transformers
        run: python3 utils/check_bad_commit.py --start_commit ${{ inputs.start_sha }} --end_commit ${{ env.END_SHA }} --file ci_results_run_models_gpu/new_model_failures.json --output_file new_model_failures_with_bad_commit.json

      - name: Show results
        working-directory: /transformers
        run: |
          ls -l new_model_failures_with_bad_commit.json
          cat new_model_failures_with_bad_commit.json

      - name: Checkout back
        working-directory: /transformers
        run: |
          git checkout ${{ inputs.start_sha }}

      - name: Process report
        shell: bash
        working-directory: /transformers
        env:
          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
        run: |
          python3 utils/process_bad_commit_report.py

      - name: Process report
        shell: bash
        working-directory: /transformers
        env:
          TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN: ${{ secrets.TRANSFORMERS_CI_RESULTS_UPLOAD_TOKEN }}
        run: |
          {
            echo 'REPORT_TEXT<<EOF'
            python3 utils/process_bad_commit_report.py
            echo EOF
          } >> "$GITHUB_ENV"

      - name: Send processed report
        if: ${{ !endsWith(env.REPORT_TEXT, '{}') }}
        uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
        with:
          # Slack channel id, channel name, or user id to post message.
          # See also: https://api.slack.com/methods/chat.postMessage#channels
          channel-id: '#transformers-ci-feedback-tests'
          # For posting a rich message using Block Kit
          payload: |
            {
              "blocks": [
                {
                  "type": "section",
                  "text": {
                    "type": "mrkdwn",
                    "text": "${{ env.REPORT_TEXT }}"
                  }
                }
              ]
            }
        env:
          SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
.github/workflows/self-scheduled-caller.yml (58 changes)

@@ -7,7 +7,7 @@ on:
    - cron: "17 2 * * *"
  push:
    branches:
      - run_scheduled_ci*
      - trigger_debug

jobs:
  model-ci:

@@ -20,59 +20,3 @@ jobs:
      docker: huggingface/transformers-all-latest-gpu
      ci_event: Daily CI
    secrets: inherit

  torch-pipeline:
    name: Torch pipeline CI
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_pipelines_torch_gpu
      slack_report_channel: "#transformers-ci-daily-pipeline-torch"
      runner: daily-ci
      docker: huggingface/transformers-pytorch-gpu
      ci_event: Daily CI
    secrets: inherit

  tf-pipeline:
    name: TF pipeline CI
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_pipelines_tf_gpu
      slack_report_channel: "#transformers-ci-daily-pipeline-tf"
      runner: daily-ci
      docker: huggingface/transformers-tensorflow-gpu
      ci_event: Daily CI
    secrets: inherit

  example-ci:
    name: Example CI
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_examples_gpu
      slack_report_channel: "#transformers-ci-daily-examples"
      runner: daily-ci
      docker: huggingface/transformers-all-latest-gpu
      ci_event: Daily CI
    secrets: inherit

  deepspeed-ci:
    name: DeepSpeed CI
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_torch_cuda_extensions_gpu
      slack_report_channel: "#transformers-ci-daily-deepspeed"
      runner: daily-ci
      docker: huggingface/transformers-pytorch-deepspeed-latest-gpu
      ci_event: Daily CI
      working-directory-prefix: /workspace
    secrets: inherit

  quantization-ci:
    name: Quantization CI
    uses: ./.github/workflows/self-scheduled.yml
    with:
      job: run_quantization_torch_gpu
      slack_report_channel: "#transformers-ci-daily-quantization"
      runner: daily-ci
      docker: huggingface/transformers-quantization-latest-gpu
      ci_event: Daily CI
    secrets: inherit
.github/workflows/self-scheduled.yml (10 changes)

@@ -562,3 +562,13 @@ jobs:
      ci_event: ${{ inputs.ci_event }}

    secrets: inherit

  check_new_model_failures:
    if: ${{ always() && inputs.ci_event == 'Daily CI' && inputs.job == 'run_models_gpu' && needs.send_results.result == 'success' }}
    name: Check new model failures
    needs: send_results
    uses: ./.github/workflows/check_failed_model_tests.yml
    with:
      docker: ${{ inputs.docker }}
      start_sha: ${{ github.sha }}
    secrets: inherit
@@ -132,7 +132,7 @@ You will need basic `git` proficiency to contribute to
manual. Type `git --help` in a shell and enjoy! If you prefer books, [Pro
Git](https://git-scm.com/book/en/v2) is a very good reference.

You'll need **[Python 3.8](https://github.com/huggingface/transformers/blob/main/setup.py#L449)** or above to contribute to 🤗 Transformers. Follow the steps below to start contributing:
You'll need **[Python 3.9](https://github.com/huggingface/transformers/blob/main/setup.py#L449)** or above to contribute to 🤗 Transformers. Follow the steps below to start contributing:

1. Fork the [repository](https://github.com/huggingface/transformers) by
clicking on the **[Fork](https://github.com/huggingface/transformers/fork)** button on the repository's page. This creates a copy of the code
@@ -128,10 +128,10 @@ incredible projects built in the vicinity of transformers.

If you own or use a project that you believe should be part of the list, please open a PR to add it!

## If you are looking for custom support from the Hugging Face team
## Serious about AI in your organisation? Build faster with the Hugging Face Enterprise Hub.

<a target="_blank" href="https://huggingface.co/support">
<img alt="HuggingFace Expert Acceleration Program" src="https://cdn-media.huggingface.co/marketing/transformers/new-support-improved.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
<a target="_blank" href="https://huggingface.co/enterprise">
<img alt="Hugging Face Enterprise Hub" src="https://github.com/user-attachments/assets/247fb16d-d251-4583-96c4-d3d76dda4925">
</a><br>

## Quick tour
@@ -249,7 +249,7 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta

### With pip

This repository is tested on Python 3.8+, Flax 0.4.1+, PyTorch 1.11+, and TensorFlow 2.6+.
This repository is tested on Python 3.9+, Flax 0.4.1+, PyTorch 1.11+, and TensorFlow 2.6+.

You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).
@@ -1,4 +1,4 @@
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04
LABEL maintainer="Hugging Face"

ARG DEBIAN_FRONTEND=noninteractive

@@ -43,7 +43,7 @@ RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/pef
RUN python3 -m pip install --no-cache-dir git+https://github.com/huggingface/optimum@main#egg=optimum

# For video model testing
RUN python3 -m pip install --no-cache-dir decord av==9.2.0
RUN python3 -m pip install --no-cache-dir av==9.2.0

# Some slow tests require bnb
RUN python3 -m pip install --no-cache-dir bitsandbytes

@@ -1,4 +1,4 @@
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04
LABEL maintainer="Hugging Face"

ARG DEBIAN_FRONTEND=noninteractive

@@ -1,4 +1,4 @@
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
LABEL maintainer="Hugging Face"

ARG DEBIAN_FRONTEND=noninteractive

@@ -9,12 +9,12 @@ SHELL ["sh", "-lc"]
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
# to be used as arguments for docker build (so far).

ARG PYTORCH='2.2.1'
ARG PYTORCH='2.4.1'
# Example: `cu102`, `cu113`, etc.
ARG CUDA='cu118'

RUN apt update
RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python python3-pip ffmpeg
RUN apt install -y git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-pip ffmpeg
RUN python3 -m pip install --no-cache-dir --upgrade pip

ARG REF=main

@@ -53,7 +53,7 @@ RUN python3 -m pip install --no-cache-dir gguf

# Add autoawq for quantization testing
# >=v0.2.3 needed for compatibility with torch 2.2.1
RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+cu118-cp38-cp38-linux_x86_64.whl
RUN python3 -m pip install --no-cache-dir https://github.com/casper-hansen/AutoAWQ/releases/download/v0.2.3/autoawq-0.2.3+cu118-cp310-cp310-linux_x86_64.whl

# Add quanto for quantization testing
RUN python3 -m pip install --no-cache-dir optimum-quanto

@@ -1,4 +1,4 @@
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu20.04
FROM nvidia/cuda:12.1.0-cudnn8-devel-ubuntu22.04
LABEL maintainer="Hugging Face"

ARG DEBIAN_FRONTEND=noninteractive
@@ -112,7 +112,7 @@ Bevor Sie irgendwelchen Code schreiben, empfehlen wir Ihnen dringend, die besteh

You need basic `git` skills to contribute to 🤗 Transformers. Although `git` is not the easiest tool to use, it has a very good manual. Type `git --help` in a shell and enjoy! If you prefer books, [Pro Git](https://git-scm.com/book/en/v2) is a good place to start.

You need **[Python 3.8](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** or higher to contribute to 🤗 Transformers. Follow the steps below to start contributing:
You need **[Python 3.9](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** or higher to contribute to 🤗 Transformers. Follow the steps below to start contributing:

1. Fork the [repository](https://github.com/huggingface/transformers) by clicking the **[Fork](https://github.com/huggingface/transformers/fork)** button on the repository's page. This creates a copy of the code under your GitHub account.
@@ -414,6 +414,8 @@
      title: Gemma
    - local: model_doc/gemma2
      title: Gemma2
    - local: model_doc/glm
      title: GLM
    - local: model_doc/openai-gpt
      title: GPT
    - local: model_doc/gpt_neo
@@ -604,6 +606,8 @@
      title: XLNet
    - local: model_doc/yoso
      title: YOSO
    - local: model_doc/zamba
      title: Zamba
    title: Text models
  - isExpanded: false
    sections:
@@ -713,8 +717,6 @@
      title: ViTMSN
    - local: model_doc/yolos
      title: YOLOS
    - local: model_doc/zamba
      title: Zamba
    - local: model_doc/zoedepth
      title: ZoeDepth
    title: Vision models
@@ -740,6 +742,8 @@
      title: Mimi
    - local: model_doc/mms
      title: MMS
    - local: model_doc/moshi
      title: Moshi
    - local: model_doc/musicgen
      title: MusicGen
    - local: model_doc/musicgen_melody
@@ -969,4 +973,4 @@
    - local: internal/time_series_utils
      title: Utilities for Time Series
    title: Internal Helpers
  title: API
  title: API
@@ -332,7 +332,7 @@ This code can quickly be converted into a tool, just by wrapping it in a function
from transformers import tool

@tool
def model_download_counter(task: str) -> str:
def model_download_tool(task: str) -> str:
    """
    This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub.
    It returns the name of the checkpoint.
@@ -345,7 +345,7 @@ def model_download_counter(task: str) -> str:
```

The function needs:
- A clear name. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's put `model_download_counter`.
- A clear name. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's put `model_download_tool`.
- Type hints on both inputs and output
- A description, that includes an 'Args:' part where each argument is described (without a type indication this time, it will be pulled from the type hint).
All these will be automatically baked into the agent's system prompt upon initialization: so strive to make them as clear as possible!
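For reference, here is a minimal end-to-end sketch of the renamed tool; the body below, built on `huggingface_hub.list_models`, is an illustrative assumption and not part of this diff:

```python
from huggingface_hub import list_models
from transformers import tool


@tool
def model_download_tool(task: str) -> str:
    """
    This is a tool that returns the most downloaded model of a given task on the Hugging Face Hub.
    It returns the name of the checkpoint.

    Args:
        task: The task for which to get the download count.
    """
    # Query the Hub for models of this task, sorted by downloads (descending),
    # and return the id of the top result.
    most_downloaded_model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
    return most_downloaded_model.id
```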
@@ -367,7 +367,7 @@ You get the following:
======== New task ========
Can you give me the name of the model that has the most downloads in the 'text-to-video' task on the Hugging Face Hub?
==== Agent is executing the code below:
most_downloaded_model = model_download_counter(task="text-to-video")
most_downloaded_model = model_download_tool(task="text-to-video")
print(f"The most downloaded model for the 'text-to-video' task is {most_downloaded_model}.")
====
```
@@ -66,10 +66,10 @@ manager_agent.run("Who is the CEO of Hugging Face?")

Let's take again the tool example from main documentation, for which we had implemented a `tool` decorator.

If you need to add variation, like custom attributes for your too, you can build your tool following the fine-grained method: building a class that inherits from the [`Tool`] superclass.
If you need to add variation, like custom attributes for your tool, you can build your tool following the fine-grained method: building a class that inherits from the [`Tool`] superclass.

The custom tool needs:
- An attribute `name`, which corresponds to the name of the tool itself. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's name is `model_download_counter`.
- An attribute `name`, which corresponds to the name of the tool itself. The name usually describes what the tool does. Since the code returns the model with the most downloads for a task, let's name it `model_download_counter`.
- An attribute `description` is used to populate the agent's system prompt.
- An `inputs` attribute, which is a dictionary with keys `"type"` and `"description"`. It contains information that helps the Python interpreter make educated choices about the input.
- An `output_type` attribute, which specifies the output type.
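To make that attribute list concrete, a minimal sketch of such a subclass could look as follows; the `forward` body and the exact type identifiers are assumptions for illustration:

```python
from huggingface_hub import list_models
from transformers import Tool  # Tool superclass, assumed importable from the top-level namespace


class ModelDownloadTool(Tool):
    # The four attributes described above.
    name = "model_download_counter"
    description = "Returns the most downloaded model checkpoint for a given task on the Hugging Face Hub."
    inputs = {
        "task": {
            "type": "string",  # exact type identifiers ("string"/"text") depend on the transformers version
            "description": "The task for which to get the download count.",
        }
    }
    output_type = "string"

    def forward(self, task: str) -> str:
        # Same logic as the decorated function: top model by downloads for the given task.
        most_downloaded_model = next(iter(list_models(filter=task, sort="downloads", direction=-1)))
        return most_downloaded_model.id
```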
@@ -240,4 +240,4 @@ with gr.Blocks() as demo:

if __name__ == "__main__":
    demo.launch()
```
```
@@ -943,6 +943,35 @@ all implementations of Jinja:
- Directly rendering a dict or list may give different results in other implementations (for example, string entries
might change from single-quoted to double-quoted). Adding the `tojson` filter can help to ensure consistency here.

### Writing generation prompts

We mentioned above that `add_generation_prompt` is a special variable that will be accessible inside your template,
and is controlled by the user setting the `add_generation_prompt` flag. If your model expects a header for
assistant messages, then your template must support adding the header when `add_generation_prompt` is set.

Here is an example of a template that formats messages ChatML-style, with generation prompt support:

```text
{{- bos_token }}
{%- for message in messages %}
    {{- '<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n' }}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n' }}
{%- endif %}
```

The exact content of the assistant header will depend on your specific model, but it should always be **the string
that represents the start of an assistant message**, so that if the user applies your template with
`add_generation_prompt=True` and then generates text, the model will write an assistant response. Also note that some
models do not need a generation prompt, because assistant messages always begin immediately after user messages.
This is particularly common for LLaMA and Mistral models, where assistant messages begin immediately after the `[/INST]`
token that ends user messages. In these cases, the template can ignore the `add_generation_prompt` flag.

Generation prompts are important! If your model requires a generation prompt but it is not set in the template, then
model generations will likely be severely degraded, or the model may display unusual behaviour like continuing
the final user message!
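As a quick illustration of the flag's effect, a short sketch using `apply_chat_template` (the checkpoint below is an illustrative choice of a ChatML-style model, not one named in this diff):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")  # illustrative ChatML-style model

messages = [{"role": "user", "content": "Hi there!"}]

# Without the generation prompt: the rendered string ends after the user turn.
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False))

# With the generation prompt: the assistant header (e.g. '<|im_start|>assistant\n') is appended,
# so generation continues as an assistant reply instead of continuing the user message.
print(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True))
```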
### Writing and debugging larger templates

When this feature was introduced, most templates were quite small, the Jinja equivalent of a "one-liner" script.
@@ -85,6 +85,7 @@ For now the supported model architectures are the architectures that have been v
- StableLM
- GPT2
- Starcoder2
- T5

## Example usage
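With T5 added to the list, loading a GGUF checkpoint would look roughly like this; the repository id and file name below are placeholders, not values taken from this diff:

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_id = "some-org/t5-small-gguf"   # placeholder GGUF repository
gguf_file = "t5-small.Q8_0.gguf"      # placeholder quantized file name

# The gguf_file argument tells from_pretrained to dequantize the GGUF weights
# into a regular PyTorch model and matching tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=gguf_file)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id, gguf_file=gguf_file)
```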
@@ -19,7 +19,7 @@ State-of-the-art Machine Learning for [PyTorch](https://pytorch.org/), [TensorFl

🤗 Transformers provides APIs and tools to easily download and train state-of-the-art pretrained models. Using pretrained models can reduce your compute costs, carbon footprint, and save you the time and resources required to train a model from scratch. These models support common tasks in different modalities, such as:

📝 **Natural Language Processing**: text classification, named entity recognition, question answering, language modeling, summarization, translation, multiple choice, and text generation.<br>
📝 **Natural Language Processing**: text classification, named entity recognition, question answering, language modeling, code generation, summarization, translation, multiple choice, and text generation.<br>
🖼️ **Computer Vision**: image classification, object detection, and segmentation.<br>
🗣️ **Audio**: automatic speech recognition and audio classification.<br>
🐙 **Multimodal**: table question answering, optical character recognition, information extraction from scanned documents, video classification, and visual question answering.
@@ -150,6 +150,7 @@ Flax), PyTorch, and/or TensorFlow.
| [Gemma](model_doc/gemma) | ✅ | ❌ | ✅ |
| [Gemma2](model_doc/gemma2) | ✅ | ❌ | ❌ |
| [GIT](model_doc/git) | ✅ | ❌ | ❌ |
| [GLM](model_doc/glm) | ✅ | ❌ | ❌ |
| [GLPN](model_doc/glpn) | ✅ | ❌ | ❌ |
| [GPT Neo](model_doc/gpt_neo) | ✅ | ❌ | ✅ |
| [GPT NeoX](model_doc/gpt_neox) | ✅ | ❌ | ❌ |
@@ -223,6 +224,7 @@ Flax), PyTorch, and/or TensorFlow.
| [MobileNetV2](model_doc/mobilenet_v2) | ✅ | ❌ | ❌ |
| [MobileViT](model_doc/mobilevit) | ✅ | ✅ | ❌ |
| [MobileViTV2](model_doc/mobilevitv2) | ✅ | ❌ | ❌ |
| [Moshi](model_doc/moshi) | ✅ | ❌ | ❌ |
| [MPNet](model_doc/mpnet) | ✅ | ✅ | ❌ |
| [MPT](model_doc/mpt) | ✅ | ❌ | ❌ |
| [MRA](model_doc/mra) | ✅ | ❌ | ❌ |
@@ -185,6 +185,9 @@ generation.
[[autodoc]] SuppressTokensLogitsProcessor
    - __call__

[[autodoc]] SynthIDTextWatermarkLogitsProcessor
    - __call__

[[autodoc]] TemperatureLogitsWarper
    - __call__

@@ -418,5 +421,18 @@ A [`Constraint`] can be used to force the generation to include specific tokens

## Watermark Utils

[[autodoc]] WatermarkingConfig
    - __call__

[[autodoc]] WatermarkDetector
    - __call__

[[autodoc]] BayesianDetectorConfig

[[autodoc]] BayesianDetectorModel
    - forward

[[autodoc]] SynthIDTextWatermarkingConfig

[[autodoc]] SynthIDTextWatermarkDetector
    - __call__
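The new SynthID entries cover text watermarking at generation time. A hedged sketch of how they fit together (the model checkpoint and the `keys`/`ngram_len` values are placeholders; check the API reference added here for the exact arguments):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, SynthIDTextWatermarkingConfig

tokenizer = AutoTokenizer.from_pretrained("gpt2")  # placeholder model for illustration
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Configure the SynthID-Text watermark; the key values and n-gram length are placeholders.
watermarking_config = SynthIDTextWatermarkingConfig(keys=[654, 400, 836, 123, 340], ngram_len=5)

inputs = tokenizer("Write a short note about watermarking.", return_tensors="pt")
# Passing the config to generate() applies the watermark to sampled tokens.
outputs = model.generate(**inputs, watermarking_config=watermarking_config, do_sample=True, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```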
@@ -348,6 +348,99 @@ model = AutoModelForCausalLM.from_pretrained(
)
```

### Fine-Tuning with torch.compile and Padding-Free Data Collation

In addition to optimizing inference, you can also enhance the training efficiency of large language models by leveraging torch.compile during fine-tuning and using a padding-free data collator. This approach can significantly speed up training and reduce computational overhead.

Here's how you can fine-tune a Llama model using SFTTrainer from the TRL library, with torch_compile enabled and a padding-free data collator:

```
#################### IMPORTS ###################

import math
import datasets
import dataclasses
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments
)
from trl import SFTConfig, SFTTrainer, DataCollatorForCompletionOnlyLM

#################### MODEL LOADING WITH FLASH ATTENTION ###################

model_name = "meta-llama/Llama-3.2-1B"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    attn_implementation="flash_attention_2"  # Enables FlashAttention-2
)
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)

#################### DATA PREPROCESSING (PADDING-FREE) ###################

response_template = "\n### Label:"
response_template_ids = tokenizer.encode(
    response_template, add_special_tokens=False
)[2:]  # Exclude special tokens

data_collator = DataCollatorForCompletionOnlyLM(
    response_template_ids=response_template_ids,
    tokenizer=tokenizer,
    ignore_index=-100,
    padding_free=True  # Enables padding-free collation
)

def format_dataset(example):
    return {
        "output": example["output"] + tokenizer.eos_token
    }

data_files = {"train": "path/to/dataset"}  # Replace with your dataset path
json_dataset = datasets.load_dataset("json", data_files=data_files)
formatted_train_dataset = json_dataset["train"].map(format_dataset)

################# TRAINING CONFIGURATION ############################

train_args = TrainingArguments(
    num_train_epochs=5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=1e-5,
    weight_decay=0.0,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    logging_steps=1,
    include_tokens_per_second=True,
    save_strategy="epoch",
    output_dir="output",
    torch_compile=True,  # Enables torch.compile
    torch_compile_backend="inductor",
    torch_compile_mode="default"
)

# Convert TrainingArguments to SFTConfig
transformer_train_arg_fields = [x.name for x in dataclasses.fields(SFTConfig)]
transformer_kwargs = {
    k: v
    for k, v in train_args.to_dict().items()
    if k in transformer_train_arg_fields
}
training_args = SFTConfig(**transformer_kwargs)

####################### FINE-TUNING #####################

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=formatted_train_dataset,
    data_collator=data_collator,
    dataset_text_field="output",
    args=training_args,
)
trainer.train()
```

### PyTorch scaled dot product attention

Scaled dot product attention (SDPA) is automatically enabled in PyTorch 2.0 and it supports FlashAttention, xFormers, and PyTorch's C++ implementation. SDPA chooses the most performant attention algorithm if you're using a CUDA backend. For other backends, SDPA defaults to the PyTorch C++ implementation.
@@ -41,8 +41,6 @@ like token streaming.
    - validate
    - get_generation_mode

[[autodoc]] generation.WatermarkingConfig

## GenerationMixin

[[autodoc]] GenerationMixin
@@ -84,27 +84,24 @@ If you want to do the pre- and postprocessing yourself, here's how to do that:

>>> with torch.no_grad():
...     outputs = model(**inputs)
...     predicted_depth = outputs.predicted_depth

>>> # interpolate to original size
>>> prediction = torch.nn.functional.interpolate(
...     predicted_depth.unsqueeze(1),
...     size=image.size[::-1],
...     mode="bicubic",
...     align_corners=False,
>>> # interpolate to original size and visualize the prediction
>>> post_processed_output = image_processor.post_process_depth_estimation(
...     outputs,
...     target_sizes=[(image.height, image.width)],
... )

>>> # visualize the prediction
>>> output = prediction.squeeze().cpu().numpy()
>>> formatted = (output * 255 / np.max(output)).astype("uint8")
>>> depth = Image.fromarray(formatted)
>>> predicted_depth = post_processed_output[0]["predicted_depth"]
>>> depth = (predicted_depth - predicted_depth.min()) / (predicted_depth.max() - predicted_depth.min())
>>> depth = depth.detach().cpu().numpy() * 255
>>> depth = Image.fromarray(depth.astype("uint8"))
```

## Resources

A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Depth Anything.

- [Monocular depth estimation task guide](../tasks/depth_estimation)
- [Monocular depth estimation task guide](../tasks/monocular_depth_estimation)
- A notebook showcasing inference with [`DepthAnythingForDepthEstimation`] can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Depth%20Anything/Predicting_depth_in_an_image_with_Depth_Anything.ipynb). 🌎

If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
@@ -78,27 +78,24 @@ If you want to do the pre- and post-processing yourself, here's how to do that:

>>> with torch.no_grad():
...     outputs = model(**inputs)
...     predicted_depth = outputs.predicted_depth

>>> # interpolate to original size
>>> prediction = torch.nn.functional.interpolate(
...     predicted_depth.unsqueeze(1),
...     size=image.size[::-1],
...     mode="bicubic",
...     align_corners=False,
>>> # interpolate to original size and visualize the prediction
>>> post_processed_output = image_processor.post_process_depth_estimation(
...     outputs,
...     target_sizes=[(image.height, image.width)],
... )

>>> # visualize the prediction
>>> output = prediction.squeeze().cpu().numpy()
>>> formatted = (output * 255 / np.max(output)).astype("uint8")
>>> depth = Image.fromarray(formatted)
>>> predicted_depth = post_processed_output[0]["predicted_depth"]
>>> depth = (predicted_depth - predicted_depth.min()) / (predicted_depth.max() - predicted_depth.min())
>>> depth = depth.detach().cpu().numpy() * 255
>>> depth = Image.fromarray(depth.astype("uint8"))
```

## Resources

A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with Depth Anything.

- [Monocular depth estimation task guide](../tasks/depth_estimation)
- [Monocular depth estimation task guide](../tasks/monocular_depth_estimation)
- [Depth Anything V2 demo](https://huggingface.co/spaces/depth-anything/Depth-Anything-V2).
- A notebook showcasing inference with [`DepthAnythingForDepthEstimation`] can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Depth%20Anything/Predicting_depth_in_an_image_with_Depth_Anything.ipynb). 🌎
- [Core ML conversion of the `small` variant for use on Apple Silicon](https://huggingface.co/apple/coreml-depth-anything-v2-small).
@@ -181,6 +181,15 @@ If you're interested in submitting a resource to be included here, please feel f
    - post_process_instance_segmentation
    - post_process_panoptic_segmentation

## DetrImageProcessorFast

[[autodoc]] DetrImageProcessorFast
    - preprocess
    - post_process_object_detection
    - post_process_semantic_segmentation
    - post_process_instance_segmentation
    - post_process_panoptic_segmentation
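A rough usage sketch for the new fast image processor, assuming it is a drop-in replacement for the existing `DetrImageProcessor` (the checkpoint and threshold are illustrative):

```python
import requests
import torch
from PIL import Image
from transformers import DetrForObjectDetection, DetrImageProcessorFast

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# The fast processor is assumed to expose the same preprocess/post-process API as the slow one.
processor = DetrImageProcessorFast.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Rescale boxes to the original image size and keep reasonably confident detections.
results = processor.post_process_object_detection(
    outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.9
)[0]
print(results["labels"], results["scores"])
```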
## DetrFeatureExtractor

[[autodoc]] DetrFeatureExtractor
docs/source/en/model_doc/glm.md (new file, 99 lines)

@@ -0,0 +1,99 @@
<!--Copyright 2024 The GLM & ZhipuAI team and The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# GLM

## Overview

The GLM Model was proposed
in [ChatGLM: A Family of Large Language Models from GLM-130B to GLM-4 All Tools](https://arxiv.org/html/2406.12793v1)
by GLM Team, THUDM & ZhipuAI.

The abstract from the paper is the following:

*We introduce ChatGLM, an evolving family of large language models that we have been developing over time. This report
primarily focuses on the GLM-4 language series, which includes GLM-4, GLM-4-Air, and GLM-4-9B. They represent our most
capable models that are trained with all the insights and lessons gained from the preceding three generations of
ChatGLM. To date, the GLM-4 models are pre-trained on ten trillions of tokens mostly in Chinese and English, along with
a small set of corpus from 24 languages, and aligned primarily for Chinese and English usage. The high-quality alignment
is achieved via a multi-stage post-training process, which involves supervised fine-tuning and learning from human
feedback. Evaluations show that GLM-4 1) closely rivals or outperforms GPT-4 in terms of general metrics such as MMLU,
GSM8K, MATH, BBH, GPQA, and HumanEval, 2) gets close to GPT-4-Turbo in instruction following as measured by IFEval, 3)
matches GPT-4 Turbo (128K) and Claude 3 for long context tasks, and 4) outperforms GPT-4 in Chinese alignments as
measured by AlignBench. The GLM-4 All Tools model is further aligned to understand user intent and autonomously decide
when and which tool(s) to use—including web browser, Python interpreter, text-to-image model, and user-defined
functions—to effectively complete complex tasks. In practical applications, it matches and even surpasses GPT-4 All
Tools in tasks like accessing online information via web browsing and solving math problems using Python interpreter.
Over the course, we have open-sourced a series of models, including ChatGLM-6B (three generations), GLM-4-9B (128K, 1M),
GLM-4V-9B, WebGLM, and CodeGeeX, attracting over 10 million downloads on Hugging face in the year 2023 alone.*

Tips:

- This model was contributed by [THUDM](https://huggingface.co/THUDM). The most recent code can be
found [here](https://github.com/thudm/GLM-4).


## Usage tips

`GLM-4` can be found on the [Huggingface Hub](https://huggingface.co/collections/THUDM/glm-4-665fcf188c414b03c2f7e3b7)

In the following, we demonstrate how to use `glm-4-9b-chat` for the inference. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose.

```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> device = "cuda" # the device to load the model onto

>>> model = AutoModelForCausalLM.from_pretrained("THUDM/glm-4-9b-chat", device_map="auto")
>>> tokenizer = AutoTokenizer.from_pretrained("THUDM/glm-4-9b-chat")

>>> prompt = "Give me a short introduction to large language model."

>>> messages = [{"role": "user", "content": prompt}]

>>> text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

>>> model_inputs = tokenizer([text], return_tensors="pt").to(device)

>>> generated_ids = model.generate(model_inputs.input_ids, max_new_tokens=512, do_sample=True)

>>> generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)]

>>> response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
```

## GlmConfig

[[autodoc]] GlmConfig

## GlmModel

[[autodoc]] GlmModel
    - forward

## GlmForCausalLM

[[autodoc]] GlmForCausalLM
    - forward

## GlmForSequenceClassification

[[autodoc]] GlmForSequenceClassification
    - forward

## GlmForTokenClassification

[[autodoc]] GlmForTokenClassification
    - forward
@@ -66,4 +66,4 @@ The original code can be found [here](https://github.com/kyutai-labs/moshi).

[[autodoc]] MimiModel
    - decode
    - encode
    - forward
    - forward
docs/source/en/model_doc/moshi.md (new file, 183 lines)

@@ -0,0 +1,183 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# Moshi

## Overview

The Moshi model was proposed in [Moshi: a speech-text foundation model for real-time dialogue](https://kyutai.org/Moshi.pdf) by Alexandre Défossez, Laurent Mazaré, Manu Orsini, Amélie Royer, Patrick Pérez, Hervé Jégou, Edouard Grave and Neil Zeghidour.

Moshi is a speech-text foundation model that casts spoken dialogue as speech-to-speech generation. Starting from a text language model backbone, Moshi generates speech as tokens from the residual quantizer of a neural audio codec, while modeling separately its own speech and that of the user into parallel streams. This allows for the removal of explicit speaker turns, and the modeling of arbitrary conversational dynamics. Moshi also predicts time-aligned text tokens as a prefix to audio tokens. This “Inner Monologue” method significantly improves the linguistic quality of generated speech and provides streaming speech recognition and text-to-speech. As a result, Moshi is the first real-time full-duplex spoken large language model, with a theoretical latency of 160ms, 200ms in practice.

<div style="text-align: center">
<img src="https://huggingface.co/datasets/ylacombe/benchmark-comparison/resolve/main/moshi_architecture.png">
</div>

The abstract from the paper is the following:

*We introduce Moshi, a speech-text foundation model and full-duplex spoken dialogue framework. Current systems for spoken dialogue rely on pipelines of independent components, namely voice activity detection, speech recognition, textual dialogue and text-to-speech. Such frameworks cannot emulate the experience of real conversations. First, their complexity induces a latency of several seconds between interactions. Second, text being the intermediate modality for dialogue, non-linguistic information that modifies meaning— such as emotion or non-speech sounds— is lost in the interaction. Finally, they rely on a segmentation into speaker turns, which does not take into account overlapping speech, interruptions and interjections. Moshi solves these independent issues altogether by casting spoken dialogue as speech-to-speech generation. Starting from a text language model backbone, Moshi generates speech as tokens from the residual quantizer of a neural audio codec, while modeling separately its own speech and that of the user into parallel streams. This allows for the removal of explicit speaker turns, and the modeling of arbitrary conversational dynamics. We moreover extend the hierarchical semantic-to-acoustic token generation of previous work to first predict time-aligned text tokens as a prefix to audio tokens. Not only this “Inner Monologue” method significantly improves the linguistic quality of generated speech, but we also illustrate how it can provide streaming speech recognition and text-to-speech. Our resulting model is the first real-time full-duplex spoken large language model, with a theoretical latency of 160ms, 200ms in practice, and is available at github.com/kyutai-labs/moshi.*

Moshi deals with 3 streams of information:
1. The user's audio
2. Moshi's audio
3. Moshi's textual output

Similarly to [`~MusicgenModel`], audio is represented with audio codebooks, which can be interpreted like tokens. The main difference between text tokens and audio codebooks is that audio codebooks introduce an additional dimension of information.
Text tokens are typically of dim `(batch_size, sequence_length)` but audio tokens are of dim `(batch_size, num_codebooks, sequence_length)`.

Moshi's made of 3 components:

**1. The main decoder (Helium in the paper)**

It corresponds to [`MoshiForCausalLM`]. It is strictly a classic text LLM, that uses an architecture similar to [` ~GemmaForCausalLM`]. In other words, it takes text tokens, embeds them, pass them through the decoder and a language head, to get text logits.

**2. The depth decoder**

On its own, it's also a classic LLM, but this time, instead of generating over the time dimension, it generates over the codebook dimension.

It also means that its context length is `num_codebooks`, thus it can't generate more than `num_codebooks`.

Note that each timestamp - i.e each codebook - gets its own set of Linear Layers and Embeddings.

**3. [`MimiModel`]**

It's the audio encoder from Kyutai, that has recently been integrated to transformers, which is used to "tokenize" audio. It has the same use that [`~EncodecModel`] has in [`~MusicgenModel`].


## Tips:

The original checkpoints can be converted using the conversion script `src/transformers/models/moshi/convert_moshi_transformers.py`


### How to use the model:

This implementation has two main aims:
1. quickly test model generation by simplifying the original API
2. simplify training. A training guide will come soon, but user contributions are welcomed!

<Tip>

It is designed for intermediate use. We strongly recommend using the original [implementation](https://github.com/kyutai-labs/moshi) to infer the model in real-time streaming.

</Tip>

**1. Model generation**

Moshi is a streaming auto-regressive model with two streams of audio. To put it differently, one audio stream corresponds to what the model said/will say and the other audio stream corresponds to what the user said/will say.

[`MoshiForConditionalGeneration.generate`] thus needs 3 inputs:
1. `input_ids` - corresponding to the text token history
2. `moshi_input_values` or `moshi_audio_codes`- corresponding to the model audio history
3. `user_input_values` or `user_audio_codes` - corresponding to the user audio history

These three inputs must be synchronized. Meaning that their lengths must correspond to the same number of tokens.

You can dynamically use the 3 inputs depending on what you want to test:
1. Simply check the model response to an user prompt - in that case, `input_ids` can be filled with pad tokens and `user_input_values` can be a zero tensor of the same shape than the user prompt.
2. Test more complex behaviour - in that case, you must be careful about how the input tokens are synchronized with the audios.

<Tip>

The original model is synchronized text with audio by padding the text in between each token enunciation.

To follow the example of the following image, `"Hello, I'm Moshi"` could be transformed to `"Hello,<pad><unk>I'm Moshi"`.

</Tip>

<div style="text-align: center">
<img src="https://huggingface.co/datasets/ylacombe/benchmark-comparison/resolve/main/moshi_text_sync.png">
</div>


[`MoshiForConditionalGeneration.generate`] then auto-regressively feeds to itself its own audio stream, but since it doesn't have access to the user input stream while using `transformers`, it will thus **assume that the user is producing blank audio**.



```python
>>> from datasets import load_dataset, Audio
>>> import torch, math
>>> from transformers import MoshiForConditionalGeneration, AutoFeatureExtractor, AutoTokenizer
>>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

>>> # (the snippet assumes `model`, `tokenizer`, `feature_extractor`, `device`, `dtype`
>>> # and `waveform_to_token_ratio` have already been loaded/defined from a Moshi checkpoint)

>>> # prepare user input audio
>>> librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=feature_extractor.sampling_rate))
>>> audio_sample = librispeech_dummy[-1]["audio"]["array"]
>>> user_input_values = feature_extractor(raw_audio=audio_sample, sampling_rate=feature_extractor.sampling_rate, return_tensors="pt").to(device=device, dtype=dtype)

>>> # prepare moshi input values - we suppose moshi didn't say anything while the user spoke
>>> moshi_input_values = torch.zeros_like(user_input_values.input_values)

>>> # prepare moshi input ids - we suppose moshi didn't say anything while the user spoke
>>> num_tokens = math.ceil(moshi_input_values.shape[-1] * waveform_to_token_ratio)
>>> input_ids = torch.ones((1, num_tokens), device=device, dtype=torch.int64) * tokenizer.encode("<pad>")[0]

>>> # generate 25 new tokens (around 2s of audio)
>>> output = model.generate(input_ids=input_ids, user_input_values=user_input_values.input_values, moshi_input_values=moshi_input_values, max_new_tokens=25)

>>> text_tokens = output.sequences
>>> audio_waveforms = output.audio_sequences
```

**2. Model training**

Most of the work has to be done during data creation/pre-processing, because of the need to align/synchronize streams.

Once it's done, you can simply forward `text_labels` and `audio_labels` to [`MoshiForConditionalGeneration.forward`], alongside the usual inputs, to get the model loss.

A training guide will come soon, but user contributions are welcomed!

### How does the model forward the inputs / generate:

1. The input streams are embedded and combined into `inputs_embeds`.

2. `inputs_embeds` is passed through the main decoder, which processes it like a normal LLM would.

3. The main decoder outputs `text logits` but also its `last hidden state` which is called `temporal context` in the paper.

4. The depth decoder switches the dimension on which we forward / generate (codebooks instead of time). It uses the token generated from `text logits` and the `temporal context` to auto-regressively generate audio codebooks.


This model was contributed by [Yoach Lacombe (ylacombe)](https://huggingface.co/ylacombe).

The original code can be found [here](https://github.com/kyutai-labs/moshi).



## MoshiConfig

[[autodoc]] MoshiConfig

## MoshiDepthConfig

[[autodoc]] MoshiDepthConfig

## MoshiModel

[[autodoc]] MoshiModel
    - forward

## MoshiForCausalLM

[[autodoc]] MoshiForCausalLM
    - forward

## MoshiForConditionalGeneration

[[autodoc]] MoshiForConditionalGeneration
    - forward
    - generate
    - get_unconditional_inputs
@@ -23,6 +23,43 @@ The abstract from the paper is the following:

This model was contributed by [jegormeister](https://huggingface.co/jegormeister). The original code (written in JAX) can be found [here](https://github.com/google-research/scenic/tree/main/scenic/projects/vivit).

### Using Scaled Dot Product Attention (SDPA)

PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
page for more information.

SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.

```
import torch
from transformers import VivitModel
model = VivitModel.from_pretrained("google/vivit-b-16x2-kinetics400", attn_implementation="sdpa", torch_dtype=torch.float16)
...
```

For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).

On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `google/vivit-b-16x2-kinetics400` model, we saw the following speedups during inference.

### Training
| num_training_steps | batch_size | is cuda | Speedup (%) | Eager peak mem (MB) | sdpa peak mem (MB) | Mem saving (%) |
|---------------------:|-------------:|----------:|--------------:|----------------------:|---------------------:|-----------------:|
| 100 | 1 | True | 7.122 | 2575.28 | 5932.54 | 130.364 |


### Inference
| num_batches | batch_size | is cuda | is half | Speedup (%) | Mem eager (MB) | Mem BT (MB) | Mem saved (%) |
|---------------|--------------|-----------|-----------|---------------|------------------|---------------|-----------------|
| 20 | 1 | True | False | 15.422 | 715.807 | 317.079 | 125.75 |
| 20 | 2 | True | False | 17.146 | 1234.75 | 447.175 | 176.122 |
| 20 | 4 | True | False | 18.093 | 2275.82 | 709.864 | 220.6 |
| 20 | 8 | True | False | 19.284 | 4358.19 | 1233.24 | 253.393 |


## VivitConfig

[[autodoc]] VivitConfig
@@ -39,54 +39,66 @@ The original code can be found [here](https://github.com/isl-org/ZoeDepth).

The easiest way to perform inference with ZoeDepth is by leveraging the [pipeline API](../main_classes/pipelines.md):

```python
from transformers import pipeline
from PIL import Image
import requests
>>> from transformers import pipeline
>>> from PIL import Image
>>> import requests

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

pipe = pipeline(task="depth-estimation", model="Intel/zoedepth-nyu-kitti")
result = pipe(image)
depth = result["depth"]
>>> pipe = pipeline(task="depth-estimation", model="Intel/zoedepth-nyu-kitti")
>>> result = pipe(image)
>>> depth = result["depth"]
```
Alternatively, one can also perform inference using the classes:

```python
from transformers import AutoImageProcessor, ZoeDepthForDepthEstimation
import torch
import numpy as np
from PIL import Image
import requests
>>> from transformers import AutoImageProcessor, ZoeDepthForDepthEstimation
>>> import torch
>>> import numpy as np
>>> from PIL import Image
>>> import requests

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

image_processor = AutoImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti")
model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti")
>>> image_processor = AutoImageProcessor.from_pretrained("Intel/zoedepth-nyu-kitti")
>>> model = ZoeDepthForDepthEstimation.from_pretrained("Intel/zoedepth-nyu-kitti")

# prepare image for the model
inputs = image_processor(images=image, return_tensors="pt")
>>> # prepare image for the model
>>> inputs = image_processor(images=image, return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
    predicted_depth = outputs.predicted_depth
>>> with torch.no_grad():
...     outputs = model(**inputs)

# interpolate to original size
prediction = torch.nn.functional.interpolate(
    predicted_depth.unsqueeze(1),
    size=image.size[::-1],
    mode="bicubic",
    align_corners=False,
)
>>> # interpolate to original size and visualize the prediction
>>> ## ZoeDepth dynamically pads the input image. Thus we pass the original image size as argument
>>> ## to `post_process_depth_estimation` to remove the padding and resize to original dimensions.
>>> post_processed_output = image_processor.post_process_depth_estimation(
...     outputs,
...     source_sizes=[(image.height, image.width)],
... )

# visualize the prediction
output = prediction.squeeze().cpu().numpy()
formatted = (output * 255 / np.max(output)).astype("uint8")
depth = Image.fromarray(formatted)
>>> predicted_depth = post_processed_output[0]["predicted_depth"]
>>> depth = (predicted_depth - predicted_depth.min()) / (predicted_depth.max() - predicted_depth.min())
>>> depth = depth.detach().cpu().numpy() * 255
>>> depth = Image.fromarray(depth.astype("uint8"))
```
<Tip>
<p>In the <a href="https://github.com/isl-org/ZoeDepth/blob/edb6daf45458569e24f50250ef1ed08c015f17a7/zoedepth/models/depth_model.py#L131">original implementation</a>, the ZoeDepth model performs inference on both the original and flipped images and averages out the results. The <code>post_process_depth_estimation</code> function can handle this for us by passing the flipped outputs to the optional <code>outputs_flipped</code> argument:</p>
<pre><code class="language-Python">>>> with torch.no_grad():
...     outputs = model(**inputs)
...     outputs_flipped = model(pixel_values=torch.flip(inputs.pixel_values, dims=[3]))
>>> post_processed_output = image_processor.post_process_depth_estimation(
...     outputs,
...     source_sizes=[(image.height, image.width)],
...     outputs_flipped=outputs_flipped,
... )
</code></pre>
</Tip>

## Resources

A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with ZoeDepth.
@@ -42,6 +42,7 @@ FlashAttention-2 is currently supported for the following architectures:
* [Chameleon](https://huggingface.co/docs/transformers/model_doc/chameleon#transformers.Chameleon)
* [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPModel)
* [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel)
* [GLM](https://huggingface.co/docs/transformers/model_doc/glm#transformers.GLMModel)
* [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel)
* [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel)
* [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel)
@@ -70,6 +71,7 @@ FlashAttention-2 is currently supported for the following architectures:
* [MBart](https://huggingface.co/docs/transformers/model_doc/mbart#transformers.MBartModel)
* [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel)
* [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel)
* [Moshi](https://huggingface.co/docs/transformers/model_doc/moshi#transformers.MoshiModel)
* [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel)
* [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel)
* [Nemotron](https://huggingface.co/docs/transformers/model_doc/nemotron)
@@ -77,6 +79,7 @@ FlashAttention-2 is currently supported for the following architectures:
* [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel)
* [OLMoE](https://huggingface.co/docs/transformers/model_doc/olmoe#transformers.OlmoeModel)
* [OPT](https://huggingface.co/docs/transformers/model_doc/opt#transformers.OPTModel)
* [PaliGemma](https://huggingface.co/docs/transformers/model_doc/paligemma#transformers.PaliGemmaForConditionalGeneration)
* [Phi](https://huggingface.co/docs/transformers/model_doc/phi#transformers.PhiModel)
* [Phi3](https://huggingface.co/docs/transformers/model_doc/phi3#transformers.Phi3Model)
* [PhiMoE](https://huggingface.co/docs/transformers/model_doc/phimoe#transformers.PhimoeModel)
@@ -86,6 +89,10 @@ FlashAttention-2 is currently supported for the following architectures:
* [Qwen2Audio](https://huggingface.co/docs/transformers/model_doc/qwen2_audio#transformers.Qwen2AudioEncoder)
* [Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe#transformers.Qwen2MoeModel)
* [Qwen2VL](https://huggingface.co/docs/transformers/model_doc/qwen2_vl#transformers.Qwen2VLModel)
* [RAG](https://huggingface.co/docs/transformers/model_doc/rag#transformers.RagModel)
* [SpeechEncoderDecoder](https://huggingface.co/docs/transformers/model_doc/speech_encoder_decoder#transformers.SpeechEncoderDecoderModel)
* [VisionEncoderDecoder](https://huggingface.co/docs/transformers/model_doc/vision_encoder_decoder#transformers.VisionEncoderDecoderModel)
* [VisionTextDualEncoder](https://huggingface.co/docs/transformers/model_doc/vision_text_dual_encoder#transformers.VisionTextDualEncoderModel)
* [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperModel)
* [Wav2Vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.Wav2Vec2Model)
* [Hubert](https://huggingface.co/docs/transformers/model_doc/hubert#transformers.HubertModel)
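
Regardless of which architecture you pick from the list above, opting in to FlashAttention-2 follows the same pattern: request it through `attn_implementation` when loading the model. A minimal sketch (the checkpoint name is only an illustrative choice of a listed model):

```python
import torch
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mistral-7B-v0.1",              # illustrative checkpoint of a supported architecture
    torch_dtype=torch.bfloat16,               # FlashAttention-2 requires fp16 or bf16 weights
    attn_implementation="flash_attention_2",  # opt in to FlashAttention-2
    device_map="auto",
)
```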
@@ -215,6 +222,7 @@ For now, Transformers supports SDPA inference and training for the following arc
* [CamemBERT](https://huggingface.co/docs/transformers/model_doc/camembert#transformers.CamembertModel)
* [Chameleon](https://huggingface.co/docs/transformers/model_doc/chameleon#transformers.Chameleon)
* [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPModel)
* [GLM](https://huggingface.co/docs/transformers/model_doc/glm#transformers.GLMModel)
* [Cohere](https://huggingface.co/docs/transformers/model_doc/cohere#transformers.CohereModel)
* [data2vec_audio](https://huggingface.co/docs/transformers/main/en/model_doc/data2vec#transformers.Data2VecAudioModel)
* [Dbrx](https://huggingface.co/docs/transformers/model_doc/dbrx#transformers.DbrxModel)
@@ -222,6 +230,7 @@ For now, Transformers supports SDPA inference and training for the following arc
* [Dinov2](https://huggingface.co/docs/transformers/en/model_doc/dinov2)
* [DistilBert](https://huggingface.co/docs/transformers/model_doc/distilbert#transformers.DistilBertModel)
* [Dpr](https://huggingface.co/docs/transformers/model_doc/dpr#transformers.DprReader)
* [EncoderDecoder](https://huggingface.co/docs/transformers/model_doc/encoder_decoder#transformers.EncoderDecoderModel)
* [Falcon](https://huggingface.co/docs/transformers/model_doc/falcon#transformers.FalconModel)
* [Gemma](https://huggingface.co/docs/transformers/model_doc/gemma#transformers.GemmaModel)
* [Gemma2](https://huggingface.co/docs/transformers/model_doc/gemma2#transformers.Gemma2Model)
@@ -230,17 +239,23 @@ For now, Transformers supports SDPA inference and training for the following arc
* [GPTNeoX](https://huggingface.co/docs/transformers/model_doc/gpt_neox#transformers.GPTNeoXModel)
* [Hubert](https://huggingface.co/docs/transformers/model_doc/hubert#transformers.HubertModel)
* [Idefics](https://huggingface.co/docs/transformers/model_doc/idefics#transformers.IdeficsModel)
* [Idefics2](https://huggingface.co/docs/transformers/model_doc/idefics2#transformers.Idefics2Model)
* [Idefics3](https://huggingface.co/docs/transformers/model_doc/idefics3#transformers.Idefics3Model)
* [Granite](https://huggingface.co/docs/transformers/model_doc/granite#transformers.GraniteModel)
* [GraniteMoe](https://huggingface.co/docs/transformers/model_doc/granitemoe#transformers.GraniteMoeModel)
* [JetMoe](https://huggingface.co/docs/transformers/model_doc/jetmoe#transformers.JetMoeModel)
* [Jamba](https://huggingface.co/docs/transformers/model_doc/jamba#transformers.JambaModel)
* [Llama](https://huggingface.co/docs/transformers/model_doc/llama#transformers.LlamaModel)
* [Llava](https://huggingface.co/docs/transformers/model_doc/llava)
* [Llava-NeXT](https://huggingface.co/docs/transformers/model_doc/llava_next)
* [Llava-NeXT-Video](https://huggingface.co/docs/transformers/model_doc/llava_next_video)
* [LLaVA-Onevision](https://huggingface.co/docs/transformers/model_doc/llava_onevision)
* [M2M100](https://huggingface.co/docs/transformers/model_doc/m2m_100#transformers.M2M100Model)
* [Mimi](https://huggingface.co/docs/transformers/model_doc/mimi)
* [Mistral](https://huggingface.co/docs/transformers/model_doc/mistral#transformers.MistralModel)
* [Mllama](https://huggingface.co/docs/transformers/model_doc/mllama#transformers.MllamaForConditionalGeneration)
* [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel)
* [Moshi](https://huggingface.co/docs/transformers/model_doc/moshi#transformers.MoshiModel)
* [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel)
* [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel)
* [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)
@@ -273,11 +288,17 @@ For now, Transformers supports SDPA inference and training for the following arc
* [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel)
* [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel)
* [Nemotron](https://huggingface.co/docs/transformers/model_doc/nemotron)
* [SpeechEncoderDecoder](https://huggingface.co/docs/transformers/model_doc/speech_encoder_decoder#transformers.SpeechEncoderDecoderModel)
* [VideoLlava](https://huggingface.co/docs/transformers/model_doc/video_llava)
* [VipLlava](https://huggingface.co/docs/transformers/model_doc/vipllava)
* [VisionEncoderDecoder](https://huggingface.co/docs/transformers/model_doc/vision_encoder_decoder#transformers.VisionEncoderDecoderModel)
* [ViT](https://huggingface.co/docs/transformers/model_doc/vit#transformers.ViTModel)
* [ViTHybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid#transformers.ViTHybridModel)
* [ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae#transformers.ViTMAEModel)
* [ViTMSN](https://huggingface.co/docs/transformers/model_doc/vit_msn#transformers.ViTMSNModel)
* [VisionTextDualEncoder](https://huggingface.co/docs/transformers/model_doc/vision_text_dual_encoder#transformers.VisionTextDualEncoderModel)
* [VideoMAE](https://huggingface.co/docs/transformers/model_doc/videomae#transformers.VideoMAEModel)
* [ViViT](https://huggingface.co/docs/transformers/model_doc/vivit#transformers.VivitModel)
* [wav2vec2](https://huggingface.co/docs/transformers/model_doc/wav2vec2#transformers.Wav2Vec2Model)
* [Whisper](https://huggingface.co/docs/transformers/model_doc/whisper#transformers.WhisperModel)
* [XLM-RoBERTa](https://huggingface.co/docs/transformers/model_doc/xlm-roberta#transformers.XLMRobertaModel)
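
Beyond requesting `attn_implementation="sdpa"`, recent PyTorch releases also let you constrain which SDPA backend is used. A rough sketch, assuming `torch>=2.3` for the `sdpa_kernel` context manager and an illustrative checkpoint from the list above:

```python
import torch
from torch.nn.attention import SDPBackend, sdpa_kernel
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B")  # illustrative checkpoint
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    torch_dtype=torch.float16,
    attn_implementation="sdpa",
    device_map="auto",
)

inputs = tokenizer("Hello, my llama is cute", return_tensors="pt").to(model.device)

# Force the FlashAttention backend of SDPA instead of letting PyTorch pick one.
with sdpa_kernel(SDPBackend.FLASH_ATTENTION):
    outputs = model.generate(**inputs, max_new_tokens=20)

print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```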
@@ -360,8 +360,8 @@ One particularly cool 🤗 Transformers feature is the ability to save a model a
```py
>>> from transformers import AutoModel

>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
>>> pt_model = AutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
```
</pt>
<tf>
@@ -369,8 +369,8 @@ One particularly cool 🤗 Transformers feature is the ability to save a model a
```py
>>> from transformers import TFAutoModel

>>> tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
>>> tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
>>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
```
</tf>
</frameworkcontent>
@@ -126,97 +126,34 @@ Pass the prepared inputs through the model:
...     outputs = model(pixel_values)
```

Let's post-process and visualize the results.

We need to pad and then resize the outputs so that predicted depth map has the same dimension as the original image. After resizing we will remove the padded regions from the depth.
Let's post-process the results to remove any padding and resize the depth map to match the original image size. The `post_process_depth_estimation` outputs a list of dicts containing the `"predicted_depth"`.

```py
>>> import numpy as np
>>> import torch.nn.functional as F
>>> # ZoeDepth dynamically pads the input image. Thus we pass the original image size as argument
>>> # to `post_process_depth_estimation` to remove the padding and resize to original dimensions.
>>> post_processed_output = image_processor.post_process_depth_estimation(
...     outputs,
...     source_sizes=[(image.height, image.width)],
... )

>>> predicted_depth = outputs.predicted_depth.unsqueeze(dim=1)
>>> height, width = pixel_values.shape[2:]

>>> height_padding_factor = width_padding_factor = 3
>>> pad_h = int(np.sqrt(height/2) * height_padding_factor)
>>> pad_w = int(np.sqrt(width/2) * width_padding_factor)

>>> if predicted_depth.shape[-2:] != pixel_values.shape[-2:]:
>>>   predicted_depth = F.interpolate(predicted_depth, size= (height, width), mode='bicubic', align_corners=False)

>>> if pad_h > 0:
      predicted_depth = predicted_depth[:, :, pad_h:-pad_h,:]
>>> if pad_w > 0:
      predicted_depth = predicted_depth[:, :, :, pad_w:-pad_w]
>>> predicted_depth = post_processed_output[0]["predicted_depth"]
>>> depth = (predicted_depth - predicted_depth.min()) / (predicted_depth.max() - predicted_depth.min())
>>> depth = depth.detach().cpu().numpy() * 255
>>> depth = Image.fromarray(depth.astype("uint8"))
```

We can now visualize the results (the function below is taken from the [GaussianObject](https://github.com/GaussianObject/GaussianObject/blob/ad6629efadb57902d5f8bc0fa562258029a4bdf1/pred_monodepth.py#L11) framework).
```py
|
||||
import matplotlib
|
||||
|
||||
def colorize(value, vmin=None, vmax=None, cmap='gray_r', invalid_val=-99, invalid_mask=None, background_color=(128, 128, 128, 255), gamma_corrected=False, value_transform=None):
|
||||
"""Converts a depth map to a color image.
|
||||
|
||||
Args:
|
||||
value (torch.Tensor, numpy.ndarray): Input depth map. Shape: (H, W) or (1, H, W) or (1, 1, H, W). All singular dimensions are squeezed
|
||||
vmin (float, optional): vmin-valued entries are mapped to start color of cmap. If None, value.min() is used. Defaults to None.
|
||||
vmax (float, optional): vmax-valued entries are mapped to end color of cmap. If None, value.max() is used. Defaults to None.
|
||||
cmap (str, optional): matplotlib colormap to use. Defaults to 'magma_r'.
|
||||
invalid_val (int, optional): Specifies value of invalid pixels that should be colored as 'background_color'. Defaults to -99.
|
||||
invalid_mask (numpy.ndarray, optional): Boolean mask for invalid regions. Defaults to None.
|
||||
background_color (tuple[int], optional): 4-tuple RGB color to give to invalid pixels. Defaults to (128, 128, 128, 255).
|
||||
gamma_corrected (bool, optional): Apply gamma correction to colored image. Defaults to False.
|
||||
value_transform (Callable, optional): Apply transform function to valid pixels before coloring. Defaults to None.
|
||||
|
||||
Returns:
|
||||
numpy.ndarray, dtype - uint8: Colored depth map. Shape: (H, W, 4)
|
||||
"""
|
||||
if isinstance(value, torch.Tensor):
|
||||
value = value.detach().cpu().numpy()
|
||||
|
||||
value = value.squeeze()
|
||||
if invalid_mask is None:
|
||||
invalid_mask = value == invalid_val
|
||||
mask = np.logical_not(invalid_mask)
|
||||
|
||||
# normalize
|
||||
vmin = np.percentile(value[mask],2) if vmin is None else vmin
|
||||
vmax = np.percentile(value[mask],85) if vmax is None else vmax
|
||||
if vmin != vmax:
|
||||
value = (value - vmin) / (vmax - vmin) # vmin..vmax
|
||||
else:
|
||||
# Avoid 0-division
|
||||
value = value * 0.
|
||||
|
||||
# squeeze last dim if it exists
|
||||
# grey out the invalid values
|
||||
|
||||
value[invalid_mask] = np.nan
|
||||
cmapper = matplotlib.colormaps.get_cmap(cmap)
|
||||
if value_transform:
|
||||
value = value_transform(value)
|
||||
# value = value / value.max()
|
||||
value = cmapper(value, bytes=True) # (nxmx4)
|
||||
|
||||
# img = value[:, :, :]
|
||||
img = value[...]
|
||||
img[invalid_mask] = background_color
|
||||
|
||||
# return img.transpose((2, 0, 1))
|
||||
if gamma_corrected:
|
||||
# gamma correction
|
||||
img = img / 255
|
||||
img = np.power(img, 2.2)
|
||||
img = img * 255
|
||||
img = img.astype(np.uint8)
|
||||
return img
|
||||
|
||||
>>> result = colorize(predicted_depth.cpu().squeeze().numpy())
|
||||
>>> Image.fromarray(result)
|
||||
```
|
||||
|
||||
|
||||
<Tip>
<p>In the <a href="https://github.com/isl-org/ZoeDepth/blob/edb6daf45458569e24f50250ef1ed08c015f17a7/zoedepth/models/depth_model.py#L131">original implementation</a>, the ZoeDepth model performs inference on both the original and flipped images and averages out the results. The <code>post_process_depth_estimation</code> function can handle this for us by passing the flipped outputs to the optional <code>outputs_flipped</code> argument:</p>
<pre><code class="language-Python">>>> with torch.no_grad():
...     outputs = model(pixel_values)
...     outputs_flipped = model(pixel_values=torch.flip(inputs.pixel_values, dims=[3]))
>>> post_processed_output = image_processor.post_process_depth_estimation(
...     outputs,
...     source_sizes=[(image.height, image.width)],
...     outputs_flipped=outputs_flipped,
... )
</code></pre>
</Tip>

<div class="flex justify-center">
  <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/depth-visualization-zoe.png" alt="Depth estimation visualization"/>
@@ -184,6 +184,15 @@ DETR の使用を開始するのに役立つ公式 Hugging Face およびコミ
- post_process_instance_segmentation
- post_process_panoptic_segmentation

## DetrImageProcessorFast

[[autodoc]] DetrImageProcessorFast
- preprocess
- post_process_object_detection
- post_process_semantic_segmentation
- post_process_instance_segmentation
- post_process_panoptic_segmentation
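
For reference, `DetrImageProcessorFast` is used the same way as `DetrImageProcessor`; a minimal sketch of object-detection post-processing with it (the checkpoint and threshold are illustrative choices):

```python
import torch
import requests
from PIL import Image
from transformers import DetrForObjectDetection, DetrImageProcessorFast

# Illustrative checkpoint; any DETR object-detection checkpoint works the same way.
processor = DetrImageProcessorFast.from_pretrained("facebook/detr-resnet-50")
model = DetrForObjectDetection.from_pretrained("facebook/detr-resnet-50")

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# Convert raw logits/boxes back to labeled boxes in the original image coordinates.
target_sizes = torch.tensor([image.size[::-1]])  # (height, width)
results = processor.post_process_object_detection(outputs, target_sizes=target_sizes, threshold=0.9)[0]
for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
    print(model.config.id2label[label.item()], round(score.item(), 3), box.tolist())
```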

## DetrFeatureExtractor

[[autodoc]] DetrFeatureExtractor
@ -308,6 +308,8 @@
|
||||
title: Trainer
|
||||
- local: deepspeed
|
||||
title: DeepSpeed
|
||||
- local: main_classes/executorch
|
||||
title: ExecuTorch
|
||||
- local: main_classes/feature_extractor
|
||||
title: 특성 추출기
|
||||
- local: in_translation
|
||||
@ -322,14 +324,14 @@
|
||||
title: BART
|
||||
- local: in_translation
|
||||
title: (번역중) BARThez
|
||||
- local: in_translation
|
||||
title: (번역중) BARTpho
|
||||
- local: model_doc/bartpho
|
||||
title: BARTpho
|
||||
- local: in_translation
|
||||
title: (번역중) BERT
|
||||
- local: in_translation
|
||||
title: (번역중) BertGeneration
|
||||
- local: in_translation
|
||||
title: (번역중) BertJapanese
|
||||
- local: model_doc/bert-japanese
|
||||
title: 일본어 Bert
|
||||
- local: model_doc/bertweet
|
||||
title: Bertweet
|
||||
- local: in_translation
|
||||
@ -400,6 +402,8 @@
|
||||
title: (번역중) Funnel Transformer
|
||||
- local: model_doc/gemma
|
||||
title: Gemma
|
||||
- local: model_doc/gemma2
|
||||
title: Gemma2
|
||||
- local: model_doc/openai-gpt
|
||||
title: GPT
|
||||
- local: in_translation
|
||||
@ -671,16 +675,21 @@
|
||||
- local: in_translation
|
||||
title: (번역중) XLSR-Wav2Vec2
|
||||
title: (번역중) 오디오 모델
|
||||
- isExpanded: false
|
||||
sections:
|
||||
- local: model_doc/vivit
|
||||
title: ViViT
|
||||
title: (번역중) 비디오 모델
|
||||
- isExpanded: false
|
||||
sections:
|
||||
- local: in_translation
|
||||
title: (번역중) ALIGN
|
||||
- local: in_translation
|
||||
title: (번역중) AltCLIP
|
||||
- local: model_doc/blip-2
|
||||
title: BLIP-2
|
||||
- local: model_doc/blip
|
||||
title: BLIP
|
||||
- local: in_translation
|
||||
title: (번역중) BLIP-2
|
||||
- local: in_translation
|
||||
title: (번역중) BridgeTower
|
||||
- local: model_doc/chameleon
|
||||
@ -783,8 +792,8 @@
|
||||
title: 파이프라인을 위한 유틸리티
|
||||
- local: internal/tokenization_utils
|
||||
title: 토크나이저를 위한 유틸리티
|
||||
- local: in_translation
|
||||
title: (번역중) Utilities for Trainer
|
||||
- local: internal/trainer_utils
|
||||
title: Trainer를 위한 유틸리티
|
||||
- local: internal/generation_utils
|
||||
title: 생성을 위한 유틸리티
|
||||
- local: internal/image_processing_utils
|
||||
|
||||
@ -113,7 +113,7 @@ python src/transformers/commands/transformers_cli.py env
|
||||
|
||||
🤗 Transformers에 기여하기 위해서는 기본적인 `git` 사용 능력이 필요합니다. `git`은 사용하기 쉬운 도구는 아니지만, 매우 훌륭한 매뉴얼을 제공합니다. 쉘(shell)에서 `git --help`을 입력하여 확인해보세요! 만약 책을 선호한다면, [Pro Git](https://git-scm.com/book/en/v2)은 매우 좋은 참고 자료가 될 것입니다.
|
||||
|
||||
🤗 Transformers에 기여하려면 **[Python 3.8](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** 이상의 버전이 필요합니다. 기여를 시작하려면 다음 단계를 따르세요:
|
||||
🤗 Transformers에 기여하려면 **[Python 3.9](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** 이상의 버전이 필요합니다. 기여를 시작하려면 다음 단계를 따르세요:
|
||||
|
||||
1. 저장소 페이지에서 **[Fork](https://github.com/huggingface/transformers/fork)** 버튼을 클릭하여 저장소를 포크하세요. 이렇게 하면 코드의 복사본이 여러분의 GitHub 사용자 계정 아래에 생성됩니다.
|
||||
|
||||
|
||||
49 docs/source/ko/internal/trainer_utils.md Normal file
@@ -0,0 +1,49 @@
|
||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Trainer를 위한 유틸리티 (Utilities for Trainer) [[utilities-for-trainer]]
|
||||
|
||||
이 페이지는 [`Trainer`]에서 사용되는 모든 유틸리티 함수들을 나열합니다.
|
||||
|
||||
이 함수들 대부분은 라이브러리에 있는 Trainer 코드를 자세히 알아보고 싶을 때만 유용합니다.
|
||||
|
||||
## 유틸리티 (Utilities) [[transformers.EvalPrediction]]
|
||||
|
||||
[[autodoc]] EvalPrediction
|
||||
|
||||
[[autodoc]] IntervalStrategy
|
||||
|
||||
[[autodoc]] enable_full_determinism
|
||||
|
||||
[[autodoc]] set_seed
|
||||
|
||||
[[autodoc]] torch_distributed_zero_first
|
||||
|
||||
## 콜백 내부 (Callbacks internals) [[transformers.trainer_callback.CallbackHandler]]
|
||||
|
||||
[[autodoc]] trainer_callback.CallbackHandler
|
||||
|
||||
## 분산 평가 (Distributed Evaluation) [[transformers.trainer_pt_utils.DistributedTensorGatherer]]
|
||||
|
||||
[[autodoc]] trainer_pt_utils.DistributedTensorGatherer
|
||||
|
||||
## Trainer 인자 파서 (Trainer Argument Parser) [[transformers.HfArgumentParser]]
|
||||
|
||||
[[autodoc]] HfArgumentParser
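
As a quick illustration of how [`HfArgumentParser`] is typically used (the dataclass below is a made-up, script-specific example):

```python
from dataclasses import dataclass, field

from transformers import HfArgumentParser, TrainingArguments


@dataclass
class ModelArguments:
    """Hypothetical script-specific arguments, parsed alongside TrainingArguments."""

    model_name_or_path: str = field(default="bert-base-uncased")
    freeze_encoder: bool = field(default=False)


parser = HfArgumentParser((ModelArguments, TrainingArguments))
# Reads sys.argv, e.g. `python train.py --output_dir out --freeze_encoder`.
model_args, training_args = parser.parse_args_into_dataclasses()
print(model_args.model_name_or_path, training_args.output_dir)
```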
|
||||
|
||||
## 디버그 유틸리티 (Debug Utilities) [[transformers.debug_utils.DebugUnderflowOverflow]]
|
||||
|
||||
[[autodoc]] debug_utils.DebugUnderflowOverflow
|
||||
33 docs/source/ko/main_classes/executorch.md Normal file
@@ -0,0 +1,33 @@
|
||||
<!--Copyright (c) Meta Platforms, Inc. and affiliates.
|
||||
All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
|
||||
# ExecuTorch [[executorch]]
|
||||
|
||||
[`ExecuTorch`](https://github.com/pytorch/executorch) 는 웨어러블, 임베디드 장치, 마이크로컨트롤러를 포함한 모바일 및 엣지 장치에서 온디바이스 추론 기능을 가능하게 하는 종합 솔루션입니다. PyTorch 생태계에 속해있으며, 이식성, 생산성, 성능에 중점을 둔 PyTorch 모델 배포를 지원합니다.
|
||||
|
||||
ExecuTorch는 백엔드 위임, 사용자 정의 컴파일러 변환, 메모리 계획 등 모델, 장치 또는 특정 유즈케이스 맞춤 최적화를 수행할 수 있는 진입점을 명확하게 정의합니다. ExecuTorch를 사용해 엣지 장치에서 PyTorch 모델을 실행하는 첫 번째 단계는 모델을 익스포트하는 것입니다. 이 작업은 PyTorch API인 [`torch.export`](https://pytorch.org/docs/stable/export.html)를 사용하여 수행합니다.
|
||||
|
||||
|
||||
## ExecuTorch 통합 [[transformers.TorchExportableModuleWithStaticCache]]
|
||||
|
||||
`torch.export`를 사용하여 🤗 Transformers를 익스포트 할 수 있도록 통합 지점이 개발되고 있습니다. 이 통합의 목표는 익스포트뿐만 아니라, 익스포트한 아티팩트가 `ExecuTorch`에서 효율적으로 실행될 수 있도록 더 축소하고 최적화하는 것입니다. 특히 모바일 및 엣지 유즈케이스에 중점을 두고 있습니다.
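
A minimal sketch of that export entry point, assuming a small decoder-only checkpoint and caching enabled with a static cache (the exact cache configuration options are documented in the API reference below):

```python
import torch
from transformers import AutoModelForCausalLM
from transformers.integrations.executorch import convert_and_export_with_cache

# Illustrative small checkpoint; the integration expects use_cache=True with a static cache.
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-135M", torch_dtype=torch.float32)
model.eval()
model.generation_config.use_cache = True
model.generation_config.cache_implementation = "static"

# torch.export-based conversion; the resulting ExportedProgram can then be lowered to ExecuTorch.
exported_program = convert_and_export_with_cache(model)
```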
|
||||
|
||||
[[autodoc]] integrations.executorch.TorchExportableModuleWithStaticCache
|
||||
- forward
|
||||
|
||||
[[autodoc]] integrations.executorch.convert_and_export_with_cache
|
||||
86 docs/source/ko/model_doc/bartpho.md Normal file
@@ -0,0 +1,86 @@
|
||||
<!--Copyright 2021 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# BARTpho [[bartpho]]
|
||||
|
||||
## 개요 [[overview]]
|
||||
|
||||
BARTpho 모델은 Nguyen Luong Tran, Duong Minh Le, Dat Quoc Nguyen에 의해 [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701)에서 제안되었습니다.
|
||||
|
||||
이 논문의 초록은 다음과 같습니다:
|
||||
|
||||
*우리는 BARTpho_word와 BARTpho_syllable의 두 가지 버전으로 BARTpho를 제시합니다.
|
||||
이는 베트남어를 위해 사전훈련된 최초의 대규모 단일 언어 시퀀스-투-시퀀스 모델입니다.
|
||||
우리의 BARTpho는 시퀀스-투-시퀀스 디노이징 모델인 BART의 "large" 아키텍처와 사전훈련 방식을 사용하여, 생성형 NLP 작업에 특히 적합합니다.
|
||||
베트남어 텍스트 요약의 다운스트림 작업 실험에서,
|
||||
자동 및 인간 평가 모두에서 BARTpho가 강력한 기준인 mBART를 능가하고 최신 성능을 개선했음을 보여줍니다.
|
||||
우리는 향후 연구 및 베트남어 생성형 NLP 작업의 응용을 촉진하기 위해 BARTpho를 공개합니다.*
|
||||
|
||||
이 모델은 [dqnguyen](https://huggingface.co/dqnguyen)이 기여했습니다. 원본 코드는 [여기](https://github.com/VinAIResearch/BARTpho)에서 찾을 수 있습니다.
|
||||
|
||||
## 사용 예시 [[usage-example]]
|
||||
|
||||
```python
|
||||
>>> import torch
|
||||
>>> from transformers import AutoModel, AutoTokenizer
|
||||
|
||||
>>> bartpho = AutoModel.from_pretrained("vinai/bartpho-syllable")
|
||||
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("vinai/bartpho-syllable")
|
||||
|
||||
>>> line = "Chúng tôi là những nghiên cứu viên."
|
||||
|
||||
>>> input_ids = tokenizer(line, return_tensors="pt")
|
||||
|
||||
>>> with torch.no_grad():
|
||||
... features = bartpho(**input_ids) # 이제 모델 출력은 튜플입니다
|
||||
|
||||
>>> # With TensorFlow 2.0+:
|
||||
>>> from transformers import TFAutoModel
|
||||
|
||||
>>> bartpho = TFAutoModel.from_pretrained("vinai/bartpho-syllable")
|
||||
>>> input_ids = tokenizer(line, return_tensors="tf")
|
||||
>>> features = bartpho(**input_ids)
|
||||
```
|
||||
|
||||
## 사용 팁 [[usage-tips]]
|
||||
|
||||
- mBART를 따르며, BARTpho는 BART의 "large" 아키텍처에 인코더와 디코더의 상단에 추가적인 레이어 정규화 레이어를 사용합니다.
|
||||
따라서 [BART 문서](bart)에 있는 사용 예시를 BARTpho에 맞게 적용하려면
|
||||
BART 전용 클래스를 mBART 전용 클래스로 대체하여 조정해야 합니다.
|
||||
예를 들어:
|
||||
|
||||
```python
|
||||
>>> from transformers import MBartForConditionalGeneration
|
||||
|
||||
>>> bartpho = MBartForConditionalGeneration.from_pretrained("vinai/bartpho-syllable")
|
||||
>>> TXT = "Chúng tôi là <mask> nghiên cứu viên."
|
||||
>>> input_ids = tokenizer([TXT], return_tensors="pt")["input_ids"]
|
||||
>>> logits = bartpho(input_ids).logits
|
||||
>>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
|
||||
>>> probs = logits[0, masked_index].softmax(dim=0)
|
||||
>>> values, predictions = probs.topk(5)
|
||||
>>> print(tokenizer.decode(predictions).split())
|
||||
```
|
||||
|
||||
- 이 구현은 토큰화만을 위한 것입니다: "monolingual_vocab_file"은 다국어
|
||||
XLM-RoBERTa에서 제공되는 사전훈련된 SentencePiece 모델
|
||||
"vocab_file"에서 추출된 베트남어 전용 유형으로 구성됩니다.
|
||||
다른 언어들도 이 사전훈련된 다국어 SentencePiece 모델 "vocab_file"을 하위 단어 분할에 사용하면, 자신의 언어 전용 "monolingual_vocab_file"과 함께 BartphoTokenizer를 재사용할 수 있습니다.
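
A hypothetical sketch of reusing the tokenizer class for another language; the file paths are placeholders for the shared multilingual SentencePiece model and your own monolingual vocabulary file:

```python
from transformers import BartphoTokenizer

# Placeholder paths: the multilingual SentencePiece model shared with XLM-RoBERTa
# and a language-specific vocabulary file that you provide yourself.
tokenizer = BartphoTokenizer(
    vocab_file="sentencepiece.bpe.model",
    monolingual_vocab_file="dict.my_language.txt",
)

print(tokenizer.tokenize("a sample sentence in your language"))
```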
|
||||
|
||||
## BartphoTokenizer [[bartphotokenizer]]
|
||||
|
||||
[[autodoc]] BartphoTokenizer
|
||||
79 docs/source/ko/model_doc/bert-japanese.md Normal file
@@ -0,0 +1,79 @@
|
||||
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# 일본어 BERT (BertJapanese) [[bertjapanese]]
|
||||
|
||||
## 개요 [[overview]]
|
||||
|
||||
일본어 문장에 학습된 BERT 모델 입니다.
|
||||
|
||||
각각 서로 다른 토큰화 방법을 사용하는 두 모델:
|
||||
|
||||
- MeCab와 WordPiece를 사용하여 토큰화합니다. 이를 위해 추가 의존성 [fugashi](https://github.com/polm/fugashi)이 필요합니다. (이는 [MeCab](https://taku910.github.io/mecab/)의 래퍼입니다.)
|
||||
- 문자 단위로 토큰화합니다.
|
||||
|
||||
*MecabTokenizer*를 사용하려면, 의존성을 설치하기 위해 `pip install transformers["ja"]` (또는 소스에서 설치하는 경우 `pip install -e .["ja"]`) 명령을 실행해야 합니다.
|
||||
|
||||
자세한 내용은 [cl-tohoku 리포지토리](https://github.com/cl-tohoku/bert-japanese)에서 확인하세요.
|
||||
|
||||
MeCab과 WordPiece 토큰화를 사용하는 모델 예시:
|
||||
|
||||
```python
|
||||
>>> import torch
|
||||
>>> from transformers import AutoModel, AutoTokenizer
|
||||
|
||||
>>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese")
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese")
|
||||
|
||||
>>> ## Input Japanese Text
|
||||
>>> line = "吾輩は猫である。"
|
||||
|
||||
>>> inputs = tokenizer(line, return_tensors="pt")
|
||||
|
||||
>>> print(tokenizer.decode(inputs["input_ids"][0]))
|
||||
[CLS] 吾輩 は 猫 で ある 。 [SEP]
|
||||
|
||||
>>> outputs = bertjapanese(**inputs)
|
||||
```
|
||||
|
||||
문자 토큰화를 사용하는 모델 예시:
|
||||
|
||||
```python
|
||||
>>> bertjapanese = AutoModel.from_pretrained("cl-tohoku/bert-base-japanese-char")
|
||||
>>> tokenizer = AutoTokenizer.from_pretrained("cl-tohoku/bert-base-japanese-char")
|
||||
|
||||
>>> ## Input Japanese Text
|
||||
>>> line = "吾輩は猫である。"
|
||||
|
||||
>>> inputs = tokenizer(line, return_tensors="pt")
|
||||
|
||||
>>> print(tokenizer.decode(inputs["input_ids"][0]))
|
||||
[CLS] 吾 輩 は 猫 で あ る 。 [SEP]
|
||||
|
||||
>>> outputs = bertjapanese(**inputs)
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
이는 토큰화 방법을 제외하고는 BERT와 동일합니다. API 참조 정보는 [BERT 문서](https://huggingface.co/docs/transformers/main/en/model_doc/bert)를 참조하세요.
|
||||
이 모델은 [cl-tohoku](https://huggingface.co/cl-tohoku)께서 기여하였습니다.
|
||||
|
||||
</Tip>
|
||||
|
||||
|
||||
## BertJapaneseTokenizer
|
||||
|
||||
[[autodoc]] BertJapaneseTokenizer
|
||||
98 docs/source/ko/model_doc/blip-2.md Normal file
@@ -0,0 +1,98 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# BLIP-2[[blip-2]]
|
||||
|
||||
## 개요[[overview]]
|
||||
BLIP-2 모델은 Junnan Li, Dongxu Li, Silvio Savarese, Steven Hoi의 [BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models](https://arxiv.org/abs/2301.12597) 논문에서 제안되었습니다. BLIP-2는 동결된 사전 학습 이미지 인코더와 대규모 언어 모델(LLM)을 연결하는 12층의 경량 Transformer 인코더를 학습시켜, 여러 비전-언어 작업에서 SOTA(현재 최고의 성능)을 달성했습니다. 특히, BLIP-2는 800억 개의 파라미터를 가진 Flamingo 모델보다 제로샷 VQAv2에서 8.7% 더 높은 성능을 기록했으며, 학습 가능한 파라미터 수는 Flamingo보다 54배 적습니다.
|
||||
|
||||
논문의 초록은 다음과 같습니다:
|
||||
|
||||
*비전-언어 사전 학습의 비용은 대규모 모델의 엔드-투-엔드 학습으로 인해 점점 더 부담스러워지고 있습니다. 본 논문은 사전 학습된 이미지 인코더와 대규모 언어 모델을 활용하여 비전-언어 사전 학습을 부트스트래핑하는 일반적이고 효율적인 사전 학습 전략인 BLIP-2를 제안합니다. BLIP-2는 경량화된 Querying Transformer를 통해 모달리티 간의 차이를 연결하며, 두 단계로 사전 학습됩니다. 첫 번째 단계는 동결된 이미지 인코더로부터 비전-언어 표현 학습을 부트스트래핑하고, 두 번째 단계는 동결된 언어 모델로부터 비전-언어 생성 학습을 부트스트래핑합니다. BLIP-2는 기존 방법들에 비해 훨씬 적은 학습 가능한 파라미터로 다양한 비전-언어 작업에서 최첨단 성능을 달성합니다. 예를 들어, 우리 모델은 제로샷 VQAv2에서 Flamingo80B보다 8.7% 높은 성능을 기록하며, 학습 가능한 파라미터 수는 54배 적습니다. 우리는 또한 자연어 명령을 따를 수 있는 제로샷 이미지-텍스트 생성의 새로운 기능을 입증했습니다.*
|
||||
|
||||
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/blip2_architecture.jpg"
|
||||
alt="drawing" width="600"/>
|
||||
|
||||
<small> BLIP-2 구조. <a href="https://arxiv.org/abs/2301.12597">원본 논문</a> 에서 발췌. </small>
|
||||
|
||||
이 모델은 [nielsr](https://huggingface.co/nielsr)가 기여했습니다. 원본 코드는 [여기](https://github.com/salesforce/LAVIS/tree/5ee63d688ba4cebff63acee04adaef2dee9af207)에서 확인할 수 있습니다.
|
||||
|
||||
## 사용 팁[[usage-tips]]
|
||||
|
||||
- BLIP-2는 이미지와 조건에 따라 텍스트 프롬프트를 입력받아 조건부 텍스트를 생성합니다. 추론 시 [`generate`] 메소드를 사용하는 것이 권장됩니다.
|
||||
- [`Blip2Processor`]를 사용하여 모델에 이미지를 준비하고, 예측된 토큰 ID를 텍스트로 디코딩할 수 있습니다.
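
A minimal sketch of that flow, using the OPT-2.7B checkpoint and an image-captioning setup as illustrative choices:

```python
import torch
import requests
from PIL import Image
from transformers import Blip2ForConditionalGeneration, Blip2Processor

processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
    "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16, device_map="auto"
)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)

# Image captioning: no text prompt; for VQA, also pass text="Question: ... Answer:".
inputs = processor(images=image, return_tensors="pt").to(model.device, torch.float16)
generated_ids = model.generate(**inputs, max_new_tokens=20)
print(processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip())
```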
|
||||
|
||||
## 자료[[resources]]
|
||||
|
||||
BLIP-2를 시작하는 데 도움이 되는 공식 Hugging Face 및 커뮤니티(🌎 표시) 자료 목록입니다.
|
||||
|
||||
- 이미지 캡셔닝, 시각 질문 응답(VQA), 채팅과 같은 대화형 작업을 위한 BLIP-2 데모 노트북은 [여기](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/BLIP-2)에서 찾을 수 있습니다.
|
||||
|
||||
리소스를 제출하여 여기에 포함하고 싶다면 언제든지 풀 리퀘스트를 열어주세요! 리소스는 기존 리소스를 복제하지 않고 새로운 내용이어야 합니다.
|
||||
|
||||
## Blip2Config[[transformers.Blip2Config]]
|
||||
|
||||
[[autodoc]] Blip2Config
|
||||
- from_vision_qformer_text_configs
|
||||
|
||||
## Blip2VisionConfig[[transformers.Blip2VisionConfig]]
|
||||
|
||||
[[autodoc]] Blip2VisionConfig
|
||||
|
||||
## Blip2QFormerConfig[[transformers.Blip2QFormerConfig]]
|
||||
|
||||
[[autodoc]] Blip2QFormerConfig
|
||||
|
||||
## Blip2Processor[[transformers.Blip2Processor]]
|
||||
|
||||
[[autodoc]] Blip2Processor
|
||||
|
||||
## Blip2VisionModel[[transformers.Blip2VisionModel]]
|
||||
|
||||
[[autodoc]] Blip2VisionModel
|
||||
- forward
|
||||
|
||||
## Blip2QFormerModel[[transformers.Blip2QFormerModel]]
|
||||
|
||||
[[autodoc]] Blip2QFormerModel
|
||||
- forward
|
||||
|
||||
## Blip2Model[[transformers.Blip2Model]]
|
||||
|
||||
[[autodoc]] Blip2Model
|
||||
- forward
|
||||
- get_text_features
|
||||
- get_image_features
|
||||
- get_qformer_features
|
||||
|
||||
## Blip2ForConditionalGeneration[[transformers.Blip2ForConditionalGeneration]]
|
||||
|
||||
[[autodoc]] Blip2ForConditionalGeneration
|
||||
- forward
|
||||
- generate
|
||||
|
||||
## Blip2ForImageTextRetrieval[[transformers.Blip2ForImageTextRetrieval]]
|
||||
|
||||
[[autodoc]] Blip2ForImageTextRetrieval
|
||||
- forward
|
||||
|
||||
## Blip2TextModelWithProjection[[transformers.Blip2TextModelWithProjection]]
|
||||
|
||||
[[autodoc]] Blip2TextModelWithProjection
|
||||
|
||||
## Blip2VisionModelWithProjection[[transformers.Blip2VisionModelWithProjection]]
|
||||
|
||||
[[autodoc]] Blip2VisionModelWithProjection
|
||||
63 docs/source/ko/model_doc/gemma2.md Normal file
@@ -0,0 +1,63 @@
|
||||
|
||||
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Gemma2 [[gemma2]]
|
||||
|
||||
## 개요 [[overview]]
|
||||
|
||||
Gemma2 모델은 Google의 Gemma2 팀이 작성한 [Gemma2: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/google-gemma-2/)에서 제안되었습니다.
|
||||
파라미터 크기가 각각 90억(9B)과 270억(27B)인 두 가지 Gemma2 모델이 출시되었습니다.
|
||||
|
||||
블로그 게시물의 초록은 다음과 같습니다:
|
||||
|
||||
*이제 우리는 전 세계의 연구자와 개발자들에게 Gemma 2를 공식적으로 출시합니다. 90억(9B)과 270억(27B) 파라미터 크기로 제공되는 Gemma 2는 1세대보다 더 높은 성능과 추론 효율성을 제공하며, 상당한 안전성 향상을 포함하고 있습니다. 사실 270억 규모의 모델은 크기가 두 배 이상인 모델과 비교해도 경쟁력 있는 대안을 제공하며, 이는 작년 12월까지만 해도 독점 모델에서만 가능했던 성능을 제공합니다.*
|
||||
|
||||
팁:
|
||||
|
||||
- 원본 체크포인트는 변환 스크립트 `src/transformers/models/Gemma2/convert_Gemma2_weights_to_hf.py`를 사용하여 변환할 수 있습니다.
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
- Gemma2는 매 두 번째 레이어마다 슬라이딩 윈도우 어텐션을 사용하므로 [`~DynamicCache`] 또는 텐서의 튜플과 같은 일반적인 kv 캐싱에는 적합하지 않습니다. Gemma2의 forward 호출에서 캐싱을 활성화하려면 [`~HybridCache`] 인스턴스를 초기화하고 이를 `past_key_values`로 forward 호출에 전달해야 합니다. 또한 `past_key_values`에 이미 이전의 키와 값이 포함되어 있다면 `cache_position`도 준비해야 합니다.
|
||||
|
||||
</Tip>
|
||||
|
||||
이 모델은 [Arthur Zucker](https://huggingface.co/ArthurZ), [Pedro Cuenca](https://huggingface.co/pcuenq), [Tom Arsen]()이 기여했습니다.
|
||||
|
||||
## Gemma2Config [[transformers.Gemma2Config]]
|
||||
|
||||
[[autodoc]] Gemma2Config
|
||||
|
||||
## Gemma2Model [[transformers.Gemma2Model]]
|
||||
|
||||
[[autodoc]] Gemma2Model
|
||||
- forward
|
||||
|
||||
## Gemma2ForCausalLM [[transformers.Gemma2ForCausalLM]]
|
||||
|
||||
[[autodoc]] Gemma2ForCausalLM
|
||||
- forward
|
||||
|
||||
## Gemma2ForSequenceClassification [[transformers.Gemma2ForSequenceClassification]]
|
||||
|
||||
[[autodoc]] Gemma2ForSequenceClassification
|
||||
- forward
|
||||
|
||||
## Gemma2ForTokenClassification [[transformers.Gemma2ForTokenClassification]]
|
||||
|
||||
[[autodoc]] Gemma2ForTokenClassification
|
||||
- forward
|
||||
42 docs/source/ko/model_doc/vivit.md Normal file
@@ -0,0 +1,42 @@
|
||||
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
-->
|
||||
|
||||
# Video Vision Transformer (ViViT) [[video-vision-transformer-vivit]]
|
||||
|
||||
## 개요 [[overview]]
|
||||
|
||||
Vivit 모델은 Anurag Arnab, Mostafa Dehghani, Georg Heigold, Chen Sun, Mario Lučić, Cordelia Schmid가 제안한 논문 [ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691)에서 소개되었습니다. 이 논문은 비디오 이해를 위한 pure-transformer 기반의 모델 집합 중에서 최초로 성공한 모델 중 하나를 소개합니다.
|
||||
|
||||
논문의 초록은 다음과 같습니다:
|
||||
|
||||
*우리는 이미지 분류에서 최근 성공을 거둔 순수 트랜스포머 기반 모델을 바탕으로 비디오 분류를 위한 모델을 제안합니다. 본 모델은 입력 비디오로부터 시공간 토큰을 추출한 후, 이를 일련의 트랜스포머 레이어로 인코딩합니다. 비디오에서 발생하는 긴 토큰 시퀀스를 처리하기 위해, 입력의 공간 및 시간 차원을 분리하는 여러 효율적인 모델 변형을 제안합니다. 트랜스포머 기반 모델은 대규모 학습 데이터셋에서만 효과적이라는 것이 일반적이지만, 우리는 학습 중 모델을 효과적으로 정규화하고, 사전 학습된 이미지 모델을 활용함으로써 상대적으로 작은 데이터셋에서도 학습할 수 있는 방법을 보여줍니다. 또한, 철저한 소거(ablation) 연구를 수행하고 Kinetics 400 및 600, Epic Kitchens, Something-Something v2, Moments in Time을 포함한 여러 비디오 분류 벤치마크에서 최첨단 성과를 달성하여, 기존의 3D 합성곱 신경망 기반 방법들을 능가합니다.*
|
||||
|
||||
이 모델은 [jegormeister](https://huggingface.co/jegormeister)가 기여하였습니다. 원본 코드(JAX로 작성됨)는 [여기](https://github.com/google-research/scenic/tree/main/scenic/projects/vivit)에서 확인할 수 있습니다.
|
||||
|
||||
## VivitConfig [[transformers.VivitConfig]]
|
||||
|
||||
[[autodoc]] VivitConfig
|
||||
|
||||
## VivitImageProcessor [[transformers.VivitImageProcessor]]
|
||||
|
||||
[[autodoc]] VivitImageProcessor
|
||||
- preprocess
|
||||
|
||||
## VivitModel [[transformers.VivitModel]]
|
||||
|
||||
[[autodoc]] VivitModel
|
||||
- forward
|
||||
|
||||
## VivitForVideoClassification [[transformers.VivitForVideoClassification]]
|
||||
|
||||
[[autodoc]] transformers.VivitForVideoClassification
|
||||
- forward
|
||||
@ -50,6 +50,8 @@
|
||||
title: 导出为 TFLite
|
||||
- local: torchscript
|
||||
title: 导出为 TorchScript
|
||||
- local: gguf
|
||||
title: 与 GGUF 格式的互操作性
|
||||
title: 开发者指南
|
||||
- sections:
|
||||
- local: performance
|
||||
|
||||
@ -112,7 +112,7 @@ python src/transformers/commands/transformers_cli.py env
|
||||
|
||||
要为 🤗 Transformers 做贡献,你需要基本的 `git` 使用技能。虽然 `git` 不是一个很容易使用的工具,但它提供了非常全面的手册,在命令行中输入 `git --help` 并享受吧!如果你更喜欢书籍,[Pro Git](https://git-scm.com/book/en/v2)是一本很好的参考书。
|
||||
|
||||
要为 🤗 Transformers 做贡献,你需要 **[Python 3.8](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** 或更高版本。请按照以下步骤开始贡献:
|
||||
要为 🤗 Transformers 做贡献,你需要 **[Python 3.9](https://github.com/huggingface/transformers/blob/main/setup.py#L426)** 或更高版本。请按照以下步骤开始贡献:
|
||||
|
||||
1. 点击[仓库](https://github.com/huggingface/transformers)页面上的 **[Fork](https://github.com/huggingface/transformers/fork)** 按钮,这会在你的 GitHub 账号下拷贝一份代码。
|
||||
|
||||
|
||||
104 docs/source/zh/gguf.md Normal file
@@ -0,0 +1,104 @@
|
||||
<!--
|
||||
Copyright 2023 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
-->
|
||||
|
||||
# GGUF 和 Transformers 的交互
|
||||
|
||||
GGUF文件格式用于存储模型,以便通过[GGML](https://github.com/ggerganov/ggml)和其他依赖它的库进行推理,例如非常流行的[llama.cpp](https://github.com/ggerganov/llama.cpp)或[whisper.cpp](https://github.com/ggerganov/whisper.cpp)。
|
||||
|
||||
该文件格式[由抱抱脸支持](https://huggingface.co/docs/hub/en/gguf),可用于快速检查文件中张量和元数据。
|
||||
|
||||
该文件格式是一种“单文件格式”,通常单个文件就包含了配置属性、分词器词汇表和其他属性,同时还有模型中要加载的所有张量。这些文件根据文件的量化类型有不同的格式。我们在[这里](https://huggingface.co/docs/hub/en/gguf#quantization-types)进行了简要介绍。
|
||||
|
||||
## 在 Transformers 中的支持
|
||||
|
||||
我们在 transformers 中添加了加载 gguf 文件的功能,这样可以对 GGUF 模型进行进一步的训练或微调,然后再将模型转换回 GGUF 格式,以便在 ggml 生态系统中使用。加载模型时,我们首先将其反量化为 FP32,然后再加载权重以在 PyTorch 中使用。
|
||||
|
||||
> [!注意]
|
||||
> 目前这个功能还处于探索阶段,欢迎大家贡献力量,以便在不同量化类型和模型架构之间更好地完善这一功能。
|
||||
|
||||
目前,支持的模型架构和量化类型如下:
|
||||
|
||||
### 支持的量化类型
|
||||
|
||||
根据分享在 Hub 上的较为热门的量化文件,初步支持以下量化类型:
|
||||
|
||||
- F32
|
||||
- F16
|
||||
- BF16
|
||||
- Q4_0
|
||||
- Q4_1
|
||||
- Q5_0
|
||||
- Q5_1
|
||||
- Q8_0
|
||||
- Q2_K
|
||||
- Q3_K
|
||||
- Q4_K
|
||||
- Q5_K
|
||||
- Q6_K
|
||||
- IQ1_S
|
||||
- IQ1_M
|
||||
- IQ2_XXS
|
||||
- IQ2_XS
|
||||
- IQ2_S
|
||||
- IQ3_XXS
|
||||
- IQ3_S
|
||||
- IQ4_XS
|
||||
- IQ4_NL
|
||||
|
||||
> [!注意]
|
||||
> 为了支持 gguf 反量化,需要安装 `gguf>=0.10.0`。
|
||||
|
||||
### 支持的模型架构
|
||||
|
||||
目前支持以下在 Hub 上非常热门的模型架构:
|
||||
|
||||
- LLaMa
|
||||
- Mistral
|
||||
- Qwen2
|
||||
- Qwen2Moe
|
||||
- Phi3
|
||||
- Bloom
|
||||
- Falcon
|
||||
- StableLM
|
||||
- GPT2
|
||||
- Starcoder2
|
||||
|
||||
## 使用示例
|
||||
|
||||
为了在`transformers`中加载`gguf`文件,你需要在 `from_pretrained`方法中为分词器和模型指定 `gguf_file`参数。下面是从同一个文件中加载分词器和模型的示例:
|
||||
|
||||
```py
|
||||
from transformers import AutoTokenizer, AutoModelForCausalLM
|
||||
|
||||
model_id = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
|
||||
filename = "tinyllama-1.1b-chat-v1.0.Q6_K.gguf"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_id, gguf_file=filename)
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, gguf_file=filename)
|
||||
```
|
||||
|
||||
现在,你就已经可以结合 PyTorch 生态系统中的一系列其他工具,来使用完整的、未量化的模型了。
|
||||
|
||||
为了将模型转换回`gguf`文件,我们建议使用`llama.cpp`中的[`convert-hf-to-gguf.py`文件](https://github.com/ggerganov/llama.cpp/blob/master/convert_hf_to_gguf.py)。
|
||||
|
||||
以下是如何补充上面的脚本,以保存模型并将其导出回 `gguf`的示例:
|
||||
|
||||
```py
|
||||
tokenizer.save_pretrained('directory')
|
||||
model.save_pretrained('directory')
|
||||
|
||||
!python ${path_to_llama_cpp}/convert-hf-to-gguf.py ${directory}
|
||||
```
|
||||
@ -61,7 +61,7 @@ from transformers.utils import check_min_version, send_example_telemetry
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
Array = Any
|
||||
Dataset = datasets.arrow_dataset.Dataset
|
||||
|
||||
@ -60,7 +60,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt")
|
||||
|
||||
|
||||
@ -56,7 +56,7 @@ from transformers.utils import check_min_version, send_example_telemetry
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
Array = Any
|
||||
Dataset = datasets.arrow_dataset.Dataset
|
||||
|
||||
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
|
||||
|
||||
|
||||
546 examples/modular-transformers/modeling_new_task_model.py Normal file
@@ -0,0 +1,546 @@
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
# This file was automatically generated from examples/modular-transformers/modular_new_task_model.py.
|
||||
# Do NOT edit this file manually as any edits will be overwritten by the generation of
|
||||
# the file from the modular. If any change should be done, please apply the change to the
|
||||
# modular_new_task_model.py file directly. One of our CI enforces this.
|
||||
# 🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
|
||||
from dataclasses import dataclass
|
||||
from typing import ClassVar, List, Optional, Tuple, Union
|
||||
|
||||
import torch
|
||||
import torch.utils.checkpoint
|
||||
from torch import nn
|
||||
|
||||
from ...cache_utils import Cache, StaticCache
|
||||
from ...generation import GenerationMixin
|
||||
from ...modeling_utils import PreTrainedModel
|
||||
from ...utils import (
|
||||
ModelOutput,
|
||||
add_start_docstrings,
|
||||
add_start_docstrings_to_model_forward,
|
||||
is_flash_attn_2_available,
|
||||
logging,
|
||||
replace_return_docstrings,
|
||||
)
|
||||
from .configuration_new_task_model import NewTaskModelConfig
|
||||
|
||||
|
||||
if is_flash_attn_2_available():
|
||||
from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input # noqa
|
||||
|
||||
from ..auto import AutoModel, AutoModelForCausalLM
|
||||
|
||||
|
||||
logger = logging.get_logger(__name__)
|
||||
|
||||
_CONFIG_FOR_DOC = "NewTaskModelConfig"
|
||||
|
||||
|
||||
# Adapted from transformers.models.llama.modeling_llama.LlamaModel._prepare_4d_causal_attention_mask_with_cache_position
|
||||
# But NewTaskModel has no causal mask on prefix
|
||||
def _prepare_4d_causal_attention_mask_with_cache_position(
|
||||
attention_mask: torch.Tensor,
|
||||
sequence_length: int,
|
||||
target_length: int,
|
||||
dtype: torch.dtype,
|
||||
device: torch.device,
|
||||
min_dtype: float,
|
||||
cache_position: torch.Tensor,
|
||||
batch_size: int,
|
||||
is_training: bool = False,
|
||||
token_type_ids: torch.Tensor = None,
|
||||
):
|
||||
"""
|
||||
Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
|
||||
`(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
|
||||
|
||||
Args:
|
||||
attention_mask (`torch.Tensor`):
|
||||
A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
|
||||
sequence_length (`int`):
|
||||
The sequence length being processed.
|
||||
target_length (`int`):
|
||||
The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
|
||||
dtype (`torch.dtype`):
|
||||
The dtype to use for the 4D attention mask.
|
||||
device (`torch.device`):
|
||||
The device to place the 4D attention mask on.
|
||||
min_dtype (`float`):
|
||||
The minimum value representable with the dtype `dtype`.
|
||||
cache_position (`torch.Tensor`):
|
||||
Indices depicting the position of the input sequence tokens in the sequence.
|
||||
batch_size (`torch.Tensor`):
|
||||
Batch size.
|
||||
is_training (`bool`):
|
||||
Whether the model is in training mode or in inference. The condition is checked by presence/absence of `token_type_ids/labels`
|
||||
"""
|
||||
if attention_mask is not None and attention_mask.dim() == 4:
|
||||
# In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
|
||||
causal_mask = attention_mask
|
||||
else:
|
||||
causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
|
||||
# Causal diagonal mask only if training, otherwise attend to the whole prefix. Training-specific attn for prefix is handled below
|
||||
if sequence_length != 1:
|
||||
if is_training:
|
||||
causal_mask = torch.triu(causal_mask, diagonal=1)
|
||||
else:
|
||||
causal_mask[:, :sequence_length] = 0.0
|
||||
|
||||
causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
|
||||
causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
|
||||
if attention_mask is not None:
|
||||
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
|
||||
mask_length = attention_mask.shape[-1]
|
||||
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
|
||||
padding_mask = padding_mask == 0
|
||||
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
|
||||
padding_mask, min_dtype
|
||||
)
|
||||
# we are training thus we need to create a full mask on the image + prefix but causal on suffix
|
||||
if is_training:
|
||||
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
|
||||
token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0
|
||||
)
|
||||
return causal_mask
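For orientation, here is a small hypothetical usage sketch of the mask builder above with toy shapes (a prefill step, no padding, inference mode); it is not part of the generated file and only illustrates the returned shape and semantics:

```python
import torch

batch_size, seq_len, target_len = 2, 4, 8
dtype = torch.float32
attention_mask = torch.ones(batch_size, target_len, dtype=torch.long)  # no padding
cache_position = torch.arange(seq_len)  # prefill covers positions 0..3 of the static cache

mask = _prepare_4d_causal_attention_mask_with_cache_position(
    attention_mask,
    sequence_length=seq_len,
    target_length=target_len,
    dtype=dtype,
    device=torch.device("cpu"),
    min_dtype=torch.finfo(dtype).min,
    cache_position=cache_position,
    batch_size=batch_size,
)
print(mask.shape)  # torch.Size([2, 1, 4, 8]); at inference the whole prefix is visible (no causal triangle)
```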
|
||||
|
||||
|
||||
@dataclass
|
||||
class NewTaskModelCausalLMOutputWithPast(ModelOutput):
|
||||
"""
|
||||
Base class for NewTaskModel causal language model (or autoregressive) outputs.
|
||||
|
||||
Args:
|
||||
loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
|
||||
Language modeling loss (for next-token prediction).
|
||||
logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
|
||||
Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`)
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
|
||||
`past_key_values` input) to speed up sequential decoding.
|
||||
hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
|
||||
Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
|
||||
one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
|
||||
|
||||
Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
|
||||
attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
|
||||
Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
|
||||
sequence_length)`.
|
||||
|
||||
Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
|
||||
heads.
|
||||
image_hidden_states (`torch.FloatTensor`, *optional*):
|
||||
A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
|
||||
image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
|
||||
"""
|
||||
|
||||
loss: Optional[torch.FloatTensor] = None
|
||||
logits: torch.FloatTensor = None
|
||||
past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None
|
||||
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
|
||||
attentions: Optional[Tuple[torch.FloatTensor]] = None
|
||||
image_hidden_states: Optional[torch.FloatTensor] = None
|
||||
|
||||
|
||||
class NewTaskModelMultiModalProjector(nn.Module):
|
||||
def __init__(self, config: NewTaskModelConfig):
|
||||
super().__init__()
|
||||
self.linear = nn.Linear(config.vision_config.hidden_size, config.vision_config.projection_dim, bias=True)
|
||||
|
||||
def forward(self, image_features):
|
||||
hidden_states = self.linear(image_features)
|
||||
|
||||
return hidden_states
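As a quick, self-contained illustration of what the projector does (the hidden sizes below are made-up placeholders, not the real config values), it is just a linear map from vision features into the language model's embedding space:

```python
import torch
from torch import nn

# Stand-in for NewTaskModelMultiModalProjector with assumed sizes:
# vision hidden_size = 1152 -> projection_dim = 2048.
projector = nn.Linear(1152, 2048, bias=True)
image_features = torch.randn(1, 256, 1152)  # (batch, num_image_tokens, vision_hidden_size)
print(projector(image_features).shape)      # torch.Size([1, 256, 2048])
```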
|
||||
|
||||
|
||||
NEW_TASK_MODEL_START_DOCSTRING = r"""
|
||||
This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
|
||||
library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
|
||||
etc.)
|
||||
|
||||
This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
|
||||
Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
|
||||
and behavior.
|
||||
|
||||
Parameters:
|
||||
config ([`NewTaskModelConfig`] or [`NewTaskModelVisionConfig`]):
|
||||
Model configuration class with all the parameters of the model. Initializing with a config file does not
|
||||
load the weights associated with the model, only the configuration. Check out the
|
||||
[`~PreTrainedModel.from_pretrained`] method to load the model weights.
|
||||
"""
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"The bare LLaMA Model outputting raw hidden-states without any specific head on top.",
|
||||
NEW_TASK_MODEL_START_DOCSTRING,
|
||||
)
|
||||
class NewTaskModelPreTrainedModel(PreTrainedModel):
|
||||
config_class = NewTaskModelConfig
|
||||
base_model_prefix = "model"
|
||||
supports_gradient_checkpointing = True
|
||||
_no_split_modules = ["NewTaskModelMultiModalProjector"]
|
||||
_skip_keys_device_placement = "past_key_values"
|
||||
_supports_flash_attn_2 = False
|
||||
_supports_cache_class = True
|
||||
_supports_quantized_cache = True
|
||||
_supports_static_cache = True
|
||||
_supports_sdpa = True
|
||||
|
||||
|
||||
def _init_weights(self, module):
|
||||
# important: this ported version of NewTaskModel isn't meant for training from scratch - only
|
||||
# inference and fine-tuning
|
||||
std = (
|
||||
self.config.initializer_range
|
||||
if hasattr(self.config, "initializer_range")
|
||||
else self.config.text_config.initializer_range
|
||||
)
|
||||
|
||||
if hasattr(module, "class_embedding"):
|
||||
module.class_embedding.data.normal_(mean=0.0, std=std)
|
||||
|
||||
if isinstance(module, (nn.Linear, nn.Conv2d)):
|
||||
module.weight.data.normal_(mean=0.0, std=std)
|
||||
if module.bias is not None:
|
||||
module.bias.data.zero_()
|
||||
elif isinstance(module, nn.Embedding):
|
||||
module.weight.data.normal_(mean=0.0, std=std)
|
||||
if module.padding_idx is not None:
|
||||
module.weight.data[module.padding_idx].zero_()
|
||||
|
||||
@property
|
||||
def _supports_sdpa(self):
|
||||
"""
|
||||
Retrieve language_model's attribute to check whether the model supports
|
||||
SDPA or not.
|
||||
"""
|
||||
return self.language_model._supports_sdpa
|
||||
|
||||
|
||||
NEW_TASK_MODEL_INPUTS_DOCSTRING = r"""
|
||||
Args:
|
||||
input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
|
||||
Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
|
||||
it.
|
||||
|
||||
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
||||
[`PreTrainedTokenizer.__call__`] for details.
|
||||
|
||||
[What are input IDs?](../glossary#input-ids)
|
||||
pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
|
||||
The tensors corresponding to the input images. Pixel values can be obtained using
|
||||
[`AutoImageProcessor`]. See [`SiglipImageProcessor.__call__`] for details ([`NewTaskModelProcessor`] uses
|
||||
[`SiglipImageProcessor`] for processing images).
|
||||
attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
|
||||
|
||||
- 1 for tokens that are **not masked**,
|
||||
- 0 for tokens that are **masked**.
|
||||
|
||||
[What are attention masks?](../glossary#attention-mask)
|
||||
|
||||
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
|
||||
[`PreTrainedTokenizer.__call__`] for details.
|
||||
|
||||
If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
|
||||
`past_key_values`).
|
||||
|
||||
If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
|
||||
and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
|
||||
information on the default strategy.
|
||||
|
||||
- 1 indicates the head is **not masked**,
|
||||
- 0 indicates the head is **masked**.
|
||||
position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
|
||||
config.n_positions - 1]`. [What are position IDs?](../glossary#position-ids)
|
||||
past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
|
||||
Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
|
||||
`(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
|
||||
`(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
|
||||
|
||||
Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
|
||||
blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
|
||||
|
||||
If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
|
||||
don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
|
||||
`decoder_input_ids` of shape `(batch_size, sequence_length)`.
|
||||
inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
|
||||
Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
|
||||
is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
|
||||
model's internal embedding lookup matrix.
|
||||
use_cache (`bool`, *optional*):
|
||||
If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
|
||||
`past_key_values`).
|
||||
output_attentions (`bool`, *optional*):
|
||||
Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
|
||||
tensors for more detail.
|
||||
output_hidden_states (`bool`, *optional*):
|
||||
Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
|
||||
more detail.
|
||||
return_dict (`bool`, *optional*):
|
||||
Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
|
||||
cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
|
||||
Indices depicting the position of the input sequence tokens in the sequence. Contrarily to `position_ids`,
|
||||
this tensor is not affected by padding. It is used to update the cache in the correct position and to infer
|
||||
the complete sequence length.
|
||||
"""
|
||||
|
||||
|
||||
@add_start_docstrings(
|
||||
"""The NEW_TASK_MODEL model which consists of a vision backbone and a language model.""",
|
||||
NEW_TASK_MODEL_START_DOCSTRING,
|
||||
)
|
||||
class NewTaskModelForNewTask(NewTaskModelPreTrainedModel, GenerationMixin):
|
||||
main_input_name: ClassVar[str] = "doc_input_ids" # transformers-related
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config)
|
||||
self.vision_tower = AutoModel.from_config(config=config.vision_config)
|
||||
self.multi_modal_projector = NewTaskModelMultiModalProjector(config)
|
||||
self.vocab_size = config.text_config.vocab_size
|
||||
self._attn_implementation = config._attn_implementation
|
||||
|
||||
language_model = AutoModelForCausalLM.from_config(
|
||||
config=config.text_config, attn_implementation=self._attn_implementation
|
||||
)
|
||||
|
||||
if language_model._tied_weights_keys is not None:
|
||||
self._tied_weights_keys = [f"language_model.{k}" for k in language_model._tied_weights_keys]
|
||||
self.language_model = language_model
|
||||
|
||||
self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
|
||||
|
||||
self.embedding_dim = self.config.embedding_dim
|
||||
self.custom_text_proj = nn.Linear(self.config.text_config.hidden_size, self.embedding_dim)
|
||||
|
||||
if self.language_model._tied_weights_keys is not None:
|
||||
self._tied_weights_keys = [f"model.language_model.{k}" for k in self.language_model._tied_weights_keys]
|
||||
self.post_init()
|
||||
|
||||
def get_input_embeddings(self):
|
||||
return self.language_model.get_input_embeddings()
|
||||
|
||||
def set_input_embeddings(self, value):
|
||||
self.language_model.set_input_embeddings(value)
|
||||
|
||||
def get_output_embeddings(self):
|
||||
return self.language_model.get_output_embeddings()
|
||||
|
||||
def set_output_embeddings(self, new_embeddings):
|
||||
self.language_model.set_output_embeddings(new_embeddings)
|
||||
|
||||
def set_decoder(self, decoder):
|
||||
self.language_model.set_decoder(decoder)
|
||||
|
||||
def get_decoder(self):
|
||||
return self.language_model.get_decoder()
|
||||
|
||||
def tie_weights(self):
|
||||
return self.language_model.tie_weights()
|
||||
|
||||
def _update_causal_mask(
|
||||
self, attention_mask, token_type_ids, inputs_embeds, past_key_values, cache_position, is_training: bool = False
|
||||
):
|
||||
using_static_cache = isinstance(past_key_values, StaticCache)
|
||||
dtype = inputs_embeds.dtype
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
sequence_length = inputs_embeds.shape[1]
|
||||
if using_static_cache:
|
||||
target_length = past_key_values.get_max_length()
|
||||
else:
|
||||
target_length = (
|
||||
attention_mask.shape[-1]
|
||||
if isinstance(attention_mask, torch.Tensor)
|
||||
else cache_position[0] + sequence_length + 1
|
||||
)
|
||||
|
||||
if attention_mask is not None and attention_mask.dim() == 4:
|
||||
# In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
|
||||
return attention_mask
|
||||
|
||||
causal_mask = torch.full(
|
||||
(sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
|
||||
)
|
||||
# Causal diagonal mask only if training, otherwise attend to the whole prefix. Training-specific attn for prefix is handled below
|
||||
if sequence_length != 1:
|
||||
if is_training:
|
||||
causal_mask = torch.triu(causal_mask, diagonal=1)
|
||||
else:
|
||||
causal_mask[:, :sequence_length] = 0.0
|
||||
|
||||
causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
|
||||
causal_mask = causal_mask[None, None, :, :].expand(inputs_embeds.shape[0], 1, -1, -1)
|
||||
if attention_mask is not None:
|
||||
causal_mask = causal_mask.clone() # copy to contiguous memory for in-place edit
|
||||
mask_length = attention_mask.shape[-1]
|
||||
padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
|
||||
padding_mask = padding_mask == 0
|
||||
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
|
||||
padding_mask, min_dtype
|
||||
)
|
||||
# we are training thus we need to create a full mask on the image + prefix but causal on suffix
|
||||
if is_training:
|
||||
causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
|
||||
token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0
|
||||
)
|
||||
return causal_mask
|
||||
|
||||
@add_start_docstrings_to_model_forward(NEW_TASK_MODEL_INPUTS_DOCSTRING)
|
||||
@replace_return_docstrings(output_type=NewTaskModelCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.LongTensor = None,
|
||||
pixel_values: torch.FloatTensor = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None,
|
||||
token_type_ids: Optional[torch.LongTensor] = None,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
labels: Optional[torch.LongTensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
num_logits_to_keep: int = 0,
|
||||
) -> Union[Tuple, NewTaskModelCausalLMOutputWithPast]:
|
||||
r"""
|
||||
Args:
|
||||
labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
|
||||
Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
|
||||
config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
|
||||
(masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
|
||||
|
||||
num_logits_to_keep (`int`, *optional*):
|
||||
Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
|
||||
`input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
|
||||
token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
|
||||
|
||||
Returns:
|
||||
|
||||
Example:
|
||||
|
||||
```python
|
||||
>>> from PIL import Image
|
||||
>>> import requests
|
||||
>>> from transformers import AutoProcessor, NewTaskModelForNewTask
|
||||
|
||||
>>> model = NewTaskModelForNewTask.from_pretrained("google/NewTaskModel-test-224px-hf")
|
||||
>>> processor = AutoProcessor.from_pretrained("google/NewTaskModel-test-224px-hf")
|
||||
|
||||
>>> prompt = "answer en Where is the cow standing?"
|
||||
>>> url = "https://huggingface.co/gv-hf/NewTaskModel-test-224px-hf/resolve/main/cow_beach_1.png"
|
||||
>>> image = Image.open(requests.get(url, stream=True).raw)
|
||||
|
||||
>>> inputs = processor(images=image, text=prompt, return_tensors="pt")
|
||||
|
||||
>>> # Generate
|
||||
>>> generate_ids = model.generate(**inputs, max_length=30)
|
||||
>>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
|
||||
"answer en Where is the cow standing?\nbeach"
|
||||
```
|
||||
|
||||
"""
|
||||
vlm_outputs = super().forward(
|
||||
input_ids=input_ids,
|
||||
pixel_values=pixel_values,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
token_type_ids=token_type_ids,
|
||||
cache_position=cache_position,
|
||||
inputs_embeds=inputs_embeds,
|
||||
labels=labels,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=True,
|
||||
return_dict=True,
|
||||
num_logits_to_keep=num_logits_to_keep,
|
||||
)
|
||||
last_hidden_states = vlm_outputs.hidden_states[-1] # (batch_size, sequence_length, hidden_size)
|
||||
proj = self.custom_text_proj(last_hidden_states) # (batch_size, sequence_length, dim)
|
||||
|
||||
# L2 normalization
|
||||
embeddings = proj / proj.norm(dim=-1, keepdim=True) # (batch_size, sequence_length, dim)
|
||||
|
||||
embeddings = embeddings * attention_mask.unsqueeze(-1) # (batch_size, sequence_length, dim)
|
||||
|
||||
return (embeddings,) + vlm_outputs
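Since the forward pass above returns L2-normalized per-token embeddings rather than logits, a typical downstream use (sketched here hypothetically with random tensors; this is not code from the PR) is ColBERT-style late-interaction scoring between a query and a document page:

```python
import torch
import torch.nn.functional as F

def late_interaction_score(query_emb: torch.Tensor, doc_emb: torch.Tensor) -> torch.Tensor:
    """MaxSim: for each query token, take its best-matching document token, then sum."""
    # query_emb: (num_query_tokens, dim), doc_emb: (num_doc_tokens, dim), both L2-normalized
    similarity = query_emb @ doc_emb.T             # (num_query_tokens, num_doc_tokens)
    return similarity.max(dim=-1).values.sum()

query_emb = F.normalize(torch.randn(16, 128), dim=-1)
doc_emb = F.normalize(torch.randn(1030, 128), dim=-1)
print(late_interaction_score(query_emb, doc_emb))
```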
|
||||
|
||||
def prepare_inputs_for_generation(
|
||||
self,
|
||||
input_ids,
|
||||
past_key_values=None,
|
||||
inputs_embeds=None,
|
||||
cache_position=None,
|
||||
position_ids=None,
|
||||
pixel_values=None,
|
||||
attention_mask=None,
|
||||
token_type_ids=None,
|
||||
use_cache=True,
|
||||
num_logits_to_keep=None,
|
||||
**kwargs,
|
||||
):
|
||||
model_inputs = self.language_model.prepare_inputs_for_generation(
|
||||
input_ids,
|
||||
past_key_values=past_key_values,
|
||||
inputs_embeds=inputs_embeds,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
cache_position=cache_position,
|
||||
use_cache=use_cache,
|
||||
num_logits_to_keep=num_logits_to_keep,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
|
||||
if model_inputs["inputs_embeds"] is not None:
|
||||
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
|
||||
device = model_inputs["inputs_embeds"].device
|
||||
else:
|
||||
batch_size, sequence_length = model_inputs["input_ids"].shape
|
||||
device = model_inputs["input_ids"].device
|
||||
|
||||
dtype = self.get_output_embeddings().weight.dtype
|
||||
min_dtype = torch.finfo(dtype).min
|
||||
|
||||
model_inputs["attention_mask"] = _prepare_4d_causal_attention_mask_with_cache_position(
|
||||
attention_mask,
|
||||
sequence_length=sequence_length,
|
||||
target_length=past_key_values.get_max_length(),
|
||||
dtype=dtype,
|
||||
device=device,
|
||||
min_dtype=min_dtype,
|
||||
cache_position=cache_position,
|
||||
batch_size=batch_size,
|
||||
)
|
||||
|
||||
model_inputs["token_type_ids"] = token_type_ids
|
||||
|
||||
# position_ids in NewTaskModel are 1-indexed
|
||||
if model_inputs.get("position_ids") is not None:
|
||||
model_inputs["position_ids"] += 1
|
||||
|
||||
# If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
|
||||
# Otherwise we need pixel values to be passed to model. NOTE: use_cache=False needs pixel_values always
|
||||
if cache_position[0] == 0:
|
||||
model_inputs["pixel_values"] = pixel_values
|
||||
|
||||
return model_inputs
|
||||
|
||||
def resize_token_embeddings(
|
||||
self,
|
||||
new_num_tokens: Optional[int] = None,
|
||||
pad_to_multiple_of=None,
|
||||
) -> nn.Embedding:
|
||||
model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
|
||||
|
||||
# Update vocab size
|
||||
self.config.text_config.vocab_size = model_embeds.num_embeddings
|
||||
self.config.vocab_size = model_embeds.num_embeddings
|
||||
self.vocab_size = model_embeds.num_embeddings
|
||||
|
||||
return model_embeds
|
||||
examples/modular-transformers/modular_new_task_model.py (new file, 84 lines)
@@ -0,0 +1,84 @@
|
||||
from typing import ClassVar, List, Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.utils.checkpoint
|
||||
from torch import nn
|
||||
|
||||
from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration
|
||||
|
||||
from ...cache_utils import Cache
|
||||
|
||||
|
||||
class NewTaskModelForNewTask(PaliGemmaForConditionalGeneration):
|
||||
main_input_name: ClassVar[str] = "doc_input_ids" # transformers-related
|
||||
|
||||
def __init__(self, config):
|
||||
super().__init__(config=config)
|
||||
|
||||
self.embedding_dim = self.config.embedding_dim
|
||||
self.custom_text_proj = nn.Linear(self.config.text_config.hidden_size, self.embedding_dim)
|
||||
|
||||
if self.language_model._tied_weights_keys is not None:
|
||||
self._tied_weights_keys = [f"model.language_model.{k}" for k in self.language_model._tied_weights_keys]
|
||||
|
||||
self.post_init()
|
||||
|
||||
def forward(
|
||||
self,
|
||||
input_ids: torch.LongTensor = None,
|
||||
pixel_values: torch.FloatTensor = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
position_ids: Optional[torch.LongTensor] = None,
|
||||
past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None,
|
||||
token_type_ids: Optional[torch.LongTensor] = None,
|
||||
cache_position: Optional[torch.LongTensor] = None,
|
||||
inputs_embeds: Optional[torch.FloatTensor] = None,
|
||||
labels: Optional[torch.LongTensor] = None,
|
||||
use_cache: Optional[bool] = None,
|
||||
output_attentions: Optional[bool] = None,
|
||||
output_hidden_states: Optional[bool] = None,
|
||||
return_dict: Optional[bool] = None,
|
||||
num_logits_to_keep: int = 0,
|
||||
):
|
||||
r"""
|
||||
Returns:
|
||||
"""
|
||||
vlm_outputs = super().forward(
|
||||
input_ids=input_ids,
|
||||
pixel_values=pixel_values,
|
||||
attention_mask=attention_mask,
|
||||
position_ids=position_ids,
|
||||
past_key_values=past_key_values,
|
||||
token_type_ids=token_type_ids,
|
||||
cache_position=cache_position,
|
||||
inputs_embeds=inputs_embeds,
|
||||
labels=labels,
|
||||
use_cache=use_cache,
|
||||
output_attentions=output_attentions,
|
||||
output_hidden_states=True,
|
||||
return_dict=True,
|
||||
num_logits_to_keep=num_logits_to_keep,
|
||||
)
|
||||
last_hidden_states = vlm_outputs.hidden_states[-1] # (batch_size, sequence_length, hidden_size)
|
||||
proj = self.custom_text_proj(last_hidden_states) # (batch_size, sequence_length, dim)
|
||||
|
||||
# L2 normalization
|
||||
embeddings = proj / proj.norm(dim=-1, keepdim=True) # (batch_size, sequence_length, dim)
|
||||
|
||||
embeddings = embeddings * attention_mask.unsqueeze(-1) # (batch_size, sequence_length, dim)
|
||||
|
||||
return (embeddings,) + vlm_outputs
|
||||
|
||||
def resize_token_embeddings(
|
||||
self,
|
||||
new_num_tokens: Optional[int] = None,
|
||||
pad_to_multiple_of=None,
|
||||
) -> nn.Embedding:
|
||||
model_embeds = self.language_model.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
|
||||
|
||||
# Update vocab size
|
||||
self.config.text_config.vocab_size = model_embeds.num_embeddings
|
||||
self.config.vocab_size = model_embeds.num_embeddings
|
||||
self.vocab_size = model_embeds.num_embeddings
|
||||
|
||||
return model_embeds
|
||||
@ -45,7 +45,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt")
|
||||
|
||||
|
||||
@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt")
|
||||
|
||||
|
||||
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")
|
||||
|
||||
|
||||
@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@ -43,7 +43,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")
|
||||
|
||||
|
||||
@ -48,7 +48,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")
|
||||
|
||||
|
||||
@ -53,7 +53,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")
|
||||
|
||||
|
||||
@ -46,7 +46,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")
|
||||
|
||||
|
||||
@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")
|
||||
|
||||
|
||||
@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@ -58,7 +58,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
|
||||
@ -60,7 +60,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
|
||||
|
||||
|
||||
@ -47,7 +47,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@ -56,7 +56,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
# You should update this to your particular problem to have better documentation of `model_type`
|
||||
|
||||
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt")
|
||||
|
||||
|
||||
@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
logging.basicConfig(level=logging.INFO)
|
||||
logger = get_logger(__name__)
|
||||
|
||||
@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||
|
||||
|
||||
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||
|
||||
|
||||
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||
|
||||
|
||||
@ -46,7 +46,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")
|
||||
|
||||
|
||||
@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt")
|
||||
|
||||
|
||||
@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
|
||||
|
||||
|
||||
@ -53,7 +53,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
|
||||
|
||||
|
||||
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")
|
||||
|
||||
|
||||
@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
|
||||
|
||||
|
||||
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
|
||||
|
||||
@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
|
||||
|
||||
|
||||
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
|
||||
|
||||
|
||||
@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
|
||||
|
||||
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")
|
||||
|
||||
|
||||
@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
|
||||
|
||||
@ -366,7 +366,7 @@ def main():
|
||||
)
|
||||
|
||||
tokenizer_name_or_path = model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path
|
||||
if config.model_type in {"bloom", "gpt2", "roberta"}:
|
||||
if config.model_type in {"bloom", "gpt2", "roberta", "deberta"}:
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
tokenizer_name_or_path,
|
||||
cache_dir=model_args.cache_dir,
|
||||
|
||||
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
|
||||
|
||||
@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")
|
||||
|
||||
|
||||
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
|
||||
|
||||
|
||||
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
|
||||
check_min_version("4.46.0.dev0")
|
||||
check_min_version("4.47.0.dev0")
|
||||
|
||||
logger = get_logger(__name__)
|
||||
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")
|
||||
|
||||
examples/research_projects/synthid_text/README.md (new file, 34 lines)
@@ -0,0 +1,34 @@
# SynthID Text

This project showcases the use of SynthID Text for watermarking LLM outputs. The code in this repo also
demonstrates training the detector that recognizes such watermarked text. The trained detector can be uploaded to
a private HF Hub repo (private for security reasons) and loaded again via pretrained model loading, as also shown in this script.

See our blog post: https://huggingface.co/blog/synthid-text


## Python version

You need Python 3.9 to run this example.

## Installation and running

Once you have installed transformers, install the requirements for this project from the requirements.txt provided in this folder.

```
pip install -r requirements.txt
```

## To run the detector training

```
python detector_training.py --model_name=google/gemma-7b-it
```

Check the script for more tunable parameters, and see the paper at
https://www.nature.com/articles/s41586-024-08025-4 for more information on these parameters.

## Caveat

Make sure to run the detector training and the detection on the same hardware
(CPU, GPU, or TPU) to get consistent results (we use deterministic randomness, which is hardware dependent).
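As a hedged illustration (not part of this project's files), a detector trained and uploaded by `detector_training.py` could later be reconstructed for detection roughly as follows; `your-org/synthid-detector` is a placeholder repo name, and the construction mirrors the end of `detector_training.py`:

```python
import torch
from transformers import (
    AutoTokenizer,
    BayesianDetectorModel,
    SynthIDTextWatermarkDetector,
    SynthIDTextWatermarkLogitsProcessor,
)

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Placeholder repo name; the saved detector carries the base model name and watermarking config.
detector_module = BayesianDetectorModel.from_pretrained("your-org/synthid-detector").to(device)
logits_processor = SynthIDTextWatermarkLogitsProcessor(
    **detector_module.config.watermarking_config, device=device
)
tokenizer = AutoTokenizer.from_pretrained(detector_module.config.model_name)
tokenizer.pad_token = tokenizer.eos_token

detector = SynthIDTextWatermarkDetector(detector_module, logits_processor, tokenizer)
# `detector` can now be applied to generated token ids, as done at the end of detector_training.py.
```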
|
||||
examples/research_projects/synthid_text/detector_training.py (new file, 502 lines)
@@ -0,0 +1,502 @@
|
||||
# coding=utf-8
|
||||
# Copyright 2024 Google DeepMind.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import argparse
|
||||
import dataclasses
|
||||
import enum
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from transformers import (
|
||||
AutoModelForCausalLM,
|
||||
AutoTokenizer,
|
||||
BayesianDetectorConfig,
|
||||
BayesianDetectorModel,
|
||||
SynthIDTextWatermarkDetector,
|
||||
SynthIDTextWatermarkingConfig,
|
||||
SynthIDTextWatermarkLogitsProcessor,
|
||||
)
|
||||
from utils import (
|
||||
get_tokenized_uwm_outputs,
|
||||
get_tokenized_wm_outputs,
|
||||
process_raw_model_outputs,
|
||||
update_fn_if_fpr_tpr,
|
||||
upload_model_to_hf,
|
||||
)
|
||||
|
||||
|
||||
@enum.unique
|
||||
class ValidationMetric(enum.Enum):
|
||||
"""Direction along the z-axis."""
|
||||
|
||||
TPR_AT_FPR = "tpr_at_fpr"
|
||||
CROSS_ENTROPY = "cross_entropy"
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
class TrainingArguments:
|
||||
"""Training arguments pertaining to the training loop itself."""
|
||||
|
||||
eval_metric: Optional[str] = dataclasses.field(
|
||||
default=ValidationMetric.TPR_AT_FPR, metadata={"help": "The evaluation metric used."}
|
||||
)
|
||||
|
||||
|
||||
def train_detector(
|
||||
detector: torch.nn.Module,
|
||||
g_values: torch.Tensor,
|
||||
mask: torch.Tensor,
|
||||
watermarked: torch.Tensor,
|
||||
epochs: int = 250,
|
||||
learning_rate: float = 1e-3,
|
||||
minibatch_size: int = 64,
|
||||
seed: int = 0,
|
||||
l2_weight: float = 0.0,
|
||||
shuffle: bool = True,
|
||||
g_values_val: Optional[torch.Tensor] = None,
|
||||
mask_val: Optional[torch.Tensor] = None,
|
||||
watermarked_val: Optional[torch.Tensor] = None,
|
||||
verbose: bool = False,
|
||||
validation_metric: ValidationMetric = ValidationMetric.TPR_AT_FPR,
|
||||
) -> Tuple[Dict[str, Any], float]:
|
||||
"""Trains a Bayesian detector model.
|
||||
|
||||
Args:
|
||||
g_values: g-values of shape [num_train, seq_len, watermarking_depth].
|
||||
mask: A binary array shape [num_train, seq_len] indicating which g-values
|
||||
should be used. g-values with mask value 0 are discarded.
|
||||
watermarked: A binary array of shape [num_train] indicating whether the
|
||||
example is watermarked (0: unwatermarked, 1: watermarked).
|
||||
epochs: Number of epochs to train for.
|
||||
learning_rate: Learning rate for optimizer.
|
||||
minibatch_size: Minibatch size for training. Note that a minibatch
|
||||
requires ~ 32 * minibatch_size * seq_len * watermarking_depth *
watermarking_depth bits of memory.
|
||||
seed: Seed for parameter initialization.
|
||||
l2_weight: Weight to apply to L2 regularization for delta parameters.
|
||||
shuffle: Whether to shuffle before training.
|
||||
g_values_val: Validation g-values of shape [num_val, seq_len,
|
||||
watermarking_depth].
|
||||
mask_val: Validation mask of shape [num_val, seq_len].
|
||||
watermarked_val: Validation watermark labels of shape [num_val].
|
||||
verbose: Boolean indicating verbosity of training. If true, the loss will
|
||||
be printed. Defaulted to False.
|
||||
validation_metric: Whether to use TPR@FPR=1% or cross-entropy loss as the validation metric.
|
||||
|
||||
Returns:
|
||||
Tuple of
|
||||
training_history: Training history keyed by epoch number where the
|
||||
values are
|
||||
dictionaries containing the loss, validation loss, and model
|
||||
parameters,
|
||||
keyed by
|
||||
'loss', 'val_loss', and 'params', respectively.
|
||||
min_val_loss: Minimum validation loss achieved during training.
|
||||
"""
|
||||
|
||||
# Set the random seed for reproducibility
|
||||
torch.manual_seed(seed)
|
||||
|
||||
# Shuffle the data if required
|
||||
if shuffle:
|
||||
indices = torch.randperm(len(g_values))
|
||||
g_values = g_values[indices]
|
||||
mask = mask[indices]
|
||||
watermarked = watermarked[indices]
|
||||
|
||||
# Initialize optimizer
|
||||
optimizer = torch.optim.Adam(detector.parameters(), lr=learning_rate)
|
||||
history = {}
|
||||
min_val_loss = float("inf")
|
||||
|
||||
for epoch in range(epochs):
|
||||
losses = []
|
||||
detector.train()
|
||||
num_batches = len(g_values) // minibatch_size
|
||||
for i in range(0, len(g_values), minibatch_size):
|
||||
end = i + minibatch_size
|
||||
if end > len(g_values):
|
||||
break
|
||||
loss_batch_weight = l2_weight / num_batches
|
||||
|
||||
optimizer.zero_grad()
|
||||
loss = detector(
|
||||
g_values=g_values[i:end],
|
||||
mask=mask[i:end],
|
||||
labels=watermarked[i:end],
|
||||
loss_batch_weight=loss_batch_weight,
|
||||
)[1]
|
||||
loss.backward()
|
||||
optimizer.step()
|
||||
losses.append(loss.item())
|
||||
train_loss = sum(losses) / len(losses)
|
||||
|
||||
val_losses = []
val_loss = None
|
||||
if g_values_val is not None:
|
||||
detector.eval()
|
||||
if validation_metric == ValidationMetric.TPR_AT_FPR:
|
||||
val_loss = update_fn_if_fpr_tpr(
|
||||
detector,
|
||||
g_values_val,
|
||||
mask_val,
|
||||
watermarked_val,
|
||||
minibatch_size=minibatch_size,
|
||||
)
|
||||
else:
|
||||
for i in range(0, len(g_values_val), minibatch_size):
|
||||
end = i + minibatch_size
|
||||
if end > len(g_values_val):
|
||||
break
|
||||
with torch.no_grad():
|
||||
v_loss = detector(
|
||||
g_values=g_values_val[i:end],
|
||||
mask=mask_val[i:end],
|
||||
labels=watermarked_val[i:end],
|
||||
loss_batch_weight=0,
|
||||
)[1]
|
||||
val_losses.append(v_loss.item())
|
||||
val_loss = sum(val_losses) / len(val_losses)
|
||||
|
||||
# Store training history
|
||||
history[epoch + 1] = {"loss": train_loss, "val_loss": val_loss}
|
||||
if verbose:
|
||||
if val_loss is not None:
|
||||
print(f"Epoch {epoch}: loss {loss} (train), {val_loss} (val)")
|
||||
else:
|
||||
print(f"Epoch {epoch}: loss {loss} (train)")
|
||||
|
||||
if val_loss is not None and val_loss < min_val_loss:
|
||||
min_val_loss = val_loss
|
||||
best_val_epoch = epoch
|
||||
|
||||
if verbose:
|
||||
print(f"Best val Epoch: {best_val_epoch}, min_val_loss: {min_val_loss}")
|
||||
|
||||
return history, min_val_loss
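For orientation, the following is a minimal smoke test of `train_detector` on random tensors (shapes are assumed, and cross-entropy validation is used so no extra utilities are needed); it only checks that the training loop runs, not that anything meaningful is learned:

```python
import torch
from transformers import BayesianDetectorConfig, BayesianDetectorModel

depth = 30
detector = BayesianDetectorModel(BayesianDetectorConfig(watermarking_depth=depth))
g_values = torch.rand(128, 50, depth)         # [num_train, seq_len, watermarking_depth]
mask = torch.ones(128, 50)                    # keep every g-value
labels = torch.randint(0, 2, (128,)).float()  # 1 = watermarked, 0 = unwatermarked

history, best_val = train_detector(
    detector,
    g_values,
    mask,
    labels,
    epochs=2,
    g_values_val=torch.rand(64, 50, depth),
    mask_val=torch.ones(64, 50),
    watermarked_val=torch.randint(0, 2, (64,)).float(),
    validation_metric=ValidationMetric.CROSS_ENTROPY,
)
print(best_val)
```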
|
||||
|
||||
|
||||
def train_best_detector(
|
||||
tokenized_wm_outputs: Union[List[np.ndarray], np.ndarray],
|
||||
tokenized_uwm_outputs: Union[List[np.ndarray], np.ndarray],
|
||||
logits_processor: SynthIDTextWatermarkLogitsProcessor,
|
||||
tokenizer: Any,
|
||||
torch_device: torch.device,
|
||||
test_size: float = 0.3,
|
||||
pos_truncation_length: Optional[int] = 200,
|
||||
neg_truncation_length: Optional[int] = 100,
|
||||
max_padded_length: int = 2300,
|
||||
n_epochs: int = 50,
|
||||
learning_rate: float = 2.1e-2,
|
||||
l2_weights: np.ndarray = np.logspace(-3, -2, num=4),
|
||||
verbose: bool = False,
|
||||
validation_metric: ValidationMetric = ValidationMetric.TPR_AT_FPR,
|
||||
):
|
||||
"""Train and return the best detector given range of hyperparameters.
|
||||
|
||||
In practice, we have found that tuning pos_truncation_length,
|
||||
neg_truncation_length, n_epochs, learning_rate and l2_weights can help
|
||||
improve the performance of the detector. We recommend tuning these
|
||||
parameters for your data.
|
||||
"""
|
||||
l2_weights = list(l2_weights)
|
||||
|
||||
(
|
||||
train_g_values,
|
||||
train_masks,
|
||||
train_labels,
|
||||
cv_g_values,
|
||||
cv_masks,
|
||||
cv_labels,
|
||||
) = process_raw_model_outputs(
|
||||
logits_processor,
|
||||
tokenizer,
|
||||
pos_truncation_length,
|
||||
neg_truncation_length,
|
||||
max_padded_length,
|
||||
tokenized_wm_outputs,
|
||||
test_size,
|
||||
tokenized_uwm_outputs,
|
||||
torch_device,
|
||||
)
|
||||
|
||||
best_detector = None
|
||||
lowest_loss = float("inf")
|
||||
val_losses = []
|
||||
for l2_weight in l2_weights:
|
||||
config = BayesianDetectorConfig(watermarking_depth=len(logits_processor.keys))
|
||||
detector = BayesianDetectorModel(config).to(torch_device)
|
||||
_, min_val_loss = train_detector(
|
||||
detector=detector,
|
||||
g_values=train_g_values,
|
||||
mask=train_masks,
|
||||
watermarked=train_labels,
|
||||
g_values_val=cv_g_values,
|
||||
mask_val=cv_masks,
|
||||
watermarked_val=cv_labels,
|
||||
learning_rate=learning_rate,
|
||||
l2_weight=l2_weight,
|
||||
epochs=n_epochs,
|
||||
verbose=verbose,
|
||||
validation_metric=validation_metric,
|
||||
)
|
||||
val_losses.append(min_val_loss)
|
||||
if min_val_loss < lowest_loss:
|
||||
lowest_loss = min_val_loss
|
||||
best_detector = detector
|
||||
return best_detector, lowest_loss
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument(
|
||||
"--model_name",
|
||||
type=str,
|
||||
default="google/gemma-2b-it",
|
||||
help=("LM model to train the detector for."),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--temperature",
|
||||
type=float,
|
||||
default=1.0,
|
||||
help=("Temperature to sample from the model."),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--top_k",
|
||||
type=int,
|
||||
default=40,
|
||||
help=("Top K for sampling."),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--top_p",
|
||||
type=float,
|
||||
default=1.0,
|
||||
help=("Top P for sampling."),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num_negatives",
|
||||
type=int,
|
||||
default=10000,
|
||||
help=("Number of negatives for detector training."),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--pos_batch_size",
|
||||
type=int,
|
||||
default=32,
|
||||
help=("Batch size of watermarked positives while sampling."),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--num_pos_batch",
|
||||
type=int,
|
||||
default=313,
|
||||
help=("Number of positive batches for training."),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--generation_length",
|
||||
type=int,
|
||||
default=512,
|
||||
help=("Generation length for sampling."),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--save_model_to_hf_hub",
|
||||
action="store_true",
|
||||
help=("Whether to save the trained model HF hub. By default it will be a private repo."),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--load_from_hf_hub",
|
||||
action="store_true",
|
||||
help=(
|
||||
"Whether to load trained detector model from HF Hub, make sure its the model trained on the same model "
|
||||
"we are loading in the script."
|
||||
),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--hf_hub_model_name",
|
||||
type=str,
|
||||
default=None,
|
||||
help=("HF hub model name for loading of saving the model."),
|
||||
)
|
||||
parser.add_argument(
|
||||
"--eval_detector_on_prompts",
|
||||
action="store_true",
|
||||
help=("Evaluate detector on a prompt and print probability of watermark."),
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
model_name = args.model_name
|
||||
temperature = args.temperature
|
||||
top_k = args.top_k
|
||||
top_p = args.top_p
|
||||
num_negatives = args.num_negatives
|
||||
pos_batch_size = args.pos_batch_size
|
||||
num_pos_batch = args.num_pos_batch
|
||||
if num_pos_batch < 10:
|
||||
raise ValueError("--num_pos_batch should be greater than 10.")
|
||||
generation_length = args.generation_length
|
||||
save_model_to_hf_hub = args.save_model_to_hf_hub
|
||||
load_from_hf_hub = args.load_from_hf_hub
|
||||
repo_name = args.hf_hub_model_name
|
||||
eval_detector_on_prompts = args.eval_detector_on_prompts
|
||||
|
||||
NEG_BATCH_SIZE = 32
|
||||
|
||||
# Truncate outputs to this length for training.
|
||||
POS_TRUNCATION_LENGTH = 200
|
||||
NEG_TRUNCATION_LENGTH = 100
|
||||
# Pad truncated outputs to this length for equal shape across all batches.
|
||||
MAX_PADDED_LENGTH = 1000
|
||||
|
||||
DEVICE = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
|
||||
if DEVICE.type not in ("cuda", "tpu"):
|
||||
raise ValueError("We have found the training stable on GPU and TPU, we are working on" " a fix for CPUs")
|
||||
|
model = None
if not load_from_hf_hub:
    # Change this to make your watermark unique. Check the documentation in the paper to understand the
    # impact of these parameters.
    DEFAULT_WATERMARKING_CONFIG = {
        "ngram_len": 5,  # This corresponds to H=4 context window size in the paper.
        "keys": [
            654,
            400,
            836,
            123,
            340,
            443,
            597,
            160,
            57,
            29,
            590,
            639,
            13,
            715,
            468,
            990,
            966,
            226,
            324,
            585,
            118,
            504,
            421,
            521,
            129,
            669,
            732,
            225,
            90,
            960,
        ],
        "sampling_table_size": 2**16,
        "sampling_table_seed": 0,
        "context_history_size": 1024,
    }
    watermark_config = SynthIDTextWatermarkingConfig(**DEFAULT_WATERMARKING_CONFIG)

    model = AutoModelForCausalLM.from_pretrained(model_name).to(DEVICE)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token

    logits_processor = SynthIDTextWatermarkLogitsProcessor(**DEFAULT_WATERMARKING_CONFIG, device=DEVICE)
    tokenized_wm_outputs = get_tokenized_wm_outputs(
        model,
        tokenizer,
        watermark_config,
        num_pos_batch,
        pos_batch_size,
        temperature,
        generation_length,
        top_k,
        top_p,
        DEVICE,
    )
    tokenized_uwm_outputs = get_tokenized_uwm_outputs(num_negatives, NEG_BATCH_SIZE, tokenizer, DEVICE)

    best_detector, lowest_loss = train_best_detector(
        tokenized_wm_outputs=tokenized_wm_outputs,
        tokenized_uwm_outputs=tokenized_uwm_outputs,
        logits_processor=logits_processor,
        tokenizer=tokenizer,
        torch_device=DEVICE,
        test_size=0.3,
        pos_truncation_length=POS_TRUNCATION_LENGTH,
        neg_truncation_length=NEG_TRUNCATION_LENGTH,
        max_padded_length=MAX_PADDED_LENGTH,
        n_epochs=100,
        learning_rate=3e-3,
        l2_weights=[0],
        verbose=True,
        validation_metric=ValidationMetric.TPR_AT_FPR,
    )
else:
    if repo_name is None:
        raise ValueError("When loading a pretrained detector, --hf_hub_model_name cannot be None.")
    best_detector = BayesianDetectorModel.from_pretrained(repo_name).to(DEVICE)

best_detector.config.set_detector_information(
    model_name=model_name, watermarking_config=DEFAULT_WATERMARKING_CONFIG
)
if save_model_to_hf_hub:
    upload_model_to_hf(best_detector, repo_name)

# Evaluate a model response with the detector.
if eval_detector_on_prompts:
    model_name = best_detector.config.model_name
    watermark_config_dict = best_detector.config.watermarking_config
    logits_processor = SynthIDTextWatermarkLogitsProcessor(**watermark_config_dict, device=DEVICE)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    tokenizer.pad_token = tokenizer.eos_token
    synthid_text_detector = SynthIDTextWatermarkDetector(best_detector, logits_processor, tokenizer)

    if model is None:
        model = AutoModelForCausalLM.from_pretrained(model_name).to(DEVICE)
    watermarking_config = SynthIDTextWatermarkingConfig(**watermark_config_dict)

    prompts = ["Write an essay on cats."]
    inputs = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
    ).to(DEVICE)

    _, inputs_len = inputs["input_ids"].shape

    outputs = model.generate(
        **inputs,
        watermarking_config=watermarking_config,
        do_sample=True,
        max_length=inputs_len + generation_length,
        temperature=temperature,
        top_k=40,
        top_p=1.0,
    )
    outputs = outputs[:, inputs_len:]
    result = synthid_text_detector(outputs)

    # Set these thresholds based on your expected FPR (false positive rate) and TPR (true positive rate).
    # Check our demo at HF Spaces for more info.
    upper_threshold = 0.95
    lower_threshold = 0.12
    if result[0][0] > upper_threshold:
        print("The text is watermarked.")
    elif lower_threshold < result[0][0] < upper_threshold:
        print("It is hard to determine if the text is watermarked or not.")
    else:
        print("The text is not watermarked.")
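
The block above ends the detector training script. As an editor's illustration (not part of this diff), here is one way the script could be launched end to end; the filename detector_training.py and the Hub repo id are assumptions, and the flags simply mirror the argparse options defined above.

import subprocess

# Editor's sketch: launch the detector training run with explicit flags.
# "detector_training.py" and the repo id below are placeholders/assumptions.
subprocess.run(
    [
        "python",
        "detector_training.py",
        "--model_name=google/gemma-2b-it",
        "--num_negatives=10000",
        "--pos_batch_size=32",
        "--num_pos_batch=313",
        "--generation_length=512",
        "--save_model_to_hf_hub",
        "--hf_hub_model_name=<your-username>/gemma-2b-it-synthid-detector",
    ],
    check=True,
)

With --save_model_to_hf_hub set, the trained detector is pushed to a private repo that can later be reused with --load_from_hf_hub.
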
5
examples/research_projects/synthid_text/requirements.txt
Normal file
@ -0,0 +1,5 @@
tensorflow-datasets>=4.9.3
torch >= 1.3
datasets
scikit-learn
tensorflow

408
examples/research_projects/synthid_text/utils.py
Normal file
@ -0,0 +1,408 @@
# coding=utf-8
# Copyright 2024 Google DeepMind.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import gc
from typing import Any, List, Optional, Tuple

import datasets
import numpy as np
import tensorflow as tf
import tensorflow_datasets as tfds
import torch
import tqdm
from huggingface_hub import HfApi, create_repo
from huggingface_hub.utils import RepositoryNotFoundError
from sklearn import model_selection

import transformers

def pad_to_len(
    arr: torch.Tensor,
    target_len: int,
    left_pad: bool,
    eos_token: int,
    device: torch.device,
) -> torch.Tensor:
    """Pad or truncate array to the given length."""
    if arr.shape[1] < target_len:
        shape_for_ones = list(arr.shape)
        shape_for_ones[1] = target_len - shape_for_ones[1]
        padded = (
            torch.ones(
                shape_for_ones,
                device=device,
                dtype=torch.long,
            )
            * eos_token
        )
        if not left_pad:
            arr = torch.concatenate((arr, padded), dim=1)
        else:
            arr = torch.concatenate((padded, arr), dim=1)
    else:
        arr = arr[:, :target_len]
    return arr

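A quick illustration of pad_to_len (editor's sketch; the token ids below are arbitrary): a batch shorter than target_len is padded with the EOS id on the chosen side, and a longer one is truncated.

import torch

arr = torch.tensor([[5, 7, 9]])
cpu = torch.device("cpu")
# Right-pad to length 6 with EOS id 2 -> tensor([[5, 7, 9, 2, 2, 2]])
print(pad_to_len(arr, target_len=6, left_pad=False, eos_token=2, device=cpu))
# Truncate to length 2 -> tensor([[5, 7]])
print(pad_to_len(arr, target_len=2, left_pad=False, eos_token=2, device=cpu))
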
def filter_and_truncate(
    outputs: torch.Tensor,
    truncation_length: Optional[int],
    eos_token_mask: torch.Tensor,
) -> torch.Tensor:
    """Filter and truncate outputs to the given length.

    Args:
        outputs: output tensor of shape [batch_size, output_len]
        truncation_length: Length to truncate the final output.
        eos_token_mask: EOS token mask of shape [batch_size, output_len]

    Returns:
        output tensor of shape [batch_size, truncation_length].
    """
    if truncation_length:
        outputs = outputs[:, :truncation_length]
        truncation_mask = torch.sum(eos_token_mask, dim=1) >= truncation_length
        return outputs[truncation_mask, :]
    return outputs

def process_outputs_for_training(
    all_outputs: List[torch.Tensor],
    logits_processor: transformers.generation.SynthIDTextWatermarkLogitsProcessor,
    tokenizer: Any,
    pos_truncation_length: Optional[int],
    neg_truncation_length: Optional[int],
    max_length: int,
    is_cv: bool,
    is_pos: bool,
    torch_device: torch.device,
) -> Tuple[List[torch.Tensor], List[torch.Tensor]]:
    """Process raw model outputs into the format expected by the detector.

    Args:
        all_outputs: sequence of outputs of shape [batch_size, output_len].
        logits_processor: logits processor used for watermarking.
        tokenizer: tokenizer used for the model.
        pos_truncation_length: Length to truncate watermarked (positive) outputs.
        neg_truncation_length: Length to truncate unwatermarked (negative) outputs.
        max_length: Length to pad truncated outputs to, so that all processed entries have the same shape.
        is_cv: Whether the given outputs are processed for cross-validation.
        is_pos: Whether the given outputs are positives (watermarked).
        torch_device: torch device to use.

    Returns:
        Tuple of
          all_masks: list of masks of shape [batch_size, max_length].
          all_g_values: list of g_values of shape [batch_size, max_length, depth].
    """
    all_masks = []
    all_g_values = []
    for outputs in tqdm.tqdm(all_outputs):
        # outputs is of shape [batch_size, output_len].
        # output_len can differ from batch to batch.
        eos_token_mask = logits_processor.compute_eos_token_mask(
            input_ids=outputs,
            eos_token_id=tokenizer.eos_token_id,
        )
        if is_pos or is_cv:
            # Filter by length for positives, for both train and CV.
            # We also filter by length when CV negatives are processed.
            outputs = filter_and_truncate(outputs, pos_truncation_length, eos_token_mask)
        elif not is_pos and not is_cv:
            outputs = filter_and_truncate(outputs, neg_truncation_length, eos_token_mask)

        # If no outputs remain after filtering, skip this batch.
        if outputs.shape[0] == 0:
            continue

        # All outputs are padded to max_length with EOS tokens.
        outputs = pad_to_len(outputs, max_length, False, tokenizer.eos_token_id, torch_device)
        # outputs shape [num_filtered_entries, max_length]

        eos_token_mask = logits_processor.compute_eos_token_mask(
            input_ids=outputs,
            eos_token_id=tokenizer.eos_token_id,
        )

        context_repetition_mask = logits_processor.compute_context_repetition_mask(
            input_ids=outputs,
        )

        # context_repetition_mask of shape [num_filtered_entries, max_length - (ngram_len - 1)].
        context_repetition_mask = pad_to_len(context_repetition_mask, max_length, True, 0, torch_device)
        # We pad on the left to get the same max_length shape.
        # context_repetition_mask of shape [num_filtered_entries, max_length].
        combined_mask = context_repetition_mask * eos_token_mask

        g_values = logits_processor.compute_g_values(
            input_ids=outputs,
        )

        # g_values of shape [num_filtered_entries, max_length - (ngram_len - 1), depth].
        g_values = pad_to_len(g_values, max_length, True, 0, torch_device)

        # We pad on the left to get the same max_length shape.
        # g_values of shape [num_filtered_entries, max_length, depth].
        all_masks.append(combined_mask)
        all_g_values.append(g_values)
    return all_masks, all_g_values

def tpr_at_fpr(detector, detector_inputs, w_true, minibatch_size, target_fpr=0.01) -> torch.Tensor:
    """Calculates the true positive rate (TPR) at false positive rate (FPR) = target_fpr."""
    positive_idxs = w_true == 1
    negative_idxs = w_true == 0
    num_samples = detector_inputs[0].size(0)

    w_preds = []
    for start in range(0, num_samples, minibatch_size):
        end = start + minibatch_size
        detector_inputs_ = (
            detector_inputs[0][start:end],
            detector_inputs[1][start:end],
        )
        with torch.no_grad():
            w_pred = detector(*detector_inputs_)[0]
        w_preds.append(w_pred)

    w_pred = torch.cat(w_preds, dim=0)  # Concatenate predictions.
    positive_scores = w_pred[positive_idxs]
    negative_scores = w_pred[negative_idxs]

    # Calculate the score threshold that yields the target FPR on the negatives.
    # Note: percentile -> quantile
    fpr_threshold = torch.quantile(negative_scores, 1 - target_fpr)
    # Note: need to switch to FP32 since torch.mean doesn't work with torch.bool.
    return torch.mean((positive_scores >= fpr_threshold).to(dtype=torch.float32)).item()  # TPR

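A small, self-contained illustration of tpr_at_fpr (editor's sketch, not part of utils.py): the toy detector below scores each sample by its masked mean g-value, and the random tensors have their "watermarked" half shifted upward so the two classes separate.

import torch

def toy_detector(g_values, mask):
    # Return a tuple so indexing with [0] matches how tpr_at_fpr calls the detector.
    scores = (g_values * mask.unsqueeze(-1)).mean(dim=(1, 2))
    return (scores,)

g_values = torch.rand(64, 100, 4)  # [num_samples, seq_len, depth]
g_values[:32] += 0.2  # nudge the "watermarked" half upwards
mask = torch.ones(64, 100)
labels = torch.cat([torch.ones(32), torch.zeros(32)])  # 1 = watermarked

print(tpr_at_fpr(toy_detector, (g_values, mask), labels, minibatch_size=16))
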
def update_fn_if_fpr_tpr(detector, g_values_val, mask_val, watermarked_val, minibatch_size):
    """Validation loss: the negative of TPR@FPR=1%, so that lower is better."""
    tpr_ = tpr_at_fpr(
        detector=detector,
        detector_inputs=(g_values_val, mask_val),
        w_true=watermarked_val,
        minibatch_size=minibatch_size,
    )
    return -tpr_

def process_raw_model_outputs(
    logits_processor,
    tokenizer,
    pos_truncation_length,
    neg_truncation_length,
    max_padded_length,
    tokenized_wm_outputs,
    test_size,
    tokenized_uwm_outputs,
    torch_device,
):
    # Split data into train and CV sets.
    train_wm_outputs, cv_wm_outputs = model_selection.train_test_split(tokenized_wm_outputs, test_size=test_size)

    train_uwm_outputs, cv_uwm_outputs = model_selection.train_test_split(tokenized_uwm_outputs, test_size=test_size)

    process_kwargs = {
        "logits_processor": logits_processor,
        "tokenizer": tokenizer,
        "pos_truncation_length": pos_truncation_length,
        "neg_truncation_length": neg_truncation_length,
        "max_length": max_padded_length,
        "torch_device": torch_device,
    }

    # Process both train and CV data for training.
    wm_masks_train, wm_g_values_train = process_outputs_for_training(
        [torch.tensor(outputs, device=torch_device, dtype=torch.long) for outputs in train_wm_outputs],
        is_pos=True,
        is_cv=False,
        **process_kwargs,
    )
    wm_masks_cv, wm_g_values_cv = process_outputs_for_training(
        [torch.tensor(outputs, device=torch_device, dtype=torch.long) for outputs in cv_wm_outputs],
        is_pos=True,
        is_cv=True,
        **process_kwargs,
    )
    uwm_masks_train, uwm_g_values_train = process_outputs_for_training(
        [torch.tensor(outputs, device=torch_device, dtype=torch.long) for outputs in train_uwm_outputs],
        is_pos=False,
        is_cv=False,
        **process_kwargs,
    )
    uwm_masks_cv, uwm_g_values_cv = process_outputs_for_training(
        [torch.tensor(outputs, device=torch_device, dtype=torch.long) for outputs in cv_uwm_outputs],
        is_pos=False,
        is_cv=True,
        **process_kwargs,
    )

    # We get lists of per-batch tensors; concatenate them so they can be passed to the detector.
    def pack(mask, g_values):
        mask = torch.cat(mask, dim=0)
        g = torch.cat(g_values, dim=0)
        return mask, g

    wm_masks_train, wm_g_values_train = pack(wm_masks_train, wm_g_values_train)
    # Note: use float instead of bool, otherwise the entropy calculation doesn't work.
    wm_labels_train = torch.ones((wm_masks_train.shape[0],), dtype=torch.float, device=torch_device)

    wm_masks_cv, wm_g_values_cv = pack(wm_masks_cv, wm_g_values_cv)
    wm_labels_cv = torch.ones((wm_masks_cv.shape[0],), dtype=torch.float, device=torch_device)

    uwm_masks_train, uwm_g_values_train = pack(uwm_masks_train, uwm_g_values_train)
    uwm_labels_train = torch.zeros((uwm_masks_train.shape[0],), dtype=torch.float, device=torch_device)

    uwm_masks_cv, uwm_g_values_cv = pack(uwm_masks_cv, uwm_g_values_cv)
    uwm_labels_cv = torch.zeros((uwm_masks_cv.shape[0],), dtype=torch.float, device=torch_device)

    # Concatenate positive and negative data together.
    train_g_values = torch.cat((wm_g_values_train, uwm_g_values_train), dim=0).squeeze()
    train_labels = torch.cat((wm_labels_train, uwm_labels_train), dim=0).squeeze()
    train_masks = torch.cat((wm_masks_train, uwm_masks_train), dim=0).squeeze()

    cv_g_values = torch.cat((wm_g_values_cv, uwm_g_values_cv), dim=0).squeeze()
    cv_labels = torch.cat((wm_labels_cv, uwm_labels_cv), dim=0).squeeze()
    cv_masks = torch.cat((wm_masks_cv, uwm_masks_cv), dim=0).squeeze()

    # Shuffle the training data.
    shuffled_idx = torch.randperm(train_g_values.shape[0])  # Use torch for GPU compatibility.

    train_g_values = train_g_values[shuffled_idx]
    train_labels = train_labels[shuffled_idx]
    train_masks = train_masks[shuffled_idx]

    # Shuffle the cross-validation data.
    shuffled_idx_cv = torch.randperm(cv_g_values.shape[0])  # Use torch for GPU compatibility.
    cv_g_values = cv_g_values[shuffled_idx_cv]
    cv_labels = cv_labels[shuffled_idx_cv]
    cv_masks = cv_masks[shuffled_idx_cv]

    # Delete some variables to free up GPU memory.
    del (
        wm_g_values_train,
        wm_labels_train,
        wm_masks_train,
        wm_g_values_cv,
        wm_labels_cv,
        wm_masks_cv,
    )
    gc.collect()
    torch.cuda.empty_cache()

    return train_g_values, train_masks, train_labels, cv_g_values, cv_masks, cv_labels

def get_tokenized_uwm_outputs(num_negatives, neg_batch_size, tokenizer, device):
    dataset, info = tfds.load("wikipedia/20230601.en", split="train", with_info=True)
    dataset = dataset.take(num_negatives)

    # Convert the dataset to a DataFrame, then back to a batched tf.data pipeline.
    df = tfds.as_dataframe(dataset, info)
    ds = tf.data.Dataset.from_tensor_slices(dict(df))
    tf.random.set_seed(0)
    ds = ds.shuffle(buffer_size=10_000)
    ds = ds.batch(batch_size=neg_batch_size)

    tokenized_uwm_outputs = []
    # Pad to this length (on the right) for batching.
    padded_length = 1000
    for i, batch in tqdm.tqdm(enumerate(ds)):
        responses = [val.decode() for val in batch["text"].numpy()]
        inputs = tokenizer(
            responses,
            return_tensors="pt",
            padding=True,
        ).to(device)
        inputs = inputs["input_ids"].cpu().numpy()
        if inputs.shape[1] >= padded_length:
            inputs = inputs[:, :padded_length]
        else:
            # Use the actual batch size so a (possibly smaller) final batch also works.
            inputs = np.concatenate(
                [inputs, np.ones((inputs.shape[0], padded_length - inputs.shape[1])) * tokenizer.eos_token_id], axis=1
            )
        tokenized_uwm_outputs.append(inputs)
        if len(tokenized_uwm_outputs) * neg_batch_size > num_negatives:
            break
    return tokenized_uwm_outputs

def get_tokenized_wm_outputs(
    model,
    tokenizer,
    watermark_config,
    num_pos_batches,
    pos_batch_size,
    temperature,
    max_output_len,
    top_k,
    top_p,
    device,
):
    eli5_prompts = datasets.load_dataset("Pavithree/eli5")

    wm_outputs = []

    for batch_id in tqdm.tqdm(range(num_pos_batches)):
        prompts = eli5_prompts["train"]["title"][batch_id * pos_batch_size : (batch_id + 1) * pos_batch_size]
        prompts = [prompt.strip('"') for prompt in prompts]
        inputs = tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
        ).to(device)
        _, inputs_len = inputs["input_ids"].shape

        outputs = model.generate(
            **inputs,
            watermarking_config=watermark_config,
            do_sample=True,
            max_length=inputs_len + max_output_len,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
        )

        wm_outputs.append(outputs[:, inputs_len:].cpu().detach())

        del outputs, inputs, prompts
        gc.collect()

    gc.collect()
    torch.cuda.empty_cache()
    return wm_outputs

def upload_model_to_hf(model, hf_repo_name: str, private: bool = True):
    api = HfApi()

    # Check if the repository exists.
    try:
        api.repo_info(repo_id=hf_repo_name, use_auth_token=True)
        print(f"Repository '{hf_repo_name}' already exists.")
    except RepositoryNotFoundError:
        # If the repository does not exist, create it.
        print(f"Repository '{hf_repo_name}' not found. Creating it...")
        create_repo(repo_id=hf_repo_name, private=private, use_auth_token=True)
        print(f"Repository '{hf_repo_name}' created successfully.")

    # Push the model to the Hugging Face Hub.
    print(f"Uploading model to Hugging Face repo '{hf_repo_name}'...")
    model.push_to_hub(repo_id=hf_repo_name, use_auth_token=True)

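A hedged usage sketch (editor's addition, not part of utils.py): reloading a trained detector and pushing it with the helper above. The repo ids are placeholders, the call assumes you are already authenticated (huggingface-cli login or an HF_TOKEN in the environment), and it assumes BayesianDetectorModel is importable from transformers as in the training script.

from transformers import BayesianDetectorModel

# Placeholder repo ids; replace with your own.
detector = BayesianDetectorModel.from_pretrained("<your-username>/gemma-2b-it-synthid-detector")
upload_model_to_hf(detector, "<your-username>/gemma-2b-it-synthid-detector-backup", private=True)
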
@ -51,7 +51,7 @@ from transformers.utils.versions import require_version

logger = logging.getLogger(__name__)

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.47.0.dev0")

require_version(
    "datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt"

@ -55,7 +55,7 @@ from transformers.utils.versions import require_version

logger = logging.getLogger(__name__)

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.47.0.dev0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")

@ -50,7 +50,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.47.0.dev0")

logger = logging.getLogger(__name__)

@ -62,7 +62,7 @@ except (ModuleNotFoundError, ImportError):

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.46.0.dev0")
check_min_version("4.47.0.dev0")

logger = logging.getLogger(__name__)
Some files were not shown because too many files have changed in this diff.