Compare commits

...

90 Commits

Author SHA1 Message Date
a8704d266e style 2024-08-01 16:19:50 +02:00
bc9cb55d8d Merge remote-tracking branch 'upstream/main' into hqq_serialization 2024-08-01 16:16:47 +02:00
f2ea032e40 Merge remote-tracking branch 'upstream/main' into hqq_serialization 2024-08-01 16:16:11 +02:00
75dfe0a9c6 fix hqq dispatch and unexpected keys 2024-08-01 16:12:17 +02:00
51ab25e293 Fixed Hybrid Cache Shape Initialization. (#32163)
* fixed hybrid cache init, added test

* Fix Test Typo

---------

Co-authored-by: Aaron Haag <aaron.haag@siemens.com>
2024-08-01 13:57:42 +01:00
e3d8285a84 Docker: add speech dep to the consistency docker image (#32374) 2024-08-01 13:46:11 +01:00
ca59d6f77c Offloaded KV Cache (#31325)
* Initial implementation of OffloadedCache

* enable usage via cache_implementation

* Address feedback, add tests, remove legacy methods.

* Remove flash-attn, discover synchronization bugs, fix bugs

* Prevent usage in CPU only mode

* Add a section about offloaded KV cache to the docs

* Fix typos in docs

* Clarifications and better explanation of streams
2024-08-01 14:42:07 +02:00
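Per the bullets above, the offloaded cache keeps only the current layer's key/value tensors on the GPU and prefetches the next layer asynchronously. A rough sketch of passing the cache object directly to `generate`, assuming the class is exported as `OffloadedCache` (the checkpoint is illustrative and a CUDA device is required):

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, OffloadedCache

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")
model = AutoModelForCausalLM.from_pretrained(
    "microsoft/Phi-3-mini-4k-instruct", torch_dtype=torch.float16
).to("cuda")

inputs = tokenizer("Fun fact: The shortest", return_tensors="pt").to(model.device)
# Only the current layer's KV tensors stay on the GPU; the rest live on the CPU
# and are prefetched asynchronously as the forward pass walks the layers.
out = model.generate(**inputs, do_sample=False, max_new_tokens=23, past_key_values=OffloadedCache())
print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
```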
b4727a1216 Fix conflicting key in init kwargs in PreTrainedTokenizerBase (#31233)
* Fix conflicting key in init kwargs in PreTrainedTokenizerBase

* Update code to check for callable key in save_pretrained

* Apply PR suggestions

* Invoke CI

* Updates based on PR suggestion
2024-08-01 14:32:13 +02:00
db8c7caeb6 Empty list in defaults for LLaMA special tokens during weights conversion (#32342)
empty list in defaults
2024-08-01 14:30:10 +02:00
2229ebe722 update clean_up_tokenization_spaces warning (#32371) 2024-08-01 13:57:41 +02:00
05c1f9af9a Check device map for saving tokenizer config on TPU (fix for issue #31971) (#32043)
* Remove TPU device map for saving tokenizer config

* Update tokenization_utils_base.py

* Fix error msg when passing non-string device into tokenizer

* Fix error message for non-string tokenizer device

* Print out tokenizer device type in error msg

* Update tokenization_utils_base.py
2024-08-01 13:52:05 +02:00
9e28284032 add missing attribute _supports_param_buffer_assignment for gpt-j. (#32359)
Co-authored-by: Guoming Zhang <37257613+nv-guomingz@users.noreply.github.com>
2024-08-01 13:51:20 +02:00
48ed24c50a Remove size check between attn_weights and kv_seq_len for phi3 (#32339)
* Remove size check between attn_weights and kv_seq_len

* add unit tests
2024-08-01 13:49:00 +02:00
e234061cdd [whisper] compile compatibility with long-form decoding (#31772)
* [whisper] compile compatibility with long-form decoding

* clarify comment

* fix after rebase

* finalise

* fix bsz

* fix cache split

* remove contiguous

* style

* finish

* update doc

* prevent cuda graph trace
2024-08-01 18:10:56 +08:00
9451a38526 [enc-dec cache] fix bug in indexing (#32370) 2024-08-01 16:05:27 +08:00
453e74884f LLaVa: add cache class attribute (#32278)
cache class flag
2024-08-01 09:48:03 +05:00
14ee2326e5 fix: warmup_steps check for training_args (#32236) 2024-07-31 23:34:22 +01:00
53f0c9c290 fix: Removed unnecessary @staticmethod decorator (#32361)
* Fixed staticmethods with self as first argument.

* Fixed staticmethods with self as first argument.

* Fixed staticmethods with self as first argument.

* Fixed staticmethods with self as first argument.
2024-07-31 20:56:50 +01:00
92abe60334 >3-5x faster torch.compile forward compilation for autoregressive decoder models (#32227)
* draft

* apply changes to all relevant archs

* rerun ci - check_docstrings.py failing?

* fix docstring

* move 2D->4D mask creation to modeling file

* repo consistency

* fix the batch size = 1 case - calling contiguous is not enough

* nit

* style

* propagate to gemma/gemma-2

* prepare inputs for gemma generation

* implement test and tiny fix in gemma2

* Update src/transformers/models/bloom/modeling_bloom.py

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* fix copies

* ci pass

* fix gemma's test_compile_static_cache tests

* flaky

* retrigger ci

---------

Co-authored-by: sanchit-gandhi <sanchit@huggingface.co>
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
2024-08-01 02:03:07 +08:00
b46bd8b9d2 Fix error when streaming to gradio with non-string tool arguments (#32360)
Fix error when streaming agent run to gradio with non-string tool arguments
2024-07-31 18:44:53 +02:00
ef177a5e1c Gemma 2: support assisted generation (#32357) 2024-07-31 16:04:48 +01:00
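Assisted generation pairs the main model with a smaller draft model that proposes candidate tokens for the main model to verify in a single forward pass. A minimal sketch of the call (the Gemma 2 checkpoints and dtype below are illustrative choices, not taken from the PR):

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-9b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2-9b", torch_dtype=torch.bfloat16, device_map="auto")
# A smaller checkpoint that shares the same tokenizer acts as the draft model
assistant = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b", torch_dtype=torch.bfloat16, device_map="auto")

inputs = tokenizer("The theory of special relativity states ", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, assistant_model=assistant, max_new_tokens=32)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```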
5f1fcc299c [Idefics2] - Fix FA2 call for Perceiver layer (#32275)
* Fix FA2 call for Perceiver layer

* [run_slow] idefics2

* [run_slow] idefics2

* [run_slow] idefics2

* Fix up

* [run_slow] idefics2

* [run_slow] idefics2

* [run_slow] idefics2
2024-07-31 14:51:04 +01:00
b75ad56620 Llama 3.1: Fix incorrect inv_freq assignment (#32330)
fix 💩
2024-07-31 11:12:46 +01:00
7f552e28e0 Gemma2 and flash-attention (#32188)
* enable flash-attn & static cache

* this works, not the prev

* fix for sliding window layers

* not needed anymore
2024-07-31 10:33:38 +05:00
a3264332cf LLaVA-NeXT: fix anyres shapes (#32314)
fix
2024-07-31 10:01:12 +05:00
6e2d04e429 Fix slow GemmaTokenizer and improve SPM slow -> fast conversion process (#32191)
* Remove user-defined tokens which can be obtained through merges

* Remove debug line

* formatting

* Refactor spm slow -> fast converter

* revert unnecessary refactor

* set comprehension

* remove test files

* Use `vocab_scores`

* Always replace spiece underline with space in decode

* we no longer need token filtering

* Add save fast load slow unit test

* Remove tokenizers version check

* Remove duplicate code

* Make `<start_of_turn>` and `<end_of_turn>` special tokens

* Bias merge priority with length if score is the same

* Add unit test for merge priority

* CI
2024-07-30 23:36:38 +02:00
026a173a64 Repo checks: skip docstring checks if not in the diff (#32328)
* tmp

* skip files not in the diff

* use git.Repo instead of an external subprocess

* add tiny change to confirm that the diff is working on pushed changes

* add make quality task

* more profesh main commit reference
2024-07-30 18:56:10 +01:00
516af4bb63 fixes #32329 : The Torch code is correct - to get an average of 10% o… (#32335)
fixes #32329 : The Torch code is correct - to get an average of 10% of the total, we want to take 50% of the remainder after we've already masked 80% with [MASK] in the previous step.
2024-07-30 18:21:45 +01:00
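The arithmetic here can be sanity-checked numerically; a small sketch of the standard 80/10/10 masking split (the variable names are illustrative, not the ones in the codebase):

```py
import torch

n = 100_000  # tokens already selected for masked-language-model corruption
masked = torch.rand(n) < 0.8                  # 80% -> replaced with [MASK]
random_tok = (torch.rand(n) < 0.5) & ~masked  # 50% of the remaining 20% -> random token (~10% overall)
unchanged = ~masked & ~random_tok             # whatever is left stays as-is (~10% overall)

print(masked.float().mean().item())      # ~0.80
print(random_tok.float().mean().item())  # ~0.10
print(unchanged.float().mean().item())   # ~0.10
```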
62c60a3018 fixes to properly shard FSDP across cpu and meta for cpu_efficient_loading for prequantized 4bit (#32276) 2024-07-30 18:55:59 +02:00
1627108033 fix: Added missing raise keyword for few exceptions (#32333)
Fixed raising of few exceptions.
2024-07-30 17:53:03 +01:00
bd54ed2ed7 Alternative agent plan (#32295)
* new agent plan

* plan type assertion

* style corrections

* better prompt naming

* make fixup
2024-07-30 18:48:18 +02:00
e68ec18ce2 Docs: formatting nits (#32247)
* doc formatting nits

* ignore non-autodocs

* Apply suggestions from code review

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/esm/modeling_esm.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/esm/modeling_esm.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* make fixup

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
2024-07-30 15:49:14 +01:00
2fbbcf5007 Fix M4T for ASR pipeline (#32296)
* tentative fix

* do the same for M4T
2024-07-30 16:00:13 +02:00
084b5094eb feat(ci): set fetch-depth: 0 in trufflehog checkout step (#31663) 2024-07-30 14:49:26 +02:00
20528f067c Cast epochs_trained to int when resuming training (#32286)
* fix epochs_trained as int when resuming training

* refactor

---------

Co-authored-by: teddyferdinan <teddy.ferdinan@pwr.edu.pl>
2024-07-30 11:25:54 +02:00
934fe1504e Fix GGUF dequantize for gguf==0.9.1 (#32298)
* fix gguf dequantize for gguf==0.9.1

* fix old version

* make style
2024-07-30 11:01:00 +02:00
3e8106d253 Docs: fix GaLore optimizer code example (#32249)
Docs: fix GaLore optimizer example

Fix incorrect usage of GaLore optimizer in Transformers trainer code example.

The GaLore optimizer uses low-rank gradient updates to reduce memory usage. GaLore is quite popular and is implemented by the authors in [https://github.com/jiaweizzhao/GaLore](https://github.com/jiaweizzhao/GaLore). A few months ago GaLore was added to the HuggingFace Transformers library in https://github.com/huggingface/transformers/pull/29588.

Documentation of the Trainer module includes a few code examples of how to use GaLore. However, the `optim_target_modules` argument to the `TrainingArguments` function is incorrect, as discussed in https://github.com/huggingface/transformers/pull/29588#issuecomment-2006289512. This pull request fixes this issue.
2024-07-30 09:19:24 +02:00
f0bc49e7f6 use torch 2.4 in 2 CI jobs (#32302)
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2024-07-29 22:12:21 +02:00
a24a9a66f4 Add stream messages from agent run for gradio chatbot (#32142)
* Add stream_to_gradio method for running agent in gradio demo
2024-07-29 20:12:44 +02:00
811a9caa21 Make static cache compatible with torch.export (#32168) 2024-07-29 18:19:15 +01:00
7f5d644e69 [pipeline] fix padding for 1-d tensors (#31776)
* [pipeline] fix padding for 1-d tensors

* add test

* make style

* Update tests/pipelines/test_pipelines_automatic_speech_recognition.py

Co-authored-by: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com>

* Update tests/pipelines/test_pipelines_automatic_speech_recognition.py

---------

Co-authored-by: Kamil Akesbi <45195979+kamilakesbi@users.noreply.github.com>
2024-07-29 21:24:42 +08:00
3fbaaaa64d Whisper tokenizer word level timestamps (#32197)
* fix _fix_key in PreTrainedModel

* fix _find_longest_common_sequence

* add test

* remove result.json

* nit

* update test
2024-07-29 11:19:52 +01:00
7ffe25f2b9 Generate: end-to-end compilation (#30788)
* mvp

* added test (a few models need fixes)

* fix a few test cases

* test nits

* harder test 😈

* revert changes in stablelm

* test with improved condition

* add todo

* tmp commit

* merged with main

* nits

* add todo

* final corrections

* add docs for generation compilation

* docs nits

* add  tip

* PR suggestions

* add more details to the compilation docs

* fix cache positions

* cache is now init in generate; update docs

* tag test as flaky

* docs

* post rebase make fixup and other nits

* remove unintended changes

* whisper (encoder-decoder) not supported

* move token default updates to ; add tests for token defaults

* push changes

* manual rebase

* chameleon doesn't support this

* fix test_static_cache_mha_mqa_gqa (broken in another PR)

* docs: dynamic is better with end-to-end compilation
2024-07-29 10:52:13 +01:00
49928892d6 fix(docs): Fixed a link in docs (#32274)
Fixed a link in docs.
2024-07-29 10:50:43 +01:00
6494479f1d make p_mask a numpy array before passing to select_starts_ends (#32076)
* fix

* bug fix

* refine

* fix
2024-07-29 10:29:11 +01:00
535fe78b9f Repo: remove exceptions in check_docstrings (#32259)
remove exceptions
2024-07-29 11:06:05 +02:00
a2ad9d5ad5 fix: Fixed wrong argument passed to convert_blip_checkpoint function call (#32262)
Removed one wrong argument passed to convert_blip_checkpoint function call.
2024-07-29 10:43:09 +02:00
5019aabfac Optimize t5 tokenize logic to avoid redundant calls (#32270)
* Optimize t5 tokenize logic to avoid redundant calls

* fix and overwrite copies
2024-07-29 09:51:43 +02:00
f2122cc6eb Upload new model failure report to Hub (#32264)
upload

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2024-07-29 09:42:54 +02:00
f739687684 🚨 Bloom support for cache class (#31445)
* bloom dynamic cache

* bloom follows standard cache format

* no skips for bloom anymore

* use cache position when possible

* clean up

* codestyle

* Update src/transformers/models/bloom/modeling_bloom.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/bloom/modeling_bloom.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/models/bloom/modeling_bloom.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* pr comments

* isinstance fix

* address comments

* make musicgen test happy

* [run-slow] bloom

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
2024-07-29 10:58:59 +05:00
44f6fdd74f Llama 3.1: replace for loop by tensor ops at inv_freq initialization (#32244)
* replace for loop by tensor ops

* rm assert; readability
2024-07-27 10:19:46 +01:00
8da9068730 More flexible trigger condition (#32251)
update

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2024-07-26 20:52:45 +02:00
81233c069c Flash-Attn: fix generation when no attention mask or no padding (#32241)
* fix

* fix prev test (half of failures)

* [run-slow] llama, gemma2

* [run-slow] llama, gemma2
2024-07-26 14:45:55 +05:00
27c7f971c0 [tests] fix static cache implementation is not compatible with attn_implementation==flash_attention_2 (#32039)
* add flash attention check

* fix

* fix
2024-07-26 11:41:27 +02:00
5f841c74b6 Add check for target_sizes is None in post_process_image_guided_detection for owlv2 (#31934)
* Add check for target_sizes is None in post_process_image_guided_detection

* Make sure Owlvit and Owlv2 in sync

* Fix incorrect indentation; add check for correct size of target_sizes
2024-07-26 10:05:46 +01:00
f9756d9edb Adds: extra_repr for RMSNorm layers in most models (#32204)
* adds: extra_repr() to RMSNorm layers in multiple models

* adds: extra_repr for deprecated models as well

* formatting as per style guide
2024-07-26 11:05:38 +02:00
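`extra_repr` is the standard `torch.nn.Module` hook these commits rely on: whatever string it returns is shown inside the parentheses when the module is printed. A small illustrative sketch (not the library's actual class):

```py
import torch
from torch import nn

class MyRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        return self.weight * hidden_states * torch.rsqrt(variance + self.variance_epsilon)

    def extra_repr(self):
        # Included in the module's printed representation, e.g. MyRMSNorm((2048,), eps=1e-06)
        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"

print(MyRMSNorm(2048))
```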
b8e5cd5396 Refactor: Removed unnecessary object base class (#32230)
* Refactored to remove unnecessary object base class.

* small fix.
2024-07-26 10:33:02 +02:00
1c7ebf1d6e don't log base model architecture in wandb if log model is false (#32143)
* don't log base model architecture in wandb if log model is false

* Update src/transformers/integrations/integration_utils.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* convert log model setting into an enum

* fix formatting

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
2024-07-26 09:38:59 +02:00
c46edfb823 Resize embeds with DeepSpeed (#32214)
* fix resize when deepspeed

* deepspeed uses new embeds

* we needed this
2024-07-26 10:52:06 +05:00
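For context, the resize path being fixed here is the usual `resize_token_embeddings` flow; a minimal sketch of that API outside of DeepSpeed (the GPT-2 checkpoint and added pad token are illustrative):

```py
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

tokenizer.add_special_tokens({"pad_token": "<pad>"})  # grow the vocabulary
model.resize_token_embeddings(len(tokenizer))         # resize the input (and tied output) embeddings to match
print(model.get_input_embeddings().weight.shape[0])   # new vocabulary size
```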
fad15fba78 Llava: generate without images (#32183)
* llava w/o images

* tests
2024-07-26 10:17:27 +05:00
4ab33c2d81 Generation: stop at eos for assisted decoding (#31301)
* fix

* move changes to prompt lookup

* add test

* set eos in assistant model

* style

* fix flakiness

* changes for new `main`

* Update tests/generation/test_utils.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/generation/test_utils.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* add comment to explain

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
2024-07-26 10:16:06 +05:00
9d6c0641c4 Fix code snippet for Grounding DINO (#32229)
Fix code snippet for grounding-dino
2024-07-25 19:20:47 +01:00
3a83ec48a6 Allow a specific microphone to be used by the ffmpeg audio pipeline utility functions. Default to using the currently active microphone on Mac (#31846)
* use currently active microphone on mac for ffmpeg_microphone

* Allow ffmpeg_microphone device to be specified

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
2024-07-25 17:16:13 +01:00
6ed0bf1e85 translate philosophy.md to chinese (#32177)
* translate philosophy.md to chinese

* add the missing link
2024-07-25 09:01:06 -07:00
df6eee9201 Follow up for #31973 (#32025)
* fix

* [test_all] trigger full CI

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2024-07-25 16:12:23 +02:00
de2318894e [warnings] fix E721 warnings (#32223)
fix E721 warnings
2024-07-25 15:12:23 +02:00
9b9a54e61b [BigBird Pegasus] set _supports_param_buffer_assignment to False (#32222)
set _supports_param_buffer_assignment to False
2024-07-25 15:11:43 +02:00
1ecedf1d9e Update question_answering.py (#32208) 2024-07-25 13:20:27 +01:00
f53a5dec7b remove unnecessary guard code related to pytorch versions 1.4.2 ~ 1.7.0 (#32210)
remove unnecessary guard code related to pytorch versions 1.4.2 ~ 1.7.0
2024-07-25 11:04:04 +02:00
5658e749ad [whisper] fix short-form output type (#32178)
* [whisper] fix short-form output type

* add test

* make style

* update long-form tests

* fixes

* last fix

* finalise test
2024-07-25 16:58:02 +08:00
85a1269e19 fix: Replaced deprecated unittest method with the correct one (#32198)
Replaced deprecated unittest method with the correct one.
2024-07-24 18:00:21 +01:00
edd68f4ed8 🚨 No more default chat templates (#31733)
* No more default chat templates

* Add the template to the GPT-SW3 tests since it's not available by default now

* Fix GPT2 test

* Fix Bloom test

* Fix Bloom test

* Remove default templates again
2024-07-24 17:36:32 +01:00
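With class-level default templates removed, tokenizers that relied on them need `chat_template` set explicitly before `apply_chat_template` is called. A minimal sketch (the template string is a made-up example, not any model's real template):

```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Set the template explicitly instead of relying on a class-level default
tokenizer.chat_template = (
    "{% for message in messages %}"
    "{{ message['role'] }}: {{ message['content'] }}\n"
    "{% endfor %}"
)
chat = [{"role": "user", "content": "Hello!"}, {"role": "assistant", "content": "Hi there."}]
print(tokenizer.apply_chat_template(chat, tokenize=False))
```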
1c122a46dc Support dequantizing GGUF FP16 format (#31783)
* support gguf fp16

* support gguf bf16 with pytorch

* add gguf f16 test

* remove bf16
2024-07-24 17:59:59 +02:00
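For reference, GGUF checkpoints (including the FP16 files this commit adds support for) are loaded through the `gguf_file` argument and dequantized into regular PyTorch weights; the repository and file names below are placeholders, not real checkpoints:

```py
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "some-org/some-model-gguf"  # placeholder Hub repo
filename = "model-f16.gguf"           # placeholder GGUF file inside the repo

# The GGUF tensors are dequantized into a standard PyTorch state dict on load
tokenizer = AutoTokenizer.from_pretrained(repo_id, gguf_file=filename)
model = AutoModelForCausalLM.from_pretrained(repo_id, gguf_file=filename)
```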
af0e4b7b37 Fix float8_e4m3fn in modeling_utils (#32193)
* Fix float8_e4m3fn in modeling_utils

* style

* fix

* comment
2024-07-24 17:14:05 +02:00
1392a6867f Fix resize embedding with Deepspeed (#32192)
fix resize when deepspeed
2024-07-24 19:26:20 +05:00
8d2534c4d0 let's not warn when someone is running a forward (#32176)
* let's not warn when someone is running a forward without cache + self.training

* more models

* fixup
2024-07-24 16:06:39 +02:00
e0182f3bd7 RoPE: relaxed rope validation (#32182)
* relaxed rope check

* lets also accept rope_type=None, defaulting to the original implementation

* type and rope_type can coexist
2024-07-24 15:00:48 +01:00
165116bc14 Remove conversational pipeline tests (#32099)
Remove conversation pipeline tests
2024-07-24 14:03:40 +01:00
5f4ee98a7a Update qwen2.md (#32108)
* Update qwen2.md

outdated description

* Update qwen2.md

amended

* Update qwen2.md

Update

* Update qwen2.md

fix wrong version code, now good to go
2024-07-24 11:54:41 +01:00
8678879f1d fix: default value reflects the runtime environment variables rather than the ones present at import time. (#32153)
* fix: default value reflects the runtime environment variables rather than the ones present at import time.

* Fix: Change `deterministic` to None by default; use env var if None
2024-07-24 11:38:49 +01:00
01be5b4879 adds: extra_repr() to MambaRMSNorm to include hidden size / size of weights in the layer (#32171)
* adds: extra_repr() to MambaRMSNorm to include the hidden size of the layer

* style fix with ruff:
2024-07-24 09:09:59 +02:00
c85510f958 [docs] change temperature to a positive value (#32077)
fix
2024-07-23 17:47:51 +01:00
bc2adb0112 fix: Fixed an if condition that is always evaluating to true (#32160)
Fixed an if condition always evaluating to true.
2024-07-23 16:52:41 +01:00
23f6a43f82 fix (#32162) 2024-07-23 16:48:16 +01:00
d5a99dfcee Llama 3.1 conversion
Co-authored-by: Arthur Zucker <arthur.zucker@gmail.com>
2024-07-23 17:13:25 +02:00
ff0d708fe6 Dev version: v4.44.0.dev0 2024-07-23 17:12:47 +02:00
d2c687b3f1 Updated ruff to the latest version (#31926)
* Updated ruff version and fixed the required code according to the latest version.

* Updated ruff version and fixed the required code according to the latest version.

* Added noqa directive to ignore 1 error shown by ruff
2024-07-23 17:07:31 +02:00
9cf4f2aa9a Enhancing SFT Training Efficiency Using Packing and FlashAttention2 with Position IDs (#31629)
* add DataCollatorBatchFlattening

* Update data_collator.py

* change name

* new FA2 flow if position_ids is provided

* add comments

* minor fix

* minor fix data collator

* add test cases for models

* add test case for data collator

* remove extra code

* formatting for ruff check and check_repo.py

* ruff format

ruff format tests src utils

* custom_init_isort.py
2024-07-23 15:56:41 +02:00
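The collator added here concatenates the examples of a batch into a single packed sequence and emits `position_ids` that restart at each example boundary, so FlashAttention 2 can keep packed examples from attending to one another. A rough usage sketch (the toy token ids are illustrative, and the printed values are what we would expect from the description above):

```py
from transformers import DataCollatorWithFlattening

collator = DataCollatorWithFlattening()
features = [
    {"input_ids": [1, 10, 11, 12], "labels": [1, 10, 11, 12]},
    {"input_ids": [1, 20, 21], "labels": [1, 20, 21]},
]
batch = collator(features)
# One packed row instead of a padded (2, 4) batch
print(batch["input_ids"].shape)  # torch.Size([1, 7])
# Position ids restart at 0 where the second example begins
print(batch["position_ids"])     # tensor([[0, 1, 2, 3, 0, 1, 2]])
```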
fa8a9f55c0 Merge branch 'huggingface:main' into main 2024-07-18 14:42:00 +02:00
ff40f1a9e1 HQQ model serialization attempt 2024-07-18 12:40:06 +00:00
351 changed files with 5975 additions and 3740 deletions


@ -142,6 +142,7 @@ jobs:
- run: python utils/custom_init_isort.py --check_only
- run: python utils/sort_auto_mappings.py --check_only
- run: python utils/check_doc_toc.py
- run: python utils/check_docstrings.py --check_all
check_repository_consistency:
working_directory: ~/transformers
@ -190,4 +191,4 @@ workflows:
- check_circleci_user
- check_code_quality
- check_repository_consistency
- fetch_all_tests
- fetch_all_tests


@ -4,7 +4,7 @@ on:
pull_request:
paths:
- "src/transformers/models/*/modeling_*.py"
- "tests/models/*/test_*.py"
- "tests/**/test_*.py"
concurrency:
group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}


@ -10,20 +10,9 @@ jobs:
trufflehog:
runs-on: ubuntu-latest
steps:
- shell: bash
run: |
if [ "${{ github.event_name }}" == "push" ]; then
echo "depth=$(($(jq length <<< '${{ toJson(github.event.commits) }}') + 2))" >> $GITHUB_ENV
echo "branch=${{ github.ref_name }}" >> $GITHUB_ENV
fi
if [ "${{ github.event_name }}" == "pull_request" ]; then
echo "depth=$((${{ github.event.pull_request.commits }}+2))" >> $GITHUB_ENV
echo "branch=${{ github.event.pull_request.head.ref }}" >> $GITHUB_ENV
fi
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{env.branch}}
fetch-depth: ${{env.depth}}
- name: Secret Scanning
uses: trufflesecurity/trufflehog@main
- name: Checkout code
uses: actions/checkout@v4
with:
fetch-depth: 0
- name: Secret Scanning
uses: trufflesecurity/trufflehog@main


@ -56,6 +56,7 @@ quality:
python utils/custom_init_isort.py --check_only
python utils/sort_auto_mappings.py --check_only
python utils/check_doc_toc.py
python utils/check_docstrings.py --check_all
# Format source code automatically and check is there are any problems left that need manual fixing


@ -8,7 +8,7 @@ RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
# tensorflow pin matching setup.py
RUN uv pip install --no-cache-dir "tensorflow-cpu<2.16" "tf-keras<2.16"
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,quality,vision,testing]"
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,quality,speech,vision,testing]"
RUN git lfs install
RUN pip uninstall -y transformers


@ -9,7 +9,7 @@ SHELL ["sh", "-lc"]
# The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
# to be used as arguments for docker build (so far).
ARG PYTORCH='2.3.0'
ARG PYTORCH='2.4.0'
# (not always a valid torch version)
ARG INTEL_TORCH_EXT='2.3.0'
# Example: `cu102`, `cu113`, etc.


@ -11,7 +11,7 @@ ARG REF=main
RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF
# If set to nothing, will install the latest version
ARG PYTORCH='2.3.0'
ARG PYTORCH='2.4.0'
ARG TORCH_VISION=''
ARG TORCH_AUDIO=''
# Example: `cu102`, `cu113`, etc.


@ -509,3 +509,54 @@ agent = ReactCodeAgent(tools=[search_tool])
agent.run("How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?")
```
## Gradio interface
You can leverage `gradio.Chatbot` to display your agent's thoughts using `stream_to_gradio`; here is an example:
```py
import gradio as gr
from transformers import (
load_tool,
ReactCodeAgent,
HfEngine,
stream_to_gradio,
)
# Import tool from Hub
image_generation_tool = load_tool("m-ric/text-to-image")
llm_engine = HfEngine("meta-llama/Meta-Llama-3-70B-Instruct")
# Initialize the agent with the image generation tool
agent = ReactCodeAgent(tools=[image_generation_tool], llm_engine=llm_engine)
def interact_with_agent(task):
messages = []
messages.append(gr.ChatMessage(role="user", content=task))
yield messages
for msg in stream_to_gradio(agent, task):
messages.append(msg)
yield messages + [
gr.ChatMessage(role="assistant", content="⏳ Task not finished yet!")
]
yield messages
with gr.Blocks() as demo:
text_input = gr.Textbox(lines=1, label="Chat Message", value="Make me a picture of the Statue of Liberty.")
submit = gr.Button("Run illustrator agent!")
chatbot = gr.Chatbot(
label="Agent",
type="messages",
avatar_images=(
None,
"https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
),
)
submit.click(interact_with_agent, [text_input], [chatbot])
if __name__ == "__main__":
demo.launch()
```


@ -580,7 +580,7 @@ default template for that model class is used instead. Let's take a look at the
>>> from transformers import AutoTokenizer
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
>>> tokenizer.default_chat_template
>>> tokenizer.chat_template
"{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}"
```
@ -704,23 +704,6 @@ with other names, pass the name of the template you want to the `chat_template`
We find that this can be a bit confusing for users, though - so if you're writing a template yourself, we recommend
trying to put it all in a single template where possible!
### What are "default" templates?
Before the introduction of chat templates, chat handling was hardcoded at the model class level. For backwards
compatibility, we have retained this class-specific handling as default templates, also set at the class level. If a
model does not have a chat template set, but there is a default template for its model class, the `TextGenerationPipeline`
class and methods like `apply_chat_template` will use the class template instead. You can find out what the default
template for your tokenizer is by checking the `tokenizer.default_chat_template` attribute.
This is something we do purely for backward compatibility reasons, to avoid breaking any existing workflows. Even when
the class template is appropriate for your model, we strongly recommend overriding the default template by
setting the `chat_template` attribute explicitly to make it clear to users that your model has been correctly configured
for chat.
Now that actual chat templates have been adopted more widely, default templates have been deprecated and will be
removed in a future release. We strongly recommend setting the `chat_template` attribute for any tokenizers that
still depend on them!
### What template should I use?
When setting the template for a model that's already been trained for chat, you should ensure that the template


@ -195,7 +195,7 @@ inputs = {key: tensor.to(model.device) for key, tensor in inputs.items()}
print("Tokenized inputs:\n", inputs)
# 4: Generate text from the model
outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.)
outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.1)
print("Generated tokens:\n", outputs)
# 5: Decode the output back to a string


@ -211,6 +211,80 @@ I like rock music because it's loud and energetic. It's a great way to express m
I like rock music because it's loud and energetic. I like to listen to it when I'm feeling
```
## KV Cache Offloading
Similarly to KV cache quantization, this strategy aims to reduce GPU VRAM usage.
It does so by moving the KV cache for most layers to the CPU.
As the model's `forward()` method iterates over the layers, this strategy maintains the current layer cache on the GPU.
At the same time, it asynchronously prefetches the next layer's cache and sends the previous layer's cache back to the CPU.
Unlike KV cache quantization, this strategy always produces the same result as the default KV cache implementation.
Thus, it can serve as a drop-in replacement or a fallback for it.
Depending on your model and the characteristics of your generation task (size of context, number of generated tokens, number of beams, etc.)
you may notice a small degradation in generation throughput compared to the default KV cache implementation.
To enable KV cache offloading, pass `cache_implementation="offloaded"` in the `generation_config`.
```python
>>> import torch
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
>>> ckpt = "microsoft/Phi-3-mini-4k-instruct"
>>> tokenizer = AutoTokenizer.from_pretrained(ckpt)
>>> model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to("cuda:0")
>>> inputs = tokenizer("Fun fact: The shortest", return_tensors="pt").to(model.device)
>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=23, cache_implementation="offloaded")
>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
Fun fact: The shortest war in history was between Britain and Zanzibar on August 27, 1896.
>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=23)
>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
Fun fact: The shortest war in history was between Britain and Zanzibar on August 27, 1896.
```
<Tip warning={true}>
Cache offloading requires a GPU and can be slower than the default KV cache. Use it if you are getting CUDA out of memory errors.
</Tip>
The example below shows how KV cache offloading can be used as a fallback strategy.
```python
>>> import torch
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
>>> def resilient_generate(model, *args, **kwargs):
... oom = False
... try:
... return model.generate(*args, **kwargs)
... except torch.cuda.OutOfMemoryError as e:
... print(e)
... print("retrying with cache_implementation='offloaded'")
... oom = True
... if oom:
... torch.cuda.empty_cache()
... kwargs["cache_implementation"] = "offloaded"
... return model.generate(*args, **kwargs)
...
...
>>> ckpt = "microsoft/Phi-3-mini-4k-instruct"
>>> tokenizer = AutoTokenizer.from_pretrained(ckpt)
>>> model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to("cuda:0")
>>> prompt = ["okay "*1000 + "Fun fact: The most"]
>>> inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
>>> beams = { "num_beams": 40, "num_beam_groups": 40, "num_return_sequences": 40, "diversity_penalty": 1.0, "max_new_tokens": 23, "early_stopping": True, }
>>> out = resilient_generate(model, **inputs, **beams)
>>> responses = tokenizer.batch_decode(out[:,-28:], skip_special_tokens=True)
```
On a GPU with 50 GB of RAM, running this code will print
```
CUDA out of memory. Tried to allocate 4.83 GiB. GPU
retrying with cache_implementation='offloaded'
```
before successfully generating 40 beams.
## Watermarking
The `generate()` method supports watermarking the generated text by randomly marking a portion of tokens as "green".


@ -18,59 +18,109 @@ Basic inference is slow because LLMs have to be called repeatedly to generate th
This guide will show you how to use the optimization techniques available in Transformers to accelerate LLM inference.
> [!TIP]
> Hugging Face also provides [Text Generation Inference (TGI)](https://hf.co/docs/text-generation-inference), a library dedicated to deploying and serving highly optimized LLMs for inference. It includes more optimization features not included in Transformers, such as continuous batching for increasing throughput and tensor parallelism for multi-GPU inference.
> Hugging Face also provides [Text Generation Inference (TGI)](https://hf.co/docs/text-generation-inference), a library dedicated to deploying and serving highly optimized LLMs for inference. It includes deployment-oriented optimization features not included in Transformers, such as continuous batching for increasing throughput and tensor parallelism for multi-GPU inference.
## Static kv-cache and torch.compile
## Static kv-cache and `torch.compile`
During decoding, an LLM computes the key-value (kv) values for each input token, and since it is autoregressive, it computes the same kv values each time because the generated output becomes part of the input. This is not very efficient because you're recomputing the same kv values each time.
To optimize this, you can use a kv-cache to store the past keys and values instead of recomputing them each time. However, since the kv-cache grows with each generation step and is dynamic, it prevents you from taking advantage of [torch.compile](./perf_torch_compile), a powerful optimization tool that fuses PyTorch code into fast and optimized kernels.
To optimize this, you can use a kv-cache to store the past keys and values instead of recomputing them each time. However, since the kv-cache grows with each generation step and is dynamic, it prevents you from taking advantage of [`torch.compile`](./perf_torch_compile), a powerful optimization tool that fuses PyTorch code into fast and optimized kernels.
The *static kv-cache* solves this issue by pre-allocating the kv-cache size to a maximum value which allows you to combine it with torch.compile for up to a 4x speed up.
The *static kv-cache* solves this issue by pre-allocating the kv-cache size to a maximum value which allows you to combine it with `torch.compile` for up to a 4x speed up. Your speed up may vary depending on the model size (larger models have a smaller speed up) and hardware.
> [!WARNING]
> Currently, only [Llama](./model_doc/llama2) and a few other models support static kv-cache and torch.compile. Check [this issue](https://github.com/huggingface/transformers/issues/28981) for a live model compatibility list.
> Currently, only [Llama](./model_doc/llama2) and a few other models support static kv-cache and `torch.compile`. Check [this issue](https://github.com/huggingface/transformers/issues/28981) for a live model compatibility list.
For this example, let's load the [Gemma](https://hf.co/google/gemma-2b) model.
There are three flavors of static kv-cache usage, depending on the complexity of your task:
1. Basic usage: simply set a flag in `generation_config` (recommended);
2. Advanced usage: handle a cache object for multi-turn generation or a custom generation loop;
3. Advanced usage: compile the entire `generate` function into a single graph, if having a single graph is relevant for you.
Select the correct tab below for further instructions on each of these flavors.
> [!TIP]
> Regardless of the strategy used with `torch.compile`, you can avoid shape-related recompilations if you left-pad your LLM inputs to a limited set of values. The [`pad_to_multiple_of` tokenizer flag](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__.pad_to_multiple_of) is your friend!
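For instance, a minimal sketch of that tip (the multiple of 64 is an arbitrary choice, and the Gemma tokenizer is assumed only because it already defines a padding token):
```py
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
tokenizer.padding_side = "left"  # decoder-only models are left-padded for generation

prompts = ["The theory of special relativity states ", "Fun fact: The shortest"]
# Pad every batch up to a multiple of 64 so the compiled forward only ever sees a few input shapes
batch = tokenizer(prompts, padding=True, pad_to_multiple_of=64, return_tensors="pt")
print(batch["input_ids"].shape)  # torch.Size([2, 64])
```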
<hfoptions id="static-kv">
<hfoption id="basic usage: generation_config">
For this example, let's use the [Gemma](https://hf.co/google/gemma-2b) model. All we need to do is to:
1. Access the model's `generation_config` attribute and set the `cache_implementation` to "static";
2. Call `torch.compile` on the model to compile the forward pass with the static kv-cache.
And that's it!
```py
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" # To prevent long warnings :)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained(
"google/gemma-2b", device_map="auto"
)
```
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto")
There are two ways you can configure the model to use a static kv-cache. For a 7B model on an A100, both methods get a 4x speed up in the forward pass. Your speed up may vary depending on the model size (larger models have a smaller speed up) and hardware. If you're using the [`~GenerationMixin.generate`] method, the speed up is ~3x. The forward pass (which still gets 4x speed up) is only a part of the whole [`~GenerationMixin.generate`] code.
<hfoptions id="static-kv">
<hfoption id="generation_config">
Access the model's `generation_config` attribute and set the `cache_implementation` to "static".
```py
model.generation_config.cache_implementation = "static"
```
Call torch.compile on the model to compile the forward pass with the static kv-cache.
```py
compiled_model = torch.compile(model, mode="reduce-overhead", fullgraph=True)
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
input_text = "The theory of special relativity states "
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = compiled_model.generate(**input_ids)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
outputs = model.generate(**input_ids)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
['The theory of special relativity states 1. The speed of light is constant in all inertial reference']
```
Under the hood, `generate` will attempt to reuse the same cache object, removing the need for re-compilation at each call. However, if the batch size or the maximum output length increase between calls, the cache will have to be reinitialized, triggering a new compilation.
Under the hood, `generate` will attempt to reuse the same cache object, removing the need for re-compilation at each call. Avoiding re-compilation is critical to get the most out of `torch.compile`, and you should be aware of the following:
1. If the batch size changes or the maximum output length increases between calls, the cache will have to be reinitialized, triggering a new compilation;
2. The first couple of calls of the compiled function are slower, as the function is being compiled.
> [!WARNING]
> For a more advanced usage of the static cache, such as multi-turn conversations, we recommend instantiating and manipulating the cache object outside [`~GenerationMixin.generate`]. See the advanced usage tab.
</hfoption>
<hfoption id="Static Cache">
<hfoption id="advanced usage: control Static Cache">
A [`StaticCache`] object can be passed to the model's forward pass under the `past_key_values` argument, enabling the use of this object as a static kv-cache. Using this strategy, you can write your own function to decode the next token given the current token and position and cache position of previously generated tokens. You can also pass the [`StaticCache`] object to [`~GenerationMixin.generate`] and use it across calls, like you would do with a dynamic cache.
A [`StaticCache`] object can be passed to the model's [`~GenerationMixin.generate`] under the `past_key_values` argument. The object will retain the cache contents, so you can pass it to a new [`~GenerationMixin.generate`] call to continue generation, like you would do with a dynamic cache.
```py
from transformers import AutoTokenizer, AutoModelForCausalLM, StaticCache
import torch
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" # To prevent long warnings :)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto")
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
input_text = "The theory of special relativity states "
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
prompt_length = input_ids.input_ids.shape[1]
model.generation_config.max_new_tokens = 16
past_key_values = StaticCache(
config=model.config,
max_batch_size=1,
# If you plan to reuse the cache, make sure the cache length is large enough for all cases
max_cache_len=prompt_length+(model.generation_config.max_new_tokens*2),
device=model.device,
dtype=model.dtype
)
outputs = model.generate(**input_ids, past_key_values=past_key_values)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
['The theory of special relativity states 1. The speed of light is constant in all inertial reference frames. 2']
# pass in the generated text and the same cache object to continue generation from where it left off. Optionally, in a
# multi-turn conversation, append the new user input to the generated text.
new_input_ids = outputs
outputs = model.generate(new_input_ids, past_key_values=past_key_values)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
['The theory of special relativity states 1. The speed of light is constant in all inertial reference frames. 2. The speed of light is constant in all inertial reference frames. 3.']
```
> [!TIP]
> If you want to reuse the same [`StaticCache`] object on a new prompt, be sure to reset its contents with the `.reset()` method between calls
If you want to go further down a level, the [`StaticCache`] object can also be passed to the model's forward pass under the same `past_key_values` argument. Using this strategy, you can write your own function to decode the next token given the current token and position and cache position of previously generated tokens.
```py
from transformers import LlamaTokenizer, LlamaForCausalLM, StaticCache, logging
@ -102,12 +152,9 @@ def decode_one_tokens(model, cur_token, input_pos, cache_position, past_key_valu
return new_token
```
There are a few important things you must do to enable static kv-cache and torch.compile with the `StaticCache` method:
There are a few important things you must do to enable static kv-cache and `torch.compile` with the `StaticCache` method:
1. Initialize the [`StaticCache`] instance before using the model for inference. There you can configure parameters like the maximum batch size and sequence length.
2. Call torch.compile on the model to compile the forward pass with the static kv-cache.
2. Call `torch.compile` on the model to compile the forward pass with the static kv-cache.
3. Set `enable_math=True` in the [torch.backends.cuda.sdp_kernel](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) context manager to enable the native PyTorch C++ implementation of scaled dot product attention to speed up inference even more.
```py
@ -142,8 +189,34 @@ text
'My favorite all time favorite condiment is ketchup. I love it on everything. I love it on my eggs, my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my p']
```
> [!TIP]
> If you want to reuse the [`StaticCache`] object on a new prompt, be sure to reset its contents with the `.reset()` method
</hfoption>
<hfoption id="advanced usage: end-to-end generate compilation">
Compiling the entire `generate` function, in terms of code, is even simpler than in the basic usage: call `torch.compile` on `generate` to compile the entire function. No need to specify the use of the static cache: although it is compatible, dynamic cache (default) was faster in our benchmarks.
```py
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false" # To prevent long warnings :)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto")
model.generate = torch.compile(model.generate, mode="reduce-overhead", fullgraph=True)
input_text = "The theory of special relativity states "
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = model.generate(**input_ids)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
['The theory of special relativity states 1. The speed of light is constant in all inertial reference']
```
As a result, we compile not only the model forward pass, but also all input preparation, logit processor operations, and so on. The result should be a slightly faster `generate` call, compared to the basic usage example, and the compiled graph may be better suited to more exotic hardware devices or use cases. However, there are severe drawbacks to using this approach:
1. Compilation is much slower;
2. All parameterization of `generate` must be done through `generation_config`;
3. Many warnings and exceptions are suppressed -- we suggest testing with its uncompiled form first;
4. Although we are working on it, it is heavily feature restricted (for instance, at the time of writing, generation does not stop if an EOS token is selected).
</hfoption>
</hfoptions>


@ -72,6 +72,10 @@ We provide two types of agents, based on the main [`Agent`] class:
[[autodoc]] launch_gradio_demo
### stream_to_gradio
[[autodoc]] stream_to_gradio
### ToolCollection
[[autodoc]] ToolCollection


@ -66,3 +66,8 @@ Examples of use can be found in the [example scripts](../examples) or [example n
- numpy_mask_tokens
- tf_mask_tokens
- torch_mask_tokens
## DataCollatorWithFlattening
[[autodoc]] data.data_collator.DataCollatorWithFlattening


@ -41,33 +41,40 @@ The original code can be found [here](https://github.com/IDEA-Research/Grounding
Here's how to use the model for zero-shot object detection:
```python
import requests
>>> import requests
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection,
>>> import torch
>>> from PIL import Image
>>> from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
model_id = "IDEA-Research/grounding-dino-tiny"
>>> model_id = "IDEA-Research/grounding-dino-tiny"
>>> device = "cuda"
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
>>> processor = AutoProcessor.from_pretrained(model_id)
>>> model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(image_url, stream=True).raw)
# Check for cats and remote controls
text = "a cat. a remote control."
>>> image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(image_url, stream=True).raw)
>>> # Check for cats and remote controls
>>> text = "a cat. a remote control."
inputs = processor(images=image, text=text, return_tensors="pt").to(device)
with torch.no_grad():
outputs = model(**inputs)
>>> inputs = processor(images=image, text=text, return_tensors="pt").to(device)
>>> with torch.no_grad():
... outputs = model(**inputs)
results = processor.post_process_grounded_object_detection(
outputs,
inputs.input_ids,
box_threshold=0.4,
text_threshold=0.3,
target_sizes=[image.size[::-1]]
)
>>> results = processor.post_process_grounded_object_detection(
... outputs,
... inputs.input_ids,
... box_threshold=0.4,
... text_threshold=0.3,
... target_sizes=[image.size[::-1]]
... )
>>> print(results)
[{'boxes': tensor([[344.6959, 23.1090, 637.1833, 374.2751],
[ 12.2666, 51.9145, 316.8582, 472.4392],
[ 38.5742, 70.0015, 176.7838, 118.1806]], device='cuda:0'),
'labels': ['a cat', 'a cat', 'a remote control'],
'scores': tensor([0.4785, 0.4381, 0.4776], device='cuda:0')}]
```
## Grounded SAM


@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.
## Overview
Qwen2 is the new model series of large language models from the Qwen team. Previously, we released the Qwen series, including Qwen-72B, Qwen-1.8B, Qwen-VL, Qwen-Audio, etc.
Qwen2 is the new model series of large language models from the Qwen team. Previously, we released the Qwen series, including Qwen2-0.5B, Qwen2-1.5B, Qwen2-7B, Qwen2-57B-A14B, Qwen2-72B, Qwen2-Audio, etc.
### Model Details
@ -27,16 +27,16 @@ Qwen2 is a language model series including decoder language models of different
## Usage tips
`Qwen2-7B-beta` and `Qwen2-7B-Chat-beta` can be found on the [Huggingface Hub](https://huggingface.co/Qwen)
`Qwen2-7B` and `Qwen2-7B-Instruct` can be found on the [Huggingface Hub](https://huggingface.co/Qwen)
In the following, we demonstrate how to use `Qwen2-7B-Chat-beta` for the inference. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose.
In the following, we demonstrate how to use `Qwen2-7B-Instruct` for the inference. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose.
```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> device = "cuda" # the device to load the model onto
>>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-7B-Chat", device_map="auto")
>>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B-Chat")
>>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-7B-Instruct", device_map="auto")
>>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")
>>> prompt = "Give me a short introduction to large language model."


@ -72,7 +72,7 @@ Here is a step-by-step guide to transcribing an audio sample using a pre-trained
' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
```
Whisper is compatible with the following optimisations:
Whisper is compatible with the following optimisations for both short and long-form generation:
- [PyTorch Scaled Dot Product Attention (SDPA)](../perf_infer_gpu_one#pytorch-scaled-dot-product-attention): flash attention and memory-efficient attention kernels. Enabled by default for `torch>=2.1.1`.
- [Flash Attention 2](../perf_infer_gpu_one#flashattention-2): improved implementation of flash attention through better parallelism and work partitioning.
- [torch.compile](../llm_optims#static-kv-cache-and-torchcompile): JIT-compile the forward pass to dispatch to efficient fused kernels.
@ -101,7 +101,8 @@ As an example, the following codesnippet enables SDPA and `torch.compile` for up
... ).input_features
>>> # Compile the forward pass
>>> _ = model.generate(input_features)
>>> for _ in range(2):
>>> model.generate(input_features)
>>> # Generate token ids using compiled graph (fast!)
>>> predicted_ids = model.generate(input_features)


@ -77,7 +77,7 @@ Then use `notebook_login` to sign-in to the Hub, and follow the link [here](http
To ensure your model can be used by someone working with a different framework, we recommend you convert and upload your model with both PyTorch and TensorFlow checkpoints. While users are still able to load your model from a different framework if you skip this step, it will be slower because 🤗 Transformers will need to convert the checkpoint on-the-fly.
Converting a checkpoint for another framework is easy. Make sure you have PyTorch and TensorFlow installed (see [here](installation) for installation instructions), and then find the specific model for your task in the other framework.
Converting a checkpoint for another framework is easy. Make sure you have PyTorch and TensorFlow installed (see [here](installation) for installation instructions), and then find the specific model for your task in the other framework.
<frameworkcontent>
<pt>


@ -98,7 +98,7 @@ Below you can find the list of the models we benchmarked.
- [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224)
- [microsoft/beit-base-patch16-224-pt22k-ft22k](https://huggingface.co/microsoft/beit-base-patch16-224-pt22k-ft22k)
- [facebook/convnext-large-224](https://huggingface.co/facebook/convnext-large-224)
- [microsoft/resnet-50](https://huggingface.co/)
- [microsoft/resnet-50](https://huggingface.co/microsoft/resnet-50)
**Image Segmentation**
- [nvidia/segformer-b0-finetuned-ade-512-512](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512)


@ -157,7 +157,7 @@ Execution time -- 79.0 ms
Execution time -- 78.9 ms
```
The first call to `xla_generate()` is time-consuming because of tracing, but the successive calls are orders of magnitude faster. Keep in mind that any change in the generation options at any point with trigger re-tracing and thus leading to slow-downs in the generation time.
The first call to `xla_generate()` is time-consuming because of tracing, but the successive calls are orders of magnitude faster. Keep in mind that any change in the generation options at any point will trigger re-tracing and thus leading to slow-downs in the generation time.
We didn't cover all the text generation options 🤗 Transformers provides in this document. We encourage you to read the documentation for advanced use cases.
@ -171,4 +171,4 @@ Here, we leave you with some additional resources if you want to delve deeper in
* Recommended posts for learning more about XLA and TensorFlow graphs in general:
* [XLA: Optimizing Compiler for Machine Learning](https://www.tensorflow.org/xla)
* [Introduction to graphs and tf.function](https://www.tensorflow.org/guide/intro_to_graphs)
* [Better performance with tf.function](https://www.tensorflow.org/guide/function)
* [Better performance with tf.function](https://www.tensorflow.org/guide/function)


@ -278,7 +278,7 @@ args = TrainingArguments(
max_steps=100,
per_device_train_batch_size=2,
optim="galore_adamw",
optim_target_modules=["attn", "mlp"]
optim_target_modules=[r".*.attn.*", r".*.mlp.*"]
)
model_id = "google/gemma-2b"
@ -315,7 +315,7 @@ args = TrainingArguments(
max_steps=100,
per_device_train_batch_size=2,
optim="galore_adamw",
optim_target_modules=["attn", "mlp"],
optim_target_modules=[r".*.attn.*", r".*.mlp.*"],
optim_args="rank=64, update_proj_gap=100, scale=0.10",
)
@ -359,7 +359,7 @@ args = TrainingArguments(
max_steps=100,
per_device_train_batch_size=2,
optim="galore_adamw_layerwise",
optim_target_modules=["attn", "mlp"]
optim_target_modules=[r".*.attn.*", r".*.mlp.*"]
)
model_id = "google/gemma-2b"


@ -220,7 +220,7 @@ La plantilla de chat para un modelo se almacena en el atributo `tokenizer.chat_t
>>> from transformers import AutoTokenizer
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
>>> tokenizer.default_chat_template
>>> tokenizer.chat_template
"{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}"
```
@ -307,12 +307,6 @@ Si estás ajustando finamente un modelo para chat, además de establecer una pla
</Tip>
### What are "default" templates?
Before chat templates were introduced, chat handling was hardcoded at the model class level. For backwards compatibility, we have kept this class-specific handling as default templates, also set at the class level. If a model does not have a chat template set but there is a default template for its model class, the `TextGenerationPipeline` class and methods like `apply_chat_template` will use the class template instead. You can find out what the default template for your tokenizer is by checking the `tokenizer.default_chat_template` attribute.
We do this purely for backwards compatibility reasons, to avoid breaking any existing workflows. Even when the class template is appropriate for your model, we strongly recommend overriding the default template by explicitly setting the `chat_template` attribute, to make it clear to users that your model has been configured correctly for chat, and to be future-proof in case the default templates are ever changed or removed.
### What template should I use?
When setting the template for a model that has already been trained for chat, you should make sure the template matches exactly the message format the model saw during training, or you will likely experience performance degradation. This is true even if you keep training the model; you will probably get the best performance if you keep the chat tokens constant. This is very analogous to tokenization: you generally get the best performance for inference or fine-tuning when you precisely match the tokenization used during training.


@ -85,7 +85,7 @@ LLMLanguage Modelのますます一般的な使用事例の1つは「チ
>>> from transformers import AutoTokenizer
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
>>> tokenizer.default_chat_template
>>> tokenizer.chat_template
"{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}"
```

View File

@ -78,6 +78,8 @@
title: How to add a pipeline to 🤗 Transformers
title: Contribute
- sections:
- local: philosophy
title: The philosophy of Transformers
- local: task_summary
title: What 🤗 Transformers can do
- local: tokenizer_summary

View File

@ -228,7 +228,7 @@ The sun.</s>
>>> from transformers import AutoTokenizer
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")
>>> tokenizer.default_chat_template
>>> tokenizer.chat_template
"{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ ' ' }}{% endif %}{% endfor %}{{ eos_token }}"
```

View File

@ -0,0 +1,67 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# The philosophy of Transformers
🤗 Transformers is a library built for:
- machine learning researchers and educators seeking to use, study, or extend large-scale Transformers models.
- hands-on practitioners who want to fine-tune those models or serve them in production, or both.
- engineers who just want to download a pretrained model and use it to solve a given machine learning task.
The library was designed with two main goals:
1. Be as easy and fast to use as possible:
- We strongly limit the number of user-facing abstractions to learn; in fact, there are almost none. Only three standard classes are required to use each model: [configuration](main_classes/configuration), [models](main_classes/model), and a preprocessing class ([tokenizer](main_classes/tokenizer) for NLP, [image processor](main_classes/image_processor) for vision, [feature extractor](main_classes/feature_extractor) for audio, and [processor](main_classes/processors) for multimodal inputs).
- All of these classes can be initialized in a simple and unified way from pretrained instances with a common `from_pretrained()` method, which downloads (if needed), caches, and loads the related class instance and associated data (configuration hyperparameters, tokenizer vocabulary, and model weights) from a pretrained checkpoint provided on the [Hugging Face Hub](https://huggingface.co/models).
- On top of those three base classes, the library provides two APIs: [`pipeline`] for quickly using a model for inference on a given task, and [`Trainer`] for quickly training or fine-tuning a PyTorch model (all TensorFlow models are compatible with `Keras.fit`).
- As a consequence, Transformers is not a modular toolbox of building blocks for neural networks. If you want to extend or build on top of Transformers, use regular Python, PyTorch, TensorFlow, or Keras modules and inherit from the library's base classes to reuse functionality such as model loading and saving. If you'd like to learn more about our coding philosophy for models, check out our [Repeat Yourself](https://huggingface.co/blog/transformers-design-philosophy) blog post.
2. Provide state-of-the-art models whose performance is as close as possible to the original models:
- We provide at least one example for each architecture that reproduces a result published by the official authors of that architecture.
- The code is usually as close to the original code base as possible, which means some PyTorch code may not be as *pytorchic* as it could be because it was converted from TensorFlow code, and vice versa.
A few other goals:
- Expose the models' internals as consistently as possible:
- We give access to the full hidden states and attention weights using a single API.
- The preprocessing classes and base model APIs are standardized so it is easy to switch between models.
- Incorporate a subjective selection of promising tools for fine-tuning and investigating models:
- A simple and consistent way to add new tokens to the vocabulary and embeddings for fine-tuning.
- Simple ways to mask and prune Transformer heads.
- Easily switch between PyTorch, TensorFlow 2.0, and Flax, allowing training with one framework and inference with another.
## Main concepts
The library is built around three types of classes for each model:
- **Model classes** can be PyTorch models ([torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)), Keras models ([tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model)), or JAX/Flax models ([flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html)) that work with the pretrained weights provided in the library.
- **Configuration classes** store the hyperparameters required to build a model (such as the number of layers and the hidden size). If you use a pretrained model without any modification, creating the model automatically takes care of instantiating the configuration (which is part of the model).
- **Preprocessing classes** convert the raw data into a format accepted by the model. A [tokenizer](main_classes/tokenizer) stores the vocabulary for each model and provides methods for encoding and decoding strings into lists of token embedding indices to be fed to the model. [Image processors](main_classes/image_processor) preprocess vision inputs, [feature extractors](main_classes/feature_extractor) preprocess audio inputs, and a [processor](main_classes/processors) handles multimodal inputs.
All these classes can be instantiated from pretrained instances, saved locally, and shared on the Hub with three methods:
- `from_pretrained()` lets you instantiate a model, configuration, or preprocessing class from a pretrained version either provided by the library itself (the supported models can be found on the [Model Hub](https://huggingface.co/models)) or stored locally (or on a server) by the user.
- `save_pretrained()` lets you save a model, configuration, or preprocessing class locally so that it can be reloaded with `from_pretrained()`.
- `push_to_hub()` lets you share a model, configuration, or preprocessing class on the Hub, so it is easily accessible to everyone.
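A short sketch of the three-method workflow described above; the checkpoint name and local directory are illustrative only:

```python
from transformers import AutoModel, AutoTokenizer

checkpoint = "bert-base-uncased"

# Download (if needed), cache, and load the pretrained artifacts.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModel.from_pretrained(checkpoint)

# Save locally so the same objects can be reloaded with from_pretrained().
model.save_pretrained("./my-local-bert")
tokenizer.save_pretrained("./my-local-bert")
reloaded = AutoModel.from_pretrained("./my-local-bert")

# Sharing on the Hub (requires authentication) would look like:
# model.push_to_hub("my-username/my-local-bert")
```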

View File

@ -61,7 +61,7 @@ from transformers.utils import check_min_version, send_example_telemetry
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
Array = Any
Dataset = datasets.arrow_dataset.Dataset

View File

@ -60,7 +60,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt")

View File

@ -56,7 +56,7 @@ from transformers.utils import check_min_version, send_example_telemetry
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
Array = Any
Dataset = datasets.arrow_dataset.Dataset

View File

@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

View File

@ -45,7 +45,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt")

View File

@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt")

View File

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")

View File

@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
logger = get_logger(__name__)

View File

@ -43,7 +43,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

View File

@ -48,7 +48,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

View File

@ -53,7 +53,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

View File

@ -46,7 +46,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")

View File

@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")

View File

@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
logger = get_logger(__name__)

View File

@ -58,7 +58,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -60,7 +60,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
logger = get_logger(__name__)

View File

@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
logger = get_logger(__name__)
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

View File

@ -47,7 +47,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
logger = logging.getLogger(__name__)

View File

@ -56,7 +56,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
logger = get_logger(__name__)
# You should update this to your particular problem to have better documentation of `model_type`

View File

@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt")

View File

@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
logging.basicConfig(level=logging.INFO)
logger = get_logger(__name__)

View File

@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -46,7 +46,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

View File

@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt")

View File

@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
logger = get_logger(__name__)

View File

@ -50,7 +50,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

View File

@ -53,7 +53,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

View File

@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

View File

@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
logger = get_logger(__name__)
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@ -47,7 +47,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

View File

@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

View File

@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
logger = get_logger(__name__)

View File

@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

View File

@ -49,7 +49,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

View File

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
logger = get_logger(__name__)
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

View File

@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")

View File

@ -57,7 +57,7 @@ from transformers.utils.versions import require_version
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
logger = get_logger(__name__)
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")

View File

@ -557,7 +557,7 @@ class MultiHeadedAttention(nn.Module):
return context
class DecoderState(object):
class DecoderState:
"""Interface for grouping together the current state of a recurrent
decoder. In the simplest case just represents the hidden state of
the model. But can also be used for implementing various forms of
@ -694,7 +694,7 @@ def build_predictor(args, tokenizer, symbols, model, logger=None):
return translator
class GNMTGlobalScorer(object):
class GNMTGlobalScorer:
"""
NMT re-ranking score from
"Google's Neural Machine Translation System" :cite:`wu2016google`
@ -717,7 +717,7 @@ class GNMTGlobalScorer(object):
return normalized_probs
class PenaltyBuilder(object):
class PenaltyBuilder:
"""
Returns the Length and Coverage Penalty function for Beam Search.
@ -763,7 +763,7 @@ class PenaltyBuilder(object):
return logprobs
class Translator(object):
class Translator:
"""
Uses a model to translate a batch of sentences.
@ -1002,7 +1002,7 @@ def tile(x, count, dim=0):
#
class BertSumOptimizer(object):
class BertSumOptimizer:
"""Specific optimizer for BertSum.
As described in [1], the authors fine-tune BertSum for abstractive

View File

@ -3,7 +3,7 @@ import torch
from transformers import AutoTokenizer
class FSNERTokenizerUtils(object):
class FSNERTokenizerUtils:
def __init__(self, pretrained_model_name_or_path):
self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)

View File

@ -417,7 +417,7 @@ class ShapeSpec(namedtuple("_ShapeSpec", ["channels", "height", "width", "stride
return super().__new__(cls, channels, height, width, stride)
class Box2BoxTransform(object):
class Box2BoxTransform:
"""
This R-CNN transformation scales the box's width and height
by exp(dw), exp(dh) and shifts a box's center by the offset
@ -519,7 +519,7 @@ class Box2BoxTransform(object):
return pred_boxes
class Matcher(object):
class Matcher:
"""
This class assigns to each predicted "element" (e.g., a box) a ground-truth
element. Each predicted element will have exactly zero or one matches; each
@ -622,7 +622,7 @@ class Matcher(object):
match_labels[pred_inds_with_highest_quality] = 1
class RPNOutputs(object):
class RPNOutputs:
def __init__(
self,
box2box_transform,
@ -1132,7 +1132,7 @@ class ROIPooler(nn.Module):
return output
class ROIOutputs(object):
class ROIOutputs:
def __init__(self, cfg, training=False):
self.smooth_l1_beta = cfg.ROI_BOX_HEAD.SMOOTH_L1_BETA
self.box2box_transform = Box2BoxTransform(weights=cfg.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)

View File

@ -108,7 +108,7 @@ class TopKBinarizer(autograd.Function):
return gradOutput, None
class MagnitudeBinarizer(object):
class MagnitudeBinarizer:
"""
Magnitude Binarizer.
Computes a binary mask M from a real value matrix S such that `M_{i,j} = 1` if and only if `S_{i,j}`

View File

@ -98,7 +98,7 @@ def regularization(model: nn.Module, mode: str):
elif mode == "l0":
regu += torch.sigmoid(param - 2 / 3 * np.log(0.1 / 1.1)).sum() / param.numel()
else:
ValueError("Don't know this mode.")
raise ValueError("Don't know this mode.")
counter += 1
return regu / counter

View File

@ -101,7 +101,7 @@ def regularization(model: nn.Module, mode: str):
elif mode == "l0":
regu += torch.sigmoid(param - 2 / 3 * np.log(0.1 / 1.1)).sum() / param.numel()
else:
ValueError("Don't know this mode.")
raise ValueError("Don't know this mode.")
counter += 1
return regu / counter

View File

@ -284,7 +284,7 @@ def make_fast_generalized_attention(
return attention_fn
class RandomMatrix(object):
class RandomMatrix:
r"""
Abstract class providing a method for constructing 2D random arrays. Class is responsible for constructing 2D
random arrays.
@ -348,7 +348,7 @@ class GaussianOrthogonalRandomMatrix(RandomMatrix):
return jnp.matmul(jnp.diag(multiplier), final_matrix)
class FastAttention(object):
class FastAttention:
r"""
Abstract class providing a method for fast attention. Class is responsible for providing a method
<dot_product_attention> for fast approximate attention.

View File

@ -418,7 +418,7 @@ class TestTheRest(TestCasePlus):
with CaptureStdout() as cs:
args = parser.parse_args(args)
assert False, "--help is expected to sys.exit"
assert excinfo.type == SystemExit
assert excinfo.type is SystemExit
expected = lightning_base.arg_to_scheduler_metavar
assert expected in cs.out, "--help is expected to list the supported schedulers"
@ -429,7 +429,7 @@ class TestTheRest(TestCasePlus):
with CaptureStderr() as cs:
args = parser.parse_args(args)
assert False, "invalid argument is expected to sys.exit"
assert excinfo.type == SystemExit
assert excinfo.type is SystemExit
expected = f"invalid choice: '{unsupported_param}'"
assert expected in cs.err, f"should have bailed on invalid choice of scheduler {unsupported_param}"

View File

@ -417,7 +417,7 @@ class ShapeSpec(namedtuple("_ShapeSpec", ["channels", "height", "width", "stride
return super().__new__(cls, channels, height, width, stride)
class Box2BoxTransform(object):
class Box2BoxTransform:
"""
This R-CNN transformation scales the box's width and height
by exp(dw), exp(dh) and shifts a box's center by the offset
@ -519,7 +519,7 @@ class Box2BoxTransform(object):
return pred_boxes
class Matcher(object):
class Matcher:
"""
This class assigns to each predicted "element" (e.g., a box) a ground-truth
element. Each predicted element will have exactly zero or one matches; each
@ -622,7 +622,7 @@ class Matcher(object):
match_labels[pred_inds_with_highest_quality] = 1
class RPNOutputs(object):
class RPNOutputs:
def __init__(
self,
box2box_transform,
@ -1132,7 +1132,7 @@ class ROIPooler(nn.Module):
return output
class ROIOutputs(object):
class ROIOutputs:
def __init__(self, cfg, training=False):
self.smooth_l1_beta = cfg.ROI_BOX_HEAD.SMOOTH_L1_BETA
self.box2box_transform = Box2BoxTransform(weights=cfg.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)

View File

@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version(
"datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt"

View File

@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")

View File

@ -50,7 +50,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
logger = logging.getLogger(__name__)

View File

@ -62,7 +62,7 @@ except (ModuleNotFoundError, ImportError):
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
logger = logging.getLogger(__name__)

View File

@ -53,7 +53,7 @@ from transformers.utils.versions import require_version
# region Checking dependencies
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@ -47,7 +47,7 @@ from transformers.utils import check_min_version, send_example_telemetry
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
task_to_keys = {
"cola": ("sentence", None),

View File

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
# region Dependencies and constants
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.43.0.dev0")
check_min_version("4.44.0.dev0")
require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

View File

@ -147,7 +147,7 @@ def get_original_command(max_width=80, full_python_path=False):
Return the original command line string that can be replayed nicely and wrapped for 80 char width.
Args:
max_width (`int`, `optional`, defaults to 80):
max_width (`int`, *optional*, defaults to 80):
The width to wrap for.
full_python_path (`bool`, `optional`, defaults to `False`):
Whether to replicate the full path or just the last segment (i.e. `python`).

View File

@ -157,7 +157,7 @@ _deps = [
"rhoknp>=1.1.0,<1.3.1",
"rjieba",
"rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1",
"ruff==0.4.4",
"ruff==0.5.1",
"sacrebleu>=1.4.12,<2.0.0",
"sacremoses",
"safetensors>=0.4.1",
@ -430,7 +430,7 @@ install_requires = [
setup(
name="transformers",
version="4.43.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
version="4.44.0.dev0", # expected format is one of x.y.z.dev0, or x.y.z.rc1 or x.y.z (no to dashes, yes to dots)
author="The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)",
author_email="transformers@huggingface.co",
description="State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow",

View File

@ -18,7 +18,7 @@
# to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
# in the namespace without actually importing anything (and especially none of the backends).
__version__ = "4.43.0.dev0"
__version__ = "4.44.0.dev0"
from typing import TYPE_CHECKING
@ -67,6 +67,7 @@ _import_structure = {
"ToolCollection",
"launch_gradio_demo",
"load_tool",
"stream_to_gradio",
],
"audio_utils": [],
"benchmark": [],
@ -103,6 +104,7 @@ _import_structure = {
"DataCollatorForSOP",
"DataCollatorForTokenClassification",
"DataCollatorForWholeWordMask",
"DataCollatorWithFlattening",
"DataCollatorWithPadding",
"DefaultDataCollator",
"default_data_collator",
@ -4732,6 +4734,7 @@ if TYPE_CHECKING:
ToolCollection,
launch_gradio_demo,
load_tool,
stream_to_gradio,
)
from .configuration_utils import PretrainedConfig
@ -4764,6 +4767,7 @@ if TYPE_CHECKING:
DataCollatorForSOP,
DataCollatorForTokenClassification,
DataCollatorForWholeWordMask,
DataCollatorWithFlattening,
DataCollatorWithPadding,
DefaultDataCollator,
default_data_collator,

View File

@ -26,6 +26,7 @@ from ..utils import (
_import_structure = {
"agents": ["Agent", "CodeAgent", "ReactAgent", "ReactCodeAgent", "ReactJsonAgent", "Toolbox"],
"llm_engine": ["HfEngine"],
"monitoring": ["stream_to_gradio"],
"tools": ["PipelineTool", "Tool", "ToolCollection", "launch_gradio_demo", "load_tool"],
}
@ -45,6 +46,7 @@ else:
if TYPE_CHECKING:
from .agents import Agent, CodeAgent, ReactAgent, ReactCodeAgent, ReactJsonAgent, Toolbox
from .llm_engine import HfEngine
from .monitoring import stream_to_gradio
from .tools import PipelineTool, Tool, ToolCollection, launch_gradio_demo, load_tool
try:

View File

@ -17,7 +17,7 @@
import json
import logging
import re
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union
from .. import is_torch_available
from ..utils import logging as transformers_logging
@ -30,13 +30,12 @@ from .prompts import (
DEFAULT_REACT_CODE_SYSTEM_PROMPT,
DEFAULT_REACT_JSON_SYSTEM_PROMPT,
PLAN_UPDATE_FINAL_PLAN_REDACTION,
PROMPTS_FOR_INITIAL_PLAN,
PROMPTS_FOR_PLAN_UPDATE,
SUPPORTED_PLAN_TYPES,
SYSTEM_PROMPT_FACTS,
SYSTEM_PROMPT_FACTS_UPDATE,
SYSTEM_PROMPT_PLAN,
SYSTEM_PROMPT_PLAN_UPDATE,
USER_PROMPT_FACTS_UPDATE,
USER_PROMPT_PLAN,
USER_PROMPT_PLAN_UPDATE,
)
from .python_interpreter import LIST_SAFE_MODULES, evaluate_python_code
from .tools import (
@ -653,9 +652,11 @@ class ReactAgent(Agent):
llm_engine: Callable = HfEngine(),
system_prompt: str = DEFAULT_REACT_CODE_SYSTEM_PROMPT,
tool_description_template: str = DEFAULT_TOOL_DESCRIPTION_TEMPLATE,
plan_type: Literal[tuple(SUPPORTED_PLAN_TYPES)] = SUPPORTED_PLAN_TYPES[0],
planning_interval: Optional[int] = None,
**kwargs,
):
assert plan_type in SUPPORTED_PLAN_TYPES, f"plan type {plan_type} is not supported"
super().__init__(
tools=tools,
llm_engine=llm_engine,
@ -664,6 +665,7 @@ class ReactAgent(Agent):
**kwargs,
)
self.planning_interval = planning_interval
self.plan_type = plan_type
def provide_final_answer(self, task) -> str:
"""
@ -794,10 +796,13 @@ Now begin!""",
answer_facts = self.llm_engine([message_prompt_facts, message_prompt_task])
message_system_prompt_plan = {"role": MessageRole.SYSTEM, "content": SYSTEM_PROMPT_PLAN}
message_system_prompt_plan = {
"role": MessageRole.SYSTEM,
"content": PROMPTS_FOR_INITIAL_PLAN[self.plan_type]["system"],
}
message_user_prompt_plan = {
"role": MessageRole.USER,
"content": USER_PROMPT_PLAN.format(
"content": PROMPTS_FOR_INITIAL_PLAN[self.plan_type]["user"].format(
task=task,
tool_descriptions=self._toolbox.show_tool_descriptions(self.tool_description_template),
answer_facts=answer_facts,
@ -837,11 +842,11 @@ Now begin!""",
# Redact updated plan
plan_update_message = {
"role": MessageRole.SYSTEM,
"content": SYSTEM_PROMPT_PLAN_UPDATE.format(task=task),
"content": PROMPTS_FOR_PLAN_UPDATE[self.plan_type]["system"].format(task=task),
}
plan_update_message_user = {
"role": MessageRole.USER,
"content": USER_PROMPT_PLAN_UPDATE.format(
"content": PROMPTS_FOR_PLAN_UPDATE[self.plan_type]["user"].format(
task=task,
tool_descriptions=self._toolbox.show_tool_descriptions(self.tool_description_template),
facts_update=facts_update,
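A hypothetical usage sketch for the new `plan_type` argument, assuming `ReactCodeAgent` forwards extra keyword arguments to `ReactAgent` as in the hunks above; the engine, empty toolbox, and task are placeholders:

```python
from transformers.agents import HfEngine, ReactCodeAgent

agent = ReactCodeAgent(
    tools=[],                # illustrative: an empty toolbox
    llm_engine=HfEngine(),
    plan_type="structured",  # one of SUPPORTED_PLAN_TYPES ("default" or "structured")
    planning_interval=3,     # re-plan every 3 steps
)
agent.run("How many golf balls fit into a Boeing-747?")
```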

View File

@ -113,7 +113,7 @@ class Problem:
The inputs that will be fed to the tools. For this testing environment, only strings are accepted as
values. Pass along a dictionary when you want to specify the values of each inputs, or just the list of
inputs expected (the value used will be `<<input_name>>` in this case).
answer (`str` or `list[str`]):
answer (`str` or `list[str]`):
The theoretical answer (or list of possible valid answers) to the problem, as code.
"""

View File

@ -0,0 +1,75 @@
#!/usr/bin/env python
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .agent_types import AgentAudio, AgentImage, AgentText
from .agents import ReactAgent
def pull_message(step_log: dict):
try:
from gradio import ChatMessage
except ImportError:
raise ImportError("Gradio should be installed in order to launch a gradio demo.")
if step_log.get("rationale"):
yield ChatMessage(role="assistant", content=step_log["rationale"])
if step_log.get("tool_call"):
used_code = step_log["tool_call"]["tool_name"] == "code interpreter"
content = step_log["tool_call"]["tool_arguments"]
if used_code:
content = f"```py\n{content}\n```"
yield ChatMessage(
role="assistant",
metadata={"title": f"🛠️ Used tool {step_log['tool_call']['tool_name']}"},
content=str(content),
)
if step_log.get("observation"):
yield ChatMessage(role="assistant", content=f"```\n{step_log['observation']}\n```")
if step_log.get("error"):
yield ChatMessage(
role="assistant",
content=str(step_log["error"]),
metadata={"title": "💥 Error"},
)
def stream_to_gradio(agent: ReactAgent, task: str, **kwargs):
"""Runs an agent with the given task and streams the messages from the agent as gradio ChatMessages."""
try:
from gradio import ChatMessage
except ImportError:
raise ImportError("Gradio should be installed in order to launch a gradio demo.")
for step_log in agent.run(task, stream=True, **kwargs):
if isinstance(step_log, dict):
for message in pull_message(step_log):
yield message
if isinstance(step_log, AgentText):
yield ChatMessage(role="assistant", content=f"**Final answer:**\n```\n{step_log.to_string()}\n```")
elif isinstance(step_log, AgentImage):
yield ChatMessage(
role="assistant",
content={"path": step_log.to_string(), "mime_type": "image/png"},
)
elif isinstance(step_log, AgentAudio):
yield ChatMessage(
role="assistant",
content={"path": step_log.to_string(), "mime_type": "audio/wav"},
)
else:
yield ChatMessage(role="assistant", content=str(step_log))
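A hypothetical Gradio app wired to the new `stream_to_gradio` helper; it assumes `gradio` is installed with message-style `Chatbot` support, and the agent, engine, and toolbox choices are placeholders:

```python
import gradio as gr

from transformers.agents import HfEngine, ReactCodeAgent, stream_to_gradio

agent = ReactCodeAgent(tools=[], llm_engine=HfEngine())

def interact(prompt, messages):
    # Echo the user turn, then stream the agent's intermediate and final messages.
    messages.append(gr.ChatMessage(role="user", content=prompt))
    yield messages
    for message in stream_to_gradio(agent, task=prompt):
        messages.append(message)
        yield messages

with gr.Blocks() as demo:
    chatbot = gr.Chatbot(type="messages", label="Agent")
    prompt_box = gr.Textbox(label="Your request")
    prompt_box.submit(interact, [prompt_box, chatbot], [chatbot])

demo.launch()
```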

View File

@ -471,6 +471,299 @@ After writing the final step of the plan, write the '\n<end_plan>' tag and stop
Now write your new plan below."""
SYSTEM_PROMPT_PLAN_STRUCTURED = """Output a step-by-step plan to solve the task using the given tools.
This plan should involve individual tasks based on the available tools that, if executed correctly, will yield the correct answer. Each step should be structured as follows:
Step #n: {
"description": <description of what the step does and its output>
"tool": <tool to use>,
"params": {
<parameters to pass to the tool as a valid dict>
}
"output_var": <output variable name>
}
Each step must be necessary to reach the final answer. Steps should reuse outputs produced by earlier steps. The last step must be the final answer.
Below are some examples:
Example 1:
------
Inputs:
---
Task:
How many encoder blocks were in the first attention-only ML architecture published?
[FACTS LIST]:
### 1. Facts given in the task
- The paper first introduced an attention-only ML architecture.
- The specific information required is the page number where the number of encoder blocks is stated.
- No local files are provided for access.
### 2. Facts to look up
- The title and authors of the paper that first introduced an attention-only ML architecture.
- Source: Online search (e.g., Google Scholar, arXiv, or other academic databases)
- The full text of the identified paper.
- Source: Online academic repositories (e.g., arXiv, journal websites)
- The specific page number in the paper where the number of encoder blocks is mentioned.
- Source: The content of the identified paper
### 3. Facts to derive
- By identifying the correct paper and locating the specific page, we will derive the page number where the number of encoder blocks is stated.
- Logical steps: Identify the correct paper, access its content, search for the term "encoder blocks," and note the page number where this information is found.
```
[STEP 1 TOOL CALL]: {'tool_name': 'code interpreter', 'tool_arguments': '# Step 1: Identify the title and authors of the paper that first introduced an attention-only ML architecture.\nanswer = ask_search_agent(query="Can you find the title and authors of the paper that first introduced an attention-only machine learning architecture? Please provide the full citation.")\nprint(answer)'}
[OUTPUT OF STEP 1] Observation: **Title**: Attention Is All You Need
**Authors**: Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin
[STEP 2 TOOL CALL]: {'tool_name': 'code interpreter', 'tool_arguments': '# Step 1: Find the full text of the identified paper on arXiv\\npaper_url = "https://arxiv.org/pdf/1706.03762.pdf"\\nprint(paper_url)'}
[OUTPUT OF STEP 2] Observation: https://arxiv.org/pdf/1706.03762.pdf
---
Output plan:
---
Step #1: {
"description": "Open the PDF of the paper from the provided URL and search within the text of the paper for the mention of "encoder blocks"",
"tool": "inspect_file_as_text",
"params": {
"file_path": "https://arxiv.org/pdf/1706.03762.pdf",
"question": "On which page is the number of encoder blocks mentioned?"
},
"output_var": "page_number"
}
Step #2: {
"description": "Provide the final answer",
"tool": "final_answer",
"params": {
"answer": "{page_number}"
},
"output_var": ""
}
------
Example 2:
------
Inputs:
---
Task:
How many golf balls fit into a Boeing-747?
[FACTS LIST]:
### 1. Facts given in the task
- The task requires calculating the number of golf balls that fit into a Boeing-747
### 2. Facts to look up
- The volume of a golf ball
- The volume of a Boeing-747
### 3. Facts to derive
- Once the volumes are known the final answer can be calculated
---
Output plan:
---
Step #1: {
"description": "Find the volume of a Boeing-747",
"tool": "web_search",
"params": {
"query": "What is the internal volume of a Boeing-747 in cubic meters?"
},
"output_var": "boeing_volume"
}
Step #2: {
"description": "Find the volume of a standard golf ball",
"tool": "ask_search_agent",
"params": {
"query": "What is the volume of a standard golf ball in cubic centimeters?"
},
"output_var": "golf_ball_volume"
}
Step #3: {
"description": "Convert the volume of a golf ball from cubic centimeters to cubic meters. Calculate the number of golf balls that fit into the Boeing-747 by dividing the internal volume of the Boeing-747 by the volume of a golf ball.",
"tool": "python_code",
"params": {
"code": "golf_ball_volume_m3 = golf_ball_volume / 1e6\nnumber_of_golf_balls = boeing_volume / golf_ball_volume_m3"
},
"output_var": "number_of_golf_balls"
}
Step #4: {
"description": "Provide the final answer",
"tool": "final_answer",
"params": {
"answer": "{number_of_golf_balls}"
},
"output_var": ""
}
------
The above examples use tools that might not exist for you.
Your goal is to create a plan to solve the task."""
USER_PROMPT_PLAN_STRUCTURED = """
Here are your inputs:
Task:
```
{task}
```
Your plan can leverage any of these tools:
{tool_descriptions}
These tools are Python functions which you can call with code. You also have access to a Python interpreter so you can run Python code.
List of facts that you know:
```
{answer_facts}
```
Now for the given task, create a plan taking into account the list of facts.
After writing the final step of the plan, write the '\n<end_plan>' tag and stop there. Output the plan only and nothing else."""
SYSTEM_PROMPT_PLAN_UPDATE_STRUCTURED = """Output a step-by-step plan to solve the task using the given tools.
This plan should involve individual tasks based on the available tools that, if executed correctly, will yield the correct answer. Each step should be structured as follows:
Step #n: {{
"description": <description of what the step does and its output>
"tool": <tool to use>,
"params": {{
<parameters to pass to the tool as a valid dict>
}}
"output_var": <output variable name>
}}
Each step must be necessary to reach the final answer. Steps should reuse outputs produced by earlier steps. The last step must be the final answer.
Below are some examples:
Example 1:
------
Inputs:
---
Task:
How many encoder blocks were in the first attention-only ML architecture published?
[FACTS LIST]:
### 1. Facts given in the task
- The paper first introduced an attention-only ML architecture.
- The specific information required is the page number where the number of encoder blocks is stated.
- No local files are provided for access.
### 2. Facts to look up
- The title and authors of the paper that first introduced an attention-only ML architecture.
- Source: Online search (e.g., Google Scholar, arXiv, or other academic databases)
- The full text of the identified paper.
- Source: Online academic repositories (e.g., arXiv, journal websites)
- The specific page number in the paper where the number of encoder blocks is mentioned.
- Source: The content of the identified paper
### 3. Facts to derive
- By identifying the correct paper and locating the specific page, we will derive the page number where the number of encoder blocks is stated.
- Logical steps: Identify the correct paper, access its content, search for the term "encoder blocks," and note the page number where this information is found.
```
[STEP 1 TOOL CALL]: {{'tool_name': 'code interpreter', 'tool_arguments': '# Step 1: Identify the title and authors of the paper that first introduced an attention-only ML architecture.\nanswer = ask_search_agent(query="Can you find the title and authors of the paper that first introduced an attention-only machine learning architecture? Please provide the full citation.")\nprint(answer)'}}
[OUTPUT OF STEP 1] Observation: **Title**: Attention Is All You Need
**Authors**: Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin
[STEP 2 TOOL CALL]: {{'tool_name': 'code interpreter', 'tool_arguments': '# Step 1: Find the full text of the identified paper on arXiv\\npaper_url = "https://arxiv.org/pdf/1706.03762.pdf"\\nprint(paper_url)'}}
[OUTPUT OF STEP 2] Observation: https://arxiv.org/pdf/1706.03762.pdf
---
Output plan:
---
Step #1: {{
"description": "Open the PDF of the paper from the provided URL and search within the text of the paper for the mention of "encoder blocks"",
"tool": "inspect_file_as_text",
"params": {{
"file_path": "https://arxiv.org/pdf/1706.03762.pdf",
"question": "On which page is the number of encoder blocks mentioned?"
}},
"output_var": "page_number"
}}
Step #2: {{
"description": "Provide the final answer",
"tool": "final_answer",
"params": {{
"answer": "{{page_number}}"
}},
"output_var": ""
}}
------
Example 2:
------
Inputs:
---
Task:
How many golf balls fit into a Boeing-747?
[FACTS LIST]:
### 1. Facts given in the task
- The task requires calculating the number of golf balls that fit into a Boeing-747
### 2. Facts to look up
- The volume of a golf ball
- The volume of a Boeing-747
### 3. Facts to derive
- Once the volumes are known the final answer can be calculated
---
Output plan:
---
Step #1: {{
"description": "Find the volume of a Boeing-747",
"tool": "web_search",
"params": {{
"query": "What is the internal volume of a Boeing-747 in cubic meters?"
}},
"output_var": "boeing_volume"
}}
Step #2: {{
"description": "Find the volume of a standard golf ball",
"tool": "ask_search_agent",
"params": {{
"query": "What is the volume of a standard golf ball in cubic centimeters?"
}},
"output_var": "golf_ball_volume"
}}
Step #3: {{
"description": "Convert the volume of a golf ball from cubic centimeters to cubic meters. Calculate the number of golf balls that fit into the Boeing-747 by dividing the internal volume of the Boeing-747 by the volume of a golf ball.",
"tool": "python_code",
"params": {{
"code": "golf_ball_volume_m3 = golf_ball_volume / 1e6\nnumber_of_golf_balls = boeing_volume / golf_ball_volume_m3"
}},
"output_var": "number_of_golf_balls"
}}
Step #4: {{
"description": "Provide the final answer",
"tool": "final_answer",
"params": {{
"answer": "{{number_of_golf_balls}}"
}},
"output_var": ""
}}
------
The above examples use tools that might not exist for you.
Find below the record of what has been tried so far to solve it. Your goal is to create an updated plan to solve the task."""
USER_PROMPT_PLAN_UPDATE_STRUCTURED = """
Here are your inputs:
Task:
```
{task}
```
Your plan can leverage any of these tools:
{tool_descriptions}
These tools are Python functions which you can call with code. You also have access to a Python interpreter so you can run Python code.
List of facts that you know:
```
{facts_update}
```
Now for the given task, create a plan taking into account the above inputs and list of facts.
Beware that you have {remaining_steps} steps remaining.
After writing the final step of the plan, write the '\n<end_plan>' tag and stop there. Output the plan only and nothing else."""
PLAN_UPDATE_FINAL_PLAN_REDACTION = """I still need to solve the task I was given:
```
{task}
@ -480,3 +773,15 @@ Here is my new/updated plan of action to solve the task:
```
{plan_update}
```"""
SUPPORTED_PLAN_TYPES = ["default", "structured"]
PROMPTS_FOR_INITIAL_PLAN = {
"default": {"system": SYSTEM_PROMPT_PLAN, "user": USER_PROMPT_PLAN},
"structured": {"system": SYSTEM_PROMPT_PLAN_STRUCTURED, "user": USER_PROMPT_PLAN_STRUCTURED},
}
PROMPTS_FOR_PLAN_UPDATE = {
"default": {"system": SYSTEM_PROMPT_PLAN_UPDATE, "user": USER_PROMPT_PLAN_UPDATE},
"structured": {"system": SYSTEM_PROMPT_PLAN_UPDATE_STRUCTURED, "user": USER_PROMPT_PLAN_UPDATE_STRUCTURED},
}

View File

@ -663,7 +663,7 @@ def spectrogram_batch(
Specifies log scaling strategy; options are None, "log", "log10", "dB".
reference (`float`, *optional*, defaults to 1.0):
Reference value for dB conversion in log_mel.
min_value (`float`, °optional*, defaults to 1e-10):
min_value (`float`, *optional*, defaults to 1e-10):
Minimum floor value for log scale conversions.
db_range (`float`, *optional*):
Dynamic range for dB scale spectrograms.

View File

@ -9,7 +9,7 @@ import torch
from packaging import version
from .configuration_utils import PretrainedConfig
from .utils import is_hqq_available, is_quanto_available, logging
from .utils import is_hqq_available, is_quanto_available, is_torchdynamo_compiling, logging
if is_quanto_available():
@ -23,12 +23,14 @@ if is_hqq_available():
logger = logging.get_logger(__name__)
@dataclass
class Cache:
class Cache(torch.nn.Module):
"""
Base, abstract class for all caches. The actual data structure is specific to each subclass.
"""
def __init__(self):
super().__init__()
def update(
self,
key_states: torch.Tensor,
@ -110,6 +112,7 @@ class CacheConfig:
Args:
config_dict (Dict[str, Any]): Dictionary containing configuration parameters.
**kwargs: Additional keyword arguments to override dictionary values.
Returns:
CacheConfig: Instance of CacheConfig constructed from the dictionary.
"""
@ -299,6 +302,7 @@ class DynamicCache(Cache):
"""
def __init__(self) -> None:
super().__init__()
self.key_cache: List[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = []
self._seen_tokens = 0 # Used in `generate` to keep tally of how many tokens the cache has seen
@ -398,7 +402,6 @@ class DynamicCache(Cache):
def crop(self, max_length: int):
"""Crop the past key values up to a new `max_length` in terms of tokens. `max_length` can also be
negative to remove `max_length` tokens. This is used in assisted decoding and contrastive search."""
# In case it is negative
if max_length < 0:
max_length = self.get_seq_length() - abs(max_length)
@ -447,6 +450,118 @@ class DynamicCache(Cache):
self.value_cache[layer_idx] = self.value_cache[layer_idx][indices, ...]
class OffloadedCache(DynamicCache):
"""
A drop-in replacement for DynamicCache that conserves GPU memory at the expense of more CPU memory.
Useful for generating from models with very long context.
In addition to the default CUDA stream, where all forward() computations happen,
this class uses another stream, the prefetch stream, which it creates itself.
Since scheduling of operations on separate streams happens independently, this class uses
the prefetch stream to asynchronously prefetch the KV cache of layer k+1 when layer k is executing.
The movement of the layer k-1 cache to the CPU is handled by the default stream as a simple way to
ensure the eviction is scheduled after all computations on that cache are finished.
"""
def __init__(self) -> None:
if not torch.cuda.is_available():
raise RuntimeError("OffloadedCache can only be used with a GPU")
super().__init__()
self.original_device = []
self.prefetch_stream = torch.cuda.Stream()
self.beam_idx = None # used to delay beam search operations
def prefetch_layer(self, layer_idx: int):
"Starts prefetching the next layer cache"
if layer_idx < len(self):
with torch.cuda.stream(self.prefetch_stream):
# Prefetch next layer tensors to GPU
device = self.original_device[layer_idx]
self.key_cache[layer_idx] = self.key_cache[layer_idx].to(device, non_blocking=True)
self.value_cache[layer_idx] = self.value_cache[layer_idx].to(device, non_blocking=True)
def evict_previous_layer(self, layer_idx: int):
"Moves the previous layer cache to the CPU"
if len(self) > 2:
# We do it on the default stream so it occurs after all earlier computations on these tensors are done
prev_layer_idx = (layer_idx - 1) % len(self)
self.key_cache[prev_layer_idx] = self.key_cache[prev_layer_idx].to("cpu", non_blocking=True)
self.value_cache[prev_layer_idx] = self.value_cache[prev_layer_idx].to("cpu", non_blocking=True)
def __getitem__(self, layer_idx: int) -> List[Tuple[torch.Tensor]]:
"Gets the cache for this layer to the device. Prefetches the next and evicts the previous layer."
if layer_idx < len(self):
# Evict the previous layer if necessary
torch.cuda.current_stream().synchronize()
self.evict_previous_layer(layer_idx)
# Load current layer cache to its original device if not already there
original_device = self.original_device[layer_idx]
self.prefetch_stream.synchronize()
key_tensor = self.key_cache[layer_idx]
value_tensor = self.value_cache[layer_idx]
# Now deal with beam search ops which were delayed
if self.beam_idx is not None:
self.beam_idx = self.beam_idx.to(original_device)
key_tensor = key_tensor.index_select(0, self.beam_idx)
value_tensor = value_tensor.index_select(0, self.beam_idx)
# Prefetch the next layer
self.prefetch_layer((layer_idx + 1) % len(self))
return (key_tensor, value_tensor)
else:
raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}")
def reorder_cache(self, beam_idx: torch.LongTensor):
"""Saves the beam indices and reorders the cache when the tensor is back to its device."""
# We delay this operation until the tensors are back to their original
# device because performing torch.index_select on the CPU is very slow
del self.beam_idx
self.beam_idx = beam_idx.clone()
def update(
self,
key_states: torch.Tensor,
value_states: torch.Tensor,
layer_idx: int,
cache_kwargs: Optional[Dict[str, Any]] = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Updates the cache with the new `key_states` and `value_states` for the layer `layer_idx`.
Parameters:
key_states (`torch.Tensor`):
The new key states to cache.
value_states (`torch.Tensor`):
The new value states to cache.
layer_idx (`int`):
The index of the layer to cache the states for.
cache_kwargs (`Dict[str, Any]`, `optional`):
Additional arguments for the cache subclass. No additional arguments are used in `OffloadedCache`.
Return:
A tuple containing the updated key and value states.
"""
# Update the number of seen tokens
if layer_idx == 0:
self._seen_tokens += key_states.shape[-2]
# Update the cache
if len(self.key_cache) <= layer_idx:
self.key_cache.append(key_states)
self.value_cache.append(value_states)
self.original_device.append(key_states.device)
self.evict_previous_layer(layer_idx)
else:
key_tensor, value_tensor = self[layer_idx]
self.key_cache[layer_idx] = torch.cat([key_tensor, key_states], dim=-2)
self.value_cache[layer_idx] = torch.cat([value_tensor, value_states], dim=-2)
return self.key_cache[layer_idx], self.value_cache[layer_idx]
# According to https://docs.python.org/3/library/exceptions.html#NotImplementedError
# if a method is not supposed to be supported in a subclass we should set it to None
from_legacy_cache = None
to_legacy_cache = None
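A minimal generation sketch with the offloaded cache; it assumes a CUDA device is available and that the new class is exposed to `generate()` under `cache_implementation="offloaded"` (the gated model below is only an example of a long-context use case):

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-2-7b-hf"  # illustrative
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")

inputs = tokenizer("Summarize the following very long document: ...", return_tensors="pt").to("cuda")

# While layer k runs, layer k+1 is prefetched back to the GPU and layer k-1 is
# evicted to CPU, trading GPU memory for host memory and transfer time.
outputs = model.generate(**inputs, max_new_tokens=64, cache_implementation="offloaded")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```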
class QuantizedCache(DynamicCache):
"""
A quantizer cache similar to what is described in the [KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache paper](https://arxiv.org/abs/2402.02750).
@ -462,6 +577,7 @@ class QuantizedCache(DynamicCache):
"""
def __init__(self, cache_config: QuantizedCacheConfig) -> None:
super().__init__()
self._quantized_key_cache: List[torch.Tensor] = []
self._quantized_value_cache: List[torch.Tensor] = []
@ -539,7 +655,7 @@ class QuantoQuantizedCache(QuantizedCache):
Quantized Cache class that uses `quanto` as a backend to perform quantization. Current implementation supports `int2` and `int4` dtypes only.
Parameters:
cache_config (`QuantizedCacheConfig`,):
cache_config (`QuantizedCacheConfig`):
A configuration containing all the arguments to be used by the quantizer, including axis, qtype and group size.
"""
@ -580,7 +696,7 @@ class HQQQuantizedCache(QuantizedCache):
Quantized Cache class that uses `HQQ` as a backend to perform quantization. Current implementation supports `int2`, `int4`, `int8` dtypes.
Parameters:
cache_config (`QuantizedCacheConfig`,):
cache_config (`QuantizedCacheConfig`):
A configuration containing all the arguments to be used by the quantizer, including axis, qtype and group size.
"""
@ -635,6 +751,7 @@ class SinkCache(Cache):
"""
def __init__(self, window_length: int, num_sink_tokens: int) -> None:
super().__init__()
self.key_cache: List[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = []
self.window_length = window_length
@ -787,10 +904,10 @@ class SinkCache(Cache):
class StaticCache(Cache):
"""
Static Cache class to be used with `torch.compile(model)`.
Static Cache class to be used with `torch.compile(model)` and `torch.export()`.
Parameters:
config (`PretrainedConfig):
config (`PretrainedConfig`):
The configuration file defining the shape-related attributes required to initialize the static cache.
max_batch_size (`int`):
The maximum batch size with which the model will be used.
@@ -818,16 +935,22 @@ class StaticCache(Cache):
self.key_cache: List[torch.Tensor] = []
self.value_cache: List[torch.Tensor] = []
# Note: There will be significant perf decrease if switching to use 5D tensors instead.
cache_shape = (max_batch_size, self.num_key_value_heads, self.max_cache_len, self.head_dim)
for _ in range(config.num_hidden_layers):
for idx in range(config.num_hidden_layers):
# Note: `torch.export()` requires mutations to be registered as buffers.
self.register_buffer(f"key_cache_{idx}", torch.zeros(cache_shape, dtype=dtype, device=device))
self.register_buffer(f"value_cache_{idx}", torch.zeros(cache_shape, dtype=dtype, device=device))
key_cache = getattr(self, f"key_cache_{idx}")
value_cache = getattr(self, f"value_cache_{idx}")
# Note: `mark_static_address` is used to tag the cache as a fixed data pointer, preventing cuda graph
# breaks when updating the cache.
new_layer_key_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device)
new_layer_value_cache = torch.zeros(cache_shape, dtype=self.dtype, device=device)
torch._dynamo.mark_static_address(new_layer_key_cache)
torch._dynamo.mark_static_address(new_layer_value_cache)
self.key_cache.append(new_layer_key_cache)
self.value_cache.append(new_layer_value_cache)
# breaks when updating the cache. It can't be used if the cache code is being compiled (but in that case
# it is not needed anyway)
if not is_torchdynamo_compiling():
torch._dynamo.mark_static_address(key_cache)
torch._dynamo.mark_static_address(value_cache)
self.key_cache.append(key_cache)
self.value_cache.append(value_cache)
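# Editor's note: a minimal, hedged sketch of the compile-friendly path the static cache above targets;
# "static" is the documented `cache_implementation` value, while the checkpoint name and compile mode
# are illustrative assumptions.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Llama-2-7b-hf"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="cuda")

# Pre-allocated key/value buffers keep tensor shapes fixed, so the decode step can be captured by
# torch.compile / cuda graphs without recompilation.
model.generation_config.cache_implementation = "static"
model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)

inputs = tokenizer("Static caches keep shapes fixed for torch.compile.", return_tensors="pt").to(model.device)
out = model.generate(**inputs, max_new_tokens=32)
print(tokenizer.decode(out[0], skip_special_tokens=True))
# Models whose config defines `sliding_window` (e.g. Mistral-style) can use
# cache_implementation="sliding_window" the same way; that is what SlidingWindowCache below targets.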
def update(
self,
@@ -914,7 +1037,7 @@ class SlidingWindowCache(StaticCache):
We overwrite the cache using these, then we always write at cache_position (clamped to `sliding_window`)
Parameters:
config (`PretrainedConfig):
config (`PretrainedConfig`):
The configuration file defining the shape-related attributes required to initialize the static cache.
max_batch_size (`int`):
The maximum batch size with which the model will be used.
@@ -927,6 +1050,7 @@ class SlidingWindowCache(StaticCache):
"""
def __init__(self, config: PretrainedConfig, max_batch_size: int, max_cache_len: int, device, dtype=None) -> None:
super().__init__()
if not hasattr(config, "sliding_window") or config.sliding_window is None:
raise ValueError(
"Setting `cache_implementation` to 'sliding_window' requires the model config supporting "
@@ -1004,6 +1128,7 @@ class EncoderDecoderCache(Cache):
"""
def __init__(self, self_attention_cache: Cache, cross_attention_cache: Cache):
super().__init__()
self.self_attention_cache = self_attention_cache
self.cross_attention_cache = cross_attention_cache
@@ -1021,7 +1146,7 @@ class EncoderDecoderCache(Cache):
self.self_attention_cache.key_cache[layer_idx],
self.self_attention_cache.value_cache[layer_idx],
self.cross_attention_cache.key_cache[layer_idx],
self.cross_attention_cache.key_cache[layer_idx],
self.cross_attention_cache.value_cache[layer_idx],
)
else:
raise KeyError(f"Cache only has {len(self)} layers, attempted to access layer with index {layer_idx}")
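# Editor's note: a small sketch of why the indexing fix above matters: each layer entry is the 4-tuple
# (self-attn key, self-attn value, cross-attn key, cross-attn value). The tensor shapes below are
# arbitrary illustrative values.
import torch
from transformers.cache_utils import DynamicCache, EncoderDecoderCache

self_attention_cache = DynamicCache()
cross_attention_cache = DynamicCache()
# Pretend one decoder layer already wrote its states (batch=1, heads=2, seq_len, head_dim=4).
self_attention_cache.update(torch.zeros(1, 2, 1, 4), torch.zeros(1, 2, 1, 4), layer_idx=0)
cross_attention_cache.update(torch.ones(1, 2, 3, 4), torch.ones(1, 2, 3, 4), layer_idx=0)

cache = EncoderDecoderCache(self_attention_cache, cross_attention_cache)
self_k, self_v, cross_k, cross_v = cache[0]
print(self_k.shape, cross_v.shape)  # torch.Size([1, 2, 1, 4]) torch.Size([1, 2, 3, 4])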
@@ -1147,6 +1272,7 @@ class EncoderDecoderCache(Cache):
class HybridCache(Cache):
def __init__(self, config: PretrainedConfig, max_batch_size, max_cache_len, device="cpu", dtype=None) -> None:
super().__init__()
if not hasattr(config, "sliding_window") or config.sliding_window is None:
raise ValueError(
"Setting `cache_implementation` to 'sliding_window' requires the model config supporting "


@@ -53,6 +53,25 @@ def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str:
return prepend_scheme
def generate_merges(vocab, vocab_scores):
reverse = vocab_scores is not None
vocab_scores = dict(vocab_scores) if reverse else vocab
merges = []
for merge, piece_score in vocab_scores.items():
local = []
for index in range(1, len(merge)):
piece_l, piece_r = merge[:index], merge[index:]
if piece_l in vocab and piece_r in vocab:
local.append((piece_l, piece_r, piece_score))
local = sorted(local, key=lambda x: (vocab[x[0]], vocab[x[1]]))
merges.extend(local)
merges = sorted(merges, key=lambda val: (val[2], len(val[0]), len(val[1])), reverse=reverse)
merges = [(val[0], val[1]) for val in merges]
return merges
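# Editor's note: a toy illustration of `generate_merges` above, with a made-up vocab and scores.
# Every piece that splits into two in-vocab pieces yields a candidate merge, and candidates are
# ordered by score so higher-priority merges come first.
toy_vocab = {"a": 0, "b": 1, "c": 2, "ab": 3, "abc": 4}
toy_scores = [("ab", -1.0), ("abc", -2.0)]  # (piece, log-prob) pairs, as produced by `vocab(proto)`
print(generate_merges(toy_vocab, toy_scores))
# -> [('a', 'b'), ('ab', 'c')]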
class SentencePieceExtractor:
"""
Extractor implementation for SentencePiece trained models. https://github.com/google/sentencepiece
@@ -73,24 +92,8 @@ class SentencePieceExtractor:
sp = self.sp
vocab = {sp.id_to_piece(index): index for index in range(sp.GetPieceSize())}
if vocab_scores is not None:
vocab_scores, reverse = dict(vocab_scores), True
else:
vocab_scores, reverse = vocab, False
merges = generate_merges(vocab, vocab_scores)
# Merges
merges = []
for merge, piece_score in vocab_scores.items():
local = []
for index in range(1, len(merge)):
piece_l, piece_r = merge[:index], merge[index:]
if piece_l in vocab and piece_r in vocab:
local.append((piece_l, piece_r, piece_score))
local = sorted(local, key=lambda x: (vocab[x[0]], vocab[x[1]]))
merges.extend(local)
merges = sorted(merges, key=lambda val: val[2], reverse=reverse)
merges = [(val[0], val[1]) for val in merges]
return vocab, merges
@@ -107,24 +110,7 @@ class GemmaSentencePieceExtractor(SentencePieceExtractor):
# "<0x09>" is the bytefallback for `\t`
vocab["\t"] = vocab.get("<0x09>")
if vocab_scores is not None:
vocab_scores, reverse = dict(vocab_scores), True
else:
vocab_scores, reverse = vocab, False
# Merges
merges = []
for merge, piece_score in vocab_scores.items():
local = []
for index in range(1, len(merge)):
piece_l, piece_r = merge[:index], merge[index:]
if piece_l in vocab and piece_r in vocab:
local.append((piece_l, piece_r, piece_score))
local = sorted(local, key=lambda x: (vocab[x[0]], vocab[x[1]]))
merges.extend(local)
merges = sorted(merges, key=lambda val: val[2], reverse=reverse)
merges = [(val[0], val[1]) for val in merges]
merges = generate_merges(vocab, vocab_scores)
return vocab, merges
@@ -544,6 +530,10 @@ class DebertaConverter(Converter):
class SpmConverter(Converter):
handle_byte_fallback = False
SpmExtractor = SentencePieceExtractor
special_tokens = {}
def __init__(self, *args):
requires_backends(self, "protobuf")
@@ -557,14 +547,13 @@ class SpmConverter(Converter):
m.ParseFromString(f.read())
self.proto = m
if self.proto.trainer_spec.byte_fallback:
if not getattr(self, "handle_byte_fallback", None):
warnings.warn(
"The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
" which is not implemented in the fast tokenizers. In practice this means that the fast version of the"
" tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these "
"unknown tokens into a sequence of byte tokens matching the original piece of text."
)
if self.proto.trainer_spec.byte_fallback and not self.handle_byte_fallback:
warnings.warn(
"The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
" which is not implemented in the fast tokenizers. In practice this means that the fast version of the"
" tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these "
"unknown tokens into a sequence of byte tokens matching the original piece of text."
)
def vocab(self, proto):
return [(piece.piece, piece.score) for piece in proto.pieces]
@@ -575,12 +564,18 @@ class SpmConverter(Converter):
def tokenizer(self, proto):
model_type = proto.trainer_spec.model_type
vocab_scores = self.vocab(proto)
unk_id = self.unk_id(proto)
if model_type == 1:
tokenizer = Tokenizer(Unigram(vocab_scores, unk_id))
tokenizer = Tokenizer(
Unigram(
vocab_scores,
unk_id=self.unk_id(proto),
byte_fallback=self.handle_byte_fallback,
)
)
elif model_type == 2:
_, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract()
_, merges = self.SpmExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
bpe_vocab = {word: i for i, (word, score) in enumerate(vocab_scores)}
tokenizer = Tokenizer(
BPE(
@@ -588,13 +583,53 @@ class SpmConverter(Converter):
merges,
unk_token=proto.trainer_spec.unk_piece,
fuse_unk=True,
byte_fallback=self.handle_byte_fallback,
dropout=None,
)
)
else:
raise Exception(
"You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
)
# control tokens are special
# user defined symbols are not
# both user and control tokens are AddedTokens
# Add user defined symbols (type == 4) from sentencepiece (https://github.com/google/sentencepiece/blob/6225e08edb2577757163b3f5dbba4c0b670ef445/src/sentencepiece_model.proto#L299C29-L299C33)
spm_added_tokens = [
(id, p.piece, p.type == 3 or p.piece in self.special_tokens)
for id, p in enumerate(proto.pieces)
if p.type in [3, 4]
]
tokens_to_add = [
AddedToken(token, normalized=False, special=special)
for id, token, special in sorted(spm_added_tokens, key=lambda x: x[0])
]
if len(tokens_to_add) > 0:
# super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
# Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
# individual tokens would repeatedly rebuild a trie, which can be slow.
is_last_special = None
tokens = []
for token in tokens_to_add:
is_special = token.special
if is_last_special is None or is_last_special == is_special:
tokens.append(token)
else:
if is_last_special:
tokenizer.add_special_tokens(tokens)
else:
tokenizer.add_tokens(tokens)
tokens = [token]
is_last_special = is_special
if tokens:
if is_last_special:
tokenizer.add_special_tokens(tokens)
else:
tokenizer.add_tokens(tokens)
return tokenizer
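# Editor's note: a hedged sketch of how these converters are normally reached; `convert_slow_tokenizer`
# picks the matching `SpmConverter` subclass (e.g. LlamaConverter) and returns the `tokenizers.Tokenizer`
# built by the method above. The checkpoint path is a placeholder.
from transformers import LlamaTokenizer
from transformers.convert_slow_tokenizer import convert_slow_tokenizer

slow_tokenizer = LlamaTokenizer.from_pretrained("path/to/a/sentencepiece/checkpoint")  # placeholder path
fast_backend = convert_slow_tokenizer(slow_tokenizer)
print(fast_backend.encode("hello world").tokens)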
def normalizer(self, proto):
@@ -622,40 +657,6 @@ class SpmConverter(Converter):
def converted(self) -> Tokenizer:
tokenizer = self.tokenizer(self.proto)
# control tokens are special
# user defined symbols are not
# both user and control tokens are AddedTokens
# Add user defined symbols (type == 4) from sentnecepiece (https://github.com/google/sentencepiece/blob/6225e08edb2577757163b3f5dbba4c0b670ef445/src/sentencepiece_model.proto#L299C29-L299C33)
tokens_to_add = {
id: AddedToken(token, normalized=False, special=special)
for id, token, special in [
(id, p.piece, p.type == 3) for id, p in enumerate(self.proto.pieces) if p.type in [3, 4]
]
}
tokens_to_add = [k for _, k in sorted(tokens_to_add.items(), key=lambda x: x[0])]
if len(tokens_to_add) > 0:
# super hack: if a token.special is set, tokenizer ignores it for now so FIXME @ArthurZ
# Accumulate added tokens into batches of special/non-special tokens, because calling add_tokens() for
# individual tokens would repeatedly rebuild a trie, which can be slow.
is_last_special = None
tokens = []
for token in tokens_to_add:
is_special = token.special
if is_last_special is None or is_last_special == is_special:
tokens.append(token)
else:
if is_last_special:
tokenizer.add_special_tokens(tokens)
else:
tokenizer.add_tokens(tokens)
tokens = [token]
is_last_special = is_special
if tokens:
if is_last_special:
tokenizer.add_special_tokens(tokens)
else:
tokenizer.add_tokens(tokens)
# Tokenizer assemble
normalizer = self.normalizer(self.proto)
if normalizer is not None:
@@ -1283,6 +1284,9 @@ class XGLMConverter(SpmConverter):
class GemmaConvert(SpmConverter):
handle_byte_fallback = True
SpmExtractor = GemmaSentencePieceExtractor
# start and end of turn tokens must be marked as special
special_tokens = {"<start_of_turn>", "<end_of_turn>"}
""""
split_by_unicode_script: true
@@ -1327,45 +1331,6 @@ class GemmaConvert(SpmConverter):
]
)
def tokenizer(self, proto):
model_type = proto.trainer_spec.model_type
vocab_scores = self.vocab(proto)
if model_type == 1:
import tokenizers
if version.parse(tokenizers.__version__) < version.parse("0.14.0"):
tokenizer = Tokenizer(Unigram(vocab_scores, 0))
else:
tokenizer = Tokenizer(Unigram(vocab_scores, 0, byte_fallback=True))
elif model_type == 2:
_, merges = GemmaSentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
tokenizer = Tokenizer(
BPE(
bpe_vocab,
merges,
unk_token=proto.trainer_spec.unk_piece,
fuse_unk=True,
byte_fallback=True,
dropout=None,
)
)
tokenizer.add_special_tokens(
[
AddedToken("<pad>", normalized=False, special=True),
AddedToken("<eos>", normalized=False, special=True),
AddedToken("<bos>", normalized=False, special=True),
AddedToken("<unk>", normalized=False, special=True),
]
)
else:
raise Exception(
"You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
)
return tokenizer
class LlamaConverter(SpmConverter):
handle_byte_fallback = True
@@ -1393,37 +1358,6 @@ class LlamaConverter(SpmConverter):
sequence += [decoders.Strip(content=" ", left=1)]
return decoders.Sequence(sequence)
def tokenizer(self, proto):
model_type = proto.trainer_spec.model_type
vocab_scores = self.vocab(proto)
if model_type == 1:
import tokenizers
if version.parse(tokenizers.__version__) < version.parse("0.14.0"):
tokenizer = Tokenizer(Unigram(vocab_scores, 0))
else:
tokenizer = Tokenizer(Unigram(vocab_scores, 0, byte_fallback=True))
elif model_type == 2:
_, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
tokenizer = Tokenizer(
BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
)
tokenizer.add_special_tokens(
[
AddedToken(self.original_tokenizer.convert_ids_to_tokens(0), normalized=False, special=True),
AddedToken(self.original_tokenizer.convert_ids_to_tokens(1), normalized=False, special=True),
AddedToken(self.original_tokenizer.convert_ids_to_tokens(2), normalized=False, special=True),
]
)
else:
raise Exception(
"You're trying to run a `Unigram` model but you're file was trained with a different algorithm"
)
return tokenizer
def normalizer(self, proto):
if getattr(self.original_tokenizer, "legacy", True):
sequence = []


@@ -19,6 +19,7 @@ from .data_collator import (
DataCollatorForSOP,
DataCollatorForTokenClassification,
DataCollatorForWholeWordMask,
DataCollatorWithFlattening,
DataCollatorWithPadding,
DefaultDataCollator,
default_data_collator,


@@ -751,7 +751,7 @@ class DataCollatorForLanguageModeling(DataCollatorMixin):
inputs = tf.where(indices_replaced, mask_token_id, inputs)
# 10% of the time overall, we replace masked input tokens with a random word (half of the ~20% of masked tokens not already replaced with the mask token)
indices_random = self.tf_bernoulli(input_shape, 0.1) & masked_indices & ~indices_replaced
indices_random = self.tf_bernoulli(input_shape, 0.5) & masked_indices & ~indices_replaced
random_words = tf.random.uniform(input_shape, maxval=vocab_size, dtype=inputs.dtype)
inputs = tf.where(indices_random, random_words, inputs)
@@ -1611,3 +1611,38 @@ class DataCollatorForPermutationLanguageModeling(DataCollatorMixin):
) & masked_indices[i]
return inputs.astype(np.int64), perm_mask, target_mapping, labels.astype(np.int64)
@dataclass
class DataCollatorWithFlattening(DefaultDataCollator):
"""
Data collator used for the padding-free approach. Does the following:
- concatenates the entire mini batch into a single long sequence of shape [1, total_tokens]
- adds no padding; returns `input_ids`, `labels` and `position_ids`
"""
def __init__(self, *args, return_position_ids=True, **kwargs):
super().__init__(*args, **kwargs)
self.return_position_ids = return_position_ids
warnings.warn(
"Using `DataCollatorWithFlattening` will flatten the entire mini batch into single long sequence."
"Make sure your attention computation is able to handle it!"
)
def __call__(self, features, return_tensors=None):
if return_tensors is None:
return_tensors = self.return_tensors
is_labels_provided = "labels" in features[0]
ret = {"input_ids": [], "labels": []}
if self.return_position_ids:
ret.update({"position_ids": []})
for idx in range(0, len(features)):
ret["input_ids"] += features[idx]["input_ids"]
if is_labels_provided:
ret["labels"] += [-100] + features[idx]["labels"][1:]
else:
ret["labels"] += [-100] + features[idx]["input_ids"][1:]
if self.return_position_ids:
ret["position_ids"] += list(range(len(features[idx]["input_ids"])))
return default_data_collator([ret], return_tensors)
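# Editor's note: a usage sketch for the new collator above. The import path assumes it is re-exported
# at the package level (as the `data/__init__.py` change suggests); the expected outputs are read off
# the `__call__` logic shown here.
from transformers import DataCollatorWithFlattening

collator = DataCollatorWithFlattening()
features = [
    {"input_ids": [1, 2, 3], "labels": [1, 2, 3]},
    {"input_ids": [4, 5], "labels": [4, 5]},
]
batch = collator(features, return_tensors="pt")
# batch["input_ids"]    -> tensor([[1, 2, 3, 4, 5]])        one long row, no padding
# batch["labels"]       -> tensor([[-100, 2, 3, -100, 5]])  first label of each example masked out
# batch["position_ids"] -> tensor([[0, 1, 2, 0, 1]])        positions restart at each example boundary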


@@ -63,7 +63,7 @@ deps = {
"rhoknp": "rhoknp>=1.1.0,<1.3.1",
"rjieba": "rjieba",
"rouge-score": "rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1",
"ruff": "ruff==0.4.4",
"ruff": "ruff==0.5.1",
"sacrebleu": "sacrebleu>=1.4.12,<2.0.0",
"sacremoses": "sacremoses",
"safetensors": "safetensors>=0.4.1",

Some files were not shown because too many files have changed in this diff.