more fixes

fix bart test
skip whisper
2025-10-21 01:23:56 +08:00 · 2024-08-08 15:38:14 +02:00 · 2024-08-08 15:33:25 +02:00 · 2024-08-08 15:19:40 +02:00 · 2024-08-08 14:17:57 +02:00 · 2024-08-08 14:16:51 +02:00
482 changed files with 14546 additions and 5506 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@ -142,6 +142,7 @@ jobs:
            - run: python utils/custom_init_isort.py --check_only
            - run: python utils/sort_auto_mappings.py --check_only
            - run: python utils/check_doc_toc.py
+            - run: python utils/check_docstrings.py --check_all

    check_repository_consistency:
        working_directory: ~/transformers
@ -190,4 +191,4 @@ workflows:
            - check_circleci_user
            - check_code_quality
            - check_repository_consistency
-            - fetch_all_tests
+            - fetch_all_tests
--- a/.github/workflows/self-pr-slow-ci.yml
+++ b/.github/workflows/self-pr-slow-ci.yml
@ -4,7 +4,7 @@ on:
  pull_request:
    paths:
      - "src/transformers/models/*/modeling_*.py"
-      - "tests/models/*/test_*.py"
+      - "tests/**/test_*.py"

 concurrency:
  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
--- a/.github/workflows/trufflehog.yml
+++ b/.github/workflows/trufflehog.yml
@ -10,20 +10,9 @@ jobs:
  trufflehog:
    runs-on: ubuntu-latest
    steps:
-    - shell: bash
-      run: |
-        if [ "${{ github.event_name }}" == "push" ]; then
-          echo "depth=$(($(jq length <<< '${{ toJson(github.event.commits) }}') + 2))" >> $GITHUB_ENV
-          echo "branch=${{ github.ref_name }}" >> $GITHUB_ENV
-        fi
-        if [ "${{ github.event_name }}" == "pull_request" ]; then
-          echo "depth=$((${{ github.event.pull_request.commits }}+2))" >> $GITHUB_ENV
-          echo "branch=${{ github.event.pull_request.head.ref }}" >> $GITHUB_ENV
-        fi
-    - name: Checkout code
-      uses: actions/checkout@v4
-      with:
-        ref: ${{env.branch}}
-        fetch-depth: ${{env.depth}}
-    - name: Secret Scanning
-      uses: trufflesecurity/trufflehog@main
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+      - name: Secret Scanning
+        uses: trufflesecurity/trufflehog@main
--- a/1
+++ b/1
@ -56,6 +56,7 @@ quality:
 	python utils/custom_init_isort.py --check_only
 	python utils/sort_auto_mappings.py --check_only
 	python utils/check_doc_toc.py
+	python utils/check_docstrings.py --check_all


 # Format source code automatically and check is there are any problems left that need manual fixing
--- a/docker/consistency.dockerfile
+++ b/docker/consistency.dockerfile
@ -8,7 +8,7 @@ RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
 RUN uv pip install --no-cache-dir --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
 # tensorflow pin matching setup.py
 RUN uv pip install --no-cache-dir "tensorflow-cpu<2.16" "tf-keras<2.16"
-RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,quality,vision,testing]"
+RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[flax,quality,torch-speech,vision,testing]"
 RUN git lfs install

 RUN pip uninstall -y transformers
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@ -9,7 +9,7 @@ SHELL ["sh", "-lc"]
 # The following `ARG` are mainly used to specify the versions explicitly & directly in this docker file, and not meant
 # to be used as arguments for docker build (so far).

-ARG PYTORCH='2.3.0'
+ARG PYTORCH='2.4.0'
 # (not always a valid torch version)
 ARG INTEL_TORCH_EXT='2.3.0'
 # Example: `cu102`, `cu113`, etc.
--- a/docker/transformers-pytorch-gpu/Dockerfile
+++ b/docker/transformers-pytorch-gpu/Dockerfile
@ -11,7 +11,7 @@ ARG REF=main
 RUN git clone https://github.com/huggingface/transformers && cd transformers && git checkout $REF

 # If set to nothing, will install the latest version
-ARG PYTORCH='2.3.0'
+ARG PYTORCH='2.4.0'
 ARG TORCH_VISION=''
 ARG TORCH_AUDIO=''
 # Example: `cu102`, `cu113`, etc.
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -99,6 +99,8 @@
    sections:
    - local: generation_strategies
      title: Customize the generation strategy
+    - local: kv_cache
+      title: Best Practices for Generation with Cache
    title: Generation
  - isExpanded: false
    sections:
@ -436,6 +438,8 @@
        title: MADLAD-400
      - local: model_doc/mamba
        title: Mamba
+      - local: model_doc/mamba2
+        title: mamba2
      - local: model_doc/marian
        title: MarianMT
      - local: model_doc/markuplm
@ -466,6 +470,8 @@
        title: MT5
      - local: model_doc/mvp
        title: MVP
+      - local: model_doc/nemotron
+        title: Nemotron
      - local: model_doc/nezha
        title: NEZHA
      - local: model_doc/nllb
--- a/docs/source/en/agents.md
+++ b/docs/source/en/agents.md
@ -119,10 +119,12 @@ def llm_engine(messages, stop_sequences=["Task"]) -> str:
 ```

 You could use any `llm_engine` method as long as:
-1. it follows the [messages format](./chat_templating.md) for its input (`List[Dict[str, str]]`) and returns a `str`
-2. it stops generating outputs at the sequences passed in the argument `stop`
+1. it follows the [messages format](./chat_templating.md) (`List[Dict[str, str]]`) for its input `messages`, and it returns a `str`.
+2. it stops generating outputs at the sequences passed in the argument `stop_sequences`

-You also need a `tools` argument which accepts a list of `Tools`. You can provide an empty list for `tools`, but use the default toolbox with the optional argument `add_base_tools=True`.
+Additionally, `llm_engine` can also take a `grammar` argument. In the case where you specify a `grammar` upon agent initialization, this argument will be passed to the calls to llm_engine, with the `grammar` that you defined upon initialization, to allow [constrained generation](https://huggingface.co/docs/text-generation-inference/conceptual/guidance) in order to force properly-formatted agent outputs.
+
+You will also need a `tools` argument which accepts a list of `Tools` - it can be an empty list. You can also add the default toolbox on top of your `tools` list by defining the optional argument `add_base_tools=True`.

 Now you can create an agent, like [`CodeAgent`], and run it. For convenience, we also provide the [`HfEngine`] class that uses `huggingface_hub.InferenceClient` under the hood.

@ -509,3 +511,54 @@ agent = ReactCodeAgent(tools=[search_tool])

 agent.run("How many more blocks (also denoted as layers) in BERT base encoder than the encoder from the architecture proposed in Attention is All You Need?")
 ```
+
+## Gradio interface
+
+You can leverage `gradio.Chatbot`to display your agent's thoughts using `stream_to_gradio`, here is an example:
+
+```py
+import gradio as gr
+from transformers import (
+    load_tool,
+    ReactCodeAgent,
+    HfEngine,
+    stream_to_gradio,
+)
+
+# Import tool from Hub
+image_generation_tool = load_tool("m-ric/text-to-image")
+
+llm_engine = HfEngine("meta-llama/Meta-Llama-3-70B-Instruct")
+
+# Initialize the agent with the image generation tool
+agent = ReactCodeAgent(tools=[image_generation_tool], llm_engine=llm_engine)
+
+
+def interact_with_agent(task):
+    messages = []
+    messages.append(gr.ChatMessage(role="user", content=task))
+    yield messages
+    for msg in stream_to_gradio(agent, task):
+        messages.append(msg)
+        yield messages + [
+            gr.ChatMessage(role="assistant", content="⏳ Task not finished yet!")
+        ]
+    yield messages
+
+
+with gr.Blocks() as demo:
+    text_input = gr.Textbox(lines=1, label="Chat Message", value="Make me a picture of the Statue of Liberty.")
+    submit = gr.Button("Run illustrator agent!")
+    chatbot = gr.Chatbot(
+        label="Agent",
+        type="messages",
+        avatar_images=(
+            None,
+            "https://em-content.zobj.net/source/twitter/53/robot-face_1f916.png",
+        ),
+    )
+    submit.click(interact_with_agent, [text_input], [chatbot])
+
+if __name__ == "__main__":
+    demo.launch()
+```
--- a/docs/source/en/chat_templating.md
+++ b/docs/source/en/chat_templating.md
@ -580,7 +580,7 @@ default template for that model class is used instead. Let's take a look at the
 >>> from transformers import AutoTokenizer
 >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")

->>> tokenizer.default_chat_template
+>>> tokenizer.chat_template
 "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ '  ' }}{% endif %}{% endfor %}{{ eos_token }}"
 ```

@ -704,23 +704,6 @@ with other names, pass the name of the template you want to the `chat_template`
 We find that this can be a bit confusing for users, though - so if you're writing a template yourself, we recommend
 trying to put it all in a single template where possible!

-### What are "default" templates?
-
-Before the introduction of chat templates, chat handling was hardcoded at the model class level. For backwards 
-compatibility, we have retained this class-specific handling as default templates, also set at the class level. If a
-model does not have a chat template set, but there is a default template for its model class, the `TextGenerationPipeline`
-class and methods like `apply_chat_template` will use the class template instead. You can find out what the default
-template for your tokenizer is by checking the `tokenizer.default_chat_template` attribute.
-
-This is something we do purely for backward compatibility reasons, to avoid breaking any existing workflows. Even when
-the class template is appropriate for your model, we strongly recommend overriding the default template by
-setting the `chat_template` attribute explicitly to make it clear to users that your model has been correctly configured
-for chat.
-
-Now that actual chat templates have been adopted more widely, default templates have been deprecated and will be
-removed in a future release. We strongly recommend setting the `chat_template` attribute for any tokenizers that
-still depend on them!
-
 ### What template should I use?

 When setting the template for a model that's already been trained for chat, you should ensure that the template
--- a/docs/source/en/conversations.md
+++ b/docs/source/en/conversations.md
@ -195,7 +195,7 @@ inputs = {key: tensor.to(model.device) for key, tensor in inputs.items()}
 print("Tokenized inputs:\n", inputs)

 # 4: Generate text from the model
-outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.)
+outputs = model.generate(**inputs, max_new_tokens=512, temperature=0.1)
 print("Generated tokens:\n", outputs)

 # 5: Decode the output back to a string
--- a/docs/source/en/generation_strategies.md
+++ b/docs/source/en/generation_strategies.md
@ -174,43 +174,6 @@ An increasing sequence: one, two, three, four, five, six, seven, eight, nine, te
 ```


-## KV Cache Quantization
-
-The `generate()` method supports caching keys and values to enhance efficiency and avoid re-computations. However the key and value
-cache can occupy a large portion of memory, becoming a bottleneck for long-context generation, especially for Large Language Models.
-Quantizing the cache when using `generate()` can significantly reduce memory requirements at the cost of speed.
-
-KV Cache quantization in `transformers` is largely inspired by the paper [KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache]
-(https://arxiv.org/abs/2402.02750) and currently supports `quanto` and `HQQ` as backends. For more information on the inner workings see the paper.
-
-To enable quantization of the key-value cache, one needs to indicate `cache_implementation="quantized"` in the `generation_config`.
-Quantization related arguments should be passed to the `generation_config` either as a `dict` or an instance of a [`QuantizedCacheConfig`] class.
-One has to indicate which quantization backend to use in the [`QuantizedCacheConfig`], the default is `quanto`.
-
-<Tip warning={true}>
-
-Cache quantization can be detrimental if the context length is short and there is enough GPU VRAM available to run without cache quantization.
-
-</Tip>
-
-
-```python
->>> import torch
->>> from transformers import AutoTokenizer, AutoModelForCausalLM
-
->>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
->>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
->>> inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
-
->>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"nbits": 4, "backend": "quanto"})
->>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
-I like rock music because it's loud and energetic. It's a great way to express myself and rel
-
->>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20)
->>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
-I like rock music because it's loud and energetic. I like to listen to it when I'm feeling
-```
-
 ## Watermarking

 The `generate()` supports watermarking the generated text by randomly marking a portion of tokens as "green".
--- a/docs/source/en/index.md
+++ b/docs/source/en/index.md
@ -194,6 +194,7 @@ Flax), PyTorch, and/or TensorFlow.
 |                       [M2M100](model_doc/m2m_100)                        |       ✅        |         ❌         |      ❌      |
 |                    [MADLAD-400](model_doc/madlad-400)                    |       ✅        |         ✅         |      ✅      |
 |                         [Mamba](model_doc/mamba)                         |       ✅        |         ❌         |      ❌      |
+|                        [mamba2](model_doc/mamba2)                        |       ✅        |         ❌         |      ❌      |
 |                        [Marian](model_doc/marian)                        |       ✅        |         ✅         |      ✅      |
 |                      [MarkupLM](model_doc/markuplm)                      |       ✅        |         ❌         |      ❌      |
 |                   [Mask2Former](model_doc/mask2former)                   |       ✅        |         ❌         |      ❌      |
@ -222,6 +223,7 @@ Flax), PyTorch, and/or TensorFlow.
 |               [MusicGen Melody](model_doc/musicgen_melody)               |       ✅        |         ❌         |      ❌      |
 |                           [MVP](model_doc/mvp)                           |       ✅        |         ❌         |      ❌      |
 |                           [NAT](model_doc/nat)                           |       ✅        |         ❌         |      ❌      |
+|                      [Nemotron](model_doc/nemotron)                      |       ✅        |         ❌         |      ❌      |
 |                         [Nezha](model_doc/nezha)                         |       ✅        |         ❌         |      ❌      |
 |                          [NLLB](model_doc/nllb)                          |       ✅        |         ❌         |      ❌      |
 |                      [NLLB-MOE](model_doc/nllb-moe)                      |       ✅        |         ❌         |      ❌      |
--- a/docs/source/en/internal/generation_utils.md
+++ b/docs/source/en/internal/generation_utils.md
@ -386,11 +386,25 @@ A [`Constraint`] can be used to force the generation to include specific tokens
    - get_seq_length
    - reorder_cache

+[[autodoc]] OffloadedCache
+    - update
+    - prefetch_layer
+    - evict_previous_layer
+
 [[autodoc]] StaticCache
    - update
    - get_seq_length
    - reset

+[[autodoc]] HybridCache
+    - update
+    - get_seq_length
+    - reset
+
+[[autodoc]] SlidingWindowCache
+    - update
+    - reset
+
 [[autodoc]] EncoderDecoderCache
    - get_seq_length
    - to_legacy_cache
@ -398,6 +412,11 @@ A [`Constraint`] can be used to force the generation to include specific tokens
    - reset
    - reorder_cache

+[[autodoc]] MambaCache
+    - update_conv_state
+    - update_ssm_state
+    - reset
+
 ## Watermark Utils

 [[autodoc]] WatermarkDetector
--- a/docs/source/en/kv_cache.md
+++ b/docs/source/en/kv_cache.md
@ -0,0 +1,346 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Best Practices for Generation with Cache
+
+Efficient caching is crucial for optimizing the performance of models in various generative tasks,
+including text generation, translation, summarization and other transformer-based applications.
+Effective caching helps reduce computation time and improve response rates, especially in real-time or resource-intensive applications.
+
+Transformers support various caching methods, leveraging "Cache" classes to abstract and manage the caching logic.
+This document outlines best practices for using these classes to maximize performance and efficiency.
+Check out all the available `Cache` classes in the [API documentation](./internal/generation_utils.md).
+
+## What is Cache and why we should care?
+
+Imagine you’re having a conversation with someone, and instead of remembering what was said previously, you have to start from scratch every time you respond. This would be slow and inefficient, right? In the world of Transformer models, a similar concept applies, and that's where Caching keys and values come into play. From now on, I'll refer to the concept as KV Cache.
+
+KV cache is needed to optimize the generation in autoregressive models, where the model predicts text token by token. This process can be slow since the model can generate only one token at a time, and each new prediction is dependent on the previous context. That means, to predict token number 1000 in the generation, you need information from the previous 999 tokens, which comes in the form of some matrix multiplications across the representations of those tokens. But to predict token number 1001, you also need the same information from the first 999 tokens, plus additional information from token number 1000. That is where key-value cache is used to optimize the sequential generation process by storing previous calculations to reuse in subsequent tokens, so they don't need to be computed again.
+
+More concretely, key-value cache acts as a memory bank for these generative models, where the model stores key-value pairs derived from self-attention layers for previously processed tokens. By storing this information, the model can avoid redundant computations and instead retrieve keys and values of previous tokens from the cache.
+
+<details>
+  <summary><em>For the Curious Minds Who Like to Dive Deep</em></summary>
+
+  ### Under the Hood: How Cache Object Works in Attention Mechanism
+
+  When utilizing a cache object in the input, the Attention module performs several critical steps to integrate past and present information seamlessly.
+
+  The Attention module concatenates the current key-values with the past key-values stored in the cache. This results in attention weights of shape `(new_tokens_length, past_kv_length + new_tokens_length)`. Essentially, the past and current key-values are combined to compute attention scores, ensuring that the model considers both previous context and new input. The concatenated key-values are used to compute the attention scores resulting in attention weights of shape `(new_tokens_length, past_kv_length + new_tokens_length)`.
+
+  Therefore, when iteratively calling `forward()` instead of the `generate()` method, it’s crucial to ensure that the attention mask shape matches the combined length of past and current key-values. The attention mask should have the shape `(batch_size, past_kv_length + new_tokens_length)`. This is usually handled internally when you call `generate()` method. If you want to implement your own generation loop with Cache classes, take this into consideration and prepare the attention mask to hold values to current and past tokens.
+
+  <Tip warning={true}>
+
+  One important concept you need to know when writing your own generation loop, is `cache_position`. In case you want to reuse an already filled Cache object by calling `forward()`, you have to pass in a valid `cache_position` which will indicate the positions of inputs in the sequence. Note that `cache_position` is not affected by padding, and always adds one more position for each token. For example, if key/value cache contains 10 tokens (no matter how many of it is a pad token), the cache position for the next token should be `torch.tensor([10])`.
+
+  </Tip>
+
+
+  See an example below for how to implement your own generation loop.
+    
+  ```python
+  >>> import torch
+  >>> from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
+ 
+  >>> model_id = "meta-llama/Llama-2-7b-chat-hf"
+  >>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda:0")
+  >>> tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+  >>> past_key_values = DynamicCache()
+  >>> messages = [{"role": "user", "content": "Hello, what's your name."}]
+  >>> inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to("cuda:0")
+
+  >>> generated_ids = inputs.input_ids
+  >>> cache_position = torch.arange(inputs.input_ids.shape[1], dtype=torch.int64, device="cuda:0")
+  >>> max_new_tokens = 10
+
+  >>> for _ in range(max_new_tokens):
+  ...     outputs = model(**inputs, cache_position=cache_position, past_key_values=past_key_values, use_cache=True)     
+  ...     # Greedily sample one next token
+  ...     next_token_ids = outputs.logits[:, -1:].argmax(-1)
+  ...     generated_ids = torch.cat([generated_ids, next_token_ids], dim=-1)   
+  ...
+  ...     # Prepare inputs for the next generation step by leaaving unprocessed tokens, in our case we have only one new token
+  ...     # and expanding attn mask for the new token, as explained above
+  ...     attention_mask = inputs["attention_mask"]
+  ...     attention_mask = torch.cat([attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1)
+  ...     inputs = {"input_ids": next_token_ids, "attention_mask": attention_mask}
+  ...     cache_position = cache_position[-1:] + 1 # add one more position for the next token
+
+  >>> print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
+  "[INST] Hello, what's your name. [/INST]  Hello! My name is LLaMA,"
+  ```
+
+</details>
+
+
+
+## Generate with Cache
+
+In 🤗 Transformers, we support various Cache types to optimize the performance across different models and tasks. By default, all models generate with caching,
+with the [`~DynamicCache`] class being the default cache for most models. It allows us to dynamically grow cache size, by saving more and more keys and values as we generate. If for some reason you don't want to use caches, you can pass `use_cache=False` into the `generate()` method.
+
+Refer to the table below to see the difference between cache types and choose the one that suits best for your use-case.
+
+| Cache Type          | Memory Efficient | Supports torch.compile() | Initialization Recommended | Latency  |  Long Context Generation |
+|---------------------|------------------|--------------------------|----------------------------|----------|--------------------------|
+| Dynamic Cache       |      No          |        No                |         No                 |   Mid    |     No                   |
+| Static Cache        |      No          |        Yes               |         Yes                |   High   |     No                   |
+| Quantized Cache     |      Yes         |        No                |         No                 |   Low    |     Yes                  |
+| Offloaded Cache     |      Yes         |        No                |         No                 |   Low    |     No                   |
+| Sliding Window Cache|      No          |        Yes               |         Yes                |   High   |     No                   |
+| Sink Cache          |      Yes         |        No                |         Yes                |   Mid    |     Yes                  |
+
+
+These cache classes can be set with a `cache_implementation` argument when generating. To learn about the available options for the cache_implementation flag, please refer to the [API Documentation](./main_classes/text_generation.md#transformers.GenerationConfig). Now, let's explore each cache type in detail and see how to use them. Note that the below examples are for decoder-only Tranformer-based models. We also support ["Model-Specific Cache"] classes for models such as Mamba or Jamba, keep reading for more details.
+
+### Quantized Cache
+
+The key and value cache can occupy a large portion of memory, becoming a [bottleneck for long-context generation](https://huggingface.co/blog/llama31#inference-memory-requirements), especially for Large Language Models.
+Quantizing the cache when using `generate()` can significantly reduce memory requirements at the cost of speed.
+
+KV Cache quantization in `transformers` is largely inspired by the paper ["KIVI: A Tuning-Free Asymmetric 2bit Quantization for KV Cache"](https://arxiv.org/abs/2402.02750) and currently supports [`~QuantoQuantizedCache`] and [`~HQQQuantizedCache`] classes. For more information on the inner workings see the paper.
+
+To enable quantization of the key-value cache, one needs to indicate `cache_implementation="quantized"` in the `generation_config`.
+Quantization related arguments should be passed to the `generation_config` either as a `dict` or an instance of a [`~QuantizedCacheConfig`] class.
+One has to indicate which quantization backend to use in the [`~QuantizedCacheConfig`], the default is `quanto`.
+
+<Tip warning={true}>
+
+Cache quantization can be detrimental in terms of latency if the context length is short and there is enough GPU VRAM available to run without cache quantization. It is recommended to seek balance between memory efficiency and latency.
+</Tip>
+
+
+```python
+>>> import torch
+>>> from transformers import AutoTokenizer, AutoModelForCausalLM
+
+>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
+>>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
+>>> inputs = tokenizer("I like rock music because", return_tensors="pt").to(model.device)
+
+>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="quantized", cache_config={"nbits": 4, "backend": "quanto"})
+>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
+I like rock music because it's loud and energetic. It's a great way to express myself and rel
+
+>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20)
+>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
+I like rock music because it's loud and energetic. I like to listen to it when I'm feeling
+```
+
+## OffloadedCache
+
+Similarly to KV cache quantization, [`~OffloadedCache`] strategy aims to reduce GPU VRAM usage.
+It does so by moving the KV cache for most layers to the CPU.
+As the model's `forward()` method iterates over the layers, this strategy maintains the current layer cache on the GPU.
+At the same time it asynchronously prefetches the next layer cache as well as sending the previous layer cache back to the CPU.
+Unlike KV cache quantization, this strategy always produces the same result as the default KV cache implementation.
+Thus, it can serve as a drop-in replacement or a fallback for it.
+
+Depending on your model and the characteristics of your generation task (size of context, number of generated tokens, number of beams, etc.)
+you may notice a small degradation in generation throughput compared to the default KV cache implementation.
+
+To enable KV cache offloading, pass `cache_implementation="offloaded"` in the `generation_config` or directky to the `generate()` call.
+
+```python
+>>> import torch
+>>> from transformers import AutoTokenizer, AutoModelForCausalLM
+>>> ckpt = "microsoft/Phi-3-mini-4k-instruct"
+
+>>> tokenizer = AutoTokenizer.from_pretrained(ckpt)
+>>> model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to("cuda:0")
+>>> inputs = tokenizer("Fun fact: The shortest", return_tensors="pt").to(model.device)
+
+>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=23, cache_implementation="offloaded")
+>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
+Fun fact: The shortest war in history was between Britain and Zanzibar on August 27, 1896.
+
+>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=23)
+>>> print(tokenizer.batch_decode(out, skip_special_tokens=True)[0])
+Fun fact: The shortest war in history was between Britain and Zanzibar on August 27, 1896.
+```
+
+<Tip warning={true}>
+
+Cache offloading requires a GPU and can be slower than dynamic KV cache. Use it if you are getting CUDA out of memory errors.
+
+</Tip>
+
+The example below shows how KV cache offloading can be used as a fallback strategy.
+```python
+>>> import torch
+>>> from transformers import AutoTokenizer, AutoModelForCausalLM
+>>> def resilient_generate(model, *args, **kwargs):
+...     oom = False
+...     try:
+...         return model.generate(*args, **kwargs)
+...     except torch.cuda.OutOfMemoryError as e:
+...         print(e)
+...         print("retrying with cache_implementation='offloaded'")
+...         oom = True
+...     if oom:
+...         torch.cuda.empty_cache()
+...         kwargs["cache_implementation"] = "offloaded"
+...         return model.generate(*args, **kwargs)
+...
+...
+>>> ckpt = "microsoft/Phi-3-mini-4k-instruct"
+>>> tokenizer = AutoTokenizer.from_pretrained(ckpt)
+>>> model = AutoModelForCausalLM.from_pretrained(ckpt, torch_dtype=torch.float16).to("cuda:0")
+>>> prompt = ["okay "*1000 + "Fun fact: The most"]
+>>> inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+>>> beams = { "num_beams": 40, "num_beam_groups": 40, "num_return_sequences": 40, "diversity_penalty": 1.0, "max_new_tokens": 23, "early_stopping": True, }
+>>> out = resilient_generate(model, **inputs, **beams)
+>>> responses = tokenizer.batch_decode(out[:,-28:], skip_special_tokens=True)
+```
+
+On a GPU with 50 GB of RAM, running this code will print
+```
+CUDA out of memory. Tried to allocate 4.83 GiB. GPU
+retrying with cache_implementation='offloaded'
+```
+before successfully generating 40 beams.
+
+
+
+### Static Cache
+
+Since the "DynamicCache" dynamically grows with each generation step, it prevents you from taking advantage of JIT optimizations. The [`~StaticCache`] pre-allocates 
+a specific maximum size for the keys and values, allowing you to generate up to the maximum length without having to modify cache size. Check the below usage example.
+
+For more examples with Static Cache and JIT compilation, take a look at [StaticCache & torchcompile](./llm_optims.md#static-kv-cache-and-torchcompile)
+
+```python
+>>> import torch
+>>> from transformers import AutoTokenizer, AutoModelForCausalLM
+
+>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
+>>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
+>>> inputs = tokenizer("Hello, my name is", return_tensors="pt").to(model.device)
+
+>>> # simply pass the cache implementation="static"
+>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=20, cache_implementation="static")
+>>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
+"Hello, my name is [Your Name], and I am a [Your Profession] with [Number of Years] of"
+```
+
+### Sliding Window Cache
+
+As the name suggests, this cache type implements a sliding window over previous keys and values, retaining only the last `sliding_window` tokens. It should be used with models like Mistral that support sliding window attention. Additionally, similar to Static Cache, this one is JIT-friendly and can be used with the same compile tecniques as Static Cache.
+
+Note that you can use this cache only for models that support sliding window, e.g. Mistral models. 
+
+
+```python
+>>> import torch
+>>> from transformers import AutoTokenizer, AutoModelForCausalLM, SinkCache
+
+>>> tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")
+>>> model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype=torch.float16).to("cuda:0")
+>>> inputs = tokenizer("Yesterday I was on a rock concert and.", return_tensors="pt").to(model.device)
+
+>>> # can be used by passing in cache implementation
+>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=30, cache_implementation="sliding_window")
+>>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
+"Yesterday I was on a rock concert and. I was so excited to see my favorite band. I was so excited that I was jumping up and down and screaming. I was so excited that I"
+```
+
+### Sink Cache
+
+Sink Cache was introduced in ["Efficient Streaming Language Models with Attention Sinks"](https://arxiv.org/abs/2309.17453). It allows you to generate long sequences of text ("infinite length" according to the paper) without any fine-tuning. That is achieved by smart handling of previous keys and values, specifically it retains a few initial tokens from the sequence, called "sink tokens". This is based on the observation that these initial tokens attract a significant portion of attention scores during the generation process. Tokens that come after "sink tokens" are discarded on a sliding windowed basis, keeping only the latest `window_size` tokens. By keeping these initial tokens as "attention sinks," the model maintains stable performance even when dealing with very long texts, thus discarding most of the previous knowledge.
+
+Unlike other cache classes, this one can't be used directly by indicating a `cache_implementation`. You have to initialize the Cache before calling on `generate()` as follows.
+
+```python
+>>> import torch
+>>> from transformers import AutoTokenizer, AutoModelForCausalLM, SinkCache
+
+>>> tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
+>>> model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16).to("cuda:0")
+>>> inputs = tokenizer("This is a long story about unicorns, fairies and magic.", return_tensors="pt").to(model.device)
+
+>>> # get our cache, specify number of sink tokens and window size
+>>> # Note that window size already includes sink tokens, so has to be larger
+>>> past_key_values = SinkCache(window_length=256, num_sink_tokens=4)
+>>> out = model.generate(**inputs, do_sample=False, max_new_tokens=30, past_key_values=past_key_values)
+>>> tokenizer.batch_decode(out, skip_special_tokens=True)[0]
+"This is a long story about unicorns, fairies and magic. It is a fantasy world where unicorns and fairies live together in harmony. The story follows a young girl named Lily"
+```
+
+### Encoder-Decoder Cache
+
+The [`~EncoderDecoderCache`] is a wrapper designed to handle the caching needs of encoder-decoder models. This cache type is specifically built to manage both self-attention and cross-attention caches, ensuring storage and retrieval of past key/values required for these complex models. Cool thing about Encoder-Decoder Cache is that you can set different cache types for the encoder and for the decoder, depending on your use case. Currently this cache is only supported in [Whisper](./model_doc/whisper.md) models but we will be adding more models soon. 
+
+In terms of usage, there is nothing special to be done and calling `generate()` or `forward()` will handle everything for you.
+
+
+### Model-specific Cache Classes
+
+Some models require storing previous keys, values, or states in a specific way, and the above cache classes cannot be used. For such cases, we have several specialized cache classes that are designed for specific models. These models only accept their own dedicated cache classes and do not support using any other cache types. Some examples include [`~HybridCache`] for [Gemma2](./model_doc/gemma2.md) series models or [`~MambaCache`] for [Mamba](./model_doc/mamba.md) architecture models.
+
+
+## Iterative Generation with Cache
+
+We have seen how to use each of the cache types when generating. What if you want to use cache in iterative generation setting, for example in applications like chatbots, where interactions involve multiple turns and continuous back-and-forth exchanges. Iterative generation with cache allows these systems to handle ongoing conversations effectively without reprocessing the entire context at each step. But there are some tips that you should know before you start implementing:
+
+The general format when doing iterative generation is as below. First you have to initialize an empty cache of the type you want, and you can start feeding in new prompts iteratively. Keeping track of dialogues history and formatting can be done with chat templates, read more on that in [chat_templating](./chat_templating.md)
+
+In case you are using Sink Cache, you have to crop your inputs to that maximum length because Sink Cache can generate text longer than its maximum window size, but it expects the first input to not exceed the maximum cache length.  
+
+
+```python
+>>> import torch
+>>> from transformers import AutoTokenizer,AutoModelForCausalLM
+>>> from transformers.cache_utils import (
+>>>     DynamicCache,
+>>>     SinkCache,
+>>>     StaticCache,
+>>>     SlidingWindowCache,
+>>>     QuantoQuantizedCache,
+>>>     QuantizedCacheConfig,
+>>> )
+
+>>> model_id = "meta-llama/Llama-2-7b-chat-hf"
+>>> model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map='auto')
+>>> tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+>>> user_prompts = ["Hello, what's your name?", "Btw, yesterday I was on a rock concert."]
+
+>>> past_key_values = DynamicCache()
+>>> max_cache_length = past_key_values.get_max_length()
+
+>>> messages = []
+>>> for prompt in user_prompts:
+...     messages.append({"role": "user", "content": prompt})
+...     inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to(model.device)
+...     if isinstance(past_key_values, SinkCache):
+...         inputs = {k: v[:, -max_cache_length:] for k, v in inputs.items()}
+... 
+...     input_length = inputs["input_ids"].shape[1]
+...     
+...     outputs = model.generate(**inputs, do_sample=False, max_new_tokens=256, past_key_values=past_key_values)
+...     completion = tokenizer.decode(outputs[0, input_length: ], skip_special_tokens=True)
+...     messages.append({"role": "assistant", "content": completion})
+
+print(messages)
+[{'role': 'user', 'content': "Hello, what's your name?"}, {'role': 'assistant', 'content': " Hello! My name is LLaMA, I'm a large language model trained by a team of researcher at Meta AI. 😊"}, {'role': 'user', 'content': 'Btw, yesterday I was on a rock concert.'}, {'role': 'assistant', 'content': ' Oh, cool! That sounds like a lot of fun! 🎉 Did you enjoy the concert? What was the band like? 🤔'}]
+```
+
+
+## Re-use Cache to continue generation
+
+Sometimes you would want to fist fill-in cache object with key/values for certain prefix prompt and re-use it several times to generate different sequences from it. We are working hard on adding this feature to 🤗 Transformers and will update this section soon. 
--- a/docs/source/en/llm_optims.md
+++ b/docs/source/en/llm_optims.md
@ -18,59 +18,109 @@ Basic inference is slow because LLMs have to be called repeatedly to generate th
 This guide will show you how to use the optimization techniques available in Transformers to accelerate LLM inference.

 > [!TIP]
-> Hugging Face also provides [Text Generation Inference (TGI)](https://hf.co/docs/text-generation-inference), a library dedicated to deploying and serving highly optimized LLMs for inference. It includes more optimization features not included in Transformers, such as continuous batching for increasing throughput and tensor parallelism for multi-GPU inference.
+> Hugging Face also provides [Text Generation Inference (TGI)](https://hf.co/docs/text-generation-inference), a library dedicated to deploying and serving highly optimized LLMs for inference. It includes deployment-oriented optimization features not included in Transformers, such as continuous batching for increasing throughput and tensor parallelism for multi-GPU inference.

-## Static kv-cache and torch.compile
+## Static kv-cache and `torch.compile`

 During decoding, a LLM computes the key-value (kv) values for each input token and since it is autoregressive, it computes the same kv values each time because the generated output becomes part of the input now. This is not very efficient because you're recomputing the same kv values each time.

-To optimize this, you can use a kv-cache to store the past keys and values instead of recomputing them each time. However, since the kv-cache grows with each generation step and is dynamic, it prevents you from taking advantage of [torch.compile](./perf_torch_compile), a powerful optimization tool that fuses PyTorch code into fast and optimized kernels.
+To optimize this, you can use a kv-cache to store the past keys and values instead of recomputing them each time. However, since the kv-cache grows with each generation step and is dynamic, it prevents you from taking advantage of [`torch.compile`](./perf_torch_compile), a powerful optimization tool that fuses PyTorch code into fast and optimized kernels.

-The *static kv-cache* solves this issue by pre-allocating the kv-cache size to a maximum value which allows you to combine it with torch.compile for up to a 4x speed up.
+The *static kv-cache* solves this issue by pre-allocating the kv-cache size to a maximum value which allows you to combine it with `torch.compile` for up to a 4x speed up. Your speed up may vary depending on the model size (larger models have a smaller speed up) and hardware.

 > [!WARNING]
-> Currently, only [Llama](./model_doc/llama2) and a few other models support static kv-cache and torch.compile. Check [this issue](https://github.com/huggingface/transformers/issues/28981) for a live model compatibility list.
+> Currently, only [Llama](./model_doc/llama2) and a few other models support static kv-cache and `torch.compile`. Check [this issue](https://github.com/huggingface/transformers/issues/28981) for a live model compatibility list.

-For this example, let's load the [Gemma](https://hf.co/google/gemma-2b) model.
+There are three flavors of static kv-cache usage, depending on the complexity of your task:
+1. Basic usage: simply set a flag in `generation_config` (recommended);
+2. Advanced usage: handle a cache object for multi-turn generation or a custom generation loop;
+3. Advanced usage: compile the entire `generate` function into a single graph, if having a single graph is relevant for you.
+
+Select the correct tab below for further instructions on each of these flavors.
+
+> [!TIP]
+> Regardless of the strategy used with `torch.compile`, you can avoid shape-related recompilations if you left-pad your LLM inputs to a limited set of values. The [`pad_to_multiple_of` tokenizer flag](https://huggingface.co/docs/transformers/main_classes/tokenizer#transformers.PreTrainedTokenizer.__call__.pad_to_multiple_of) is your friend!
+
+<hfoptions id="static-kv">
+<hfoption id="basic usage: generation_config">
+
+For this example, let's use the [Gemma](https://hf.co/google/gemma-2b) model. All we need to do is to:
+1. Access the model's `generation_config` attribute and set the `cache_implementation` to "static";
+2. Call `torch.compile` on the model to compile the forward pass with the static kv-cache.
+
+And that's it!

 ```py
 from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"  # To prevent long warnings :)

 tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
-model = AutoModelForCausalLM.from_pretrained(
-    "google/gemma-2b", device_map="auto"
-)
-```
+model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto")

-There are two ways you can configure the model to use a static kv-cache. For a 7B model on an A100, both methods get a 4x speed up in the forward pass. Your speed up may vary depending on the model size (larger models have a smaller speed up) and hardware. If you're using the [`~GenerationMixin.generate`] method, the speed up is ~3x. The forward pass (which still gets 4x speed up) is only a part of the whole [`~GenerationMixin.generate`] code.
-
-<hfoptions id="static-kv">
-<hfoption id="generation_config">
-
-Access the model's `generation_config` attribute and set the `cache_implementation` to "static".
-
-```py
 model.generation_config.cache_implementation = "static"
-```

-Call torch.compile on the model to compile the forward pass with the static kv-cache.
-
-```py
-compiled_model = torch.compile(model, mode="reduce-overhead", fullgraph=True)
+model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
 input_text = "The theory of special relativity states "
 input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")

-outputs = compiled_model.generate(**input_ids)
-tokenizer.batch_decode(outputs, skip_special_tokens=True)
+outputs = model.generate(**input_ids)
+print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
 ['The theory of special relativity states 1. The speed of light is constant in all inertial reference']
 ```

-Under the hood, `generate` will attempt to reuse the same cache object, removing the need for re-compilation at each call. However, if the batch size or the maximum output length increase between calls, the cache will have to be reinitialized, triggering a new compilation.
+Under the hood, `generate` will attempt to reuse the same cache object, removing the need for re-compilation at each call. Avoiding re-compilation is critical to get the most out of `torch.compile`, and you should be aware of the following:
+1. If the batch size changes or the maximum output length increases between calls, the cache will have to be reinitialized, triggering a new compilation;
+2. The first couple of calls of the compiled function are slower, as the function is being compiled.
+
+> [!WARNING]
+> For a more advanced usage of the static cache, such as multi-turn conversations, we recommend instantiating and manipulating the cache object outside [`~GenerationMixin.generate`]. See the advanced usage tab.

 </hfoption>
-<hfoption id="Static Cache">
+<hfoption id="advanced usage: control Static Cache">

-A [`StaticCache`] object can be passed to the model's forward pass under the `past_key_values` argument, enabling the use of this object as a static kv-cache. Using this strategy, you can write your own function to decode the next token given the current token and position and cache position of previously generated tokens. You can also pass the [`StaticCache`] object to [`~GenerationMixin.generate`] and use it across calls, like you would do with a dynamic cache.
+A [`StaticCache`] object can be passed to the model's [`~GenerationMixin.generate`] under the `past_key_values` argument. The object will retain the cache contents, so you can pass it to a new [`~GenerationMixin.generate`] call to continue generation, like you would do with a dynamic cache.
+
+```py
+from transformers import AutoTokenizer, AutoModelForCausalLM, StaticCache
+import torch
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"  # To prevent long warnings :)
+
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
+model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto")
+
+model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)
+input_text = "The theory of special relativity states "
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+prompt_length = input_ids.input_ids.shape[1]
+model.generation_config.max_new_tokens = 16
+
+past_key_values = StaticCache(
+    config=model.config,
+    max_batch_size=1,
+    # If you plan to reuse the cache, make sure the cache length is large enough for all cases
+    max_cache_len=prompt_length+(model.generation_config.max_new_tokens*2),
+    device=model.device,
+    dtype=model.dtype
+)
+outputs = model.generate(**input_ids, past_key_values=past_key_values)
+print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
+['The theory of special relativity states 1. The speed of light is constant in all inertial reference frames. 2']
+
+# pass in the generated text and the same cache object to continue generation from where it left off. Optionally, in a
+# multi-turn conversation, append the new user input to the generated text.
+new_input_ids = outputs
+outputs = model.generate(new_input_ids, past_key_values=past_key_values)
+print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
+['The theory of special relativity states 1. The speed of light is constant in all inertial reference frames. 2. The speed of light is constant in all inertial reference frames. 3.']
+```
+
+> [!TIP]
+> If you want to reuse the same [`StaticCache`] object on a new prompt, be sure to reset its contents with the `.reset()` method between calls
+
+If you want to go further down a level, the [`StaticCache`] object can also be passed to the model's forward pass under the same `past_key_values` argument. Using this strategy, you can write your own function to decode the next token given the current token and position and cache position of previously generated tokens.

 ```py
 from transformers import LlamaTokenizer, LlamaForCausalLM, StaticCache, logging
@ -102,12 +152,9 @@ def decode_one_tokens(model, cur_token, input_pos, cache_position, past_key_valu
    return new_token
 ```

-There are a few important things you must do to enable static kv-cache and torch.compile with the `StaticCache` method:
-
+There are a few important things you must do to enable static kv-cache and `torch.compile` with the `StaticCache` method:
 1. Initialize the [`StaticCache`] instance before using the model for inference. There you can configure parameters like the maximum batch size and sequence length.
-
-2. Call torch.compile on the model to compile the forward pass with the static kv-cache.
-
+2. Call `torch.compile` on the model to compile the forward pass with the static kv-cache.
 3. Set `enable_math=True` in the [torch.backends.cuda.sdp_kernel](https://pytorch.org/docs/master/generated/torch.nn.functional.scaled_dot_product_attention.html) context manager to enable the native PyTorch C++ implementation of scaled dot product attention to speed up inference even more.

 ```py
@ -142,8 +189,34 @@ text
 'My favorite all time favorite condiment is ketchup. I love it on everything. I love it on my eggs, my fries, my chicken, my burgers, my hot dogs, my sandwiches, my salads, my p']
 ```

-> [!TIP]
-> If you want to reuse the [`StaticCache`] object on a new prompt, be sure to reset its contents with the `.reset()` method
+</hfoption>
+<hfoption id="advanced usage: end-to-end generate compilation">
+
+Compiling the entire `generate` function, in terms of code, is even simpler than in the basic usage: call `torch.compile` on `generate` to compile the entire function. No need to specify the use of the static cache: although it is compatible, dynamic cache (default) was faster in our benchmarks.
+
+```py
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import torch
+import os
+os.environ["TOKENIZERS_PARALLELISM"] = "false"  # To prevent long warnings :)
+
+tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
+model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", device_map="auto")
+
+model.generate = torch.compile(model.generate, mode="reduce-overhead", fullgraph=True)
+input_text = "The theory of special relativity states "
+input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
+
+outputs = model.generate(**input_ids)
+print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
+['The theory of special relativity states 1. The speed of light is constant in all inertial reference']
+```
+
+As a result, we compile not only the model forward pass, but also all input preparation, logit processor operations, and so on. The result should be a slightly `generate` call, compared to the basic usage example, and the compiled graph may be better suited to more exotic hardware devices or use cases. However, there are severe drawbacks in using this approach:
+1. Compilation is much slower;
+2. All parameterization of `generate` must be done through `generation_config`;
+3. Many warnings and exceptions are suppressed -- we suggest testing with its uncompiled form first;
+4. Although we are working on it, it is heavily feature restricted (for instance, at the time of writing, generation does not stop if an EOS token is selected).

 </hfoption>
 </hfoptions>
--- a/docs/source/en/main_classes/agent.md
+++ b/docs/source/en/main_classes/agent.md
@ -72,6 +72,10 @@ We provide two types of agents, based on the main [`Agent`] class:

 [[autodoc]] launch_gradio_demo

+### stream_to_gradio
+
+[[autodoc]] stream_to_gradio
+
 ### ToolCollection

 [[autodoc]] ToolCollection
--- a/docs/source/en/main_classes/data_collator.md
+++ b/docs/source/en/main_classes/data_collator.md
@ -66,3 +66,8 @@ Examples of use can be found in the [example scripts](../examples) or [example n
    - numpy_mask_tokens
    - tf_mask_tokens
    - torch_mask_tokens
+
+## DataCollatorWithFlattening
+
+[[autodoc]] data.data_collator.DataCollatorWithFlattening
+
--- a/docs/source/en/model_doc/chameleon.md
+++ b/docs/source/en/model_doc/chameleon.md
@ -137,7 +137,7 @@ from transformers import ChameleonForConditionalGeneration, BitsAndBytesConfig
 quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.float16,
+    bnb_4bit_compute_dtype=torch.bfloat16,
 )

 model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", quantization_config=quantization_config, device_map="cuda")
--- a/docs/source/en/model_doc/gemma2.md
+++ b/docs/source/en/model_doc/gemma2.md
@ -30,6 +30,12 @@ Tips:

 - The original checkpoints can be converted using the conversion script `src/transformers/models/Gemma2/convert_Gemma2_weights_to_hf.py` 

+<Tip warning={true}>
+
+- Gemma2 uses sliding window attention every second layer, which makes it unsuitable for typical kv caching with [`~DynamicCache`] or tuples of tensors. To enable caching in Gemma2 forward call, you must initialize a [`~HybridCache`] instance and pass it as `past_key_values` to the forward call. Note, that you also have to prepare `cache_position` if the `past_key_values` already contains previous keys and values.
+
+</Tip>
+
 This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ), [Pedro Cuenca](https://huggingface.co/pcuenq) and [Tom Arsen]().


--- a/docs/source/en/model_doc/grounding-dino.md
+++ b/docs/source/en/model_doc/grounding-dino.md
@ -41,33 +41,40 @@ The original code can be found [here](https://github.com/IDEA-Research/Grounding
 Here's how to use the model for zero-shot object detection:

 ```python
-import requests
+>>> import requests

-import torch
-from PIL import Image
-from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection, 
+>>> import torch
+>>> from PIL import Image
+>>> from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

-model_id = "IDEA-Research/grounding-dino-tiny"
+>>> model_id = "IDEA-Research/grounding-dino-tiny"
+>>> device = "cuda"

-processor = AutoProcessor.from_pretrained(model_id)
-model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
+>>> processor = AutoProcessor.from_pretrained(model_id)
+>>> model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)

-image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-image = Image.open(requests.get(image_url, stream=True).raw)
-# Check for cats and remote controls
-text = "a cat. a remote control."
+>>> image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+>>> image = Image.open(requests.get(image_url, stream=True).raw)
+>>> # Check for cats and remote controls
+>>> text = "a cat. a remote control."

-inputs = processor(images=image, text=text, return_tensors="pt").to(device)
-with torch.no_grad():
-    outputs = model(**inputs)
+>>> inputs = processor(images=image, text=text, return_tensors="pt").to(device)
+>>> with torch.no_grad():
+...     outputs = model(**inputs)

-results = processor.post_process_grounded_object_detection(
-    outputs,
-    inputs.input_ids,
-    box_threshold=0.4,
-    text_threshold=0.3,
-    target_sizes=[image.size[::-1]]
-)
+>>> results = processor.post_process_grounded_object_detection(
+...     outputs,
+...     inputs.input_ids,
+...     box_threshold=0.4,
+...     text_threshold=0.3,
+...     target_sizes=[image.size[::-1]]
+... )
+>>> print(results)
+[{'boxes': tensor([[344.6959,  23.1090, 637.1833, 374.2751],
+        [ 12.2666,  51.9145, 316.8582, 472.4392],
+        [ 38.5742,  70.0015, 176.7838, 118.1806]], device='cuda:0'),
+  'labels': ['a cat', 'a cat', 'a remote control'],
+  'scores': tensor([0.4785, 0.4381, 0.4776], device='cuda:0')}]
 ```

 ## Grounded SAM
--- a/docs/source/en/model_doc/llava_next.md
+++ b/docs/source/en/model_doc/llava_next.md
@ -55,12 +55,12 @@ The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/

 - Note that each checkpoint has been trained with a specific prompt format, depending on which large language model (LLM) was used. You can use the processor's `apply_chat_template` to format your prompts correctly. For that you have to construct a conversation history, passing a plain string will not format your prompt. Each message in the conversation history for chat templates is a dictionary with keys "role" and "content". The "content" should be a list of dictionaries, for "text" and "image" modalities. Below is an example of how to do that and the list of formats accepted by each checkpoint.

-We will use [llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-hf/llava-v1.6-mistral-7b-hf) and a conversation history of text and image. Each content field has to be a list of dicts, as follows:
+We will use [llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) and a conversation history of text and image. Each content field has to be a list of dicts, as follows:

 ```python
 from transformers import LlavaNextProcessor

-processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-hf/llava-v1.6-mistral-7b-hf")
+processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")

 conversation = [
    {
--- a/docs/source/en/model_doc/mamba2.md
+++ b/docs/source/en/model_doc/mamba2.md
@ -0,0 +1,106 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Mamba 2
+
+## Overview
+
+The Mamba2 model was proposed in [Transformers are SSMs: Generalized Models and Efficient Algorithms Through Structured State Space Duality](https://arxiv.org/abs/2405.21060) by Tri Dao and Albert Gu. It is a State Space Model similar to Mamba 1, with better performances in a simplified architecture. 
+
+
+The abstract from the paper is the following:
+
+*While Transformers have been the main architecture behind deep learning's success in language modeling, state-space models (SSMs) such as Mamba have recently been shown to match or outperform Transformers at small to medium scale. We show that these families of models are actually quite closely related, and develop a rich framework of theoretical connections between SSMs and variants of attention, connected through various decompositions of a well-studied class of structured semiseparable matrices. Our state space duality (SSD) framework allows us to design a new architecture (Mamba-2) whose core layer is an a refinement of Mamba's selective SSM that is 2-8X faster, while continuing to be competitive with Transformers on language modeling.*
+
+Tips:
+
+This version should support all implementations of Mamba 2, and in particular [Mamba-2 codestral](https://huggingface.co/mistralai/Mamba-Codestral-7B-v0.1) from Mistral AI. In particular, mamba 2 codestral was released with a number of `groups` equal to 8, which can be thought intuitively as similar to the number of kv heads in an attention-based model. 
+This model has two different forward passes, `torch_forward` or `cuda_kernels_forward`. The latter uses the original cuda kernels if they are found in your environment, and is slower on the prefill i.e. requires a "warmup run" due to high cpu overhead, see [here](https://github.com/state-spaces/mamba/issues/389#issuecomment-2171755306) and [also here](https://github.com/state-spaces/mamba/issues/355#issuecomment-2147597457). Without compilation, the `torch_forward` implementation is faster by a factor 3 to 4. Further, there are no positional embeddings in this model, but there is an `attention_mask` and a specific logic to mask out hidden states in two places in the case of batched generation, see [here](https://github.com/state-spaces/mamba/issues/66#issuecomment-1863563829) as well. Due to this, in addition to the reimplementation of mamba2 kernels, batched generation and cached generation are expected to have slight discrepancies. Further, the results given by the cuda kernels or the torch forward are expected to be slightly different. The SSM algorithm heavily relies on tensor contractions, which have matmul equivalents but the order of operations is slightly different, making the difference greater at smaller precisions. 
+Another note, shutdown of hidden states corresponding to padding tokens is done in 2 places and mostly has been tested with left-padding. Right-padding will propagate noise down the line and is not guaranteed to yield satisfactory results. `tokenizer.padding_side = "left"` ensures you are using the correct padding side.
+
+This model was contributed by [Molbap](https://huggingface.co/Molbap), with tremendous help from [Anton Vlasjuk](https://github.com/vasqu).
+The original code can be found [here](https://github.com/state-spaces/mamba).
+
+
+# Usage
+
+### A simple generation example: 
+```python 
+from transformers import MambaConfig, MambaForCausalLM, AutoTokenizer
+import torch
+model_id = 'mistralai/Mamba-Codestral-7B-v0.1'
+tokenizer = AutoTokenizer.from_pretrained(model_id, revision='refs/pr/9', from_slow=True, legacy=False)
+model = MambaForCausalLM.from_pretrained(model_id, revision='refs/pr/9')
+input_ids = tokenizer("Hey how are you doing?", return_tensors= "pt")["input_ids"]
+
+out = model.generate(input_ids, max_new_tokens=10)
+print(tokenizer.batch_decode(out))
+```
+
+Here's a draft script for finetuning: 
+```python 
+from trl import SFTTrainer
+from peft import LoraConfig
+from transformers import AutoTokenizer, Mamba2ForCausalLM, TrainingArguments
+model_id = 'mistralai/Mamba-Codestral-7B-v0.1'
+tokenizer = AutoTokenizer.from_pretrained(model_id, revision='refs/pr/9', from_slow=True, legacy=False)
+tokenizer.pad_token = tokenizer.eos_token
+tokenizer.padding_side = "left" #enforce padding side left
+
+model = Mamba2ForCausalLM.from_pretrained(model_id, revision='refs/pr/9')
+dataset = load_dataset("Abirate/english_quotes", split="train")
+# Without CUDA kernels, batch size of 2 occupies one 80GB device
+# but precision can be reduced.
+# Experiments and trials welcome!
+training_args = TrainingArguments(
+    output_dir="./results",
+    num_train_epochs=3,
+    per_device_train_batch_size=2,
+    logging_dir='./logs',
+    logging_steps=10,
+    learning_rate=2e-3
+)
+lora_config =  LoraConfig(
+        r=8,
+        target_modules=["embeddings", "in_proj", "out_proj"],
+        task_type="CAUSAL_LM",
+        bias="none"
+)
+trainer = SFTTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    args=training_args,
+    peft_config=lora_config,
+    train_dataset=dataset,
+    dataset_text_field="quote",
+)
+trainer.train()
+```
+
+
+## Mamba2Config
+
+[[autodoc]] Mamba2Config
+
+## Mamba2Model
+
+[[autodoc]] Mamba2Model
+    - forward
+
+## Mamba2LMHeadModel
+
+[[autodoc]] Mamba2ForCausalLM
+    - forward
--- a/docs/source/en/model_doc/nemotron.md
+++ b/docs/source/en/model_doc/nemotron.md
@ -0,0 +1,148 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+Copyright (c) 2024, NVIDIA CORPORATION.  All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+-->
+
+# Nemotron
+
+## Nemotron
+
+### License
+
+The use of this model is governed by the [NVIDIA AI Foundation Models Community License Agreement](https://developer.nvidia.com/downloads/nv-ai-foundation-models-license).
+
+### Description
+
+Nemotron-4 is a family of enterprise ready generative text models compatible with [NVIDIA NeMo Framework](https://www.nvidia.com/en-us/ai-data-science/generative-ai/nemo-framework/).
+
+NVIDIA NeMo is an end-to-end, cloud-native platform to build, customize, and deploy generative AI models anywhere. It includes training and inferencing frameworks, guardrailing toolkits, data curation tools, and pretrained models, offering enterprises an easy, cost-effective, and fast way to adopt generative AI. To get access to NeMo Framework, please sign up at [this link](https://developer.nvidia.com/nemo-framework/join).
+
+### References
+
+[Announcement Blog](https://developer.nvidia.com/blog/nvidia-ai-foundation-models-build-custom-enterprise-chatbots-and-co-pilots-with-production-ready-llms/)
+
+### Model Architecture
+
+**Architecture Type:** Transformer
+
+**Network Architecture:** Transformer Decoder (auto-regressive language model).
+
+## Minitron
+
+### Minitron 4B Base
+
+Minitron is a family of small language models (SLMs) obtained by pruning NVIDIA's [Nemotron-4 15B](https://arxiv.org/abs/2402.16819) model. We prune model embedding size, attention heads, and MLP intermediate dimension, following which, we perform continued training with distillation to arrive at the final models.
+
+Deriving the Minitron 8B and 4B models from the base 15B model using our approach requires up to **40x fewer training tokens** per model compared to training from scratch; this results in **compute cost savings of 1.8x** for training the full model family (15B, 8B, and 4B). Minitron models exhibit up to a 16% improvement in MMLU scores compared to training from scratch, perform comparably to other community models such as Mistral 7B, Gemma 7B and Llama-3 8B, and outperform state-of-the-art compression techniques from the literature. Please refer to our [arXiv paper](https://arxiv.org/abs/2407.14679) for more details.
+
+Minitron models are for research and development only.
+
+### HuggingFace Quickstart
+
+The following code provides an example of how to load the Minitron-4B model and use it to perform text generation.
+
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+# Load the tokenizer and model
+model_path = 'nvidia/Minitron-4B-Base'
+tokenizer  = AutoTokenizer.from_pretrained(model_path)
+
+device = 'cuda'
+dtype  = torch.bfloat16
+model  = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=dtype, device_map=device)
+
+# Prepare the input text
+prompt = 'Complete the paragraph: our solar system is'
+inputs = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
+
+# Generate the output
+outputs = model.generate(inputs, max_length=20)
+
+# Decode and print the output
+output_text = tokenizer.decode(outputs[0])
+print(output_text)
+```
+
+### License
+
+Minitron is released under the [NVIDIA Open Model License Agreement](https://developer.download.nvidia.com/licenses/nvidia-open-model-license-agreement-june-2024.pdf).
+
+### Evaluation Results
+
+*5-shot performance.* Language Understanding evaluated using [Massive Multitask Language Understanding](https://arxiv.org/abs/2009.03300):
+
+| Average |
+| :---- |
+| 58.6 |
+
+*Zero-shot performance.* Evaluated using select datasets from the [LM Evaluation Harness](https://github.com/EleutherAI/lm-evaluation-harness) with additions:
+
+| HellaSwag | Winogrande | GSM8K| ARC-C | XLSum |
+| :------------- | :------------- | :------------- | :------------- | :------------- |
+| 75.0 | 74.0 | 24.1  | 50.9 | 29.5
+
+
+*Code generation performance*. Evaluated using [HumanEval](https://github.com/openai/human-eval):
+
+| p@1, 0-Shot |
+| :------------- |
+| 23.3 |
+
+Please refer to our [paper](https://arxiv.org/abs/2407.14679) for the full set of results.
+
+### Citation
+
+If you find our work helpful, please consider citing our paper:
+```
+@article{minitron2024,
+      title={Compact Language Models via Pruning and Knowledge Distillation},
+      author={Saurav Muralidharan and Sharath Turuvekere Sreenivas and Raviraj Joshi and Marcin Chochowski and Mostofa Patwary and Mohammad Shoeybi and Bryan Catanzaro and Jan Kautz and Pavlo Molchanov},
+      journal={arXiv preprint arXiv:2407.14679},
+      year={2024},
+      url={https://arxiv.org/abs/2407.14679},
+}
+```
+
+## NemotronConfig
+
+[[autodoc]] NemotronConfig
+
+
+## NemotronModel
+
+[[autodoc]] NemotronModel
+    - forward
+
+
+## NemotronForCausalLM
+
+[[autodoc]] NemotronForCausalLM
+    - forward
+
+## NemotronForSequenceClassification
+
+[[autodoc]] NemotronForSequenceClassification
+    - forward
+
+
+## NemotronForQuestionAnswering
+
+[[autodoc]] NemotronForQuestionAnswering
+    - forward
+
+
+## NemotronForTokenClassification
+
+[[autodoc]] NemotronForTokenClassification
+    - forward
--- a/docs/source/en/model_doc/nllb.md
+++ b/docs/source/en/model_doc/nllb.md
@ -101,7 +101,7 @@ for the list of all BCP-47 in the Flores 200 dataset.
 >>> inputs = tokenizer(article, return_tensors="pt")

 >>> translated_tokens = model.generate(
-...     **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["fra_Latn"], max_length=30
+...     **inputs, forced_bos_token_id=tokenizer.convert_tokens_to_ids("fra_Latn"), max_length=30
 ... )
 >>> tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
 Le chef de l'ONU dit qu'il n'y a pas de solution militaire en Syrie
@ -126,7 +126,7 @@ See example below for a translation from romanian to german:
 >>> inputs = tokenizer(article, return_tensors="pt")

 >>> translated_tokens = model.generate(
-...     **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["deu_Latn"], max_length=30
+...     **inputs, forced_bos_token_id=tokenizer.convert_tokens_to_ids("deu_Latn"), max_length=30
 ... )
 >>> tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
 UN-Chef sagt, es gibt keine militärische Lösung in Syrien
@ -175,7 +175,7 @@ To load a model using Flash Attention 2, we can pass the argument `attn_implemen
 >>> inputs = tokenizer(article, return_tensors="pt").to("cuda")

 >>> translated_tokens = model.generate(
-...     **inputs, forced_bos_token_id=tokenizer.lang_code_to_id["deu_Latn"], max_length=30
+...     **inputs, forced_bos_token_id=tokenizer.convert_tokens_to_ids("deu_Latn"), max_length=30
 ... )
 >>> tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
 "UN-Chef sagt, es gibt keine militärische Lösung in Syrien"
@ -187,4 +187,4 @@ Below is an expected speedup diagram that compares pure inference time between t

 <div style="text-align: center">
 <img src="https://huggingface.co/datasets/visheratin/documentation-images/resolve/main/nllb-speedup.webp">
-</div>
+</div>
--- a/docs/source/en/model_doc/qwen2.md
+++ b/docs/source/en/model_doc/qwen2.md
@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.

 ## Overview

-Qwen2 is the new model series of large language models from the Qwen team. Previously, we released the Qwen series, including Qwen-72B, Qwen-1.8B, Qwen-VL, Qwen-Audio, etc.
+Qwen2 is the new model series of large language models from the Qwen team. Previously, we released the Qwen series, including Qwen2-0.5B, Qwen2-1.5B, Qwen2-7B, Qwen2-57B-A14B, Qwen2-72B, Qwen2-Audio, etc.

 ### Model Details

@ -27,16 +27,16 @@ Qwen2 is a language model series including decoder language models of different

 ## Usage tips

-`Qwen2-7B-beta` and `Qwen2-7B-Chat-beta` can be found on the [Huggingface Hub](https://huggingface.co/Qwen)
+`Qwen2-7B` and `Qwen2-7B-Instruct` can be found on the [Huggingface Hub](https://huggingface.co/Qwen)

-In the following, we demonstrate how to use `Qwen2-7B-Chat-beta` for the inference. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose.
+In the following, we demonstrate how to use `Qwen2-7B-Instruct` for the inference. Note that we have used the ChatML format for dialog, in this demo we show how to leverage `apply_chat_template` for this purpose.

 ```python
 >>> from transformers import AutoModelForCausalLM, AutoTokenizer
 >>> device = "cuda" # the device to load the model onto

->>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen1.5-7B-Chat", device_map="auto")
->>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B-Chat")
+>>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-7B-Instruct", device_map="auto")
+>>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-7B-Instruct")

 >>> prompt = "Give me a short introduction to large language model."

--- a/docs/source/en/model_doc/whisper.md
+++ b/docs/source/en/model_doc/whisper.md
@ -72,7 +72,7 @@ Here is a step-by-step guide to transcribing an audio sample using a pre-trained
 ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
 ```

-Whisper is compatible with the following optimisations:
+Whisper is compatible with the following optimisations for both short and long-form generation:
 - [PyTorch Scaled Dot Product Attention (SDPA)](../perf_infer_gpu_one#pytorch-scaled-dot-product-attention): flash attention and memory-efficient attention kernels. Enabled by default for `torch>=2.1.1`.
 - [Flash Attention 2](../perf_infer_gpu_one#flashattention-2): improved implementation of flash attention through better parallelism and work partitioning. 
 - [torch.compile](../llm_optims#static-kv-cache-and-torchcompile): JIT-compile the forward pass to dispatch to efficient fused kernels.
@ -101,7 +101,8 @@ As an example, the following codesnippet enables SDPA and `torch.compile` for up
 ... ).input_features

 >>> # Compile the forward pass
->>> _ = model.generate(input_features)
+>>> for _ in range(2):
+>>>     model.generate(input_features)

 >>> # Generate token ids using compiled graph (fast!)
 >>> predicted_ids = model.generate(input_features)
--- a/docs/source/en/model_sharing.md
+++ b/docs/source/en/model_sharing.md
@ -77,7 +77,7 @@ Then use `notebook_login` to sign-in to the Hub, and follow the link [here](http

 To ensure your model can be used by someone working with a different framework, we recommend you convert and upload your model with both PyTorch and TensorFlow checkpoints. While users are still able to load your model from a different framework if you skip this step, it will be slower because 🤗 Transformers will need to convert the checkpoint on-the-fly.

-Converting a checkpoint for another framework is easy. Make sure you have PyTorch and TensorFlow installed (see [here](installation) for installation instructions), and then find the specific model for your task in the other framework. 
+Converting a checkpoint for another framework is easy. Make sure you have PyTorch and TensorFlow installed (see [here](installation) for installation instructions), and then find the specific model for your task in the other framework.

 <frameworkcontent>
 <pt>
--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@ -67,6 +67,7 @@ FlashAttention-2 is currently supported for the following architectures:
 * [Mixtral](https://huggingface.co/docs/transformers/model_doc/mixtral#transformers.MixtralModel)
 * [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel)
 * [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel)
+* [Nemotron](https://huggingface.co/docs/transformers/model_doc/nemotron)
 * [NLLB](https://huggingface.co/docs/transformers/model_doc/nllb)
 * [OLMo](https://huggingface.co/docs/transformers/model_doc/olmo#transformers.OlmoModel)
 * [OPT](https://huggingface.co/docs/transformers/model_doc/opt#transformers.OPTModel)
@ -228,6 +229,7 @@ For now, Transformers supports SDPA inference and training for the following arc
 * [Qwen2MoE](https://huggingface.co/docs/transformers/model_doc/qwen2_moe#transformers.Qwen2MoeModel)
 * [Musicgen](https://huggingface.co/docs/transformers/model_doc/musicgen#transformers.MusicgenModel)
 * [MusicGen Melody](https://huggingface.co/docs/transformers/model_doc/musicgen_melody#transformers.MusicgenMelodyModel)
+* [Nemotron](https://huggingface.co/docs/transformers/model_doc/nemotron)
 * [ViT](https://huggingface.co/docs/transformers/model_doc/vit#transformers.ViTModel)
 * [ViTHybrid](https://huggingface.co/docs/transformers/model_doc/vit_hybrid#transformers.ViTHybridModel)
 * [ViTMAE](https://huggingface.co/docs/transformers/model_doc/vit_mae#transformers.ViTMAEModel)
--- a/docs/source/en/perf_torch_compile.md
+++ b/docs/source/en/perf_torch_compile.md
@ -98,7 +98,7 @@ Below you can find the list of the models we benchmarked.
 - [google/vit-base-patch16-224](https://huggingface.co/google/vit-base-patch16-224)
 - [microsoft/beit-base-patch16-224-pt22k-ft22k](https://huggingface.co/microsoft/beit-base-patch16-224-pt22k-ft22k)
 - [facebook/convnext-large-224](https://huggingface.co/facebook/convnext-large-224)
- [microsoft/resnet-50](https://huggingface.co/)
+- [microsoft/resnet-50](https://huggingface.co/microsoft/resnet-50)

 **Image Segmentation** 
 - [nvidia/segformer-b0-finetuned-ade-512-512](https://huggingface.co/nvidia/segformer-b0-finetuned-ade-512-512)
--- a/docs/source/en/tf_xla.md
+++ b/docs/source/en/tf_xla.md
@ -157,7 +157,7 @@ Execution time -- 79.0 ms

 Execution time -- 78.9 ms
 ```
-The first call to `xla_generate()` is time-consuming because of tracing, but the successive calls are orders of magnitude faster. Keep in mind that any change in the generation options at any point with trigger re-tracing and thus leading to slow-downs in the generation time. 
+The first call to `xla_generate()` is time-consuming because of tracing, but the successive calls are orders of magnitude faster. Keep in mind that any change in the generation options at any point will trigger re-tracing and thus leading to slow-downs in the generation time. 

 We didn’t cover all the text generation options 🤗 Transformers provides in this document. We encourage you to read the documentation for advanced use cases.

@ -171,4 +171,4 @@ Here, we leave you with some additional resources if you want to delve deeper in
 * Recommended posts for learning more about XLA and TensorFlow graphs in general:
    * [XLA: Optimizing Compiler for Machine Learning](https://www.tensorflow.org/xla)
    * [Introduction to graphs and tf.function](https://www.tensorflow.org/guide/intro_to_graphs)
-    * [Better performance with tf.function](https://www.tensorflow.org/guide/function) 
+    * [Better performance with tf.function](https://www.tensorflow.org/guide/function)
--- a/docs/source/en/trainer.md
+++ b/docs/source/en/trainer.md
@ -278,7 +278,7 @@ args = TrainingArguments(
    max_steps=100,
    per_device_train_batch_size=2,
    optim="galore_adamw",
-    optim_target_modules=["attn", "mlp"]
+    optim_target_modules=[r".*.attn.*", r".*.mlp.*"]
 )

 model_id = "google/gemma-2b"
@ -315,7 +315,7 @@ args = TrainingArguments(
    max_steps=100,
    per_device_train_batch_size=2,
    optim="galore_adamw",
-    optim_target_modules=["attn", "mlp"],
+    optim_target_modules=[r".*.attn.*", r".*.mlp.*"],
    optim_args="rank=64, update_proj_gap=100, scale=0.10",
 )

@ -359,7 +359,7 @@ args = TrainingArguments(
    max_steps=100,
    per_device_train_batch_size=2,
    optim="galore_adamw_layerwise",
-    optim_target_modules=["attn", "mlp"]
+    optim_target_modules=[r".*.attn.*", r".*.mlp.*"]
 )

 model_id = "google/gemma-2b"
--- a/docs/source/es/chat_templating.md
+++ b/docs/source/es/chat_templating.md
@ -220,7 +220,7 @@ La plantilla de chat para un modelo se almacena en el atributo `tokenizer.chat_t
 >>> from transformers import AutoTokenizer
 >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")

->>> tokenizer.default_chat_template
+>>> tokenizer.chat_template
 "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ '  ' }}{% endif %}{% endfor %}{{ eos_token }}"
 ```

@ -307,12 +307,6 @@ Si estás ajustando finamente un modelo para chat, además de establecer una pla

 </Tip>

-### ¿Qué son las plantillas "default"?
-
-Antes de la introducción de las plantillas de chat, el manejo del chat estaba codificado en el nivel de la clase del modelo. Por razones de compatibilidad con versiones anteriores, hemos conservado este manejo específico de la clase como plantillas predeterminadas, también establecidas a nivel de clase. Si un modelo no tiene una plantilla de chat establecida, pero hay una plantilla predeterminada para su clase de modelo, la clase `TextGenerationPipeline` y métodos como `apply_chat_template` usarán la plantilla de clase en su lugar. Puedes averiguar cuál es la plantilla predeterminada para tu tokenizador comprobando el atributo `tokenizer.default_chat_template`.
-
-Esto es algo que hacemos puramente por razones de compatibilidad con versiones anteriores, para evitar romper cualquier flujo de trabajo existente. Incluso cuando la plantilla de clase es apropiada para tu modelo, recomendamos encarecidamente anular la plantilla predeterminada estableciendo explícitamente el atributo `chat_template` para dejar claro a los usuarios que tu modelo ha sido configurado correctamente para el chat, y para estar preparados para el futuro en caso de que las plantillas predeterminadas alguna vez se alteren o se eliminen.
-
 ### ¿Qué plantilla debería usar?

 Cuando establezcas la plantilla para un modelo que ya ha sido entrenado para chat, debes asegurarte de que la plantilla coincida exactamente con el formato de mensajes que el modelo vio durante el entrenamiento, o de lo contrario es probable que experimentes degradación del rendimiento. Esto es cierto incluso si estás entrenando aún más el modelo; probablemente obtendrás el mejor rendimiento si mantienes constantes los tokens de chat. Esto es muy análogo a la tokenización: generalmente obtienes el mejor rendimiento para la inferencia o el ajuste fino cuando coincides precisamente con la tokenización utilizada durante el entrenamiento.
--- a/docs/source/ja/chat_templating.md
+++ b/docs/source/ja/chat_templating.md
@ -85,7 +85,7 @@ LLM（Language Model）のますます一般的な使用事例の1つは「チ
 >>> from transformers import AutoTokenizer
 >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")

->>> tokenizer.default_chat_template
+>>> tokenizer.chat_template
 "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ '  ' }}{% endif %}{% endfor %}{{ eos_token }}"
 ```

--- a/docs/source/ko/_toctree.yml
+++ b/docs/source/ko/_toctree.yml
@ -73,12 +73,12 @@
        title: 제로샷(zero-shot) 이미지 분류
      - local: tasks/monocular_depth_estimation
        title: 단일 영상 기반 깊이 추정
-      - local: in_translation
-        title: (번역중) Image-to-Image
+      - local: tasks/image_to_image
+        title: Image-to-Image
      - local: in_translation
        title: (번역중) Image Feature Extraction
-      - local: in_translation
-        title: (번역중) Mask Generation
+      - local: tasks/mask_generation
+        title: 마스크 생성
      - local: in_translation
        title: (번역중) Knowledge Distillation for Computer Vision
    title: 컴퓨터 비전
@ -100,8 +100,8 @@
    title: 생성
  - isExpanded: false
    sections:
-    - local: in_translation
-      title: (번역중) Image tasks with IDEFICS
+    - local: tasks/idefics
+      title: IDEFICS를 이용한 이미지 작업
    - local: in_translation
      title: (번역중) LLM prompting guide
    title: (번역중) 프롬프팅
--- a/docs/source/ko/tasks/idefics.md
+++ b/docs/source/ko/tasks/idefics.md
@ -0,0 +1,391 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# IDEFICS를 이용한 이미지 작업[[image-tasks-with-idefics]]
+
+[[open-in-colab]]
+
+개별 작업은 특화된 모델을 미세 조정하여 처리할 수 있지만, 최근 등장하여 인기를 얻고 있는 방식은 대규모 모델을 미세 조정 없이 다양한 작업에 사용하는 것입니다. 예를 들어, 대규모 언어 모델은 요약, 번역, 분류 등과 같은 자연어처리 (NLP) 작업을 처리할 수 있습니다. 이 접근 방식은 텍스트와 같은 단일 모달리티에 국한되지 않으며, 이 가이드에서는 IDEFICS라는 대규모 멀티모달 모델을 사용하여 이미지-텍스트 작업을 다루는 방법을 설명합니다.
+
+[IDEFICS](../model_doc/idefics)는 [Flamingo](https://huggingface.co/papers/2204.14198)를 기반으로 하는 오픈 액세스 비전 및 언어 모델로, DeepMind에서 처음 개발한 최신 시각 언어 모델입니다. 이 모델은 임의의 이미지 및 텍스트 입력 시퀀스를 받아 일관성 있는 텍스트를 출력으로 생성합니다. 이미지에 대한 질문에 답변하고, 시각적인 내용을 설명하며, 여러 이미지에 기반한 이야기를 생성하는 등 다양한 작업을 수행할 수 있습니다. IDEFICS는 [800억 파라미터](https://huggingface.co/HuggingFaceM4/idefics-80b)와 [90억 파라미터](https://huggingface.co/HuggingFaceM4/idefics-9b) 두 가지 버전을 제공하며, 두 버전 모두 🤗 Hub에서 이용할 수 있습니다. 각 버전에는 대화형 사용 사례에 맞게 미세 조정된 버전도 있습니다.
+
+이 모델은 매우 다재다능하며 광범위한 이미지 및 멀티모달 작업에 사용될 수 있습니다. 그러나 대규모 모델이기 때문에 상당한 컴퓨팅 자원과 인프라가 필요합니다. 각 개별 작업에 특화된 모델을 미세 조정하는 것보다 모델을 그대로 사용하는 것이 더 적합한지는 사용자가 판단해야 합니다.
+
+이 가이드에서는 다음을 배우게 됩니다:
+- [IDEFICS 로드하기](#loading-the-model) 및 [양자화된 버전의 모델 로드하기](#quantized-model)
+- IDEFICS를 사용하여:
+  - [이미지 캡셔닝](#image-captioning)
+  - [프롬프트 이미지 캡셔닝](#prompted-image-captioning)
+  - [퓨샷 프롬프트](#few-shot-prompting)
+  - [시각적 질의 응답](#visual-question-answering)
+  - [이미지 분류](#image-classification)
+  - [이미지 기반 텍스트 생성](#image-guided-text-generation)
+- [배치 모드에서 추론 실행](#running-inference-in-batch-mode)
+- [대화형 사용을 위한 IDEFICS 인스트럭트 실행](#idefics-instruct-for-conversational-use)
+
+시작하기 전에 필요한 모든 라이브러리가 설치되어 있는지 확인하세요.
+
+```bash
+pip install -q bitsandbytes sentencepiece accelerate transformers
+```
+
+<Tip>
+다음 예제를 비양자화된 버전의 모델 체크포인트로 실행하려면 최소 20GB의 GPU 메모리가 필요합니다.
+</Tip>
+
+## 모델 로드[[loading-the-model]]
+
+모델을 90억 파라미터 버전의 체크포인트로 로드해 봅시다:
+
+```py
+>>> checkpoint = "HuggingFaceM4/idefics-9b"
+```
+
+다른 Transformers 모델과 마찬가지로, 체크포인트에서 프로세서와 모델 자체를 로드해야 합니다.
+IDEFICS 프로세서는 [`LlamaTokenizer`]와 IDEFICS 이미지 프로세서를 하나의 프로세서로 감싸서 텍스트와 이미지 입력을 모델에 맞게 준비합니다.
+
+```py
+>>> import torch
+
+>>> from transformers import IdeficsForVisionText2Text, AutoProcessor
+
+>>> processor = AutoProcessor.from_pretrained(checkpoint)
+
+>>> model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, device_map="auto")
+```
+
+`device_map`을 `"auto"`로 설정하면 사용 중인 장치를 고려하여 모델 가중치를 가장 최적화된 방식으로 로드하고 저장하는 방법을 자동으로 결정합니다.
+
+### 양자화된 모델[[quantized-model]]
+
+고용량 GPU 사용이 어려운 경우, 모델의 양자화된 버전을 로드할 수 있습니다. 모델과 프로세서를 4비트 정밀도로 로드하기 위해서, `from_pretrained` 메소드에 `BitsAndBytesConfig`를 전달하면 모델이 로드되는 동안 실시간으로 압축됩니다.
+
+```py
+>>> import torch
+>>> from transformers import IdeficsForVisionText2Text, AutoProcessor, BitsAndBytesConfig
+
+>>> quantization_config = BitsAndBytesConfig(
+...     load_in_4bit=True,
+...     bnb_4bit_compute_dtype=torch.float16,
+... )
+
+>>> processor = AutoProcessor.from_pretrained(checkpoint)
+
+>>> model = IdeficsForVisionText2Text.from_pretrained(
+...     checkpoint,
+...     quantization_config=quantization_config,
+...     device_map="auto"
+... )
+```
+
+이제 모델을 제안된 방법 중 하나로 로드했으니, IDEFICS를 사용할 수 있는 작업들을 탐구해봅시다.
+
+## 이미지 캡셔닝[[image-captioning]]
+이미지 캡셔닝은 주어진 이미지에 대한 캡션을 예측하는 작업입니다. 일반적인 응용 분야는 시각 장애인이 다양한 상황을 탐색할 수 있도록 돕는 것입니다. 예를 들어, 온라인에서 이미지 콘텐츠를 탐색하는 데 도움을 줄 수 있습니다.
+
+작업을 설명하기 위해 캡션을 달 이미지 예시를 가져옵니다. 예시:
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-im-captioning.jpg" alt="Image of a puppy in a flower bed"/>
+</div>
+
+사진 제공: [Hendo Wang](https://unsplash.com/@hendoo).
+
+IDEFICS는 텍스트 및 이미지 프롬프트를 모두 수용합니다. 그러나 이미지를 캡션하기 위해 모델에 텍스트 프롬프트를 제공할 필요는 없습니다. 전처리된 입력 이미지만 제공하면 됩니다. 텍스트 프롬프트 없이 모델은 BOS(시퀀스 시작) 토큰부터 텍스트 생성을 시작하여 캡션을 만듭니다.
+
+모델에 이미지 입력으로는 이미지 객체(`PIL.Image`) 또는 이미지를 가져올 수 있는 URL을 사용할 수 있습니다.
+
+```py
+>>> prompt = [
+...     "https://images.unsplash.com/photo-1583160247711-2191776b4b91?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3542&q=80",
+... ]
+
+>>> inputs = processor(prompt, return_tensors="pt").to("cuda")
+>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+
+>>> generated_ids = model.generate(**inputs, max_new_tokens=10, bad_words_ids=bad_words_ids)
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> print(generated_text[0])
+A puppy in a flower bed
+```
+
+<Tip>
+
+`max_new_tokens`의 크기를 증가시킬 때 발생할 수 있는 오류를 피하기 위해 `generate` 호출 시 `bad_words_ids`를 포함하는 것이 좋습니다. 모델로부터 생성된 이미지가 없을 때 새로운 `<image>` 또는 `<fake_token_around_image>` 토큰을 생성하려고 하기 때문입니다.
+이 가이드에서처럼 `bad_words_ids`를 함수 호출 시에 매개변수로 설정하거나, [텍스트 생성 전략](../generation_strategies) 가이드에 설명된 대로 `GenerationConfig`에 저장할 수도 있습니다.
+</Tip>
+
+## 프롬프트 이미지 캡셔닝[[prompted-image-captioning]]
+
+텍스트 프롬프트를 이용하여 이미지 캡셔닝을 확장할 수 있으며, 모델은 주어진 이미지를 바탕으로 텍스트를 계속 생성합니다. 다음 이미지를 예시로 들어보겠습니다:
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-prompted-im-captioning.jpg" alt="Image of the Eiffel Tower at night"/>
+</div>
+
+사진 제공: [Denys Nevozhai](https://unsplash.com/@dnevozhai).
+
+텍스트 및 이미지 프롬프트는 적절한 입력을 생성하기 위해 모델의 프로세서에 하나의 목록으로 전달될 수 있습니다.
+
+```py
+>>> prompt = [
+...     "https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3501&q=80",
+...     "This is an image of ",
+... ]
+
+>>> inputs = processor(prompt, return_tensors="pt").to("cuda")
+>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+
+>>> generated_ids = model.generate(**inputs, max_new_tokens=10, bad_words_ids=bad_words_ids)
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> print(generated_text[0])
+This is an image of the Eiffel Tower in Paris, France.
+```
+
+## 퓨샷 프롬프트[[few-shot-prompting]]
+
+IDEFICS는 훌륭한 제로샷 결과를 보여주지만, 작업에 특정 형식의 캡션이 필요하거나 작업의 복잡성을 높이는 다른 제한 사항이나 요구 사항이 있을 수 있습니다. 이럴 때 퓨샷 프롬프트를 사용하여 맥락 내 학습(In-Context Learning)을 가능하게 할 수 있습니다.
+프롬프트에 예시를 제공함으로써 모델이 주어진 예시의 형식을 모방한 결과를 생성하도록 유도할 수 있습니다.
+
+이전의 에펠탑 이미지를 모델에 예시로 사용하고, 모델에게 이미지의 객체를 학습하는 것 외에도 흥미로운 정보를 얻고 싶다는 것을 보여주는 프롬프트를 작성해 봅시다.
+그런 다음 자유의 여신상 이미지에 대해 동일한 응답 형식을 얻을 수 있는지 확인해 봅시다:
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-few-shot.jpg" alt="Image of the Statue of Liberty"/>
+</div>
+
+사진 제공: [Juan Mayobre](https://unsplash.com/@jmayobres).
+  
+```py
+>>> prompt = ["User:",
+...            "https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3501&q=80",
+...            "Describe this image.\nAssistant: An image of the Eiffel Tower at night. Fun fact: the Eiffel Tower is the same height as an 81-storey building.\n",
+...            "User:",
+...            "https://images.unsplash.com/photo-1524099163253-32b7f0256868?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3387&q=80",
+...            "Describe this image.\nAssistant:"
+...            ]
+
+>>> inputs = processor(prompt, return_tensors="pt").to("cuda")
+>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+
+>>> generated_ids = model.generate(**inputs, max_new_tokens=30, bad_words_ids=bad_words_ids)
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> print(generated_text[0])
+User: Describe this image.
+Assistant: An image of the Eiffel Tower at night. Fun fact: the Eiffel Tower is the same height as an 81-storey building. 
+User: Describe this image.
+Assistant: An image of the Statue of Liberty. Fun fact: the Statue of Liberty is 151 feet tall.
+```
+
+단 하나의 예시만으로도(즉, 1-shot) 모델이 작업 수행 방법을 학습했다는 점이 주목할 만합니다. 더 복잡한 작업의 경우, 더 많은 예시(예: 3-shot, 5-shot 등)를 사용하여 실험해 보는 것도 좋은 방법입니다. 
+
+## 시각적 질의 응답[[visual-question-answering]]
+
+시각적 질의 응답(VQA)은 이미지를 기반으로 개방형 질문에 답하는 작업입니다. 이미지 캡셔닝과 마찬가지로 접근성 애플리케이션에서 사용할 수 있지만, 교육(시각 자료에 대한 추론), 고객 서비스(이미지를 기반으로 한 제품 질문), 이미지 검색 등에서도 사용할 수 있습니다.
+
+이 작업을 위해 새로운 이미지를 가져옵니다:
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-vqa.jpg" alt="Image of a couple having a picnic"/>
+</div>
+
+사진 제공: [Jarritos Mexican Soda](https://unsplash.com/@jarritos).
+
+적절한 지시문을 사용하면 이미지 캡셔닝에서 시각적 질의 응답으로 모델을 유도할 수 있습니다:
+
+```py
+>>> prompt = [
+...     "Instruction: Provide an answer to the question. Use the image to answer.\n",
+...     "https://images.unsplash.com/photo-1623944889288-cd147dbb517c?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80",
+...     "Question: Where are these people and what's the weather like? Answer:"
+... ]
+
+>>> inputs = processor(prompt, return_tensors="pt").to("cuda")
+>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+
+>>> generated_ids = model.generate(**inputs, max_new_tokens=20, bad_words_ids=bad_words_ids)
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> print(generated_text[0])
+Instruction: Provide an answer to the question. Use the image to answer.
+ Question: Where are these people and what's the weather like? Answer: They're in a park in New York City, and it's a beautiful day.
+```
+
+## 이미지 분류[[image-classification]]
+
+IDEFICS는 특정 카테고리의 라벨이 포함된 데이터로 명시적으로 학습되지 않아도 이미지를 다양한 카테고리로 분류할 수 있습니다. 카테고리 목록이 주어지면, 모델은 이미지와 텍스트 이해 능력을 사용하여 이미지가 속할 가능성이 높은 카테고리를 추론할 수 있습니다.
+
+여기에 야채 가판대 이미지가 있습니다.
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-classification.jpg" alt="Image of a vegetable stand"/>
+</div>
+
+사진 제공: [Peter Wendt](https://unsplash.com/@peterwendt).
+
+우리는 모델에게 우리가 가진 카테고리 중 하나로 이미지를 분류하도록 지시할 수 있습니다:
+
+```py
+>>> categories = ['animals','vegetables', 'city landscape', 'cars', 'office']
+>>> prompt = [f"Instruction: Classify the following image into a single category from the following list: {categories}.\n",
+...     "https://images.unsplash.com/photo-1471193945509-9ad0617afabf?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80",    
+...     "Category: "
+... ]
+
+>>> inputs = processor(prompt, return_tensors="pt").to("cuda")
+>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+
+>>> generated_ids = model.generate(**inputs, max_new_tokens=6, bad_words_ids=bad_words_ids)
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> print(generated_text[0])
+Instruction: Classify the following image into a single category from the following list: ['animals', 'vegetables', 'city landscape', 'cars', 'office'].
+Category: Vegetables
+```  
+
+위 예제에서는 모델에게 이미지를 단일 카테고리로 분류하도록 지시했지만, 순위 분류를 하도록 모델에 프롬프트를 제공할 수도 있습니다.
+
+## 이미지 기반 텍스트 생성[[image-guided-text-generation]]
+
+이미지를 활용한 텍스트 생성 기술을 사용하면 더욱 창의적인 작업이 가능합니다. 이 기술은 이미지를 바탕으로 텍스트를 만들어내며, 제품 설명, 광고 문구, 장면 묘사 등 다양한 용도로 활용할 수 있습니다.
+
+간단한 예로, 빨간 문 이미지를 IDEFICS에 입력하여 이야기를 만들어보겠습니다:
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/idefics-story-generation.jpg" alt="Image of a red door with a pumpkin on the steps"/>
+</div>
+
+사진 제공: [Craig Tidball](https://unsplash.com/@devonshiremedia).
+  
+```py
+>>> prompt = ["Instruction: Use the image to write a story. \n",
+...     "https://images.unsplash.com/photo-1517086822157-2b0358e7684a?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=2203&q=80",
+...     "Story: \n"]
+
+>>> inputs = processor(prompt, return_tensors="pt").to("cuda")
+>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+
+>>> generated_ids = model.generate(**inputs, num_beams=2, max_new_tokens=200, bad_words_ids=bad_words_ids)
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> print(generated_text[0]) 
+Instruction: Use the image to write a story. 
+ Story: 
+Once upon a time, there was a little girl who lived in a house with a red door.  She loved her red door.  It was the prettiest door in the whole world.
+
+One day, the little girl was playing in her yard when she noticed a man standing on her doorstep.  He was wearing a long black coat and a top hat.
+
+The little girl ran inside and told her mother about the man.
+
+Her mother said, “Don’t worry, honey.  He’s just a friendly ghost.”
+
+The little girl wasn’t sure if she believed her mother, but she went outside anyway.
+
+When she got to the door, the man was gone.
+
+The next day, the little girl was playing in her yard again when she noticed the man standing on her doorstep.
+
+He was wearing a long black coat and a top hat.
+
+The little girl ran
+```
+
+IDEFICS가 문 앞에 있는 호박을 보고 유령에 대한 으스스한 할로윈 이야기를 만든 것 같습니다.
+
+<Tip>
+
+이처럼 긴 텍스트를 생성할 때는 텍스트 생성 전략을 조정하는 것이 좋습니다. 이렇게 하면 생성된 결과물의 품질을 크게 향상시킬 수 있습니다. 자세한 내용은 [텍스트 생성 전략](../generation_strategies)을 참조하세요.
+</Tip>
+
+## 배치 모드에서 추론 실행[[running-inference-in-batch-mode]]
+
+앞선 모든 섹션에서는 단일 예시에 대해 IDEFICS를 설명했습니다. 이와 매우 유사한 방식으로, 프롬프트 목록을 전달하여 여러 예시에 대한 추론을 실행할 수 있습니다:
+
+```py
+>>> prompts = [
+...     [   "https://images.unsplash.com/photo-1543349689-9a4d426bee8e?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3501&q=80",
+...         "This is an image of ",
+...     ],
+...     [   "https://images.unsplash.com/photo-1623944889288-cd147dbb517c?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80",
+...         "This is an image of ",
+...     ],
+...     [   "https://images.unsplash.com/photo-1471193945509-9ad0617afabf?ixlib=rb-4.0.3&ixid=M3wxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8fA%3D%3D&auto=format&fit=crop&w=3540&q=80",
+...         "This is an image of ",
+...     ],
+... ]
+
+>>> inputs = processor(prompts, return_tensors="pt").to("cuda")
+>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+
+>>> generated_ids = model.generate(**inputs, max_new_tokens=10, bad_words_ids=bad_words_ids)
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> for i,t in enumerate(generated_text):
+...     print(f"{i}:\n{t}\n") 
+0:
+This is an image of the Eiffel Tower in Paris, France.
+
+1:
+This is an image of a couple on a picnic blanket.
+
+2:
+This is an image of a vegetable stand.
+```
+
+## 대화형 사용을 위한 IDEFICS 인스트럭트 실행[[idefics-instruct-for-conversational-use]]
+
+대화형 사용 사례를 위해, 🤗 Hub에서 명령어 수행에 최적화된 버전의 모델을 찾을 수 있습니다. 이곳에는 `HuggingFaceM4/idefics-80b-instruct`와 `HuggingFaceM4/idefics-9b-instruct`가 있습니다.
+
+이 체크포인트는 지도 학습 및 명령어 미세 조정 데이터셋의 혼합으로 각각의 기본 모델을 미세 조정한 결과입니다. 이를 통해 모델의 하위 작업 성능을 향상시키는 동시에 대화형 환경에서 모델을 더 사용하기 쉽게 합니다.
+
+대화형 사용을 위한 사용법 및 프롬프트는 기본 모델을 사용하는 것과 매우 유사합니다.
+
+```py
+>>> import torch
+>>> from transformers import IdeficsForVisionText2Text, AutoProcessor
+
+>>> device = "cuda" if torch.cuda.is_available() else "cpu"
+
+>>> checkpoint = "HuggingFaceM4/idefics-9b-instruct"
+>>> model = IdeficsForVisionText2Text.from_pretrained(checkpoint, torch_dtype=torch.bfloat16).to(device)
+>>> processor = AutoProcessor.from_pretrained(checkpoint)
+
+>>> prompts = [
+...     [
+...         "User: What is in this image?",
+...         "https://upload.wikimedia.org/wikipedia/commons/8/86/Id%C3%A9fix.JPG",
+...         "<end_of_utterance>",
+
+...         "\nAssistant: This picture depicts Idefix, the dog of Obelix in Asterix and Obelix. Idefix is running on the ground.<end_of_utterance>",
+
+...         "\nUser:",
+...         "https://static.wikia.nocookie.net/asterix/images/2/25/R22b.gif/revision/latest?cb=20110815073052",
+...         "And who is that?<end_of_utterance>",
+
+...         "\nAssistant:",
+...     ],
+... ]
+
+>>> # --batched mode
+>>> inputs = processor(prompts, add_end_of_utterance_token=False, return_tensors="pt").to(device)
+>>> # --single sample mode
+>>> # inputs = processor(prompts[0], return_tensors="pt").to(device)
+
+>>> # args 생성
+>>> exit_condition = processor.tokenizer("<end_of_utterance>", add_special_tokens=False).input_ids
+>>> bad_words_ids = processor.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
+
+>>> generated_ids = model.generate(**inputs, eos_token_id=exit_condition, bad_words_ids=bad_words_ids, max_length=100)
+>>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
+>>> for i, t in enumerate(generated_text):
+...     print(f"{i}:\n{t}\n")
+```
--- a/docs/source/ko/tasks/image_to_image.md
+++ b/docs/source/ko/tasks/image_to_image.md
@ -0,0 +1,132 @@
+<!--Copyright 2023 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Image-to-Image 작업 가이드 [[image-to-image-task-guide]]
+
+[[open-in-colab]]
+
+Image-to-Image 작업은 애플리케이션이 이미지를 입력받아 또 다른 이미지를 출력하는 작업입니다. 여기에는 이미지 향상(초고해상도, 저조도 향상, 빗줄기 제거 등), 이미지 복원 등 다양한 하위 작업이 포함됩니다.
+
+이 가이드에서는 다음을 수행하는 방법을 보여줍니다.
+- 초고해상도 작업을 위한 image-to-image 파이프라인 사용,
+- 파이프라인 없이 동일한 작업을 위한 image-to-image 모델 실행
+
+이 가이드가 발표된 시점에서는, `image-to-image` 파이프라인은 초고해상도 작업만 지원한다는 점을 유의하세요.
+
+필요한 라이브러리를 설치하는 것부터 시작하겠습니다.
+
+```bash
+pip install transformers
+```
+
+이제 [Swin2SR 모델](https://huggingface.co/caidas/swin2SR-lightweight-x2-64)을 사용하여 파이프라인을 초기화할 수 있습니다. 그런 다음 이미지와 함께 호출하여 파이프라인으로 추론할 수 있습니다. 현재 이 파이프라인에서는 [Swin2SR 모델](https://huggingface.co/caidas/swin2SR-lightweight-x2-64)만 지원됩니다.
+
+```python
+from transformers import pipeline
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+pipe = pipeline(task="image-to-image", model="caidas/swin2SR-lightweight-x2-64", device=device)
+```
+
+이제 이미지를 불러와 봅시다.
+
+```python
+from PIL import Image
+import requests
+
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/cat.jpg"
+image = Image.open(requests.get(url, stream=True).raw)
+
+print(image.size)
+```
+```bash
+# (532, 432)
+```
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/cat.jpg" alt="Photo of a cat"/>
+</div>
+
+이제 파이프라인으로 추론을 수행할 수 있습니다. 고양이 이미지의 업스케일된 버전을 얻을 수 있습니다.
+
+```python
+upscaled = pipe(image)
+print(upscaled.size)
+```
+```bash
+# (1072, 880)
+```
+
+파이프라인 없이 직접 추론을 수행하려면 Transformers의 `Swin2SRForImageSuperResolution` 및 `Swin2SRImageProcessor` 클래스를 사용할 수 있습니다. 이를 위해 동일한 모델 체크포인트를 사용합니다. 모델과 프로세서를 초기화해 보겠습니다. 
+
+```python
+from transformers import Swin2SRForImageSuperResolution, Swin2SRImageProcessor 
+
+model = Swin2SRForImageSuperResolution.from_pretrained("caidas/swin2SR-lightweight-x2-64").to(device)
+processor = Swin2SRImageProcessor("caidas/swin2SR-lightweight-x2-64")
+```
+
+`pipeline` 우리가 직접 수행해야 하는 전처리와 후처리 단계를 추상화하므로, 이미지를 전처리해 보겠습니다. 이미지를 프로세서에 전달한 다음 픽셀값을 GPU로 이동시키겠습니다. 
+
+```python
+pixel_values = processor(image, return_tensors="pt").pixel_values
+print(pixel_values.shape)
+
+pixel_values = pixel_values.to(device)
+```
+
+이제 픽셀값을 모델에 전달하여 이미지를 추론할 수 있습니다.
+
+```python
+import torch
+
+with torch.no_grad():
+  outputs = model(pixel_values)
+```
+출력은 아래와 같은 `ImageSuperResolutionOutput` 유형의 객체입니다 👇 
+
+```
+(loss=None, reconstruction=tensor([[[[0.8270, 0.8269, 0.8275,  ..., 0.7463, 0.7446, 0.7453],
+          [0.8287, 0.8278, 0.8283,  ..., 0.7451, 0.7448, 0.7457],
+          [0.8280, 0.8273, 0.8269,  ..., 0.7447, 0.7446, 0.7452],
+          ...,
+          [0.5923, 0.5933, 0.5924,  ..., 0.0697, 0.0695, 0.0706],
+          [0.5926, 0.5932, 0.5926,  ..., 0.0673, 0.0687, 0.0705],
+          [0.5927, 0.5914, 0.5922,  ..., 0.0664, 0.0694, 0.0718]]]],
+       device='cuda:0'), hidden_states=None, attentions=None)
+```
+`reconstruction`를 가져와 시각화를 위해 후처리해야 합니다. 어떻게 생겼는지 살펴봅시다.
+
+```python
+outputs.reconstruction.data.shape
+# torch.Size([1, 3, 880, 1072])
+```
+
+출력 텐서의 차원을 축소하고 0번째 축을 제거한 다음, 값을 클리핑하고 NumPy 부동소수점 배열로 변환해야 합니다. 그런 다음 [1072, 880] 모양을 갖도록 축을 재정렬하고 마지막으로 출력을 0과 255 사이의 값을 갖도록 되돌립니다.
+
+```python
+import numpy as np
+
+# 크기를 줄이고, CPU로 이동하고, 값을 클리핑
+output = outputs.reconstruction.data.squeeze().cpu().clamp_(0, 1).numpy()
+# 축을 재정렬
+output = np.moveaxis(output, source=0, destination=-1)
+# 값을 픽셀값 범위로 되돌리기
+output = (output * 255.0).round().astype(np.uint8)
+Image.fromarray(output)
+```
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/cat_upscaled.png" alt="Upscaled photo of a cat"/>
+</div>
--- a/docs/source/ko/tasks/mask_generation.md
+++ b/docs/source/ko/tasks/mask_generation.md
@ -0,0 +1,228 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 마스크 생성[[mask-generation]]
+
+마스크 생성(Mask generation)은 이미지에 대한 의미 있는 마스크를 생성하는 작업입니다. 
+이 작업은 [이미지 분할](semantic_segmentation)과 매우 유사하지만, 많은 차이점이 있습니다. 이미지 분할 모델은 라벨이 달린 데이터셋으로 학습되며, 학습 중에 본 클래스들로만 제한됩니다. 이미지가 주어지면, 이미지 분할 모델은 여러 마스크와 그에 해당하는 클래스를 반환합니다. 
+
+반면, 마스크 생성 모델은 대량의 데이터로 학습되며 두 가지 모드로 작동합니다.
+- 프롬프트 모드(Prompting mode): 이 모드에서는 모델이 이미지와 프롬프트를 입력받습니다. 프롬프트는 이미지 내 객체의 2D 좌표(XY 좌표)나 객체를 둘러싼 바운딩 박스가 될 수 있습니다. 프롬프트 모드에서는 모델이 프롬프트가 가리키는 객체의 마스크만 반환합니다.
+- 전체 분할 모드(Segment Everything mode): 이 모드에서는 주어진 이미지 내에서 모든 마스크를 생성합니다. 이를 위해 그리드 형태의 점들을 생성하고 이를 이미지에 오버레이하여 추론합니다.
+
+마스크 생성 작업은 [전체 분할 모드(Segment Anything Model, SAM)](model_doc/sam)에 의해 지원됩니다. SAM은 Vision Transformer 기반 이미지 인코더, 프롬프트 인코더, 그리고 양방향 트랜스포머 마스크 디코더로 구성된 강력한 모델입니다. 이미지와 프롬프트는 인코딩되고, 디코더는 이러한 임베딩을 받아 유효한 마스크를 생성합니다.
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/sam.png" alt="SAM Architecture"/>
+</div>
+
+SAM은 대규모 데이터를 다룰 수 있는 강력한 분할 기반 모델입니다. 이 모델은 100만 개의 이미지와 11억 개의 마스크를 포함하는 [SA-1B](https://ai.meta.com/datasets/segment-anything/) 데이터 세트로 학습되었습니다.
+
+이 가이드에서는 다음과 같은 내용을 배우게 됩니다:
+- 배치 처리와 함께 전체 분할 모드에서 추론하는 방법
+- 포인트 프롬프팅 모드에서 추론하는 방법
+- 박스 프롬프팅 모드에서 추론하는 방법
+
+먼저, `transformers`를 설치해 봅시다:
+
+```bash
+pip install -q transformers
+```
+
+## 마스크 생성 파이프라인[[mask-generation-pipeline]]
+
+마스크 생성 모델로 추론하는 가장 쉬운 방법은 `mask-generation` 파이프라인을 사용하는 것입니다.
+
+```python
+>>> from transformers import pipeline
+
+>>> checkpoint = "facebook/sam-vit-base"
+>>> mask_generator = pipeline(model=checkpoint, task="mask-generation")
+```
+
+이미지를 예시로 봅시다.
+
+```python
+from PIL import Image
+import requests
+
+img_url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"
+image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
+```
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg" alt="Example Image"/>
+</div>
+
+전체적으로 분할해봅시다. `points-per-batch`는 전체 분할 모드에서 점들의 병렬 추론을 가능하게 합니다. 이를 통해 추론 속도가 빨라지지만, 더 많은 메모리를 소모하게 됩니다. 또한, SAM은 이미지가 아닌 점들에 대해서만 배치 처리를 지원합니다. `pred_iou_thresh`는 IoU 신뢰 임계값으로, 이 임계값을 초과하는 마스크만 반환됩니다.
+
+```python
+masks = mask_generator(image, points_per_batch=128, pred_iou_thresh=0.88)
+```
+
+`masks` 는 다음과 같이 생겼습니다:
+
+```bash
+{'masks': [array([[False, False, False, ...,  True,  True,  True],
+         [False, False, False, ...,  True,  True,  True],
+         [False, False, False, ...,  True,  True,  True],
+         ...,
+         [False, False, False, ..., False, False, False],
+         [False, False, False, ..., False, False, False],
+         [False, False, False, ..., False, False, False]]),
+  array([[False, False, False, ..., False, False, False],
+         [False, False, False, ..., False, False, False],
+         [False, False, False, ..., False, False, False],
+         ...,
+'scores': tensor([0.9972, 0.9917,
+        ...,
+}
+```
+
+위 내용을 아래와 같이 시각화할 수 있습니다:
+
+```python
+import matplotlib.pyplot as plt
+
+plt.imshow(image, cmap='gray')
+
+for i, mask in enumerate(masks["masks"]):
+    plt.imshow(mask, cmap='viridis', alpha=0.1, vmin=0, vmax=1)
+
+plt.axis('off')
+plt.show()
+```
+
+아래는 회색조 원본 이미지에 다채로운 색상의 맵을 겹쳐놓은 모습입니다. 매우 인상적인 결과입니다.
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee_segmented.png" alt="Visualized"/>
+</div>
+
+## 모델 추론[[model-inference]]
+
+### 포인트 프롬프팅[[point-prompting]]
+
+파이프라인 없이도 모델을 사용할 수 있습니다. 이를 위해 모델과 프로세서를 초기화해야 합니다.
+
+```python
+from transformers import SamModel, SamProcessor
+import torch
+
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+
+model = SamModel.from_pretrained("facebook/sam-vit-base").to(device)
+processor = SamProcessor.from_pretrained("facebook/sam-vit-base")
+```
+
+포인트 프롬프팅을 하기 위해, 입력 포인트를 프로세서에 전달한 다음, 프로세서 출력을 받아 모델에 전달하여 추론합니다. 모델 출력을 후처리하려면, 출력과 함께 프로세서의 초기 출력에서 가져온 `original_sizes`와 `reshaped_input_sizes`를 전달해야 합니다. 왜냐하면, 프로세서가 이미지 크기를 조정하고 출력을 추정해야 하기 때문입니다.
+
+```python
+input_points = [[[2592, 1728]]] # 벌의 포인트 위치
+
+inputs = processor(image, input_points=input_points, return_tensors="pt").to(device)
+with torch.no_grad():
+    outputs = model(**inputs)
+masks = processor.image_processor.post_process_masks(outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu())
+```
+
+`masks` 출력으로 세 가지 마스크를 시각화할 수 있습니다.
+
+```python
+import matplotlib.pyplot as plt
+import numpy as np
+
+fig, axes = plt.subplots(1, 4, figsize=(15, 5))
+
+axes[0].imshow(image)
+axes[0].set_title('Original Image')
+mask_list = [masks[0][0][0].numpy(), masks[0][0][1].numpy(), masks[0][0][2].numpy()]
+
+for i, mask in enumerate(mask_list, start=1):
+    overlayed_image = np.array(image).copy()
+
+    overlayed_image[:,:,0] = np.where(mask == 1, 255, overlayed_image[:,:,0])
+    overlayed_image[:,:,1] = np.where(mask == 1, 0, overlayed_image[:,:,1])
+    overlayed_image[:,:,2] = np.where(mask == 1, 0, overlayed_image[:,:,2])
+
+    axes[i].imshow(overlayed_image)
+    axes[i].set_title(f'Mask {i}')
+for ax in axes:
+    ax.axis('off')
+
+plt.show()
+```
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/masks.png" alt="Visualized"/>
+</div>
+
+### 박스 프롬프팅[[box-prompting]]
+
+박스 프롬프팅도 포인트 프롬프팅과 유사한 방식으로 할 수 있습니다. 입력 박스를 `[x_min, y_min, x_max, y_max]` 형식의 리스트로 작성하여 이미지와 함께 `processor`에 전달할 수 있습니다. 프로세서 출력을 받아 모델에 직접 전달한 후, 다시 출력을 후처리해야 합니다.
+
+```python
+# 벌 주위의 바운딩 박스
+box = [2350, 1600, 2850, 2100]
+
+inputs = processor(
+        image,
+        input_boxes=[[[box]]],
+        return_tensors="pt"
+    ).to("cuda")
+
+with torch.no_grad():
+    outputs = model(**inputs)
+
+mask = processor.image_processor.post_process_masks(
+    outputs.pred_masks.cpu(),
+    inputs["original_sizes"].cpu(),
+    inputs["reshaped_input_sizes"].cpu()
+)[0][0][0].numpy()
+```
+
+이제 아래와 같이, 벌 주위의 바운딩 박스를 시각화할 수 있습니다.
+
+```python
+import matplotlib.patches as patches
+
+fig, ax = plt.subplots()
+ax.imshow(image)
+
+rectangle = patches.Rectangle((2350, 1600), 500, 500, linewidth=2, edgecolor='r', facecolor='none')
+ax.add_patch(rectangle)
+ax.axis("off")
+plt.show()
+```
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/bbox.png" alt="Visualized Bbox"/>
+</div>
+
+아래에서 추론 결과를 확인할 수 있습니다.
+
+```python
+fig, ax = plt.subplots()
+ax.imshow(image)
+ax.imshow(mask, cmap='viridis', alpha=0.4)
+
+ax.axis("off")
+plt.show()
+```
+
+<div class="flex justify-center">
+     <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/box_inference.png" alt="Visualized Inference"/>
+</div>
--- a/docs/source/zh/_toctree.yml
+++ b/docs/source/zh/_toctree.yml
@ -78,6 +78,8 @@
    title: 如何将流水线添加到 🤗 Transformers？
  title: 贡献
 - sections:
+  - local: philosophy
+    title: Transformers的设计理念
  - local: task_summary
    title: 🤗Transformers能做什么
  - local: tokenizer_summary
--- a/docs/source/zh/chat_templating.md
+++ b/docs/source/zh/chat_templating.md
@ -228,7 +228,7 @@ The sun.</s>
 >>> from transformers import AutoTokenizer
 >>> tokenizer = AutoTokenizer.from_pretrained("facebook/blenderbot-400M-distill")

->>> tokenizer.default_chat_template
+>>> tokenizer.chat_template
 "{% for message in messages %}{% if message['role'] == 'user' %}{{ ' ' }}{% endif %}{{ message['content'] }}{% if not loop.last %}{{ '  ' }}{% endif %}{% endfor %}{{ eos_token }}"
 ```

--- a/docs/source/zh/philosophy.md
+++ b/docs/source/zh/philosophy.md
@ -0,0 +1,67 @@
+<!--Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+
+
+# Transformers 的设计理念
+
+🤗 Transformers 是一个专为以下用户群体构建的库：
+
+- 寻求使用、研究或扩展大规模 Transformers 模型的机器学习研究人员和教育者。
+- 希望微调这些模型或在生产环境中使用它们（或两者兼而有之）的实际操作者。
+- 只想下载预训练模型并将其用于解决给定机器学习任务的工程师。
+
+Transformers 设计时有两个主要目标：
+
+1. 尽可能简单快速地使用：
+
+   - 我们尽可能地限制用户能接触的抽象层，实际上几乎没有抽象。用户只需学习三个标准类即可使用每个模型：[configuration](main_classes/configuration)、[models](main_classes/model) 和一个预处理类（用于 NLP 的 [tokenizer](main_classes/tokenizer)，用于视觉的 [image processor](main_classes/image_processor)，用于音频的 [feature extractor](main_classes/feature_extractor)，以及用于多模态输入的 [processor](main_classes/processors)）。
+   - 所有这些类都可以通过一个通用的 `from_pretrained()` 方法从预训练实例中简单统一地初始化，该方法会从提供在 [Hugging Face Hub](https://huggingface.co/models) 上的预训练检查点（如果需要的话）下载、缓存和加载相关类实例及相关数据（配置的超参数、分词器的词汇表和模型的权重）。
+   - 在这三个基本类之上，该库提供了两种 API：[`pipeline`] 用于快速在给定任务上使用模型进行推断，以及 [`Trainer`] 用于快速训练或微调 PyTorch 模型（所有 TensorFlow 模型与 `Keras.fit` 兼容）。
+   - 因此，Transformers 不是神经网络的模块化工具箱。如果要基于 Transformers 扩展或搭建新项目，请使用常规的 Python、PyTorch、TensorFlow、Keras 模块，并从 Transformers 的基类继承以重用模型加载和保存等功能。如果想了解更多有关我们的模型代码的设计理念，请查看我们的[重复自己](https://huggingface.co/blog/transformers-design-philosophy)博文。
+
+2. 提供与原始模型性能尽可能接近的最新模型：
+
+   - 我们为每种架构提供至少一个示例，复现了该架构官方作者提供的结果。
+   - 代码通常尽可能接近原始代码库，这意味着某些 PyTorch 代码可能不够*pytorchic*，因为它是转换后的 TensorFlow 代码，反之亦然。
+
+其他几个目标：
+
+- 尽可能一致地公开模型的内部：
+
+   - 我们使用单一 API 提供对完整隐藏状态和注意力权重的访问。
+   - 预处理类和基本模型 API 标准化，便于在不同模型之间轻松切换。
+
+- 结合主观选择的有前途的工具进行模型微调和调查：
+
+   - 简单一致的方法来向词汇表和嵌入中添加新标记以进行微调。
+   - 简单的方法来屏蔽和修剪 Transformer 头部。
+
+- 轻松在 PyTorch、TensorFlow 2.0 和 Flax 之间切换，允许使用一个框架进行训练并使用另一个进行推断。
+
+## 主要概念
+
+该库围绕每个模型的三类类构建：
+
+- **模型类** 可以是 PyTorch 模型（[torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)）、Keras 模型（[tf.keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model)）或 JAX/Flax 模型（[flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html)），这些模型可以使用库中提供的预训练权重。
+- **配置类** 存储构建模型所需的超参数（如层数和隐藏大小）。通常情况下，如果您使用不进行任何修改的预训练模型，则创建模型将自动处理配置的实例化（配置是模型的一部分）。
+- **预处理类** 将原始数据转换为模型可接受的格式。一个 [tokenizer](main_classes/tokenizer) 存储每个模型的词汇表，并提供编码和解码字符串为要馈送到模型的令牌嵌入索引列表的方法。[Image processors](main_classes/image_processor) 预处理视觉输入，[feature extractors](main_classes/feature_extractor) 预处理音频输入，而 [processor](main_classes/processors) 则处理多模态输入。
+
+所有这些类都可以从预训练实例中实例化、本地保存，并通过以下三种方法与 Hub 共享：
+
+- `from_pretrained()` 允许您从库自身提供的预训练版本（支持的模型可在 [Model Hub](https://huggingface.co/models) 上找到）或用户本地（或服务器上）存储的版本实例化模型、配置和预处理类。
+- `save_pretrained()` 允许您本地保存模型、配置和预处理类，以便可以使用 `from_pretrained()` 重新加载。
+- `push_to_hub()` 允许您将模型、配置和预处理类共享到 Hub，以便所有人都可以轻松访问。
--- a/examples/flax/language-modeling/t5_tokenizer_model.py
+++ b/examples/flax/language-modeling/t5_tokenizer_model.py
@ -47,14 +47,14 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
            [
                pre_tokenizers.Metaspace(
-                    replacement=replacement, add_prefix_space="always" if add_prefix_space else "never"
+                    replacement=replacement, prepend_scheme="always" if add_prefix_space else "never"
                ),
                pre_tokenizers.Digits(individual_digits=True),
                pre_tokenizers.Punctuation(),
            ]
        )
        tokenizer.decoder = decoders.Metaspace(
-            replacement=replacement, add_prefix_space="always" if add_prefix_space else "never"
+            replacement=replacement, prepend_scheme="always" if add_prefix_space else "never"
        )

        tokenizer.post_processor = TemplateProcessing(
--- a/examples/flax/question-answering/run_qa.py
+++ b/examples/flax/question-answering/run_qa.py
@ -61,7 +61,7 @@ from transformers.utils import check_min_version, send_example_telemetry
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 Array = Any
 Dataset = datasets.arrow_dataset.Dataset
--- a/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py
+++ b/examples/flax/speech-recognition/run_flax_speech_recognition_seq2seq.py
@ -60,7 +60,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risk.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt")

--- a/examples/flax/text-classification/run_flax_glue.py
+++ b/examples/flax/text-classification/run_flax_glue.py
@ -56,7 +56,7 @@ from transformers.utils import check_min_version, send_example_telemetry

 logger = logging.getLogger(__name__)
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 Array = Any
 Dataset = datasets.arrow_dataset.Dataset
--- a/examples/flax/token-classification/run_flax_ner.py
+++ b/examples/flax/token-classification/run_flax_ner.py
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version

 logger = logging.getLogger(__name__)
 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

--- a/examples/pytorch/audio-classification/run_audio_classification.py
+++ b/examples/pytorch/audio-classification/run_audio_classification.py
@ -45,7 +45,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt")

--- a/examples/pytorch/contrastive-image-text/run_clip.py
+++ b/examples/pytorch/contrastive-image-text/run_clip.py
@ -54,7 +54,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt")

--- a/examples/pytorch/image-classification/run_image_classification.py
+++ b/examples/pytorch/image-classification/run_image_classification.py
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")

--- a/examples/pytorch/image-classification/run_image_classification_no_trainer.py
+++ b/examples/pytorch/image-classification/run_image_classification_no_trainer.py
@ -49,7 +49,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 logger = get_logger(__name__)

--- a/examples/pytorch/image-pretraining/run_mae.py
+++ b/examples/pytorch/image-pretraining/run_mae.py
@ -43,7 +43,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

--- a/examples/pytorch/image-pretraining/run_mim.py
+++ b/examples/pytorch/image-pretraining/run_mim.py
@ -48,7 +48,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

--- a/examples/pytorch/image-pretraining/run_mim_no_trainer.py
+++ b/examples/pytorch/image-pretraining/run_mim_no_trainer.py
@ -53,7 +53,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

--- a/examples/pytorch/instance-segmentation/run_instance_segmentation.py
+++ b/examples/pytorch/instance-segmentation/run_instance_segmentation.py
@ -46,7 +46,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")

--- a/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py
+++ b/examples/pytorch/instance-segmentation/run_instance_segmentation_no_trainer.py
@ -52,7 +52,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")

--- a/examples/pytorch/language-modeling/run_clm.py
+++ b/examples/pytorch/language-modeling/run_clm.py
@ -55,7 +55,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 logger = get_logger(__name__)

--- a/examples/pytorch/language-modeling/run_fim.py
+++ b/examples/pytorch/language-modeling/run_fim.py
@ -58,7 +58,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

--- a/examples/pytorch/language-modeling/run_fim_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_fim_no_trainer.py
@ -60,7 +60,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 logger = get_logger(__name__)

--- a/examples/pytorch/language-modeling/run_mlm.py
+++ b/examples/pytorch/language-modeling/run_mlm.py
@ -54,7 +54,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 logger = get_logger(__name__)
 require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")
--- a/examples/pytorch/language-modeling/run_plm.py
+++ b/examples/pytorch/language-modeling/run_plm.py
@ -47,7 +47,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

--- a/examples/pytorch/multiple-choice/run_swag.py
+++ b/examples/pytorch/multiple-choice/run_swag.py
@ -47,7 +47,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 logger = logging.getLogger(__name__)

--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@ -56,7 +56,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 logger = get_logger(__name__)
 # You should update this to your particular problem to have better documentation of `model_type`
--- a/examples/pytorch/object-detection/run_object_detection.py
+++ b/examples/pytorch/object-detection/run_object_detection.py
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/object-detection/requirements.txt")

--- a/examples/pytorch/object-detection/run_object_detection_no_trainer.py
+++ b/examples/pytorch/object-detection/run_object_detection_no_trainer.py
@ -51,7 +51,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 logging.basicConfig(level=logging.INFO)
 logger = get_logger(__name__)
--- a/examples/pytorch/question-answering/run_qa.py
+++ b/examples/pytorch/question-answering/run_qa.py
@ -50,7 +50,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

--- a/examples/pytorch/question-answering/run_qa_beam_search.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search.py
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

--- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

--- a/examples/pytorch/question-answering/run_qa_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_no_trainer.py
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

--- a/examples/pytorch/question-answering/run_seq2seq_qa.py
+++ b/examples/pytorch/question-answering/run_seq2seq_qa.py
@ -46,7 +46,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation.py
@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/semantic-segmentation/requirements.txt")

--- a/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
+++ b/examples/pytorch/semantic-segmentation/run_semantic_segmentation_no_trainer.py
@ -50,7 +50,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 logger = get_logger(__name__)

--- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc.py
@ -50,7 +50,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

--- a/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_ctc_adapter.py
@ -53,7 +53,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

--- a/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
+++ b/examples/pytorch/speech-recognition/run_speech_recognition_seq2seq.py
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt")

--- a/examples/pytorch/summarization/run_summarization.py
+++ b/examples/pytorch/summarization/run_summarization.py
@ -52,7 +52,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 logger = get_logger(__name__)
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")
--- a/examples/pytorch/text-classification/run_classification.py
+++ b/examples/pytorch/text-classification/run_classification.py
@ -47,7 +47,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

--- a/examples/pytorch/text-classification/run_glue.py
+++ b/examples/pytorch/text-classification/run_glue.py
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

--- a/examples/pytorch/text-classification/run_glue_no_trainer.py
+++ b/examples/pytorch/text-classification/run_glue_no_trainer.py
@ -49,7 +49,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 logger = get_logger(__name__)

--- a/examples/pytorch/text-classification/run_xnli.py
+++ b/examples/pytorch/text-classification/run_xnli.py
@ -48,7 +48,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

--- a/examples/pytorch/token-classification/run_ner.py
+++ b/examples/pytorch/token-classification/run_ner.py
@ -49,7 +49,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

--- a/examples/pytorch/token-classification/run_ner_no_trainer.py
+++ b/examples/pytorch/token-classification/run_ner_no_trainer.py
@ -56,7 +56,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 logger = get_logger(__name__)
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
--- a/examples/pytorch/translation/run_translation.py
+++ b/examples/pytorch/translation/run_translation.py
@ -52,7 +52,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")

--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@ -57,7 +57,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 logger = get_logger(__name__)
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")
--- a/examples/research_projects/bertabs/modeling_bertabs.py
+++ b/examples/research_projects/bertabs/modeling_bertabs.py
@ -557,7 +557,7 @@ class MultiHeadedAttention(nn.Module):
            return context


-class DecoderState(object):
+class DecoderState:
    """Interface for grouping together the current state of a recurrent
    decoder. In the simplest case just represents the hidden state of
    the model.  But can also be used for implementing various forms of
@ -694,7 +694,7 @@ def build_predictor(args, tokenizer, symbols, model, logger=None):
    return translator


-class GNMTGlobalScorer(object):
+class GNMTGlobalScorer:
    """
    NMT re-ranking score from
    "Google's Neural Machine Translation System" :cite:`wu2016google`
@ -717,7 +717,7 @@ class GNMTGlobalScorer(object):
        return normalized_probs


-class PenaltyBuilder(object):
+class PenaltyBuilder:
    """
    Returns the Length and Coverage Penalty function for Beam Search.

@ -763,7 +763,7 @@ class PenaltyBuilder(object):
        return logprobs


-class Translator(object):
+class Translator:
    """
    Uses a model to translate a batch of sentences.

@ -1002,7 +1002,7 @@ def tile(x, count, dim=0):
 #


-class BertSumOptimizer(object):
+class BertSumOptimizer:
    """Specific optimizer for BertSum.

    As described in [1], the authors fine-tune BertSum for abstractive
--- a/examples/research_projects/decision_transformer/requirements.txt
+++ b/examples/research_projects/decision_transformer/requirements.txt
@ -97,7 +97,7 @@ jinja2-time==0.2.0
 jmespath==0.10.0
 joblib==1.2.0
 jsonschema==4.4.0
-keras==2.8.0
+keras==2.13.1
 Keras-Preprocessing==1.1.2
 kiwisolver==1.4.0
 kubernetes==12.0.1
--- a/examples/research_projects/fsner/src/fsner/tokenizer_utils.py
+++ b/examples/research_projects/fsner/src/fsner/tokenizer_utils.py
@ -3,7 +3,7 @@ import torch
 from transformers import AutoTokenizer


-class FSNERTokenizerUtils(object):
+class FSNERTokenizerUtils:
    def __init__(self, pretrained_model_name_or_path):
        self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)

--- a/examples/research_projects/lxmert/modeling_frcnn.py
+++ b/examples/research_projects/lxmert/modeling_frcnn.py
@ -417,7 +417,7 @@ class ShapeSpec(namedtuple("_ShapeSpec", ["channels", "height", "width", "stride
        return super().__new__(cls, channels, height, width, stride)


-class Box2BoxTransform(object):
+class Box2BoxTransform:
    """
    This R-CNN transformation scales the box's width and height
    by exp(dw), exp(dh) and shifts a box's center by the offset
@ -519,7 +519,7 @@ class Box2BoxTransform(object):
        return pred_boxes


-class Matcher(object):
+class Matcher:
    """
    This class assigns to each predicted "element" (e.g., a box) a ground-truth
    element. Each predicted element will have exactly zero or one matches; each
@ -622,7 +622,7 @@ class Matcher(object):
        match_labels[pred_inds_with_highest_quality] = 1


-class RPNOutputs(object):
+class RPNOutputs:
    def __init__(
        self,
        box2box_transform,
@ -1132,7 +1132,7 @@ class ROIPooler(nn.Module):
        return output


-class ROIOutputs(object):
+class ROIOutputs:
    def __init__(self, cfg, training=False):
        self.smooth_l1_beta = cfg.ROI_BOX_HEAD.SMOOTH_L1_BETA
        self.box2box_transform = Box2BoxTransform(weights=cfg.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)
--- a/examples/research_projects/movement-pruning/emmental/modules/binarizer.py
+++ b/examples/research_projects/movement-pruning/emmental/modules/binarizer.py
@ -108,7 +108,7 @@ class TopKBinarizer(autograd.Function):
        return gradOutput, None


-class MagnitudeBinarizer(object):
+class MagnitudeBinarizer:
    """
    Magnitude Binarizer.
    Computes a binary mask M from a real value matrix S such that `M_{i,j} = 1` if and only if `S_{i,j}`
--- a/examples/research_projects/movement-pruning/masked_run_glue.py
+++ b/examples/research_projects/movement-pruning/masked_run_glue.py
@ -98,7 +98,7 @@ def regularization(model: nn.Module, mode: str):
            elif mode == "l0":
                regu += torch.sigmoid(param - 2 / 3 * np.log(0.1 / 1.1)).sum() / param.numel()
            else:
-                ValueError("Don't know this mode.")
+                raise ValueError("Don't know this mode.")
            counter += 1
    return regu / counter

--- a/examples/research_projects/movement-pruning/masked_run_squad.py
+++ b/examples/research_projects/movement-pruning/masked_run_squad.py
@ -101,7 +101,7 @@ def regularization(model: nn.Module, mode: str):
            elif mode == "l0":
                regu += torch.sigmoid(param - 2 / 3 * np.log(0.1 / 1.1)).sum() / param.numel()
            else:
-                ValueError("Don't know this mode.")
+                raise ValueError("Don't know this mode.")
            counter += 1
    return regu / counter

--- a/examples/research_projects/performer/modeling_flax_performer_utils.py
+++ b/examples/research_projects/performer/modeling_flax_performer_utils.py
@ -284,7 +284,7 @@ def make_fast_generalized_attention(
    return attention_fn


-class RandomMatrix(object):
+class RandomMatrix:
    r"""
    Abstract class providing a method for constructing 2D random arrays. Class is responsible for constructing 2D
    random arrays.
@ -348,7 +348,7 @@ class GaussianOrthogonalRandomMatrix(RandomMatrix):
        return jnp.matmul(jnp.diag(multiplier), final_matrix)


-class FastAttention(object):
+class FastAttention:
    r"""
    Abstract class providing a method for fast attention. Class is responsible for providing a method
    <dot_product_attention> for fast approximate attention.
--- a/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py
+++ b/examples/research_projects/seq2seq-distillation/_test_seq2seq_examples.py
@ -418,7 +418,7 @@ class TestTheRest(TestCasePlus):
            with CaptureStdout() as cs:
                args = parser.parse_args(args)
            assert False, "--help is expected to sys.exit"
-        assert excinfo.type == SystemExit
+        assert excinfo.type is SystemExit
        expected = lightning_base.arg_to_scheduler_metavar
        assert expected in cs.out, "--help is expected to list the supported schedulers"

@ -429,7 +429,7 @@ class TestTheRest(TestCasePlus):
            with CaptureStderr() as cs:
                args = parser.parse_args(args)
            assert False, "invalid argument is expected to sys.exit"
-        assert excinfo.type == SystemExit
+        assert excinfo.type is SystemExit
        expected = f"invalid choice: '{unsupported_param}'"
        assert expected in cs.err, f"should have bailed on invalid choice of scheduler {unsupported_param}"

--- a/examples/research_projects/visual_bert/modeling_frcnn.py
+++ b/examples/research_projects/visual_bert/modeling_frcnn.py
@ -417,7 +417,7 @@ class ShapeSpec(namedtuple("_ShapeSpec", ["channels", "height", "width", "stride
        return super().__new__(cls, channels, height, width, stride)


-class Box2BoxTransform(object):
+class Box2BoxTransform:
    """
    This R-CNN transformation scales the box's width and height
    by exp(dw), exp(dh) and shifts a box's center by the offset
@ -519,7 +519,7 @@ class Box2BoxTransform(object):
        return pred_boxes


-class Matcher(object):
+class Matcher:
    """
    This class assigns to each predicted "element" (e.g., a box) a ground-truth
    element. Each predicted element will have exactly zero or one matches; each
@ -622,7 +622,7 @@ class Matcher(object):
        match_labels[pred_inds_with_highest_quality] = 1


-class RPNOutputs(object):
+class RPNOutputs:
    def __init__(
        self,
        box2box_transform,
@ -1132,7 +1132,7 @@ class ROIPooler(nn.Module):
        return output


-class ROIOutputs(object):
+class ROIOutputs:
    def __init__(self, cfg, training=False):
        self.smooth_l1_beta = cfg.ROI_BOX_HEAD.SMOOTH_L1_BETA
        self.box2box_transform = Box2BoxTransform(weights=cfg.ROI_BOX_HEAD.BBOX_REG_WEIGHTS)
--- a/examples/tensorflow/contrastive-image-text/run_clip.py
+++ b/examples/tensorflow/contrastive-image-text/run_clip.py
@ -51,7 +51,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version(
    "datasets>=1.8.0", "To fix: pip install -r examples/tensorflow/contrastive-image-text/requirements.txt"
--- a/examples/tensorflow/image-classification/run_image_classification.py
+++ b/examples/tensorflow/image-classification/run_image_classification.py
@ -55,7 +55,7 @@ from transformers.utils.versions import require_version
 logger = logging.getLogger(__name__)

 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")

--- a/examples/tensorflow/multiple-choice/run_swag.py
+++ b/examples/tensorflow/multiple-choice/run_swag.py
@ -50,7 +50,7 @@ from transformers.utils import PaddingStrategy, check_min_version, send_example_


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 logger = logging.getLogger(__name__)

--- a/examples/tensorflow/question-answering/run_qa.py
+++ b/examples/tensorflow/question-answering/run_qa.py
@ -62,7 +62,7 @@ except (ModuleNotFoundError, ImportError):


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.43.0.dev0")
+check_min_version("4.45.0.dev0")

 logger = logging.getLogger(__name__)

--- a/Show More
+++ b/Show More