Compare commits

...

100 Commits

Author SHA1 Message Date
b262680af4 Add Bitnet model (#37742)
* Adding BitNet b1.58 Model

* Add testing code for BitNet

* Fix format issues

* Fix docstring format issues

* Fix docstring

* Fix docstring

* Fix: weight back to uint8

* Fix

* Fix format issues

* Remove copy comments

* Add model link to the docstring

* Fix: set tie_word_embeddings default to false

* Update

* Generate modeling file

* Change config name for automatically generating modeling file.

* Generate modeling file

* Fix class name

* Change testing branch

* Remove unused param

* Fix config docstring

* Add docstring for BitNetQuantConfig.

* Fix docstring

* Update docs/source/en/model_doc/bitnet.md

Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>

* Update docs/source/en/model_doc/bitnet.md

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update bitnet config

* Update explanation of online vs. offline mode

* Remove space

* revert changes

* more revert

* spaces

* update

* fix-copies

* doc fix

* fix minor nits

* empty

* small nit

* empty

---------

Co-authored-by: Shuming Ma <shumingma@pku.edu.cn>
Co-authored-by: shumingma <shmingm@gmail.com>
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-04-28 15:08:46 +02:00
82862ce443 [RT-DETR] Improve docs (#37814)
Fix docs
2025-04-28 13:19:24 +02:00
97e57b2545 Fix: Correct tensor shape comment in Mamba modeling (#37801)
* Fix: Correct tensor shape comment in Mamba modeling

* Update src/transformers/models/mamba/modeling_mamba.py

* Update src/transformers/models/mamba/modeling_mamba.py

---------

Co-authored-by: ShadyPi <11342288+shadypi@user.noreply.gitee.com>
Co-authored-by: Matt <Rocketknight1@users.noreply.github.com>
2025-04-28 11:56:42 +01:00
33493542aa [doc] fix the code examples in qwen doc (#37803) 2025-04-28 11:56:32 +01:00
d5fa7d2d19 Fix typos in strings and comments (#37799) 2025-04-28 11:39:11 +01:00
f466603963 Define warmup allocator for torchao quantization (#37764)
* torchao allocator

* add comment

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-04-28 10:45:55 +02:00
a41b6d9b5c Fix the fsdp config cannot work issue. (#37549)
* Fix the fsdp config cannot work issue.

Signed-off-by: yuanwu <yuan.wu@intel.com>

* Check the fsdp_config type

Signed-off-by: yuanwu <yuan.wu@intel.com>

* Add the accelerate_fsdp_config test

Signed-off-by: yuanwu <yuan.wu@intel.com>

* fix error of make style

Signed-off-by: yuanwu <yuan.wu@intel.com>

* Add key check

Signed-off-by: yuanwu <yuan.wu@intel.com>

---------

Signed-off-by: yuanwu <yuan.wu@intel.com>
Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-04-28 10:44:51 +02:00
816b37010c Gemma3 is Torch Exportable (#37728)
* Gemma3 is Torch Exportable

* Expand the support to other models using HybridCache

---------

Co-authored-by: Guang Yang <guangyang@fb.com>
2025-04-28 09:36:46 +02:00
397a5ede33 Fix error message in hub.py (#37796)
Fix error message
2025-04-25 14:03:06 -07:00
6ce675ee81 fix performance issue in convert_ids_to_tokens (#37773) 2025-04-25 22:00:50 +02:00
57c620bf8a chore: update SigLIP2 model card (#37624)
* update siglip2 model card

* Update docs/source/en/model_doc/siglip2.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/siglip2.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/siglip2.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/siglip2.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/siglip2.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/siglip2.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* address comments

* separate naflex and fixres variant

* Update docs/source/en/model_doc/siglip2.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/siglip2.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/siglip2.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2025-04-25 12:46:17 -07:00
eb4afdd1fb [i18n-KO] Translated keypoint_detection.md to Korean (#36649)
* fix: manual edits

* fix: manual edits

* fix: manual edits

* Update docs/source/ko/tasks/keypoint_detection.md

Anchor lower modify

Co-authored-by: Woojun Jung <46880056+jungnerd@users.noreply.github.com>

* Update docs/source/ko/tasks/keypoint_detection.md

connect letter

Co-authored-by: Woojun Jung <46880056+jungnerd@users.noreply.github.com>

* Update docs/source/ko/tasks/keypoint_detection.md

modify to usual words

Co-authored-by: Woojun Jung <46880056+jungnerd@users.noreply.github.com>

* Update docs/source/ko/tasks/keypoint_detection.md

modify extension word

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/ko/tasks/keypoint_detection.md

modify to usual words

Co-authored-by: Woojun Jung <46880056+jungnerd@users.noreply.github.com>

* Update docs/source/ko/tasks/keypoint_detection.md

modify to usual words

Co-authored-by: Woojun Jung <46880056+jungnerd@users.noreply.github.com>

* Update docs/source/ko/tasks/keypoint_detection.md

modify to usual representation

Co-authored-by: Woojun Jung <46880056+jungnerd@users.noreply.github.com>

---------

Co-authored-by: Woojun Jung <46880056+jungnerd@users.noreply.github.com>
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2025-04-25 12:24:12 -07:00
555693fbfa fix mpt test of different outputs from cuda (#37691)
* fix mpt test

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix mpt tests with Expectations

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix typo

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix output

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix format

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

---------

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
2025-04-25 18:04:56 +02:00
0cfbf9c95b Force torch>=2.6 with torch.load to avoid vulnerability issue (#37785)
* fix all main files

* fix test files

* oops, forgot modular

* add link

* update message
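
The kind of version guard this change implies can be sketched as follows; this is a hypothetical helper, not the actual transformers code, and it assumes the advisory concerns unsafe deserialization in older torch releases:

    # Hypothetical guard; not the transformers implementation.
    import torch
    from packaging import version

    def guarded_torch_load(path):
        if version.parse(torch.__version__) < version.parse("2.6"):
            raise ValueError(
                "torch>=2.6 is required to call torch.load here; "
                "older releases are affected by a deserialization vulnerability."
            )
        return torch.load(path, map_location="cpu", weights_only=True)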
2025-04-25 16:57:09 +02:00
eefc86aa31 Fix tensor parallel with non-floating dtypes (#37790)
fix
2025-04-25 15:48:16 +02:00
214062201e Fix typos in strings and comments (#37784)
* Fix typos in strings and comments

* Fix
2025-04-25 13:47:25 +01:00
ba3bd37253 Align gpt2 mask preparation to #37612 (#37787)
Update modeling_gpt2.py
2025-04-25 12:50:30 +02:00
50d231a806 unpin pytest<8 (#37768)
* pytest 8

* pytest 8

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-04-25 12:34:33 +02:00
79d4bc761d [causal mask] fix preparation with multi-gpu (#37612)
* fix multi-gpu

* forgot non-copied models

* fixup
2025-04-25 09:34:18 +02:00
7bb619d710 🌐 [i18n-KO] Translated roberta.md to Korean (#37069)
* docs: ko: roberta.md

* fix: manual edits

* Apply suggestions from code review

Co-authored-by: Woojun Jung <46880056+jungnerd@users.noreply.github.com>
Co-authored-by: YONGSANG <71686691+4N3MONE@users.noreply.github.com>

---------

Co-authored-by: Woojun Jung <46880056+jungnerd@users.noreply.github.com>
Co-authored-by: YONGSANG <71686691+4N3MONE@users.noreply.github.com>
2025-04-24 10:00:24 -07:00
cfe666919e Update model card for Gemma (#37674)
* Update Gemma model card

* Updated after review

* Update following review
2025-04-24 09:58:46 -07:00
b2d70e9c49 Fix auto-round hfoption (#37759)
fix
2025-04-24 18:19:38 +02:00
acdbe627e3 Guard DeepSpeed imports (#37755)
* Guard DeepSpeed imports

* Fix import

* Import deepspeed consistently

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-04-24 18:16:34 +02:00
af6d2756d9 [deps] pin max torch version (#37760)
pin max pt version :(
2025-04-24 16:18:25 +01:00
0302aa1c6e Fix typos in comments (#37694)
Signed-off-by: co63oc <co63oc@users.noreply.github.com>
2025-04-24 15:59:56 +01:00
af000ceb92 Fix load of rng state for resuming training from checkpoint (#37162)
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-04-24 16:55:34 +02:00
0af0a5f969 Fix tied weight loading with TP and loading sub state_dicts (#37758)
Update modeling_utils.py
2025-04-24 16:47:40 +02:00
3af24f7e27 Refine parameter type annotations (#37666) 2025-04-24 15:37:13 +01:00
22e3da92b7 Fix wrong input shapes in doc-string of models (#37729)
* Fix wrong position_ids shape in doc

Supported by ClvpDecoder.forward, line 1212--1215:

src/transformers/models/clvp/modeling_clvp.py:
  1212	        if inputs_embeds is None:
  1213	            inputs_embeds = self.input_embeds_layer(input_ids)
  1214	        position_embeds = self.position_embeds_layer(position_ids)
  1215	        inputs_embeds = inputs_embeds + position_embeds

* Fix possibly wrong input_ids shape in doc

Since 'input_ids_length' was mentioned immediately after the shape `(batch_size, sequence_length)`, it doesn't make sense to me for `input_ids` to have such shape---IMO it ought to have shape `(batch_size, input_ids_length)` instead.

* Fix possibly wrong inputs_embeds shape in doc

Supported by CTRLModel.forward, line 448--449:

src/transformers/models/ctrl/modeling_ctrl.py:
   448	        if inputs_embeds is None:
   449	            inputs_embeds = self.w(input_ids)

This commit is introduced due to commit 6f36b56497828642b65f54ea26aa4064186de57a.

* Fix possibly wrong token_type_ids shape in doc

Supported by CTRLModel.forward, line 441--460:

src/transformers/models/ctrl/modeling_ctrl.py:
   441	        if token_type_ids is not None:
   442	            token_type_ids = token_type_ids.view(-1, input_shape[-1])
   443	            token_type_embeds = self.w(token_type_ids)
   444	            token_type_embeds *= np.sqrt(self.d_model_size)
   445	        else:
   446	            token_type_embeds = 0
   447
   448	        if inputs_embeds is None:
   449	            inputs_embeds = self.w(input_ids)
   450	        # inputs_embeds = embedded.unsqueeze(0) if len(input_ids.shape)<2 else embedded
   451	        seq_len = input_shape[-1]
   452	        mask = torch.triu(torch.ones(seq_len + past_length, seq_len + past_length), 1).to(device)
   453
   454	        inputs_embeds *= np.sqrt(self.d_model_size)
   455
   456	        # `self.pos_encoding` won't be sent to the correct device along the model, so we do it manually.
   457	        self.pos_encoding = self.pos_encoding.to(device)
   458	        pos_embeds = self.pos_encoding[position_ids, :]
   459
   460	        hidden_states = inputs_embeds + pos_embeds + token_type_embeds

This commit is introduced due to commit 6f36b56497828642b65f54ea26aa4064186de57a.

* Fix possibly wrong position_ids shape in doc

Supported by CTRLModel.forward, line 448--460:

src/transformers/models/ctrl/modeling_ctrl.py:
   448	        if inputs_embeds is None:
   449	            inputs_embeds = self.w(input_ids)
   450	        # inputs_embeds = embedded.unsqueeze(0) if len(input_ids.shape)<2 else embedded
   451	        seq_len = input_shape[-1]
   452	        mask = torch.triu(torch.ones(seq_len + past_length, seq_len + past_length), 1).to(device)
   453
   454	        inputs_embeds *= np.sqrt(self.d_model_size)
   455
   456	        # `self.pos_encoding` won't be sent to the correct device along the model, so we do it manually.
   457	        self.pos_encoding = self.pos_encoding.to(device)
   458	        pos_embeds = self.pos_encoding[position_ids, :]
   459
   460	        hidden_states = inputs_embeds + pos_embeds + token_type_embeds

This commit is introduced due to commit 6f36b56497828642b65f54ea26aa4064186de57a.

* Fix wrong token_type_ids shape in doc

Supported by TFCTRLMainLayer.call, line 376--394:

src/transformers/models/ctrl/modeling_tf_ctrl.py:
   376	        if token_type_ids is not None:
   377	            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
   378	            token_type_embeds = self.w(token_type_ids)
   379	            token_type_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, dtype=token_type_embeds.dtype))
   380	        else:
   381	            token_type_embeds = tf.constant(0.0)
   382	        position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
   383
   384	        if inputs_embeds is None:
   385	            check_embeddings_within_bounds(input_ids, self.w.input_dim)
   386	            inputs_embeds = self.w(input_ids)
   387	        seq_len = input_shape[-1]
   388	        mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
   389
   390	        inputs_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, inputs_embeds.dtype))
   391
   392	        pos_embeds = tf.gather(self.pos_encoding, position_ids)
   393	        pos_embeds = tf.cast(pos_embeds, dtype=token_type_embeds.dtype)
   394	        hidden_states = inputs_embeds + pos_embeds + token_type_embeds

* Fix wrong position_ids shape in doc

Supported by TFCTRLMainLayer.call, line 384--394:

src/transformers/models/ctrl/modeling_tf_ctrl.py:
   384	        if inputs_embeds is None:
   385	            check_embeddings_within_bounds(input_ids, self.w.input_dim)
   386	            inputs_embeds = self.w(input_ids)
   387	        seq_len = input_shape[-1]
   388	        mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
   389
   390	        inputs_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, inputs_embeds.dtype))
   391
   392	        pos_embeds = tf.gather(self.pos_encoding, position_ids)
   393	        pos_embeds = tf.cast(pos_embeds, dtype=token_type_embeds.dtype)
   394	        hidden_states = inputs_embeds + pos_embeds + token_type_embeds

* Fix wrong inputs_embeds shape in doc

Supported by TFCTRLMainLayer.call, line 384--394:

src/transformers/models/ctrl/modeling_tf_ctrl.py:
   384	        if inputs_embeds is None:
   385	            check_embeddings_within_bounds(input_ids, self.w.input_dim)
   386	            inputs_embeds = self.w(input_ids)
   387	        seq_len = input_shape[-1]
   388	        mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
   389
   390	        inputs_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, inputs_embeds.dtype))
   391
   392	        pos_embeds = tf.gather(self.pos_encoding, position_ids)
   393	        pos_embeds = tf.cast(pos_embeds, dtype=token_type_embeds.dtype)
   394	        hidden_states = inputs_embeds + pos_embeds + token_type_embeds

* Fix wrong inputs_embeds shape in doc

Supported by ClvpDecoder.forward, line 1212--1213:

src/transformers/models/clvp/modeling_clvp.py:
  1212	        if inputs_embeds is None:
  1213	            inputs_embeds = self.input_embeds_layer(input_ids)

* Fix wrong position_ids shape in doc

Supported by FlaxGemmaPreTrainedModel.__call__, line 502--508:

src/transformers/models/gemma/modeling_flax_gemma.py:
   502	        batch_size, sequence_length = input_ids.shape
   503
   504	        if position_ids is None:
   505	            if past_key_values is not None:
   506	                raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.")
   507
   508	            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))

* Fix wrong position_ids shape in doc

Supported by FlaxGPT2PreTrainedModel.__call__, line 482--488:

src/transformers/models/gpt2/modeling_flax_gpt2.py:
   482	        batch_size, sequence_length = input_ids.shape
   483
   484	        if position_ids is None:
   485	            if past_key_values is not None:
   486	                raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.")
   487
   488	            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))

* Fix wrong position_ids shape in doc

Supported by GPT2Model.forward, line 918--921:

src/transformers/models/gpt2/modeling_gpt2.py:
   918	        if inputs_embeds is None:
   919	            inputs_embeds = self.wte(input_ids)
   920	        position_embeds = self.wpe(position_ids)
   921	        hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device)

* Fix wrong inputs_embeds shape in doc

Supported by GPT2Model.forward, line 918--919:

src/transformers/models/gpt2/modeling_gpt2.py:
   918	        if inputs_embeds is None:
   919	            inputs_embeds = self.wte(input_ids)

* Fix wrong labels shape in doc

Supported by GPT2LMHeadModel.forward, line 1156--1157:

src/transformers/models/gpt2/modeling_gpt2.py:
  1156	            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
  1157	            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`

* Fix wrong labels shape in doc

Supported by GPT2DoubleHeadsModel.forward, line 1314--1315:

src/transformers/models/gpt2/modeling_gpt2.py:
  1314	            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
  1315	            `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`. All labels set to

* Fix wrong token_type_ids shape in doc

Supported by TFGPT2MainLayer.call, line 486--500:

src/transformers/models/gpt2/modeling_tf_gpt2.py:
   486	        if inputs_embeds is None:
   487	            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
   488	            inputs_embeds = self.wte(input_ids)
   489
   490	        position_embeds = self.wpe(position_ids)
   491
   492	        if token_type_ids is not None:
   493	            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
   494	            token_type_embeds = self.wte(token_type_ids)
   495	        else:
   496	            token_type_embeds = tf.constant(0.0)
   497
   498	        position_embeds = tf.cast(position_embeds, dtype=inputs_embeds.dtype)
   499	        token_type_embeds = tf.cast(token_type_embeds, dtype=inputs_embeds.dtype)
   500	        hidden_states = inputs_embeds + position_embeds + token_type_embeds

* Fix wrong position_ids shape in doc

Supported by TFGPT2MainLayer.call, line 486--500:

src/transformers/models/gpt2/modeling_tf_gpt2.py:
   486	        if inputs_embeds is None:
   487	            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
   488	            inputs_embeds = self.wte(input_ids)
   489
   490	        position_embeds = self.wpe(position_ids)
   491
   492	        if token_type_ids is not None:
   493	            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
   494	            token_type_embeds = self.wte(token_type_ids)
   495	        else:
   496	            token_type_embeds = tf.constant(0.0)
   497
   498	        position_embeds = tf.cast(position_embeds, dtype=inputs_embeds.dtype)
   499	        token_type_embeds = tf.cast(token_type_embeds, dtype=inputs_embeds.dtype)
   500	        hidden_states = inputs_embeds + position_embeds + token_type_embeds

* Fix wrong inputs_embeds shape in doc

Supported by TFGPT2MainLayer.call, line 486--488:

src/transformers/models/gpt2/modeling_tf_gpt2.py:
   486	        if inputs_embeds is None:
   487	            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
   488	            inputs_embeds = self.wte(input_ids)

* Fix wrong position_ids shape in doc

Supported by GPTBigCodeModel.forward, line 962--965:

src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py:
   962	        if inputs_embeds is None:
   963	            inputs_embeds = self.wte(input_ids)
   964	        position_embeds = self.wpe(position_ids)
   965	        hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device)

* Fix wrong inputs_embeds shape in doc

Supported by GPTBigCodeModel.forward, line 962--963:

src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py:
   962	        if inputs_embeds is None:
   963	            inputs_embeds = self.wte(input_ids)

* Fix wrong labels shape in doc

Supported by GPTBigCodeForCausalLM.forward, line 1158--1159:

src/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py:
  1158	            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
  1159	            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`

* Fix wrong position_ids shape in doc

Supported by FlaxGPTNeoModule.__call__, line 549--552:

src/transformers/models/gpt_neo/modeling_flax_gpt_neo.py:
   549	        input_embeds = self.wte(input_ids.astype("i4"))
   550	        position_embeds = self.wpe(position_ids.astype("i4"))
   551
   552	        hidden_states = input_embeds + position_embeds

* Fix wrong position_ids shape in doc

Supported by GPTNeoModel.forward, line 685--720:

src/transformers/models/gpt_neo/modeling_gpt_neo.py:
   685	        if inputs_embeds is None:
   686	            inputs_embeds = self.wte(input_ids)
   687
   688	        # kept for BC (non `Cache` `past_key_values` inputs)
   689	        return_legacy_cache = False
   690	        if use_cache and not isinstance(past_key_values, Cache):
   691	            return_legacy_cache = True
   692	            if past_key_values is None:
   693	                past_key_values = DynamicCache()
   694	            else:
   695	                past_key_values = DynamicCache.from_legacy_cache(past_key_values)
   696	                logger.warning_once(
   697	                    "We detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and "
   698	                    "will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class "
   699	                    "(https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)"
   700	                )
   701
   702	        seq_length = inputs_embeds.shape[1]
   703	        if cache_position is None:
   704	            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
   705	            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=inputs_embeds.device)
   706
   707	        if position_ids is None:
   708	            position_ids = cache_position.unsqueeze(0)
   709
   710	        causal_mask = self._update_causal_mask(
   711	            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
   712	        )
   713
   714	        # Prepare head mask if needed
   715	        # 1.0 in head_mask indicate we keep the head
   716	        # attention_probs has shape bsz x num_heads x N x N
   717	        # head_mask has shape n_layer x batch x num_heads x N x N
   718	        head_mask = self.get_head_mask(head_mask, self.config.num_layers)
   719	        position_embeds = self.wpe(position_ids)
   720	        hidden_states = inputs_embeds + position_embeds

* Fix wrong inputs_embeds shape in doc

Supported by GPTNeoModel.forward, line 685--686:

src/transformers/models/gpt_neo/modeling_gpt_neo.py:
   685	        if inputs_embeds is None:
   686	            inputs_embeds = self.wte(input_ids)

* Fix wrong labels shape in doc

Supported by GPTNeoForCausalLM.forward, line 968--969:

src/transformers/models/gpt_neo/modeling_gpt_neo.py:
   968	            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
   969	            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`

* Fix wrong position_ids shape in doc

Supported by FlaxGPTJPreTrainedModel.__call__, line 455--461:

src/transformers/models/gptj/modeling_flax_gptj.py:
   455	        batch_size, sequence_length = input_ids.shape
   456
   457	        if position_ids is None:
   458	            if past_key_values is not None:
   459	                raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.")
   460
   461	            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))

* Fix wrong token_type_ids shape in doc

Supported by TFGPTJMainLayer.call, line 482--493:

src/transformers/models/gptj/modeling_tf_gptj.py:
   482	        if inputs_embeds is None:
   483	            check_embeddings_within_bounds(input_ids, self.wte.vocab_size)
   484	            inputs_embeds = self.wte(input_ids, mode="embedding")
   485
   486	        if token_type_ids is not None:
   487	            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
   488	            token_type_embeds = self.wte(token_type_ids, mode="embedding")
   489	        else:
   490	            token_type_embeds = tf.constant(0.0)
   491
   492	        token_type_embeds = tf.cast(token_type_embeds, dtype=inputs_embeds.dtype)
   493	        hidden_states = inputs_embeds + token_type_embeds

* Fix wrong position_ids shape in doc

Supported by TFGPTJMainLayer.call, line 434--449:

src/transformers/models/gptj/modeling_tf_gptj.py:
   434	        elif input_ids is not None:
   435	            input_shape = shape_list(input_ids)
   436	            input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
   437	        elif inputs_embeds is not None:
   438	            input_shape = shape_list(inputs_embeds)[:-1]
   439	        else:
   440	            raise ValueError("You have to specify either input_ids or inputs_embeds")
   441
   442	        if past_key_values is None:
   443	            past_length = 0
   444	            past_key_values = [None] * len(self.h)
   445	        else:
   446	            past_length = shape_list(past_key_values[0][0])[-2]
   447
   448	        if position_ids is None:
   449	            position_ids = tf.expand_dims(tf.range(past_length, input_shape[-1] + past_length), axis=0)

* Fix wrong inputs_embeds shape in doc

Supported by TFGPTJMainLayer.call, line 482--484:

src/transformers/models/gptj/modeling_tf_gptj.py:
   482	        if inputs_embeds is None:
   483	            check_embeddings_within_bounds(input_ids, self.wte.vocab_size)
   484	            inputs_embeds = self.wte(input_ids, mode="embedding")

* Fix wrong labels shape in doc

Supported by TFGPTJForCausalLM.call, line 812--813:

src/transformers/models/gptj/modeling_tf_gptj.py:
   812	            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
   813	            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`

* Fix possibly wrong input_ids shape in doc

Since 'input_ids_length' was mentioned immediately after the shape `(batch_size, sequence_length)`, it doesn't make sense to me for `input_ids` to have such shape---IMO it ought to have shape `(batch_size, input_ids_length)` instead.

* Fix possibly wrong token_type_ids shape in doc

Supported by ImageGPTModel.forward, line 773--780:

src/transformers/models/imagegpt/modeling_imagegpt.py:
   773	        if inputs_embeds is None:
   774	            inputs_embeds = self.wte(input_ids)
   775	        position_embeds = self.wpe(position_ids)
   776	        hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device)
   777
   778	        if token_type_ids is not None:
   779	            token_type_embeds = self.wte(token_type_ids)
   780	            hidden_states = hidden_states + token_type_embeds

This commit is introduced due to commit 8e594a4143cca79f165b99e4ed4c9f3a90047bf3.

* Fix possibly wrong position_ids shape in doc

Supported by ImageGPTModel.forward, line 773--776:

src/transformers/models/imagegpt/modeling_imagegpt.py:
   773	        if inputs_embeds is None:
   774	            inputs_embeds = self.wte(input_ids)
   775	        position_embeds = self.wpe(position_ids)
   776	        hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device)

This commit is introduced due to commit 8e594a4143cca79f165b99e4ed4c9f3a90047bf3.

* Fix possibly wrong inputs_embeds shape in doc

Supported by ImageGPTModel.forward, line 773--774:

src/transformers/models/imagegpt/modeling_imagegpt.py:
   773	        if inputs_embeds is None:
   774	            inputs_embeds = self.wte(input_ids)

This commit is introduced due to commit 8e594a4143cca79f165b99e4ed4c9f3a90047bf3.

* Fix possibly wrong labels shape in doc

Supported by ImageGPTForCausalImageModeling.forward, line 923--924:

src/transformers/models/imagegpt/modeling_imagegpt.py:
   923	            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
   924	            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`

This commit is introduced due to commit 8e594a4143cca79f165b99e4ed4c9f3a90047bf3.

* Fix possibly wrong labels shape in doc

Supported by ImageGPTModel.forward, line 665--666:

src/transformers/models/imagegpt/modeling_imagegpt.py:
   665	            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
   666	            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`

This commit is introduced due to commit 8e594a4143cca79f165b99e4ed4c9f3a90047bf3.

* Fix wrong position_ids shape in doc

Supported by FlaxLlamaPreTrainedModel.__call__, line 484--490:

src/transformers/models/llama/modeling_flax_llama.py:
   484	        batch_size, sequence_length = input_ids.shape
   485
   486	        if position_ids is None:
   487	            if past_key_values is not None:
   488	                raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.")
   489
   490	            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))

* Fix wrong position_ids shape in doc

Supported by FlaxMistralPreTrainedModel.__call__, line 478--484:

src/transformers/models/mistral/modeling_flax_mistral.py:
   478	        batch_size, sequence_length = input_ids.shape
   479
   480	        if position_ids is None:
   481	            if past_key_values is not None:
   482	                raise ValueError("Make sure to provide `position_ids` when passing `past_key_values`.")
   483
   484	            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
2025-04-24 15:36:03 +01:00
4d64c38593 [generate] fix default autocompile case on gpu (#37756) 2025-04-24 15:08:38 +01:00
43bb4c0456 Fix qwen2_5 get_rope_index tensor device locations (#37597)
* Fix qwen2_5 get_rope_index tensor device locations

* simpler fix

* edit right file for modular model

* add a test

* try normalizing type to fix non-video

* fix some imports

* add a video forward test with dummy input
2025-04-24 16:04:38 +02:00
dd2649fa98 updated hidden_features for FlaxDinov2SwiGLUFFN in Dinov2 (#37747)
Flax Dinov2: updated hidden_features in FlaxDinov2SwiGLUFFN

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
2025-04-24 14:30:31 +01:00
8bdd4f2acd [generate] skip compilation on cpu offload (#37709)
* skip compilation on cpu offload

* add test

* better logic

* docstring

* boolean logic

* add disk offload check

* warn users if compilation options are set but compilation doesn't happen

* fix test

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-04-24 14:08:17 +01:00
7c62e69326 GPT2Model StaticCache support (#35761)
* initial GPT2 changes

* causal_mask support

* return_legacy_cache

* cleanup

* fix1

* outputs shape fixes

* gpt2 return fix

* pkv, attn fixes

* fix dual_head

* is_causal arg fix

* decision transformer updated

* style fix

* batch_size from inputs_embeds

* DecisionTransformerModel fixes

* cross-attn support + cache warning

* x-attn @decision

* EDCache proper init

* simplified logic in `if use_cache:` for GPT2Model

* @deprecate_kwarg for DecisionTr attn fwd

* @deprecate_kwarg in gpt2

* deprecation version updated to 4.51

* kwargs in gradient_checkpointing_fn

* rename next_cache to past_key_values

* attention_mask prep

* +cache_position in GPT2DoubleHeadsModel

* undo kwargs in gradient checkpointing

* moved up `if self.gradient_checkpointing`

* consistency in decision_transformer

* pastkv, cache_pos in grad_checkpt args

* rm _reorder_cache

* output_attentions streamlined

* decision_transformer consistency

* return_legacy_cache improved

* ClvpForCausalLM used for legacy cache test now

* is_causal fixed

* attn_output cleanup

* consistency @ decision_transformer

* Updated deprecation notice version to 4.52

* upd deprecation

* consistent legacy cache code in decision transformers

* next_cache -> past_kv in decision_tr

* cache support flags in decision_transf

* rm legacy cache warning

* consistency in cache init for decision transf

* no Static Cache for Decision Transformer

---------

Co-authored-by: Cyril Vallez <cyril.vallez@huggingface.co>
2025-04-24 14:46:35 +02:00
9f927c8250 [cache] fix HybridCache init when device is passed (#37718)
fix device init
2025-04-24 13:36:52 +01:00
4fee320926 Expand quantized data type support for tensor parallelism (#37719)
Update tensor_parallel.py

Co-authored-by: Xiao YU <Xiao.YU@xilinx.com>
2025-04-24 14:34:32 +02:00
0f7940bb3f Update MllamaForConditionalGenerationIntegrationTest (#37750)
* fix 1

* fix 2

* fix 3

* fix 4

* fix 5

* fix 6

* trigger CI

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-04-24 14:29:46 +02:00
7e6f36cd38 Skip all AriaForConditionalGenerationIntegrationTest on T4 (#37746)
* skip

* ruff

* trigger CI

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-04-24 14:11:56 +02:00
0327d0f7f2 [performance_optim] define flash attention mask on NPU device directly (#37698)
Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>
2025-04-24 14:06:47 +02:00
14e28bd721 Correctly raise errors when downloading tokenizer files (#37740)
* first try

* Update tokenization_utils_base.py

* Update tokenization_utils_base.py

* standardize
2025-04-24 12:53:07 +02:00
0ec0495967 Fix embeds_to_talker device in Qwen2.5-Omni (#37739)
Fix `embeds_to_talker` device

Co-authored-by: lvyuanjun.lyj <lvyuanjun.lyj@alibaba-inc.com>
2025-04-24 12:49:57 +02:00
72e4844059 fix: learning_rate logged as tensor causing save issue with deepspeed (#37704)
* fix: learning_rate logged as tensor causing save issue with deepspeed

* chore: lint

---------

Co-authored-by: NanoCode012 <chanvichet@Chanvichets-MacBook-Pro.local>
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-04-24 12:20:47 +02:00
1cfcbfcab8 [VLMs] fix flash-attention tests (#37603)
* fix one test

* fa2 ln test

* remove keys from config recursively

* fix

* fixup
2025-04-24 11:48:11 +02:00
02baa61fab Make sure torch_is_available before using torch.distributed (#37693)
fix
2025-04-24 11:31:35 +02:00
864e9636ff [tests] fix test_nemotron_8b_generation_sdpa (#37665)
add max_new_tokens
2025-04-24 11:28:35 +02:00
9b3bf4a206 Fix torchao doc examples (#37697)
fix

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-04-24 11:10:27 +02:00
3ed56bea0f Fix inference bugs in Qwen2.5 Omni (#37701)
* Init `SinusoidsPositionEmbedding` with float to avoid precision problem

* fix hidden_state for talker

* Update modular_qwen2_5_omni.py

* Move hidden processing out from thinker

* fixup

---------

Co-authored-by: lvyuanjun.lyj <lvyuanjun.lyj@alibaba-inc.com>
2025-04-24 10:51:44 +02:00
b7f7aa78a0 Fix Aria tests (#37444)
* update aria tests

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* add cuda tests

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* check outputs for cpu and cuda and xpu

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* check outputs for cpu and cuda and xpu

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* check outputs for cpu and cuda and xpu

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* check output for each device

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix style

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix style

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* fix xpu output

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* add comments and use assert list equal

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

* rm pad token assign

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>

---------

Signed-off-by: jiqing-feng <jiqing.feng@intel.com>
2025-04-24 10:51:29 +02:00
b6d65e40b2 Add Fast Image Processor for MobileNetV1 (#37111)
* fast image processor template for MobileNetV1 via transformers-cli

* Add fast image processors and unify tests for slow/fast image processor classes

* added loop over image_processor_list for all tests and removed boilerplate comments.

---------

Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
2025-04-23 15:55:41 -04:00
dea1919be4 Add Fast Image Processor for PoolFormer (#37182)
* support poolformer fast image processor

* support test for crop_pct=None

* run make style

* Apply suggestions from code review

* rename test

---------

Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
2025-04-23 15:55:33 -04:00
b491f128d6 Add Fast PVT Processor (#37204)
* Add Fast PVT Processor

* Update image_processing_pvt_fast.py

* Update image_processing_pvt_fast.py

* remove kwargs

---------

Co-authored-by: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
2025-04-23 15:55:20 -04:00
19e9079dc1 enable 4 test_trainer cases on XPU (#37645)
Signed-off-by: YAO Matrix <matrix.yao@intel.com>
2025-04-23 21:29:42 +02:00
5cd6b64059 Process inputs directly in apply_chat_template in image-text-to-text pipeline (#35616)
* tokenize inputs directly in apply_chat_template

* refactor processing

* revert changes processing llava

* Update docs

* fix issue with str being iterable

* add test chat text only

* change function name
2025-04-23 13:31:33 -04:00
80ea2c05c2 [tests, qwen2_5_omni] fix flaky tests (#37721) 2025-04-23 17:54:12 +01:00
63c6331387 Qwen 2.5 Omni: apply video defaults (#37660)
* Apply video defaults for min_pixels and max_pixels

* fps kwarg should not be a list

* Update test to account for new resizing
2025-04-23 17:08:11 +02:00
1e9087368c [internvl] fix chat template (#37656)
* fix chat template

* update

* update conversion

* rename `fake_image_token` in tests
2025-04-23 16:56:36 +02:00
9ec8be56dd TransfoXL is deprecated, don't keep it in tested examples! (#37707)
* TransfoXL is deprecated, so we should remove it from examples that get tested

* Remove the tokenizer too

* Trigger tests
2025-04-23 14:59:38 +01:00
be9b0e8521 [CI] add back sacrebleu (and document why) (#37700)
* example test

* add back dep

* dev-ci

* dev-ci
2025-04-23 14:45:00 +01:00
1d7d7a942e Add maintainers for ROCm/Intel XPU/Ascend NPU (#37678)
* Add maintainers for ROCm/Intel XPU/Ascend NPU

* Correct capitalization for usernames

* Update .github/ISSUE_TEMPLATE/bug-report.yml

Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>

* Update .github/ISSUE_TEMPLATE/bug-report.yml

Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>

* Trigger tests

---------

Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
2025-04-23 14:28:32 +01:00
cc9a245e6d [cleanup] remove /model_cards 🧹 🧹 (#37685)
rm model_cards
2025-04-23 12:45:27 +01:00
ca790303f7 Pin torch == 2.6 on PR CI docker images for now (#37695)
pin 2.6 on CircleCi images

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-04-23 11:47:23 +02:00
12f65ee752 enable cpu offloading for Bark on xpu (#37599)
* enable cpu offloading of bark modeling on XPU

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* remove debug print

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* fix style

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* fix review comments

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* enhance test

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* update

* add deprecate message

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* update

* update

* trigger CI

---------

Signed-off-by: YAO Matrix <matrix.yao@intel.com>
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-04-23 11:37:15 +02:00
4f9893cbbc fix: remove classmethod from Qwen2_5OmniConfig.get_text_config (#37690)
- Since `get_text_config` references an instance variable within the class (`self.thinker_config`), the `get_text_config` method should not be a classmethod.

- Before this fix, users were getting the following error:

    AttributeError: type object 'Qwen2_5OmniConfig' has no attribute 'thinker_config'
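
A minimal sketch of the distinction, using hypothetical stand-in classes rather than the actual transformers config code:

    class ThinkerConfig:
        pass

    class OmniConfigSketch:
        def __init__(self):
            # Set per instance, never on the class itself.
            self.thinker_config = ThinkerConfig()

        @classmethod
        def get_text_config_broken(cls):
            # `cls` is the class object, which has no `thinker_config`,
            # so this raises the AttributeError quoted above.
            return cls.thinker_config

        def get_text_config(self):
            # Instance method: the attribute exists on `self`.
            return self.thinker_config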
2025-04-23 09:30:57 +02:00
1d9743edc2 Updated model card for mbart and mbart50 (#37619)
* new card for mbart and mbart50

* removed comment BADGES

* Update mBart overview

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* fix typo (MBart to mBart)

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* maybe fix typo

* update typo and combine notes

* changed notes

* changed the example sentence

* fixed grammatical error and removed some lines from notes example

* missed one word

* removed documentation resources and added some lines of example code back in notes.

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2025-04-22 12:26:47 -07:00
fbfa1dd4db 🌐 [i18n-KO] Translated siglip.md to Korean (#37145)
* docs: ko: siglip.md

* feat: nmt draft

* fix: manual edits

* chore: Correct document title to kebab-case format

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Apply suggestions from code review

Convert unnatural language to natural Korean

Co-authored-by: Yijun Lee <119404328+yijun-lee@users.noreply.github.com>

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
Co-authored-by: Yijun Lee <119404328+yijun-lee@users.noreply.github.com>
2025-04-22 12:23:19 -07:00
ece79b0688 enable blip2 and emu3 cases on XPU (#37662)
* enable blip2 and emu3 modeling cases on XPU

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* fix style

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* remove extra new line

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* update

---------

Signed-off-by: YAO Matrix <matrix.yao@intel.com>
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-04-22 18:37:09 +02:00
ca4c114dc4 Add counters for dataset classes (#37636)
* add counters for dataset classes

* fix failed code style
2025-04-22 17:30:43 +01:00
d47cdae27e [Docs] Move models to appropriate section (#37338)
* Move models

* update

---------

Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-04-22 18:23:14 +02:00
dbfccd3c92 typo update in the parameter name (#37655)
See L118 and L143 for the class attribute `hidden_dim`
2025-04-22 18:14:20 +02:00
de8916dde6 [docs] only build en docs in push CI (#37677) 2025-04-22 17:05:11 +01:00
0f8c34b0a0 [cleanup] remove old scripts in /scripts 🧹 🧹 (#37676)
* rm old files

* not this one
2025-04-22 16:59:03 +01:00
6673081b21 enable 6 granite cases on xpu (#37569)
* enable 6 granite cases on XPU

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* make them all pass on A100

Signed-off-by: N <matrix.yao@intel.com>

* fix style

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* update

---------

Signed-off-by: YAO Matrix <matrix.yao@intel.com>
Signed-off-by: N <matrix.yao@intel.com>
Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2025-04-22 17:55:02 +02:00
9167461a7d enable mllama cases on xpu (#37644)
* enable mllama testing on xpu

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* more mllama cases enabling

Signed-off-by: YAO Matrix <matrix.yao@intel.com>

* make cases pass on A100

Signed-off-by: N <matrix.yao@intel.com>

---------

Signed-off-by: YAO Matrix <matrix.yao@intel.com>
Signed-off-by: N <matrix.yao@intel.com>
2025-04-22 17:39:10 +02:00
de182ba269 Refactor bitsandbytes doc (#37668)
* doc

* torch ops

* fix

* nits

* Update docs/source/en/quantization/bitsandbytes.md

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-04-22 16:13:25 +02:00
dde9b03e3b Fix no_split_modules for Llama4 pretrained models (#37673) 2025-04-22 16:05:12 +02:00
9481e9e9f1 Fix autoround docs (#37675)
* fix

* empty
2025-04-22 15:33:13 +02:00
38c406844e Fixing quantization tests (#37650)
* fix

* style

* add capability check
2025-04-22 13:59:57 +02:00
b3492ff9f7 Add AutoRound quantization support (#37393)
* add auto-round support

* Update src/transformers/quantizers/auto.py

Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>

* fix style issue

Signed-off-by: wenhuach <wenhuach87@gmail.com>

* tiny change

* tiny change

* refine ut and doc

* revert unnecessary change

* tiny change

* try to fix style issue

* try to fix style issue

* try to fix style issue

* try to fix style issue

* try to fix style issue

* try to fix style issue

* try to fix style issue

* fix doc issue

* Update tests/quantization/autoround/test_auto_round.py

* fix comments

* Update tests/quantization/autoround/test_auto_round.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* Update tests/quantization/autoround/test_auto_round.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* update doc

* Update src/transformers/quantizers/quantizer_auto_round.py

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>

* update

* update

* fix

* try to fix style issue

* Update src/transformers/quantizers/auto.py

Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>

* Update docs/source/en/quantization/auto_round.md

Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>

* Update docs/source/en/quantization/auto_round.md

Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>

* Update docs/source/en/quantization/auto_round.md

Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>

* update

* fix style issue

* update doc

* update doc

* Refine the doc

* refine doc

* revert one change

* set sym to True by default

* Enhance the unit test's robustness.

* update

* add torch dtype

* tiny change

* add awq convert test

* fix typo

* update

* fix packing format issue

* use one gpu

---------

Signed-off-by: wenhuach <wenhuach87@gmail.com>
Co-authored-by: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com>
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>
Co-authored-by: Shen, Haihao <haihao.shen@intel.com>
2025-04-22 13:56:54 +02:00
9608908639 Correct warm-up with fp8 (#37670)
* start clean warmup for quantizers

* style

---------

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-04-22 13:12:49 +02:00
6614209b96 Fix duplicated weights in fp8 quantization (#37667)
* fix fp8

* Update quantizer_finegrained_fp8.py

* fix circular import

* Update quantizer_finegrained_fp8.py
2025-04-22 13:12:27 +02:00
dcf6df5b0d [qwen-omni] fix training (#37517)
* fix

* add text config

* fixup

* fix docs
2025-04-22 12:36:07 +02:00
9167fadab9 Introduce GradientCheckpointingLayer (#37223)
* GradientCheckpointingLayer

* trigger

* Move GC layer to a separate file

* Update import

* Expose and document GC layer

* Fix dummy

* Apply to llama-based models

* Update modulars

* Update a few more models for consistency

* Update glm4

* Update Janus
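
As a rough illustration of the concept (a sketch under assumed names, not the transformers implementation), such a layer can reroute its call through `torch.utils.checkpoint` whenever checkpointing is enabled on it:

    import torch
    import torch.nn as nn
    from torch.utils.checkpoint import checkpoint

    class GradientCheckpointingLayerSketch(nn.Module):
        gradient_checkpointing = False  # toggled by the surrounding model

        def __call__(self, *args, **kwargs):
            if self.gradient_checkpointing and self.training:
                # Recompute activations during backward instead of storing them.
                return checkpoint(super().__call__, *args, use_reentrant=False, **kwargs)
            return super().__call__(*args, **kwargs)

    class MLPBlock(GradientCheckpointingLayerSketch):
        def __init__(self, dim):
            super().__init__()
            self.fc1 = nn.Linear(dim, 4 * dim)
            self.fc2 = nn.Linear(4 * dim, dim)

        def forward(self, x):
            return self.fc2(torch.relu(self.fc1(x)))

Subclasses keep a normal `forward`; the surrounding model flips `gradient_checkpointing` on each layer to trade recomputation for activation memory during training.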
2025-04-22 11:33:31 +01:00
413f9bbf80 Fixes #37219 : RecurrentGemma crashes for inputs longer than sliding window length (#37613)
* fix: RecurrentGemma crashes during inference for inputs longer than sliding window width

* fix recurrentgemma tests; add long test bigger than context window
2025-04-22 12:21:16 +02:00
964a1b6b7d Fix ValueError when eval_do_concat_batches=False with examples (#37621)
https://github.com/huggingface/transformers/issues/37593

Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-04-22 12:13:25 +02:00
85665a4263 [tests] Stricter generate + compilation test -- no recompilations allowed (#37629)
* tmp commit

* stricter compilation test

* trigger tests

* rm todo
2025-04-22 11:12:18 +01:00
362fa37da2 [test] update test_past_key_values_format (#37614)
allow custom shapes
2025-04-22 11:07:34 +01:00
1cd110c6cb Add test to ensure unknown exceptions reraising in utils/hub.py::cached_files() (#37651)
* add test to ensure unknown exceptions are reraised in utils/hub.py::cached_files()
2025-04-22 11:38:10 +02:00
c69e23455d Support loading Gemma3 QAT GGUF models (#37649)
* fix gemma3 qat gguf support

Signed-off-by: isotr0py <2037008807@qq.com>

* update test

Signed-off-by: isotr0py <2037008807@qq.com>

* make ruff happy

Signed-off-by: isotr0py <2037008807@qq.com>

---------

Signed-off-by: isotr0py <2037008807@qq.com>
Co-authored-by: Mohamed Mekkouri <93391238+MekkCyber@users.noreply.github.com>
2025-04-22 11:23:17 +02:00
7eb1107cc2 Restructure torchao quantization examples (#37592)
* Restructure torchao quantization examples

Summary:
Mainly structured the examples by hardware and listed the recommended quantization methods for each target: H100 GPU, A100 GPU, and CPU

Also added example for push_to_hub

Test Plan:
not required

Reviewers:

Subscribers:

Tasks:

Tags:

* update

* drop float8 cpu

* address comments and simplify

* small update

* link update

* minor update
2025-04-22 11:20:34 +02:00
006530d285 [fix gemma] Set default value for output_attentions parameter in Gemma2 and Gemma… (#37633)
* Set default value for output_attentions parameter in Gemma2 and Gemma3 models

* update

* fix

* fix

---------

Co-authored-by: chenin <wangzhichen@encosmart.com>
2025-04-22 11:18:17 +02:00
31ea547b7a [fix] make legacy bnb code work (#37331)
* [fix] make legacy bnb code work

* [fix] use get with default instead of getter

* add test for bnb 8bit optim skip embed

* [fix] style

* add require annotation of bnb

---------

Co-authored-by: jaycha <jaycha@ncsoft.com>
Co-authored-by: Marc Sun <57196510+SunMarc@users.noreply.github.com>
2025-04-22 11:17:29 +02:00
5f791281c3 Fix Qwen2.5-Omni get_chunked_index chunking functionality (#37631)
* fix: qwen2.5 omni modular get_rope_index

* test: add test for qwen2.5 omni rope index (video with audio input)

* style

* expected_position_ids readability

* fix: use spatial_merge_size = 1 in unit test
2025-04-22 11:15:37 +02:00
fee1190601 Refactor phi doc (#37583)
* Added documentation for phi model

* Update phi.md

* Update phi.md

* Update phi.md

* Update docs/source/en/model_doc/phi.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/phi.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/phi.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/phi.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Updated model card

* Update phi.md

* Update phi.md

* Update phi.md

* Update docs/source/en/model_doc/phi.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

---------

Co-authored-by: Jihad <jihadhammoud_@hotmail.com>
Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2025-04-21 10:31:04 -07:00
b2db54f66b Update longformer.md (#37622)
* Update longformer.md

* Update longformer.md

* Update docs/source/en/model_doc/longformer.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/longformer.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update longformer.md

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2025-04-21 10:30:51 -07:00
2c60a442f3 fix link in kv_cache.md (#37652)
fix typo in kv_cache.md
2025-04-21 09:01:11 -07:00
a42ba80fa5 Allow Exclusion of Input IDs from RepetitionPenaltyLogitsProcessor (#37625)
* Allow exclusion of input IDs for repetition penalty

* Add logit proc tests for rep penalty exclusion

* Expose rep pen flag through generate

* Only slice if needed

* keep current rep pen default behavior

* Revert exposing reppen changes through generate

* Fix test arg

* Update src/transformers/generation/logits_process.py

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>

* Rename to rep penalty kwarg

* Add custom repetition penalty processor example

* Validate prompt_ignore_length

---------

Co-authored-by: Joao Gante <joaofranciscocardosogante@gmail.com>
2025-04-21 15:46:05 +01:00
1077603410 Remove torchvision requirement from AutoImageProcessor (#37457) 2025-04-21 14:59:33 +02:00
1930e750e4 [kernels] use original forward at compile time (#37604) 2025-04-21 13:22:47 +01:00
6daa3eeba5 Fix InternVL attention when using qk_norm (38B and 78B) (#37620)
* fix internvlvision attention when using qk_norm

* nit

* modular
2025-04-19 21:39:08 +02:00
27a25bee4f chore: update model card for SigLIP (#37585)
* edit siglip model card

* fix syntax

* Update docs/source/en/model_doc/siglip.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/siglip.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/siglip.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/siglip.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/siglip.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* Update docs/source/en/model_doc/siglip.md

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>

* address comments

---------

Co-authored-by: Steven Liu <59462357+stevhliu@users.noreply.github.com>
2025-04-18 13:30:41 -07:00
614 changed files with 8369 additions and 5873 deletions

View File

@@ -56,6 +56,12 @@ body:
- ray/raytune: @richardliaw, @amogkam
- Big Model Inference: @SunMarc
- quantization (bitsandbytes, autogpt): @SunMarc @MekkCyber
Devices/Backends:
- AMD ROCm: @ivarflakstad
- Intel XPU: @IlyasMoutawwakil
- Ascend NPU: @ivarflakstad
Documentation: @stevhliu

View File

@@ -14,4 +14,4 @@ jobs:
commit_sha: ${{ github.event.pull_request.head.sha }}
pr_number: ${{ github.event.number }}
package: transformers
languages: ar de en es fr hi it ko pt tr zh ja te
languages: en

View File

@@ -90,7 +90,7 @@ def summarize(run_dir, metrics, expand_metrics=False):
model = benchmark.config.backend["model"]
# Ths looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`.
# This looks like `benchmark.input_shapes.batch_size=1,benchmark.input_shapes.sequence_length=5`.
# (we rely on the usage of hydra's `${hydra.job.override_dirname}`.)
benchmark_name = re.sub(f"backend.model={model},*", "", report_dir)
benchmark_name = str(Path(benchmark_name).parts[-1])

View File

@@ -293,7 +293,7 @@ def run_benchmark(logger: Logger, branch: str, commit_id: str, commit_msg: str,
max_cache_len=seq_length + 128,
)
# 3nd call
# 3rd call
start = perf_counter()
output = model.generate(**inputs, past_key_values=past_key_values)
end = perf_counter()

View File

@@ -5,7 +5,7 @@ ARG REF=main
RUN apt-get update && apt-get install -y time git g++ pkg-config make git-lfs
ENV UV_PYTHON=/usr/local/bin/python
RUN pip install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools GitPython
RUN uv pip install --no-cache-dir --upgrade 'torch' 'torchaudio' 'torchvision' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir --upgrade 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
# tensorflow pin matching setup.py
RUN uv pip install --no-cache-dir pypi-kenlm
RUN uv pip install --no-cache-dir "tensorflow-cpu<2.16" "tf-keras<2.16"

View File

@ -16,7 +16,7 @@ RUN cmake .. -DCMAKE_INSTALL_PREFIX=/usr/local
RUN make install -j 10
RUN uv pip install --no-cache --upgrade 'torch' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache --upgrade 'torch==2.6.0' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[ja,testing,sentencepiece,jieba,spacy,ftfy,rjieba]" unidic unidic-lite
# spacy is not used so not tested. Causes to failures. TODO fix later

View File

@ -5,7 +5,7 @@ USER root
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]" seqeval albumentations jiwer
RUN uv pip uninstall transformers

View File

@ -5,7 +5,7 @@ USER root
RUN apt-get update && apt-get install -y libsndfile1-dev espeak-ng time git libgl1-mesa-glx libgl1 g++ tesseract-ocr
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir --no-deps timm accelerate
RUN pip install -U --upgrade-strategy eager --no-cache-dir pytesseract python-Levenshtein opencv-python nltk
# RUN uv pip install --no-cache-dir natten==0.15.1+torch210cpu -f https://shi-labs.com/natten/wheels

View File

@ -5,7 +5,7 @@ USER root
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git pkg-config openssh-client git
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir --upgrade 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing]"
RUN uv pip uninstall transformers

View File

@ -5,7 +5,7 @@ USER root
RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-dev espeak-ng time git g++ cmake pkg-config openssh-client git git-lfs
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir --upgrade 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-deps timm accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir librosa "git+https://github.com/huggingface/transformers.git@${REF}#egg=transformers[sklearn,sentencepiece,vision,testing,tiktoken,num2words,video]"
RUN uv pip uninstall transformers

View File

@ -7,7 +7,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends libsndfile1-de
ENV UV_PYTHON=/usr/local/bin/python
RUN pip --no-cache-dir install uv && uv venv && uv pip install --no-cache-dir -U pip setuptools
RUN uv pip install --no-cache-dir --no-deps accelerate --extra-index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch' 'torchvision' 'torchaudio' --index-url https://download.pytorch.org/whl/cpu
RUN uv pip install --no-cache-dir 'torch==2.6.0' 'torchaudio==2.6.0' 'torchvision==0.21.0' --index-url https://download.pytorch.org/whl/cpu
RUN git lfs install
RUN uv pip install --no-cache-dir pypi-kenlm

View File

@ -84,6 +84,9 @@ RUN python3 -m pip install --no-cache-dir compressed-tensors
# Add AMD Quark for quantization testing
RUN python3 -m pip install --no-cache-dir amd-quark
# Add AutoRound for quantization testing
RUN python3 -m pip install --no-cache-dir "auto-round>=0.5.0"
# Add transformers in editable mode
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]

View File

@ -167,6 +167,8 @@
title: Quantization concepts
- local: quantization/aqlm
title: AQLM
- local: quantization/auto_round
title: AutoRound
- local: quantization/awq
title: AWQ
- local: quantization/bitnet
@ -383,6 +385,8 @@
title: BigBirdPegasus
- local: model_doc/biogpt
title: BioGpt
- local: model_doc/bitnet
title: BitNet
- local: model_doc/blenderbot
title: Blenderbot
- local: model_doc/blenderbot-small
@ -491,8 +495,6 @@
title: GraniteMoe
- local: model_doc/granitemoeshared
title: GraniteMoeShared
- local: model_doc/granitevision
title: GraniteVision
- local: model_doc/helium
title: Helium
- local: model_doc/herbert
@ -513,8 +515,6 @@
title: Llama2
- local: model_doc/llama3
title: Llama3
- local: model_doc/llama4
title: Llama4
- local: model_doc/longformer
title: Longformer
- local: model_doc/longt5
@ -543,8 +543,6 @@
title: MegatronGPT2
- local: model_doc/mistral
title: Mistral
- local: model_doc/mistral3
title: Mistral3
- local: model_doc/mixtral
title: Mixtral
- local: model_doc/mluke
@ -595,8 +593,6 @@
title: Phi
- local: model_doc/phi3
title: Phi-3
- local: model_doc/phi4_multimodal
title: Phi4 Multimodal
- local: model_doc/phimoe
title: PhiMoE
- local: model_doc/phobert
@ -939,6 +935,8 @@
title: GIT
- local: model_doc/got_ocr2
title: GOT-OCR2
- local: model_doc/granitevision
title: GraniteVision
- local: model_doc/grounding-dino
title: Grounding DINO
- local: model_doc/groupvit
@ -969,6 +967,8 @@
title: LayoutXLM
- local: model_doc/lilt
title: LiLT
- local: model_doc/llama4
title: Llama4
- local: model_doc/llava
title: Llava
- local: model_doc/llava_next
@ -983,6 +983,8 @@
title: MatCha
- local: model_doc/mgp-str
title: MGP-STR
- local: model_doc/mistral3
title: Mistral3
- local: model_doc/mllama
title: mllama
- local: model_doc/nougat
@ -999,6 +1001,8 @@
title: PaliGemma
- local: model_doc/perceiver
title: Perceiver
- local: model_doc/phi4_multimodal
title: Phi4 Multimodal
- local: model_doc/pix2struct
title: Pix2Struct
- local: model_doc/pixtral

View File

@ -20,6 +20,10 @@ This page lists all the custom layers used by the library, as well as the utilit
Most of those are only useful if you are studying the code of the models in the library.
## Layers
[[autodoc]] GradientCheckpointingLayer
## Attention Functions
[[autodoc]] AttentionInterface

View File

@ -18,7 +18,7 @@ rendered properly in your Markdown viewer.
The key-value (KV) vectors are used to calculate attention scores. For autoregressive models, KV scores are calculated *every* time because the model predicts one token at a time. Each prediction depends on the previous tokens, which means the model performs the same computations each time.
A KV *cache* stores these calculations so they can be reused without recomputing them. Efficient caching is crucial for optimizing model performance because it reduces computation time and improves response rates. Refer to the [Caching](./cache_explanation.md) doc for a more detailed explanation about how a cache works.
A KV *cache* stores these calculations so they can be reused without recomputing them. Efficient caching is crucial for optimizing model performance because it reduces computation time and improves response rates. Refer to the [Caching](./cache_explanation) doc for a more detailed explanation about how a cache works.
Transformers offers several [`Cache`] classes that implement different caching mechanisms. Some of these [`Cache`] classes are optimized to save memory while others are designed to maximize generation speed. Refer to the table below to compare cache types and use it to help you select the best cache for your use case.
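As a minimal sketch of the idea (the checkpoint choice and generation settings are only examples), the snippet below passes an explicit [`DynamicCache`] to [`~GenerationMixin.generate`] so the key-value projections computed for earlier tokens are reused at every decoding step:

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DynamicCache

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b", torch_dtype=torch.bfloat16, device_map="auto")

inputs = tokenizer("A KV cache speeds up decoding because", return_tensors="pt").to(model.device)

# The cache is filled during prefill and reused for every generated token,
# so each step only computes the key-value projections for the newest token.
past_key_values = DynamicCache()
outputs = model.generate(**inputs, max_new_tokens=30, past_key_values=past_key_values)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```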

View File

@ -77,9 +77,9 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
[[autodoc]] TorchAoConfig
## BitNetConfig
## BitNetQuantConfig
[[autodoc]] BitNetConfig
[[autodoc]] BitNetQuantConfig
## SpQRConfig
@ -92,3 +92,7 @@ Learn how to quantize models in the [Quantization](../quantization) guide.
## QuarkConfig
[[autodoc]] QuarkConfig
## AutoRoundConfig
[[autodoc]] AutoRoundConfig

View File

@ -0,0 +1,121 @@
<!--Copyright 2025 The BitNet Team and The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# BitNet
## Overview
Trained on a corpus of 4 trillion tokens, this model demonstrates that native 1-bit LLMs can achieve performance comparable to leading open-weight, full-precision models of similar size, while offering substantial advantages in computational efficiency (memory, energy, latency).
➡️ **Technical Report:** [BitNet b1.58 2B4T Technical Report](https://arxiv.org/abs/2504.12285)
➡️ **Official Inference Code:** [microsoft/BitNet (bitnet.cpp)](https://github.com/microsoft/BitNet)
## Model Variants
Several versions of the model weights are available on Hugging Face:
* [**`microsoft/bitnet-b1.58-2B-4T`**](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T): Contains the packed 1.58-bit weights optimized for efficient inference. **Use this for deployment.**
* [**`microsoft/bitnet-b1.58-2B-4T-bf16`**](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T-bf16): Contains the master weights in BF16 format. **Use this only for training or fine-tuning purposes.**
* [**`microsoft/bitnet-b1.58-2B-4T-gguf`**](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T-gguf): Contains the model weights in GGUF format, compatible with the `bitnet.cpp` library for CPU inference.
### Model Details
* **Architecture:** Transformer-based, modified with `BitLinear` layers (BitNet framework).
* Uses Rotary Position Embeddings (RoPE).
* Uses squared ReLU (ReLU²) activation in FFN layers.
* Employs [`subln`](https://proceedings.mlr.press/v202/wang23u.html) normalization.
* No bias terms in linear or normalization layers.
* **Quantization:** Native 1.58-bit weights and 8-bit activations (W1.58A8); a short numerical sketch follows this list.
* Weights are quantized to ternary values {-1, 0, +1} using absmean quantization during the forward pass.
* Activations are quantized to 8-bit integers using absmax quantization (per-token).
* **Crucially, the model was *trained from scratch* with this quantization scheme, not post-training quantized.**
* **Parameters:** ~2 Billion
* **Training Tokens:** 4 Trillion
* **Context Length:** Maximum sequence length of **4096 tokens**.
* *Recommendation:* For optimal performance on tasks requiring very long contexts (beyond the pre-training length or for specialized long-reasoning tasks), we recommend performing intermediate long-sequence adaptation/training before the final fine-tuning stage.
* **Training Stages:**
1. **Pre-training:** Large-scale training on public text/code and synthetic math data using a two-stage learning rate and weight decay schedule.
2. **Supervised Fine-tuning (SFT):** Fine-tuned on instruction-following and conversational datasets using sum loss aggregation and specific hyperparameter tuning.
3. **Direct Preference Optimization (DPO):** Aligned with human preferences using preference pairs.
* **Tokenizer:** LLaMA 3 Tokenizer (vocab size: 128,256).
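The sketch below is a minimal, illustrative reimplementation of the W1.58A8 recipe described above (absmean ternary weights, per-token absmax int8 activations). It is not the packed `BitLinear` kernel used by the released checkpoints; the shapes and the epsilon guard are arbitrary choices for the example.

```py
import torch

def absmean_ternarize(w: torch.Tensor):
    # Scale by the mean absolute value, then round every weight to {-1, 0, +1}.
    scale = w.abs().mean().clamp(min=1e-5)
    return (w / scale).round().clamp(-1, 1), scale

def absmax_int8(x: torch.Tensor):
    # Per-token (last dimension) absmax quantization to signed 8-bit integers.
    scale = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-5) / 127.0
    return (x / scale).round().clamp(-128, 127), scale

w = torch.randn(128, 64)   # a BitLinear weight matrix (illustrative size)
x = torch.randn(2, 8, 64)  # activations: (batch, sequence, hidden)

w_q, w_scale = absmean_ternarize(w)
x_q, x_scale = absmax_int8(x)

# Dequantize and multiply to approximate the full-precision linear layer.
y = (x_q * x_scale) @ (w_q * w_scale).t()
print(y.shape)  # torch.Size([2, 8, 128])
```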
## Usage tips
**VERY IMPORTANT NOTE ON EFFICIENCY**
> Please do NOT expect performance efficiency gains (in terms of speed, latency, or energy consumption) when using this model with the standard transformers library.
>
> The current execution paths within transformers do not contain the specialized, highly optimized computational kernels required to leverage the advantages of the BitNet architecture. Running the model via transformers will likely result in inference speeds and energy usage comparable to, or potentially worse than, standard full-precision models within this framework on both CPU and GPU.
>
> While you might observe reduced memory usage due to the quantized weights, the primary computational efficiency benefits are not accessible through this standard transformers usage path.
>
> For achieving the efficiency benefits demonstrated in the technical paper, you MUST use the dedicated C++ implementation: [bitnet.cpp](https://github.com/microsoft/BitNet).
### Requirements
```bash
pip install transformers
```
### Example
```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
model_id = "microsoft/bitnet-b1.58-2B-4T"
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
model_id,
torch_dtype=torch.bfloat16
)
# Apply the chat template
messages = [
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": "How are you?"},
]
chat_input = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)
# Generate response
chat_outputs = model.generate(chat_input, max_new_tokens=50)
response = tokenizer.decode(chat_outputs[0][chat_input.shape[-1]:], skip_special_tokens=True) # Decode only the response part
print("\nAssistant Response:", response)
```
## BitNetConfig
[[autodoc]] BitNetConfig
## BitNetModel
[[autodoc]] BitNetModel
- forward
## BitNetForCausalLM
[[autodoc]] BitNetForCausalLM
- forward

View File

@ -1,4 +1,5 @@
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
@ -14,31 +15,146 @@ rendered properly in your Markdown viewer.
-->
# Gemma
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>
## Overview
# Gemma
The Gemma model was proposed in [Gemma: Open Models Based on Gemini Technology and Research](https://blog.google/technology/developers/gemma-open-models/) by Gemma Team, Google.
Gemma models are trained on 6T tokens, and released with 2 versions, 2b and 7b.
[Gemma](https://huggingface.co/papers/2403.08295) is a family of lightweight language models with pretrained and instruction-tuned variants, available in 2B and 7B parameters. The architecture is based on a transformer decoder-only design. It features Multi-Query Attention, rotary positional embeddings (RoPE), GeGLU activation functions, and RMSNorm layer normalization.
The abstract from the paper is the following:
The instruction-tuned variant was fine-tuned with supervised learning on instruction-following data, followed by reinforcement learning from human feedback (RLHF) to align the model outputs with human preferences.
*This work introduces Gemma, a new family of open language models demonstrating strong performance across academic benchmarks for language understanding, reasoning, and safety. We release two sizes of models (2 billion and 7 billion parameters), and provide both pretrained and fine-tuned checkpoints. Gemma outperforms similarly sized open models on 11 out of 18 text-based tasks, and we present comprehensive evaluations of safety and responsibility aspects of the models, alongside a detailed description of our model development. We believe the responsible release of LLMs is critical for improving the safety of frontier models, and for enabling the next wave of LLM innovations*
You can find all the original Gemma checkpoints under the [Gemma](https://huggingface.co/collections/google/gemma-release-65d5efbccdbb8c4202ec078b) release.
Tips:
- The original checkpoints can be converted using the conversion script `src/transformers/models/gemma/convert_gemma_weights_to_hf.py`
> [!TIP]
> Click on the Gemma models in the right sidebar for more examples of how to apply Gemma to different language tasks.
This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ), [Younes Belkada](https://huggingface.co/ybelkada), [Sanchit Gandhi](https://huggingface.co/sanchit-gandhi), [Pedro Cuenca](https://huggingface.co/pcuenq).
The example below demonstrates how to generate text with [`Pipeline`] or the [`AutoModel`] class, and from the command line.
<hfoptions id="usage">
<hfoption id="Pipeline">
```py
import torch
from transformers import pipeline
pipeline = pipeline(
task="text-generation",
model="google/gemma-2b",
torch_dtype=torch.bfloat16,
device="cuda",
)
pipeline("LLMs generate text through a process known as", max_new_tokens=50)
```
</hfoption>
<hfoption id="AutoModel">
```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained(
"google/gemma-2b",
torch_dtype=torch.bfloat16,
device_map="auto",
attn_implementation="sdpa"
)
input_text = "LLMs generate text through a process known as"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = model.generate(**input_ids, max_new_tokens=50, cache_implementation="static")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="transformers-cli">
```bash
echo -e "LLMs generate text through a process known as" | transformers-cli run --task text-generation --model google/gemma-2b --device 0
```
</hfoption>
</hfoptions>
Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.
```py
#!pip install bitsandbytes
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_quant_type="nf4"
)
tokenizer = AutoTokenizer.from_pretrained("google/gemma-7b")
model = AutoModelForCausalLM.from_pretrained(
"google/gemma-7b",
quantization_config=quantization_config,
device_map="auto",
attn_implementation="sdpa"
)
input_text = "LLMs generate text through a process known as."
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
outputs = model.generate(
**input_ids,
max_new_tokens=50,
cache_implementation="static"
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/blob/beb9b5b02246b9b7ee81ddf938f93f44cfeaad19/src/transformers/utils/attention_visualizer.py#L139) to better understand what tokens the model can and cannot attend to.
```py
from transformers.utils.attention_visualizer import AttentionMaskVisualizer
visualizer = AttentionMaskVisualizer("google/gemma-2b")
visualizer("LLMs generate text through a process known as")
```
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/gemma-attn-mask.png"/>
</div>
## Notes
- The original Gemma models support standard kv-caching used in many transformer-based language models. You can use the default [`DynamicCache`] instance or a tuple of tensors for past key values during generation. This makes it compatible with typical autoregressive generation workflows.
```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b")
model = AutoModelForCausalLM.from_pretrained(
"google/gemma-2b",
torch_dtype=torch.bfloat16,
device_map="auto",
attn_implementation="sdpa"
)
input_text = "LLMs generate text through a process known as"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
past_key_values = DynamicCache()
outputs = model.generate(**input_ids, max_new_tokens=50, past_key_values=past_key_values)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```
## GemmaConfig

View File

@ -257,6 +257,7 @@ InternVL models can also handle video inputs. Here is an example of how to perfo
... add_generation_prompt=True,
... tokenize=True,
... return_dict=True,
... num_frames=8,
>>> ).to(model.device, dtype=torch.float16)
>>> output = model.generate(**inputs, max_new_tokens=25)

View File

@ -1,5 +1,4 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
<!--Copyright 2024 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
@ -9,93 +8,95 @@ Unless required by applicable law or agreed to in writing, software distributed
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
</div>
</div>
# Longformer
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
</div>
[Longformer](https://huggingface.co/papers/2004.05150) is a transformer model designed for processing long documents. The self-attention operation usually scales quadratically with sequence length, preventing transformers from processing longer sequences. The Longformer attention mechanism overcomes this by scaling linearly with sequence length. It combines local windowed attention with task-specific global attention, enabling efficient processing of documents with thousands of tokens.
## Overview
You can find all the original Longformer checkpoints under the [Ai2](https://huggingface.co/allenai?search_models=longformer) organization.
The Longformer model was presented in [Longformer: The Long-Document Transformer](https://arxiv.org/pdf/2004.05150.pdf) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
> [!TIP]
> Click on the Longformer models in the right sidebar for more examples of how to apply Longformer to different language tasks.
The abstract from the paper is the following:
The example below demonstrates how to fill the `<mask>` token with [`Pipeline`], [`AutoModel`] and from the command line.
*Transformer-based models are unable to process long sequences due to their self-attention operation, which scales
quadratically with the sequence length. To address this limitation, we introduce the Longformer with an attention
mechanism that scales linearly with sequence length, making it easy to process documents of thousands of tokens or
longer. Longformer's attention mechanism is a drop-in replacement for the standard self-attention and combines a local
windowed attention with a task motivated global attention. Following prior work on long-sequence transformers, we
evaluate Longformer on character-level language modeling and achieve state-of-the-art results on text8 and enwik8. In
contrast to most prior work, we also pretrain Longformer and finetune it on a variety of downstream tasks. Our
pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new state-of-the-art results on
WikiHop and TriviaQA.*
This model was contributed by [beltagy](https://huggingface.co/beltagy). The Authors' code can be found [here](https://github.com/allenai/longformer).
## Usage tips
- Since the Longformer is based on RoBERTa, it doesn't have `token_type_ids`. You don't need to indicate which
token belongs to which segment. Just separate your segments with the separation token `tokenizer.sep_token` (or
`</s>`).
- A transformer model replacing the attention matrices with sparse matrices to go faster. Often, the local context (e.g., what are the two tokens to the left and right?) is enough to take action for a given token. Some preselected input tokens are still given global attention, but the attention matrix has far fewer entries to compute, resulting in a speed-up. See the local attention section for more information.
## Longformer Self Attention
Longformer self attention employs self attention on both a "local" context and a "global" context. Most tokens only
attend "locally" to each other meaning that each token attends to its \\(\frac{1}{2} w\\) previous tokens and
\\(\frac{1}{2} w\\) succeeding tokens with \\(w\\) being the window length as defined in
`config.attention_window`. Note that `config.attention_window` can be of type `List` to define a
different \\(w\\) for each layer. A selected few tokens attend "globally" to all other tokens, as it is
conventionally done for all tokens in `BertSelfAttention`.
Note that "locally" and "globally" attending tokens are projected by different query, key and value matrices. Also note
that every "locally" attending token not only attends to tokens within its window \\(w\\), but also to all "globally"
attending tokens so that global attention is *symmetric*.
The user can define which tokens attend "locally" and which tokens attend "globally" by setting the tensor
`global_attention_mask` at run-time appropriately. All Longformer models employ the following logic for
`global_attention_mask`:
- 0: the token attends "locally",
- 1: the token attends "globally".
For more information please also refer to [`~LongformerModel.forward`] method.
Using Longformer self attention, the memory and time complexity of the query-key matmul operation, which usually
represents the memory and time bottleneck, can be reduced from \\(\mathcal{O}(n_s \times n_s)\\) to
\\(\mathcal{O}(n_s \times w)\\), with \\(n_s\\) being the sequence length and \\(w\\) being the average window
size. It is assumed that the number of "globally" attending tokens is insignificant as compared to the number of
"locally" attending tokens.
For more information, please refer to the official [paper](https://arxiv.org/pdf/2004.05150.pdf).
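To make the `global_attention_mask` semantics above concrete, here is a minimal sketch; the checkpoint and the choice of which token attends globally are only examples:

```python
import torch
from transformers import AutoTokenizer, LongformerModel

tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
model = LongformerModel.from_pretrained("allenai/longformer-base-4096")

inputs = tokenizer("Longformer processes long documents with sparse attention.", return_tensors="pt")

# 0 = the token attends "locally" (windowed), 1 = the token attends "globally".
global_attention_mask = torch.zeros_like(inputs["input_ids"])
global_attention_mask[:, 0] = 1  # give the first (<s>) token global attention

outputs = model(**inputs, global_attention_mask=global_attention_mask)
print(outputs.last_hidden_state.shape)
```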
## Training
[`LongformerForMaskedLM`] is trained the exact same way [`RobertaForMaskedLM`] is
trained and should be used as follows:
<hfoptions id="usage">
<hfoption id="Pipeline">
```python
input_ids = tokenizer.encode("This is a sentence from [MASK] training data", return_tensors="pt")
mlm_labels = tokenizer.encode("This is a sentence from the training data", return_tensors="pt")
import torch
from transformers import pipeline
loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0]
pipeline = pipeline(
task="fill-mask",
model="allenai/longformer-base-4096",
torch_dtype=torch.float16,
device=0
)
pipeline("""San Francisco 49ers cornerback Shawntae Spencer will miss the rest of the <mask> with a torn ligament in his left knee.
Spencer, a fifth-year pro, will be placed on injured reserve soon after undergoing surgery Wednesday to repair the ligament. He injured his knee late in the 49ers road victory at Seattle on Sept. 14, and missed last weeks victory over Detroit.
Tarell Brown and Donald Strickland will compete to replace Spencer with the 49ers, who kept 12 defensive backs on their 53-man roster to start the season. Brown, a second-year pro, got his first career interception last weekend while filling in for Strickland, who also sat out with a knee injury.""")
```
## Resources
</hfoption>
<hfoption id="AutoModel">
- [Text classification task guide](../tasks/sequence_classification)
- [Token classification task guide](../tasks/token_classification)
- [Question answering task guide](../tasks/question_answering)
- [Masked language modeling task guide](../tasks/masked_language_modeling)
- [Multiple choice task guide](../tasks/multiple_choice)
```python
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096")
model = AutoModelForMaskedLM.from_pretrained("allenai/longformer-base-4096")
text = (
"""
San Francisco 49ers cornerback Shawntae Spencer will miss the rest of the <mask> with a torn ligament in his left knee.
Spencer, a fifth-year pro, will be placed on injured reserve soon after undergoing surgery Wednesday to repair the ligament. He injured his knee late in the 49ers road victory at Seattle on Sept. 14, and missed last weeks victory over Detroit.
Tarell Brown and Donald Strickland will compete to replace Spencer with the 49ers, who kept 12 defensive backs on their 53-man roster to start the season. Brown, a second-year pro, got his first career interception last weekend while filling in for Strickland, who also sat out with a knee injury.
"""
)
input_ids = tokenizer([text], return_tensors="pt")["input_ids"]
logits = model(input_ids).logits
masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
probs = logits[0, masked_index].softmax(dim=0)
values, predictions = probs.topk(5)
tokenizer.decode(predictions).split()
```
</hfoption>
<hfoption id="transformers-cli">
```bash
echo -e "San Francisco 49ers cornerback Shawntae Spencer will miss the rest of the <mask> with a torn ligament in his left knee." | transformers-cli run --task fill-mask --model allenai/longformer-base-4096 --device 0
```
</hfoption>
</hfoptions>
## Notes
- Longformer is based on [RoBERTa](https://huggingface.co/docs/transformers/en/model_doc/roberta) and doesn't have `token_type_ids`. You don't need to indicate which token belongs to which segment. You only need to separate the segments with the separation token `</s>` or `tokenizer.sep_token`.
- You can set which tokens can attend locally and which tokens attend globally with the `global_attention_mask` at inference (see this [example](https://huggingface.co/docs/transformers/en/model_doc/longformer#transformers.LongformerModel.forward.example) for more details). A value of `0` means a token attends locally and a value of `1` means a token attends globally.
- [`LongformerForMaskedLM`] is trained like [`RobertaForMaskedLM`] and should be used as shown below.
```py
input_ids = tokenizer.encode("This is a sentence from [MASK] training data", return_tensors="pt")
mlm_labels = tokenizer.encode("This is a sentence from the training data", return_tensors="pt")
loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0]
```
## LongformerConfig
@ -139,9 +140,6 @@ loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0]
[[autodoc]] models.longformer.modeling_tf_longformer.TFLongformerTokenClassifierOutput
<frameworkcontent>
<pt>
## LongformerModel
[[autodoc]] LongformerModel
@ -149,45 +147,42 @@ loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0]
## LongformerForMaskedLM
[[autodoc]] LongformerForMaskedLM
[[autodoc]] LongformerForMaskedLM
- forward
## LongformerForSequenceClassification
[[autodoc]] LongformerForSequenceClassification
[[autodoc]] LongformerForSequenceClassification
- forward
## LongformerForMultipleChoice
[[autodoc]] LongformerForMultipleChoice
[[autodoc]] LongformerForMultipleChoice
- forward
## LongformerForTokenClassification
[[autodoc]] LongformerForTokenClassification
[[autodoc]] LongformerForTokenClassification
- forward
## LongformerForQuestionAnswering
[[autodoc]] LongformerForQuestionAnswering
[[autodoc]] LongformerForQuestionAnswering
- forward
</pt>
<tf>
## TFLongformerModel
[[autodoc]] TFLongformerModel
[[autodoc]] TFLongformerModel
- call
## TFLongformerForMaskedLM
[[autodoc]] TFLongformerForMaskedLM
[[autodoc]] TFLongformerForMaskedLM
- call
## TFLongformerForQuestionAnswering
[[autodoc]] TFLongformerForQuestionAnswering
[[autodoc]] TFLongformerForQuestionAnswering
- call
## TFLongformerForSequenceClassification
@ -197,13 +192,10 @@ loss = model(input_ids, labels=input_ids, masked_lm_labels=mlm_labels)[0]
## TFLongformerForTokenClassification
[[autodoc]] TFLongformerForTokenClassification
[[autodoc]] TFLongformerForTokenClassification
- call
## TFLongformerForMultipleChoice
[[autodoc]] TFLongformerForMultipleChoice
[[autodoc]] TFLongformerForMultipleChoice
- call
</tf>
</frameworkcontent>

View File

@ -14,154 +14,105 @@ rendered properly in your Markdown viewer.
-->
# MBart and MBart-50
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>
# mBART
## Overview of MBart
[mBART](https://huggingface.co/papers/2001.08210) is a multilingual machine translation model that pretrains the entire translation model (encoder-decoder), unlike previous methods that only focused on parts of the model. It is trained with a denoising objective that reconstructs corrupted text, which lets mBART handle both the source language and the target language it translates into.
The MBart model was presented in [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov Marjan
Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
[mBART-50](https://huggingface.co/papers/2008.00401) is pretrained on an additional 25 languages.
According to the abstract, MBART is a sequence-to-sequence denoising auto-encoder pretrained on large-scale monolingual
corpora in many languages using the BART objective. mBART is one of the first methods for pretraining a complete
sequence-to-sequence model by denoising full texts in multiple languages, while previous approaches have focused only
on the encoder, decoder, or reconstructing parts of the text.
You can find all the original mBART checkpoints under the [AI at Meta](https://huggingface.co/facebook?search_models=mbart) organization.
This model was contributed by [valhalla](https://huggingface.co/valhalla). The Authors' code can be found [here](https://github.com/pytorch/fairseq/tree/master/examples/mbart)
> [!TIP]
> Click on the mBART models in the right sidebar for more examples of applying mBART to different language tasks.
### Training of MBart
The example below demonstrates how to translate text with [`Pipeline`] or the [`AutoModel`] class.
MBart is a multilingual encoder-decoder (sequence-to-sequence) model primarily intended for translation task. As the
model is multilingual it expects the sequences in a different format. A special language id token is added in both the
source and target text. The source text format is `X [eos, src_lang_code]` where `X` is the source text. The
target text format is `[tgt_lang_code] X [eos]`. `bos` is never used.
<hfoptions id="usage">
<hfoption id="Pipeline">
The regular [`~MBartTokenizer.__call__`] will encode source text format passed as first argument or with the `text`
keyword, and target text format passed with the `text_label` keyword argument.
```py
import torch
from transformers import pipeline
- Supervised training
```python
>>> from transformers import MBartForConditionalGeneration, MBartTokenizer
>>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
>>> example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
>>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
>>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_romanian, return_tensors="pt")
>>> model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
>>> # forward pass
>>> model(**inputs)
pipeline = pipeline(
task="translation",
model="facebook/mbart-large-50-many-to-many-mmt",
device=0,
torch_dtype=torch.float16,
src_lang="en_XX",
tgt_lang="fr_XX",
)
print(pipeline("UN Chief Says There Is No Military Solution in Syria"))
```
- Generation
</hfoption>
<hfoption id="AutoModel">
While generating the target text set the `decoder_start_token_id` to the target language id. The following
example shows how to translate English to Romanian using the *facebook/mbart-large-en-ro* model.
```py
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
```python
>>> from transformers import MBartForConditionalGeneration, MBartTokenizer
article_en = "UN Chief Says There Is No Military Solution in Syria"
>>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX")
>>> article = "UN Chief Says There Is No Military Solution in Syria"
>>> inputs = tokenizer(article, return_tensors="pt")
>>> translated_tokens = model.generate(**inputs, decoder_start_token_id=tokenizer.lang_code_to_id["ro_RO"])
>>> tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
"Şeful ONU declară că nu există o soluţie militară în Siria"
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", torch_dtype=torch.bfloat16, attn_implementation="sdpa", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer.src_lang = "en_XX"
encoded_hi = tokenizer(article_en, return_tensors="pt").to("cuda")
generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.lang_code_to_id["fr_XX"], cache_implementation="static")
print(tokenizer.batch_decode(generated_tokens, skip_special_tokens=True))
```
## Overview of MBart-50
</hfoption>
</hfoptions>
MBart-50 was introduced in the [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) paper by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav
Chaudhary, Jiatao Gu, Angela Fan. MBart-50 is created using the original *mbart-large-cc25* checkpoint by extending
its embedding layers with randomly initialized vectors for an extra set of 25 language tokens and then pretrained on 50
languages.
## Notes
According to the abstract
- You can check the full list of language codes via `tokenizer.lang_code_to_id.keys()`.
- mBART requires a special language id token in the source and target text during training. The source text format is `X [eos, src_lang_code]` where `X` is the source text. The target text format is `[tgt_lang_code] X [eos]`. The `bos` token is never used. The [`~PreTrainedTokenizerBase.__call__`] method encodes the source text format passed as the first argument or with the `text` keyword, and the target text format passed with the `text_target` keyword.
- Set the `decoder_start_token_id` to the target language id for mBART.
*Multilingual translation models can be created through multilingual finetuning. Instead of finetuning on one
direction, a pretrained model is finetuned on many directions at the same time. It demonstrates that pretrained models
can be extended to incorporate additional languages without loss of performance. Multilingual finetuning improves on
average 1 BLEU over the strongest baselines (being either multilingual from scratch or bilingual finetuning) while
improving 9.3 BLEU on average over bilingual baselines from scratch.*
```py
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-en-ro", torch_dtype=torch.bfloat16, attn_implementation="sdpa", device_map="auto")
tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX")
### Training of MBart-50
article = "UN Chief Says There Is No Military Solution in Syria"
inputs = tokenizer(article, return_tensors="pt")
The text format for MBart-50 is slightly different from mBART. For MBart-50 the language id token is used as a prefix
for both source and target text i.e the text format is `[lang_code] X [eos]`, where `lang_code` is source
language id for source text and target language id for target text, with `X` being the source or target text
respectively.
translated_tokens = model.generate(**inputs, decoder_start_token_id=tokenizer.lang_code_to_id["ro_RO"])
tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
```
- mBART-50 has a different text format. The language id token is used as the prefix for the source and target text. The text format is `[lang_code] X [eos]` where `lang_code` is the source language id for the source text and target language id for the target text. `X` is the source or target text respectively.
- Set the `eos_token_id` as the `decoder_start_token_id` for mBART-50. The target language id is used as the first generated token by passing `forced_bos_token_id` to [`~GenerationMixin.generate`].
MBart-50 has its own tokenizer [`MBart50Tokenizer`].
```py
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
- Supervised training
model = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50-many-to-many-mmt", torch_dtype=torch.bfloat16, attn_implementation="sdpa", device_map="auto")
tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
```python
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
article_ar = "الأمين العام للأمم المتحدة يقول إنه لا يوجد حل عسكري في سوريا."
tokenizer.src_lang = "ar_AR"
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50", src_lang="en_XX", tgt_lang="ro_RO")
src_text = " UN Chief Says There Is No Military Solution in Syria"
tgt_text = "Şeful ONU declară că nu există o soluţie militară în Siria"
model_inputs = tokenizer(src_text, text_target=tgt_text, return_tensors="pt")
model(**model_inputs) # forward pass
```
- Generation
To generate using the mBART-50 multilingual translation models, `eos_token_id` is used as the
`decoder_start_token_id` and the target language id is forced as the first generated token. To force the
target language id as the first generated token, pass the *forced_bos_token_id* parameter to the *generate* method.
The following example shows how to translate from Hindi to French and from Arabic to English using the
*facebook/mbart-large-50-many-to-many-mmt* checkpoint.
```python
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
article_hi = "संयुक्त राष्ट्र के प्रमुख का कहना है कि सीरिया में कोई सैन्य समाधान नहीं है"
article_ar = "الأمين العام للأمم المتحدة يقول إنه لا يوجد حل عسكري في سوريا."
model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
tokenizer = MBart50TokenizerFast.from_pretrained("facebook/mbart-large-50-many-to-many-mmt")
# translate Hindi to French
tokenizer.src_lang = "hi_IN"
encoded_hi = tokenizer(article_hi, return_tensors="pt")
generated_tokens = model.generate(**encoded_hi, forced_bos_token_id=tokenizer.lang_code_to_id["fr_XX"])
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => "Le chef de l 'ONU affirme qu 'il n 'y a pas de solution militaire en Syria."
# translate Arabic to English
tokenizer.src_lang = "ar_AR"
encoded_ar = tokenizer(article_ar, return_tensors="pt")
generated_tokens = model.generate(**encoded_ar, forced_bos_token_id=tokenizer.lang_code_to_id["en_XX"])
tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
# => "The Secretary-General of the United Nations says there is no military solution in Syria."
```
## Documentation resources
- [Text classification task guide](../tasks/sequence_classification)
- [Question answering task guide](../tasks/question_answering)
- [Causal language modeling task guide](../tasks/language_modeling)
- [Masked language modeling task guide](../tasks/masked_language_modeling)
- [Translation task guide](../tasks/translation)
- [Summarization task guide](../tasks/summarization)
## MBartConfig
@ -253,4 +204,4 @@ tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
- decode
</jax>
</frameworkcontent>

View File

@ -77,6 +77,11 @@ If you're interested in submitting a resource to be included here, please feel f
[[autodoc]] MobileNetV1ImageProcessor
- preprocess
## MobileNetV1ImageProcessorFast
[[autodoc]] MobileNetV1ImageProcessorFast
- preprocess
## MobileNetV1Model
[[autodoc]] MobileNetV1Model

View File

@ -13,166 +13,117 @@ specific language governing permissions and limitations under the License.
rendered properly in your Markdown viewer.
-->
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>
# Phi
[Phi](https://huggingface.co/papers/2306.11644) is a 1.3B parameter transformer model optimized for Python code generation. It focuses on "textbook-quality" training data of code examples, exercises and synthetic Python problems rather than scaling the model size or compute.
## Overview
You can find all the original Phi checkpoints under the [Phi-1](https://huggingface.co/collections/microsoft/phi-1-6626e29134744e94e222d572) collection.
The Phi-1 model was proposed in [Textbooks Are All You Need](https://arxiv.org/abs/2306.11644) by Suriya Gunasekar, Yi Zhang, Jyoti Aneja, Caio César Teodoro Mendes, Allie Del Giorno, Sivakanth Gopi, Mojan Javaheripi, Piero Kauffmann, Gustavo de Rosa, Olli Saarikivi, Adil Salim, Shital Shah, Harkirat Singh Behl, Xin Wang, Sébastien Bubeck, Ronen Eldan, Adam Tauman Kalai, Yin Tat Lee and Yuanzhi Li.
> [!TIP]
> Click on the Phi models in the right sidebar for more examples of how to apply Phi to different language tasks.
The Phi-1.5 model was proposed in [Textbooks Are All You Need II: phi-1.5 technical report](https://arxiv.org/abs/2309.05463) by Yuanzhi Li, Sébastien Bubeck, Ronen Eldan, Allie Del Giorno, Suriya Gunasekar and Yin Tat Lee.
The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`] and from the command line.
### Summary
<hfoptions id="usage">
<hfoption id="Pipeline">
In the Phi-1 and Phi-1.5 papers, the authors showed how important the quality of the training data is relative to the model size.
They selected high-quality “textbook” data alongside synthetically generated data to train their small Transformer-based model Phi-1 with 1.3B parameters. Despite this small scale, phi-1 attains a pass@1 accuracy of 50.6% on HumanEval and 55.5% on MBPP.
They followed the same strategy for Phi-1.5 and created another 1.3B parameter model with performance on natural language tasks comparable to models 5x larger, surpassing most non-frontier LLMs. Phi-1.5 exhibits many of the traits of much larger LLMs, such as the ability to “think step by step” or perform some rudimentary in-context learning.
With these two experiments, the authors demonstrated the outsized impact of training data quality when training machine learning models.
```py
import torch
from transformers import pipeline
pipeline = pipeline(task="text-generation", model="microsoft/phi-1.5", device=0, torch_dtype=torch.bfloat16)
pipeline('''def print_prime(n):
   """
   Print all primes between 1 and n
   """''')
```

The abstract from the Phi-1 paper is the following:

*We introduce phi-1, a new large language model for code, with significantly smaller size than competing models: phi-1 is a Transformer-based model with 1.3B parameters, trained for 4 days on 8 A100s, using a selection of “textbook quality” data from the web (6B tokens) and synthetically generated textbooks and exercises with GPT-3.5 (1B tokens). Despite this small scale, phi-1 attains pass@1 accuracy 50.6% on HumanEval and 55.5% on MBPP. It also displays surprising emergent properties compared to phi-1-base, our model before our finetuning stage on a dataset of coding exercises, and phi-1-small, a smaller model with 350M parameters trained with the same pipeline as phi-1 that still achieves 45% on HumanEval.*
The abstract from the Phi-1.5 paper is the following:
*We continue the investigation into the power of smaller Transformer-based language models as
initiated by TinyStories a 10 million parameter model that can produce coherent English and
the follow-up work on phi-1, a 1.3 billion parameter model with Python coding performance close
to the state-of-the-art. The latter work proposed to use existing Large Language Models (LLMs) to
generate “textbook quality” data as a way to enhance the learning process compared to traditional
web data. We follow the “Textbooks Are All You Need” approach, focusing this time on common
sense reasoning in natural language, and create a new 1.3 billion parameter model named phi-1.5,
with performance on natural language tasks comparable to models 5x larger, and surpassing most
non-frontier LLMs on more complex reasoning tasks such as grade-school mathematics and basic
coding. More generally, phi-1.5 exhibits many of the traits of much larger LLMs, both good such
as the ability to “think step by step” or perform some rudimentary in-context learning and bad,
including hallucinations and the potential for toxic and biased generations; encouragingly though, we
are seeing improvement on that front thanks to the absence of web data. We open-source phi-1.5 to
promote further research on these urgent topics.*
This model was contributed by [Susnato Dhar](https://huggingface.co/susnato).
The original code for Phi-1, Phi-1.5 and Phi-2 can be found [here](https://huggingface.co/microsoft/phi-1), [here](https://huggingface.co/microsoft/phi-1_5) and [here](https://huggingface.co/microsoft/phi-2), respectively.
## Usage tips
- This model is quite similar to `Llama`, the main difference being [`PhiDecoderLayer`], which uses the [`PhiAttention`] and [`PhiMLP`] layers in a parallel configuration (sketched below).
- The tokenizer used for this model is identical to the [`CodeGenTokenizer`].
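
The parallel configuration mentioned above means the attention and MLP branches read the same normalized hidden states and both add into the residual stream. A schematic sketch (illustrative only, not the actual [`PhiDecoderLayer`] implementation; the `attn` and `mlp` arguments are placeholder modules):

```py
import torch.nn as nn

class ParallelBlockSketch(nn.Module):
    """Schematic only: Phi-style parallel residual update,
    hidden = hidden + attn(norm(hidden)) + mlp(norm(hidden)),
    in contrast to the sequential Llama-style update where the MLP
    runs on the output of the attention sub-block."""

    def __init__(self, attn: nn.Module, mlp: nn.Module, hidden_size: int):
        super().__init__()
        self.input_layernorm = nn.LayerNorm(hidden_size)
        self.attn = attn
        self.mlp = mlp

    def forward(self, hidden_states):
        residual = hidden_states
        hidden_states = self.input_layernorm(hidden_states)
        # Both branches consume the same normalized input and are summed into the residual.
        return residual + self.attn(hidden_states) + self.mlp(hidden_states)
```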
## How to use Phi-2
<Tip warning={true}>
Phi-2 has been integrated in the development version (4.37.0.dev) of `transformers`. Until the official version is released through `pip`, ensure that you are doing one of the following:
* When loading the model, ensure that `trust_remote_code=True` is passed as an argument of the `from_pretrained()` function.
* Update your local `transformers` to the development version: `pip uninstall -y transformers && pip install git+https://github.com/huggingface/transformers`. The previous command is an alternative to cloning and installing from the source.
</Tip>
```python
>>> from transformers import AutoModelForCausalLM, AutoTokenizer
>>> model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2")
>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
>>> inputs = tokenizer('Can you help me write a formal email to a potential business partner proposing a joint venture?', return_tensors="pt", return_attention_mask=False)
>>> outputs = model.generate(**inputs, max_length=30)
>>> text = tokenizer.batch_decode(outputs)[0]
>>> print(text)
Can you help me write a formal email to a potential business partner proposing a joint venture?
Input: Company A: ABC Inc.
Company B
```
</hfoption>
<hfoption id="AutoModel">

```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1")
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1", torch_dtype=torch.float16, device_map="auto", attn_implementation="sdpa")

input_ids = tokenizer('''def print_prime(n):
   """
   Print all primes between 1 and n
   """''', return_tensors="pt").to("cuda")

output = model.generate(**input_ids, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

</hfoption>
<hfoption id="transformers-cli">

```bash
echo -e "'''def print_prime(n): \"\"\" Print all primes between 1 and n\"\"\"'''" | transformers-cli run --task text-generation --model microsoft/phi-1.5 --device 0
```

</hfoption>
</hfoptions>

### Example :

```python
>>> from transformers import PhiForCausalLM, AutoTokenizer

>>> # define the model and tokenizer.
>>> model = PhiForCausalLM.from_pretrained("microsoft/phi-1_5")
>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")

>>> # feel free to change the prompt to your liking.
>>> prompt = "If I were an AI that had just achieved"

>>> # apply the tokenizer.
>>> tokens = tokenizer(prompt, return_tensors="pt")

>>> # use the model to generate new tokens.
>>> generated_output = model.generate(**tokens, use_cache=True, max_new_tokens=10)
>>> tokenizer.batch_decode(generated_output)[0]
'If I were an AI that had just achieved a breakthrough in machine learning, I would be thrilled'
```

## Combining Phi and Flash Attention 2

First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature.

```bash
pip install -U flash-attn --no-build-isolation
```

Also make sure that your hardware is compatible with Flash Attention 2 (see the official documentation of the flash-attn repository for details), and load your model in half-precision (e.g. `torch.float16`).

To load and run a model using Flash Attention 2, refer to the snippet below:

```python
>>> import torch
>>> from transformers import PhiForCausalLM, AutoTokenizer

>>> # define the model and tokenizer and push the model and tokens to the GPU.
>>> model = PhiForCausalLM.from_pretrained("microsoft/phi-1_5", torch_dtype=torch.float16, attn_implementation="flash_attention_2").to("cuda") # doctest: +SKIP
>>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")

>>> # feel free to change the prompt to your liking.
>>> prompt = "If I were an AI that had just achieved"

>>> # apply the tokenizer.
>>> tokens = tokenizer(prompt, return_tensors="pt").to("cuda")

>>> # use the model to generate new tokens.
>>> generated_output = model.generate(**tokens, use_cache=True, max_new_tokens=10) # doctest: +SKIP
>>> tokenizer.batch_decode(generated_output)[0] # doctest: +SKIP
'If I were an AI that had just achieved a breakthrough in machine learning, I would be thrilled'
```

### Expected speedups

Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using the `microsoft/phi-1` checkpoint and the Flash Attention 2 version of the model, using a sequence length of 2048.

<div style="text-align: center">
<img src="https://huggingface.co/datasets/ybelkada/documentation-images/resolve/main/phi_1_speedup_plot.jpg">
</div>

Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.

The example below uses [bitsandbytes](https://huggingface.co/docs/transformers/en/quantization/bitsandbytes) to only quantize the weights to 4-bits.

```py
import torch
from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM

bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16, bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1")
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-1", torch_dtype=torch.float16, device_map="auto", attn_implementation="sdpa", quantization_config=bnb_config)

input_ids = tokenizer('''def print_prime(n):
   """
   Print all primes between 1 and n
   """''', return_tensors="pt").to("cuda")

output = model.generate(**input_ids, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```

## Notes

- If you're using Transformers < 4.37.0.dev, set `trust_remote_code=True` in [`~AutoModel.from_pretrained`]. Otherwise, make sure you update Transformers to the latest stable version.
```py
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1")
model = AutoModelForCausalLM.from_pretrained(
"microsoft/phi-1",
torch_dtype=torch.float16,
device_map="auto",
trust_remote_code=True,
attn_implementation="sdpa")
input_ids = tokenizer('''def print_prime(n):
"""
Print all primes between 1 and n
"""''', return_tensors="pt").to("cuda")
output = model.generate(**input_ids, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
## PhiConfig
[[autodoc]] PhiConfig
<frameworkcontent>
<pt>
## PhiModel
[[autodoc]] PhiModel
@ -193,6 +144,3 @@ Below is an expected speedup diagram that compares pure inference time between t
[[autodoc]] PhiForTokenClassification
- forward
</pt>
</frameworkcontent>

View File

@ -73,6 +73,11 @@ If you're interested in submitting a resource to be included here, please feel f
[[autodoc]] PoolFormerImageProcessor
- preprocess
## PoolFormerImageProcessorFast
[[autodoc]] PoolFormerImageProcessorFast
- preprocess
## PoolFormerModel
[[autodoc]] PoolFormerModel

View File

@ -64,6 +64,11 @@ This model was contributed by [Xrenya](https://huggingface.co/Xrenya). The origi
[[autodoc]] PvtImageProcessor
- preprocess
## PvtImageProcessorFast
[[autodoc]] PvtImageProcessorFast
- preprocess
## PvtForImageClassification
[[autodoc]] PvtForImageClassification

View File

@ -59,7 +59,7 @@ model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
)
processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
conversation = [
conversations = [
{
"role": "system",
"content": [
@ -115,7 +115,7 @@ model = Qwen2_5OmniThinkerForConditionalGeneration.from_pretrained(
)
processor = Qwen2_5OmniProcessor.from_pretrained("Qwen/Qwen2.5-Omni-7B")
conversation = [
conversations = [
{
"role": "system",
"content": [

View File

@ -14,184 +14,116 @@ rendered properly in your Markdown viewer.
-->
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>
# SigLIP
[SigLIP](https://huggingface.co/papers/2303.15343) is a multimodal image-text model similar to [CLIP](clip). It uses separate image and text encoders to generate representations for both modalities.
## Overview
Unlike CLIP, SigLIP employs a pairwise sigmoid loss on image-text pairs during training. This training loss eliminates the need for a global view of all pairwise similarities between images and texts within a batch. Consequently, it enables more efficient scaling to larger batch sizes while also delivering superior performance with smaller batch sizes.
The SigLIP model was proposed in [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) by Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer. SigLIP proposes to replace the loss function used in [CLIP](clip) by a simple pairwise sigmoid loss. This results in better performance in terms of zero-shot classification accuracy on ImageNet.
You can find all the original SigLIP checkpoints under the [SigLIP](https://huggingface.co/collections/google/siglip-659d5e62f0ae1a57ae0e83ba) collection.
The abstract from the paper is the following:
*We propose a simple pairwise Sigmoid loss for Language-Image Pre-training (SigLIP). Unlike standard contrastive learning with softmax normalization, the sigmoid loss operates solely on image-text pairs and does not require a global view of the pairwise similarities for normalization. The sigmoid loss simultaneously allows further scaling up the batch size, while also performing better at smaller batch sizes. Combined with Locked-image Tuning, with only four TPUv4 chips, we train a SigLiT model that achieves 84.5% ImageNet zero-shot accuracy in two days. The disentanglement of the batch size from the loss further allows us to study the impact of examples vs pairs and negative to positive ratio. Finally, we push the batch size to the extreme, up to one million, and find that the benefits of growing batch size quickly diminish, with a more reasonable batch size of 32k being sufficient.*
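
As a rough illustration of the pairwise sigmoid objective described above (a sketch following the paper's formulation, not the training code used for the released checkpoints; the temperature `t` and bias `b` values below are assumptions about typical initialization):

```py
import torch
import torch.nn.functional as F

def sigmoid_contrastive_loss(image_embeds, text_embeds, t, b):
    # image_embeds, text_embeds: L2-normalized tensors of shape [batch, dim]
    logits = image_embeds @ text_embeds.T * t + b   # all pairwise similarities
    labels = 2 * torch.eye(logits.size(0)) - 1      # +1 for matching pairs, -1 otherwise
    # every image-text pair is treated as an independent binary classification problem
    return -F.logsigmoid(labels * logits).sum() / logits.size(0)

image_embeds = F.normalize(torch.randn(8, 512), dim=-1)
text_embeds = F.normalize(torch.randn(8, 512), dim=-1)
print(sigmoid_contrastive_loss(image_embeds, text_embeds, t=10.0, b=-10.0))
```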
> [!TIP]
> Click on the SigLIP models in the right sidebar for more examples of how to apply SigLIP to different image and text tasks.
## Usage tips
The example below demonstrates how to generate similarity scores between texts and image(s) with [`Pipeline`] or the [`AutoModel`] class.
- Usage of SigLIP is similar to [CLIP](clip). The main difference is the training loss, which does not require a global view of all the pairwise similarities of images and texts within a batch. One needs to apply the sigmoid activation function to the logits, rather than the softmax.
- Training is supported, but it does not use `torch.distributed` utilities, which may limit the scalability of batch size. However, DDP and FSDP work on single-node multi-GPU setups.
- When using the standalone [`SiglipTokenizer`] or [`SiglipProcessor`], make sure to pass `padding="max_length"` as that's how the model was trained.
- To get the same results as the pipeline, a prompt template of "This is a photo of {label}." should be used.
<hfoptions id="usage">
<hfoption id="Pipeline">
```py
import torch
from transformers import pipeline

image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
candidate_labels = ["a Pallas cat", "a lion", "a Siberian tiger"]

pipeline = pipeline(task="zero-shot-image-classification", model="google/siglip-base-patch16-224", device=0, torch_dtype=torch.bfloat16)
pipeline(image, candidate_labels=candidate_labels)
```

</hfoption>
<hfoption id="AutoModel">

```py
import torch
import requests
from PIL import Image
from transformers import AutoProcessor, AutoModel

model = AutoModel.from_pretrained("google/siglip-base-patch16-224", torch_dtype=torch.float16, device_map="auto", attn_implementation="sdpa")
processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
candidate_labels = ["a Pallas cat", "a lion", "a Siberian tiger"]
texts = [f'This is a photo of {label}.' for label in candidate_labels]
inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt").to("cuda")

with torch.no_grad():
    outputs = model(**inputs)

logits_per_image = outputs.logits_per_image
probs = torch.sigmoid(logits_per_image)
print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
```

<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/siglip_table.jpeg"
alt="drawing" width="600"/>

<small> SigLIP evaluation results compared to CLIP. Taken from the <a href="https://arxiv.org/abs/2303.15343">original paper</a>.</small>

This model was contributed by [nielsr](https://huggingface.co/nielsr).
The original code can be found [here](https://github.com/google-research/big_vision/tree/main).

## Usage example

There are 2 main ways to use SigLIP: either using the pipeline API, which abstracts away all the complexity for you, or by using the `SiglipModel` class yourself.

### Pipeline API

The pipeline allows you to use the model in a few lines of code:

```python
>>> from transformers import pipeline
>>> from PIL import Image
>>> import requests

>>> # load pipe
>>> image_classifier = pipeline(task="zero-shot-image-classification", model="google/siglip-base-patch16-224")

>>> # load image
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> # inference
>>> candidate_labels = ["2 cats", "a plane", "a remote"]
>>> outputs = image_classifier(image, candidate_labels=candidate_labels)
>>> outputs = [{"score": round(output["score"], 4), "label": output["label"] } for output in outputs]
>>> print(outputs)
[{'score': 0.1979, 'label': '2 cats'}, {'score': 0.0, 'label': 'a remote'}, {'score': 0.0, 'label': 'a plane'}]
```

### Using the model yourself

If you want to do the pre- and postprocessing yourself, here's how to do that:

```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AutoModel
>>> import torch

>>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
>>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)

>>> candidate_labels = ["2 cats", "2 dogs"]
>>> # follows the pipeline prompt template to get same results
>>> texts = [f'This is a photo of {label}.' for label in candidate_labels]
>>> # important: we pass `padding=max_length` since the model was trained with this
>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

>>> with torch.no_grad():
...     outputs = model(**inputs)

>>> logits_per_image = outputs.logits_per_image
>>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
19.8% that image 0 is '2 cats'
```
## Resources
</hfoption>
</hfoptions>
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SigLIP.
Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
- [Zero-shot image classification task guide](../tasks/zero_shot_image_classification)
- Demo notebooks for SigLIP can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SigLIP). 🌎
The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.
If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
```py
import torch
import requests
from PIL import Image
from transformers import AutoProcessor, AutoModel, BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModel.from_pretrained("google/siglip-base-patch16-224", quantization_config=bnb_config, device_map="auto", attn_implementation="sdpa")
processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
## Combining SigLIP and Flash Attention 2
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
candidate_labels = ["a Pallas cat", "a lion", "a Siberian tiger"]
texts = [f'This is a photo of {label}.' for label in candidate_labels]
inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt").to("cuda")
First, make sure to install the latest version of Flash Attention 2.
with torch.no_grad():
outputs = model(**inputs)
```bash
pip install -U flash-attn --no-build-isolation
logits_per_image = outputs.logits_per_image
probs = torch.sigmoid(logits_per_image)
print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
```
## Notes
Also make sure that your hardware is compatible with Flash Attention 2 (see the official documentation of the flash-attn repository for details), and load your model in half-precision (e.g. `torch.float16`).
- Training is supported for DDP and FSDP on single-node multi-GPU setups. However, it does not use [torch.distributed](https://pytorch.org/tutorials/beginner/dist_overview.html) utilities which may limit the scalability of batch size.
- When using the standalone [`SiglipTokenizer`] or [`SiglipProcessor`], make sure to pass `padding="max_length"` because that is how the model was trained.
- To get the same results as the [`Pipeline`], a prompt template of `"This is a photo of {label}."` should be passed to the processor.
- Toggle the `attn_implementation` parameter to either `"sdpa"` or `"flash_attention_2"` to use a more memory-efficient attention.
```py
# pip install -U flash-attn --no-build-isolation
To load and run a model using Flash Attention 2, refer to the snippet below:
from transformers import SiglipModel
```python
>>> import torch
>>> import requests
>>> from PIL import Image
>>> from transformers import SiglipProcessor, SiglipModel
>>> device = "cuda" # the device to load the model onto
>>> model = SiglipModel.from_pretrained(
... "google/siglip-so400m-patch14-384",
... attn_implementation="flash_attention_2",
... torch_dtype=torch.float16,
... device_map=device,
... )
>>> processor = SiglipProcessor.from_pretrained("google/siglip-so400m-patch14-384")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> candidate_labels = ["2 cats", "2 dogs"]
# follows the pipeline prompt template to get same results
>>> texts = [f'This is a photo of {label}.' for label in candidate_labels]
# important: we pass `padding=max_length` since the model was trained with this
>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt").to(device)
>>> with torch.no_grad():
... with torch.autocast(device):
... outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image
>>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
19.8% that image 0 is '2 cats'
```
## Using Scaled Dot Product Attention (SDPA)
PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
page for more information.
You may set `attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used. Make sure you have `torch>=2.1.1`.
```python
>>> from transformers import SiglipModel
>>> model = SiglipModel.from_pretrained(
... "google/siglip-so400m-patch14-384",
... attn_implementation="sdpa",
... torch_dtype=torch.float16,
... device_map=device,
... )
```
For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
## Expected speedups
Below is an expected speedup diagram that compares inference time between the native implementation in transformers using `google/siglip-so400m-patch14-384` checkpoint in `float16` precision and the Flash Attention 2 / SDPA version of the model using different batch sizes.
<div style="text-align: center">
<img src="https://i.imgur.com/cWm4rsn.png">
</div>
model = SiglipModel.from_pretrained(
"google/siglip-so400m-patch14-384",
attn_implementation="flash_attention_2",
torch_dtype=torch.float16,
device_map=device,
)
```
## SiglipConfig

View File

@ -14,225 +14,160 @@ rendered properly in your Markdown viewer.
-->
<div style="float: right;">
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
</div>
# SigLIP2
## Overview
The SigLIP2 model was proposed in [SigLIP 2: Multilingual Vision-Language Encoders with Improved Semantic Understanding, Localization, and Dense Features](https://huggingface.co/papers/2502.14786) by Michael Tschannen, Alexey Gritsenko, Xiao Wang, Muhammad Ferjad Naeem, Ibrahim Alabdulmohsin,
Nikhil Parthasarathy, Talfan Evans, Lucas Beyer, Ye Xia, Basil Mustafa, Olivier Hénaff, Jeremiah Harmsen,
Andreas Steiner and Xiaohua Zhai.
[SigLIP2](https://huggingface.co/papers/2502.14786) is a family of multilingual vision-language encoders that builds on the [SigLIP](./siglip) training recipe. It includes decoder-based pretraining, self-distillation, and masked prediction to improve dense prediction tasks (segmentation, depth estimation, etc.). This model is available in two variants:
- NaFlex supports different resolutions and maintains the native image aspect ratio
- FixRes supports fixed resolutions and is backwards compatible with [SigLIP](./siglip)

The model comes in two variants:

1) FixRes - model works with fixed resolution images (backward compatible with SigLIP v1)
2) NaFlex - model works with variable image aspect ratios and resolutions (SigLIP2 in `transformers`)
The abstract from the paper is the following:
You can find all the original SigLIP2 checkpoints under the [SigLIP2](https://huggingface.co/collections/google/siglip2-67b5dcef38c175486e240107) collection.
*We introduce SigLIP 2, a family of new multilingual vision-language encoders that build on the success
of the original SigLIP. In this second iteration, we extend the original image-text training objective with
several prior, independently developed techniques into a unified recipe—this includes decoder-based
pretraining, self-supervised losses (self-distillation, masked prediction) and online data curation. With
these changes, SigLIP 2 models outperform their SigLIP counterparts at all model scales in core capabilities,
including zero-shot classification (best SigLIP 2 ViT-g/16 achieves 85.0% ImageNet zero-shot
accuracy), image-text retrieval, and transfer performance when extracting visual representations for
Vision-Language Models (VLMs). Furthermore, the new training recipe leads to significant improvements
on localization and dense prediction tasks. We also train variants which support multiple resolutions
and preserve the input's native aspect ratio. Finally, we train on a more diverse data-mixture that
includes de-biasing techniques, leading to much better multilingual understanding and improved fairness.
To provide users with the ability to trade-off inference cost with performance, we release model
checkpoints at four sizes (ViT-B/86M, L/303M, So400m/400M, and g/1B).*
> [!TIP]
> Click on the SigLIP2 models in the right sidebar for more examples of how to apply SigLIP2 to different image and text tasks.
## Usage tips
The example below demonstrates zero-shot classification with [`Pipeline`] or the [`AutoModel`] class.
- Usage of SigLIP2 is similar to [SigLIP](siglip) and [CLIP](clip). The main difference from CLIP is the training loss, which does not require a global view of all the pairwise similarities of images and texts within a batch. One needs to apply the sigmoid activation function to the logits, rather than the softmax.
- Training is supported, but it does not use `torch.distributed` utilities, which may limit the scalability of batch size. However, DDP and FSDP work on single-node multi-GPU setups.
- When using the standalone [`GemmaTokenizerFast`] make sure to pass `padding="max_length"` and `max_length=64` as that's how the model was trained.
- The model was trained with *lowercased* text, so make sure your text labels are preprocessed the same way.
- To get the same results as the pipeline, a prompt template of "this is a photo of {label}" should be used.
- The NaFlex variant supports processing images at higher resolutions by adjusting the `max_num_patches` parameter in the `Processor`. The default value is `max_num_patches=256`. Increasing `max_num_patches` to 1024 (4x) will approximately double processed image height and width, while preserving the aspect ratio.
<hfoptions id="usage">
<hfoption id="Pipeline">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/siglip2_metrics_table.png"
alt="drawing" width="600"/>
```py
import torch
from transformers import pipeline
This model was contributed by [qubvel](https://huggingface.co/qubvel-hf).
The original code can be found [here](https://github.com/google-research/big_vision/tree/main).
image = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
candidate_labels = ["a Pallas cat", "a lion", "a Siberian tiger"]
## Usage example
There are 2 main ways to use SigLIP2: either using the pipeline API, which abstracts away all the complexity for you, or by using the `Siglip2Model` class yourself.
### FixRes variant
**Pipeline API**
The pipeline allows to use the model in a few lines of code:
```python
>>> from transformers import pipeline
>>> from PIL import Image
>>> import requests
>>> # load pipe
>>> image_classifier = pipeline(
... task="zero-shot-image-classification",
... model="google/siglip2-base-patch16-224",
... )
>>> # load image
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> # inference
>>> candidate_labels = ["2 cats", "a plane", "a remote"]
>>> outputs = image_classifier(image, candidate_labels=candidate_labels)
>>> outputs = [{"score": round(output["score"], 4), "label": output["label"] } for output in outputs]
>>> print(outputs)
[{'score': 0.1499, 'label': '2 cats'}, {'score': 0.0008, 'label': 'a remote'}, {'score': 0.0, 'label': 'a plane'}]
pipeline = pipeline(task="zero-shot-image-classification", model="google/siglip2-base-patch16-224", device=0, torch_dtype=torch.bfloat16)
pipeline(image, candidate_labels=candidate_labels)
```
**Using the model yourself**
</hfoption>
<hfoption id="AutoModel (FixRes)">
If you want to do the pre- and postprocessing yourself, here's how to do that:
```py
import torch
import requests
from PIL import Image
from transformers import AutoProcessor, AutoModel
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AutoModel
>>> import torch
model = AutoModel.from_pretrained("google/siglip2-base-patch16-224", torch_dtype=torch.float16, device_map="auto", attn_implementation="sdpa")
processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")
>>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
>>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
candidate_labels = ["a Pallas cat", "a lion", "a Siberian tiger"]
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> candidate_labels = ["2 cats", "2 dogs"]
# follows the pipeline prompt template to get same results
>>> texts = [f"This is a photo of {label}." for label in candidate_labels]
texts = [f'This is a photo of {label}.' for label in candidate_labels]
# IMPORTANT: we pass `padding=max_length` and `max_length=64` since the model was trained with this
>>> inputs = processor(text=texts, images=image, padding="max_length", max_length=64, return_tensors="pt")
inputs = processor(text=texts, images=image, padding="max_length", max_length=64, return_tensors="pt").to("cuda")
>>> with torch.no_grad():
... outputs = model(**inputs)
with torch.no_grad():
outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image
>>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
15.0% that image 0 is '2 cats'
logits_per_image = outputs.logits_per_image
probs = torch.sigmoid(logits_per_image)
print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
```
### NaFlex variant
</hfoption>
<hfoption id="AutoModel (NaFlex)">
NaFlex combines ideas from FlexiViT, i.e. supporting multiple, predefined sequence lengths
with a single ViT model, and NaViT, namely processing images at their native aspect ratio.
This enables processing different types of images at appropriate resolution, e.g. using a
larger resolution to process document images, while at the same time minimizing the impact
of aspect ratio distortion on certain inference tasks, e.g. on OCR.
```py
import torch
import requests
from PIL import Image
from transformers import AutoProcessor, AutoModel
Given a patch size and target sequence length, NaFlex preprocesses the data by first resizing
the input image such that the height and width after resizing are multiples of the patch size,
while
1. keeping the aspect ratio distortion as small as possible
2. producing a sequence length of at most the desired target sequence length (`max_num_patches`)
The resulting distortion in width and height is at most `(patch_size - 1) / width` and
`(patch_size - 1) / height`, respectively, which tends to be small for common resolutions and aspect ratios.
After resizing, the image is split into a sequence of patches, and a mask with padding information is added.
model = AutoModel.from_pretrained("google/siglip2-base-patch16-naflex", torch_dtype=torch.float16, device_map="auto", attn_implementation="sdpa")
processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-naflex")
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AutoModel
>>> import torch
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
candidate_labels = ["a Pallas cat", "a lion", "a Siberian tiger"]
texts = [f'This is a photo of {label}.' for label in candidate_labels]
>>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-naflex")
>>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-naflex")
# default value for `max_num_patches` is 256, but you can increase the resulting image resolution by providing higher values, e.g. `max_num_patches=512`
inputs = processor(text=texts, images=image, padding="max_length", max_num_patches=256, return_tensors="pt").to("cuda")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
with torch.no_grad():
outputs = model(**inputs)
logits_per_image = outputs.logits_per_image
probs = torch.sigmoid(logits_per_image)
print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
```
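
The NaFlex resizing rule described earlier (output height and width are multiples of the patch size, aspect-ratio distortion is kept small, and the patch count stays within `max_num_patches`) can be approximated with a short helper. This is an illustrative sketch, not the processor's exact implementation:

```py
import math

def naflex_target_size(height, width, patch_size=16, max_num_patches=256):
    # Isotropic scale so the resulting patch grid roughly fits the budget.
    scale = math.sqrt(max_num_patches * patch_size**2 / (height * width))
    new_h = max(patch_size, round(height * scale / patch_size) * patch_size)
    new_w = max(patch_size, round(width * scale / patch_size) * patch_size)
    # Rounding both sides up can overshoot the budget; trim the larger side if needed.
    while (new_h // patch_size) * (new_w // patch_size) > max_num_patches:
        if new_h >= new_w:
            new_h -= patch_size
        else:
            new_w -= patch_size
    return new_h, new_w

print(naflex_target_size(480, 640))  # (224, 288) -> 14 * 18 = 252 patches <= 256
```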
</hfoption>
</hfoptions>
Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.
```py
import torch
import requests
from PIL import Image
from transformers import AutoProcessor, AutoModel, BitsAndBytesConfig
bnb_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModel.from_pretrained("google/siglip2-large-patch16-512", quantization_config=bnb_config, device_map="auto", attn_implementation="sdpa")
processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")
url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
image = Image.open(requests.get(url, stream=True).raw)
candidate_labels = ["a Pallas cat", "a lion", "a Siberian tiger"]
>>> candidate_labels = ["2 cats", "2 dogs"]
# follows the pipeline prompt template to get same results
>>> texts = [f"This is a photo of {label}." for label in candidate_labels]
texts = [f'This is a photo of {label}.' for label in candidate_labels]
# default value for `max_num_patches` is 256, but you can increase resulted image resolution providing
# higher values e.g. `max_num_patches=512`
>>> inputs = processor(text=texts, images=image, max_num_patches=256, return_tensors="pt")
# IMPORTANT: we pass `padding=max_length` and `max_length=64` since the model was trained with this
inputs = processor(text=texts, images=image, padding="max_length", max_length=64, return_tensors="pt").to("cuda")
>>> with torch.no_grad():
... outputs = model(**inputs)
with torch.no_grad():
outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image
>>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
21.1% that image 0 is '2 cats'
logits_per_image = outputs.logits_per_image
probs = torch.sigmoid(logits_per_image)
print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
```
## Resources
## Notes
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with SigLIP2.
- Training is supported for DDP and FSDP on single-node multi-GPU setups. However, it does not use [torch.distributed](https://pytorch.org/tutorials/beginner/dist_overview.html) utilities which may limit the scalability of batch size.
- When using the standalone [`GemmaTokenizerFast`] make sure to pass `padding="max_length"` and `max_length=64` as that's how the model was trained.
- Model was trained with *lowercased* text, so make sure your text labels are preprocessed the same way.
- To get the same results as the [`Pipeline`], a prompt template of `"This is a photo of {label}."` should be passed to the processor.
- The NaFlex variant processes different types of images at the appropriate resolution (using a larger resolution to process document images for example), while also minimizing the impact of aspect ratio distortion for certain inference tasks like OCR.
- [Zero-shot image classification task guide](../tasks/zero_shot_image_classification)
- Demo notebook for SigLIP2 can be found [here](https://github.com/qubvel/transformers-notebooks/tree/master/notebooks/SigLIP2_inference.ipynb). 🌎
NaFlex resizes the input image so the height and width are multiples of the patch size after resizing. It keeps the aspect ratio distortion as low as possible and produces a sequence length of at most the desired target sequence length (`max_num_patches`). After resizing, the image is split into a sequence of patches and a mask with padding information is added.
- Toggle the `attn_implementation` parameter to either `"sdpa"` or `"flash_attention_2"` to use a more memory-efficient attention.
```py
# pip install -U flash-attn --no-build-isolation
If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
## Combining SigLIP2 and Flash Attention 2
First, make sure to install the latest version of Flash Attention 2.
```bash
pip install -U flash-attn --no-build-isolation
```
Also make sure that your hardware is compatible with Flash Attention 2 (see the official documentation of the flash-attn repository for details), and load your model in half-precision (e.g. `torch.float16`).
To load and run a model using Flash Attention 2, refer to the snippet below:
```python
>>> import torch
>>> import requests
>>> from PIL import Image
>>> from transformers import AutoProcessor, AutoModel
>>> device = "cuda" # the device to load the model onto
>>> model = AutoModel.from_pretrained(
... "google/siglip2-so400m-patch14-384",
... attn_implementation="flash_attention_2",
... torch_dtype=torch.float16,
... device_map=device,
... )
>>> processor = AutoProcessor.from_pretrained("google/siglip2-so400m-patch14-384")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> candidate_labels = ["2 cats", "2 dogs"]
# follows the pipeline prompt template to get same results
>>> texts = [f'This is a photo of {label}.' for label in candidate_labels]
# important: we pass `padding=max_length` since the model was trained with this
>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt").to(device)
>>> with torch.no_grad():
... with torch.autocast(device):
... outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image
>>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
19.8% that image 0 is '2 cats'
```
from transformers import SiglipModel
model = SiglipModel.from_pretrained(
"google/siglip2-so400m-patch14-384",
attn_implementation="flash_attention_2",
torch_dtype=torch.float16,
device_map=device,
)
```
## Siglip2Config
[[autodoc]] Siglip2Config

View File

@ -0,0 +1,286 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# AutoRound
[AutoRound](https://github.com/intel/auto-round) is an advanced quantization algorithm that delivers strong accuracy, even at 2-bit precision.
It leverages sign gradient descent to fine-tune both rounding values and min-max clipping thresholds in just 200 steps. Designed for broad compatibility, it seamlessly supports a wide range of LLMs and is actively expanding to cover more VLMs as well.
It also supports quantization and inference across multiple hardware platforms, including CPU, XPU, and CUDA.
AutoRound also offers a variety of useful features, including mixed-bit tuning and inference, lm-head quantization, support for exporting to formats like GPTQ/AWQ/GGUF, and flexible tuning recipes.
For a comprehensive overview and the latest updates, check out the AutoRound [README](https://github.com/intel/auto-round).
AutoRound was originally developed as part of the [Intel Neural Compressor](https://github.com/intel/neural-compressor), serving as a general-purpose model compression library for deep learning.
It has since evolved into a standalone library focused specifically on low-precision optimization for large language models (LLMs).
AutoRound remains fully integrated with the Intel Neural Compressor, and you can explore the repository for more details.
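
To make the idea of sign gradient descent over rounding values concrete, the toy sketch below tunes per-weight rounding offsets for a single linear layer with a straight-through estimator. It ignores the min-max clipping thresholds, uses random calibration data, and is not the actual AutoRound implementation:

```py
import torch

torch.manual_seed(0)
W = torch.randn(64, 64)                      # weights of one linear layer
X = torch.randn(256, 64)                     # toy calibration activations
scale = W.abs().max() / 7                    # symmetric 4-bit scale, int range [-8, 7]

V = torch.zeros_like(W, requires_grad=True)  # learnable rounding offsets in [-0.5, 0.5]
lr = 1.0 / 200

for _ in range(200):
    soft = W / scale + V
    hard = torch.clamp(torch.round(soft), -8, 7)
    q = (hard - soft).detach() + soft        # straight-through estimator for round()
    loss = ((X @ (q * scale).T - X @ W.T) ** 2).mean()
    loss.backward()
    with torch.no_grad():
        V -= lr * V.grad.sign()              # sign gradient descent step
        V.clamp_(-0.5, 0.5)
        V.grad = None

print(f"reconstruction error: {loss.item():.6f}")
```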
## Installation
```bash
pip install auto-round
```
## Supported Quantization Configurations
AutoRound supports several quantization configurations:
- **Int8 Weight Only**
- **Int4 Weight Only**
- **Int3 Weight Only**
- **Int2 Weight Only**
- **Mixed bits Weight only**
## Hardware Compatibility
CPU, XPU, and CUDA for both quantization and inference.
## Quantization and Serialization (offline)
Currently, only offline mode is supported to generate quantized models.
<hfoptions id="quantization">
<hfoption id="quantization cmd">
### Command Line Usage
```bash
auto-round \
--model facebook/opt-125m \
--bits 4 \
--group_size 128 \
--output_dir ./tmp_autoround
```
AutoRound also offers two other recipes, `auto-round-best` and `auto-round-light`, designed for optimal accuracy and improved speed, respectively.
For 2 bits, we recommend using `auto-round-best` or `auto-round`.
</hfoption>
<hfoption id="quantization auto-round api">
### AutoRound API Usage
This setting offers a better trade-off between accuracy and tuning cost, and is recommended in all scenarios.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound
model_name = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
bits, group_size, sym = 4, 128, True
# mixed bits config
# layer_config = {"model.decoder.layers.6.self_attn.out_proj": {"bits": 2, "group_size": 32}}
autoround = AutoRound(
model,
tokenizer,
bits=bits,
group_size=group_size,
sym=sym,
# enable_torch_compile=True,
# layer_config=layer_config,
)
output_dir = "./tmp_autoround"
# format= 'auto_round'(default), 'auto_gptq', 'auto_awq'
autoround.quantize_and_save(output_dir, format='auto_round')
```
</hfoption>
<hfoption id="quantization auto-round-best">
### AutoRoundBest recipe
This setting provides the best accuracy in most scenarios but is roughly 4-5x slower than the standard AutoRound recipe (see the time costs in the table below). It is especially recommended for 2-bit quantization and is a good choice if sufficient resources are available.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound
model_name = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
bits, group_size, sym = 4, 128, True
autoround = AutoRound(
model,
tokenizer,
bits=bits,
group_size=group_size,
sym=sym,
nsamples=512,
iters=1000,
low_gpu_mem_usage=True
)
output_dir = "./tmp_autoround"
autoround.quantize_and_save(output_dir, format='auto_round')
```
</hfoption>
<hfoption id="quantization auto-round-light">
### AutoRoundLight recipe
This setting offers the best speed (2 - 3X faster than AutoRound), but it may cause a significant accuracy drop for small models and 2-bit quantization. It is recommended for 4-bit settings and models larger than 3B.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from auto_round import AutoRound
model_name = "facebook/opt-125m"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
bits, group_size, sym = 4, 128, True
autoround = AutoRound(
model,
tokenizer,
bits=bits,
group_size=group_size,
sym=sym,
iters=50,
lr=5e-3,
)
output_dir = "./tmp_autoround"
autoround.quantize_and_save(output_dir, format='auto_round')
```
</hfoption>
</hfoptions>
W4G128 average accuracy across 13 tasks (mmlu-pro, if_eval, gsm8k, etc.) and time cost results (testing was conducted on an Nvidia A100 80GB GPU with PyTorch 2.6.0 and `enable_torch_compile`):
| Model | Qwen2.5-0.5B-Instruct | Falcon3-3B | Qwen2.5-7B-Instruct | Meta-Llama-3.1-8B-Instruct | Falcon3-10B | Qwen2.5-72B-Instruct |
|---------|--------------------|---------------|------------------|----------------------------|---------------|-------------------|
| 16bits | 0.4192 | 0.5203 | 0.6470 | 0.6212 | 0.6151 | 0.7229 |
| Best | **0.4137**(7m) | **0.5142**(23m) | 0.6426(58m) | **0.6116**(65m) | **0.6092**(81m) | 0.7242(575m) |
| Default | 0.4129(2m) | 0.5133(6m) | 0.6441(13m) | 0.6106(13m) | 0.6080(18m) | **0.7252**(118m) |
| Light | 0.4052(2m) | 0.5108(3m) | **0.6453**(5m) | 0.6104(6m) | 0.6063(6m) | 0.7243(37m) |
## Inference
AutoRound automatically selects the best available backend based on the installed libraries and prompts the user to install additional libraries when a better backend is found.
<hfoptions id="inference">
<hfoption id="inference cpu">
### CPU
Supports 2, 4, and 8 bits. We recommend using intel-extension-for-pytorch (IPEX) for 4-bit inference.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
text = "There is a girl who likes adventure,"
inputs = tokenizer(text, return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=False)[0]))
```
</hfoption>
<hfoption id="inference xpu">
### XPU
Supports 4 bits only. We recommend using intel-extension-for-pytorch (IPEX) for inference.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="xpu", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
text = "There is a girl who likes adventure,"
inputs = tokenizer(text, return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=False)[0]))
```
</hfoption>
<hfoption id="inference cuda">
### CUDA
Supports 2, 3, 4, and 8 bits. We recommend using GPTQModel for 4-bit and 8-bit inference.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc"
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
text = "There is a girl who likes adventure,"
inputs = tokenizer(text, return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=False)[0]))
```
</hfoption>
<hfoption id="inference backend">
### Specify Inference Backend
AutoRound automatically selects the backend for each layer based on compatibility. In general, the priority order is Marlin > ExLLaMAV2 > Triton, but the final choice depends on factors such as group size, bit width, packing format, hardware device, and other implementation details. For more details, please refer to [backends](https://github.com/intel/auto-round?tab=readme-ov-file#specify-backend).
The automatically selected backend may not always be the most suitable for certain devices.
You can specify your preferred backend, such as "ipex" for CPU and XPU, or "marlin/exllamav2/triton" for CUDA, according to your needs or hardware compatibility. Please note that additional corresponding libraries may be required.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig
model_name = "OPEA/Qwen2.5-1.5B-Instruct-int4-sym-inc"
quantization_config = AutoRoundConfig(backend="ipex")
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
text = "There is a girl who likes adventure,"
inputs = tokenizer(text, return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=False)[0]))
```
</hfoption>
<hfoption id="format convert">
### Convert GPTQ/AWQ to AutoRound
Most GPTQ/AWQ models can be converted to the AutoRound format for better compatibility and support with Intel devices. Please note that the quantization config will be changed if the model is serialized.
```python
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoRoundConfig
model_name = "ybelkada/opt-125m-gptq-4bit"
quantization_config = AutoRoundConfig()
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cpu", quantization_config=quantization_config, torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(model_name)
text = "There is a girl who likes adventure,"
inputs = tokenizer(text, return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=50, do_sample=False)[0]))
```
</hfoption>
</hfoptions>
## Issues
If you encounter any issues with the transformers integration, please open an issue on
the [transformers](https://github.com/huggingface/transformers/issues) repository.
If you encounter any issues with auto-round, please open an issue on
the [AutoRound](https://github.com/intel/auto-round/issues) repository.
## Acknowledgement
Special thanks to open-source low-precision libraries such as AutoGPTQ, AutoAWQ, GPTQModel, Triton, Marlin, and ExLLaMAV2 for providing low-precision CUDA kernels, which are leveraged in AutoRound.
## Contribution
Contributions to [AutoRound](https://github.com/intel/auto-round/pulls) are welcome and greatly appreciated!
Whether it's fixing bugs, improving documentation, adding new features, or suggesting improvements, your help is always valued.

View File

@ -14,13 +14,21 @@ rendered properly in your Markdown viewer.
-->
# bitsandbytes
# Bitsandbytes
[bitsandbytes](https://github.com/bitsandbytes-foundation/bitsandbytes) features the LLM.int8 and QLoRA quantization to enable accessible large language model inference and training.
The [bitsandbytes](https://github.com/bitsandbytes-foundation/bitsandbytes) library provides quantization tools for LLMs through a lightweight Python wrapper around CUDA functions. It enables working with large models using limited computational resources by reducing their memory footprint.
[LLM.int8()](https://hf.co/papers/2208.07339) is a quantization method that aims to make large language model inference more accessible without significant degradation. Unlike naive 8-bit quantization, which can result in loss of critical information and accuracy, LLM.int8() dynamically adapts to ensure sensitive components of the computation retain higher precision when needed.
At its core, bitsandbytes provides:
QLoRA, or 4-bit quantization, compresses a model even further to 4-bits and inserts a small set of trainable low-rank adaptation (LoRA) weights to allow training.
- **Quantized Linear Layers**: `Linear8bitLt` and `Linear4bit` layers that replace standard PyTorch linear layers with memory-efficient quantized alternatives
- **Optimized Optimizers**: 8-bit versions of common optimizers through its `optim` module, enabling training of large models with reduced memory requirements
- **Matrix Multiplication**: Optimized matrix multiplication operations that leverage the quantized format
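These building blocks can also be used directly. The snippet below is a minimal sketch rather than an official example; the layer and optimizer classes are real bitsandbytes APIs, but the toy model, sizes, and learning rate are illustrative.
```py
import torch
import bitsandbytes as bnb

# Quantized linear layers as drop-in replacements for torch.nn.Linear
linear_8bit = bnb.nn.Linear8bitLt(1024, 1024, has_fp16_weights=False)  # LLM.int8()-style layer
linear_4bit = bnb.nn.Linear4bit(1024, 1024)                            # 4-bit (NF4/FP4) layer

# 8-bit optimizer for memory-efficient training of a toy model
model = torch.nn.Sequential(torch.nn.Linear(1024, 1024), torch.nn.ReLU())
optimizer = bnb.optim.Adam8bit(model.parameters(), lr=1e-4)
```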
bitsandbytes offers two main quantization features:
1. **LLM.int8()** - An 8-bit quantization method that makes inference more accessible without significant performance degradation. Unlike naive quantization, [LLM.int8()](https://hf.co/papers/2208.07339) dynamically preserves higher precision for critical computations, preventing information loss in sensitive parts of the model.
2. **QLoRA** - A 4-bit quantization technique that compresses models even further while maintaining trainability by inserting a small set of trainable low-rank adaptation (LoRA) weights.
> **Note:** For a user-friendly quantization experience, you can use the `bitsandbytes` [community space](https://huggingface.co/spaces/bnb-community/bnb-my-repo).
@ -30,12 +38,38 @@ Run the command below to install bitsandbytes.
```bash
pip install --upgrade transformers accelerate bitsandbytes
```
To compile from source, follow the instructions in the [bitsandbytes installation guide](https://huggingface.co/docs/bitsandbytes/main/en/installation).
## Hardware Compatibility
bitsandbytes is currently only supported on CUDA GPUs for CUDA versions 11.0 - 12.8. However, there's an ongoing multi-backend effort under development, which is currently in alpha. If you're interested in providing feedback or testing, check out the [bitsandbytes repository](https://github.com/bitsandbytes-foundation/bitsandbytes) for more information.
### CUDA
| Feature | Minimum Hardware Requirement |
|---------|-------------------------------|
| 8-bit optimizers | NVIDIA Maxwell (GTX 900 series, TITAN X, M40) or newer GPUs * |
| LLM.int8() | NVIDIA Turing (RTX 20 series, T4) or newer GPUs |
| NF4/FP4 quantization | NVIDIA Maxwell (GTX 900 series, TITAN X, M40) or newer GPUs * |
### Multi-backend
| Backend | Supported Versions | Python versions | Architecture Support | Status |
|---------|-------------------|----------------|---------------------|---------|
| AMD ROCm | 6.1+ | 3.10+ | minimum CDNA - gfx90a, RDNA - gfx1100 | Alpha |
| Apple Silicon (MPS) | WIP | 3.10+ | M1/M2 chips | Planned |
| Intel CPU | v2.4.0+ (ipex) | 3.10+ | Intel CPU | Alpha |
| Intel GPU | v2.4.0+ (ipex) | 3.10+ | Intel GPU | Experimental |
| Ascend NPU | 2.1.0+ (torch_npu) | 3.10+ | Ascend NPU | Experimental |
> **Note:** Bitsandbytes is moving away from the multi-backend approach towards using [PyTorch Custom Operators](https://pytorch.org/tutorials/advanced/custom_ops_landing_page.html) as the main mechanism for supporting new hardware and dispatching to the correct backend.
## Quantization Examples
Quantize a model by passing a [`BitsAndBytesConfig`] to [`~PreTrainedModel.from_pretrained`]. This works for any model in any modality, as long as it supports [Accelerate](https://huggingface.co/docs/accelerate/index) and contains [torch.nn.Linear](https://pytorch.org/docs/stable/generated/torch.nn.Linear.html) layers.
<hfoptions id="bnb">
<hfoption id="8-bit">
<div class="bnb-container" style="border: 1px solid #ddd; border-radius: 8px; padding: 20px; margin: 20px 0">
Quantizing a model in 8-bit halves the memory usage. For large models, set `device_map="auto"` to efficiently distribute the weights across all available GPUs.
```py
@ -45,6 +79,7 @@ quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model_8bit = AutoModelForCausalLM.from_pretrained(
"bigscience/bloom-1b7",
device_map="auto",
quantization_config=quantization_config
)
```
@ -59,6 +94,7 @@ quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model_8bit = AutoModelForCausalLM.from_pretrained(
"facebook/opt-350m",
device_map="auto",
quantization_config=quantization_config,
torch_dtype="auto"
)
@ -74,16 +110,16 @@ quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
"bigscience/bloom-560m",
device_map="auto",
quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
model.push_to_hub("bloom-560m-8bit")
```
</div>
</hfoption>
<hfoption id="4-bit">
<div class="bnb-container" style="border: 1px solid #ddd; border-radius: 8px; padding: 20px; margin: 20px 0">
Quantizing a model in 4-bit reduces memory usage by 4x. For large models, set `device_map="auto"` to efficiently distribute the weights across all available GPUs.
```py
@ -93,6 +129,7 @@ quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model_4bit = AutoModelForCausalLM.from_pretrained(
"bigscience/bloom-1b7",
device_map="auto",
quantization_config=quantization_config
)
```
@ -107,6 +144,7 @@ quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model_4bit = AutoModelForCausalLM.from_pretrained(
"facebook/opt-350m",
device_map="auto",
quantization_config=quantization_config,
torch_dtype="auto"
)
@ -115,6 +153,20 @@ model_4bit.model.decoder.layers[-1].final_layer_norm.weight.dtype
Make sure you have the latest bitsandbytes version so you can serialize 4-bit models and push them to the Hub with [`~PreTrainedModel.push_to_hub`]. Use [`~PreTrainedModel.save_pretrained`] to save the 4-bit model locally.
```py
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = AutoModelForCausalLM.from_pretrained(
"bigscience/bloom-560m",
device_map="auto",
quantization_config=quantization_config
)
model.push_to_hub("bloom-560m-4bit")
```
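For a local copy instead of (or in addition to) pushing to the Hub, a minimal sketch with [`~PreTrainedModel.save_pretrained`] (the directory name is illustrative):
```py
# Save the quantized 4-bit model to a local directory
model.save_pretrained("bloom-560m-4bit")
```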
</div>
</hfoption>
</hfoptions>

View File

@ -22,25 +22,26 @@ Transformers supports many quantization methods, each with their pros and cons,
Use the Space below to help you pick a quantization method depending on your hardware and number of bits to quantize to.
| Quantization Method | On the fly quantization | CPU | CUDA GPU | ROCm GPU | Metal (Apple Silicon) | Intel GPU | Torch compile() | Bits | PEFT Fine Tuning | Serializable with 🤗Transformers | 🤗Transformers Support | Link to library |
|-----------------------------------------------|----------------------|-----------------|----------|-----------|------------------------------------|-----------------|-----------------|---------------|------------------|-----------------------------|-------------------------|---------------------------------------------|
| [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1/2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM |
| [AWQ](./awq) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ |
| [bitsandbytes](./bitsandbytes) | 🟢 | 🟡 | 🟢 | 🟡 | 🔴 | 🟡 | 🔴 | 4/8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes |
| [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 1/8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors |
| [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ |
| [GGUF / GGML (llama.cpp)](../gguf) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1/8 | 🔴 | [See Notes](../gguf) | [See Notes](../gguf) | https://github.com/ggerganov/llama.cpp |
| [GPTQModel](./gptq) | 🔴 | 🟢 | 🟢 | 🟢 | 🟢 | 🟢 | 🔴 | 2/3/4/8 | 🟢 | 🟢 | 🟢 | https://github.com/ModelCloud/GPTQModel |
| [AutoGPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2/3/4/8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ |
| [HIGGS](./higgs) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 2/4 | 🔴 | 🟢 | 🟢 | https://github.com/HanGuo97/flute |
| [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1/8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ |
| [optimum-quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2/4/8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto |
| [FBGEMM_FP8](./fbgemm_fp8) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM |
| [torchao](./torchao) | 🟢 | 🟢 | 🟢 | 🔴 | 🟡 | 🔴 | | 4/8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao |
| [VPTQ](./vptq) | 🔴 | 🔴 | 🟢 | 🟡 | 🔴 | 🔴 | 🟢 | 1/8 | 🔴 | 🟢 | 🟢 | https://github.com/microsoft/VPTQ |
| [FINEGRAINED_FP8](./finegrained_fp8) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | |
| [SpQR](./spqr) | 🔴 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 3 | 🔴 | 🟢 | 🟢 | https://github.com/Vahe1994/SpQR/ |
| [Quark](./quark) | 🔴 | 🟢 | 🟢 | 🟢 | 🟢 | 🟢 | ? | 2/4/6/8/9/16 | 🔴 | 🔴 | 🟢 | https://quark.docs.amd.com/latest/ |
| Quantization Method | On the fly quantization | CPU | CUDA GPU | ROCm GPU | Metal (Apple Silicon) | Intel GPU | Torch compile() | Bits | PEFT Fine Tuning | Serializable with 🤗Transformers | 🤗Transformers Support | Link to library |
|-------------------------------------------|----------------------|-----------------|----------|-----------|------------------------------------|-----------------|-----------------|--------------|------------------|-----------------------------|-------------------------|---------------------------------------------|
| [AQLM](./aqlm) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1/2 | 🟢 | 🟢 | 🟢 | https://github.com/Vahe1994/AQLM |
| [AutoRound](./auto_round) | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🟢 | 🔴 | 2/3/4/8 | 🔴 | 🟢 | 🟢 | https://github.com/intel/auto-round |
| [AWQ](./awq) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | ? | 4 | 🟢 | 🟢 | 🟢 | https://github.com/casper-hansen/AutoAWQ |
| [bitsandbytes](./bitsandbytes) | 🟢 | 🟡 | 🟢 | 🟡 | 🔴 | 🟡 | 🔴 | 4/8 | 🟢 | 🟢 | 🟢 | https://github.com/bitsandbytes-foundation/bitsandbytes |
| [compressed-tensors](./compressed_tensors) | 🔴 | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 1/8 | 🟢 | 🟢 | 🟢 | https://github.com/neuralmagic/compressed-tensors |
| [EETQ](./eetq) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | ? | 8 | 🟢 | 🟢 | 🟢 | https://github.com/NetEase-FuXi/EETQ |
| [GGUF / GGML (llama.cpp)](../gguf) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 1/8 | 🔴 | [See Notes](../gguf) | [See Notes](../gguf) | https://github.com/ggerganov/llama.cpp |
| [GPTQModel](./gptq) | 🔴 | 🟢 | 🟢 | 🟢 | 🟢 | 🟢 | 🔴 | 2/3/4/8 | 🟢 | 🟢 | 🟢 | https://github.com/ModelCloud/GPTQModel |
| [AutoGPTQ](./gptq) | 🔴 | 🔴 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 2/3/4/8 | 🟢 | 🟢 | 🟢 | https://github.com/AutoGPTQ/AutoGPTQ |
| [HIGGS](./higgs) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 2/4 | 🔴 | 🟢 | 🟢 | https://github.com/HanGuo97/flute |
| [HQQ](./hqq) | 🟢 | 🟢 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 1/8 | 🟢 | 🔴 | 🟢 | https://github.com/mobiusml/hqq/ |
| [optimum-quanto](./quanto) | 🟢 | 🟢 | 🟢 | 🔴 | 🟢 | 🔴 | 🟢 | 2/4/8 | 🔴 | 🔴 | 🟢 | https://github.com/huggingface/optimum-quanto |
| [FBGEMM_FP8](./fbgemm_fp8) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | https://github.com/pytorch/FBGEMM |
| [torchao](./torchao) | 🟢 | 🟢 | 🟢 | 🔴 | 🟡 | 🔴 | | 4/8 | | 🟢🔴 | 🟢 | https://github.com/pytorch/ao |
| [VPTQ](./vptq) | 🔴 | 🔴 | 🟢 | 🟡 | 🔴 | 🔴 | 🟢 | 1/8 | 🔴 | 🟢 | 🟢 | https://github.com/microsoft/VPTQ |
| [FINEGRAINED_FP8](./finegrained_fp8) | 🟢 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🔴 | 8 | 🔴 | 🟢 | 🟢 | |
| [SpQR](./spqr) | 🔴 | 🔴 | 🟢 | 🔴 | 🔴 | 🔴 | 🟢 | 3 | 🔴 | 🟢 | 🟢 | https://github.com/Vahe1994/SpQR/ |
| [Quark](./quark) | 🔴 | 🟢 | 🟢 | 🟢 | 🟢 | 🟢 | ? | 2/4/6/8/9/16 | 🔴 | 🔴 | 🟢 | https://quark.docs.amd.com/latest/ |
## Resources

View File

@ -33,10 +33,11 @@ See the table below for additional torchao features.
torchao supports the [quantization techniques](https://github.com/pytorch/ao/blob/main/torchao/quantization/README.md) below.
- A16W8 Int8 WeightOnly Quantization
- A16W4 WeightOnly Quantization
- A8W8 Int8 Dynamic Quantization
- A16W8 Float8 Dynamic Quantization
- A16W8 Float8 WeightOnly Quantization
- A8W8 Int8 Dynamic Quantization
- A16W8 Int8 Weight Only Quantization
- A16W4 Int4 Weight Only Quantization
- Autoquantization
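As a rough orientation (my own mapping, not an official table), the technique names above correspond to the following config classes in `torchao.quantization`, all of which appear in the examples later on this page.
```py
from torchao.quantization import (
    Int8WeightOnlyConfig,                       # A16W8 Int8 weight-only
    Int4WeightOnlyConfig,                       # A16W4 Int4 weight-only
    Int8DynamicActivationInt8WeightConfig,      # A8W8 Int8 dynamic
    Float8WeightOnlyConfig,                     # A16W8 Float8 weight-only
    Float8DynamicActivationFloat8WeightConfig,  # Float8 dynamic activation + weight
)
```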
@ -44,7 +45,7 @@ Check the table below to see if your hardware is compatible.
| Component | Compatibility |
|----------|----------------|
| CUDA Versions | ✅ cu118, cu124, cu126, cu128 |
| CUDA Versions | ✅ cu118, cu126, cu128 |
| CPU | ✅ change `device_map="cpu"` (see examples below) |
@ -56,14 +57,14 @@ Install torchao from PyPi or the PyTorch index with the following commands.
```bash
# Updating 🤗 Transformers to the latest version, as the example script below uses the new auto compilation
# Stable release from Pypi which will default to CUDA 12.4
# Stable release from Pypi which will default to CUDA 12.6
pip install --upgrade torchao transformers
```
</hfoption>
<hfoption id="PyTorch Index">
Stable Release from the PyTorch index
```bash
pip install torchao --extra-index-url https://download.pytorch.org/whl/cu124 # options are cpu/cu118/cu124/cu126
pip install torchao --index-url https://download.pytorch.org/whl/cu126 # options are cpu/cu118/cu126/cu128
```
</hfoption>
</hfoptions>
@ -80,15 +81,79 @@ You can manually choose the quantization types and settings or automatically sel
Create a [`TorchAoConfig`] and specify the quantization type and `group_size` of the weights to quantize (for int8 weight only and int4 weight only). Set the `cache_implementation` to `"static"` to automatically [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) the forward method.
<hfoptions id="examples">
<hfoption id="int8-weight-only cuda">
We'll show examples of recommended quantization methods for different hardware, e.g. A100 GPU, H100 GPU, and CPU.
### H100 GPU
<hfoptions id="examples-H100-GPU">
<hfoption id="float8-dynamic-and-weight-only">
```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
quant_config = Float8DynamicActivationFloat8WeightConfig()
# or float8 weight only quantization
# quant_config = Float8WeightOnlyConfig()
quantization_config = TorchAoConfig(quant_type=quant_config)
# Load and quantize the model
quantized_model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-3.1-8B-Instruct",
torch_dtype="auto",
device_map="auto",
quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="int4-weight-only">
```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Int8WeightOnlyConfig
from torchao.quantization import GemliteUIntXWeightOnlyConfig
quant_config = Int8WeightOnlyConfig(group_size=128)
# We integrated with gemlite, which optimizes for batch size N on A100 and H100
quant_config = GemliteUIntXWeightOnlyConfig(group_size=128)
quantization_config = TorchAoConfig(quant_type=quant_config)
# Load and quantize the model
quantized_model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-3.1-8B-Instruct",
torch_dtype="auto",
device_map="auto",
quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
</hfoptions>
### A100 GPU
<hfoptions id="examples-A100-GPU">
<hfoption id="int8-dynamic-and-weight-only">
```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Int8DynamicActivationInt8WeightConfig
quant_config = Int8DynamicActivationInt8WeightConfig()
# or int8 weight only quantization
# quant_config = Int8WeightOnlyConfig()
quantization_config = TorchAoConfig(quant_type=quant_config)
# Load and quantize the model
@ -109,14 +174,52 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="int8-weight-only cpu">
<hfoption id="int4-weight-only">
```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Int8WeightOnlyConfig
from torchao.quantization import GemliteUIntXWeightOnlyConfig
quant_config = Int8WeightOnlyConfig(group_size=128)
# For batch size N, we recommend gemlite, which may require autotuning
# default is 4 bit, 8 bit is also supported by passing `bit_width=8`
quant_config = GemliteUIntXWeightOnlyConfig(group_size=128)
# For batch size 1, we also have custom tinygemm kernel that's only optimized for this
# We can set `use_hqq` to `True` for better accuracy
# quant_config = Int4WeightOnlyConfig(group_size=128, use_hqq=True)
quantization_config = TorchAoConfig(quant_type=quant_config)
# Load and quantize the model
quantized_model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-3.1-8B-Instruct",
torch_dtype="auto",
device_map="auto",
quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
</hfoptions>
### CPU
<hfoptions id="examples-CPU">
<hfoption id="int8-dynamic-and-weight-only">
```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Int8DynamicActivationInt8WeightConfig
quant_config = Int8DynamicActivationInt8WeightConfig()
# quant_config = Int8WeightOnlyConfig()
quantization_config = TorchAoConfig(quant_type=quant_config)
# Load and quantize the model
@ -136,35 +239,7 @@ output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implemen
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="int4-weight-only cuda">
```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Int4WeightOnlyConfig
quant_config = Int4WeightOnlyConfig(group_size=128)
quantization_config = TorchAoConfig(quant_type=quant_config)
# Load and quantize the model
quantized_model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-3.1-8B-Instruct",
torch_dtype="auto",
device_map="auto",
quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="int4-weight-only cpu">
<hfoption id="int4-weight-only">
> [!TIP]
> Run the quantized model on a CPU by changing `device_map` to `"cpu"` and `layout` to `Int4CPULayout()`.
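A minimal sketch of this CPU setup, assuming a torchao version that provides `Int4CPULayout` in `torchao.dtypes`:
```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Int4WeightOnlyConfig
from torchao.dtypes import Int4CPULayout

quant_config = Int4WeightOnlyConfig(group_size=128, layout=Int4CPULayout())
quantization_config = TorchAoConfig(quant_type=quant_config)

# Load and quantize the model on CPU
quantized_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct",
    torch_dtype="auto",
    device_map="cpu",
    quantization_config=quantization_config
)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_ids = tokenizer("What are we having for dinner?", return_tensors="pt")

# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```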
@ -195,116 +270,6 @@ output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implemen
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="int8-dynamic-quantization cuda">
```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Int8DynamicActivationInt8WeightConfig
quant_config = Int8DynamicActivationInt8WeightConfig()
quantization_config = TorchAoConfig(quant_type=quant_config)
# Load and quantize the model
quantized_model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-3.1-8B-Instruct",
torch_dtype="auto",
device_map="auto",
quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="int8-dynamic-quantization cpu">
```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Int8DynamicActivationInt8WeightConfig
quant_config = Int8DynamicActivationInt8WeightConfig()
quantization_config = TorchAoConfig(quant_type=quant_config)
# Load and quantize the model
quantized_model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-3.1-8B-Instruct",
torch_dtype="auto",
device_map="cpu",
quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt")
# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="float8-weight-only cuda">
```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Float8WeightOnlyConfig
quant_config = Float8WeightOnlyConfig()
quantization_config = TorchAoConfig(quant_type=quant_config)
# Load and quantize the model
quantized_model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-3.1-8B-Instruct",
torch_dtype="auto",
device_map="auto",
quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt").to("cuda")
# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
<hfoption id="float8-weight-only cpu">
```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
from torchao.quantization import Float8WeightOnlyConfig
quant_config = Float8WeightOnlyConfig()
quantization_config = TorchAoConfig(quant_type=quant_config)
# Load and quantize the model
quantized_model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Llama-3.1-8B-Instruct",
torch_dtype="auto",
device_map="cpu",
quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_text = "What are we having for dinner?"
input_ids = tokenizer(input_text, return_tensors="pt")
# auto-compile the quantized model with `cache_implementation="static"` to get speed up
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
print(tokenizer.decode(output[0], skip_special_tokens=True))
```
</hfoption>
</hfoptions>
### Autoquant
@ -313,6 +278,8 @@ If you want to automatically choose a quantization type for quantizable layers (
The `autoquant` API automatically chooses a quantization type by micro-benchmarking on input type and shape and compiling a single linear layer.
Note: autoquant is for GPU only right now.
Create a [`TorchAoConfig`] and set to `"autoquant"`. Set the `cache_implementation` to `"static"` to automatically [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html) the forward method. Finally, call `finalize_autoquant` on the quantized model to finalize the quantization and log the input shapes.
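A minimal sketch of this flow (CUDA GPU assumed; model name as used in the other examples on this page):
```py
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer

quantization_config = TorchAoConfig("autoquant")
quantized_model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B-Instruct",
    torch_dtype="auto",
    device_map="auto",
    quantization_config=quantization_config
)

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.1-8B-Instruct")
input_ids = tokenizer("What are we having for dinner?", return_tensors="pt").to("cuda")

# the first generate call triggers the micro-benchmarking and compilation
output = quantized_model.generate(**input_ids, max_new_tokens=10, cache_implementation="static")
# finalize quantization once the benchmarking runs are done
quantized_model.finalize_autoquant()
print(tokenizer.decode(output[0], skip_special_tokens=True))
```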
@ -346,11 +313,25 @@ torchao implements [torch.Tensor subclasses](https://pytorch.org/docs/stable/not
To avoid arbitrary user code execution, torchao sets `weights_only=True` in [torch.load](https://pytorch.org/docs/stable/generated/torch.load.html) to ensure only tensors are loaded. Any known user functions can be whitelisted with [add_safe_globals](https://pytorch.org/docs/stable/notes/serialization.html#torch.serialization.add_safe_globals).
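For illustration, allowlisting works as in the minimal, hypothetical sketch below; the tensor subclass is a stand-in, not a real torchao class.
```py
import torch

class CustomPackedTensor(torch.Tensor):
    """Hypothetical user-defined tensor subclass referenced by a checkpoint."""
    pass

# Allowlist the class so torch.load(..., weights_only=True) will deserialize it
torch.serialization.add_safe_globals([CustomPackedTensor])
```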
<hfoptions id="serialization-examples">
<hfoption id="save-locally">
```py
# don't serialize model with Safetensors
output_dir = "llama3-8b-int4wo-128"
quantized_model.save_pretrained(output_dir, safe_serialization=False)
```
</hfoption>
<hfoption id="push-to-huggingface-hub">
```py
# don't serialize model with Safetensors
USER_ID = "your_huggingface_user_id"
REPO_ID = "llama3-8b-int4wo-128"
quantized_model.push_to_hub(f"{USER_ID}/{REPO_ID}", safe_serialization=False)
tokenizer.push_to_hub(f"{USER_ID}/{REPO_ID}")
```
</hfoption>
</hfoptions>
## Loading quantized models
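As a minimal sketch (the repository id is illustrative), a quantized model saved or pushed as above can typically be reloaded with the usual [`~PreTrainedModel.from_pretrained`] call:
```py
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "your_huggingface_user_id/llama3-8b-int4wo-128"  # illustrative repo id
quantized_model = AutoModelForCausalLM.from_pretrained(repo_id, device_map="auto", torch_dtype="auto")
tokenizer = AutoTokenizer.from_pretrained(repo_id)
```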
@ -486,4 +467,4 @@ Refer to [Other Available Quantization Techniques](https://github.com/pytorch/ao
## Issues
If you encounter any issues with the Transformers integration, please open an issue on the [Transformers](https://github.com/huggingface/transformers/issues) repository. For issues directly related to torchao, please open an issue on the [torchao](https://github.com/pytorch/ao/issues) repository.
If you encounter any issues with the Transformers integration, please open an issue on the [Transformers](https://github.com/huggingface/transformers/issues) repository. For issues directly related to torchao, please open an issue on the [torchao](https://github.com/pytorch/ao/issues) repository.

View File

@ -160,7 +160,48 @@ outputs[0]["generated_text"]
# with a yellow center in the foreground. The flower is surrounded by red and white flowers with green stems
```
## Streaming
If you prefer, you can also load the images separately and pass them to the pipeline like so:
```python
import requests
from PIL import Image
from transformers import pipeline

pipe = pipeline("image-text-to-text", model="HuggingFaceTB/SmolVLM-256M-Instruct")
img_urls = [
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png",
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
]
images = [
Image.open(requests.get(img_urls[0], stream=True).raw),
Image.open(requests.get(img_urls[1], stream=True).raw),
]
messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "image"},
{"type": "text", "text": "What do you see in these images?"},
],
}
]
outputs = pipe(text=messages, images=images, max_new_tokens=50, return_full_text=False)
outputs[0]["generated_text"]
" In the first image, there are two cats sitting on a plant. In the second image, there are flowers with a pinkish hue."
```
The images will still be included in the `"input_text"` field of the output:
```python
outputs[0]['input_text']
"""
[{'role': 'user',
'content': [{'type': 'image',
'image': <PIL.PngImagePlugin.PngImageFile image mode=RGBA size=622x412>},
{'type': 'image',
'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=5184x3456>},
{'type': 'text', 'text': 'What do you see in these images?'}]}]
"""
```
We can use [text streaming](./generation_strategies#streaming) for a better generation experience. Transformers supports streaming with the [`TextStreamer`] or [`TextIteratorStreamer`] classes. We will use the [`TextIteratorStreamer`] with IDEFICS-8B.

View File

@ -77,6 +77,8 @@
title: 이미지 특징 추출
- local: tasks/mask_generation
title: 마스크 생성
- local: tasks/keypoint_detection
title: 키포인트 탐지
- local: tasks/knowledge_distillation_for_image_classification
title: 컴퓨터 비전(이미지 분류)를 위한 지식 증류(knowledge distillation)
title: 컴퓨터 비전
@ -480,8 +482,8 @@
title: (번역중) RemBERT
- local: in_translation
title: (번역중) RetriBERT
- local: in_translation
title: (번역중) RoBERTa
- local: model_doc/roberta
title: RoBERTa
- local: in_translation
title: (번역중) RoBERTa-PreLayerNorm
- local: in_translation
@ -720,6 +722,8 @@
title: Qwen2VL
- local: in_translation
title: (번역중) Segment Anything
- local: model_doc/siglip
title: SigLIP
- local: in_translation
title: (번역중) Speech Encoder Decoder Models
- local: in_translation

View File

@ -0,0 +1,230 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# RoBERTa[[roberta]]
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
## 개요[[overview]]
RoBERTa 모델은 Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov가 제안한 논문 [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692)에서 소개되었습니다. 이 모델은 2018년에 구글에서 발표한 BERT 모델을 기반으로 합니다.
RoBERTa는 BERT를 기반으로 하며, 주요 하이퍼파라미터를 수정하고, 사전 학습 단계에서 다음 문장 예측(Next Sentence Prediction)을 제거했으며, 훨씬 더 큰 미니 배치 크기와 학습률을 사용하여 학습을 진행했습니다.
해당 논문의 초록입니다:
*언어 모델 사전 학습은 성능을 크게 향상시켰지만, 서로 다른 접근 방식을 면밀히 비교하는 것은 어렵습니다. 학습은 계산 비용이 많이 들고, 종종 크기가 서로 다른 비공개 데이터셋에서 수행되며, 본 논문에서 보여주듯이 하이퍼파라미터 선택이 최종 성능에 큰 영향을 미칩니다. 우리는 BERT 사전 학습(Devlin et al., 2019)에 대한 재현 연구를 수행하여, 여러 핵심 하이퍼파라미터와 학습 데이터 크기의 영향을 면밀히 측정하였습니다. 그 결과, BERT는 충분히 학습되지 않았으며, 이후 발표된 모든 모델의 성능을 맞추거나 능가할 수 있음을 발견했습니다. 우리가 제안한 최상의 모델은 GLUE, RACE, SQuAD에서 최고 성능(state-of-the-art)을 달성했습니다. 이 결과는 지금까지 간과되어 온 설계 선택의 중요성을 강조하며, 최근 보고된 성능 향상의 근원이 무엇인지에 대한 의문을 제기합니다. 우리는 본 연구에서 사용한 모델과 코드를 공개합니다.*
이 모델은 [julien-c](https://huggingface.co/julien-c)가 기여하였습니다. 원본 코드는 [여기](https://github.com/pytorch/fairseq/tree/master/examples/roberta)에서 확인할 수 있습니다.
## 사용 팁[[usage-tips]]
- 이 구현은 [`BertModel`]과 동일하지만, 임베딩 부분에 약간의 수정이 있으며 RoBERTa 사전학습 모델에 맞게 설정되어 있습니다.
- RoBERTa는 BERT와 동일한 아키텍처를 가지고 있지만, 토크나이저로 바이트 수준 BPE(Byte-Pair Encoding, GPT-2와 동일)를 사용하고, 사전학습 방식이 다릅니다.
- RoBERTa는 `token_type_ids`를 사용하지 않기 때문에, 어떤 토큰이 어떤 문장(segment)에 속하는지 별도로 표시할 필요가 없습니다. 문장 구분은 분리 토큰 `tokenizer.sep_token`(또는 `</s>`)을 사용해 나누면 됩니다.
- RoBERTa는 BERT와 유사하지만, 더 나은 사전학습 기법을 사용합니다:
* 동적 마스킹: RoBERTa는 매 에폭마다 토큰을 다르게 마스킹하는 반면, BERT는 한 번만 마스킹합니다.
* 문장 패킹: 여러 문장을 최대 512 토큰까지 함께 패킹하여, 문장이 여러 문서에 걸쳐 있을 수도 있습니다.
* 더 큰 배치 사이즈: 학습 시 더 큰 미니배치를 사용합니다.
* 바이트 수준 BPE 어휘: 문자를 단위로 하지 않고 바이트 단위로 BPE를 적용하여 유니코드 문자를 더 유연하게 처리할 수 있습니다.
- [CamemBERT](camembert)은 RoBERTa를 기반으로 한 래퍼 모델입니다. 사용 예제는 해당 모델 페이지를 참고하세요.
## 자료[[resources]]
RoBERTa를 처음 다룰 때 도움이 되는 Hugging Face 공식 자료와 커뮤니티 자료(🌎 아이콘으로 표시됨) 목록입니다. 이 목록에 자료를 추가하고 싶다면 언제든지 Pull Request를 보내주세요! 저희가 검토 후 반영하겠습니다. 추가하려는 자료는 기존 자료를 단순히 복제하는 것이 아닌, 새롭거나 유의미한 내용을 포함하고 있는 것이 좋습니다.
<PipelineTag pipeline="text-classification"/>
- RoBERTa와 [Inference API](https://huggingface.co/inference-api)를 활용한 [트위터 감성 분석 시작하기](https://huggingface.co/blog/sentiment-analysis-twitter) 블로그 포스트.
- RoBERTa를 활용한 [Kili 및 Hugging Face AutoTrain을 이용한 의견 분류](https://huggingface.co/blog/opinion-classification-with-kili)에 관한 블로그 포스트.
- [감성 분석을 위한 RoBERTa 미세조정](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)을 하는 방법에 대한 노트북.🌎
- [`RobertaForSequenceClassification`]은 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb)에서 지원됩니다.
- [`TFRobertaForSequenceClassification`]는 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/text-classification)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb)에서 지원됩니다.
- [`FlaxRobertaForSequenceClassification`]는 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/flax/text-classification)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification_flax.ipynb)에서 지원됩니다.
- [텍스트 분류 작업 가이드](../tasks/sequence_classification)
<PipelineTag pipeline="token-classification"/>
- [`RobertaForTokenClassification`]은 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/token-classification)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification.ipynb)에서 지원됩니다.
- [`TFRobertaForTokenClassification`]은 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/token-classification)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/token_classification-tf.ipynb)에서 지원됩니다.
- [`FlaxRobertaForTokenClassification`]는 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/flax/token-classification)에서 지원됩니다.
- 🤗 Hugging Face 코스의 [토큰 분류 챕터](https://huggingface.co/course/chapter7/2?fw=pt)
- [토큰 분류 작업 가이드](../tasks/token_classification)
<PipelineTag pipeline="fill-mask"/>
- RoBERTa를 활용한 [Transformers와 Tokenizers를 활용한 새로운 언어 모델을 처음부터 학습하는 방법](https://huggingface.co/blog/how-to-train)에 대한 블로그 포스트.
- [`RobertaForMaskedLM`]은 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#robertabertdistilbert-and-masked-language-modeling)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb)에서 지원됩니다.
- [`TFRobertaForMaskedLM`]은 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_mlmpy)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb)에서 지원됩니다.
- [`FlaxRobertaForMaskedLM`]은 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#masked-language-modeling)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/masked_language_modeling_flax.ipynb)에서 지원됩니다.
- 🤗 Hugging Face 코스의 [마스킹 언어 모델링 챕터](https://huggingface.co/course/chapter7/3?fw=pt)
- [마스킹 언어 모델링 작업 가이드](../tasks/masked_language_modeling)
<PipelineTag pipeline="question-answering"/>
- RoBERTa를 활용한 질문 응답 작업에서의 [Optimum과 Transformers 파이프라인을 이용한 추론 가속화](https://huggingface.co/blog/optimum-inference)에 대한 블로그 포스트.
- [`RobertaForQuestionAnswering`]은 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb)에서 지원됩니다.
- [`TFRobertaForQuestionAnswering`]은 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/question-answering)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering-tf.ipynb)에서 지원됩니다.
- [`FlaxRobertaForQuestionAnswering`]은 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/flax/question-answering)에서 지원됩니다.
- 🤗 Hugging Face 코스의 [질의응답 챕터](https://huggingface.co/course/chapter7/7?fw=pt)
- [질의응답 작업 가이드](../tasks/question_answering)
**다중 선택**
- [`RobertaForMultipleChoice`]는 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/pytorch/multiple-choice)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice.ipynb)에서 지원됩니다.
- [`TFRobertaForMultipleChoice`]는 [예제 스크립트](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/multiple-choice)와 [노트북](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/multiple_choice-tf.ipynb)에서 지원됩니다.
- [다중 선택 작업 가이드](../tasks/multiple_choice)
## RobertaConfig
[[autodoc]] RobertaConfig
## RobertaTokenizer
[[autodoc]] RobertaTokenizer
- build_inputs_with_special_tokens
- get_special_tokens_mask
- create_token_type_ids_from_sequences
- save_vocabulary
## RobertaTokenizerFast
[[autodoc]] RobertaTokenizerFast
- build_inputs_with_special_tokens
<frameworkcontent>
<pt>
## RobertaModel
[[autodoc]] RobertaModel
- forward
## RobertaForCausalLM
[[autodoc]] RobertaForCausalLM
- forward
## RobertaForMaskedLM
[[autodoc]] RobertaForMaskedLM
- forward
## RobertaForSequenceClassification
[[autodoc]] RobertaForSequenceClassification
- forward
## RobertaForMultipleChoice
[[autodoc]] RobertaForMultipleChoice
- forward
## RobertaForTokenClassification
[[autodoc]] RobertaForTokenClassification
- forward
## RobertaForQuestionAnswering
[[autodoc]] RobertaForQuestionAnswering
- forward
</pt>
<tf>
## TFRobertaModel
[[autodoc]] TFRobertaModel
- call
## TFRobertaForCausalLM
[[autodoc]] TFRobertaForCausalLM
- call
## TFRobertaForMaskedLM
[[autodoc]] TFRobertaForMaskedLM
- call
## TFRobertaForSequenceClassification
[[autodoc]] TFRobertaForSequenceClassification
- call
## TFRobertaForMultipleChoice
[[autodoc]] TFRobertaForMultipleChoice
- call
## TFRobertaForTokenClassification
[[autodoc]] TFRobertaForTokenClassification
- call
## TFRobertaForQuestionAnswering
[[autodoc]] TFRobertaForQuestionAnswering
- call
</tf>
<jax>
## FlaxRobertaModel
[[autodoc]] FlaxRobertaModel
- __call__
## FlaxRobertaForCausalLM
[[autodoc]] FlaxRobertaForCausalLM
- __call__
## FlaxRobertaForMaskedLM
[[autodoc]] FlaxRobertaForMaskedLM
- __call__
## FlaxRobertaForSequenceClassification
[[autodoc]] FlaxRobertaForSequenceClassification
- __call__
## FlaxRobertaForMultipleChoice
[[autodoc]] FlaxRobertaForMultipleChoice
- __call__
## FlaxRobertaForTokenClassification
[[autodoc]] FlaxRobertaForTokenClassification
- __call__
## FlaxRobertaForQuestionAnswering
[[autodoc]] FlaxRobertaForQuestionAnswering
- __call__
</jax>
</frameworkcontent>

View File

@ -0,0 +1,253 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# SigLIP[[siglip]]
<div class="flex flex-wrap space-x-1">
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
</div>
## 개요[[overview]]
SigLIP 모델은 Xiaohua Zhai, Basil Mustafa, Alexander Kolesnikov, Lucas Beyer의 [Sigmoid Loss for Language Image Pre-Training](https://arxiv.org/abs/2303.15343) 논문에서 제안되었습니다. SigLIP은 [CLIP](clip)에서 사용된 손실 함수를 간단한 쌍별 시그모이드 손실(pairwise sigmoid loss)로 대체할 것을 제안합니다. 이는 ImageNet에서 제로샷 분류 정확도 측면에서 더 나은 성능을 보입니다.
논문의 초록은 다음과 같습니다:
*우리는 언어-이미지 사전 학습(Language-Image Pre-training, SigLIP)을 위한 간단한 쌍별 시그모이드 손실을 제안합니다. 소프트맥스 정규화를 사용하는 표준 대조 학습과 달리, 시그모이드 손실은 이미지-텍스트 쌍에만 작용하며 정규화를 위해 쌍별 유사성의 전역적 관점을 필요로 하지 않습니다. 시그모이드 손실은 배치 크기를 더욱 확장할 수 있게 하는 동시에 작은 배치 크기에서도 더 나은 성능을 보입니다. Locked-image Tuning과 결합하여, 단 4개의 TPUv4 칩만으로 이틀 만에 84.5%의 ImageNet 제로샷 정확도를 달성하는 SigLiT 모델을 학습했습니다. 손실 함수에서 배치 크기를 분리함으로써 예제 대 쌍의 영향과 Negative 대 Positive 비율을 연구할 수 있게 되었습니다. 마지막으로, 우리는 배치 크기를 100만 개까지 극단적으로 늘려보았고, 배치 크기 증가의 이점이 빠르게 감소하며 32k의 더 합리적인 배치 크기로도 충분하다는 것을 발견했습니다.*
## 사용 팁[[usage-tips]]
- SigLIP의 사용법은 [CLIP](clip)과 유사합니다. 주요 차이점은 학습 손실 함수로, 배치 내 모든 이미지와 텍스트 간의 쌍별 유사성에 대한 전역적 관점이 필요하지 않습니다. 소프트맥스 대신 로짓에 시그모이드 활성화 함수를 적용해야 합니다.
- 학습은 지원되지만 `torch.distributed` 유틸리티를 사용하지 않아 배치 크기의 확장성이 제한될 수 있습니다. 그러나 단일 노드 다중 GPU 설정에서는 DDP와 FSDP가 작동합니다.
- 독립형 [`SiglipTokenizer`] 또는 [`SiglipProcessor`]를 사용할 때는 모델이 그렇게 학습되었으므로 `padding="max_length"`를 전달해야 합니다.
- 파이프라인과 동일한 결과를 얻으려면 "This is a photo of {label}."의 프롬프트 템플릿을 사용해야 합니다.
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/siglip_table.jpeg"
alt="drawing" width="600"/>
<small> CLIP과 비교한 SigLIP 평가 결과. <a href="https://arxiv.org/abs/2303.15343">원본 논문</a>에서 발췌.</small>
이 모델은 [nielsr](https://huggingface.co/nielsr)가 기여했습니다.
원본 코드는 [여기](https://github.com/google-research/big_vision/tree/main)에서 찾을 수 있습니다.
## 사용 예시[[usage-example]]
SigLIP을 사용하는 방법에는 두 가지 주요 방법이 있습니다: 모든 복잡성을 추상화하는 파이프라인 API를 사용하거나, 직접 `SiglipModel` 클래스를 사용하는 방법입니다.
### 파이프라인 API[[pipeline-API]]
파이프라인을 사용하면 몇 줄의 코드로 모델을 사용할 수 있습니다:
```python
>>> from transformers import pipeline
>>> from PIL import Image
>>> import requests
>>> # 파이프라인 로드
>>> image_classifier = pipeline(task="zero-shot-image-classification", model="google/siglip-base-patch16-224")
>>> # 이미지 로드
>>> url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> # 추론
>>> candidate_labels = ["2 cats", "a plane", "a remote"]
>>> outputs = image_classifier(image, candidate_labels=candidate_labels)
>>> outputs = [{"score": round(output["score"], 4), "label": output["label"] } for output in outputs]
>>> print(outputs)
[{'score': 0.1979, 'label': '2 cats'}, {'score': 0.0, 'label': 'a remote'}, {'score': 0.0, 'label': 'a plane'}]
```
### 직접 모델 사용하기[[using-the-model-yourself]]
전처리와 후처리를 직접 수행하려면 다음과 같이 하면 됩니다:
```python
>>> from PIL import Image
>>> import requests
>>> from transformers import AutoProcessor, AutoModel
>>> import torch
>>> model = AutoModel.from_pretrained("google/siglip-base-patch16-224")
>>> processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> candidate_labels = ["2 cats", "2 dogs"]
# 파이프라인 프롬프트 템플릿을 따라 동일한 결과를 얻습니다
>>> texts = [f'This is a photo of {label}.' for label in candidate_labels]
# 중요: 모델이 이렇게 학습되었으므로 `padding=max_length`를 전달합니다
>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")
>>> with torch.no_grad():
... outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image
>>> probs = torch.sigmoid(logits_per_image) # 시그모이드 활성화 함수를 적용한 확률입니다
>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
19.8% that image 0 is '2 cats'
```
## 리소스[[resources]]
SigLIP을 시작하는 데 도움이 되는 공식 Hugging Face 및 커뮤니티(🌎로 표시) 리소스 목록입니다.
- [제로샷 이미지 분류 작업 가이드](../tasks/zero_shot_image_classification)
- SigLIP에 대한 데모 노트북은 [여기](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/SigLIP)에서 찾을 수 있습니다. 🌎
여기에 포함될 리소스를 제출하는 데 관심이 있으시면 Pull Request를 열어주시면 검토하겠습니다! 리소스는 이상적으로 기존 리소스를 복제하는 대신 새로운 것을 보여주어야 합니다.
## SigLIP과 Flash Attention 2 결합하기[[combining-siglip-with-flash-attention-2]]
먼저 Flash Attention 2의 최신 버전을 설치해야 합니다.
```bash
pip install -U flash-attn --no-build-isolation
```
또한 Flash-Attention 2와 호환되는 하드웨어가 있는지 확인하세요. flash-attn 저장소의 공식 문서에서 자세히 알아보세요. 또한 모델을 반정밀도(예: `torch.float16`)로 로드해야 합니다.
Flash Attention 2를 사용하여 모델을 로드하고 실행하려면 아래 코드를 참조하세요:
```python
>>> import torch
>>> import requests
>>> from PIL import Image
>>> from transformers import SiglipProcessor, SiglipModel
>>> device = "cuda" # 모델을 로드할 장치
>>> model = SiglipModel.from_pretrained(
... "google/siglip-so400m-patch14-384",
... attn_implementation="flash_attention_2",
... torch_dtype=torch.float16,
... device_map=device,
... )
>>> processor = SiglipProcessor.from_pretrained("google/siglip-so400m-patch14-384")
>>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
>>> image = Image.open(requests.get(url, stream=True).raw)
>>> candidate_labels = ["2 cats", "2 dogs"]
# 파이프라인 프롬프트 템플릿을 따라 동일한 결과를 얻습니다
>>> texts = [f'This is a photo of {label}.' for label in candidate_labels]
# 중요: 모델이 이렇게 학습되었으므로 `padding=max_length`를 전달합니다
>>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt").to(device)
>>> with torch.no_grad():
... with torch.autocast(device):
... outputs = model(**inputs)
>>> logits_per_image = outputs.logits_per_image
>>> probs = torch.sigmoid(logits_per_image) # 시그모이드 활성화 함수를 적용한 확률입니다
>>> print(f"{probs[0][0]:.1%} that image 0 is '{candidate_labels[0]}'")
19.8% that image 0 is '2 cats'
```
## Scaled Dot Product Attention(SDPA) 사용하기[[using-scaled-dot-product-attention-sdpa]]
PyTorch는 `torch.nn.functional`의 일부로 스케일된 점곱 어텐션(SDPA) 연산자를 포함합니다. 이 함수는
입력과 사용 중인 하드웨어에 따라 적용할 수 있는 여러 구현을 포함합니다. 자세한 내용은
[공식 문서](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
또는 [GPU 추론](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
페이지를 참조하세요.
`from_pretrained()`에서 `attn_implementation="sdpa"`를 설정하여 SDPA를 명시적으로 요청할 수 있습니다. `torch>=2.1.1`이 설치되어 있는지 확인하세요.
```python
>>> from transformers import SiglipModel
>>> model = SiglipModel.from_pretrained(
... "google/siglip-so400m-patch14-384",
... attn_implementation="sdpa",
... torch_dtype=torch.float16,
... device_map=device,
... )
```
최상의 속도 향상을 위해 모델을 반정밀도(예: `torch.float16` 또는 `torch.bfloat16`)로 로드하는 것이 좋습니다.
## 예상 속도 향상[[expected-speedups]]
아래는 `google/siglip-so400m-patch14-384` 체크포인트를 `float16` 정밀도로 사용하는 transformers의 네이티브 구현과 Flash Attention 2 / SDPA 버전의 모델을 다양한 배치 크기로 비교한 추론 시간의 예상 속도 향상 다이어그램입니다.
<div style="text-align: center">
<img src="https://i.imgur.com/cWm4rsn.png">
</div>
## SiglipConfig
[[autodoc]] SiglipConfig
- from_text_vision_configs
## SiglipTextConfig
[[autodoc]] SiglipTextConfig
## SiglipVisionConfig
[[autodoc]] SiglipVisionConfig
## SiglipTokenizer
[[autodoc]] SiglipTokenizer
- build_inputs_with_special_tokens
- get_special_tokens_mask
- create_token_type_ids_from_sequences
- save_vocabulary
## SiglipImageProcessor
[[autodoc]] SiglipImageProcessor
- preprocess
## SiglipImageProcessorFast
[[autodoc]] SiglipImageProcessorFast
- preprocess
## SiglipProcessor
[[autodoc]] SiglipProcessor
## SiglipModel
[[autodoc]] SiglipModel
- forward
- get_text_features
- get_image_features
## SiglipTextModel
[[autodoc]] SiglipTextModel
- forward
## SiglipVisionModel
[[autodoc]] SiglipVisionModel
- forward
## SiglipForImageClassification
[[autodoc]] SiglipForImageClassification
- forward

View File

@ -0,0 +1,155 @@
<!--Copyright 2023 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.
-->
# 키포인트 탐지 [[keypoint-detection]]
[[open-in-colab]]
키포인트 감지(Keypoint detection)은 이미지 내의 특정 포인트를 식별하고 위치를 탐지합니다. 이러한 키포인트는 랜드마크라고도 불리며 얼굴 특징이나 물체의 일부와 같은 의미 있는 특징을 나타냅니다.
키포인트 감지 모델들은 이미지를 입력으로 받아 아래와 같은 출력을 반환합니다.
- **키포인트들과 점수**: 관심 포인트들과 해당 포인트에 대한 신뢰도 점수
- **디스크립터(Descriptors)**: 각 키포인트를 둘러싼 이미지 영역의 표현으로 텍스처, 그라데이션, 방향 및 기타 속성을 캡처합니다.
이번 가이드에서는 이미지에서 키포인트를 추출하는 방법을 다루어 보겠습니다.
이번 튜토리얼에서는 키포인트 감지의 기본이 되는 모델인 [SuperPoint](./model_doc/superpoint)를 사용해보겠습니다.
```python
from transformers import AutoImageProcessor, SuperPointForKeypointDetection
processor = AutoImageProcessor.from_pretrained("magic-leap-community/superpoint")
model = SuperPointForKeypointDetection.from_pretrained("magic-leap-community/superpoint")
```
아래의 이미지로 모델을 테스트 해보겠습니다.
<div style="display: flex; align-items: center;">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"
alt="Bee"
style="height: 200px; object-fit: contain; margin-right: 10px;">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png"
alt="Cats"
style="height: 200px; object-fit: contain;">
</div>
```python
import torch
from PIL import Image
import requests
url_image_1 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"
image_1 = Image.open(requests.get(url_image_1, stream=True).raw)
url_image_2 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png"
image_2 = Image.open(requests.get(url_image_2, stream=True).raw)
images = [image_1, image_2]
```
이제 입력을 처리하고 추론을 할 수 있습니다.
```python
inputs = processor(images, return_tensors="pt").to(model.device, model.dtype)
outputs = model(**inputs)
```
모델 출력에는 배치 내의 각 항목에 대한 상대적인 키포인트, 디스크립터, 마스크와 점수가 있습니다. 마스크는 이미지에서 키포인트가 있는 영역을 강조하는 역할을 합니다.
```python
SuperPointKeypointDescriptionOutput(loss=None, keypoints=tensor([[[0.0437, 0.0167],
[0.0688, 0.0167],
[0.0172, 0.0188],
...,
[0.5984, 0.9812],
[0.6953, 0.9812]]]),
scores=tensor([[0.0056, 0.0053, 0.0079, ..., 0.0125, 0.0539, 0.0377],
[0.0206, 0.0058, 0.0065, ..., 0.0000, 0.0000, 0.0000]],
grad_fn=<CopySlices>), descriptors=tensor([[[-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357],
[-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357],
[-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357],
...],
grad_fn=<CopySlices>), mask=tensor([[1, 1, 1, ..., 1, 1, 1],
[1, 1, 1, ..., 0, 0, 0]], dtype=torch.int32), hidden_states=None)
```
이미지에 실제 키포인트를 표시하기 위해선 결과값을 후처리 해야합니다. 이를 위해 실제 이미지 크기를 결과값과 함께 `post_process_keypoint_detection`에 전달해야 합니다.
```python
image_sizes = [(image.size[1], image.size[0]) for image in images]
outputs = processor.post_process_keypoint_detection(outputs, image_sizes)
```
위 코드를 통해 결과값은 딕셔너리를 갖는 리스트가 되고, 각 딕셔너리들은 후처리된 키포인트, 점수 및 디스크립터로 이루어져있습니다.
```python
[{'keypoints': tensor([[ 226, 57],
[ 356, 57],
[ 89, 64],
...,
[3604, 3391]], dtype=torch.int32),
'scores': tensor([0.0056, 0.0053, ...], grad_fn=<IndexBackward0>),
'descriptors': tensor([[-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357],
[-0.0807, 0.0114, -0.1210, ..., -0.1122, 0.0899, 0.0357]],
grad_fn=<IndexBackward0>)},
{'keypoints': tensor([[ 46, 6],
[ 78, 6],
[422, 6],
[206, 404]], dtype=torch.int32),
'scores': tensor([0.0206, 0.0058, 0.0065, 0.0053, 0.0070, ...,grad_fn=<IndexBackward0>),
'descriptors': tensor([[-0.0525, 0.0726, 0.0270, ..., 0.0389, -0.0189, -0.0211],
[-0.0525, 0.0726, 0.0270, ..., 0.0389, -0.0189, -0.0211]}]
```
이제 위 딕셔너리를 사용하여 키포인트를 표시할 수 있습니다.
```python
import matplotlib.pyplot as plt
import torch
for i in range(len(images)):
keypoints = outputs[i]["keypoints"].detach().numpy()
scores = outputs[i]["scores"].detach().numpy()
descriptors = outputs[i]["descriptors"]
image = images[i]
image_width, image_height = image.size
plt.axis('off')
plt.imshow(image)
plt.scatter(
keypoints[:, 0],
keypoints[:, 1],
s=scores * 100,
c='cyan',
alpha=0.4
)
plt.show()
```
아래에서 결과를 확인할 수 있습니다.
<div style="display: flex; align-items: center;">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee_keypoint.png"
alt="Bee"
style="height: 200px; object-fit: contain; margin-right: 10px;">
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats_keypoint.png"
alt="Cats"
style="height: 200px; object-fit: contain;">
</div>

View File

@ -19,6 +19,7 @@ import logging
import os
import random
import sys
from collections import Counter
from dataclasses import dataclass, field
from typing import Optional
@ -467,6 +468,14 @@ def main():
load_from_cache_file=not data_args.overwrite_cache,
desc="Running tokenizer on dataset",
)
def print_class_distribution(dataset, split_name):
label_counts = Counter(dataset["label"])
total = sum(label_counts.values())
logger.info(f"Class distribution in {split_name} set:")
for label, count in label_counts.items():
logger.info(f" Label {label}: {count} ({count / total:.2%})")
if training_args.do_train:
if "train" not in raw_datasets:
raise ValueError("--do_train requires a train dataset")
@ -474,6 +483,7 @@ def main():
if data_args.max_train_samples is not None:
max_train_samples = min(len(train_dataset), data_args.max_train_samples)
train_dataset = train_dataset.select(range(max_train_samples))
print_class_distribution(train_dataset, "train")
if training_args.do_eval:
if "validation" not in raw_datasets and "validation_matched" not in raw_datasets:
@ -482,6 +492,7 @@ def main():
if data_args.max_eval_samples is not None:
max_eval_samples = min(len(eval_dataset), data_args.max_eval_samples)
eval_dataset = eval_dataset.select(range(max_eval_samples))
print_class_distribution(eval_dataset, "validation")
if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None:
if "test" not in raw_datasets and "test_matched" not in raw_datasets:
@ -490,6 +501,7 @@ def main():
if data_args.max_predict_samples is not None:
max_predict_samples = min(len(predict_dataset), data_args.max_predict_samples)
predict_dataset = predict_dataset.select(range(max_predict_samples))
print_class_distribution(predict_dataset, "test")
# Log a few random samples from the training set:
if training_args.do_train:
@ -508,8 +520,12 @@ def main():
# predictions and label_ids field) and has to return a dictionary string to float.
def compute_metrics(p: EvalPrediction):
preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
labels = p.label_ids
if not training_args.eval_do_concat_batches:
preds = np.concatenate(preds, axis=0)
labels = np.concatenate(p.label_ids, axis=0)
preds = np.squeeze(preds) if is_regression else np.argmax(preds, axis=1)
result = metric.compute(predictions=preds, references=p.label_ids)
result = metric.compute(predictions=preds, references=labels)
if len(result) > 1:
result["combined_score"] = np.mean(list(result.values())).item()
return result
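For context on the `eval_do_concat_batches` branch above: when the flag is disabled, predictions and labels arrive as lists of per-batch arrays rather than single concatenated arrays, so they have to be stitched together before computing metrics, which is what the added lines do. A minimal standalone illustration with assumed toy shapes (not part of the script):
```python
import numpy as np

# Assumed toy data: three batches of 2-class logits and their labels
per_batch_preds = [np.random.rand(8, 2), np.random.rand(8, 2), np.random.rand(4, 2)]
per_batch_labels = [np.zeros(8, dtype=int), np.ones(8, dtype=int), np.zeros(4, dtype=int)]

preds = np.concatenate(per_batch_preds, axis=0)    # shape (20, 2)
labels = np.concatenate(per_batch_labels, axis=0)  # shape (20,)
preds = np.argmax(preds, axis=1)                   # class ids aligned with labels
```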

View File

@ -38,8 +38,6 @@ from transformers import (
OpenAIGPTLMHeadModel,
OpenAIGPTTokenizer,
OPTForCausalLM,
TransfoXLLMHeadModel,
TransfoXLTokenizer,
XLMTokenizer,
XLMWithLMHeadModel,
XLNetLMHeadModel,
@ -62,7 +60,6 @@ MODEL_CLASSES = {
"ctrl": (CTRLLMHeadModel, CTRLTokenizer),
"openai-gpt": (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
"xlnet": (XLNetLMHeadModel, XLNetTokenizer),
"transfo-xl": (TransfoXLLMHeadModel, TransfoXLTokenizer),
"xlm": (XLMWithLMHeadModel, XLMTokenizer),
"gptj": (GPTJForCausalLM, AutoTokenizer),
"bloom": (BloomForCausalLM, BloomTokenizerFast),
@ -368,10 +365,7 @@ def main():
prepare_input = PREPROCESSING_FUNCTIONS.get(args.model_type)
preprocessed_prompt_text = prepare_input(args, model, tokenizer, prompt_text)
if model.__class__.__name__ in ["TransfoXLLMHeadModel"]:
tokenizer_kwargs = {"add_space_before_punct_symbol": True}
else:
tokenizer_kwargs = {}
tokenizer_kwargs = {}
encoded_prompt = tokenizer.encode(
preprocessed_prompt_text, add_special_tokens=False, return_tensors="pt", **tokenizer_kwargs

View File

@ -529,6 +529,9 @@ def main():
def compute_metrics(p):
predictions, labels = p
if not training_args.eval_do_concat_batches:
predictions = np.hstack(predictions)
labels = np.hstack(labels)
predictions = np.argmax(predictions, axis=2)
# Remove ignored index (special tokens)

View File

@ -1,22 +0,0 @@
## 🔥 Model cards now live inside each huggingface.co model repo 🔥
For consistency, ease of use and scalability, `README.md` model cards now live directly inside each model repo on the HuggingFace model hub.
### How to update a model card
You can directly update a model card inside any model repo you have **write access** to, i.e.:
- a model under your username namespace
- a model under any organization you are a part of.
You can either:
- update it, commit and push using your usual git workflow (command line, GUI, etc.)
- or edit it directly from the website's UI.
**What if you want to create or update a model card for a model you don't have write access to?**
In that case, you can open a [Hub pull request](https://huggingface.co/docs/hub/repositories-pull-requests-discussions)! Check out the [announcement](https://huggingface.co/blog/community-update) of this feature for more details 🤗.
### What happened to the model cards here?
We migrated every model card from the repo to its corresponding huggingface.co model repo. Individual commits were preserved, and they link back to the original commit on GitHub.

View File

@ -55,3 +55,4 @@ markers = [
]
log_cli = 1
log_cli_level = "WARNING"
asyncio_default_fixture_loop_scope = "function"

View File

@ -1,448 +0,0 @@
#!/usr/bin/env python
# HF Trainer benchmarking tool
#
# This tool can be used to run and compare multiple dimensions of the HF Trainer's args.
#
# It then prints a report once in github format with all the information that needs to be shared
# with others, and a second time in a console-friendly format, so it's easier to use for tuning things up.
#
# The main idea is:
#
# ./trainer-benchmark.py --base-cmd '<cmd args that don't change>' \
# --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1' \
# --target-metric-key train_samples_per_second
#
# The variations can be any command line argument that you want to compare and not just dtype as in
# the example.
#
# --variations allows you to compare variations in multiple dimensions.
#
# as the first dimension has 2 options and the second 3 in our example, this will run the trainer 6
# times adding one of:
#
# 1. --tf32 0 --fp16 0
# 2. --tf32 0 --fp16 1
# 3. --tf32 0 --bf16 1
# 4. --tf32 1 --fp16 0
# 5. --tf32 1 --fp16 1
# 6. --tf32 1 --bf16 1
#
# and print the results. This is just a cartesian product - and more than 2 dimensions can be used.
#
# If you want to rely on defaults, this:
# --variations '--tf32 0|--tf32 1' '--fp16 0|--fp16 1|--bf16 1'
# is identical to this:
# --variations '--tf32 0|--tf32 1' '|--fp16|--bf16'
#
# the leading empty variation in the 2nd dimension is a valid variation.
#
# So here we get the following 6 variations:
#
# 1. --tf32 0
# 2. --tf32 0 --fp16
# 3. --tf32 0 --bf16
# 4. --tf32 1
# 5. --tf32 1 --fp16
# 6. --tf32 1 --bf16
#
# In this particular case we don't know what the default tf32 setting is (as it's normally
# pytorch-version dependent). That's why it's best to do an explicit setting of each variation:
# `--tf32 0|--tf32 1`
#
# Here is a full example of a train:
#
# CUDA_VISIBLE_DEVICES=0 python ./scripts/benchmark/trainer-benchmark.py \
# --base-cmd \
# ' examples/pytorch/translation/run_translation.py --model_name_or_path google-t5/t5-small \
# --output_dir output_dir --do_train --label_smoothing 0.1 --logging_strategy no \
# --save_strategy no --per_device_train_batch_size 32 --max_source_length 512 \
# --max_target_length 512 --num_train_epochs 1 --overwrite_output_dir \
# --source_lang en --target_lang ro --dataset_name wmt16 --dataset_config "ro-en" \
# --source_prefix "translate English to Romanian: " --warmup_steps 50 \
# --max_train_samples 20000 --dataloader_num_workers 2 ' \
# --target-metric-key train_samples_per_second --repeat-times 1 --variations \
# '|--fp16|--bf16' '--tf32 0|--tf32 1' --report-metric-keys train_loss \
# --repeat-times 1 --base-variation '--tf32 0'
#
# and here is a possible output:
#
#
# | Variation | Train | Diff | Train |
# | | samples | % | loss |
# | | per | | |
# | | second | | |
# |:----------------|----------:|-------:|--------:|
# | --tf32 0 | 285.11 | 0 | 2.51 |
# | --tf32 1 | 342.09 | 20 | 2.51 |
# | --fp16 --tf32 0 | 423.49 | 49 | 2.51 |
# | --fp16 --tf32 1 | 423.13 | 48 | 2.51 |
# | --bf16 --tf32 0 | 416.80 | 46 | 2.52 |
# | --bf16 --tf32 1 | 415.87 | 46 | 2.52 |
#
#
# So you can quickly compare the different outcomes.
#
# Typically running each experiment once is enough, but if the environment is unstable you can
# re-run each multiple times, e.g., 3 using --repeat-times 3 and it will report the averaged results.
#
# By default it'll use the lowest result as the baseline to use as 100% and then compare the rest to
# it as can be seen from the table above, but you can also specify which combination is the one to use as
# the baseline, e.g., to change to another entry use: --base-variation '--tf32 1 --fp16 0'
#
# --target-metric-key is there to tell the program which metrics to compare - the different metric keys are
# inside output_dir/all_results.json. e.g., to measure eval performance instead of train use:
# --target-metric-key eval_samples_per_second
# but of course you will need to adjust the --base-cmd value in the example to perform evaluation as
# well (as currently it doesn't)
#
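#
# For illustration, the cartesian product the tool builds from the variation dimensions
# can be previewed with itertools.product (the variation strings below are assumed
# examples, not part of this tool's CLI):
#
#   import itertools
#   dims = [["--tf32 0", "--tf32 1"], ["", "--fp16", "--bf16"]]
#   variations = [" ".join(v).strip() for v in itertools.product(*dims)]
#   # -> ['--tf32 0', '--tf32 0 --fp16', '--tf32 0 --bf16',
#   #     '--tf32 1', '--tf32 1 --fp16', '--tf32 1 --bf16']
#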
import argparse
import datetime
import io
import itertools
import json
import math
import os
import platform
import re
import shlex
import subprocess
import sys
from pathlib import Path
from statistics import fmean
import pandas as pd
import torch
from tqdm import tqdm
import transformers
nan = float("nan")
class Tee:
"""
A helper class to tee print's output into a file.
Usage:
sys.stdout = Tee(filename)
"""
def __init__(self, filename):
self.stdout = sys.stdout
self.file = open(filename, "a")
def __getattr__(self, attr):
return getattr(self.stdout, attr)
def write(self, msg):
self.stdout.write(msg)
# strip tqdm codes
self.file.write(re.sub(r"^.*\r", "", msg, 0, re.M))
def get_original_command(max_width=80, full_python_path=False):
"""
Return the original command line string that can be replayed nicely and wrapped for 80 char width.
Args:
max_width (`int`, *optional*, defaults to 80):
The width to wrap for.
full_python_path (`bool`, `optional`, defaults to `False`):
Whether to replicate the full path or just the last segment (i.e. `python`).
"""
cmd = []
# deal with critical env vars
env_keys = ["CUDA_VISIBLE_DEVICES"]
for key in env_keys:
val = os.environ.get(key, None)
if val is not None:
cmd.append(f"{key}={val}")
# python executable (not always needed if the script is executable)
python = sys.executable if full_python_path else sys.executable.split("/")[-1]
cmd.append(python)
# now the normal args
cmd += list(map(shlex.quote, sys.argv))
# split up into up to MAX_WIDTH lines with shell multi-line escapes
lines = []
current_line = ""
while len(cmd) > 0:
current_line += f"{cmd.pop(0)} "
if len(cmd) == 0 or len(current_line) + len(cmd[0]) + 1 > max_width - 1:
lines.append(current_line)
current_line = ""
return "\\\n".join(lines)
def get_base_command(args, output_dir):
# unwrap multi-line input
args.base_cmd = re.sub(r"[\\\n]+", " ", args.base_cmd)
# remove --output_dir if any and set our own
args.base_cmd = re.sub("--output_dir\s+[^\s]+", "", args.base_cmd)
args.base_cmd += f" --output_dir {output_dir}"
# ensure we have --overwrite_output_dir
args.base_cmd = re.sub("--overwrite_output_dir\s+", "", args.base_cmd)
args.base_cmd += " --overwrite_output_dir"
return [sys.executable] + shlex.split(args.base_cmd)
def process_run_single(id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose):
# Enable to debug everything but the run itself, to do it fast and see the progress.
# This is useful for debugging the output formatting quickly - we can remove it later once
# everybody is happy with the output
if 0:
import random
from time import sleep
sleep(0)
return dict(
{k: random.uniform(0, 100) for k in metric_keys},
**{target_metric_key: random.choice([nan, 10.31, 100.2, 55.6666, 222.22222222])},
)
result = subprocess.run(cmd, capture_output=True, text=True)
if verbose:
print("STDOUT", result.stdout)
print("STDERR", result.stderr)
# save the streams
prefix = variation.replace(" ", "-")
with open(Path(output_dir) / f"log.{prefix}.stdout.txt", "w") as f:
f.write(result.stdout)
with open(Path(output_dir) / f"log.{prefix}.stderr.txt", "w") as f:
f.write(result.stderr)
if result.returncode != 0:
if verbose:
print("failed")
return {target_metric_key: nan}
with io.open(f"{output_dir}/all_results.json", "r", encoding="utf-8") as f:
metrics = json.load(f)
# filter out just the keys we want
return {k: v for k, v in metrics.items() if k in metric_keys}
def process_run(
id,
cmd,
variation_key,
variation,
longest_variation_len,
target_metric_key,
report_metric_keys,
repeat_times,
output_dir,
verbose,
):
results = []
metrics = []
preamble = f"{id}: {variation:<{longest_variation_len}}"
outcome = f"{preamble}: "
metric_keys = set(report_metric_keys + [target_metric_key])
for i in tqdm(range(repeat_times), desc=preamble, leave=False):
single_run_metrics = process_run_single(
id, cmd, variation, output_dir, target_metric_key, metric_keys, verbose
)
result = single_run_metrics[target_metric_key]
if not math.isnan(result):
metrics.append(single_run_metrics)
results.append(result)
outcome += ""
else:
outcome += ""
outcome = f"\33[2K\r{outcome}"
if len(metrics) > 0:
mean_metrics = {k: fmean([x[k] for x in metrics]) for k in metrics[0].keys()}
mean_target = round(mean_metrics[target_metric_key], 2)
results_str = f"{outcome} {mean_target}"
if len(metrics) > 1:
results_str += f" {tuple(round(x, 2) for x in results)}"
print(results_str)
mean_metrics[variation_key] = variation
return mean_metrics
else:
print(outcome)
return {variation_key: variation, target_metric_key: nan}
def get_versions():
properties = torch.cuda.get_device_properties(torch.device("cuda"))
return f"""
Datetime : {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Software:
transformers: {transformers.__version__}
torch : {torch.__version__}
cuda : {torch.version.cuda}
python : {platform.python_version()}
Hardware:
{torch.cuda.device_count()} GPUs : {properties.name}, {properties.total_memory/2**30:0.2f}GB
"""
def process_results(results, target_metric_key, report_metric_keys, base_variation, output_dir):
df = pd.DataFrame(results)
variation_key = "variation"
diff_key = "diff_%"
sentinel_value = nan
if base_variation is not None and len(df[df[variation_key] == base_variation]):
# this may still return nan
sentinel_value = df.loc[df[variation_key] == base_variation][target_metric_key].item()
if math.isnan(sentinel_value):
# as a fallback, use the minimal value as the sentinel
sentinel_value = df.loc[df[target_metric_key] != nan][target_metric_key].min()
# create diff column if possible
if not math.isnan(sentinel_value):
df[diff_key] = df.apply(
lambda r: round(100 * (r[target_metric_key] - sentinel_value) / sentinel_value)
if not math.isnan(r[target_metric_key])
else 0,
axis="columns",
)
# re-order columns
cols = [variation_key, target_metric_key, diff_key, *report_metric_keys]
df = df.reindex(cols, axis="columns") # reorder cols
# capitalize
df = df.rename(str.capitalize, axis="columns")
# make the cols as narrow as possible
df_github = df.rename(lambda c: c.replace("_", "<br>"), axis="columns")
df_console = df.rename(lambda c: c.replace("_", "\n"), axis="columns")
report = ["", "Copy between the cut-here-lines and paste as is to github or a forum"]
report += ["----------8<-----------------8<--------"]
report += ["*** Results:", df_github.to_markdown(index=False, floatfmt=".2f")]
report += ["```"]
report += ["*** Setup:", get_versions()]
report += ["*** The benchmark command line was:", get_original_command()]
report += ["```"]
report += ["----------8<-----------------8<--------"]
report += ["*** Results (console):", df_console.to_markdown(index=False, floatfmt=".2f")]
print("\n\n".join(report))
def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"--base-cmd",
default=None,
type=str,
required=True,
help="Base cmd",
)
parser.add_argument(
"--variations",
default=None,
type=str,
nargs="+",
required=True,
help="Multi-dimensional variations, example: '|--fp16|--bf16' '|--tf32'",
)
parser.add_argument(
"--base-variation",
default=None,
type=str,
help="Baseline variation to compare to. if None the minimal target value will be used to compare against",
)
parser.add_argument(
"--target-metric-key",
default=None,
type=str,
required=True,
help="Target metric key in output_dir/all_results.json, e.g., train_samples_per_second",
)
parser.add_argument(
"--report-metric-keys",
default="",
type=str,
help="Report metric keys - other metric keys from output_dir/all_results.json to report, e.g., train_loss. Use a single argument e.g., 'train_loss train_samples",
)
parser.add_argument(
"--repeat-times",
default=1,
type=int,
help="How many times to re-run each variation - an average will be reported",
)
parser.add_argument(
"--output_dir",
default="output_benchmark",
type=str,
help="The output directory where all the benchmark reports will go to and additionally this directory will be used to override --output_dir in the script that is being benchmarked",
)
parser.add_argument(
"--verbose",
default=False,
action="store_true",
help="Whether to show the outputs of each run or just the benchmark progress",
)
args = parser.parse_args()
output_dir = args.output_dir
Path(output_dir).mkdir(exist_ok=True)
base_cmd = get_base_command(args, output_dir)
# split each dimension into its --foo variations
dims = [list(map(str.strip, re.split(r"\|", x))) for x in args.variations]
# build a cartesian product of dimensions and convert those back into cmd-line arg strings,
# while stripping white space for inputs that were empty
variations = list(map(str.strip, map(" ".join, itertools.product(*dims))))
longest_variation_len = max(len(x) for x in variations)
# split wanted keys
report_metric_keys = args.report_metric_keys.split()
# capture prints into a log file for convenience
report_fn = f"benchmark-report-{datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}.txt"
print(f"\nNote: each run's output is also logged under {output_dir}/log.*.std*.txt")
print(f"and this script's output is also piped into {report_fn}")
sys.stdout = Tee(report_fn)
print(f"\n*** Running {len(variations)} benchmarks:")
print(f"Base command: {' '.join(base_cmd)}")
variation_key = "variation"
results = []
for id, variation in enumerate(tqdm(variations, desc="Total completion: ", leave=False)):
cmd = base_cmd + variation.split()
results.append(
process_run(
id + 1,
cmd,
variation_key,
variation,
longest_variation_len,
args.target_metric_key,
report_metric_keys,
args.repeat_times,
output_dir,
args.verbose,
)
)
process_results(results, args.target_metric_key, report_metric_keys, args.base_variation, output_dir)
if __name__ == "__main__":
main()

View File

@ -1,85 +0,0 @@
import time
import torch
from transformers import AutoModel, AutoTokenizer, pipeline
test_sentence = 'Do you [MASK] the muffin man?'
# for comparison
bert = pipeline('fill-mask', model = 'bert-base-uncased')
print('\n'.join([d['sequence'] for d in bert(test_sentence)]))
deberta = pipeline('fill-mask', model = 'microsoft/deberta-v3-base', model_kwargs={"legacy": False})
print('\n'.join([d['sequence'] for d in deberta(test_sentence)]))
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-v3-base")
tokenized_dict = tokenizer(
["Is this working",], ["Not yet",],
return_tensors="pt"
)
deberta.model.forward = torch.compile(deberta.model.forward)
start=time.time()
deberta.model(**tokenized_dict)
end=time.time()
print(end-start)
start=time.time()
deberta.model(**tokenized_dict)
end=time.time()
print(end-start)
start=time.time()
deberta.model(**tokenized_dict)
end=time.time()
print(end-start)
model = AutoModel.from_pretrained('microsoft/deberta-base')
model.config.return_dict = False
model.config.output_hidden_states=False
input_tuple = (tokenized_dict['input_ids'], tokenized_dict['attention_mask'])
start=time.time()
traced_model = torch.jit.trace(model, input_tuple)
end=time.time()
print(end-start)
start=time.time()
traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask'])
end=time.time()
print(end-start)
start=time.time()
traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask'])
end=time.time()
print(end-start)
start=time.time()
traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask'])
end=time.time()
print(end-start)
start=time.time()
traced_model(tokenized_dict['input_ids'], tokenized_dict['attention_mask'])
end=time.time()
print(end-start)
torch.jit.save(traced_model, "compiled_deberta.pt")
# my_script_module = torch.jit.script(model)

View File

@ -1,71 +0,0 @@
#!/usr/bin/env bash
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this script acquires data and converts it to fsmt model
# it covers:
# - allenai/wmt16-en-de-dist-12-1
# - allenai/wmt16-en-de-dist-6-1
# - allenai/wmt16-en-de-12-1
# this script needs to be run from the top level of the transformers repo
if [ ! -d "src/transformers" ]; then
echo "Error: This script needs to be run from the top of the transformers repo"
exit 1
fi
mkdir data
# get data (run once)
cd data
gdown 'https://drive.google.com/uc?id=1x_G2cjvM1nW5hjAB8-vWxRqtQTlmIaQU'
gdown 'https://drive.google.com/uc?id=1oA2aqZlVNj5FarxBlNXEHpBS4lRetTzU'
gdown 'https://drive.google.com/uc?id=1Wup2D318QYBFPW_NKI1mfP_hXOfmUI9r'
tar -xvzf trans_ende_12-1_0.2.tar.gz
tar -xvzf trans_ende-dist_12-1_0.2.tar.gz
tar -xvzf trans_ende-dist_6-1_0.2.tar.gz
gdown 'https://drive.google.com/uc?id=1mNufoynJ9-Zy1kJh2TA_lHm2squji0i9'
gdown 'https://drive.google.com/uc?id=1iO7um-HWoNoRKDtw27YUSgyeubn9uXqj'
tar -xvzf wmt16.en-de.deep-shallow.dist.tar.gz
tar -xvzf wmt16.en-de.deep-shallow.tar.gz
cp wmt16.en-de.deep-shallow/data-bin/dict.*.txt trans_ende_12-1_0.2
cp wmt16.en-de.deep-shallow.dist/data-bin/dict.*.txt trans_ende-dist_12-1_0.2
cp wmt16.en-de.deep-shallow.dist/data-bin/dict.*.txt trans_ende-dist_6-1_0.2
cp wmt16.en-de.deep-shallow/bpecodes trans_ende_12-1_0.2
cp wmt16.en-de.deep-shallow.dist/bpecodes trans_ende-dist_12-1_0.2
cp wmt16.en-de.deep-shallow.dist/bpecodes trans_ende-dist_6-1_0.2
cd -
# run conversions and uploads
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/trans_ende-dist_12-1_0.2/checkpoint_top5_average.pt --pytorch_dump_folder_path data/wmt16-en-de-dist-12-1
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/trans_ende-dist_6-1_0.2/checkpoint_top5_average.pt --pytorch_dump_folder_path data/wmt16-en-de-dist-6-1
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/trans_ende_12-1_0.2/checkpoint_top5_average.pt --pytorch_dump_folder_path data/wmt16-en-de-12-1
# upload
cd data
transformers-cli upload -y wmt16-en-de-dist-12-1
transformers-cli upload -y wmt16-en-de-dist-6-1
transformers-cli upload -y wmt16-en-de-12-1
cd -
# if updating just small files and not the large models, here is a script to generate the right commands:
perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for ("wmt16-en-de-dist-12-1", "wmt16-en-de-dist-6-1", "wmt16-en-de-12-1")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json
# add/remove files as needed

View File

@ -1,59 +0,0 @@
#!/usr/bin/env bash
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this script acquires data and converts it to fsmt model
# it covers:
# - allenai/wmt19-de-en-6-6-base
# - allenai/wmt19-de-en-6-6-big
# this script needs to be run from the top level of the transformers repo
if [ ! -d "src/transformers" ]; then
echo "Error: This script needs to be run from the top of the transformers repo"
exit 1
fi
mkdir data
# get data (run once)
cd data
gdown 'https://drive.google.com/uc?id=1j6z9fYdlUyOYsh7KJoumRlr1yHczxR5T'
gdown 'https://drive.google.com/uc?id=1yT7ZjqfvUYOBXvMjeY8uGRHQFWoSo8Q5'
gdown 'https://drive.google.com/uc?id=15gAzHeRUCs-QV8vHeTReMPEh1j8excNE'
tar -xvzf wmt19.de-en.tar.gz
tar -xvzf wmt19_deen_base_dr0.1_1.tar.gz
tar -xvzf wmt19_deen_big_dr0.1_2.tar.gz
cp wmt19.de-en/data-bin/dict.*.txt wmt19_deen_base_dr0.1_1
cp wmt19.de-en/data-bin/dict.*.txt wmt19_deen_big_dr0.1_2
cd -
# run conversions and uploads
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19_deen_base_dr0.1_1/checkpoint_last3_avg.pt --pytorch_dump_folder_path data/wmt19-de-en-6-6-base
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19_deen_big_dr0.1_2/checkpoint_last3_avg.pt --pytorch_dump_folder_path data/wmt19-de-en-6-6-big
# upload
cd data
transformers-cli upload -y wmt19-de-en-6-6-base
transformers-cli upload -y wmt19-de-en-6-6-big
cd -
# if updating just small files and not the large models, here is a script to generate the right commands:
perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for ("wmt19-de-en-6-6-base", "wmt19-de-en-6-6-big")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json
# add/remove files as needed

View File

@ -1,70 +0,0 @@
#!/usr/bin/env bash
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this script acquires data and converts it to fsmt model
# it covers:
# - facebook/wmt19-ru-en
# - facebook/wmt19-en-ru
# - facebook/wmt19-de-en
# - facebook/wmt19-en-de
# this script needs to be run from the top level of the transformers repo
if [ ! -d "src/transformers" ]; then
echo "Error: This script needs to be run from the top of the transformers repo"
exit 1
fi
mkdir data
# get data (run once)
cd data
wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-de.joined-dict.ensemble.tar.gz
wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.de-en.joined-dict.ensemble.tar.gz
wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.en-ru.ensemble.tar.gz
wget https://dl.fbaipublicfiles.com/fairseq/models/wmt19.ru-en.ensemble.tar.gz
tar -xvzf wmt19.en-de.joined-dict.ensemble.tar.gz
tar -xvzf wmt19.de-en.joined-dict.ensemble.tar.gz
tar -xvzf wmt19.en-ru.ensemble.tar.gz
tar -xvzf wmt19.ru-en.ensemble.tar.gz
cd -
# run conversions and uploads
export PAIR=ru-en
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19.$PAIR.ensemble/model4.pt --pytorch_dump_folder_path data/wmt19-$PAIR
export PAIR=en-ru
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19.$PAIR.ensemble/model4.pt --pytorch_dump_folder_path data/wmt19-$PAIR
export PAIR=de-en
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19.$PAIR.joined-dict.ensemble/model4.pt --pytorch_dump_folder_path data/wmt19-$PAIR
export PAIR=en-de
PYTHONPATH="src" python src/transformers/convert_fsmt_original_pytorch_checkpoint_to_pytorch.py --fsmt_checkpoint_path data/wmt19.$PAIR.joined-dict.ensemble/model4.pt --pytorch_dump_folder_path data/wmt19-$PAIR
# upload
cd data
transformers-cli upload -y wmt19-ru-en
transformers-cli upload -y wmt19-en-ru
transformers-cli upload -y wmt19-de-en
transformers-cli upload -y wmt19-en-de
cd -
# if updating just small files and not the large models, here is a script to generate the right commands:
perl -le 'for $f (@ARGV) { print qq[transformers-cli upload -y $_/$f --filename $_/$f] for map { "wmt19-$_" } ("en-ru", "ru-en", "de-en", "en-de")}' vocab-src.json vocab-tgt.json tokenizer_config.json config.json
# add/remove files as needed

View File

@ -1,79 +0,0 @@
#!/usr/bin/env bash
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this script evals the following fsmt models
# it covers:
# - allenai/wmt16-en-de-dist-12-1
# - allenai/wmt16-en-de-dist-6-1
# - allenai/wmt16-en-de-12-1
# this script needs to be run from the top level of the transformers repo
if [ ! -d "src/transformers" ]; then
echo "Error: This script needs to be run from the top of the transformers repo"
exit 1
fi
# In these scripts you may have to lower BS if you get CUDA OOM (or increase it if you have a large GPU)
### Normal eval ###
export PAIR=en-de
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=64
export NUM_BEAMS=5
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
MODEL_PATH=allenai/wmt16-en-de-dist-12-1
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
MODEL_PATH=allenai/wmt16-en-de-dist-6-1
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
MODEL_PATH=allenai/wmt16-en-de-12-1
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
### Searching hparams eval ###
export PAIR=en-de
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=32
export NUM_BEAMS=5
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
MODEL_PATH=allenai/wmt16-en-de-dist-12-1
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"
MODEL_PATH=allenai/wmt16-en-de-dist-6-1
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"
MODEL_PATH=allenai/wmt16-en-de-12-1
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"

View File

@ -1,67 +0,0 @@
#!/usr/bin/env bash
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this script evals the following fsmt models
# it covers:
# - allenai/wmt19-de-en-6-6-base
# - allenai/wmt19-de-en-6-6-big
# this script needs to be run from the top level of the transformers repo
if [ ! -d "src/transformers" ]; then
echo "Error: This script needs to be run from the top of the transformers repo"
exit 1
fi
# In these scripts you may have to lower BS if you get CUDA OOM (or increase it if you have a large GPU)
### Normal eval ###
export PAIR=de-en
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=64
export NUM_BEAMS=5
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
MODEL_PATH=allenai/wmt19-de-en-6-6-base
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
MODEL_PATH=allenai/wmt19-de-en-6-6-big
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
### Searching hparams eval ###
export PAIR=de-en
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=16
export NUM_BEAMS=5
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
MODEL_PATH=allenai/wmt19-de-en-6-6-base
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"
MODEL_PATH=allenai/wmt19-de-en-6-6-big
echo $PAIR $MODEL_PATH
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py $MODEL_PATH $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:10:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"

View File

@ -1,161 +0,0 @@
#!/usr/bin/env bash
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this script evals the following fsmt models
# it covers:
# - facebook/wmt19-ru-en
# - facebook/wmt19-en-ru
# - facebook/wmt19-de-en
# - facebook/wmt19-en-de
# this script needs to be run from the top level of the transformers repo
if [ ! -d "src/transformers" ]; then
echo "Error: This script needs to be run from the top of the transformers repo"
exit 1
fi
# In these scripts you may have to lower BS if you get CUDA OOM (or increase it if you have a large GPU)
### a short estimate version for quick testing ###
export PAIR=en-ru
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=8
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src | head -10 > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref | head -10 > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
### Normal eval ###
# ru-en
export PAIR=ru-en
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=50
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
# (target BLEU: 41.3 http://matrix.statmt.org/matrix/output/1907?run_id=6937)
# en-ru
export PAIR=en-ru
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=50
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
# (target BLEU: 36.4 http://matrix.statmt.org/matrix/output/1914?score_id=37605)
# en-de
export PAIR=en-de
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
# (target BLEU: 43.1 http://matrix.statmt.org/matrix/output/1909?run_id=6862)
# de-en
export PAIR=de-en
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=50
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
# (target BLEU: 42.3 http://matrix.statmt.org/matrix/output/1902?run_id=6750)
### Searching hparams eval ###
# en-ru
export PAIR=ru-en
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=32
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
CUDA_VISIBLE_DEVICES="0" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1"
# en-ru
export PAIR=en-ru
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=16
mkdir -p $DATA_DIR
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
CUDA_VISIBLE_DEVICES="0" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:8:11:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1 early_stopping=true:false"
# en-de
export PAIR=en-de
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=16
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
CUDA_VISIBLE_DEVICES="1" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:8:11:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1 early_stopping=true:false"
# de-en
export PAIR=de-en
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=16
mkdir -p $DATA_DIR
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
CUDA_VISIBLE_DEVICES="1" PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval_search.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --search="num_beams=5:8:11:15 length_penalty=0.6:0.7:0.8:0.9:1.0:1.1 early_stopping=true:false"

View File

@ -1,88 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script creates a super tiny model that is useful inside tests, when we just want to test that
# the machinery works, without needing to check the quality of the outcomes.
#
# This version creates a tiny vocab first, and then a tiny model - so the outcome is truly tiny -
# all files ~60KB. Compare this to taking a full-size model and reducing its layers and
# emb dimensions to the minimum while keeping the full vocab + merges files, which leads to ~3MB in total for all files.
# The latter is done by `fsmt-make-super-tiny-model.py`.
#
# It will be used then as "stas/tiny-wmt19-en-ru"
import json
import tempfile
from pathlib import Path
from transformers import FSMTConfig, FSMTForConditionalGeneration, FSMTTokenizer
from transformers.models.fsmt.tokenization_fsmt import VOCAB_FILES_NAMES
mname_tiny = "tiny-wmt19-en-ru"
# Build
# borrowed from a test
vocab = [ "l", "o", "w", "e", "r", "s", "t", "i", "d", "n", "w</w>", "r</w>", "t</w>", "lo", "low", "er</w>", "low</w>", "lowest</w>", "newer</w>", "wider</w>", "<unk>", ]
vocab_tokens = dict(zip(vocab, range(len(vocab))))
merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
with tempfile.TemporaryDirectory() as tmpdirname:
build_dir = Path(tmpdirname)
src_vocab_file = build_dir / VOCAB_FILES_NAMES["src_vocab_file"]
tgt_vocab_file = build_dir / VOCAB_FILES_NAMES["tgt_vocab_file"]
merges_file = build_dir / VOCAB_FILES_NAMES["merges_file"]
with open(src_vocab_file, "w") as fp: fp.write(json.dumps(vocab_tokens))
with open(tgt_vocab_file, "w") as fp: fp.write(json.dumps(vocab_tokens))
with open(merges_file, "w") as fp : fp.write("\n".join(merges))
tokenizer = FSMTTokenizer(
langs=["en", "ru"],
src_vocab_size = len(vocab),
tgt_vocab_size = len(vocab),
src_vocab_file=src_vocab_file,
tgt_vocab_file=tgt_vocab_file,
merges_file=merges_file,
)
config = FSMTConfig(
langs=['ru', 'en'],
src_vocab_size=1000, tgt_vocab_size=1000,
d_model=4,
encoder_layers=1, decoder_layers=1,
encoder_ffn_dim=4, decoder_ffn_dim=4,
encoder_attention_heads=1, decoder_attention_heads=1,
)
tiny_model = FSMTForConditionalGeneration(config)
print(f"num of params {tiny_model.num_parameters()}")
# Test
batch = tokenizer(["Making tiny model"], return_tensors="pt")
outputs = tiny_model(**batch)
print("test output:", len(outputs.logits[0]))
# Save
tiny_model.half() # makes it smaller
tiny_model.save_pretrained(mname_tiny)
tokenizer.save_pretrained(mname_tiny)
print(f"Generated {mname_tiny}")
# Upload
# transformers-cli upload tiny-wmt19-en-ru

View File

@ -1,61 +0,0 @@
#!/usr/bin/env python
# coding: utf-8
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This script creates a super tiny model that is useful inside tests, when we just want to test that
# the machinery works, without needing to check the quality of the outcomes.
#
# This version creates a tiny model through reduction of a normal pre-trained model, but keeping the
# full vocab, merges file, and thus also resulting in a larger model due to a large vocab size.
# This gives ~3MB in total for all files.
#
# If you want one 50 times smaller than this, see `fsmt-make-super-tiny-model.py`, which is slightly more complicated
#
#
# It will be used then as "stas/tiny-wmt19-en-de"
# Build
from transformers import FSMTConfig, FSMTForConditionalGeneration, FSMTTokenizer
mname = "facebook/wmt19-en-de"
tokenizer = FSMTTokenizer.from_pretrained(mname)
# get the correct vocab sizes, etc. from the master model
config = FSMTConfig.from_pretrained(mname)
config.update({
"d_model": 4,
"encoder_layers": 1, "decoder_layers": 1,
"encoder_ffn_dim": 4, "decoder_ffn_dim": 4,
"encoder_attention_heads": 1, "decoder_attention_heads": 1})
tiny_model = FSMTForConditionalGeneration(config)
print(f"num of params {tiny_model.num_parameters()}")
# Test
batch = tokenizer(["Making tiny model"], return_tensors="pt")
outputs = tiny_model(**batch)
print("test output:", len(outputs.logits[0]))
# Save
mname_tiny = "tiny-wmt19-en-de"
tiny_model.half() # makes it smaller
tiny_model.save_pretrained(mname_tiny)
tokenizer.save_pretrained(mname_tiny)
print(f"Generated {mname_tiny}")
# Upload
# transformers-cli upload tiny-wmt19-en-de

View File

@ -1,156 +0,0 @@
#!/usr/bin/env python
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Usage:
# ./gen-card-allenai-wmt16.py
import os
from pathlib import Path
def write_model_card(model_card_dir, src_lang, tgt_lang, model_name):
texts = {
"en": "Machine learning is great, isn't it?",
"ru": "Машинное обучение - это здорово, не так ли?",
"de": "Maschinelles Lernen ist großartig, nicht wahr?",
}
# BLEU scores as follows:
# "pair": [fairseq, transformers]
scores = {
"wmt16-en-de-dist-12-1": [28.3, 27.52],
"wmt16-en-de-dist-6-1": [27.4, 27.11],
"wmt16-en-de-12-1": [26.9, 25.75],
}
pair = f"{src_lang}-{tgt_lang}"
readme = f"""
---
language:
- {src_lang}
- {tgt_lang}
thumbnail:
tags:
- translation
- wmt16
- allenai
license: apache-2.0
datasets:
- wmt16
metrics:
- bleu
---
# FSMT
## Model description
This is a ported version of fairseq-based [wmt16 transformer](https://github.com/jungokasai/deep-shallow/) for {src_lang}-{tgt_lang}.
For more details, please, see [Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff in Machine Translation](https://arxiv.org/abs/2006.10369).
All 3 models are available:
* [wmt16-en-de-dist-12-1](https://huggingface.co/allenai/wmt16-en-de-dist-12-1)
* [wmt16-en-de-dist-6-1](https://huggingface.co/allenai/wmt16-en-de-dist-6-1)
* [wmt16-en-de-12-1](https://huggingface.co/allenai/wmt16-en-de-12-1)
## Intended uses & limitations
#### How to use
```python
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
mname = "allenai/{model_name}"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)
input = "{texts[src_lang]}"
input_ids = tokenizer.encode(input, return_tensors="pt")
outputs = model.generate(input_ids)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded) # {texts[tgt_lang]}
```
#### Limitations and bias
## Training data
Pretrained weights were left identical to the original model released by allenai. For more details, please, see the [paper](https://arxiv.org/abs/2006.10369).
## Eval results
Here are the BLEU scores:
model | fairseq | transformers
-------|---------|----------
{model_name} | {scores[model_name][0]} | {scores[model_name][1]}
The score is slightly below the score reported in the paper, as the researchers don't use `sacrebleu` and measure the score on tokenized outputs. `transformers` score was measured using `sacrebleu` on detokenized outputs.
The score was calculated using this code:
```bash
git clone https://github.com/huggingface/transformers
cd transformers
export PAIR={pair}
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=5
mkdir -p $DATA_DIR
sacrebleu -t wmt16 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt16 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py allenai/{model_name} $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
```
## Data Sources
- [training, etc.](http://www.statmt.org/wmt16/)
- [test set](http://matrix.statmt.org/test_sets/newstest2016.tgz?1504722372)
### BibTeX entry and citation info
```
@misc{{kasai2020deep,
title={{Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff in Machine Translation}},
author={{Jungo Kasai and Nikolaos Pappas and Hao Peng and James Cross and Noah A. Smith}},
year={{2020}},
eprint={{2006.10369}},
archivePrefix={{arXiv}},
primaryClass={{cs.CL}}
}}
```
"""
model_card_dir.mkdir(parents=True, exist_ok=True)
path = os.path.join(model_card_dir, "README.md")
print(f"Generating {path}")
with open(path, "w", encoding="utf-8") as f:
f.write(readme)
# make sure we are under the root of the project
repo_dir = Path(__file__).resolve().parent.parent.parent
model_cards_dir = repo_dir / "model_cards"
for model_name in ["wmt16-en-de-dist-12-1", "wmt16-en-de-dist-6-1", "wmt16-en-de-12-1"]:
model_card_dir = model_cards_dir / "allenai" / model_name
write_model_card(model_card_dir, src_lang="en", tgt_lang="de", model_name=model_name)

View File

@ -1,153 +0,0 @@
#!/usr/bin/env python
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Usage:
# ./gen-card-allenai-wmt19.py
import os
from pathlib import Path
def write_model_card(model_card_dir, src_lang, tgt_lang, model_name):
texts = {
"en": "Machine learning is great, isn't it?",
"ru": "Машинное обучение - это здорово, не так ли?",
"de": "Maschinelles Lernen ist großartig, nicht wahr?",
}
# BLEU scores as follows:
# "pair": [fairseq, transformers]
scores = {
"wmt19-de-en-6-6-base": [0, 38.37],
"wmt19-de-en-6-6-big": [0, 39.90],
}
pair = f"{src_lang}-{tgt_lang}"
readme = f"""
---
language:
- {src_lang}
- {tgt_lang}
thumbnail:
tags:
- translation
- wmt19
- allenai
license: apache-2.0
datasets:
- wmt19
metrics:
- bleu
---
# FSMT
## Model description
This is a ported version of fairseq-based [wmt19 transformer](https://github.com/jungokasai/deep-shallow/) for {src_lang}-{tgt_lang}.
For more details, please, see [Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff in Machine Translation](https://arxiv.org/abs/2006.10369).
2 models are available:
* [wmt19-de-en-6-6-big](https://huggingface.co/allenai/wmt19-de-en-6-6-big)
* [wmt19-de-en-6-6-base](https://huggingface.co/allenai/wmt19-de-en-6-6-base)
## Intended uses & limitations
#### How to use
```python
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
mname = "allenai/{model_name}"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)
input = "{texts[src_lang]}"
input_ids = tokenizer.encode(input, return_tensors="pt")
outputs = model.generate(input_ids)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded) # {texts[tgt_lang]}
```
#### Limitations and bias
## Training data
Pretrained weights were left identical to the original model released by allenai. For more details, please, see the [paper](https://arxiv.org/abs/2006.10369).
## Eval results
Here are the BLEU scores:
model | transformers
-------|---------
{model_name} | {scores[model_name][1]}
The score was calculated using this code:
```bash
git clone https://github.com/huggingface/transformers
cd transformers
export PAIR={pair}
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=5
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py allenai/{model_name} $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
```
## Data Sources
- [training, etc.](http://www.statmt.org/wmt19/)
- [test set](http://matrix.statmt.org/test_sets/newstest2019.tgz?1556572561)
### BibTeX entry and citation info
```
@misc{{kasai2020deep,
title={{Deep Encoder, Shallow Decoder: Reevaluating the Speed-Quality Tradeoff in Machine Translation}},
author={{Jungo Kasai and Nikolaos Pappas and Hao Peng and James Cross and Noah A. Smith}},
year={{2020}},
eprint={{2006.10369}},
archivePrefix={{arXiv}},
primaryClass={{cs.CL}}
}}
```
"""
model_card_dir.mkdir(parents=True, exist_ok=True)
path = os.path.join(model_card_dir, "README.md")
print(f"Generating {path}")
with open(path, "w", encoding="utf-8") as f:
f.write(readme)
# make sure we are under the root of the project
repo_dir = Path(__file__).resolve().parent.parent.parent
model_cards_dir = repo_dir / "model_cards"
for model_name in ["wmt19-de-en-6-6-base", "wmt19-de-en-6-6-big"]:
model_card_dir = model_cards_dir / "allenai" / model_name
write_model_card(model_card_dir, src_lang="de", tgt_lang="en", model_name=model_name)


@ -1,165 +0,0 @@
#!/usr/bin/env python
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Usage:
# ./gen-card-facebook-wmt19.py
import os
from pathlib import Path
def write_model_card(model_card_dir, src_lang, tgt_lang):
texts = {
"en": "Machine learning is great, isn't it?",
"ru": "Машинное обучение - это здорово, не так ли?",
"de": "Maschinelles Lernen ist großartig, oder?",
}
# BLEU scores as follows:
# "pair": [fairseq, transformers]
scores = {
"ru-en": ["[41.3](http://matrix.statmt.org/matrix/output/1907?run_id=6937)", "39.20"],
"en-ru": ["[36.4](http://matrix.statmt.org/matrix/output/1914?run_id=6724)", "33.47"],
"en-de": ["[43.1](http://matrix.statmt.org/matrix/output/1909?run_id=6862)", "42.83"],
"de-en": ["[42.3](http://matrix.statmt.org/matrix/output/1902?run_id=6750)", "41.35"],
}
pair = f"{src_lang}-{tgt_lang}"
readme = f"""
---
language:
- {src_lang}
- {tgt_lang}
thumbnail:
tags:
- translation
- wmt19
- facebook
license: apache-2.0
datasets:
- wmt19
metrics:
- bleu
---
# FSMT
## Model description
This is a ported version of [fairseq wmt19 transformer](https://github.com/pytorch/fairseq/blob/master/examples/wmt19/README.md) for {src_lang}-{tgt_lang}.
For more details, please see [Facebook FAIR's WMT19 News Translation Task Submission](https://arxiv.org/abs/1907.06616).
The abbreviation FSMT stands for FairSeqMachineTranslation.
All four models are available:
* [wmt19-en-ru](https://huggingface.co/facebook/wmt19-en-ru)
* [wmt19-ru-en](https://huggingface.co/facebook/wmt19-ru-en)
* [wmt19-en-de](https://huggingface.co/facebook/wmt19-en-de)
* [wmt19-de-en](https://huggingface.co/facebook/wmt19-de-en)
## Intended uses & limitations
#### How to use
```python
from transformers import FSMTForConditionalGeneration, FSMTTokenizer
mname = "facebook/wmt19-{src_lang}-{tgt_lang}"
tokenizer = FSMTTokenizer.from_pretrained(mname)
model = FSMTForConditionalGeneration.from_pretrained(mname)
input = "{texts[src_lang]}"
input_ids = tokenizer.encode(input, return_tensors="pt")
outputs = model.generate(input_ids)
decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(decoded) # {texts[tgt_lang]}
```
#### Limitations and bias
- The original (and this ported model) doesn't seem to handle well inputs with repeated sub-phrases, [content gets truncated](https://discuss.huggingface.co/t/issues-with-translating-inputs-containing-repeated-phrases/981)
## Training data
Pretrained weights were left identical to the original model released by fairseq. For more details, please see the [paper](https://arxiv.org/abs/1907.06616).
## Eval results
pair | fairseq | transformers
-------|---------|----------
{pair} | {scores[pair][0]} | {scores[pair][1]}
The score is slightly below the score reported by `fairseq`, since `transformers` currently doesn't support:
- model ensemble, therefore the best performing checkpoint was ported (`model4.pt`).
- re-ranking
The score was calculated using this code:
```bash
git clone https://github.com/huggingface/transformers
cd transformers
export PAIR={pair}
export DATA_DIR=data/$PAIR
export SAVE_DIR=data/$PAIR
export BS=8
export NUM_BEAMS=15
mkdir -p $DATA_DIR
sacrebleu -t wmt19 -l $PAIR --echo src > $DATA_DIR/val.source
sacrebleu -t wmt19 -l $PAIR --echo ref > $DATA_DIR/val.target
echo $PAIR
PYTHONPATH="src:examples/seq2seq" python examples/seq2seq/run_eval.py facebook/wmt19-$PAIR $DATA_DIR/val.source $SAVE_DIR/test_translations.txt --reference_path $DATA_DIR/val.target --score_path $SAVE_DIR/test_bleu.json --bs $BS --task translation --num_beams $NUM_BEAMS
```
note: fairseq reports using a beam of 50, so you should get a slightly higher score if re-run with `--num_beams 50`.
## Data Sources
- [training, etc.](http://www.statmt.org/wmt19/)
- [test set](http://matrix.statmt.org/test_sets/newstest2019.tgz?1556572561)
### BibTeX entry and citation info
```bibtex
@inproceedings{{...,
year={{2020}},
title={{Facebook FAIR's WMT19 News Translation Task Submission}},
author={{Ng, Nathan and Yee, Kyra and Baevski, Alexei and Ott, Myle and Auli, Michael and Edunov, Sergey}},
booktitle={{Proc. of WMT}},
}}
```
## TODO
- port model ensemble (fairseq uses 4 model checkpoints)
"""
os.makedirs(model_card_dir, exist_ok=True)
path = os.path.join(model_card_dir, "README.md")
print(f"Generating {path}")
with open(path, "w", encoding="utf-8") as f:
f.write(readme)
# make sure we are under the root of the project
repo_dir = Path(__file__).resolve().parent.parent.parent
model_cards_dir = repo_dir / "model_cards"
for model_name in ["wmt19-ru-en", "wmt19-en-ru", "wmt19-en-de", "wmt19-de-en"]:
base, src_lang, tgt_lang = model_name.split("-")
model_card_dir = model_cards_dir / "facebook" / model_name
write_model_card(model_card_dir, src_lang=src_lang, tgt_lang=tgt_lang)


@ -1,116 +0,0 @@
# This is the process of uploading the updated models to S3. As I can't upload them directly to the correct orgs, this script shows how it is done.
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
1. upload updated models to my account
transformers-cli upload -y wmt19-ru-en
transformers-cli upload -y wmt19-en-ru
transformers-cli upload -y wmt19-de-en
transformers-cli upload -y wmt19-en-de
transformers-cli upload -y wmt19-de-en-6-6-base
transformers-cli upload -y wmt19-de-en-6-6-big
transformers-cli upload -y wmt16-en-de-dist-12-1
transformers-cli upload -y wmt16-en-de-dist-6-1
transformers-cli upload -y wmt16-en-de-12-1
2. ask someone to move them to:
* to facebook: "wmt19-ru-en", "wmt19-en-ru", "wmt19-en-de", "wmt19-de-en"
* to allenai: "wmt16-en-de-dist-12-1", "wmt16-en-de-dist-6-1", "wmt16-en-de-12-1", "wmt19-de-en-6-6-base", "wmt19-de-en-6-6-big"
export b="s3://models.huggingface.co/bert"
stas_to_fb () {
src=$1
shift
aws s3 sync $b/stas/$src $b/facebook/$src $@
}
stas_to_allenai () {
src=$1
shift
aws s3 sync $b/stas/$src $b/allenai/$src $@
}
stas_to_fb wmt19-en-ru
stas_to_fb wmt19-ru-en
stas_to_fb wmt19-en-de
stas_to_fb wmt19-de-en
stas_to_allenai wmt16-en-de-dist-12-1
stas_to_allenai wmt16-en-de-dist-6-1
stas_to_allenai wmt16-en-de-6-1
stas_to_allenai wmt16-en-de-12-1
stas_to_allenai wmt19-de-en-6-6-base
stas_to_allenai wmt19-de-en-6-6-big
3. and then remove all these model files from my account
transformers-cli s3 rm wmt16-en-de-12-1/config.json
transformers-cli s3 rm wmt16-en-de-12-1/merges.txt
transformers-cli s3 rm wmt16-en-de-12-1/pytorch_model.bin
transformers-cli s3 rm wmt16-en-de-12-1/tokenizer_config.json
transformers-cli s3 rm wmt16-en-de-12-1/vocab-src.json
transformers-cli s3 rm wmt16-en-de-12-1/vocab-tgt.json
transformers-cli s3 rm wmt16-en-de-dist-12-1/config.json
transformers-cli s3 rm wmt16-en-de-dist-12-1/merges.txt
transformers-cli s3 rm wmt16-en-de-dist-12-1/pytorch_model.bin
transformers-cli s3 rm wmt16-en-de-dist-12-1/tokenizer_config.json
transformers-cli s3 rm wmt16-en-de-dist-12-1/vocab-src.json
transformers-cli s3 rm wmt16-en-de-dist-12-1/vocab-tgt.json
transformers-cli s3 rm wmt16-en-de-dist-6-1/config.json
transformers-cli s3 rm wmt16-en-de-dist-6-1/merges.txt
transformers-cli s3 rm wmt16-en-de-dist-6-1/pytorch_model.bin
transformers-cli s3 rm wmt16-en-de-dist-6-1/tokenizer_config.json
transformers-cli s3 rm wmt16-en-de-dist-6-1/vocab-src.json
transformers-cli s3 rm wmt16-en-de-dist-6-1/vocab-tgt.json
transformers-cli s3 rm wmt19-de-en-6-6-base/config.json
transformers-cli s3 rm wmt19-de-en-6-6-base/merges.txt
transformers-cli s3 rm wmt19-de-en-6-6-base/pytorch_model.bin
transformers-cli s3 rm wmt19-de-en-6-6-base/tokenizer_config.json
transformers-cli s3 rm wmt19-de-en-6-6-base/vocab-src.json
transformers-cli s3 rm wmt19-de-en-6-6-base/vocab-tgt.json
transformers-cli s3 rm wmt19-de-en-6-6-big/config.json
transformers-cli s3 rm wmt19-de-en-6-6-big/merges.txt
transformers-cli s3 rm wmt19-de-en-6-6-big/pytorch_model.bin
transformers-cli s3 rm wmt19-de-en-6-6-big/tokenizer_config.json
transformers-cli s3 rm wmt19-de-en-6-6-big/vocab-src.json
transformers-cli s3 rm wmt19-de-en-6-6-big/vocab-tgt.json
transformers-cli s3 rm wmt19-de-en/config.json
transformers-cli s3 rm wmt19-de-en/merges.txt
transformers-cli s3 rm wmt19-de-en/pytorch_model.bin
transformers-cli s3 rm wmt19-de-en/tokenizer_config.json
transformers-cli s3 rm wmt19-de-en/vocab-src.json
transformers-cli s3 rm wmt19-de-en/vocab-tgt.json
transformers-cli s3 rm wmt19-en-de/config.json
transformers-cli s3 rm wmt19-en-de/merges.txt
transformers-cli s3 rm wmt19-en-de/pytorch_model.bin
transformers-cli s3 rm wmt19-en-de/tokenizer_config.json
transformers-cli s3 rm wmt19-en-de/vocab-src.json
transformers-cli s3 rm wmt19-en-de/vocab-tgt.json
transformers-cli s3 rm wmt19-en-ru/config.json
transformers-cli s3 rm wmt19-en-ru/merges.txt
transformers-cli s3 rm wmt19-en-ru/pytorch_model.bin
transformers-cli s3 rm wmt19-en-ru/tokenizer_config.json
transformers-cli s3 rm wmt19-en-ru/vocab-src.json
transformers-cli s3 rm wmt19-en-ru/vocab-tgt.json
transformers-cli s3 rm wmt19-ru-en/config.json
transformers-cli s3 rm wmt19-ru-en/merges.txt
transformers-cli s3 rm wmt19-ru-en/pytorch_model.bin
transformers-cli s3 rm wmt19-ru-en/tokenizer_config.json
transformers-cli s3 rm wmt19-ru-en/vocab-src.json
transformers-cli s3 rm wmt19-ru-en/vocab-tgt.json


@ -1,34 +0,0 @@
#!/usr/bin/env python
# Copyright 2020 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# this script builds a small sample spm file tests/fixtures/test_sentencepiece_no_bos.model, with features needed by pegasus
# 1. pip install sentencepiece
#
# 2. wget https://raw.githubusercontent.com/google/sentencepiece/master/data/botchan.txt
# 3. build
import sentencepiece as spm
# pegasus:
# 1. no bos
# 2. eos_id is 1
# 3. unk_id is 2
# build a sample spm file accordingly
spm.SentencePieceTrainer.train('--input=botchan.txt --model_prefix=test_sentencepiece_no_bos --bos_id=-1 --unk_id=2 --eos_id=1 --vocab_size=1000')
# 4. now update the fixture
# mv test_sentencepiece_no_bos.model ../../tests/fixtures/
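As a quick sanity check on the fixture built above, the following hedged snippet (assuming `sentencepiece` is installed and the training command has produced `test_sentencepiece_no_bos.model` in the working directory) verifies the pegasus-style special-token ids:
```python
import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.load("test_sentencepiece_no_bos.model")

# Expected per the flags above: no bos (-1), eos id 1, unk id 2, vocab size 1000.
print(sp.bos_id(), sp.eos_id(), sp.unk_id(), sp.get_piece_size())  # -1 1 2 1000
```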


@ -1,72 +0,0 @@
<!---
Copyright 2020 The HuggingFace Team. All rights reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
Set up transformers following the instructions in README.md (I would fork first).
```bash
git clone git@github.com:huggingface/transformers.git
cd transformers
pip install -e .
pip install pandas GitPython wget
```
Get required metadata
```bash
curl https://cdn-datasets.huggingface.co/language_codes/language-codes-3b2.csv > language-codes-3b2.csv
curl https://cdn-datasets.huggingface.co/language_codes/iso-639-3.csv > iso-639-3.csv
```
Install Tatoeba-Challenge repo inside transformers
```bash
git clone git@github.com:Helsinki-NLP/Tatoeba-Challenge.git
```
To convert a few models, call the conversion script from command line:
```bash
python src/transformers/models/marian/convert_marian_tatoeba_to_pytorch.py --models heb-eng eng-heb --save_dir converted
```
To convert lots of models you can pass your list of Tatoeba model names to `resolver.convert_models` in a python client or script.
```python
from transformers.convert_marian_tatoeba_to_pytorch import TatoebaConverter
resolver = TatoebaConverter(save_dir='converted')
resolver.convert_models(['heb-eng', 'eng-heb'])
```
### Upload converted models
Since version v3.5.0, the model sharing workflow has switched to a git-based system. Refer to the [model sharing doc](https://huggingface.co/transformers/main/model_sharing.html#model-sharing-and-uploading) for more details.
To upload all converted models:
1. Install [git-lfs](https://git-lfs.github.com/).
2. Login to `huggingface-cli`
```bash
huggingface-cli login
```
3. Run the `upload_models` script
```bash
./scripts/tatoeba/upload_models.sh
```
### Modifications
- To change naming logic, change the code near `os.rename`. The model card creation code may also need to change.
- To change model card content, you must modify `TatoebaCodeResolver.write_model_card`


@ -1,12 +0,0 @@
#!/bin/bash
for FILE in converted/*; do
model_name=`basename $FILE`
huggingface-cli repo create $model_name -y
git clone https://huggingface.co/Helsinki-NLP/$model_name
mv $FILE/* $model_name/
cd $model_name
git add . && git commit -m "initial commit"
git push
cd ..
done


@ -149,7 +149,7 @@ _deps = [
"psutil",
"pyyaml>=5.1",
"pydantic",
"pytest>=7.2.0,<8.0.0",
"pytest>=7.2.0",
"pytest-asyncio",
"pytest-rerunfailures",
"pytest-timeout",
@ -163,6 +163,9 @@ _deps = [
"rjieba",
"rouge-score!=0.0.7,!=0.0.8,!=0.1,!=0.1.1",
"ruff==0.11.2",
# `sacrebleu` not used in `transformers`. However, it is needed in several tests, when a test calls
# `evaluate.load("sacrebleu")`. This metric is used in the examples that we use to test the `Trainer` with, in the
# `Trainer` tests (see references to `run_translation.py`).
"sacrebleu>=1.4.12,<2.0.0",
"sacremoses",
"safetensors>=0.4.3",
@ -186,7 +189,7 @@ _deps = [
"tiktoken",
"timm<=1.0.11",
"tokenizers>=0.21,<0.22",
"torch>=2.1",
"torch>=2.1,<2.7", # Installing torch 2.7 results in slower compiled LLMs. Pinned while we investigate.
"torchaudio",
"torchvision",
"pyctcdecode>=0.4.0",
@ -344,7 +347,6 @@ extras["testing"] = (
"evaluate",
"pytest-timeout",
"ruff",
"sacrebleu",
"rouge-score",
"nltk",
"GitPython",
@ -354,6 +356,7 @@ extras["testing"] = (
"tensorboard",
"pydantic",
"sentencepiece",
"sacrebleu", # needed in trainer tests, see references to `run_translation.py`
)
+ extras["retrieval"]
+ extras["modelcreation"]


@ -259,8 +259,9 @@ _import_structure = {
],
"utils.quantization_config": [
"AqlmConfig",
"AutoRoundConfig",
"AwqConfig",
"BitNetConfig",
"BitNetQuantConfig",
"BitsAndBytesConfig",
"CompressedTensorsConfig",
"EetqConfig",
@ -438,6 +439,7 @@ else:
]
_import_structure["modeling_flash_attention_utils"] = []
_import_structure["modeling_layers"] = ["GradientCheckpointingLayer"]
_import_structure["modeling_outputs"] = []
_import_structure["modeling_rope_utils"] = ["ROPE_INIT_FUNCTIONS", "dynamic_rope_update"]
_import_structure["modeling_utils"] = ["PreTrainedModel", "AttentionInterface"]
@ -753,8 +755,9 @@ if TYPE_CHECKING:
# bitsandbytes config
from .utils.quantization_config import (
AqlmConfig,
AutoRoundConfig,
AwqConfig,
BitNetConfig,
BitNetQuantConfig,
BitsAndBytesConfig,
CompressedTensorsConfig,
EetqConfig,
@ -911,6 +914,7 @@ if TYPE_CHECKING:
from .model_debugging_utils import (
model_addition_debugger_context,
)
from .modeling_layers import GradientCheckpointingLayer
from .modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
from .modeling_utils import AttentionInterface, PreTrainedModel
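For reference, a minimal hedged check that the new exports resolve from the top level (this assumes a build that includes this hunk; optional-backend availability may still gate some symbols):
```python
import transformers

# Renamed from `BitNetConfig` in this diff:
print(transformers.BitNetQuantConfig)
# Newly exported alongside it:
print(transformers.AutoRoundConfig)
print(transformers.GradientCheckpointingLayer)
```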


@ -37,15 +37,15 @@ def load_audio(audio: Union[str, np.ndarray], sampling_rate=16000, timeout=None)
Args:
audio (`str` or `np.ndarray`):
The audio to be laoded to the numpy array format.
The audio to be loaded to the numpy array format.
sampling_rate (`int`, *optional*, defaults to 16000):
The samlping rate to be used when loading the audio. It should be same as the
The sampling rate to be used when loading the audio. It should be same as the
sampling rate the model you will be using further was trained with.
timeout (`float`, *optional*):
The timeout value in seconds for the URL request.
Returns:
`np.ndarray`: A numpy artay representing the audio.
`np.ndarray`: A numpy array representing the audio.
"""
requires_backends(load_audio, ["librosa"])
@ -1146,9 +1146,9 @@ def stft(frames: np.array, windowing_function: np.array, fft_window_size: Option
tutorial]https://download.ni.com/evaluation/pxi/Understanding%20FFTs%20and%20Windowing.pdf
fft_window_size (`int`, *optional*):
Size of the window om which the Fourier transform is applied. This controls the frequency resolution of the
spectrogram. 400 means that the fourrier transform is computed on windows of 400 samples. The number of
spectrogram. 400 means that the fourier transform is computed on windows of 400 samples. The number of
frequency bins (`nb_frequency_bins`) used to divide the window into equal strips is equal to
`(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionnally.
`(1+fft_window_size)//2`. An increase of the fft_window_size slows the calculus time proportionally.
Example:


@ -1697,7 +1697,7 @@ class HybridCache(Cache):
min(config.sliding_window, max_cache_len),
self.head_dim,
)
device = torch.device(device) if device is not None and isinstance(device, str) else None
device = torch.device(device) if device is not None else None
for i in range(config.num_hidden_layers):
if layer_device_map is not None:
layer_device = layer_device_map[i]
@ -1919,7 +1919,7 @@ class HybridChunkedCache(Cache):
full_key_states = torch.cat((k_out[:, :, 1:, :], key_states), dim=-2)
full_value_states = torch.cat((v_out[:, :, 1:, :], value_states), dim=-2)
# Fast decoding path -> here as the effective size is still sliding window, it is extremely important
# to return `self.key_cache[layer_idx]` and `self.value_cache[layer_idx]`, as they have the fixed adress
# to return `self.key_cache[layer_idx]` and `self.value_cache[layer_idx]`, as they have the fixed address
# in memory (the values are the same as the full states, but not the address!!)
if key_states.shape[-2] == 1:
self.key_cache[layer_idx].copy_(full_key_states)
@ -2031,7 +2031,7 @@ class OffloadedHybridCache(HybridChunkedCache):
self.active_device_layer = 0
def initialise_cache_layer(self, layer_idx, key_states):
"""Overriden to use the correct device if offloaded layer (and pin memory)."""
"""Overridden to use the correct device if offloaded layer (and pin memory)."""
if len(self.key_cache) > layer_idx:
return
@ -2243,7 +2243,7 @@ class OffloadedStaticCache(StaticCache):
The device to offload to. Defaults to CPU.
layer_device_map (`Dict[int, Union[str, torch.device, int]]`, *optional*):
Mapping between the layers and its device. This is required when you are manually initializing the cache
and the model is splitted between differents gpus. You can know which layers mapped to which device by
and the model is split between different gpus. You can know which layers mapped to which device by
checking the associated device_map: `model.hf_device_map`.
Example:


@ -843,29 +843,16 @@ class PretrainedConfig(PushToHubMixin):
):
serializable_config_dict[key] = value
self._remove_keys_not_serialized(serializable_config_dict)
if hasattr(self, "quantization_config"):
serializable_config_dict["quantization_config"] = (
self.quantization_config.to_dict()
if not isinstance(self.quantization_config, dict)
else self.quantization_config
)
# Pop the `_pre_quantization_dtype` as torch.dtypes are not serializable.
_ = serializable_config_dict.pop("_pre_quantization_dtype", None)
self.dict_torch_dtype_to_str(serializable_config_dict)
if "_attn_implementation_internal" in serializable_config_dict:
del serializable_config_dict["_attn_implementation_internal"]
# Do not serialize `base_model_tp_plan` for now
if "base_model_tp_plan" in serializable_config_dict:
del serializable_config_dict["base_model_tp_plan"]
# Do not serialize `base_model_pp_plan` for now
if "base_model_pp_plan" in serializable_config_dict:
del serializable_config_dict["base_model_pp_plan"]
if "_name_or_path" in serializable_config_dict:
del serializable_config_dict["_name_or_path"]
return serializable_config_dict
def to_dict(self) -> dict[str, Any]:
@ -878,18 +865,6 @@ class PretrainedConfig(PushToHubMixin):
output = copy.deepcopy(self.__dict__)
if hasattr(self.__class__, "model_type"):
output["model_type"] = self.__class__.model_type
if "_auto_class" in output:
del output["_auto_class"]
if "_commit_hash" in output:
del output["_commit_hash"]
if "_attn_implementation_internal" in output:
del output["_attn_implementation_internal"]
# Do not serialize `base_model_tp_plan` for now
if "base_model_tp_plan" in output:
del output["base_model_tp_plan"]
# Do not serialize `base_model_pp_plan` for now
if "base_model_pp_plan" in output:
del output["base_model_pp_plan"]
# Transformers version when serializing the model
output["transformers_version"] = __version__
@ -902,16 +877,14 @@ class PretrainedConfig(PushToHubMixin):
output[key] = value
self._remove_keys_not_serialized(output)
if hasattr(self, "quantization_config"):
output["quantization_config"] = (
self.quantization_config.to_dict()
if not isinstance(self.quantization_config, dict)
else self.quantization_config
)
# pop the `_pre_quantization_dtype` as torch.dtypes are not serializable.
_ = output.pop("_pre_quantization_dtype", None)
self.dict_torch_dtype_to_str(output)
return output
@ -1011,6 +984,33 @@ class PretrainedConfig(PushToHubMixin):
if isinstance(value, dict):
self.dict_torch_dtype_to_str(value)
def _remove_keys_not_serialized(self, d: dict[str, Any]) -> None:
"""
Checks and removes if there are any keys in the dict that should not be serialized when saving the config.
Runs recursive check on the dict, to remove from all sub configs.
"""
if hasattr(self, "quantization_config"):
# Pop the `_pre_quantization_dtype` as torch.dtypes are not serializable.
_ = d.pop("_pre_quantization_dtype", None)
if "_auto_class" in d:
del d["_auto_class"]
if "_commit_hash" in d:
del d["_commit_hash"]
if "_attn_implementation_internal" in d:
del d["_attn_implementation_internal"]
# Do not serialize `base_model_tp_plan` for now
if "base_model_tp_plan" in d:
del d["base_model_tp_plan"]
# Do not serialize `base_model_pp_plan` for now
if "base_model_pp_plan" in d:
del d["base_model_pp_plan"]
if "_name_or_path" in d:
del d["_name_or_path"]
for value in d.values():
if isinstance(value, dict):
self._remove_keys_not_serialized(value)
@classmethod
def register_for_auto_class(cls, auto_class="AutoConfig"):
"""


@ -24,7 +24,7 @@ from filelock import FileLock
from torch.utils.data import Dataset
from ...tokenization_utils_base import PreTrainedTokenizerBase
from ...utils import logging
from ...utils import check_torch_load_is_safe, logging
from ..processors.glue import glue_convert_examples_to_features, glue_output_modes, glue_processors
from ..processors.utils import InputFeatures
@ -122,6 +122,7 @@ class GlueDataset(Dataset):
with FileLock(lock_path):
if os.path.exists(cached_features_file) and not args.overwrite_cache:
start = time.time()
check_torch_load_is_safe()
self.features = torch.load(cached_features_file, weights_only=True)
logger.info(
f"Loading features from cached file {cached_features_file} [took %.3f s]", time.time() - start


@ -24,7 +24,7 @@ from torch.utils.data import Dataset
from ...models.auto.modeling_auto import MODEL_FOR_QUESTION_ANSWERING_MAPPING
from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging
from ...utils import check_torch_load_is_safe, logging
from ..processors.squad import SquadFeatures, SquadV1Processor, SquadV2Processor, squad_convert_examples_to_features
@ -148,6 +148,7 @@ class SquadDataset(Dataset):
with FileLock(lock_path):
if os.path.exists(cached_features_file) and not args.overwrite_cache:
start = time.time()
check_torch_load_is_safe()
self.old_features = torch.load(cached_features_file, weights_only=True)
# Legacy cache files have only features, while new cache files
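Both hunks above (for `GlueDataset` and `SquadDataset`) apply the same guard before deserializing cached features. A hedged sketch of the pattern in isolation (the exact behavior of `check_torch_load_is_safe` is assumed from its name and usage here):
```python
import os
import time

import torch
from transformers.utils import check_torch_load_is_safe, logging

logger = logging.get_logger(__name__)

def load_cached_features(cached_features_file: str):
    if not os.path.exists(cached_features_file):
        return None
    start = time.time()
    # Assumption: this raises (or warns) when the installed torch cannot deserialize safely.
    check_torch_load_is_safe()
    features = torch.load(cached_features_file, weights_only=True)
    logger.info(f"Loading features from cached file {cached_features_file} [took {time.time() - start:.3f} s]")
    return features
```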


@ -80,7 +80,7 @@ class DebugUnderflowOverflow:
You can see here, that `T5DenseGatedGeluDense.forward` resulted in output activations, whose absolute max value was
around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have `Dropout` which
renormalizes the weights, after it zeroed some of the elements, which pushes the absolute max value to more than
64K, and we get an overlow.
64K, and we get an overflow.
As you can see it's the previous frames that we need to look into when the numbers start going into very large for
fp16 numbers.


@ -55,7 +55,7 @@ deps = {
"psutil": "psutil",
"pyyaml": "pyyaml>=5.1",
"pydantic": "pydantic",
"pytest": "pytest>=7.2.0,<8.0.0",
"pytest": "pytest>=7.2.0",
"pytest-asyncio": "pytest-asyncio",
"pytest-rerunfailures": "pytest-rerunfailures",
"pytest-timeout": "pytest-timeout",
@ -91,7 +91,7 @@ deps = {
"tiktoken": "tiktoken",
"timm": "timm<=1.0.11",
"tokenizers": "tokenizers>=0.21,<0.22",
"torch": "torch>=2.1",
"torch": "torch>=2.1,<2.7",
"torchaudio": "torchaudio",
"torchvision": "torchvision",
"pyctcdecode": "pyctcdecode>=0.4.0",


@ -850,7 +850,7 @@ class ConstrainedBeamSearchScorer(BeamScorer):
beam_hyp.add(final_tokens, final_score, beam_indices=beam_index, generated_len=generated_len)
ids_collect.append(beam_id)
# due to overly complex constraints or other factors, sometimes we can't gaurantee a successful
# due to overly complex constraints or other factors, sometimes we can't guarantee a successful
# generation. In these cases we simply return the highest scoring outputs.
if len(ids_collect) < self.num_beam_hyps_to_keep:
for beam_id in range(self.num_beams):


@ -192,7 +192,7 @@ class GenerationConfig(PushToHubMixin):
our [cache documentation](https://huggingface.co/docs/transformers/en/kv_cache) for further information.
cache_config (`CacheConfig` or `dict`, *optional*, default to `None`):
Arguments used in the key-value cache class can be passed in `cache_config`. Can be passed as a `Dict` and
it will be converted to its repsective `CacheConfig` internally.
it will be converted to its respective `CacheConfig` internally.
Otherwise can be passed as a `CacheConfig` class matching the indicated `cache_implementation`.
return_legacy_cache (`bool`, *optional*, default to `True`):
Whether to return the legacy or new format of the cache when `DynamicCache` is used by default.
@ -235,7 +235,7 @@ class GenerationConfig(PushToHubMixin):
The parameter for repetition penalty. 1.0 means no penalty. See [this
paper](https://arxiv.org/pdf/1909.05858.pdf) for more details.
encoder_repetition_penalty (`float`, *optional*, defaults to 1.0):
The paramater for encoder_repetition_penalty. An exponential penalty on sequences that are not in the
The parameter for encoder_repetition_penalty. An exponential penalty on sequences that are not in the
original input. 1.0 means no penalty.
length_penalty (`float`, *optional*, defaults to 1.0):
Exponential penalty to the length that is used with beam-based generation. It is applied as an exponent to
@ -381,10 +381,12 @@ class GenerationConfig(PushToHubMixin):
> Parameters related to performances and compilation
compile_config (CompileConfig, *optional*):
If using a static cache, this controls how `generate` will `compile` the forward pass for performance
gains.
disable_compile (`bool`, *optional*): Whether to disable the automatic compilation of the forward pass. Automatic compilation happens when specific criteria are met, including using a compileable cache. Please open an issue if you find the need to use this flag.
If using a compilable cache, this controls how `generate` will `compile` the forward pass for faster
inference.
disable_compile (`bool`, *optional*):
Whether to disable the automatic compilation of the forward pass. Automatic compilation happens when
specific criteria are met, including using a compilable cache. Please open an issue if you find the
need to use this flag.
> Wild card
@ -489,7 +491,7 @@ class GenerationConfig(PushToHubMixin):
self.target_lookbehind = kwargs.pop("target_lookbehind", 10)
# Performance
self.compile_config = kwargs.pop("compile_config", CompileConfig())
self.compile_config = kwargs.pop("compile_config", None)
self.disable_compile = kwargs.pop("disable_compile", False)
# Wild card
self.generation_kwargs = kwargs.pop("generation_kwargs", {})
@ -708,7 +710,7 @@ class GenerationConfig(PushToHubMixin):
UserWarning,
)
# 3. detect incorrect paramaterization specific to advanced beam modes
# 3. detect incorrect parameterization specific to advanced beam modes
else:
# constrained beam search
if self.constraints is not None or self.force_words_ids is not None:
@ -811,9 +813,10 @@ class GenerationConfig(PushToHubMixin):
self.watermarking_config.validate()
# 7. performances arguments
if not isinstance(self.compile_config, CompileConfig):
if self.compile_config is not None and not isinstance(self.compile_config, CompileConfig):
raise ValueError(
f"You provided `compile_config` as an instance of {type(self.compile_config)}, but it must be an instance of `CompileConfig`."
f"You provided `compile_config` as an instance of {type(self.compile_config)}, but it must be an "
"instance of `CompileConfig`."
)
# 8. other incorrect combinations
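A hedged usage sketch of the behavior changed above: `compile_config` now defaults to `None`, and an explicit `CompileConfig` is only meaningful together with a compilable cache. The import path for `CompileConfig`, its `fullgraph` field, and the checkpoint name are illustrative assumptions:
```python
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
from transformers.generation.configuration_utils import CompileConfig  # assumed import path

model_id = "Qwen/Qwen2.5-0.5B"  # illustrative small checkpoint whose architecture supports a static cache
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

generation_config = GenerationConfig(
    max_new_tokens=20,
    cache_implementation="static",                  # a compilable cache is one of the compile criteria
    compile_config=CompileConfig(fullgraph=True),   # assumed field name
)

inputs = tokenizer("A sequence: 1, 2, 3", return_tensors="pt")
outputs = model.generate(**inputs, generation_config=generation_config)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```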


@ -271,7 +271,7 @@ class FlaxMinLengthLogitsProcessor(FlaxLogitsProcessor):
class FlaxSuppressTokensAtBeginLogitsProcessor(FlaxLogitsProcessor):
r"""
[`FlaxLogitsProcessor`] supressing a list of tokens as soon as the `generate` function starts generating using
[`FlaxLogitsProcessor`] suppressing a list of tokens as soon as the `generate` function starts generating using
`begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` are not sampled at the
beginning of the generation.


@ -292,7 +292,8 @@ class TemperatureLogitsWarper(LogitsProcessor):
class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
r"""
[`LogitsProcessor`] that prevents the repetition of previous tokens through a penalty. This penalty is applied at
most once per token. Note that, for decoder-only models like most LLMs, the considered tokens include the prompt.
most once per token. Note that, for decoder-only models like most LLMs, the considered tokens include the prompt
by default.
In the original [paper](https://arxiv.org/pdf/1909.05858.pdf), the authors suggest the use of a penalty of around
1.2 to achieve a good balance between truthful generation and lack of repetition. To penalize and reduce
@ -303,11 +304,13 @@ class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
penalty (`float`):
The parameter for repetition penalty. 1.0 means no penalty. Above 1.0 penalizes previously generated
tokens. Between 0.0 and 1.0 rewards previously generated tokens.
prompt_ignore_length (`int`, *optional*):
The original input ids sequence length, which if provided, will not be used in the penalty calculation.
Examples:
```py
>>> from transformers import AutoTokenizer, AutoModelForCausalLM
>>> from transformers import AutoTokenizer, AutoModelForCausalLM, RepetitionPenaltyLogitsProcessor
>>> # Initializing the model and tokenizer for it
>>> model = AutoModelForCausalLM.from_pretrained("distilbert/distilgpt2")
@ -323,17 +326,36 @@ class RepetitionPenaltyLogitsProcessor(LogitsProcessor):
>>> penalized_ids = model.generate(**inputs, repetition_penalty=1.1)
>>> print(tokenizer.batch_decode(penalized_ids, skip_special_tokens=True)[0])
I'm not going to be able to do that. I'll just have to go out and play
>>> # We can also exclude the input prompt by creating an instance of this class
>>> # with a `prompt_ignore_length` and passing it as a custom logit processor
>>> rep_pen_processor = RepetitionPenaltyLogitsProcessor(
... penalty=1.1,
... prompt_ignore_length=inputs["input_ids"].shape[-1]
... )
>>> penalized_ids = model.generate(**inputs, logits_processor=[rep_pen_processor])
>>> print(tokenizer.batch_decode(penalized_ids, skip_special_tokens=True)[0])
I'm not going to be able to do that. I'm going to have to go through a lot of things, and
```
"""
def __init__(self, penalty: float):
def __init__(self, penalty: float, prompt_ignore_length: Optional[int] = None):
if not isinstance(penalty, float) or not (penalty > 0):
raise ValueError(f"`penalty` has to be a strictly positive float, but is {penalty}")
if prompt_ignore_length is not None and (
not isinstance(prompt_ignore_length, int) or prompt_ignore_length < 0
):
raise ValueError(f"`prompt_ignore_length` has to be a positive integer, but is {prompt_ignore_length}")
self.penalty = penalty
self.prompt_ignore_length = prompt_ignore_length
@add_start_docstrings(LOGITS_PROCESSOR_INPUTS_DOCSTRING)
def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
if self.prompt_ignore_length:
input_ids = input_ids[:, self.prompt_ignore_length :]
score = torch.gather(scores, 1, input_ids)
# if score < 0 then repetition penalty has to be multiplied to reduce the token probabilities
@ -521,7 +543,7 @@ class TopKLogitsWarper(LogitsProcessor):
class MinPLogitsWarper(LogitsProcessor):
"""
[`LogitsProcessor`] that performs min-p, i.e. keeps all tokens that are above a minimum probability, scaled by the
probability of the most likely token. As a result, the filter becomes more agressive in the presence of
probability of the most likely token. As a result, the filter becomes more aggressive in the presence of
high-probability tokens, which is a sign of a confident output that we shouldn't deviate from.
Often used together with [`TemperatureLogitsWarper`]. Used as an alternative to [`TopPLogitsWarper`] and
@ -716,7 +738,7 @@ class EpsilonLogitsWarper(LogitsProcessor):
>>> # With epsilon sampling, the output gets restricted to high-probability tokens. Note that this is similar to
>>> # Top P sampling, which restricts tokens based on their cumulative probability.
>>> # Pro tip: The paper recomends using `epsilon_cutoff` values between 3e-4 and 9e-4
>>> # Pro tip: The paper recommends using `epsilon_cutoff` values between 3e-4 and 9e-4
>>> outputs = model.generate(**inputs, do_sample=True, epsilon_cutoff=0.1)
>>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
A sequence: 1, 2, 3, 4, 5, 6, 7, 8, 9
@ -797,7 +819,7 @@ class EtaLogitsWarper(LogitsProcessor):
>>> # With eta sampling, the output gets restricted to high-probability tokens. You can see it as a dynamic form of
>>> # epsilon sampling that adapts its cutoff probability based on the entropy (high entropy = lower cutoff).
>>> # Pro tip: The paper recomends using `eta_cutoff` values between 3e-4 to 4e-3
>>> # Pro tip: The paper recommends using `eta_cutoff` values between 3e-4 to 4e-3
>>> outputs = model.generate(**inputs, do_sample=True, eta_cutoff=0.1)
>>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
A sequence: 1, 2, 3, 4, 5, 6, 7, 8, 9
@ -1326,7 +1348,7 @@ class PrefixConstrainedLogitsProcessor(LogitsProcessor):
>>> print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
Alice and Bob are friends
>>> # We can contrain it with `prefix_allowed_tokens_fn` to force a certain behavior based on a prefix.
>>> # We can constrain it with `prefix_allowed_tokens_fn` to force a certain behavior based on a prefix.
>>> # For instance, we can force an entire entity to be generated when its beginning is detected.
>>> entity = tokenizer(" Bob Marley", return_tensors="pt").input_ids[0] # 3 tokens
>>> def prefix_allowed_tokens_fn(batch_id, input_ids):
@ -1769,7 +1791,7 @@ class LogitNormalization(LogitsProcessor):
class SuppressTokensAtBeginLogitsProcessor(LogitsProcessor):
r"""
[`SuppressTokensAtBeginLogitsProcessor`] supresses a list of tokens as soon as the `generate` function starts
[`SuppressTokensAtBeginLogitsProcessor`] suppresses a list of tokens as soon as the `generate` function starts
generating using `begin_index` tokens. This should ensure that the tokens defined by `begin_suppress_tokens` are
not generated at the beginning. Originally created for
[Whisper](https://huggingface.co/docs/transformers/model_doc/whisper).
@ -2620,7 +2642,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
We assume that the scores are in the log space.
Args:
scores (`torch.FloatTensor`): Scores (batch_size, vocab_size).
g_values (`torch.FloatTensor`): G valus (batch_size, vocab_size, depth).
g_values (`torch.FloatTensor`): G values (batch_size, vocab_size, depth).
Returns:
Updated scores (batch_size, vocab_size).
@ -2646,7 +2668,7 @@ class SynthIDTextWatermarkLogitsProcessor(LogitsProcessor):
if self.debug_mode:
scores = torch.ones_like(scores)
# Currently indices is just a arange to compute watermarking on the desnse logits.
# Currently indices is just a arange to compute watermarking on the dense logits.
all_indices = torch.stack([torch.arange(vocab_size, device=self.device) for _ in range(batch_size)])
if self.state is None:


@ -162,7 +162,7 @@ class TextStreamer(BaseStreamer):
class TextIteratorStreamer(TextStreamer):
"""
Streamer that stores print-ready text in a queue, to be used by a downstream application as an iterator. This is
useful for applications that benefit from acessing the generated text in a non-blocking way (e.g. in an interactive
useful for applications that benefit from accessing the generated text in a non-blocking way (e.g. in an interactive
Gradio demo).
<Tip warning={true}>
@ -233,7 +233,7 @@ class TextIteratorStreamer(TextStreamer):
class AsyncTextIteratorStreamer(TextStreamer):
"""
Streamer that stores print-ready text in a queue, to be used by a downstream application as an async iterator.
This is useful for applications that benefit from acessing the generated text asynchronously (e.g. in an
This is useful for applications that benefit from accessing the generated text asynchronously (e.g. in an
interactive Gradio demo).
<Tip warning={true}>


@ -343,7 +343,7 @@ class TFNoBadWordsLogitsProcessor(TFLogitsProcessor):
)
def _match_found():
# Finaly, runs the actual comparison. Can only be called if the previous comparisons do not yield
# Finally, runs the actual comparison. Can only be called if the previous comparisons do not yield
# an answer (otherwise we get indexing exceptions)
compare_len = self.bad_word_seqs_len[bad_word_seq_number] - 1
return tf.cond(


@ -962,7 +962,7 @@ class TFGenerationMixin:
raise ValueError(
"Beam search decoding cannot return more sequences than it has beams. Please set num_beams >="
f" num_return_sequences, got {generation_config.num_beams} and"
f" {generation_config.num_return_sequences} (respectivelly)"
f" {generation_config.num_return_sequences} (respectively)"
)
# 11. broadcast inputs to the desired number of beams
@ -994,7 +994,7 @@ class TFGenerationMixin:
raise ValueError(
"Beam search decoding cannot return more sequences than it has beams. Please set num_beams >="
f" num_return_sequences, got {generation_config.num_beams} and"
f" {generation_config.num_return_sequences} (respectivelly)"
f" {generation_config.num_return_sequences} (respectively)"
)
# 11. prepare logits warper
@ -1626,7 +1626,7 @@ class TFGenerationMixin:
)
use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache)
use_xla = not tf.executing_eagerly()
# TODO (Joao): fix cache format or find programatic way to detect cache index
# TODO (Joao): fix cache format or find programmatic way to detect cache index
# GPT2 and other models has a slightly different cache structure, with a different batch axis
model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
@ -1910,7 +1910,7 @@ class TFGenerationMixin:
)
use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache)
use_xla = not tf.executing_eagerly()
# TODO (Joao): fix cache format or find programatic way to detect cache index
# TODO (Joao): fix cache format or find programmatic way to detect cache index
# GPT2 and other models has a slightly different cache structure, with a different batch axis
model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
@ -2082,7 +2082,7 @@ class TFGenerationMixin:
def gather_fn(tensor):
if batch_axis > 0:
# pushes all dimentions before the batch to the end, so we get (batch, beam_id, ...)
# pushes all dimensions before the batch to the end, so we get (batch, beam_id, ...)
perm = tf.concat((tf.range(tf.rank(tensor))[batch_axis:], tf.range(batch_axis)), axis=0)
tensor = tf.transpose(tensor, perm=perm)
@ -2253,7 +2253,7 @@ class TFGenerationMixin:
use_cache = model_kwargs.pop("use_cache", self.generation_config.use_cache)
use_xla = not tf.executing_eagerly()
# TODO (Joao): fix cache format or find programatic way to detect cache index
# TODO (Joao): fix cache format or find programmatic way to detect cache index
# GPT2 and other models has a slightly different cache structure, with a different batch axis
model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0
@ -2788,7 +2788,7 @@ class TFGenerationMixin:
model_kwargs.pop("use_cache", None)
use_xla = not tf.executing_eagerly()
# TODO (Joao): fix cache format or find programatic way to detect cache index
# TODO (Joao): fix cache format or find programmatic way to detect cache index
# GPT2 and other models has a slightly different cache structure, with a different batch axis
model_name = str(self.decoder) if "EncoderDecoder" in str(self) else str(self)
cache_batch_axis = 1 if any(model_prefix in model_name for model_prefix in ("TFGPT2", "TFCTRL")) else 0


@ -362,7 +362,7 @@ class GenerationMixin:
inherit from `GenerationMixin` to benefit from all generation-related automation in our codebase;
- `BarkModel` has a custom `generate` method and one of its inner models calls `GenerationMixin.generate`.
However, its `generate` does not share the same interface as `GenerationMixin.generate`. In this case,
`BarkModel` shoud NOT inherit from `GenerationMixin`, as it breaks the `generate` interface.
`BarkModel` should NOT inherit from `GenerationMixin`, as it breaks the `generate` interface.
The class exposes [`~generation.GenerationMixin.generate`], which can be used for:
- *greedy decoding* if `num_beams=1` and `do_sample=False`
@ -392,7 +392,7 @@ class GenerationMixin:
- Exception 1: when passing input_embeds, input_ids may be missing entries
- Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
- Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
- Excpetion 4: If input_embeds are passed then slice it through `cache_position`, to keep only the unprocessed tokens and
- Exception 4: If input_embeds are passed then slice it through `cache_position`, to keep only the unprocessed tokens and
generate the first token for each sequence. Later use the generated Input ids for continuation.
The current implementation does not rely on ``self`` and could be
@ -557,23 +557,21 @@ class GenerationMixin:
if isinstance(past_key_values, StaticCache) and attention_mask.ndim == 2:
if model_inputs["inputs_embeds"] is not None:
batch_size, sequence_length, _ = model_inputs["inputs_embeds"].shape
device = model_inputs["inputs_embeds"].device
else:
batch_size, sequence_length = model_inputs[input_ids_key].shape
device = model_inputs[input_ids_key].device
# Create the causal mask with fixed shape in advance, to reduce recompilations. If the function to create
# the 4D causal mask exists, it should be present in the base model (XXXModel class).
base_model = getattr(self, self.base_model_prefix, None)
if base_model is None:
# the 4D causal mask exists, it should be present in the base model (XXXModel class) or in its decoder.
base_model = getattr(self, self.base_model_prefix, self)
decoder = base_model.get_decoder() if hasattr(base_model, "get_decoder") else None
causal_mask_creation_function = getattr(
base_model, "_prepare_4d_causal_attention_mask_with_cache_position", None
)
if causal_mask_creation_function is None and decoder is not None: # it may be in the decoder
causal_mask_creation_function = getattr(
self, "_prepare_4d_causal_attention_mask_with_cache_position", None
decoder, "_prepare_4d_causal_attention_mask_with_cache_position", None
)
else:
causal_mask_creation_function = getattr(
base_model, "_prepare_4d_causal_attention_mask_with_cache_position", None
)
if causal_mask_creation_function is None:
if causal_mask_creation_function is None: # can't be found
logger.warning_once(
f"{self.__class__.__name__} has no `_prepare_4d_causal_attention_mask_with_cache_position` method "
"defined in its base modeling class. Compiled forward passes will be sub-optimal. If you're "
@ -586,7 +584,6 @@ class GenerationMixin:
sequence_length=sequence_length,
target_length=past_key_values.get_max_cache_shape(),
dtype=self.dtype,
device=device,
cache_position=cache_position,
batch_size=batch_size,
config=self.config,
@ -970,7 +967,7 @@ class GenerationMixin:
assistant_model=assistant_model,
assistant_prune_lm_head=True, # prune LM head of assistant model
)
# Since we prune the LM head, we cannot use the repetition penalty on the assistant model due to mismaches between token ids and logits index
# Since we prune the LM head, we cannot use the repetition penalty on the assistant model due to mismatches between token ids and logits index
assistant_model.generation_config.repetition_penalty = None
candidate_generator = UniversalSpeculativeDecodingGenerator(
input_ids=input_ids,
@ -1288,7 +1285,7 @@ class GenerationMixin:
Merge user-defined processors/criteria with the ones instantiated inside `generate`. In case the same
processor/criteria is present on both lists, use the user-defined one.
(Note: up to v4.49.0, this funtion threw an exception is the same logit processor was found twice.)
(Note: up to v4.49.0, this function threw an exception if the same logit processor was found twice.)
"""
if len(custom_list) == 0:
return default_list
@ -2097,6 +2094,47 @@ class GenerationMixin:
generation_config._pad_token_tensor = pad_token_tensor
generation_config._decoder_start_token_tensor = decoder_start_token_tensor
def _valid_auto_compile_criteria(self, model_kwargs: Dict, generation_config: GenerationConfig) -> bool:
"""
Determines whether to trigger auto-compilation of the model's forward pass at generation time.
"""
# Override: honor `disable_compile` flag
if generation_config.disable_compile:
return False
# Base logic
valid_hardware = self.device.type == "cuda" or bool(
generation_config.compile_config is not None and generation_config.compile_config._compile_all_devices
)
using_compilable_cache = (
isinstance(model_kwargs.get("past_key_values"), Cache) and model_kwargs["past_key_values"].is_compileable
)
can_compile = valid_hardware and using_compilable_cache and self._supports_static_cache
# Exception 1: Some quantization methods do not support compilation
if getattr(self, "hf_quantizer", None) is not None:
can_compile &= self.hf_quantizer.is_compileable
if hasattr(self, "hf_device_map"):
all_model_devices = set(self.hf_device_map.values())
# Exception 2: Don't compile if the model is using CPU offload (as of April 2025, this results in a crash)
has_cpu_offload = "cpu" in all_model_devices and len(all_model_devices) > 1
can_compile &= not has_cpu_offload
# Exception 3: Disk offload is not supported for compilation
has_disk_offload = "disk" in all_model_devices
can_compile &= not has_disk_offload
# Finally: if the user has manually specified compilation options, but compilation is not possible, let's warn
# them
if generation_config.compile_config is not None and not can_compile:
logger.warning_once(
"You have set `compile_config`, but we are unable to meet the criteria for compilation. Compilation "
"will be skipped."
)
return can_compile
@torch.no_grad()
def generate(
self,
@ -3389,16 +3427,10 @@ class GenerationMixin:
model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
model_forward = self.__call__
if isinstance(model_kwargs.get("past_key_values"), Cache):
is_compileable = model_kwargs["past_key_values"].is_compileable and self._supports_static_cache
if getattr(self, "hf_quantizer", None) is not None:
is_compileable &= self.hf_quantizer.is_compileable
is_compileable = is_compileable and not generation_config.disable_compile
if is_compileable and (
self.device.type == "cuda" or generation_config.compile_config._compile_all_devices
):
os.environ["TOKENIZERS_PARALLELISM"] = "0"
model_forward = self.get_compiled_call(generation_config.compile_config)
compile_forward = self._valid_auto_compile_criteria(model_kwargs, generation_config)
if compile_forward:
os.environ["TOKENIZERS_PARALLELISM"] = "0"
model_forward = self.get_compiled_call(generation_config.compile_config)
if generation_config.prefill_chunk_size is not None:
model_kwargs = self._prefill_chunking(input_ids, generation_config, **model_kwargs)
@ -3820,7 +3852,7 @@ class GenerationMixin:
model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
# (joao) feature lost in the refactor. Probably won't implement, hurts readbility with minimal gains (there
# (joao) feature lost in the refactor. Probably won't implement, hurts readability with minimal gains (there
# are newer low-memory alternatives like the offloaded cache)
sequential = generation_config.low_memory
if sequential:
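For reference, a hedged sketch of how the new `_valid_auto_compile_criteria` path is exercised from user code: automatic compilation can now be vetoed explicitly via `disable_compile`, even when a compilable cache would otherwise trigger it (the checkpoint name is an illustrative choice; any model whose architecture supports a static cache works):
```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen2.5-0.5B"  # illustrative small checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
inputs = tokenizer("A sequence: 1, 2, 3", return_tensors="pt")

# `disable_compile=True` makes `_valid_auto_compile_criteria` return False immediately,
# so the eager forward pass is used even though the static cache is compileable.
outputs = model.generate(
    **inputs,
    cache_implementation="static",
    disable_compile=True,
    max_new_tokens=10,
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```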


@ -538,7 +538,7 @@ class SynthIDTextWatermarkDetector:
context_repetition_mask = self.logits_processor.compute_context_repetition_mask(
input_ids=tokenized_outputs,
)
# context repitition mask shape [batch_size, output_len - (ngram_len - 1)]
# context repetition mask shape [batch_size, output_len - (ngram_len - 1)]
combined_mask = context_repetition_mask * eos_token_mask


@ -171,7 +171,7 @@ def find_tied_parameters(model: "nn.Module", **kwargs):
```
"""
# get ALL model parameters and thier names
# get ALL model parameters and their names
all_named_parameters = dict(model.named_parameters(remove_duplicate=False))
# get ONLY unique named parameters,
@ -187,7 +187,7 @@ def find_tied_parameters(model: "nn.Module", **kwargs):
for tied_param_name in tied_param_names:
tied_param = all_named_parameters[tied_param_name]
for param_name, param in no_duplicate_named_parameters.items():
# compare if parameters are the same, if so, group thier names together
# compare if parameters are the same, if so, group their names together
if param is tied_param:
if param_name not in tied_param_groups:
tied_param_groups[param_name] = []

View File

@ -30,7 +30,7 @@ def replace_with_aqlm_linear(
"""
Public method that recursively replaces the Linear layers of the given model with AQLM quantized layers.
`accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the
conversion has been successfull or not.
conversion has been successful or not.
Args:
model (`torch.nn.Module`):


@ -101,7 +101,7 @@ def replace_with_awq_linear(
"""
Public method that recursively replaces the Linear layers of the given model with AWQ quantized layers.
`accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the
conversion has been successfull or not.
conversion has been successful or not.
During the module replacement, we also infer the backend to use through the `quantization_config` object.


@ -190,6 +190,98 @@ class BitLinear(nn.Module):
return y
class WeightQuant(torch.autograd.Function):
"""
Implements a custom autograd function for weight quantization.
This performs ternary quantization (-1, 0, 1) based on scaling by the
mean absolute value of the weights. It uses the Straight-Through Estimator
(STE) for the backward pass.
"""
@staticmethod
@torch.compile
def forward(ctx, weight):
dtype = weight.dtype
weight = weight.float()
scale = 1.0 / weight.abs().mean().clamp_(min=1e-5)
weight = (weight * scale).round().clamp(-1, 1) / scale
return weight.to(dtype)
@staticmethod
def backward(ctx, grad_output):
grad_input = grad_output.clone()
return grad_input
class ActQuant(torch.autograd.Function):
"""
Implements a custom autograd function for activation quantization.
This performs symmetric 8-bit quantization (to the range [-128, 127])
based on the maximum absolute value along the last dimension (per-token/row scaling).
It uses the Straight-Through Estimator (STE) for the backward pass.
"""
@staticmethod
@torch.compile
def forward(ctx, activation):
dtype = activation.dtype
activation = activation.float()
scale = 127 / activation.abs().max(dim=-1, keepdim=True).values.clamp_(min=1e-5)
activation = (activation * scale).round().clamp(-128, 127) / scale
return activation.to(dtype)
@staticmethod
def backward(ctx, grad_output):
grad_input = grad_output.clone()
return grad_input
class AutoBitLinear(nn.Linear):
def __init__(
self,
in_features: int,
out_features: int,
bias: bool = True,
device=None,
dtype=None,
online_quant: bool = False,
):
super().__init__(in_features, out_features, bias)
self.online_quant = online_quant
if not online_quant:
self.register_buffer(
"weight_scale",
torch.ones(
(1),
dtype=dtype,
device=device,
),
)
self._register_load_state_dict_pre_hook(self.load_hook)
def load_hook(
self,
state_dict,
prefix,
*args,
**kwargs,
):
if (prefix + "weight") in state_dict and state_dict[prefix + "weight"].dtype != self.weight.dtype:
state_dict[prefix + "weight"] = unpack_weights(state_dict[prefix + "weight"], dtype=self.weight.dtype)
return state_dict
def forward(self, input):
if self.online_quant:
weight = WeightQuant.apply(self.weight)
else:
weight = self.weight
input = ActQuant.apply(input)
output = F.linear(input, weight, self.bias)
if not self.online_quant:
output = output * self.weight_scale
return output
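A self-contained sketch (plain PyTorch, no transformers imports) of the scaling math that `WeightQuant` and `ActQuant` above implement: ternary weights scaled by the mean absolute value, and per-row symmetric int8 activations:
```python
import torch
import torch.nn.functional as F

def ternary_quant(weight: torch.Tensor) -> torch.Tensor:
    # Scale by the mean absolute value, round to {-1, 0, 1}, then rescale back.
    scale = 1.0 / weight.abs().mean().clamp(min=1e-5)
    return (weight * scale).round().clamp(-1, 1) / scale

def int8_act_quant(activation: torch.Tensor) -> torch.Tensor:
    # Per-row symmetric quantization to [-128, 127], then immediate dequantization.
    scale = 127 / activation.abs().max(dim=-1, keepdim=True).values.clamp(min=1e-5)
    return (activation * scale).round().clamp(-128, 127) / scale

w = torch.randn(4, 8)
x = torch.randn(2, 8)
y = F.linear(int8_act_quant(x), ternary_quant(w))
print(y.shape)  # torch.Size([2, 4])
```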
def _replace_with_bitnet_linear(
model,
modules_to_not_convert=None,
@ -201,7 +293,7 @@ def _replace_with_bitnet_linear(
"""
Private method that wraps the recursion for module replacement.
Returns the converted model and a boolean that indicates if the conversion has been successfull or not.
Returns the converted model and a boolean that indicates if the conversion has been successful or not.
"""
if current_key_name is None:
@ -218,15 +310,27 @@ def _replace_with_bitnet_linear(
if isinstance(module, nn.Linear) and name not in modules_to_not_convert:
in_features = module.in_features
out_features = module.out_features
model._modules[name] = BitLinear(
in_features=in_features,
out_features=out_features,
bias=module.bias is not None,
device=module.weight.device,
dtype=module.weight.dtype,
)
if quantization_config and quantization_config.linear_class == "autobitlinear":
model._modules[name] = AutoBitLinear(
in_features=in_features,
out_features=out_features,
bias=module.bias is not None,
device=module.weight.device,
dtype=module.weight.dtype,
online_quant=(quantization_config.quantization_mode == "online"),
)
if quantization_config.quantization_mode == "offline":
model._modules[name].requires_grad_(False)
else:
model._modules[name] = BitLinear(
in_features=in_features,
out_features=out_features,
bias=module.bias is not None,
device=module.weight.device,
dtype=module.weight.dtype,
)
model._modules[name].requires_grad_(False)
has_been_replaced = True
model._modules[name].requires_grad_(False)
if len(list(module.children())) > 0:
_, has_been_replaced = _replace_with_bitnet_linear(

View File

@ -158,7 +158,7 @@ def _replace_with_bnb_linear(
"""
Private method that wraps the recursion for module replacement.
Returns the converted model and a boolean that indicates if the conversion has been successfull or not.
Returns the converted model and a boolean that indicates if the conversion has been successful or not.
"""
for name, module in model.named_children():
if current_key_name is None:
@ -280,7 +280,7 @@ def replace_8bit_linear(*args, **kwargs):
return replace_with_bnb_linear(*args, **kwargs)
# For backward compatiblity
# For backward compatibility
def set_module_8bit_tensor_to_device(*args, **kwargs):
warnings.warn(
"`set_module_8bit_tensor_to_device` will be deprecated in a future version, please use `set_module_quantized_tensor_to_device` instead",
@ -403,7 +403,7 @@ def _dequantize_and_replace(
some performance drop compared to the original model before quantization - use it only for specific usecases
such as QLoRA adapters merging.
Returns the converted model and a boolean that indicates if the conversion has been successfull or not.
Returns the converted model and a boolean that indicates if the conversion has been successful or not.
"""
quant_method = quantization_config.quantization_method()

View File

@ -36,7 +36,7 @@ def _replace_with_eetq_linear(
"""
Private method that wraps the recursion for module replacement.
Returns the converted model and a boolean that indicates if the conversion has been successfull or not.
Returns the converted model and a boolean that indicates if the conversion has been successful or not.
"""
if current_key_name is None:
current_key_name = []

View File

@ -20,15 +20,207 @@ from ..utils.import_utils import is_torch_available
if is_torch_available():
from transformers import PreTrainedModel, StaticCache
from transformers import HybridCache, PreTrainedModel, StaticCache
from transformers.pytorch_utils import is_torch_greater_or_equal, is_torch_greater_or_equal_than_2_3
class TorchExportableModuleForDecoderOnlyLM(torch.nn.Module):
"""
A recipe module designed to make a `PreTrainedModel` exportable with `torch.export`,
specifically for decoder-only LM with cache. This module ensures that the
exported model is compatible with further lowering and execution in `ExecuTorch`.
"""
def __init__(
self,
model: PreTrainedModel,
max_batch_size: int = 1,
max_cache_len: int = 4096,
):
"""
Initializes the exportable module, selecting a static- or hybrid-cache recipe based on the model's `cache_implementation`.
Args:
model (`PreTrainedModel`): The pretrained model to wrap.
max_batch_size (int): Maximum batch size for the cache.
max_cache_len (int): Maximum sequence length for the cache.
Raises:
ValueError: If the model is configured with an unsupported cache implementation.
"""
super().__init__()
if model.config.cache_implementation == "static":
self.model = TorchExportableModuleWithStaticCache(model)
elif model.config.cache_implementation == "hybrid":
self.model = TorchExportableModuleWithHybridCache(model, max_batch_size, max_cache_len)
else:
raise ValueError(
f"Unsupported cache implementation in this export recipe: '{model.config.cache_implementation}'"
)
def forward(
self,
input_ids: torch.Tensor,
cache_position: torch.Tensor,
) -> torch.Tensor:
"""
Forward pass of the module, which is compatible with the ExecuTorch llm runner.
Args:
input_ids (`torch.Tensor`): Tensor representing current input token id to the module.
cache_position (`torch.Tensor`): Tensor representing current input position in the cache.
Returns:
torch.Tensor: Logits output from the model.
"""
return self.model.forward(input_ids, cache_position)
def export(
self,
input_ids: Optional[torch.Tensor] = None,
cache_position: Optional[torch.Tensor] = None,
dynamic_shapes: Optional[dict] = None,
strict: Optional[bool] = None,
) -> torch.export.ExportedProgram:
"""
Export the wrapped module using `torch.export`.
Args:
input_ids (`Optional[torch.Tensor]`):
Tensor representing current input token id to the module. If not provided, a default tensor will be used.
cache_position (`Optional[torch.Tensor]`):
Tensor representing current input position in the cache. If not provided, a default tensor will be used.
dynamic_shapes (`Optional[dict]`):
Dynamic shapes to use for export if specified.
strict (`Optional[bool]`):
Flag to instruct `torch.export` to use `torchdynamo`.
"""
example_input_ids = input_ids if input_ids is not None else torch.tensor([[1]], dtype=torch.long)
example_cache_position = cache_position if cache_position is not None else torch.tensor([0], dtype=torch.long)
return torch.export.export(
self.model,
args=(example_input_ids, example_cache_position),
kwargs={},
dynamic_shapes=dynamic_shapes,
strict=strict if strict is not None else True,
)
@staticmethod
def generate(
exported_program: torch.export.ExportedProgram,
tokenizer,
prompt: str,
max_new_tokens: int = 20,
do_sample: bool = False,
temperature: float = 1.0,
top_k: int = 50,
top_p: float = 1.0,
device: str = "cpu",
) -> str:
"""
Generate a sequence of tokens using an exported program.
Args:
exported_program (`torch.export.ExportedProgram`): The exported model being used for generate.
tokenizer: The tokenizer to use.
prompt (str): The input prompt.
max_new_tokens (int): Maximum number of new tokens to generate.
do_sample (bool): Whether to use sampling or greedy decoding.
temperature (float): The temperature for sampling.
top_k (int): The number of highest probability tokens to keep for top-k sampling.
top_p (float): The cumulative probability for nucleus sampling.
device (str): The device to use.
Returns:
str: The generated text.
"""
# Get the module from the exported program
exported_module = exported_program.module()
# Tokenize the prompt
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
# Initialize with the prompt
generated_ids = input_ids.clone()
# Process the prompt tokens first
curr_position = 0
for i in range(input_ids.shape[1]):
# Process one token at a time
curr_input_ids = input_ids[:, i : i + 1]
curr_cache_position = torch.tensor([curr_position], dtype=torch.long, device=device)
# Forward pass
_ = exported_module(curr_input_ids, curr_cache_position)
curr_position += 1
# Generate new tokens
for _ in range(max_new_tokens):
# Get the last token as input
curr_input_ids = generated_ids[:, -1:]
curr_cache_position = torch.tensor([curr_position], dtype=torch.long, device=device)
# Forward pass to get next token logits
outputs = exported_module(curr_input_ids, curr_cache_position)
# Get the next token ID
if do_sample:
# Apply temperature
if temperature > 0:
logits = outputs / temperature
else:
logits = outputs
# Apply top-k filtering
if top_k > 0:
indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
logits[indices_to_remove] = float("-inf")
# Apply top-p (nucleus) filtering
if top_p < 1.0:
sorted_logits, sorted_indices = torch.sort(logits, descending=True)
cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
# Remove tokens with cumulative probability above the threshold
sorted_indices_to_remove = cumulative_probs > top_p
# Shift the indices to the right to keep also the first token above the threshold
sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
sorted_indices_to_remove[..., 0] = 0
# Scatter sorted tensors to original indexing
indices_to_remove = sorted_indices_to_remove.scatter(-1, sorted_indices, sorted_indices_to_remove)
logits[indices_to_remove] = float("-inf")
# Sample from the filtered distribution
probs = torch.softmax(logits, dim=-1)
next_token_id = torch.multinomial(probs, num_samples=1)
else:
# Greedy decoding
next_token_id = outputs.argmax(dim=-1, keepdim=True)
# Ensure next_token_id has the right shape before concatenation
if next_token_id.dim() > 2:
next_token_id = next_token_id.squeeze(-1)
# Append to the generated sequence
generated_ids = torch.cat([generated_ids, next_token_id], dim=-1)
curr_position += 1
# Stop if we generate an EOS token
if next_token_id.item() == tokenizer.eos_token_id:
break
# Decode the generated text
return tokenizer.decode(generated_ids[0], skip_special_tokens=True)
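For orientation, a minimal end-to-end usage sketch of the recipe above (the checkpoint name is a placeholder; it assumes a decoder-only model whose config sets `cache_implementation` to `"static"` or `"hybrid"`, and that this module lives at `transformers.integrations.executorch`):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.integrations.executorch import TorchExportableModuleForDecoderOnlyLM

model_id = "your-org/your-decoder-only-model"  # placeholder checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
model.eval()

# The wrapper picks the static or hybrid cache recipe from model.config.cache_implementation.
exportable = TorchExportableModuleForDecoderOnlyLM(model, max_batch_size=1, max_cache_len=1024)
exported = exportable.export()

# Token-by-token generation against the exported program.
print(
    TorchExportableModuleForDecoderOnlyLM.generate(
        exported, tokenizer, prompt="Hello, my name is", max_new_tokens=20
    )
)
```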
class TorchExportableModuleWithStaticCache(torch.nn.Module):
"""
A wrapper module designed to make a `PreTrainedModel` exportable with `torch.export`,
specifically for use with static caching. This module ensures that the exported model
is compatible with further lowering and execution in `ExecuTorch`.
A recipe module designed to make a `PreTrainedModel` exportable with `torch.export`,
specifically for decoder-only LM to `StaticCache`. This module ensures that the
exported model is compatible with further lowering and execution in `ExecuTorch`.
Note:
This class is specifically designed to support export process using `torch.export`
@ -137,7 +329,7 @@ class TorchExportableModuleWithStaticCache(torch.nn.Module):
This util function is designed to test exported models by simulating the generation process.
It processes the input prompt tokens sequentially (no parallel prefill).
This generate function is not intended to replace the original `generate` method, and the support
for leveraging the original `generate` is potentially planed!
for leveraging the original `generate` is potentially planned!
Args:
exported_program (`torch.export.ExportedProgram`): The exported program generated via `torch.export`.
@ -178,6 +370,94 @@ class TorchExportableModuleWithStaticCache(torch.nn.Module):
return torch.tensor([response_tokens], dtype=torch.long)
class TorchExportableModuleWithHybridCache(torch.nn.Module):
"""
A recipe module designed to make a `PreTrainedModel` exportable with `torch.export`,
specifically for decoder-only LM to `HybridCache`. This module ensures that the
exported model is compatible with further lowering and execution in `ExecuTorch`.
"""
def __init__(
self,
model: PreTrainedModel,
max_batch_size: int = 1,
max_cache_len: int = 4096,
):
"""
Initializes the exportable module with `HybridCache`.
Args:
model (`PreTrainedModel`): The pretrained model to wrap.
max_batch_size (int): Maximum batch size for the cache.
max_cache_len (int): Maximum sequence length for the cache.
Raises:
AssertionError: If the model doesn't have the expected configuration for HybridCache.
"""
super().__init__()
self.model = model
# Verify the model is configured for HybridCache
if not self.model.config.use_cache:
raise AssertionError("Model must have caching enabled")
if (
not hasattr(self.model.config, "cache_implementation")
or self.model.config.cache_implementation != "hybrid"
):
raise AssertionError("Model must use 'hybrid' cache implementation")
# Initialize the HybridCache
self.cache = HybridCache(
config=self.model.config,
max_batch_size=max_batch_size,
max_cache_len=max_cache_len,
device=self.model.device,
dtype=self.model.dtype,
)
# Register all key and value cache tensors as buffers
for i in range(len(self.cache.key_cache)):
self.register_buffer(f"key_cache_{i}", self.cache.key_cache[i], persistent=False)
self.register_buffer(f"value_cache_{i}", self.cache.value_cache[i], persistent=False)
def forward(
self,
input_ids: torch.Tensor,
cache_position: torch.Tensor,
) -> torch.Tensor:
"""
Forward pass of the module, which is compatible with the ExecuTorch llm runner.
Args:
input_ids (`torch.Tensor`): Tensor representing current input token id to the module.
cache_position (`torch.Tensor`): Tensor representing current input position in the cache.
Returns:
torch.Tensor: Logits output from the model.
"""
batch_size, seq_len = input_ids.shape
# Generate position_ids from cache_position
position_ids = cache_position.unsqueeze(0).expand(batch_size, -1)
# Create attention mask (always ones for token-by-token generation)
attention_mask = torch.ones((batch_size, seq_len), dtype=torch.long, device=input_ids.device)
# Forward pass with the model
outputs = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
position_ids=position_ids,
past_key_values=self.cache,
use_cache=True,
cache_position=cache_position,
)
# Return only the logits to simplify the export
return outputs.logits
def convert_and_export_with_cache(
model: PreTrainedModel,
example_input_ids: Optional[torch.Tensor] = None,

View File

@ -167,7 +167,7 @@ def _replace_with_fbgemm_fp8_linear(
"""
Private method that wraps the recursion for module replacement.
Returns the converted model and a boolean that indicates if the conversion has been successfull or not.
Returns the converted model and a boolean that indicates if the conversion has been successful or not.
"""
import re
@ -196,7 +196,7 @@ def _replace_with_fbgemm_fp8_linear(
# Force requires grad to False to avoid unexpected errors
model._modules[name].requires_grad_(False)
# set non persistant buffer outside of init_empty_weights
# set non persistent buffer outside of init_empty_weights
model._modules[name].input_scale_ub = torch.tensor(
[quantization_config.activation_scale_ub],
dtype=torch.float,

View File

@ -424,7 +424,7 @@ class GGUFLlamaConverter(LlamaConverter):
if post_processor:
tokenizer.post_processor = post_processor
# HACK: patch the llama-3 tokenizer to use the correspinding pre-tokenizer
# HACK: patch the llama-3 tokenizer to use the corresponding pre-tokenizer
# and normalizer
if self.is_llama_3_tokenizer:
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(

View File

@ -558,7 +558,7 @@ def replace_with_higgs_linear(
"""
Public method that recursively replaces the Linear layers of the given model with HIGGS quantized layers.
`accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the
conversion has been successfull or not.
conversion has been successful or not.
Args:
model (`torch.nn.Module`):

View File

@ -28,7 +28,7 @@ def autoname_modules(model):
module.name = name
# Get the linear_tag from a modul name. For example: model.layers.31.self_attn.k_proj -> self_attn.k_proj
# Get the linear_tag from a module name. For example: model.layers.31.self_attn.k_proj -> self_attn.k_proj
def name_to_linear_tag(name):
return ".".join([n for n in name.split(".") if ((n not in ["model", "layers"]) and (not n.isnumeric()))])
@ -86,9 +86,9 @@ def prepare_for_hqq_linear(model, quantization_config=None, modules_to_not_conve
"""
Prepares nn.Linear layers for HQQ quantization.
Since each layer type can have separate quantization parameters, we need to do the following:
1- tag each module with its neme via autoname_modules()
1- tag each module with its name via autoname_modules()
2- Extract linear_tags (e.g. ['self_attn.q_proj', ...])
3- Map quantization parameters as a dictionary linear_tag -> quant_params as HQQLinear exepects it, this is referred to as patch_params
3- Map quantization parameters as a dictionary linear_tag -> quant_params as HQQLinear expects it, this is referred to as patch_params
"""
modules_to_not_convert = [] if modules_to_not_convert is None else modules_to_not_convert
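A quick self-contained check of the tagging helper above, run on the example from its comment:

```python
def name_to_linear_tag(name):
    # Copied from the diff above: drop "model"/"layers" and numeric layer indices.
    return ".".join([n for n in name.split(".") if ((n not in ["model", "layers"]) and (not n.isnumeric()))])

print(name_to_linear_tag("model.layers.31.self_attn.k_proj"))  # -> self_attn.k_proj
```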

View File

@ -13,6 +13,8 @@
# limitations under the License.
from typing import Dict, Union
from ..utils import is_torchdynamo_compiling
try:
from kernels import (
@ -20,7 +22,9 @@ try:
LayerRepository,
register_kernel_mapping,
replace_kernel_forward_from_hub,
use_kernel_forward_from_hub,
)
from kernels import (
use_kernel_forward_from_hub as original_use_kernel_forward_from_hub,
)
_hub_kernels_available = True
@ -56,6 +60,40 @@ try:
register_kernel_mapping(_KERNEL_MAPPING)
def use_kernel_forward_from_hub(*args, **kwargs):
"""
Expands `kernels`' `use_kernel_forward_from_hub` to NOT use a kernel at compile time. This should be removed
when `kernels` supports `torch.compile`.
If the layer has a `config` attribute, we can also set `config.disable_custom_kernels = True` to disable the
kernel.
"""
def decorator_with_compile_path(cls):
# Keeps a reference to the original forward method
original_forward = cls.forward
# Applies the original decorator
decorator = original_use_kernel_forward_from_hub(*args, **kwargs)
cls = decorator(cls)
# Replaces the kernel forward with a compile-friendly version
kernel_forward = cls.forward
def forward_with_compile_path(*forward_args, **forward_kwargs):
disable_custom_kernels = hasattr(cls, "config") and getattr(cls.config, "disable_custom_kernels", None)
if is_torchdynamo_compiling() or disable_custom_kernels:
return original_forward(*forward_args, **forward_kwargs)
else:
return kernel_forward(*forward_args, **forward_kwargs)
cls.forward = forward_with_compile_path
return cls
return decorator_with_compile_path
except ImportError:
# Stub to make decorators in transformers work when `kernels`
# is not installed.

View File

@ -1093,7 +1093,7 @@ class CometCallback(TrainerCallback):
if state.is_hyper_param_search:
if mode is not None:
logger.warning(
"Hyperparameter Search is enabled, forcing the creation of new experimetns, COMET_MODE value %r is ignored",
"Hyperparameter Search is enabled, forcing the creation of new experiments, COMET_MODE value %r is ignored",
comet_old_mode,
)
mode = "create"

View File

@ -171,7 +171,7 @@ def npu_flash_attn_func(
head_num = q.shape[2]
output = torch_npu.npu_fusion_attention(q, k, v, head_num, "BSND", keep_prob=keep_prob, scale=softmax_scale)[0]
else:
attn_mask_npu = torch.triu(torch.ones([2048, 2048]), diagonal=1).bool().to(q.device)
attn_mask_npu = torch.triu(torch.ones([2048, 2048], device=q.device), diagonal=1).bool()
head_num = q.shape[2]
output = torch_npu.npu_fusion_attention(
q,
@ -222,7 +222,7 @@ def npu_flash_attn_varlen_func(
actual_seq_kvlen=tuple(cu_seqlens_k[1:].cpu().numpy().tolist()),
)[0]
else:
attn_mask_npu = torch.triu(torch.ones([2048, 2048]), diagonal=1).bool().to(q.device)
attn_mask_npu = torch.triu(torch.ones([2048, 2048], device=q.device), diagonal=1).bool()
head_num = q.shape[1]
output = torch_npu.npu_fusion_attention(
q,

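The two mask-construction lines changed above produce identical results; building the mask with `device=` simply avoids allocating it on the CPU first and copying it over. A quick check (using CPU as a stand-in for the NPU device):

```python
import torch

device = "cpu"  # stand-in for q.device on an NPU

mask_direct = torch.triu(torch.ones([8, 8], device=device), diagonal=1).bool()  # new form
mask_copied = torch.triu(torch.ones([8, 8]), diagonal=1).bool().to(device)      # old form
print(torch.equal(mask_direct, mask_copied))  # True
```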
View File

@ -350,7 +350,7 @@ class PeftAdapterMixin:
for _, module in self.named_modules():
if isinstance(module, (BaseTunerLayer, ModulesToSaveWrapper)):
# For backward compatbility with previous PEFT versions
# For backward compatibility with previous PEFT versions
if hasattr(module, "set_adapter"):
module.set_adapter(adapter_name)
else:

View File

@ -30,7 +30,7 @@ def replace_with_quanto_layers(
):
"""
Public method that recursively replaces the Linear layers of the given model with Quanto quantized layers.
Returns the converted model and a boolean that indicates if the conversion has been successfull or not.
Returns the converted model and a boolean that indicates if the conversion has been successful or not.
Args:
model (`torch.nn.Module`):

View File

@ -160,7 +160,7 @@ def distribute_module(
output_fn=None,
) -> nn.Module:
"""
Copy pasted from torch's function but we remove the communications (partitionning)
Copy pasted from torch's function but we remove the communications (partitioning)
as well as buffer registering that is similarly not efficient.
"""
if len(module._forward_pre_hooks) == 0:
@ -225,7 +225,7 @@ class GatherParallel(TensorParallelLayer):
@staticmethod
def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
# this op cannot be asynch, otherwise it completely breaks the outputs of models
# this op cannot be async, otherwise it completely breaks the outputs of models
torch.distributed.all_reduce(outputs[0], op=torch.distributed.ReduceOp.SUM, async_op=False)
return outputs
@ -307,7 +307,7 @@ class ColwiseParallel(TensorParallelLayer):
parameter = parameter.contiguous()
if self.use_dtensor:
parameter = DTensor.from_local(parameter, device_mesh, shard, run_check=False)
return nn.Parameter(parameter)
return nn.Parameter(parameter, requires_grad=parameter.is_floating_point())
@staticmethod
def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
@ -329,7 +329,7 @@ class PackedColwiseParallel(ColwiseParallel):
parameter = parameter.contiguous()
if self.use_dtensor:
parameter = DTensor.from_local(parameter, device_mesh, [Shard(-2)], run_check=False)
return nn.Parameter(parameter)
return nn.Parameter(parameter, requires_grad=parameter.is_floating_point())
class RowwiseParallel(TensorParallelLayer):
@ -381,7 +381,7 @@ class RowwiseParallel(TensorParallelLayer):
parameter = parameter.contiguous()
if self.use_dtensor:
parameter = DTensor.from_local(parameter, device_mesh, shard, run_check=False)
return nn.Parameter(parameter)
return nn.Parameter(parameter, requires_grad=parameter.is_floating_point())
@staticmethod
def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh):
@ -443,7 +443,7 @@ class PackedRowwiseParallel(RowwiseParallel):
parameter = parameter.contiguous()
if self.use_dtensor:
parameter = DTensor.from_local(parameter, device_mesh, [Shard(-1)], run_check=False)
return nn.Parameter(parameter)
return nn.Parameter(parameter, requires_grad=parameter.is_floating_point())
class SequenceParallel(TensorParallelLayer):
@ -521,13 +521,13 @@ class SequenceParallel(TensorParallelLayer):
# colwise shard weight/bias to Shard(0), weight be Shard(-2) (0 if you have 1 dim only)
# means Colwise as Linear is input * weight^T + bias, where
# weight would become Shard(1)
parameter = param[:]
parameter = param[...]
parameter = parameter.to(param_casting_dtype)
if to_contiguous:
parameter = parameter.contiguous()
if self.use_dtensor:
parameter = DTensor.from_local(parameter, device_mesh, [Replicate()], run_check=False)
return nn.Parameter(parameter)
return nn.Parameter(parameter, requires_grad=parameter.is_floating_point())
SUPPORTED_TP_STYLES = {
@ -606,14 +606,14 @@ def add_tensor_parallel_hooks_to_module(model, module, tp_plan, layer_name, curr
f"Trying to prepare {layer_name}, but it's not supported. Corresponding module: {module} Fix it's TP plan: {e}"
)
# 2. We add hooks to the parrent module if needed
# 2. We add hooks to the parent module if needed
if "." in layer_name:
parrent_layer_name = layer_name.rsplit(".", 1)[0]
generic_name = re.sub(r"\d+", "*", parrent_layer_name)
parent_layer_name = layer_name.rsplit(".", 1)[0]
generic_name = re.sub(r"\d+", "*", parent_layer_name)
# The module itself needs hooks
if module_plan := tp_plan.get(generic_name, False):
tp_layer = translate_to_torch_parallel_style(module_plan)
module_to_tp_ = model.get_submodule(parrent_layer_name)
module_to_tp_ = model.get_submodule(parent_layer_name)
tp_layer.prepare_module_tp(module_to_tp_, device_mesh)
@ -666,7 +666,7 @@ def shard_and_distribute_module(
# SUPER IMPORTANT we have to use setattr
# otherwise loading is crazy slow
if not isinstance(param, torch.nn.Parameter):
param = torch.nn.Parameter(param)
param = torch.nn.Parameter(param, requires_grad=param.is_floating_point())
setattr(module_to_tp, param_type, param)
# module_to_tp.load_state_dict({param_type: param}, strict=False, assign=True)
return param
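The `requires_grad=parameter.is_floating_point()` guard added throughout this file matters because PyTorch refuses to build a gradient-requiring parameter from an integer tensor, as happens with packed or quantized shards. A quick illustration:

```python
import torch

int_shard = torch.zeros(4, dtype=torch.int8)  # e.g. a quantized/packed weight shard

# Guarded construction, as in the diff: requires_grad is False for non-floating-point tensors.
p = torch.nn.Parameter(int_shard, requires_grad=int_shard.is_floating_point())
print(p.requires_grad)  # False

try:
    torch.nn.Parameter(int_shard)  # default requires_grad=True
except RuntimeError as err:
    print(err)  # only floating point / complex tensors can require gradients
```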

View File

@ -28,7 +28,7 @@ def replace_with_vptq_linear(
"""
Public method that recursively replaces the Linear layers of the given model with VPTQ quantized layers.
`accelerate` is needed to use this method. Returns the converted model and a boolean that indicates if the
conversion has been successfull or not.
conversion has been successful or not.
Args:
model (`torch.nn.Module`):

View File

@ -343,7 +343,7 @@ class HungarianMatcher(nn.Module):
# Compute the classification cost. Contrary to the loss, we don't use the NLL,
# but approximate it in 1 - proba[target class].
# The 1 is a constant that doesn't change the matching, it can be ommitted.
# The 1 is a constant that doesn't change the matching, it can be omitted.
class_cost = -out_prob[:, target_ids]
# Compute the L1 cost between boxes

View File

@ -99,7 +99,7 @@ class RTDetrHungarianMatcher(nn.Module):
target_bbox = torch.cat([v["boxes"] for v in targets])
# Compute the classification cost. Contrary to the loss, we don't use the NLL,
# but approximate it in 1 - proba[target class].
# The 1 is a constant that doesn't change the matching, it can be ommitted.
# The 1 is a constant that doesn't change the matching, it can be omitted.
if self.use_focal_loss:
out_prob = F.sigmoid(outputs["logits"].flatten(0, 1))
out_prob = out_prob[:, target_ids]
@ -112,7 +112,7 @@ class RTDetrHungarianMatcher(nn.Module):
# Compute the L1 cost between boxes
bbox_cost = torch.cdist(out_bbox, target_bbox, p=1)
# Compute the giou cost betwen boxes
# Compute the giou cost between boxes
giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))
# Compute the final cost matrix
cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost

View File

@ -27,7 +27,7 @@ from flax.traverse_util import flatten_dict, unflatten_dict
import transformers
from . import is_safetensors_available, is_torch_available
from .utils import logging
from .utils import check_torch_load_is_safe, logging
if is_torch_available():
@ -71,6 +71,7 @@ def load_pytorch_checkpoint_in_flax_state_dict(
)
raise
check_torch_load_is_safe()
pt_state_dict = torch.load(pt_path, map_location="cpu", weights_only=True)
logger.info(f"PyTorch checkpoint contains {sum(t.numel() for t in pt_state_dict.values()):,} parameters.")
@ -247,6 +248,7 @@ def convert_pytorch_sharded_state_dict_to_flax(shard_filenames, flax_model):
flax_state_dict = {}
for shard_file in shard_filenames:
# load using msgpack utils
check_torch_load_is_safe()
pt_state_dict = torch.load(shard_file, weights_only=True)
weight_dtypes = {k: v.dtype for k, v in pt_state_dict.items()}
pt_state_dict = {

View File

@ -258,6 +258,8 @@ TENSOR_PROCESSORS = {
def read_field(reader, field):
if field not in reader.fields:
return []
value = reader.fields[field]
return [_gguf_parse_value(value.parts[_data_index], value.types) for _data_index in value.data]
@ -369,6 +371,7 @@ def load_gguf_checkpoint(gguf_checkpoint_path, return_tensors=False, model_to_lo
parsed_parameters = {k: {} for k in GGUF_TO_TRANSFORMERS_MAPPING}
architecture = read_field(reader, "general.architecture")[0]
# NOTE: Some GGUF checkpoints may miss `general.name` field in metadata
model_name = read_field(reader, "general.name")
updated_architecture = None

View File

@ -0,0 +1,48 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from functools import partial
import torch.nn as nn
class GradientCheckpointingLayer(nn.Module):
"""Base class for layers with gradient checkpointing.
This class enables gradient checkpointing functionality for a layer. By default, gradient checkpointing is disabled
(`gradient_checkpointing = False`). When `model.set_gradient_checkpointing()` is called, gradient checkpointing is
enabled by setting `gradient_checkpointing = True` and assigning a checkpointing function to `_gradient_checkpointing_func`.
Important:
When using gradient checkpointing with `use_reentrant=True`, inputs that require gradients (e.g. hidden states)
must be passed as positional arguments (`*args`) rather than keyword arguments to properly propagate gradients.
Example:
```python
>>> # Correct - hidden_states passed as positional arg
>>> out = self.layer(hidden_states, attention_mask=attention_mask)
>>> # Incorrect - hidden_states passed as keyword arg
>>> out = self.layer(hidden_states=hidden_states, attention_mask=attention_mask)
```
"""
gradient_checkpointing = False
def __call__(self, *args, **kwargs):
if self.gradient_checkpointing and self.training:
return self._gradient_checkpointing_func(partial(super().__call__, **kwargs), *args)
return super().__call__(*args, **kwargs)
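A minimal usage sketch of the new base class (re-declared here so the snippet is self-contained; wiring `_gradient_checkpointing_func` by hand stands in for what `model.set_gradient_checkpointing()` does in the library):

```python
from functools import partial

import torch
import torch.nn as nn
import torch.utils.checkpoint


class GradientCheckpointingLayer(nn.Module):
    # Mirrors the class added in this diff.
    gradient_checkpointing = False

    def __call__(self, *args, **kwargs):
        if self.gradient_checkpointing and self.training:
            return self._gradient_checkpointing_func(partial(super().__call__, **kwargs), *args)
        return super().__call__(*args, **kwargs)


class ToyBlock(GradientCheckpointingLayer):
    def __init__(self):
        super().__init__()
        self.proj = nn.Linear(16, 16)

    def forward(self, hidden_states, attention_mask=None):
        return torch.relu(self.proj(hidden_states))


layer = ToyBlock().train()
layer.gradient_checkpointing = True
layer._gradient_checkpointing_func = partial(torch.utils.checkpoint.checkpoint, use_reentrant=True)

x = torch.randn(2, 16, requires_grad=True)
out = layer(x, attention_mask=None)  # hidden_states must be positional, per the docstring above
out.sum().backward()
print(x.grad.shape)  # torch.Size([2, 16])
```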

Some files were not shown because too many files have changed in this diff.