debug circleci

Rename huggingface_cli to hf (#39630 )
* Rename huggingface_cli to hf * hfh
2025-10-21 17:48:57 +08:00 · 2025-07-26 08:31:39 +02:00 · 2025-07-25 14:10:04 +02:00 · 2025-07-25 12:09:44 +00:00 · 2025-07-25 11:27:45 +00:00 · 2025-07-25 10:06:03 +01:00
1130 changed files with 57058 additions and 31416 deletions
--- a/.circleci/create_circleci_config.py
+++ b/.circleci/create_circleci_config.py
@ -178,7 +178,7 @@ class CircleCIJob:
            {"run": {"name": "fetch hub objects before pytest", "command": "python3 utils/fetch_hub_objects_for_ci.py"}},
            {"run": {
                "name": "Run tests",
-                "command": f"({timeout_cmd} python3 -m pytest {marker_cmd} -n {self.pytest_num_workers} {junit_flags} {repeat_on_failure_flags} {' '.join(pytest_flags)} $(cat splitted_tests.txt) | tee tests_output.txt)"}
+                "command": f"({timeout_cmd} python3 -m pytest -n 6 tests/models/vit/test_modeling_vit.py | tee tests_output.txt)"}
            },
            {"run": {"name": "Expand to show skipped tests", "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --skip"}},
            {"run": {"name": "Failed tests: show reasons",   "when": "always", "command": f"python3 .circleci/parse_test_outputs.py --file tests_output.txt --fail"}},
@ -303,7 +303,7 @@ non_model_job = CircleCIJob(
    docker_image=[{"image": "huggingface/transformers-torch-light"}],
    # networkx==3.3 (after #36957) cause some issues
    # TODO: remove this once it works directly
-    install_steps=["uv venv && uv pip install ."],
+    install_steps=["uv venv && uv pip install .[serving]"],
    marker="not generate",
    parallelism=6,
 )
@ -333,12 +333,12 @@ doc_test_job = CircleCIJob(
    pytest_num_workers=1,
 )

-REGULAR_TESTS = [torch_job, hub_job, onnx_job, tokenization_job, processor_job, generate_job, non_model_job] # fmt: skip
+REGULAR_TESTS = [torch_job] # fmt: skip
 EXAMPLES_TESTS = [examples_torch_job]
 PIPELINE_TESTS = [pipelines_torch_job]
 REPO_UTIL_TESTS = [repo_utils_job]
 DOC_TESTS = [doc_test_job]
-ALL_TESTS = REGULAR_TESTS + EXAMPLES_TESTS + PIPELINE_TESTS + REPO_UTIL_TESTS + DOC_TESTS + [custom_tokenizers_job] + [exotic_models_job]  # fmt: skip
+ALL_TESTS = REGULAR_TESTS


 def create_circleci_config(folder=None):
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@ -18,6 +18,10 @@ jobs:
      notebook_folder: transformers_doc
      languages: ar de en es fr hi it ko pt tr zh ja te
      custom_container: huggingface/transformers-doc-builder
+      # Temporary pin to work around datasets exception in the docbuilder.Remove after docker images and main have
+      # the right dependencies (which **should** be the case by 2025-07-20). See
+      # https://github.com/huggingface/transformers/actions/runs/16365952006/job/46243081358?pr=38545
+      pre_command: uv pip install datasets>=2.15.0
    secrets:
      token: ${{ secrets.HUGGINGFACE_PUSH }}
      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@ -15,3 +15,7 @@ jobs:
      pr_number: ${{ github.event.number }}
      package: transformers
      languages: en
+      # Temporary pin to work around datasets exception in the docbuilder. Remove after docker images and main have
+      # the right dependencies (which **should** be the case by 2025-07-20). See
+      # https://github.com/huggingface/transformers/actions/runs/16365952006/job/46243081358?pr=38545
+      pre_command: uv pip install datasets>=2.15.0
--- a/.github/workflows/self-comment-ci.yml
+++ b/.github/workflows/self-comment-ci.yml
@ -29,7 +29,7 @@ jobs:
    runs-on: ubuntu-22.04
    name: Get PR number
    # For security: only allow team members to run
-    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
+    if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada", "vasqu", "ivarflakstad", "stevhliu"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
    outputs:
      PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
    steps:
--- a/.gitignore
+++ b/.gitignore
@ -167,3 +167,6 @@ tags

 # ruff
 .ruff_cache
+
+# modular conversion
+*.modular_backup
--- a/README.md
+++ b/README.md
@ -44,7 +44,7 @@ limitations under the License.
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ja.md">日本語</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_hd.md">हिन्दी</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_ru.md">Русский</a> |
-        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Рortuguês</a> |
+        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_pt-br.md">Português</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_te.md">తెలుగు</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_fr.md">Français</a> |
        <a href="https://github.com/huggingface/transformers/blob/main/i18n/README_de.md">Deutsch</a> |
--- a/conftest.py
+++ b/conftest.py
@ -28,6 +28,7 @@ from transformers.testing_utils import HfDoctestModule, HfDocTestParser

 NOT_DEVICE_TESTS = {
    "test_tokenization",
+    "test_tokenization_mistral_common",
    "test_processor",
    "test_processing",
    "test_beam_constraints",
--- a/docker/transformers-all-latest-gpu/Dockerfile
+++ b/docker/transformers-all-latest-gpu/Dockerfile
@ -30,6 +30,8 @@ RUN python3 -m pip install --no-cache-dir -e ./transformers[dev,onnxruntime] &&

 RUN python3 -m pip uninstall -y flax jax

+RUN python3 -m pip install --no-cache-dir -U timm
+
 RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
 RUN python3 -m pip install -U "itsdangerous<2.1.0"

--- a/docker/transformers-pytorch-amd-gpu/Dockerfile
+++ b/docker/transformers-pytorch-amd-gpu/Dockerfile
@ -1,10 +1,10 @@
-FROM rocm/pytorch:rocm6.4_ubuntu22.04_py3.10_pytorch_release_2.6.0
+FROM rocm/pytorch:rocm6.4.1_ubuntu24.04_py3.12_pytorch_release_2.7.1
 LABEL maintainer="Hugging Face"

 ARG DEBIAN_FRONTEND=noninteractive

-ARG TORCH_VISION='0.21.0'
-ARG TORCH_AUDIO='2.6.0'
+ARG TORCH_VISION='0.22.0'
+ARG TORCH_AUDIO='2.7.0'

 RUN apt update && \
    apt install -y --no-install-recommends git libsndfile1-dev tesseract-ocr espeak-ng python3 python3-dev python3-pip python3-dev ffmpeg git-lfs && \
--- a/docker/transformers-quantization-latest-gpu/Dockerfile
+++ b/docker/transformers-quantization-latest-gpu/Dockerfile
@ -78,6 +78,9 @@ RUN git clone https://github.com/NetEase-FuXi/EETQ.git && cd EETQ/ && git submod
 # RUN python3 -m pip install --no-cache-dir flute-kernel==0.4.1
 # RUN python3 -m pip install --no-cache-dir git+https://github.com/Dao-AILab/fast-hadamard-transform.git

+# Add fp-quant for quantization testing
+RUN python3 -m pip install --no-cache-dir "fp-quant>=0.1.6"
+
 # Add compressed-tensors for quantization testing
 RUN python3 -m pip install --no-cache-dir compressed-tensors

--- a/docs/source/ar/custom_models.md
+++ b/docs/source/ar/custom_models.md
@ -280,7 +280,7 @@ resnet50d.model.load_state_dict(pretrained_model.state_dict())
 الآن لإرسال النموذج إلى Hub، تأكد من تسجيل الدخول. إما تشغيل في المحطة الأوامر الطرفية الخاصة بك:

 ```bash
-huggingface-cli login
+hf auth login
 ```

 أو من دفتر ملاحظات:
--- a/docs/source/ar/model_sharing.md
+++ b/docs/source/ar/model_sharing.md
@ -41,7 +41,7 @@ picture-in-picture" allowfullscreen></iframe>
 قبل مشاركة نموذج على Hub، ستحتاج إلى بيانات اعتماد حساب Hugging Face الخاصة بك.  إذا كنت تستخدم منصة الأوامر، فقم بتشغيل الأمر التالي في بيئة افتراضية حيث تم تثبيت 🤗 Transformers. سيقوم هذا الأمر بتخزين رمز الدخول الخاص بك في مجلد تخزين المؤقت لـ Hugging Face (`~/.cache/` بشكل افتراضي):

 ```bash
-huggingface-cli login
+hf auth login
 ```

 إذا كنت تستخدم دفتر ملاحظات مثل Jupyter أو Colaboratory، فتأكد من تثبيت مكتبة [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library). تسمح لك هذه المكتبة بالتفاعل برمجيًا مع Hub.
--- a/docs/source/ar/run_scripts.md
+++ b/docs/source/ar/run_scripts.md
@ -324,7 +324,7 @@ python examples/pytorch/summarization/run_summarization.py
 يمكن لجميع النصوص البرمجية رفع نموذجك النهائي إلى [مركز النماذج](https://huggingface.co/models). تأكد من تسجيل الدخول إلى Hugging Face قبل البدء:

 ```bash
-huggingface-cli login
+hf auth login
 ```

 ثم أضف المعلمة `push_to_hub` إلى النص البرمجي . ستقوم هذه المعلمة بإنشاء مستودع باستخدام اسم مستخدم Hugging Face واسم المجلد المحدد في `output_dir`.
--- a/docs/source/de/model_sharing.md
+++ b/docs/source/de/model_sharing.md
@ -56,7 +56,7 @@ Dateien lassen sich auch in einem Repository leicht bearbeiten, und Sie können
 Bevor Sie ein Modell für den Hub freigeben, benötigen Sie Ihre Hugging Face-Anmeldedaten. Wenn Sie Zugang zu einem Terminal haben, führen Sie den folgenden Befehl in der virtuellen Umgebung aus, in der 🤗 Transformers installiert ist. Dadurch werden Ihre Zugangsdaten in Ihrem Hugging Face-Cache-Ordner (standardmäßig `~/.cache/`) gespeichert:

 ```bash
-huggingface-cli login
+hf auth login
 ```

 Wenn Sie ein Notebook wie Jupyter oder Colaboratory verwenden, stellen Sie sicher, dass Sie die [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) Bibliothek installiert haben. Diese Bibliothek ermöglicht Ihnen die programmatische Interaktion mit dem Hub.
--- a/docs/source/de/run_scripts.md
+++ b/docs/source/de/run_scripts.md
@ -324,7 +324,7 @@ python examples/pytorch/summarization/run_summarization.py
 Alle Skripte können Ihr endgültiges Modell in den [Model Hub](https://huggingface.co/models) hochladen. Stellen Sie sicher, dass Sie bei Hugging Face angemeldet sind, bevor Sie beginnen:

 ```bash
-huggingface-cli login
+hf auth login
 ```

 Dann fügen Sie dem Skript das Argument `push_to_hub` hinzu. Mit diesem Argument wird ein Repository mit Ihrem Hugging Face-Benutzernamen und dem in `output_dir` angegebenen Ordnernamen erstellt.
--- a/docs/source/en/_toctree.yml
+++ b/docs/source/en/_toctree.yml
@ -72,8 +72,6 @@
      title: Caching
    - local: kv_cache
      title: KV cache strategies
-    - local: serving
-      title: Serving
    - local: llm_tutorial_optimization
      title: Getting the most out of LLMs
    - local: perplexity
@ -105,6 +103,10 @@
    title: Agents
  - local: tools
    title: Tools
+  - local: serving
+    title: Serving
+  - local: transformers_as_backend
+    title: Inference server backends
  title: Inference
 - isExpanded: false
  sections:
@ -177,6 +179,8 @@
    title: FBGEMM
  - local: quantization/finegrained_fp8
    title: Fine-grained FP8
+  - local: quantization/fp_quant
+    title: FP-Quant
  - local: gguf
    title: GGUF
  - local: quantization/gptq
@ -441,6 +445,10 @@
        title: Encoder Decoder Models
      - local: model_doc/ernie
        title: ERNIE
+      - local: model_doc/ernie4_5
+        title: Ernie4_5
+      - local: model_doc/ernie4_5_moe
+        title: Ernie4_5_MoE
      - local: model_doc/ernie_m
        title: ErnieM
      - local: model_doc/esm
@ -475,6 +483,8 @@
        title: GLM
      - local: model_doc/glm4
        title: glm4
+      - local: model_doc/glm4_moe
+        title: glm4_moe
      - local: model_doc/openai-gpt
        title: GPT
      - local: model_doc/gpt_neo
@ -517,6 +527,8 @@
        title: Jukebox
      - local: model_doc/led
        title: LED
+      - local: model_doc/lfm2
+        title: LFM2
      - local: model_doc/llama
        title: LLaMA
      - local: model_doc/llama2
@ -561,6 +573,8 @@
        title: MobileBERT
      - local: model_doc/modernbert
        title: ModernBert
+      - local: model_doc/modernbert-decoder
+        title: ModernBERTDecoder
      - local: model_doc/mpnet
        title: MPNet
      - local: model_doc/mpt
@ -709,6 +723,8 @@
        title: D-FINE
      - local: model_doc/dab-detr
        title: DAB-DETR
+      - local: model_doc/deepseek_v2
+        title: DeepSeek-V2
      - local: model_doc/deformable_detr
        title: Deformable DETR
      - local: model_doc/deit
@ -735,6 +751,8 @@
        title: DPT
      - local: model_doc/efficientformer
        title: EfficientFormer
+      - local: model_doc/efficientloftr
+        title: EfficientLoFTR
      - local: model_doc/efficientnet
        title: EfficientNet
      - local: model_doc/eomt
@ -1035,6 +1053,8 @@
        title: PaliGemma
      - local: model_doc/perceiver
        title: Perceiver
+      - local: model_doc/perception_lm
+        title: PerceptionLM
      - local: model_doc/phi4_multimodal
        title: Phi4 Multimodal
      - local: model_doc/pix2struct
@ -1087,6 +1107,8 @@
        title: Vision Text Dual Encoder
      - local: model_doc/visual_bert
        title: VisualBERT
+      - local: model_doc/voxtral
+        title: Voxtral
      - local: model_doc/xclip
        title: X-CLIP
      title: Multimodal models
--- a/docs/source/en/attention_interface.md
+++ b/docs/source/en/attention_interface.md
@ -60,11 +60,11 @@ You will see it prints "I just entered the attention computation" as many times

 ## Dynamically switching attention function

-You could dynamically change the model's attention function as well, by overriding the `config._attn_implementation` field:
+You could dynamically change the model's attention function as well:

 ```python
 # Back to use original sdpa implementation
-model.config._attn_implementation = "sdpa"
+model.set_attn_implementation("sdpa")

 model(torch.ones(1, 5, dtype=int))
 ```
@ -72,6 +72,34 @@ model(torch.ones(1, 5, dtype=int))
 and it will stop printing the statements, as it now uses the `sdpa` attention.  
 This allows to quickly change an attention function, without needing to reload the model!

+## Different attention per backbone in multimodal models
+
+For multimodal models different attention functions may work better for each backbone module. For example, some vision backbones perform better in fp32, but are incompatible with FlashAttention. To continue using FlashAttention while keeping the vision encoder in fp32, create a dict and map each config to an attention implementation as shown below.
+
+```python
+from transformers import AutoModelForImageTextToText
+
+model_id = "facebook/chameleon-7b"
+
+attention_implementation_per_backbone = {"vision_config": "sdpa", "text_config": "flash_attention_2"}
+model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation=attention_implementation_per_backbone)
+
+# NOTE: keys in the attention implementation have to be the same as the sub-config names
+for key in attention_implementation_per_backbone:
+    assert key in model.config.sub_configs, f"Invalid key in `attention_implementation`"
+
+# You can omit certain backbones - the default attention function (SDPA) will be used
+# This is equivalent to the previous example
+model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation={"text_config": "flash_attention_2"})
+
+
+# Set the same attention implementation for all backbones with single string, same as in non-multimodal models
+model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation="eager")
+
+# Alternatively use a dict with an empty key for global configuration
+model = AutoModelForImageTextToText.from_pretrained(model_id, attn_implementation={"": "eager"})
+```
+
 ## What about new args needed in my custom attention function?

 But indeed, what if the new function requires a new arg to be properly used? It's no issue! Models supporting the
--- a/docs/source/en/auto_docstring.md
+++ b/docs/source/en/auto_docstring.md
@ -64,9 +64,9 @@ Arguments can also be passed directly to `@auto_docstring` for more control. Use
    It builds upon the standard Transformer architecture with unique modifications.""",
    custom_args="""
    custom_parameter (`type`, *optional*, defaults to `default_value`):
-        A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
+        A concise description for custom_parameter if not defined or overriding the description in `auto_docstring.py`.
    internal_helper_arg (`type`, *optional*, defaults to `default_value`):
-        A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
+        A concise description for internal_helper_arg if not defined or overriding the description in `auto_docstring.py`.
    """
 )
 class MySpecialModel(PreTrainedModel):
@ -85,13 +85,40 @@ class MySpecialModel(PreTrainedModel):
    def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
        r"""
        custom_parameter (`type`, *optional*, defaults to `default_value`):
-            A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
+            A concise description for custom_parameter if not defined or overriding the description in `auto_docstring.py`.
        internal_helper_arg (`type`, *optional*, defaults to `default_value`):
-            A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
+            A concise description for internal_helper_arg if not defined or overriding the description in `auto_docstring.py`.
        """
        # ...
 ```

+You should also use the `@auto_docstring` decorator for classes that inherit from [`~utils.ModelOutput`].
+
+```python
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Custom model outputs with additional fields.
+    """
+)
+class MyModelOutput(ImageClassifierOutput):
+    r"""
+    loss (`torch.FloatTensor`, *optional*):
+        The loss of the model.
+    custom_field (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*):
+        A custom output field specific to this model.
+    """
+
+    # Standard fields like hidden_states, logits, attentions etc. can be automatically documented if the description is the same as the standard arguments.
+    # However, given that the loss docstring is often different per model, you should document it in the docstring above.
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    # Custom fields need to be documented in the docstring above
+    custom_field: Optional[torch.FloatTensor] = None
+```
+
 </hfoption>
 <hfoption id="functions">

@ -171,7 +198,7 @@ class MyModel(PreTrainedModel):

 There are some rules for documenting different types of arguments and they're listed below.

- Standard arguments (`input_ids`, `attention_mask`, `pixel_values`, etc.) are defined and retrieved from `args_doc.py`. It is the single source of truth for standard arguments and should not be redefined locally if an argument's description and shape is the same as an argument in `args_doc.py`.
+- Standard arguments (`input_ids`, `attention_mask`, `pixel_values`, etc.) are defined and retrieved from `auto_docstring.py`. It is the single source of truth for standard arguments and should not be redefined locally if an argument's description and shape is the same as an argument in `auto_docstring.py`.

    If a standard argument behaves differently in your model, then you can override it locally in a `r""" """` block. This local definition has a higher priority. For example, the `labels` argument is often customized per model and typically requires overriding.

@ -245,7 +272,7 @@ When working with modular files (`modular_model.py`), follow the guidelines belo
 The `@auto_docstring` decorator automatically generates docstrings by:

 1. Inspecting the signature (arguments, types, defaults) of the decorated class' `__init__` method or the decorated function.
-2. Retrieving the predefined docstrings for common arguments (`input_ids`, `attention_mask`, etc.) from internal library sources like [`ModelArgs`], [`ImageProcessorArgs`], and the `args_doc.py` file.
+2. Retrieving the predefined docstrings for common arguments (`input_ids`, `attention_mask`, etc.) from internal library sources like [`ModelArgs`], [`ImageProcessorArgs`], and the `auto_docstring.py` file.
 3. Adding argument descriptions in one of two ways as shown below.

    | method | description | usage |
@ -253,7 +280,7 @@ The `@auto_docstring` decorator automatically generates docstrings by:
    | `r""" """` | add custom docstring content directly to a method signature or within the `__init__` docstring | document new arguments or override standard descriptions |
    | `custom_args` | add custom docstrings for specific arguments directly in `@auto_docstring` | define docstring for new arguments once if they're repeated in multiple places in the modeling file |

-4. Adding class and function descriptions. For model classes with standard naming patterns, like `ModelForCausalLM`, or if it belongs to a pipeline, `@auto_docstring` automatically generates the appropriate descriptions with `ClassDocstring` from `args_doc.py`.
+4. Adding class and function descriptions. For model classes with standard naming patterns, like `ModelForCausalLM`, or if it belongs to a pipeline, `@auto_docstring` automatically generates the appropriate descriptions with `ClassDocstring` from `auto_docstring.py`.

    `@auto_docstring` also accepts the `custom_intro` argument to describe a class or function.

--- a/docs/source/en/cache_explanation.md
+++ b/docs/source/en/cache_explanation.md
@ -82,24 +82,18 @@ When you use Transformers' [`Cache`] class, the self-attention module performs s

 ## Cache storage implementation

-The actual storage of key-value pairs varies between cache implementations. As an example, consider the [`DynamicCache`].
+Caches are structured as a list of layers, where each layer contains a key and value cache. The key and value caches are tensors with the shape `[batch_size, num_heads, seq_len, head_dim]`.

+Layers can be of different types (e.g. `DynamicLayer`, `StaticLayer`, `SlidingWindowLayer`), which mostly changes how sequence length is handled and how the cache is updated.

-In [`DynamicCache`], the key-value pairs are stored as two lists of tensors. Each tensor in the lists have the shape `[batch_size, num_heads, seq_len, head_dim]`.
- `key_cache`: A list of tensors, one for each layer.
- `value_cache`: A list of tensors, one for each layer.
+The simplest is a `DynamicLayer` that grows as more tokens are processed. The sequence length dimension (`seq_len`) increases with each new token:

-When new tokens are processed:
-
-1. For each layer, the new key and value states are concatenated with the existing cache.
 ```py
-self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=-2)
-self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=-2)
+cache.layers[idx].keys = torch.cat([cache.layers[idx].keys, key_states], dim=-2)
+cache.layers[idx].values = torch.cat([cache.layers[idx].values, value_states], dim=-2)
 ```

-2. The cache grows dynamically as more tokens are processed. The sequence length dimension (`seq_len`) increases with each new token.
-
-3. The cache maintains a count of seen tokens through `self._seen_tokens`. This is updated when the first layer processes a new token.
+Other layer types like `StaticLayer` and `SlidingWindowLayer` have a fixed sequence length that is set when the cache is created. This makes them compatible with `torch.compile`. In the case of `SlidingWindowLayer`, existing tokens are shifted out of the cache when a new token is added.

 The example below demonstrates how to create a generation loop with [`DynamicCache`]. As discussed, the attention mask is a concatenation of past and current token values and `1` is added to the cache position for the next token.

@ -134,6 +128,34 @@ for _ in range(max_new_tokens):
 print(tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0])
 "[INST] Hello, what's your name. [/INST]  Hello! My name is LLaMA,"
 ```
+
+## Cache position
+
+The cache position tracks where to insert new tokens in the attention cache. It represents the *absolute* position of each token in the context, independent of padding or batch structure. Suppose you already cached `N` tokens and are now processing `K` new tokens. The cache position for the new tokens will range from `N` to `N + K - 1`. In other words, you're processing tokens at positions - `[N, N + 1, N + 2, ..., N + K - 1]`.
+
+Cache position is used internally for two purposes:
+
+1. Selecting new tokens to process in the input sequence and ensuring only tokens that haven’t been cached yet are passed to the model's `forward`.
+2. Storing key/value pairs at the correct positions in the cache. This is especially important for fixed-size caches, like [`StaticCache`], that pre-allocates a specific cache length.
+
+The generation loop usually takes care of the cache position, but if you're writing a custom generation method, it is important that cache positions are accurate since they are used to write and read key/value states into fixed slots.
+
+
+```py
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, DynamicCache
+
+model_id = "meta-llama/Llama-2-7b-chat-hf"
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, device_map="cuda:0")
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+messages = [{"role": "user", "content": "You are a helpful assistant."}]
+inputs = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt", return_dict=True).to("cuda:0")
+generated_ids = model.generate(**inputs, use_cache=True, max_new_tokens=10)
+
+```
+
+
 ## Legacy cache format

 Before the [`Cache`] class, the cache used to be stored as a tuple of tuples of tensors. This format is dynamic because it grows as text is generated, similar to [`DynamicCache`].
@ -143,7 +165,7 @@ The legacy format is essentially the same data structure but organized different
 - The tensors have the same shape `[batch_size, num_heads, seq_len, head_dim]`.
 - The format is less flexible and doesn't support features like quantization or offloading.

-If your project depends on this legacy format, you can convert between [`DynamicCache`] and a tuple of tuples as shown below with the [`~DynamicCache.from_legacy_cache`] and [`DynamicCache.to_legacy_cache`] functions. This is helpful if you have custom logic for manipulating a cache in a specific format.
+If your project depends on this legacy format, we recommend to convert to [`DynamicCache`] with [`~DynamicCache.from_legacy_cache`]. Note that legacy cache format is deprecated and not used anymore in `Transformers`. You can convert back to tuple format with [`DynamicCache.to_legacy_cache`] functions, which is helpful if you have custom logic for manipulating a cache in a specific format.

 ```py
 import torch
@ -159,4 +181,4 @@ generation_outputs = model.generate(**inputs, return_dict_in_generate=True, retu

 cache = DynamicCache.from_legacy_cache(generation_outputs.past_key_values)
 legacy_format_cache = cache.to_legacy_cache()
-```
+```
--- a/docs/source/en/custom_models.md
+++ b/docs/source/en/custom_models.md
@ -271,7 +271,7 @@ The model is ready to be pushed to the Hub now. Log in to your Hugging Face acco
 <hfoption id="huggingface-CLI">

 ```bash
-huggingface-cli login
+hf auth login
 ```

 </hfoption>
--- a/docs/source/en/internal/generation_utils.md
+++ b/docs/source/en/internal/generation_utils.md
@ -356,66 +356,93 @@ A [`Constraint`] can be used to force the generation to include specific tokens

 ## Caches

-[[autodoc]] Cache
-    - update
-
-[[autodoc]] CacheConfig
-	- update
-
-[[autodoc]] QuantizedCacheConfig
-	- validate
-
-[[autodoc]] DynamicCache
+[[autodoc]] CacheLayerMixin
    - update
    - get_seq_length
+    - get_mask_sizes
+    - get_max_cache_shape
+    - reset
    - reorder_cache
+
+[[autodoc]] DynamicLayer
+    - update
+    - crop
+    - batch_repeat_interleave
+    - batch_select_indices
+
+[[autodoc]] StaticLayer
+    - update
+
+[[autodoc]] SlidingWindowLayer
+    - update
+
+[[autodoc]] CacheProcessor
+    - pre_update
+    - post_update
+
+[[autodoc]] OffloadedCacheProcessor
+    - pre_update
+
+[[autodoc]] QuantizedCacheProcessor
+    - post_update
+
+[[autodoc]] QuantoQuantizedCacheProcessor
+    - post_update
+
+[[autodoc]] HQQQuantizedCacheProcessor
+    - post_update
+
+[[autodoc]] Cache
+    - update
+    - get_seq_length
+    - get_mask_sizes
+    - get_max_cache_shape
+    - reset
+    - reorder_cache
+    - crop
+    - batch_repeat_interleave
+    - batch_select_indices
+
+[[autodoc]] DynamicCache
    - to_legacy_cache
    - from_legacy_cache

 [[autodoc]] QuantizedCache
-    - update
-    - get_seq_length

 [[autodoc]] QuantoQuantizedCache

+[[autodoc]] QuantoQuantizedCacheProcessor
+
 [[autodoc]] HQQQuantizedCache

+[[autodoc]] HQQQuantizedCacheProcessor
+
 [[autodoc]] OffloadedCache
-    - update
-    - prefetch_layer
-    - evict_previous_layer

 [[autodoc]] StaticCache
-    - update
-    - get_seq_length
-    - reset

 [[autodoc]] OffloadedStaticCache
-    - update
-    - get_seq_length
-    - reset

 [[autodoc]] HybridCache
-    - update
-    - get_seq_length
-    - reset
+
+[[autodoc]] HybridChunkedCache

 [[autodoc]] SlidingWindowCache
-    - update
-    - reset

 [[autodoc]] EncoderDecoderCache
-    - get_seq_length
    - to_legacy_cache
    - from_legacy_cache
-    - reset
-    - reorder_cache

 [[autodoc]] MambaCache
    - update_conv_state
    - update_ssm_state
    - reset

+[[autodoc]] CacheConfig
+
+[[autodoc]] QuantizedCacheConfig
+
+
 ## Watermark Utils

 [[autodoc]] WatermarkingConfig
--- a/docs/source/en/internal/model_debugging_utils.md
+++ b/docs/source/en/internal/model_debugging_utils.md
@ -247,3 +247,114 @@ first and last layer will be shown. This is useful when some layers (typically c
 layers.

 [[autodoc]] model_addition_debugger_context
+
+## Analyzer of skipped tests
+
+### Scan skipped tests - for model adders and maintainers
+
+This small util is a power user tool intended for model adders and maintainers. It lists all test methods
+existing in `test_modeling_common.py`, inherited by all model tester classes, and scans the repository to measure
+how many tests are being skipped and for which models. 
+
+### Rationale
+
+When porting models to transformers, tests fail as they should, and sometimes `test_modeling_common` feels irreconcilable with the peculiarities of our brand new model. But how can we be sure we're not breaking everything by adding a seemingly innocent skip?
+
+This utility:
+- scans all test_modeling_common methods
+- looks for times where a method is skipped
+- returns a summary json you can load as a DataFrame/inspect
+
+**For instance test_inputs_embeds is skipped in a whooping 39% proportion at the time of writing this util.**
+
+![download-icon](https://huggingface.co/datasets/huggingface/documentation-images/resolve/f7f671f69b88ce4967e19179172c248958d35742/transformers/tests_skipped_visualisation.png)
+
+
+### Usage 
+
+You can run the skipped test analyzer in two ways:
+
+#### Full scan (default)
+
+From the root of `transformers` repo, scans all common test methods and outputs the results to a JSON file (default: `all_tests_scan_result.json`).
+
+```bash
+python utils/scan_skipped_tests.py --output_dir path/to/output
+```
+
+- `--output_dir` (optional): Directory where the JSON results will be saved. Defaults to the current directory.
+
+**Example output:**
+
+```
+🔬 Parsing 331 model test files once each...
+📝 Aggregating 224 tests...
+  (224/224) test_update_candidate_strategy_with_matches_1es_3d_is_nonecodet_schedule_fa_kwargs
+✅ Scan complete.
+
+📄 JSON saved to /home/pablo/git/transformers/all_tests_scan_result.json
+
+```
+
+And it will generate `all_tests_scan_result.json` file that you can inspect. The JSON is indexed by method name, and each entry follows this schema, indicating the origin as well (from `common`or `GenerationMixin`.)
+
+```json
+{
+  "<method_name>": {
+    "origin": "<test suite>"
+    "models_ran": ["<model_name>", ...],
+    "models_skipped": ["<model_name>", ...],
+    "skipped_proportion": <float>,
+    "reasons_skipped": ["<model_name>: <reason>",
+      ...
+    ]
+  },
+  ...
+}
+```
+
+Which you can visualise as above with e.g. `pandas`
+
+```python
+df = pd.read_json('all_tests_scan_result.json').T
+df.sort_values(by=['skipped_proportion'], ascending=False)
+
+```
+
+### Scan a single test method
+
+You can focus on a specific test method using `--test_method_name`:
+
+```bash
+$ python utils/scan_skipped_tests.py --test_method_name test_inputs_embeds --output_dir path/to/output
+```
+
+- `--test_method_name`: Name of the test method to scan (e.g., `test_inputs_embeds`).
+- `--output_dir` (optional): Directory where the JSON result will be saved.
+
+**Example output:**
+
+```bash
+$ python utils/scan_skipped_tests.py --test_method_name test_inputs_embeds
+
+🔬 Parsing 331 model test files once each...
+
+== test_inputs_embeds ==
+
+Ran    : 199/323
+Skipped : 124/323 (38.4%)
+ - aimv2: Aimv2 does not use inputs_embeds
+ - align: Inputs_embeds is tested in individual model tests
+ - altclip: Inputs_embeds is tested in individual model tests
+ - audio_spectrogram_transformer: AST does not use inputs_embeds
+ - beit: BEiT does not use inputs_embeds
+ - bit: Bit does not use inputs_embeds
+ - blip: Blip does not use inputs_embeds
+ - blip_2: Inputs_embeds is tested in individual model tests
+ - bridgetower: 
+ - canine: CANINE does not have a get_input_embeddings() method.
+ - ...
+
+📄 JSON saved to /home/pablo/git/transformers/scan_test_inputs_embeds.json
+
+```
--- a/docs/source/en/kv_cache.md
+++ b/docs/source/en/kv_cache.md
@ -134,7 +134,7 @@ The [`QuantizedCache`] reduces memory requirements by quantizing the KV values t
 > [!WARNING]
 > Quantizing the cache can harm latency if the context length is short and there is enough GPU memory available for generation without enabling cache quantization. Try to find a balance between memory efficiency and latency.

-Enable [`QuantizedCache`] by configuring `cache_implementation="quantized"` in [`GenerationConfig`], and indicate the quantization backend in [`QuantizedCacheConfig`]. Any additional quantization related parameters should also be passed either as a dict or an instance of [`QuantizedCacheConfig`]. You should use the default values for these additional parameters unless you're running out-of-memory. In that case, consider decreasing the residual length.
+Enable [`QuantizedCache`] by configuring `cache_implementation="quantized"` in [`GenerationConfig`], and the quantization backend, as well as any additional quantization related parameters should also be passed either as a dict. You should use the default values for these additional parameters unless you're running out-of-memory. In that case, consider decreasing the residual length.

 <hfoptions id="quantized-cache">
 <hfoption id="HQQQuantizedCache">
@ -143,7 +143,7 @@ For [`HQQQuantizedCache`], we recommend setting the `axis-key` and `axis-value`

 ```py
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, HQQQuantizedCache, QuantizedCacheConfig
+from transformers import AutoTokenizer, AutoModelForCausalLM, HQQQuantizedCache

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
@ -161,7 +161,7 @@ For [`QuantoQuantizedCache`], we recommend setting the `axis-key` and `axis-valu

 ```py
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoQuantizedCache, QuantizedCacheConfig
+from transformers import AutoTokenizer, AutoModelForCausalLM, QuantoQuantizedCache

 tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf")
 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-chat-hf", torch_dtype=torch.float16, device_map="auto")
@ -275,7 +275,6 @@ from transformers.cache_utils import (
    StaticCache,
    SlidingWindowCache,
    QuantoQuantizedCache,
-    QuantizedCacheConfig,
 )

 model_id = "meta-llama/Llama-2-7b-chat-hf"
--- a/docs/source/en/llm_optims.md
+++ b/docs/source/en/llm_optims.md
@ -341,7 +341,7 @@ A known issue with transformer models is that the self-attention mechanism grows

 FlashAttention and [FlashAttention-2](./perf_infer_gpu_one#flashattention-2) break up the attention computation into smaller chunks and reduces the number of intermediate read/write operations to the GPU memory to speed up inference. FlashAttention-2 improves on the original FlashAttention algorithm by also parallelizing over sequence length dimension and better partitioning work on the hardware to reduce synchronization and communication overhead.

-To use FlashAttention-2, set [attn_implementation](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.PreTrainedModel.from_pretrained.attn_implementation) to `"flash_attention_2"` in [`~PreTrainedModel.from_pretrained`].
+To use FlashAttention-2, set [attn_implementation](https://hf.co/docs/transformers/main/en/main_classes/text_generation#transformers.PreTrainedModel.from_pretrained.attn_implementation) to `"flash_attention_2"` in [`~PreTrainedModel.from_pretrained`] or set with `model.set_attention_implementation("flash_attention_2")` to dynamically update the [attention interface](./attention_interface) after the model is loaded.

 ```py
 from transformers import AutoModelForCausalLM, BitsAndBytesConfig
@ -353,6 +353,14 @@ model = AutoModelForCausalLM.from_pretrained(
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
 )
+
+# Change the model's attention dynamically after loading
+model = AutoModelForCausalLM.from_pretrained(
+    "google/gemma-2b",
+    quantization_config=quant_config,
+    torch_dtype=torch.bfloat16
+)
+model.set_attention_implementation("flash_attention_2")
 ```

 ### PyTorch scaled dot product attention
@ -360,7 +368,7 @@ model = AutoModelForCausalLM.from_pretrained(
 Scaled dot product attention (SDPA) is automatically enabled in PyTorch 2.0 and it supports FlashAttention, xFormers, and PyTorch's C++ implementation. SDPA chooses the most performant attention algorithm if you're using a CUDA backend. For other backends, SDPA defaults to the PyTorch C++ implementation.

 > [!TIP]
-> SDPA automaticallysupports FlashAttention-2 as long as you have the latest PyTorch version installed.
+> SDPA automatically supports FlashAttention-2 as long as you have the latest PyTorch version installed.

 Use the [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/stable/generated/torch.nn.attention.sdpa_kernel.html) context manager to explicitly enable or disable any of the four attention algorithms. For example, use `SDPBackend.FLASH_ATTENTION` to enable FlashAttention.

--- a/docs/source/en/main_classes/callback.md
+++ b/docs/source/en/main_classes/callback.md
@ -33,6 +33,7 @@ By default, `TrainingArguments.report_to` is set to `"all"`, so a [`Trainer`] wi
  it's the second one).
 - [`~integrations.TensorBoardCallback`] if tensorboard is accessible (either through PyTorch >= 1.4
  or tensorboardX).
+- [`~integrations.TrackioCallback`] if [trackio](https://github.com/gradio-app/trackio) is installed.
 - [`~integrations.WandbCallback`] if [wandb](https://www.wandb.com/) is installed.
 - [`~integrations.CometCallback`] if [comet_ml](https://www.comet.com/site/) is installed.
 - [`~integrations.MLflowCallback`] if [mlflow](https://www.mlflow.org/) is installed.
@ -72,6 +73,9 @@ Here is the list of the available [`TrainerCallback`] in the library:

 [[autodoc]] integrations.TensorBoardCallback

+[[autodoc]] integrations.TrackioCallback
+    - setup
+
 [[autodoc]] integrations.WandbCallback
    - setup

--- a/docs/source/en/main_classes/quantization.md
+++ b/docs/source/en/main_classes/quantization.md
@ -93,6 +93,10 @@ Learn how to quantize models in the [Quantization](../quantization) guide.

 [[autodoc]] QuarkConfig

+## FPQuantConfig
+
+[[autodoc]] FPQuantConfig
+
 ## AutoRoundConfig

 [[autodoc]] AutoRoundConfig
--- a/docs/source/en/model_doc/auto.md
+++ b/docs/source/en/model_doc/auto.md
@ -258,6 +258,10 @@ The following auto classes are available for the following computer vision tasks

 [[autodoc]] AutoModelForKeypointDetection

+### AutoModelForKeypointMatching
+
+[[autodoc]] AutoModelForKeypointMatching
+
 ### AutoModelForMaskedImageModeling

 [[autodoc]] AutoModelForMaskedImageModeling
--- a/docs/source/en/model_doc/camembert.md
+++ b/docs/source/en/model_doc/camembert.md
@ -14,49 +14,105 @@ rendered properly in your Markdown viewer.

 -->

-# CamemBERT
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<div style="float: right;">
+	<div class="flex flex-wrap space-x-1">
+		<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+		<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+    <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+	</div>
 </div>

-## Overview
+# CamemBERT

-The CamemBERT model was proposed in [CamemBERT: a Tasty French Language Model](https://huggingface.co/papers/1911.03894) by
-[Louis Martin](https://huggingface.co/louismartin), [Benjamin Muller](https://huggingface.co/benjamin-mlr), [Pedro Javier Ortiz Suárez](https://huggingface.co/pjox), Yoann Dupont, Laurent Romary, Éric Villemonte de la
-Clergerie, [Djamé Seddah](https://huggingface.co/Djame), and [Benoît Sagot](https://huggingface.co/sagot). It is based on Facebook's RoBERTa model released in 2019. It is a model
-trained on 138GB of French text.
+[CamemBERT](https://huggingface.co/papers/1911.03894) is a language model based on [RoBERTa](./roberta), but trained specifically on French text from the OSCAR dataset, making it more effective for French language tasks.

-The abstract from the paper is the following:
+What sets CamemBERT apart is that it learned from a huge, high quality collection of French data, as opposed to mixing lots of languages. This helps it really understand French better than many multilingual models.

-*Pretrained language models are now ubiquitous in Natural Language Processing. Despite their success, most available
-models have either been trained on English data or on the concatenation of data in multiple languages. This makes
-practical use of such models --in all languages except English-- very limited. Aiming to address this issue for French,
-we release CamemBERT, a French version of the Bi-directional Encoders for Transformers (BERT). We measure the
-performance of CamemBERT compared to multilingual models in multiple downstream tasks, namely part-of-speech tagging,
-dependency parsing, named-entity recognition, and natural language inference. CamemBERT improves the state of the art
-for most of the tasks considered. We release the pretrained model for CamemBERT hoping to foster research and
-downstream applications for French NLP.*
+Common applications of CamemBERT include masked language modeling (Fill-mask prediction), text classification (sentiment analysis), token classification (entity recognition) and sentence pair classification (entailment tasks).

-This model was contributed by [the ALMAnaCH team (Inria)](https://huggingface.co/almanach). The original code can be found [here](https://camembert-model.fr/).
+You can find all the original CamemBERT checkpoints under the [ALMAnaCH](https://huggingface.co/almanach/models?search=camembert) organization.

-<Tip>
+> [!TIP]
+> This model was contributed by the [ALMAnaCH (Inria)](https://huggingface.co/almanach) team.
+>
+> Click on the CamemBERT models in the right sidebar for more examples of how to apply CamemBERT to different NLP tasks.

-This implementation is the same as RoBERTa. Refer to the [documentation of RoBERTa](roberta) for usage examples as well 
-as the information relative to the inputs and outputs.
+The examples below demonstrate how to predict the `<mask>` token with [`Pipeline`], [`AutoModel`], and from the command line.

-</Tip>
+<hfoptions id="usage">

-## Resources
+<hfoption id="Pipeline">

- [Text classification task guide](../tasks/sequence_classification)
- [Token classification task guide](../tasks/token_classification)
- [Question answering task guide](../tasks/question_answering)
- [Causal language modeling task guide](../tasks/language_modeling)
- [Masked language modeling task guide](../tasks/masked_language_modeling)
- [Multiple choice task guide](../tasks/multiple_choice)
+```python
+import torch
+from transformers import pipeline
+
+pipeline = pipeline("fill-mask", model="camembert-base", torch_dtype=torch.float16, device=0)
+pipeline("Le camembert est un délicieux fromage <mask>.")
+```
+</hfoption> 
+
+<hfoption id="AutoModel">
+
+```python
+import torch
+from transformers import AutoTokenizer, AutoModelForMaskedLM
+
+tokenizer = AutoTokenizer.from_pretrained("camembert-base")
+model = AutoModelForMaskedLM.from_pretrained("camembert-base", torch_dtype="auto", device_map="auto", attn_implementation="sdpa")
+inputs = tokenizer("Le camembert est un délicieux fromage <mask>.", return_tensors="pt").to("cuda")
+
+with torch.no_grad():
+    outputs = model(**inputs)
+    predictions = outputs.logits
+
+masked_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
+predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
+predicted_token = tokenizer.decode(predicted_token_id)
+
+print(f"The predicted token is: {predicted_token}")
+```
+</hfoption> 
+
+<hfoption id="transformers CLI">
+
+```bash
+echo -e "Le camembert est un délicieux fromage <mask>." | transformers run --task fill-mask --model camembert-base --device 0
+```
+
+</hfoption> 
+
+</hfoptions> 
+
+
+Quantization reduces the memory burden of large models by representing weights in lower precision. Refer to the [Quantization](../quantization/overview) overview for available options.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes) quantization to quantize the weights to 8-bits.
+  
+```python
+from transformers import AutoTokenizer, AutoModelForMaskedLM, BitsAndBytesConfig
+import torch
+
+quant_config = BitsAndBytesConfig(load_in_8bit=True)
+model = AutoModelForMaskedLM.from_pretrained(
+    "almanach/camembert-large",
+    quantization_config=quant_config,
+    device_map="auto"
+)
+tokenizer = AutoTokenizer.from_pretrained("almanach/camembert-large")
+
+inputs = tokenizer("Le camembert est un délicieux fromage <mask>.", return_tensors="pt").to("cuda")
+
+with torch.no_grad():
+    outputs = model(**inputs)
+    predictions = outputs.logits
+
+masked_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]
+predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
+predicted_token = tokenizer.decode(predicted_token_id)
+
+print(f"The predicted token is: {predicted_token}")
+```

 ## CamembertConfig

@ -137,5 +193,4 @@ as the information relative to the inputs and outputs.
 [[autodoc]] TFCamembertForQuestionAnswering

 </tf>
-</frameworkcontent>
-
+</frameworkcontent>
--- a/docs/source/en/model_doc/deepseek_v2.md
+++ b/docs/source/en/model_doc/deepseek_v2.md
@ -0,0 +1,49 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# DeepSeek-V2
+
+## Overview
+
+The DeepSeek-V2 model was proposed in [DeepSeek-V2: A Strong, Economical, and Efficient Mixture-of-Experts Language Model](https://arxiv.org/abs/2405.04434) by DeepSeek-AI Team.
+
+The abstract from the paper is the following:
+We present DeepSeek-V2, a strong Mixture-of-Experts (MoE) language model characterized by economical training and efficient inference. It comprises 236B total parameters, of which 21B are activated for each token, and supports a context length of 128K tokens. DeepSeek-V2 adopts innovative architectures including Multi-head Latent Attention (MLA) and DeepSeekMoE. MLA guarantees efficient inference through significantly compressing the Key-Value (KV) cache into a latent vector, while DeepSeekMoE enables training strong models at an economical cost through sparse computation. Compared with DeepSeek 67B, DeepSeek-V2 achieves significantly stronger performance, and meanwhile saves 42.5% of training costs, reduces the KV cache by 93.3%, and boosts the maximum generation throughput to 5.76 times. We pretrain DeepSeek-V2 on a high-quality and multi-source corpus consisting of 8.1T tokens, and further perform Supervised Fine-Tuning (SFT) and Reinforcement Learning (RL) to fully unlock its potential. Evaluation results show that, even with only 21B activated parameters, DeepSeek-V2 and its chat versions still achieve top-tier performance among open-source models.
+
+This model was contributed by [VladOS95-cyber](https://github.com/VladOS95-cyber).
+The original code can be found [here](https://huggingface.co/deepseek-ai/DeepSeek-V2).
+
+### Usage tips
+The model uses Multi-head Latent Attention (MLA) and DeepSeekMoE architectures for efficient inference and cost-effective training. It employs an auxiliary-loss-free strategy for load balancing and multi-token prediction training objective. The model can be used for various language tasks after being pre-trained on 14.8 trillion tokens and going through Supervised Fine-Tuning and Reinforcement Learning stages.
+
+## DeepseekV2Config
+
+[[autodoc]] DeepseekV2Config
+
+## DeepseekV2Model
+
+[[autodoc]] DeepseekV2Model
+    - forward
+
+## DeepseekV2ForCausalLM
+
+[[autodoc]] DeepseekV2ForCausalLM
+    - forward
+
+## DeepseekV2ForSequenceClassification
+
+[[autodoc]] DeepseekV2ForSequenceClassification
+    - forward
--- a/docs/source/en/model_doc/efficientloftr.md
+++ b/docs/source/en/model_doc/efficientloftr.md
@ -0,0 +1,114 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the MIT License; you may not use this file except in compliance with
+the License.
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+
+-->
+
+# EfficientLoFTR
+
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>
+
+## Overview
+
+The EfficientLoFTR model was proposed in [Efficient LoFTR: Semi-Dense Local Feature Matching with Sparse-Like Speed](https://arxiv.org/abs/2403.04765) by Yifan Wang, Xingyi He, Sida Peng, Dongli Tan and Xiaowei Zhou.
+
+This model consists of matching two images together by finding pixel correspondences. It can be used to estimate the pose between them. 
+This model is useful for tasks such as image matching, homography estimation, etc.
+
+The abstract from the paper is the following:
+
+*We present a novel method for efficiently producing semidense matches across images. Previous detector-free matcher 
+LoFTR has shown remarkable matching capability in handling large-viewpoint change and texture-poor scenarios but suffers
+from low efficiency. We revisit its design choices and derive multiple improvements for both efficiency and accuracy. 
+One key observation is that performing the transformer over the entire feature map is redundant due to shared local 
+information, therefore we propose an aggregated attention mechanism with adaptive token selection for efficiency. 
+Furthermore, we find spatial variance exists in LoFTR’s fine correlation module, which is adverse to matching accuracy. 
+A novel two-stage correlation layer is proposed to achieve accurate subpixel correspondences for accuracy improvement. 
+Our efficiency optimized model is ∼ 2.5× faster than LoFTR which can even surpass state-of-the-art efficient sparse 
+matching pipeline SuperPoint + LightGlue. Moreover, extensive experiments show that our method can achieve higher 
+accuracy compared with competitive semi-dense matchers, with considerable efficiency benefits. This opens up exciting 
+prospects for large-scale or latency-sensitive applications such as image retrieval and 3D reconstruction. 
+Project page: [https://zju3dv.github.io/efficientloftr/](https://zju3dv.github.io/efficientloftr/).*
+
+## How to use
+
+Here is a quick example of using the model. 
+```python
+import torch
+
+from transformers import AutoImageProcessor, AutoModelForKeypointMatching
+from transformers.image_utils import load_image
+
+
+image1 = load_image("https://raw.githubusercontent.com/magicleap/SuperGluePretrainedNetwork/refs/heads/master/assets/phototourism_sample_images/united_states_capitol_98169888_3347710852.jpg")
+image2 = load_image("https://raw.githubusercontent.com/magicleap/SuperGluePretrainedNetwork/refs/heads/master/assets/phototourism_sample_images/united_states_capitol_26757027_6717084061.jpg")
+
+images = [image1, image2]
+
+processor = AutoImageProcessor.from_pretrained("stevenbucaille/efficientloftr")
+model = AutoModelForKeypointMatching.from_pretrained("stevenbucaille/efficientloftr")
+
+inputs = processor(images, return_tensors="pt")
+with torch.no_grad():
+    outputs = model(**inputs)
+```
+
+You can use the `post_process_keypoint_matching` method from the `ImageProcessor` to get the keypoints and matches in a more readable format:
+
+```python
+image_sizes = [[(image.height, image.width) for image in images]]
+outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
+for i, output in enumerate(outputs):
+    print("For the image pair", i)
+    for keypoint0, keypoint1, matching_score in zip(
+            output["keypoints0"], output["keypoints1"], output["matching_scores"]
+    ):
+        print(
+            f"Keypoint at coordinate {keypoint0.numpy()} in the first image matches with keypoint at coordinate {keypoint1.numpy()} in the second image with a score of {matching_score}."
+        )
+```
+
+From the post processed outputs, you can visualize the matches between the two images using the following code:
+```python
+images_with_matching = processor.visualize_keypoint_matching(images, outputs)
+```
+
+![image/png](https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/2nJZQlFToCYp_iLurvcZ4.png)
+
+This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
+The original code can be found [here](https://github.com/zju3dv/EfficientLoFTR).
+
+## EfficientLoFTRConfig
+
+[[autodoc]] EfficientLoFTRConfig
+
+## EfficientLoFTRImageProcessor
+
+[[autodoc]] EfficientLoFTRImageProcessor
+
+- preprocess
+- post_process_keypoint_matching
+- visualize_keypoint_matching
+
+## EfficientLoFTRModel
+
+[[autodoc]] EfficientLoFTRModel
+
+- forward
+
+## EfficientLoFTRForKeypointMatching
+
+[[autodoc]] EfficientLoFTRForKeypointMatching
+
+- forward
--- a/docs/source/en/model_doc/encodec.md
+++ b/docs/source/en/model_doc/encodec.md
@ -47,7 +47,8 @@ Here is a quick example of how to encode and decode an audio using this model:
 >>> inputs = processor(raw_audio=audio_sample, sampling_rate=processor.sampling_rate, return_tensors="pt")

 >>> encoder_outputs = model.encode(inputs["input_values"], inputs["padding_mask"])
->>> audio_values = model.decode(encoder_outputs.audio_codes, encoder_outputs.audio_scales, inputs["padding_mask"])[0]
+>>> # `encoder_outputs.audio_codes` contains discrete codes
+>>> audio_values = model.decode(**encoder_outputs, padding_mask=inputs["padding_mask"])[0]
 >>> # or the equivalent with a forward pass
 >>> audio_values = model(inputs["input_values"], inputs["padding_mask"]).audio_values
 ```
--- a/docs/source/en/model_doc/encoder-decoder.md
+++ b/docs/source/en/model_doc/encoder-decoder.md
@ -14,115 +14,88 @@ rendered properly in your Markdown viewer.

 -->

-# Encoder Decoder Models
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
-">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+        <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
+        ">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
 </div>

-## Overview
+# Encoder Decoder Models

-The [`EncoderDecoderModel`] can be used to initialize a sequence-to-sequence model with any
-pretrained autoencoding model as the encoder and any pretrained autoregressive model as the decoder.
+[`EncoderDecoderModel`](https://huggingface.co/papers/1706.03762) initializes a sequence-to-sequence model with any pretrained autoencoder and pretrained autoregressive model. It is effective for sequence generation tasks as demonstrated in [Text Summarization with Pretrained Encoders](https://huggingface.co/papers/1908.08345) which uses [`BertModel`] as the encoder and decoder.

-The effectiveness of initializing sequence-to-sequence models with pretrained checkpoints for sequence generation tasks
-was shown in [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://huggingface.co/papers/1907.12461) by
-Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
+> [!TIP]
+> This model was contributed by [thomwolf](https://huggingface.co/thomwolf) and the TensorFlow/Flax version by [ydshieh](https://huggingface.co/ydshieh).
+>
+> Click on the Encoder Decoder models in the right sidebar for more examples of how to apply Encoder Decoder to different language tasks.

-After such an [`EncoderDecoderModel`] has been trained/fine-tuned, it can be saved/loaded just like
-any other models (see the examples for more information).
+The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.

-An application of this architecture could be to leverage two pretrained [`BertModel`] as the encoder
-and decoder for a summarization model as was shown in: [Text Summarization with Pretrained Encoders](https://huggingface.co/papers/1908.08345) by Yang Liu and Mirella Lapata.
-
-## Randomly initializing `EncoderDecoderModel` from model configurations.
-
-[`EncoderDecoderModel`] can be randomly initialized from an encoder and a decoder config. In the following example, we show how to do this using the default [`BertModel`] configuration for the encoder and the default [`BertForCausalLM`] configuration for the decoder.
+<hfoptions id="usage">
+<hfoption id="Pipeline">

 ```python
->>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel
+from transformers import pipeline

->>> config_encoder = BertConfig()
->>> config_decoder = BertConfig()
+summarizer = pipeline(
+    "summarization",
+    model="patrickvonplaten/bert2bert-cnn_dailymail-fp16",
+    device=0
+)

->>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
->>> model = EncoderDecoderModel(config=config)
+text = "Plants create energy through a process known as photosynthesis. This involves capturing sunlight and converting carbon dioxide and water into glucose and oxygen."
+print(summarizer(text))
 ```

-## Initialising `EncoderDecoderModel` from a pretrained encoder and a pretrained decoder.
-
-[`EncoderDecoderModel`] can be initialized from a pretrained encoder checkpoint and a pretrained decoder checkpoint. Note that any pretrained auto-encoding model, *e.g.* BERT, can serve as the encoder and both pretrained auto-encoding models, *e.g.* BERT, pretrained causal language models, *e.g.* GPT2, as well as the pretrained decoder part of sequence-to-sequence models, *e.g.* decoder of BART, can be used as the decoder.
-Depending on which architecture you choose as the decoder, the cross-attention layers might be randomly initialized.
-Initializing [`EncoderDecoderModel`] from a pretrained encoder and decoder checkpoint requires the model to be fine-tuned on a downstream task, as has been shown in [the *Warm-starting-encoder-decoder blog post*](https://huggingface.co/blog/warm-starting-encoder-decoder).
-To do so, the `EncoderDecoderModel` class provides a [`EncoderDecoderModel.from_encoder_decoder_pretrained`] method.
+</hfoption>
+<hfoption id="AutoModel">

 ```python
->>> from transformers import EncoderDecoderModel, BertTokenizer
+import torch  
+from transformers import AutoModelForCausalLM, AutoTokenizer  

->>> tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
->>> model = EncoderDecoderModel.from_encoder_decoder_pretrained("google-bert/bert-base-uncased", "google-bert/bert-base-uncased")
+tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
+model = AutoModelForCausalLM.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16", torch_dtype=torch.bfloat16, device_map="auto",attn_implementation="sdpa")  
+
+text = "Plants create energy through a process known as photosynthesis. This involves capturing sunlight and converting carbon dioxide and water into glucose and oxygen."
+
+inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(model.device)
+
+summary = model.generate(**inputs, max_length=60, num_beams=4, early_stopping=True)
+print(tokenizer.decode(summary[0], skip_special_tokens=True))
 ```

-## Loading an existing `EncoderDecoderModel` checkpoint and perform inference.
+</hfoption>
+<hfoption id="transformers CLI">

-To load fine-tuned checkpoints of the `EncoderDecoderModel` class, [`EncoderDecoderModel`] provides the `from_pretrained(...)` method just like any other model architecture in Transformers.
+```bash
+echo -e "Plants create energy through a process known as photosynthesis. This involves capturing sunlight and converting carbon dioxide and water into glucose and oxygen." | transformers-cli run --task summarization --model "patrickvonplaten/bert2bert-cnn_dailymail-fp16" --device 0
+```

-To perform inference, one uses the [`generate`] method, which allows to autoregressively generate text. This method supports various forms of decoding, such as greedy, beam search and multinomial sampling.
+</hfoption>
+</hfoptions>
+
+## Notes
+
+- [`EncoderDecoderModel`] can be initialized using any pretrained encoder and decoder. But depending on the decoder architecture, the cross-attention layers may be randomly initialized.
+
+These models require downstream fine-tuning, as discussed in this [blog post](https://huggingface.co/blog/warm-starting-encoder-decoder). Use [`~EncoderDecoderModel.from_encoder_decoder_pretrained`] to combine encoder and decoder checkpoints.

 ```python
->>> from transformers import AutoTokenizer, EncoderDecoderModel
+from transformers import EncoderDecoderModel, BertTokenizer

->>> # load a fine-tuned seq2seq model and corresponding tokenizer
->>> model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail")
->>> tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/bert2bert_cnn_daily_mail")
-
->>> # let's perform inference on a long piece of text
->>> ARTICLE_TO_SUMMARIZE = (
-...     "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
-...     "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
-...     "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
-... )
->>> input_ids = tokenizer(ARTICLE_TO_SUMMARIZE, return_tensors="pt").input_ids
-
->>> # autoregressively generate summary (uses greedy decoding by default)
->>> generated_ids = model.generate(input_ids)
->>> generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
->>> print(generated_text)
-nearly 800 thousand customers were affected by the shutoffs. the aim is to reduce the risk of wildfires. nearly 800, 000 customers were expected to be affected by high winds amid dry conditions. pg & e said it scheduled the blackouts to last through at least midday tomorrow.
+tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased")
+model = EncoderDecoderModel.from_encoder_decoder_pretrained(
+    "google-bert/bert-base-uncased", 
+    "google-bert/bert-base-uncased"
+)
 ```

-## Loading a PyTorch checkpoint into `TFEncoderDecoderModel`.
-
-[`TFEncoderDecoderModel.from_pretrained`] currently doesn't support initializing the model from a
-pytorch checkpoint. Passing `from_pt=True` to this method will throw an exception. If there are only pytorch
-checkpoints for a particular encoder-decoder model, a workaround is:
-
-```python
->>> # a workaround to load from pytorch checkpoint
->>> from transformers import EncoderDecoderModel, TFEncoderDecoderModel
-
->>> _model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
-
->>> _model.encoder.save_pretrained("./encoder")
->>> _model.decoder.save_pretrained("./decoder")
-
->>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained(
-...     "./encoder", "./decoder", encoder_from_pt=True, decoder_from_pt=True
-... )
->>> # This is only for copying some specific attributes of this particular model.
->>> model.config = _model.config
-```
-
-## Training
-
-Once the model is created, it can be fine-tuned similar to BART, T5 or any other encoder-decoder model.
-As you can see, only 2 inputs are required for the model in order to compute a loss: `input_ids` (which are the
-`input_ids` of the encoded input sequence) and `labels` (which are the `input_ids` of the encoded
-target sequence).
+- Encoder Decoder models can be fine-tuned like BART, T5 or any other encoder-decoder model. Only 2 inputs are required to compute a loss, `input_ids` and `labels`. Refer to this [notebook](https://colab.research.google.com/drive/1WIk2bxglElfZewOHboPFNj8H44_VAyKE?usp=sharing#scrollTo=ZwQIEhKOrJpl) for a more detailed training example.

 ```python
 >>> from transformers import BertTokenizer, EncoderDecoderModel
@ -147,11 +120,42 @@ target sequence).
 >>> loss = model(input_ids=input_ids, labels=labels).loss
 ```

-Detailed [colab](https://colab.research.google.com/drive/1WIk2bxglElfZewOHboPFNj8H44_VAyKE?usp=sharing#scrollTo=ZwQIEhKOrJpl) for training.
+- [`EncoderDecoderModel`] can be randomly initialized from an encoder and a decoder config as shown below.

-This model was contributed by [thomwolf](https://github.com/thomwolf). This model's TensorFlow and Flax versions
-were contributed by [ydshieh](https://github.com/ydshieh).
+```python
+>>> from transformers import BertConfig, EncoderDecoderConfig, EncoderDecoderModel

+>>> config_encoder = BertConfig()
+>>> config_decoder = BertConfig()
+
+>>> config = EncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)
+>>> model = EncoderDecoderModel(config=config)
+```
+
+- The Encoder Decoder Model can also be used for translation as shown below.
+
+```python
+from transformers import AutoTokenizer, EncoderDecoderModel  
+
+# Load a pre-trained translation model  
+model_name = "google/bert2bert_L-24_wmt_en_de" 
+tokenizer = AutoTokenizer.from_pretrained(model_name, pad_token="<pad>", eos_token="</s>", bos_token="<s>")  
+model = EncoderDecoderModel.from_pretrained(model_name)  
+
+# Input sentence to translate  
+input_text = "Plants create energy through a process known as"  
+
+# Encode the input text  
+inputs = tokenizer(input_text, return_tensors="pt", add_special_tokens=False).input_ids  
+
+# Generate the translated output  
+outputs = model.generate(inputs)[0]  
+
+# Decode the output tokens to get the translated sentence  
+translated_text = tokenizer.decode(outputs, skip_special_tokens=True)  
+
+print("Translated text:", translated_text)  
+```

 ## EncoderDecoderConfig

--- a/docs/source/en/model_doc/ernie4_5.md
+++ b/docs/source/en/model_doc/ernie4_5.md
@ -0,0 +1,99 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
+    </div>
+</div>
+
+# Ernie 4.5
+
+## Overview
+
+The Ernie 4.5 model was released in the [Ernie 4.5 Model Family](https://ernie.baidu.com/blog/posts/ernie4.5/) release by baidu.
+This family of models contains multiple different architectures and model sizes. This model in specific targets the base text
+model without mixture of experts (moe) with 0.3B parameters in total. It uses the standard [Llama](./llama.md) at its core.
+
+Other models from the family can be found at [Ernie 4.5 MoE](./ernie4_5_moe.md).
+
+<div class="flex justify-center">
+    <img src="https://ernie.baidu.com/blog/posts/ernie4.5/overview.png"/>
+</div>
+
+
+## Usage Tips
+
+### Generate text
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_name = "baidu/ERNIE-4.5-0.3B-PT"
+
+# load the tokenizer and the model
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+)
+
+# prepare the model input
+inputs = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pt")
+prompt = "Hey, are you conscious? Can you talk to me?"
+messages = [
+    {"role": "user", "content": prompt}
+]
+text = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True
+)
+model_inputs = tokenizer([text], add_special_tokens=False, return_tensors="pt").to(model.device)
+
+# conduct text completion
+generated_ids = model.generate(
+    **model_inputs,
+    max_new_tokens=32,
+)
+output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
+
+# decode the generated ids
+generate_text = tokenizer.decode(output_ids, skip_special_tokens=True)
+```
+
+This model was contributed by [Anton Vlasjuk](https://huggingface.co/AntonV).
+The original code can be found [here](https://github.com/PaddlePaddle/ERNIE).
+
+
+## Ernie4_5Config
+
+[[autodoc]] Ernie4_5Config
+
+## Ernie4_5Model
+
+[[autodoc]] Ernie4_5Model
+    - forward
+
+## Ernie4_5ForCausalLM
+
+[[autodoc]] Ernie4_5ForCausalLM
+    - forward
--- a/docs/source/en/model_doc/ernie4_5_moe.md
+++ b/docs/source/en/model_doc/ernie4_5_moe.md
@ -0,0 +1,183 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="Tensor parallelism" src="https://img.shields.io/badge/Tensor%20parallelism-06b6d4?style=flat&logoColor=white">
+    </div>
+</div>
+
+# Ernie 4.5 MoE
+
+## Overview
+
+The Ernie 4.5 MoE model was released in the [Ernie 4.5 Model Family](https://ernie.baidu.com/blog/posts/ernie4.5/) release by baidu.
+This family of models contains multiple different architectures and model sizes. This model in specific targets the base text
+model with mixture of experts (moe) - one with 21B total, 3B active parameters and another one with 300B total, 47B active parameters.
+It uses the standard [Llama](./llama.md) at its core combined with a specialized MoE based on [Mixtral](./mixtral.md) with additional shared
+experts.
+
+Other models from the family can be found at [Ernie 4.5](./ernie4_5.md).
+
+<div class="flex justify-center">
+    <img src="https://ernie.baidu.com/blog/posts/ernie4.5/overview.png"/>
+</div>
+
+
+## Usage Tips
+
+### Generate text
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_name = "baidu/ERNIE-4.5-21B-A3B-PT"
+
+# load the tokenizer and the model
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+)
+
+# prepare the model input
+inputs = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pt")
+prompt = "Hey, are you conscious? Can you talk to me?"
+messages = [
+    {"role": "user", "content": prompt}
+]
+text = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True
+)
+model_inputs = tokenizer([text], add_special_tokens=False, return_tensors="pt").to(model.device)
+
+# conduct text completion
+generated_ids = model.generate(
+    **model_inputs,
+    max_new_tokens=32,
+)
+output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
+
+# decode the generated ids
+generate_text = tokenizer.decode(output_ids, skip_special_tokens=True)
+```
+
+### Distributed Generation with Tensor Parallelism
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_name = "baidu/ERNIE-4.5-21B-A3B-PT"
+
+# load the tokenizer and the model
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+    tp_plan="auto",
+)
+
+# prepare the model input
+inputs = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pt")
+prompt = "Hey, are you conscious? Can you talk to me?"
+messages = [
+    {"role": "user", "content": prompt}
+]
+text = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True
+)
+model_inputs = tokenizer([text], add_special_tokens=False, return_tensors="pt").to(model.device)
+
+# conduct text completion
+generated_ids = model.generate(
+    **model_inputs,
+    max_new_tokens=32,
+)
+output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
+
+# decode the generated ids
+generate_text = tokenizer.decode(output_ids, skip_special_tokens=True)
+```
+
+### Quantization with Bitsandbytes
+
+```python
+import torch
+from transformers import BitsAndBytesConfig, AutoModelForCausalLM, AutoTokenizer
+
+model_name = "baidu/ERNIE-4.5-21B-A3B-PT"
+
+# load the tokenizer and the model
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(
+    model_name,
+    device_map="auto",
+    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
+)
+
+# prepare the model input
+inputs = tokenizer("Hey, are you conscious? Can you talk to me?", return_tensors="pt")
+prompt = "Hey, are you conscious? Can you talk to me?"
+messages = [
+    {"role": "user", "content": prompt}
+]
+text = tokenizer.apply_chat_template(
+    messages,
+    tokenize=False,
+    add_generation_prompt=True
+)
+model_inputs = tokenizer([text], add_special_tokens=False, return_tensors="pt").to(model.device)
+
+# conduct text completion
+generated_ids = model.generate(
+    **model_inputs,
+    max_new_tokens=32,
+)
+output_ids = generated_ids[0][len(model_inputs.input_ids[0]):].tolist()
+
+# decode the generated ids
+generate_text = tokenizer.decode(output_ids, skip_special_tokens=True)
+```
+
+This model was contributed by [Anton Vlasjuk](https://huggingface.co/AntonV).
+The original code can be found [here](https://github.com/PaddlePaddle/ERNIE).
+
+
+## Ernie4_5_MoEConfig
+
+[[autodoc]] Ernie4_5_MoEConfig
+
+## Ernie4_5_MoEModel
+
+[[autodoc]] Ernie4_5_MoEModel
+    - forward
+
+## Ernie4_5_MoEForCausalLM
+
+[[autodoc]] Ernie4_5_MoEForCausalLM
+    - forward
+    - generate
--- a/docs/source/en/model_doc/falcon_mamba.md
+++ b/docs/source/en/model_doc/falcon_mamba.md
@ -110,6 +110,13 @@ outputs = model.generate(**inputs, max_new_tokens=100)
 print(tokenizer.decode(outputs[0], skip_special_tokens=True))
 ```

+## FalconMambaCache
+
+[[autodoc]] FalconMambaCache
+    - update_conv_state
+    - update_ssm_state
+    - reset
+
 ## FalconMambaConfig

 [[autodoc]] FalconMambaConfig
--- a/docs/source/en/model_doc/gemma3.md
+++ b/docs/source/en/model_doc/gemma3.md
@ -267,3 +267,8 @@ visualizer("<img>What is shown in this image?")

 [[autodoc]] Gemma3ForConditionalGeneration
    - forward
+
+## Gemma3ForSequenceClassification
+
+[[autodoc]] Gemma3ForSequenceClassification
+    - forward
--- a/docs/source/en/model_doc/glm4_moe.md
+++ b/docs/source/en/model_doc/glm4_moe.md
@ -0,0 +1,35 @@
+<!--Copyright 2025 The ZhipuAI Inc. and The HuggingFace Inc. team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Glm4Moe
+
+## Overview
+
+This will update After model release.
+
+## Glm4MoeConfig
+
+[[autodoc]] Glm4MoeConfig
+
+## Glm4MoeModel
+
+[[autodoc]] Glm4MoeModel
+    - forward
+
+## Glm4MoeForCausalLM
+
+[[autodoc]] Glm4MoeForCausalLM
+    - forward
--- a/docs/source/en/model_doc/gpt2.md
+++ b/docs/source/en/model_doc/gpt2.md
@ -57,7 +57,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2", torch_dtype=torch.float16, device_map="auto", attn_implementation="sdpa")
 tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

-input_ids = tokenzier("Hello, I'm a language model". return_tensors="pt").to("cuda")
+input_ids = tokenizer("Hello, I'm a language model", return_tensors="pt").to("cuda")

 output = model.generate(**input_ids, cache_implementation="static")
 print(tokenizer.decode(output[0], skip_special_tokens=True))
--- a/docs/source/en/model_doc/ijepa.md
+++ b/docs/source/en/model_doc/ijepa.md
@ -1,4 +1,4 @@
-<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.

 Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
 the License. You may obtain a copy of the License at
@ -14,53 +14,107 @@ rendered properly in your Markdown viewer.

 -->

-# I-JEPA
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
 </div>

-## Overview
+# I-JEPA

-The I-JEPA model was proposed in [Image-based Joint-Embedding Predictive Architecture](https://huggingface.co/papers/2301.08243) by Mahmoud Assran, Quentin Duval, Ishan Misra, Piotr Bojanowski, Pascal Vincent, Michael Rabbat, Yann LeCun, Nicolas Ballas.
-I-JEPA is a self-supervised learning method that predicts the representations of one part of an image based on other parts of the same image. This approach focuses on learning semantic features without relying on pre-defined invariances from hand-crafted data transformations, which can bias specific tasks, or on filling in pixel-level details, which often leads to less meaningful representations.
+[I-JEPA](https://huggingface.co/papers/2301.08243) is a self-supervised learning method that learns semantic image representations by predicting parts of an image from other parts of the image. It compares the abstract representations of the image (rather than pixel level comparisons), which avoids the typical pitfalls of data augmentation bias and pixel-level details that don't capture semantic meaning.

-The abstract from the paper is the following:
+You can find the original I-JEPA checkpoints under the [AI at Meta](https://huggingface.co/facebook/models?search=ijepa) organization.
+> [!TIP]
+> This model was contributed by [jmtzt](https://huggingface.co/jmtzt).

-This paper demonstrates an approach for learning highly semantic image representations without relying on hand-crafted data-augmentations. We introduce the Image- based Joint-Embedding Predictive Architecture (I-JEPA), a non-generative approach for self-supervised learning from images. The idea behind I-JEPA is simple: from a single context block, predict the representations of various target blocks in the same image. A core design choice to guide I-JEPA towards producing semantic representations is the masking strategy; specifically, it is crucial to (a) sample tar- get blocks with sufficiently large scale (semantic), and to (b) use a sufficiently informative (spatially distributed) context block. Empirically, when combined with Vision Transform- ers, we find I-JEPA to be highly scalable. For instance, we train a ViT-Huge/14 on ImageNet using 16 A100 GPUs in under 72 hours to achieve strong downstream performance across a wide range of tasks, from linear classification to object counting and depth prediction.

-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/ijepa_architecture.jpg"
-alt="drawing" width="600"/>
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/ijepa_architecture.jpg">

-<small> I-JEPA architecture. Taken from the <a href="https://huggingface.co/papers/2301.08243">original paper.</a> </small>

-This model was contributed by [jmtzt](https://huggingface.co/jmtzt).
-The original code can be found [here](https://github.com/facebookresearch/ijepa).
+> Click on the I-JEPA models in the right sidebar for more examples of how to apply I-JEPA to different image representation and classification tasks.

-## How to use
+The example below demonstrates how to extract image features with [`Pipeline`] or the [`AutoModel`] class.

-Here is how to use this model for image feature extraction:
+<hfoptions id="usage">
+<hfoption id="Pipeline">

-```python
+```py
+import torch
+from transformers import pipeline
+feature_extractor = pipeline(
+    task="image-feature-extraction",
+    model="facebook/ijepa_vith14_1k",
+    device=0,
+    torch_dtype=torch.bfloat16
+)
+features = feature_extractor("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg", return_tensors=True)  
+
+print(f"Feature shape: {features.shape}")
+
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
 import requests
 import torch
 from PIL import Image
 from torch.nn.functional import cosine_similarity
+from transformers import AutoModel, AutoProcessor  

-from transformers import AutoModel, AutoProcessor
+url_1 = "http://images.cocodataset.org/val2017/000000039769.jpg"  
+url_2 = "http://images.cocodataset.org/val2017/000000219578.jpg"
+image_1 = Image.open(requests.get(url_1, stream=True).raw)
+image_2 = Image.open(requests.get(url_2, stream=True).raw)
+
+processor = AutoProcessor.from_pretrained("facebook/ijepa_vith14_1k")  
+model = AutoModel.from_pretrained("facebook/ijepa_vith14_1k", torch_dtype="auto", attn_implementation="sdpa")  
+
+
+def infer(image):  
+    inputs = processor(image, return_tensors="pt")  
+    outputs = model(**inputs)  
+    return outputs.last_hidden_state.mean(dim=1)  
+
+
+embed_1 = infer(image_1)  
+embed_2 = infer(image_2)  
+
+similarity = cosine_similarity(embed_1, embed_2)  
+print(similarity)
+```
+</hfoption>
+</hfoptions>
+
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to 4-bits.
+
+```py
+import torch
+from transformers import BitsAndBytesConfig, AutoModel, AutoProcessor
+from datasets import load_dataset
+
+quantization_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    bnb_4bit_quant_type="nf4",
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_use_double_quant=True,
+)

 url_1 = "http://images.cocodataset.org/val2017/000000039769.jpg"
 url_2 = "http://images.cocodataset.org/val2017/000000219578.jpg"
 image_1 = Image.open(requests.get(url_1, stream=True).raw)
 image_2 = Image.open(requests.get(url_2, stream=True).raw)

-model_id = "facebook/ijepa_vith14_1k"
-processor = AutoProcessor.from_pretrained(model_id)
-model = AutoModel.from_pretrained(model_id)
+processor = AutoProcessor.from_pretrained("facebook/ijepa_vitg16_22k")
+model = AutoModel.from_pretrained("facebook/ijepa_vitg16_22k", quantization_config=quantization_config, torch_dtype="auto", attn_implementation="sdpa")
+

-@torch.no_grad()
 def infer(image):
    inputs = processor(image, return_tensors="pt")
    outputs = model(**inputs)
@ -74,15 +128,6 @@ similarity = cosine_similarity(embed_1, embed_2)
 print(similarity)
 ```

-## Resources
-
-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with I-JEPA.
-
-<PipelineTag pipeline="image-classification"/>
-
- [`IJepaForImageClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/image_classification.ipynb).
- See also: [Image classification task guide](../tasks/image_classification)
-
 ## IJepaConfig

 [[autodoc]] IJepaConfig
@ -95,4 +140,5 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
 ## IJepaForImageClassification

 [[autodoc]] IJepaForImageClassification
-    - forward
+    - forward
+
--- a/docs/source/en/model_doc/lfm2.md
+++ b/docs/source/en/model_doc/lfm2.md
@ -0,0 +1,84 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+<div class="flex flex-wrap space-x-1">
+<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+</div>
+
+# LFM2
+
+## Overview
+
+[LFM2](https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models) represents a new generation of Liquid Foundation Models developed by [Liquid AI](https://liquid.ai/), specifically designed for edge AI and on-device deployment. 
+
+The models are available in three sizes (350M, 700M, and 1.2B parameters) and are engineered to run efficiently on CPU, GPU, and NPU hardware, making them particularly well-suited for applications requiring low latency, offline operation, and privacy.
+
+## Architecture
+
+The architecture consists of 16 blocks total: 10 double-gated short-range convolution blocks and 6 blocks of grouped query attention. This design stems from the concept of dynamical systems, where linear operations are modulated by input-dependent gates, allowing for "liquid" dynamics that can adapt in real-time. The short convolutions are particularly optimized for embedded SoC CPUs, making them ideal for devices that require fast, local inference without relying on cloud connectivity.
+
+The key architectural innovation of LFM2 lies in its systematic approach to balancing quality, latency, and memory efficiency through our STAR neural architecture search engine. Using STAR, Liquid AI optimized the models for real-world performance on embedded hardware, measuring actual peak memory usage and inference speed on Qualcomm Snapdragon processors. This results in models that achieve 2x faster decode and prefill performance compared to similar-sized models, while maintaining superior benchmark performance across knowledge, mathematics, instruction following, and multilingual tasks.
+
+## Example
+
+The following example shows how to generate an answer using the `AutoModelForCausalLM` class.
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# Load model and tokenizer
+model_id = "LiquidAI/LFM2-1.2B"
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype="bfloat16",
+)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+
+# Generate answer
+prompt = "What is C. elegans?"
+input_ids = tokenizer.apply_chat_template(
+    [{"role": "user", "content": prompt}],
+    add_generation_prompt=True,
+    return_tensors="pt",
+    tokenize=True,
+)
+
+output = model.generate(
+    input_ids,
+    do_sample=True,
+    temperature=0.3,
+    min_p=0.15,
+    repetition_penalty=1.05,
+    max_new_tokens=512,
+)
+
+print(tokenizer.decode(output[0], skip_special_tokens=False))
+```
+
+## Lfm2Config
+
+[[autodoc]] Lfm2Config
+
+## Lfm2Model
+
+[[autodoc]] Lfm2Model
+    - forward
+
+## Lfm2ForCausalLM
+
+[[autodoc]] Lfm2ForCausalLM
+    - forward
--- a/docs/source/en/model_doc/lightglue.md
+++ b/docs/source/en/model_doc/lightglue.md
@ -10,37 +10,31 @@ specific language governing permissions and limitations under the License.
 ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
 rendered properly in your Markdown viewer.

-
 -->

+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
+    </div>
+</div>
+
 # LightGlue

-## Overview
+[LightGlue](https://arxiv.org/abs/2306.13643) is a deep neural network that learns to match local features across images. It revisits multiple design decisions of SuperGlue and derives simple but effective improvements. Cumulatively, these improvements make LightGlue more efficient - in terms of both memory and computation, more accurate, and much easier to train. Similar to [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor), this model consists of matching two sets of local features extracted from two images, with the goal of being faster than SuperGlue. Paired with the [SuperPoint model](https://huggingface.co/magic-leap-community/superpoint), it can be used to match two images and estimate the pose between them.

-The LightGlue model was proposed in [LightGlue: Local Feature Matching at Light Speed](https://arxiv.org/abs/2306.13643)
-by Philipp Lindenberger, Paul-Edouard Sarlin and Marc Pollefeys.
+You can find all the original LightGlue checkpoints under the [ETH-CVG](https://huggingface.co/ETH-CVG) organization.

-Similar to [SuperGlue](https://huggingface.co/magic-leap-community/superglue_outdoor), this model consists of matching
-two sets of local features extracted from two images, its goal is to be faster than SuperGlue. Paired with the 
-[SuperPoint model](https://huggingface.co/magic-leap-community/superpoint), it can be used to match two images and 
-estimate the pose between them. This model is useful for tasks such as image matching, homography estimation, etc.
+> [!TIP]
+> This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
+>
+> Click on the LightGlue models in the right sidebar for more examples of how to apply LightGlue to different computer vision tasks.

-The abstract from the paper is the following:
+The example below demonstrates how to match keypoints between two images with the [`AutoModel`] class.

-*We introduce LightGlue, a deep neural network that learns to match local features across images. We revisit multiple
-design decisions of SuperGlue, the state of the art in sparse matching, and derive simple but effective improvements. 
-Cumulatively, they make LightGlue more efficient - in terms of both memory and computation, more accurate, and much
-easier to train. One key property is that LightGlue is adaptive to the difficulty of the problem: the inference is much
-faster on image pairs that are intuitively easy to match, for example because of a larger visual overlap or limited
-appearance change. This opens up exciting prospects for deploying deep matchers in latency-sensitive applications like
-3D reconstruction. The code and trained models are publicly available at this [https URL](https://github.com/cvg/LightGlue)*
+<hfoptions id="usage">
+<hfoption id="AutoModel">

-## How to use
-
-Here is a quick example of using the model. Since this model is an image matching model, it requires pairs of images to be matched. 
-The raw outputs contain the list of keypoints detected by the keypoint detector as well as the list of matches with their corresponding 
-matching scores.
-```python
+```py
 from transformers import AutoImageProcessor, AutoModel
 import torch
 from PIL import Image
@ -59,31 +53,70 @@ model = AutoModel.from_pretrained("ETH-CVG/lightglue_superpoint")
 inputs = processor(images, return_tensors="pt")
 with torch.no_grad():
    outputs = model(**inputs)
-```

-You can use the `post_process_keypoint_matching` method from the `LightGlueImageProcessor` to get the keypoints and matches in a readable format:
-```python
+# Post-process to get keypoints and matches
 image_sizes = [[(image.height, image.width) for image in images]]
-outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
-for i, output in enumerate(outputs):
-    print("For the image pair", i)
-    for keypoint0, keypoint1, matching_score in zip(
-            output["keypoints0"], output["keypoints1"], output["matching_scores"]
-    ):
-        print(
-            f"Keypoint at coordinate {keypoint0.numpy()} in the first image matches with keypoint at coordinate {keypoint1.numpy()} in the second image with a score of {matching_score}."
-        )
+processed_outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
 ```

-You can visualize the matches between the images by providing the original images as well as the outputs to this method:
-```python
-processor.plot_keypoint_matching(images, outputs)
-```
+</hfoption>
+</hfoptions>

-![image/png](https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/duPp09ty8NRZlMZS18ccP.png)
+## Notes

-This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
-The original code can be found [here](https://github.com/cvg/LightGlue).
+- LightGlue is adaptive to the task difficulty. Inference is much faster on image pairs that are intuitively easy to match, for example, because of a larger visual overlap or limited appearance change.
+
+    ```py
+    from transformers import AutoImageProcessor, AutoModel
+    import torch
+    from PIL import Image
+    import requests
+    
+    processor = AutoImageProcessor.from_pretrained("ETH-CVG/lightglue_superpoint")
+    model = AutoModel.from_pretrained("ETH-CVG/lightglue_superpoint")
+    
+    # LightGlue requires pairs of images
+    images = [image1, image2]
+    inputs = processor(images, return_tensors="pt")
+    outputs = model(**inputs)
+    
+    # Extract matching information
+    keypoints0 = outputs.keypoints0  # Keypoints in first image
+    keypoints1 = outputs.keypoints1  # Keypoints in second image
+    matches = outputs.matches        # Matching indices
+    matching_scores = outputs.matching_scores  # Confidence scores
+    ```
+
+- The model outputs matching indices, keypoints, and confidence scores for each match, similar to SuperGlue but with improved efficiency.
+- For better visualization and analysis, use the [`LightGlueImageProcessor.post_process_keypoint_matching`] method to get matches in a more readable format.
+
+    ```py
+    # Process outputs for visualization
+    image_sizes = [[(image.height, image.width) for image in images]]
+    processed_outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
+    
+    for i, output in enumerate(processed_outputs):
+        print(f"For the image pair {i}")
+        for keypoint0, keypoint1, matching_score in zip(
+                output["keypoints0"], output["keypoints1"], output["matching_scores"]
+        ):
+            print(f"Keypoint at {keypoint0.numpy()} matches with keypoint at {keypoint1.numpy()} with score {matching_score}")
+    ```
+
+- Visualize the matches between the images using the built-in plotting functionality.
+
+    ```py
+    # Easy visualization using the built-in plotting method
+    processor.plot_keypoint_matching(images, processed_outputs)
+    ```
+
+<div class="flex justify-center">
+    <img src="https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/duPp09ty8NRZlMZS18ccP.png">
+</div>
+
+## Resources
+
+- Refer to the [original LightGlue repository](https://github.com/cvg/LightGlue) for more examples and implementation details.

 ## LightGlueConfig

@ -97,8 +130,13 @@ The original code can be found [here](https://github.com/cvg/LightGlue).
 - post_process_keypoint_matching
 - plot_keypoint_matching

+<frameworkcontent>
+<pt>
 ## LightGlueForKeypointMatching

 [[autodoc]] LightGlueForKeypointMatching

 - forward
+
+</pt>
+</frameworkcontent>
--- a/docs/source/en/model_doc/llava_next.md
+++ b/docs/source/en/model_doc/llava_next.md
@ -14,287 +14,178 @@ rendered properly in your Markdown viewer.

 -->

-# LLaVA-NeXT
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+<div style="float: right;">
+  <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+  </div>
 </div>

-## Overview
+# LLaVA-NeXT

-The LLaVA-NeXT model was proposed in [LLaVA-NeXT: Improved reasoning, OCR, and world knowledge](https://llava-vl.github.io/blog/2024-01-30-llava-next/) by Haotian Liu, Chunyuan Li, Yuheng Li, Bo Li, Yuanhan Zhang, Sheng Shen, Yong Jae Lee. LLaVa-NeXT (also called LLaVa-1.6) improves upon [LLaVa](llava) by increasing the input image resolution and training on an improved visual instruction tuning dataset to improve OCR and common sense reasoning.
+[LLaVA‑NeXT](https://llava-vl.github.io/blog/2024-05-10-llava-next-stronger-llms/) improves on [Llava](./llava) by increasing the input image resolution by 4x more pixels and supporting 3 aspect ratios (up to 672x672, 336x1344, 1344x336) to better grasp visual details. It is also trained on an improved visual instruction tuning dataset covering more scenarios and applications to improve OCR and common sense reasoning.

-The introduction from the blog is the following:
+You can find all the original LLaVA‑NeXT checkpoints under the [LLaVA-NeXT](https://huggingface.co/collections/llava-hf/llava-next-65f75c4afac77fd37dbbe6cf) collection.

-*In October 2023, we released LLaVA-1.5 with a simple and efficient design along with great performance on a benchmark suite of 12 datasets. It has since served as the foundation of many comprehensive studies of data, model, and capabilities of large multimodal models (LMM), and has enabled various new applications.
+> [!TIP]
+> This model was contributed by [nielsr](https://huggingface.co/nielsr).
+>
+> Click on the LLaVA‑NeXT models in the right sidebar for more examples of how to apply Llava-NeXT to different multimodal tasks.

-Today, we are thrilled to present LLaVA-NeXT, with improved reasoning, OCR, and world knowledge. LLaVA-NeXT even exceeds Gemini Pro on several benchmarks.
+The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.

-Compared with LLaVA-1.5, LLaVA-NeXT has several improvements:
+<hfoptions id="usage">

-Increasing the input image resolution to 4x more pixels. This allows it to grasp more visual details. It supports three aspect ratios, up to 672x672, 336x1344, 1344x336 resolution.
-Better visual reasoning and OCR capability with an improved visual instruction tuning data mixture.
-Better visual conversation for more scenarios, covering different applications. Better world knowledge and logical reasoning.
-Efficient deployment and inference with SGLang.
-Along with performance improvements, LLaVA-NeXT maintains the minimalist design and data efficiency of LLaVA-1.5. It re-uses the pretrained connector of LLaVA-1.5, and still uses less than 1M visual instruction tuning samples. The largest 34B variant finishes training in ~1 day with 32 A100s.*
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava_next_overview.png"
-alt="drawing" width="600"/>
-
-<small> LLaVa-NeXT incorporates a higher input resolution by encoding various patches of the input image. Taken from the <a href="https://huggingface.co/papers/2310.03744">original paper.</a> </small>
-
-This model was contributed by [nielsr](https://huggingface.co/nielsr).
-The original code can be found [here](https://github.com/haotian-liu/LLaVA/tree/main).
-
-## Usage tips
-
- We advise users to use `padding_side="left"` when computing batched generation as it leads to more accurate results. Simply make sure to call `processor.tokenizer.padding_side = "left"` before generating.
-
-<Tip warning={true}>
-
- Llava-Next uses different number of patches for images and thus has to pad the inputs inside modeling code, aside from the padding done when processing the inputs. The default setting is "left-padding" if model is in `eval()` mode, otherwise "right-padding".
-
-</Tip>
-
-
-> [!NOTE]
-> LLaVA models after release v4.46 will raise warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add the attributes to the processor if you own the model checkpoint, or open a PR if it is not owned by you.
-Adding these attributes means that LLaVA will try to infer the number of image tokens required per image and expand the text with as many `<image>` placeholders as there will be tokens. Usually it is around 500 tokens per image, so make sure that the text is not truncated as otherwise there will be failure when merging the embeddings.
-The attributes can be obtained from model config, as `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`. The `num_additional_image_tokens` should be `1` if the vision backbone adds a CLS token or `0` if nothing extra is added to the vision patches.
-
-
-### Formatting Prompts with Chat Templates  
-
-Each **checkpoint** is trained with a specific prompt format, depending on the underlying large language model backbone. To ensure correct formatting, use the processor’s `apply_chat_template` method.  
-
-**Important:**  
- You must construct a conversation history — passing a plain string won't work.  
- Each message should be a dictionary with `"role"` and `"content"` keys.  
- The `"content"` should be a list of dictionaries for different modalities like `"text"` and `"image"`.  
-
-
-Here’s an example of how to structure your input. We will use [llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) and a conversation history of text and image.
+<hfoption id="Pipeline">

 ```python
-from transformers import LlavaNextProcessor
+import torch  
+from transformers import pipeline  

-processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
-
-conversation = [
-    {
-        "role": "user",
-        "content": [
-            {"type": "image"},
-            {"type": "text", "text": "What’s shown in this image?"},
-        ],
-    },
-    {
-        "role": "assistant",
-        "content": [{"type": "text", "text": "This image shows a red stop sign."},]
-    },
-    {
-
-        "role": "user",
-        "content": [
-            {"type": "text", "text": "Describe the image in more details."},
-        ],
-    },
-]
-
-text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
-
-# Note that the template simply formats your prompt, you still have to tokenize it and obtain pixel values for your images
-print(text_prompt)
->>> "[INST] <image>\nWhat's shown in this image? [/INST] This image shows a red stop sign. [INST] Describe the image in more details. [/INST]"
+pipeline = pipeline(  
+    task="image-text-to-text",  
+    model="llava-hf/llava-v1.6-mistral-7b-hf",  
+    device=0,  
+    torch_dtype=torch.bfloat16  
+)  
+messages = [  
+    {  
+        "role": "user",  
+        "content": [  
+            {  
+                "type": "image",  
+                "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg",  
+            },  
+            { "type": "text", "text": "Describe this image."},  
+        ]  
+    }  
+]  
+pipeline(text=messages, max_new_tokens=20, return_full_text=False)
 ```

- If you want to construct a chat prompt yourself, below is a list of possible formats
-.
-[llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) requires the following format:
-```bash
-"[INST] <image>\nWhat is shown in this image? [/INST]"
+</hfoption>
+
+<hfoption id="AutoModel">
+
+```python
+import torch  
+import requests  
+from PIL import Image  
+from transformers import AutoProcessor, LlavaNextForConditionalGeneration  
+
+processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")  
+model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16).to("cuda")  
+
+url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"  
+image = Image.open(requests.get(url, stream=True).raw)  
+
+conversation = [  
+    {  
+        "role": "user",  
+        "content": [  
+            {"type": "image"},  
+            {"type": "text", "text": "What is shown in this image?"},  
+        ],  
+    },  
+]  
+prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)  
+inputs = processor(image, prompt, return_tensors="pt").to("cuda")  
+output = model.generate(**inputs, max_new_tokens=100)  
+print(processor.decode(output[0], skip_special_tokens=True))  
 ```

-[llava-v1.6-vicuna-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-7b-hf) and [llava-v1.6-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) require the following format:
-```bash
-"A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nWhat is shown in this image? ASSISTANT:"
+</hfoption>
+
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to int4.
+
+```python
+import torch  
+import requests  
+from PIL import Image  
+from transformers import AutoModelForImageTextToText, AutoProcessor, BitsAndBytesConfig  
+
+quant_config = BitsAndBytesConfig(  
+    load_in_4bit=True,  
+    bnb_4bit_compute_dtype=torch.float16,  
+    bnb_4bit_quant_type="nf4"  
+)  
+
+processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")  
+model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quant_config, device_map="auto")  
+
+url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava_next_ocr.png"  
+image = Image.open(requests.get(url, stream=True).raw)  
+
+conversation = [  
+    {  
+        "role": "user",  
+        "content": [  
+            {"type": "image"},  
+            {"type": "text", "text": "What does this chart show?"},  
+        ],  
+    },  
+]  
+prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)  
+inputs = processor(image, prompt, return_tensors="pt").to("cuda")  
+
+with torch.inference_mode():  
+    output = model.generate(**inputs, max_new_tokens=100)  
+print(processor.decode(output[0], skip_special_tokens=True))  
 ```

-[llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) requires the following format:
-```bash
-"<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|><|im_start|>assistant\n"
+
+## Notes
+
+* Different checkpoints (Mistral, Vicuna, etc.) require a specific prompt format depending on the underlying LLM. Always use [`~ProcessorMixin.apply_chat_template`] to ensure correct formatting. Refer to the [Templates](../chat_templating) guide for more details.
+
+* Set `padding_side="left"` during batched generation for more accurate results.
+
+```py
+processor.tokenizer.padding_side = "left"
 ```

-[llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llava-next-8b-hf) requires the following format:
+* LLaVA-NeXT uses different numbers of patches for images and pads the inputs inside the modeling code except when padding is done during processing. The default setting is *left-padding* if the model is in `eval()` mode, otherwise it is *right-padding*.

-```bash
-"<|start_header_id|>system<|end_header_id|>\n\nYou are a helpful language and vision assistant. You are able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language.<|eot_id|><|start_header_id|><|start_header_id|>user<|end_header_id|>\n\n<image>\nWhat is shown in this image?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
-```
+* LLaVA models after v4.46 raises warnings about adding `processor.patch_size = {{patch_size}}`, `processor.num_additional_image_tokens = {{num_additional_image_tokens}}`, and `processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. It is strongly recommended to add these attributes to the processor if you own the model checkpoint or open a PR if it isn't.

-[llava-next-72b-hf](https://huggingface.co/llava-hf/llava-next-72b-hf) and [llava-next-110b-hf](https://huggingface.co/llava-hf/llava-next-110b-hf) require the following format:
+  Adding these attributes means LLaVA will try to infer the number of image tokens required per image and expand the text with the same number of `<image>` token placeholders. There are usually ~500 tokens per image, so make sure the text is not truncated because it will cause a failure when merging the embeddings. The attributes can be found in `model.config.vision_config.patch_size` or `model.config.vision_feature_select_strategy`.

-```bash
-"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<image>\nWhat is shown in this image?<|im_end|>\n<|im_start|>assistant\n"
-```
+  The `num_additional_image_tokens` should be `1` if the vision backbone adds a `CLS` token or `0` if nothing extra is added.

-🚀 **Bonus:** If you're using `transformers>=4.49.0`, you can also get a vectorized output from `apply_chat_template`. See the **Usage Examples** below for more details on how to use it.
-
-
-
-## Usage example
-
-### Single image inference
-
-Here's how to load the model and perform inference in half-precision (`torch.float16`):
+* The example below demonstrates inference with multiple input images.

 ```python
 from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration
-import torch
 from PIL import Image
-import requests
+import requests, torch

 processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
+model = LlavaNextForConditionalGeneration.from_pretrained(
+    "llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16
+).to("cuda")

-model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16)
-model.to("cuda:0")
+# Load multiple images
+url1 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava_next_ocr.png"
+url2 = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/llava_next_comparison.png"

-# prepare image and text prompt, using the appropriate prompt template
-url = "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
-image = Image.open(requests.get(url, stream=True).raw)
+image1 = Image.open(requests.get(url1, stream=True).raw)
+image2 = Image.open(requests.get(url2, stream=True).raw)

 conversation = [
-    {
-        "role": "user",
-        "content": [
-            {"type": "image"},
-            {"type": "text", "text": "What is shown in this image?"},
-        ],
-    },
+    {"role": "user", "content": [{"type": "image"}, {"type": "image"}, {"type": "text", "text": "Compare these two images and describe the differences."}]}
 ]
 prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
-inputs = processor(image, prompt, return_tensors="pt").to("cuda:0")
+inputs = processor([image1, image2], prompt, return_tensors="pt").to("cuda")

-# autoregressively complete prompt
 output = model.generate(**inputs, max_new_tokens=100)
-
 print(processor.decode(output[0], skip_special_tokens=True))
 ```

-### Multi image inference
-
-LLaVa-Next can perform inference with multiple images as input, where images either belong to the same prompt or different prompts (in batched inference). Here is how you can do it:
-
-```python
-import requests
-from PIL import Image
-import torch
-from transformers import AutoProcessor, AutoModelForImageTextToText
-
-# Load the model in half-precision
-model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto")
-processor = AutoProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
-
-# Get three different images
-url = "https://www.ilankelman.org/stopsigns/australia.jpg"
-image_stop = Image.open(requests.get(url, stream=True).raw)
-
-url = "http://images.cocodataset.org/val2017/000000039769.jpg"
-image_cats = Image.open(requests.get(url, stream=True).raw)
-
-url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
-image_snowman = Image.open(requests.get(url, stream=True).raw)
-
-# Prepare a batch of two prompts, where the first one is a multi-turn conversation and the second is not
-conversation_1 = [
-    {
-        "role": "user",
-        "content": [
-            {"type": "image"},
-            {"type": "text", "text": "What is shown in this image?"},
-            ],
-    },
-    {
-        "role": "assistant",
-        "content": [
-            {"type": "text", "text": "There is a red stop sign in the image."},
-            ],
-    },
-    {
-        "role": "user",
-        "content": [
-            {"type": "image"},
-            {"type": "text", "text": "What about this image? How many cats do you see?"},
-            ],
-    },
-]
-
-conversation_2 = [
-    {
-        "role": "user",
-        "content": [
-            {"type": "image"},
-            {"type": "text", "text": "What is shown in this image?"},
-            ],
-    },
-]
-
-prompt_1 = processor.apply_chat_template(conversation_1, add_generation_prompt=True)
-prompt_2 = processor.apply_chat_template(conversation_2, add_generation_prompt=True)
-prompts = [prompt_1, prompt_2]
-
-# We can simply feed images in the order they have to be used in the text prompt
-# Each "<image>" token uses one image leaving the next for the subsequent "<image>" tokens
-inputs = processor(images=[image_stop, image_cats, image_snowman], text=prompts, padding=True, return_tensors="pt").to(model.device)
-
-# Generate
-generate_ids = model.generate(**inputs, max_new_tokens=30)
-processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
-```
-
-## Model optimization
-
-### Quantization using Bitsandbytes
-
-The model can be loaded in 8 or 4 bits, greatly reducing the memory requirements while maintaining the performance of the original model. First make sure to install bitsandbytes, `pip install bitsandbytes`, and to have access to a GPU/accelerator that is supported by the library.
-
-<Tip>
-
-bitsandbytes is being refactored to support multiple backends beyond CUDA. Currently, ROCm (AMD GPU) and Intel CPU implementations are mature, with Intel XPU in progress and Apple Silicon support expected by Q4/Q1. For installation instructions and the latest backend updates, visit [this link](https://huggingface.co/docs/bitsandbytes/main/en/installation#multi-backend).
-
-We value your feedback to help identify bugs before the full release! Check out [these docs](https://huggingface.co/docs/bitsandbytes/main/en/non_cuda_backends) for more details and feedback links.
-
-</Tip>
-
-Simply change the snippet above with:
-
-```python
-from transformers import AutoModelForImageTextToText, BitsAndBytesConfig
-
-# specify how to quantize the model
-quantization_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.float16,
-)
-
-model = AutoModelForImageTextToText.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", quantization_config=quantization_config, device_map="auto")
-```
-
-### Use Flash-Attention 2 to further speed-up generation
-
-First make sure to install flash-attn. Refer to the [original repository of Flash Attention](https://github.com/Dao-AILab/flash-attention) regarding that package installation. Simply change the snippet above with:
-
-```python
-from transformers import AutoModelForImageTextToText
-
-model = AutoModelForImageTextToText.from_pretrained(
-    model_id,
-    torch_dtype=torch.float16,
-    use_flash_attention_2=True
-).to(0)
-```

 ## LlavaNextConfig

--- a/docs/source/en/model_doc/mamba.md
+++ b/docs/source/en/model_doc/mamba.md
@ -28,6 +28,7 @@ You can find all the original Mamba checkpoints under the [State Space Models](h


 > [!TIP]
+> This model was contributed by [Molbap](https://huggingface.co/Molbap) and [AntonV](https://huggingface.co/AntonV).
 > Click on the Mamba models in the right sidebar for more examples of how to apply Mamba to different language tasks.

 The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.
@ -115,6 +116,13 @@ print(tokenizer.decode(output[0], skip_special_tokens=True))
  trainer.train()
   ```

+## MambaCache
+
+[[autodoc]] MambaCache
+    - update_conv_state
+    - update_ssm_state
+    - reset
+
 ## MambaConfig

 [[autodoc]] MambaConfig
--- a/docs/source/en/model_doc/mamba2.md
+++ b/docs/source/en/model_doc/mamba2.md
@ -26,6 +26,7 @@ rendered properly in your Markdown viewer.
 You can find all the original Mamba 2 checkpoints under the [State Space Models](https://huggingface.co/state-spaces) organization, but the examples shown below use [mistralai/Mamba-Codestral-7B-v0.1](https://huggingface.co/mistralai/Mamba-Codestral-7B-v0.1) because a Hugging Face implementation isn't supported yet for the original checkpoints.

 > [!TIP]
+> This model was contributed by [ArthurZ](https://huggingface.co/ArthurZ).
 > Click on the Mamba models in the right sidebar for more examples of how to apply Mamba to different language tasks.

 The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.
--- a/docs/source/en/model_doc/marian.md
+++ b/docs/source/en/model_doc/marian.md
@ -14,159 +14,138 @@ rendered properly in your Markdown viewer.

 -->

-# MarianMT
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+        <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
 ">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
 </div>

-## Overview
-
-A framework for translation models, using the same models as BART. Translations should be similar, but not identical to output in the test set linked to in each model card.
-This model was contributed by [sshleifer](https://huggingface.co/sshleifer).
+# MarianMT


-## Implementation Notes

- Each model is about 298 MB on disk, there are more than 1,000 models.
- The list of supported language pairs can be found [here](https://huggingface.co/Helsinki-NLP).
- Models were originally trained by [Jörg Tiedemann](https://researchportal.helsinki.fi/en/persons/j%C3%B6rg-tiedemann) using the [Marian](https://marian-nmt.github.io/) C++ library, which supports fast training and translation.
- All models are transformer encoder-decoders with 6 layers in each component. Each model's performance is documented
-  in a model card.
- The 80 opus models that require BPE preprocessing are not supported.
- The modeling code is the same as [`BartForConditionalGeneration`] with a few minor modifications:
+[MarianMT](https://huggingface.co/papers/1804.00344) is a machine translation model trained with the Marian framework which is written in pure C++. The framework includes its own custom auto-differentiation engine and efficient meta-algorithms to train encoder-decoder models like BART.

-  - static (sinusoid) positional embeddings (`MarianConfig.static_position_embeddings=True`)
-  - no layernorm_embedding (`MarianConfig.normalize_embedding=False`)
-  - the model starts generating with `pad_token_id` (which has 0 as a token_embedding) as the prefix (Bart uses
-    `<s/>`),
- Code to bulk convert models can be found in `convert_marian_to_pytorch.py`.
+All MarianMT models are transformer encoder-decoders with 6 layers in each component, use static sinusoidal positional embeddings, don't have a layernorm embedding, and the model starts generating with the prefix `pad_token_id` instead of `<s/>`.


-## Naming

- All model names use the following format: `Helsinki-NLP/opus-mt-{src}-{tgt}`
- The language codes used to name models are inconsistent. Two digit codes can usually be found [here](https://developers.google.com/admin-sdk/directory/v1/languages), three digit codes require googling "language
-  code {code}".
- Codes formatted like `es_AR` are usually `code_{region}`. That one is Spanish from Argentina.
- The models were converted in two stages. The first 1000 models use ISO-639-2 codes to identify languages, the second
-  group use a combination of ISO-639-5 codes and ISO-639-2 codes.
+You can find all the original MarianMT checkpoints under the [Language Technology Research Group at the University of Helsinki](https://huggingface.co/Helsinki-NLP/models?search=opus-mt) organization.


-## Examples
+> [!TIP]
+> This model was contributed by [sshleifer](https://huggingface.co/sshleifer).
+>
+> Click on the MarianMT models in the right sidebar for more examples of how to apply MarianMT to translation tasks.

- Since Marian models are smaller than many other translation models available in the library, they can be useful for
-  fine-tuning experiments and integration tests.
- [Fine-tune on GPU](https://github.com/huggingface/transformers/blob/master/examples/legacy/seq2seq/train_distil_marian_enro.sh)

-## Multilingual Models
+The example below demonstrates how to translate text using [`Pipeline`] or the [`AutoModel`] class.

- All model names use the following format: `Helsinki-NLP/opus-mt-{src}-{tgt}`:
- If a model can output multiple languages, and you should specify a language code by prepending the desired output
-  language to the `src_text`.
- You can see a models's supported language codes in its model card, under target constituents, like in [opus-mt-en-roa](https://huggingface.co/Helsinki-NLP/opus-mt-en-roa).
- Note that if a model is only multilingual on the source side, like `Helsinki-NLP/opus-mt-roa-en`, no language
-  codes are required.
-
-New multi-lingual models from the [Tatoeba-Challenge repo](https://github.com/Helsinki-NLP/Tatoeba-Challenge)
-require 3 character language codes:
+<hfoptions id="usage">
+<hfoption id="Pipeline">

 ```python
->>> from transformers import MarianMTModel, MarianTokenizer

->>> src_text = [
-...     ">>fra<< this is a sentence in english that we want to translate to french",
-...     ">>por<< This should go to portuguese",
-...     ">>esp<< And this to Spanish",
-... ]
+import torch
+from transformers import pipeline

->>> model_name = "Helsinki-NLP/opus-mt-en-roa"
->>> tokenizer = MarianTokenizer.from_pretrained(model_name)
->>> print(tokenizer.supported_language_codes)
-['>>zlm_Latn<<', '>>mfe<<', '>>hat<<', '>>pap<<', '>>ast<<', '>>cat<<', '>>ind<<', '>>glg<<', '>>wln<<', '>>spa<<', '>>fra<<', '>>ron<<', '>>por<<', '>>ita<<', '>>oci<<', '>>arg<<', '>>min<<']
+pipeline = pipeline("translation_en_to_de", model="Helsinki-NLP/opus-mt-en-de", torch_dtype=torch.float16, device=0)
+pipeline("Hello, how are you?")

->>> model = MarianMTModel.from_pretrained(model_name)
->>> translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
->>> [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
-["c'est une phrase en anglais que nous voulons traduire en français",
- 'Isto deve ir para o português.',
- 'Y esto al español']
 ```

-Here is the code to see all available pretrained models on the hub:
+</hfoption>
+
+<hfoption id="AutoModel">

 ```python
-from huggingface_hub import list_models

-model_list = list_models()
-org = "Helsinki-NLP"
-model_ids = [x.id for x in model_list if x.id.startswith(org)]
-suffix = [x.split("/")[1] for x in model_ids]
-old_style_multi_models = [f"{org}/{s}" for s in suffix if s != s.lower()]
+import torch
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-en-de")
+model = AutoModelForSeq2SeqLM.from_pretrained("Helsinki-NLP/opus-mt-en-de", torch_dtype=torch.float16, attn_implementation="sdpa", device_map="auto")
+
+inputs = tokenizer("Hello, how are you?", return_tensors="pt").to("cuda")
+outputs = model.generate(**inputs, cache_implementation="static")
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+
 ```

-## Old Style Multi-Lingual Models
+</hfoption>
+</hfoptions>

-These are the old style multi-lingual models ported from the OPUS-MT-Train repo: and the members of each language
-group:
-
-```python no-style
-['Helsinki-NLP/opus-mt-NORTH_EU-NORTH_EU',
- 'Helsinki-NLP/opus-mt-ROMANCE-en',
- 'Helsinki-NLP/opus-mt-SCANDINAVIA-SCANDINAVIA',
- 'Helsinki-NLP/opus-mt-de-ZH',
- 'Helsinki-NLP/opus-mt-en-CELTIC',
- 'Helsinki-NLP/opus-mt-en-ROMANCE',
- 'Helsinki-NLP/opus-mt-es-NORWAY',
- 'Helsinki-NLP/opus-mt-fi-NORWAY',
- 'Helsinki-NLP/opus-mt-fi-ZH',
- 'Helsinki-NLP/opus-mt-fi_nb_no_nn_ru_sv_en-SAMI',
- 'Helsinki-NLP/opus-mt-sv-NORWAY',
- 'Helsinki-NLP/opus-mt-sv-ZH']
-GROUP_MEMBERS = {
- 'ZH': ['cmn', 'cn', 'yue', 'ze_zh', 'zh_cn', 'zh_CN', 'zh_HK', 'zh_tw', 'zh_TW', 'zh_yue', 'zhs', 'zht', 'zh'],
- 'ROMANCE': ['fr', 'fr_BE', 'fr_CA', 'fr_FR', 'wa', 'frp', 'oc', 'ca', 'rm', 'lld', 'fur', 'lij', 'lmo', 'es', 'es_AR', 'es_CL', 'es_CO', 'es_CR', 'es_DO', 'es_EC', 'es_ES', 'es_GT', 'es_HN', 'es_MX', 'es_NI', 'es_PA', 'es_PE', 'es_PR', 'es_SV', 'es_UY', 'es_VE', 'pt', 'pt_br', 'pt_BR', 'pt_PT', 'gl', 'lad', 'an', 'mwl', 'it', 'it_IT', 'co', 'nap', 'scn', 'vec', 'sc', 'ro', 'la'],
- 'NORTH_EU': ['de', 'nl', 'fy', 'af', 'da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
- 'SCANDINAVIA': ['da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
- 'SAMI': ['se', 'sma', 'smj', 'smn', 'sms'],
- 'NORWAY': ['nb_NO', 'nb', 'nn_NO', 'nn', 'nog', 'no_nb', 'no'],
- 'CELTIC': ['ga', 'cy', 'br', 'gd', 'kw', 'gv']
-}
-```
-
-Example of translating english to many romance languages, using old-style 2 character language codes

+Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/blob/beb9b5b02246b9b7ee81ddf938f93f44cfeaad19/src/transformers/utils/attention_visualizer.py#L139) to better understand what tokens the model can and cannot attend to.

 ```python
->>> from transformers import MarianMTModel, MarianTokenizer
+from transformers.utils.attention_visualizer import AttentionMaskVisualizer

->>> src_text = [
-...     ">>fr<< this is a sentence in english that we want to translate to french",
-...     ">>pt<< This should go to portuguese",
-...     ">>es<< And this to Spanish",
-... ]
-
->>> model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"
->>> tokenizer = MarianTokenizer.from_pretrained(model_name)
-
->>> model = MarianMTModel.from_pretrained(model_name)
->>> translated = model.generate(**tokenizer(src_text, return_tensors="pt", padding=True))
->>> tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
-["c'est une phrase en anglais que nous voulons traduire en français",
- 'Isto deve ir para o português.',
- 'Y esto al español']
+visualizer = AttentionMaskVisualizer("Helsinki-NLP/opus-mt-en-de")
+visualizer("Hello, how are you?")
 ```
+<div class="flex justify-center">
+   <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/marianmt-attn-mask.png"/>
+</div>

-## Resources
+## Notes

- [Translation task guide](../tasks/translation)
- [Summarization task guide](../tasks/summarization)
- [Causal language modeling task guide](../tasks/language_modeling)
+- MarianMT models are ~298MB on disk and there are more than 1000 models. Check this [list](https://huggingface.co/Helsinki-NLP) for supported language pairs. The language codes may be inconsistent. Two digit codes can be found [here](https://developers.google.com/admin-sdk/directory/v1/languages) while three digit codes may require further searching.
+- Models that require BPE preprocessing are not supported.
+- All model names use the following format: `Helsinki-NLP/opus-mt-{src}-{tgt}`. Language codes formatted like `es_AR` usually refer to the `code_{region}`. For example, `es_AR` refers to Spanish from Argentina.
+- If a model can output multiple languages, prepend the desired output language to `src_txt` as shown below. New multilingual models from the [Tatoeba-Challenge](https://github.com/Helsinki-NLP/Tatoeba-Challenge) require 3 character language codes.
+
+```python
+
+from transformers import MarianMTModel, MarianTokenizer
+
+# Model trained on multiple source languages → multiple target languages
+# Example: multilingual to Arabic (arb)
+model_name = "Helsinki-NLP/opus-mt-mul-mul"  # Tatoeba Challenge model
+tokenizer = MarianTokenizer.from_pretrained(model_name)
+model = MarianMTModel.from_pretrained(model_name)
+
+# Prepend the desired output language code (3-letter ISO 639-3)
+src_texts = ["arb>> Hello, how are you today?"]
+
+# Tokenize and translate
+inputs = tokenizer(src_texts, return_tensors="pt", padding=True, truncation=True)
+translated = model.generate(**inputs)
+
+# Decode and print result
+translated_texts = tokenizer.batch_decode(translated, skip_special_tokens=True)
+print(translated_texts[0])
+
+```
+   
+- Older multilingual models use 2 character language codes.
+
+```python
+
+from transformers import MarianMTModel, MarianTokenizer
+
+# Example: older multilingual model (like en → many)
+model_name = "Helsinki-NLP/opus-mt-en-ROMANCE"  # English → French, Spanish, Italian, etc.
+tokenizer = MarianTokenizer.from_pretrained(model_name)
+model = MarianMTModel.from_pretrained(model_name)
+
+# Prepend the 2-letter ISO 639-1 target language code (older format)
+src_texts = [">>fr<< Hello, how are you today?"]
+
+# Tokenize and translate
+inputs = tokenizer(src_texts, return_tensors="pt", padding=True, truncation=True)
+translated = model.generate(**inputs)
+
+# Decode and print result
+translated_texts = tokenizer.batch_decode(translated, skip_special_tokens=True)
+print(translated_texts[0])
+
+```

 ## MarianConfig

--- a/docs/source/en/model_doc/mask2former.md
+++ b/docs/source/en/model_doc/mask2former.md
@ -77,4 +77,12 @@ The resource should ideally demonstrate something new instead of duplicating an
    - encode_inputs
    - post_process_semantic_segmentation
    - post_process_instance_segmentation
+    - post_process_panoptic_segmentation
+
+## Mask2FormerImageProcessorFast
+
+[[autodoc]] Mask2FormerImageProcessorFast
+    - preprocess
+    - post_process_semantic_segmentation
+    - post_process_instance_segmentation
    - post_process_panoptic_segmentation
--- a/docs/source/en/model_doc/maskformer.md
+++ b/docs/source/en/model_doc/maskformer.md
@ -76,6 +76,14 @@ This model was contributed by [francesco](https://huggingface.co/francesco). The
    - post_process_instance_segmentation
    - post_process_panoptic_segmentation

+## MaskFormerImageProcessorFast
+
+[[autodoc]] MaskFormerImageProcessorFast
+    - preprocess
+    - post_process_semantic_segmentation
+    - post_process_instance_segmentation
+    - post_process_panoptic_segmentation
+
 ## MaskFormerFeatureExtractor

 [[autodoc]] MaskFormerFeatureExtractor
--- a/docs/source/en/model_doc/mistral.md
+++ b/docs/source/en/model_doc/mistral.md
@ -139,6 +139,10 @@ Use the [AttentionMaskVisualizer](https://github.com/huggingface/transformers/bl

 [[autodoc]] MistralConfig

+## MistralCommonTokenizer
+
+[[autodoc]] MistralCommonTokenizer
+
 ## MistralModel

 [[autodoc]] MistralModel
--- a/docs/source/en/model_doc/mistral3.md
+++ b/docs/source/en/model_doc/mistral3.md
@ -13,116 +13,125 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.

 -->
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+           <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&amp;logo=pytorch&amp;logoColor=white">
+    </div>
+</div>

-# Mistral3
+# Mistral 3

-## Overview
+[Mistral 3](https://mistral.ai/news/mistral-small-3) is a latency optimized model with a lot fewer layers to reduce the time per forward pass. This model adds vision understanding and supports long context lengths of up to 128K tokens without compromising performance.

-Building upon Mistral Small 3 (2501), Mistral Small 3.1 (2503) adds state-of-the-art vision understanding and enhances long context capabilities up to 128k tokens without compromising text performance. With 24 billion parameters, this model achieves top-tier capabilities in both text and vision tasks.
+You can find the original Mistral 3 checkpoints under the [Mistral AI](https://huggingface.co/mistralai/models?search=mistral-small-3) organization.

-It is ideal for:
- Fast-response conversational agents.
- Low-latency function calling.
- Subject matter experts via fine-tuning.
- Local inference for hobbyists and organizations handling sensitive data.
- Programming and math reasoning.
- Long document understanding.
- Visual understanding.

-This model was contributed by [cyrilvallez](https://huggingface.co/cyrilvallez) and [yonigozlan](https://huggingface.co/yonigozlan).
+> [!TIP]
+> This model was contributed by [cyrilvallez](https://huggingface.co/cyrilvallez) and [yonigozlan](https://huggingface.co/yonigozlan).
+> Click on the Mistral3 models in the right sidebar for more examples of how to apply Mistral3 to different tasks.

-The original code can be found [here](https://github.com/vllm-project/vllm/blob/main/vllm/model_executor/models/pixtral.py) and [here](https://github.com/mistralai/mistral-common).
+The example below demonstrates how to generate text for an image with [`Pipeline`] and the [`AutoModel`] class.

-## Usage example
+<hfoptions id="usage">
+<hfoption id="Pipeline">

-### Inference with Pipeline
+```py
+import torch
+from transformers import pipeline

-Here is how you can use the `image-text-to-text` pipeline to perform inference with the `Mistral3` models in just a few lines of code:
-```python
->>> from transformers import pipeline
+messages = [
+    {"role": "user",
+        "content":[
+            {"type": "image",
+            "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",},
+            {"type": "text", "text": "Describe this image."}
+        ,]
+    ,}
+,]

->>> messages = [
-...     {
-...         "role": "user",
-...         "content": [
-...             {
-...                 "type": "image",
-...                 "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",
-...             },
-...             {"type": "text", "text": "Describe this image."},
-...         ],
-...     },
-... ]
+pipeline = pipeline(
+    task="image-text-to-text", 
+    model="mistralai/Mistral-Small-3.1-24B-Instruct-2503", 
+    torch_dtype=torch.bfloat16,
+    device=0
+)
+outputs = pipeline(text=messages, max_new_tokens=50, return_full_text=False)

->>> pipe = pipeline("image-text-to-text", model="mistralai/Mistral-Small-3.1-24B-Instruct-2503", torch_dtype=torch.bfloat16)
->>> outputs = pipe(text=messages, max_new_tokens=50, return_full_text=False)
->>> outputs[0]["generated_text"]
+outputs[0]["generated_text"]
 'The image depicts a vibrant and lush garden scene featuring a variety of wildflowers and plants. The central focus is on a large, pinkish-purple flower, likely a Greater Celandine (Chelidonium majus), with a'
 ```
-### Inference on a single image
+</hfoption>
+<hfoption id="AutoModel">

-This example demonstrates how to perform inference on a single image with the Mistral3 models using chat templates.
+```py
+import torch
+from transformers import AutoProcessor, AutoModelForImageTextToText 

-```python
->>> from transformers import AutoProcessor, AutoModelForImageTextToText
->>> import torch
+torch_device = "cuda"
+model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+processor = AutoProcessor.from_pretrained(model_checkpoint)
+model = AutoModelForImageTextToText.from_pretrained(
+    model_checkpoint, 
+    device_map=torch_device, 
+    torch_dtype=torch.bfloat16
+)

->>> torch_device = "cuda"
->>> model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
->>> processor = AutoProcessor.from_pretrained(model_checkpoint)
->>> model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)
+messages = [
+    {"role": "user",
+        "content":[
+            {"type": "image",
+            "image": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg",},
+            {"type": "text", "text": "Describe this image."}
+        ,]
+    ,}
+,]

->>> messages = [
-...     {
-...         "role": "user",
-...         "content": [
-...             {"type": "image", "url": "http://images.cocodataset.org/val2017/000000039769.jpg"},
-...             {"type": "text", "text": "Describe this image"},
-...         ],
-...     }
-... ]
+inputs = processor.apply_chat_template(
+    messages, 
+    add_generation_prompt=True, 
+    tokenize=True, return_dict=True, 
+    return_tensors="pt").to(model.device, dtype=torch.bfloat16)

->>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
+generate_ids = model.generate(**inputs, max_new_tokens=20)
+decoded_output = processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)

->>> generate_ids = model.generate(**inputs, max_new_tokens=20)
->>> decoded_output = processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
-
->>> decoded_output
-"The image depicts two cats lying on a pink blanket. The larger cat, which appears to be an"...
+decoded_output
+'The image depicts a vibrant and lush garden scene featuring a variety of wildflowers and plants. The central focus is on a large, pinkish-purple flower, likely a Greater Celandine (Chelidonium majus), with a'
 ```
+</hfoption>
+</hfoptions>

-### Text-only generation
-This example shows how to generate text using the Mistral3 model without providing any image input.
+## Notes 

+- Mistral 3 supports text-only generation. 
+```py 
+from transformers import AutoProcessor, AutoModelForImageTextToText
+import torch

-````python
->>> from transformers import AutoProcessor, AutoModelForImageTextToText
->>> import torch
+torch_device = "cuda"
+model_checkpoint = ".mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+processor = AutoProcessor.from_pretrained(model_checkpoint)
+model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)

->>> torch_device = "cuda"
->>> model_checkpoint = ".mistralai/Mistral-Small-3.1-24B-Instruct-2503"
->>> processor = AutoProcessor.from_pretrained(model_checkpoint)
->>> model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)
+SYSTEM_PROMPT = "You are a conversational agent that always answers straight to the point, always end your accurate response with an ASCII drawing of a cat."
+user_prompt = "Give me 5 non-formal ways to say 'See you later' in French."

->>> SYSTEM_PROMPT = "You are a conversational agent that always answers straight to the point, always end your accurate response with an ASCII drawing of a cat."
->>> user_prompt = "Give me 5 non-formal ways to say 'See you later' in French."
+messages = [
+    {"role": "system", "content": SYSTEM_PROMPT},
+    {"role": "user", "content": user_prompt},
+]

->>> messages = [
-...    {"role": "system", "content": SYSTEM_PROMPT},
-...    {"role": "user", "content": user_prompt},
-... ]
+text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
+inputs = processor(text=text, return_tensors="pt").to(0, dtype=torch.float16)
+generate_ids = model.generate(**inputs, max_new_tokens=50, do_sample=False)
+decoded_output = processor.batch_decode(generate_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)[0]

->>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
->>> inputs = processor(text=text, return_tensors="pt").to(0, dtype=torch.float16)
->>> generate_ids = model.generate(**inputs, max_new_tokens=50, do_sample=False)
->>> decoded_output = processor.batch_decode(generate_ids[:, inputs["input_ids"].shape[1] :], skip_special_tokens=True)[0]
-
->>> print(decoded_output)
+print(decoded_output)
 "1. À plus tard!
-2. Salut, à plus!
-3. À toute!
-4. À la prochaine!
-5. Je me casse, à plus!
+ 2. Salut, à plus!
+ 3. À toute!
+ 4. À la prochaine!
+ 5. Je me casse, à plus!

 ```
 /\_/\
@ -131,102 +140,101 @@ This example shows how to generate text using the Mistral3 model without providi
 ```"
 ````

-### Batched image and text inputs
-Mistral3 models also support batched image and text inputs.
+- Mistral 3 accepts batched image and text inputs. 
+```py
+from transformers import AutoProcessor, AutoModelForImageTextToText
+import torch

-```python
->>> from transformers import AutoProcessor, AutoModelForImageTextToText
->>> import torch
+torch_device = "cuda"
+model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+processor = AutoProcessor.from_pretrained(model_checkpoint)
+model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)

->>> torch_device = "cuda"
->>> model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
->>> processor = AutoProcessor.from_pretrained(model_checkpoint)
->>> model = AutoModelForImageTextToText.from_pretrained(model_checkpoint, device_map=torch_device, torch_dtype=torch.bfloat16)
-
->>> messages = [
-...     [
-...         {
-...             "role": "user",
-...             "content": [
-...                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
-...                 {"type": "text", "text": "Write a haiku for this image"},
-...             ],
-...         },
-...     ],
-...     [
-...         {
-...             "role": "user",
-...             "content": [
-...                 {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
-...                 {"type": "text", "text": "Describe this image"},
-...             ],
-...         },
-...     ],
-... ]
+messages = [
+     [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
+                 {"type": "text", "text": "Write a haiku for this image"},
+             ],
+         },
+     ],
+     [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "url": "https://www.ilankelman.org/stopsigns/australia.jpg"},
+                 {"type": "text", "text": "Describe this image"},
+             ],
+         },
+     ],
+ ]


->>> inputs = processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
+ inputs = processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)

->>> output = model.generate(**inputs, max_new_tokens=25)
+ output = model.generate(**inputs, max_new_tokens=25)

->>> decoded_outputs = processor.batch_decode(output, skip_special_tokens=True)
->>> decoded_outputs
+ decoded_outputs = processor.batch_decode(output, skip_special_tokens=True)
+ decoded_outputs
 ["Write a haiku for this imageCalm waters reflect\nWhispers of the forest's breath\nPeace on wooden path"
 , "Describe this imageThe image depicts a vibrant street scene in what appears to be a Chinatown district. The focal point is a traditional Chinese"]
 ```

-### Batched multi-image input and quantization with BitsAndBytes
-This implementation of the Mistral3 models supports batched text-images inputs with different number of images for each text.
-This example also how to use `BitsAndBytes` to load the model in 4bit quantization.
+- Mistral 3 also supported batched image and text inputs with a different number of images for each text. The example below quantizes the model with bitsandbytes. 

-```python
->>> from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
->>> import torch
+```py 
+from transformers import AutoProcessor, AutoModelForImageTextToText, BitsAndBytesConfig
+import torch

->>> torch_device = "cuda"
->>> model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
->>> processor = AutoProcessor.from_pretrained(model_checkpoint)
->>> quantization_config = BitsAndBytesConfig(load_in_4bit=True)
->>> model = AutoModelForImageTextToText.from_pretrained(
-...     model_checkpoint, quantization_config=quantization_config
-... )
+torch_device = "cuda"
+model_checkpoint = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+processor = AutoProcessor.from_pretrained(model_checkpoint)
+quantization_config = BitsAndBytesConfig(load_in_4bit=True)
+model = AutoModelForImageTextToText.from_pretrained(
+     model_checkpoint, quantization_config=quantization_config
+ )

->>> messages = [
-...     [
-...         {
-...             "role": "user",
-...             "content": [
-...                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
-...                 {"type": "text", "text": "Write a haiku for this image"},
-...             ],
-...         },
-...     ],
-...     [
-...         {
-...             "role": "user",
-...             "content": [
-...                 {"type": "image", "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"},
-...                 {"type": "image", "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"},
-...                 {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
-...             ],
-...         },
-...     ],
->>> ]
+messages = [
+     [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "url": "https://llava-vl.github.io/static/images/view.jpg"},
+                 {"type": "text", "text": "Write a haiku for this image"},
+             ],
+         },
+     ],
+     [
+         {
+             "role": "user",
+             "content": [
+                 {"type": "image", "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"},
+                 {"type": "image", "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg"},
+                 {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
+             ],
+         },
+     ],
+ ]

->>> inputs = processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
+ inputs = processor.apply_chat_template(messages, padding=True, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)

->>> output = model.generate(**inputs, max_new_tokens=25)
+ output = model.generate(**inputs, max_new_tokens=25)

->>> decoded_outputs = processor.batch_decode(output, skip_special_tokens=True)
->>> decoded_outputs
+ decoded_outputs = processor.batch_decode(output, skip_special_tokens=True)
+ decoded_outputs
 ["Write a haiku for this imageSure, here is a haiku inspired by the image:\n\nCalm lake's wooden path\nSilent forest stands guard\n", "These images depict two different landmarks. Can you identify them? Certainly! The images depict two iconic landmarks:\n\n1. The first image shows the Statue of Liberty in New York City."]
 ```

-
 ## Mistral3Config

 [[autodoc]] Mistral3Config

+## MistralCommonTokenizer
+
+[[autodoc]] MistralCommonTokenizer
+
 ## Mistral3Model

 [[autodoc]] Mistral3Model
--- a/docs/source/en/model_doc/mixtral.md
+++ b/docs/source/en/model_doc/mixtral.md
@ -197,6 +197,10 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h

 [[autodoc]] MixtralConfig

+## MistralCommonTokenizer
+
+[[autodoc]] MistralCommonTokenizer
+
 ## MixtralModel

 [[autodoc]] MixtralModel
--- a/docs/source/en/model_doc/modernbert-decoder.md
+++ b/docs/source/en/model_doc/modernbert-decoder.md
@ -0,0 +1,188 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+<div style="float: right;">
+  <div class="flex flex-wrap space-x-1">
+    <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+    <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+    <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+  </div>
+</div>
+
+# ModernBERT Decoder
+
+ModernBERT Decoder has the same architecture as [ModernBERT](https://huggingface.co/papers/2412.13663) but it is trained from scratch with a causal language modeling objective from the [Ettin paper](https://huggingface.co/papers/2507.11412). This allows for using the same architecture to compare encoders and decoders. This model is the decoder architecture implementation of ModernBERT, designed for autoregressive text generation tasks.
+
+ModernBERT Decoder uses sliding window attention and rotary positional embeddings for efficiency and to handle longer sequences.
+
+You can find all the original ModernBERT Decoder checkpoints under the [jhu-clsp](https://huggingface.co/collections/jhu-clsp/encoders-vs-decoders-the-ettin-suite-686303e16142257eed8e6aeb) collection.
+
+> [!TIP]
+> This model was contributed by [orionw](https://huggingface.co/orionweller).
+>
+> Click on the ModernBERT Decoder models in the right sidebar for more examples of how to apply ModernBERT Decoder to different text generation tasks.
+
+The example below demonstrates how to use ModernBERT Decoder for text generation with [`Pipeline`], [`AutoModel`] (with and without quantization), and from the command line. 
+
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+
+```py
+import torch
+from transformers import pipeline
+
+generator = pipeline(
+    task="text-generation",
+    model="jhu-clsp/ettin-decoder-17m",
+    torch_dtype=torch.float16,
+    device=0
+)
+generator("The future of artificial intelligence is", max_length=50, num_return_sequences=1)
+
+# For sequence classification
+classifier = pipeline(
+    task="text-classification",
+    model="jhu-clsp/ettin-decoder-17m",
+    torch_dtype=torch.float16,
+    device=0
+)
+classifier("This movie is really great!")
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("jhu-clsp/ettin-decoder-17m")
+model = AutoModelForCausalLM.from_pretrained(
+    "jhu-clsp/ettin-decoder-17m",
+    torch_dtype=torch.float16,
+    device_map="auto",
+)
+
+prompt = "The future of artificial intelligence is"
+inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+
+with torch.no_grad():
+    outputs = model.generate(
+        **inputs,
+        max_length=50,
+        num_return_sequences=1,
+        temperature=0.7,
+        do_sample=True,
+        pad_token_id=tokenizer.eos_token_id
+    )
+
+generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+print(f"Generated text: {generated_text}")
+
+# For sequence classification
+from transformers import AutoModelForSequenceClassification
+
+classifier_model = AutoModelForSequenceClassification.from_pretrained(
+    "jhu-clsp/ettin-decoder-17m",
+    torch_dtype=torch.float16,
+    device_map="auto",
+    num_labels=2
+)
+
+text = "This movie is really great!"
+inputs = tokenizer(text, return_tensors="pt").to("cuda")
+
+with torch.no_grad():
+    outputs = classifier_model(**inputs)
+    predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)
+    predicted_class = torch.argmax(predictions, dim=-1)
+
+print(f"Predicted class: {predicted_class.item()}")
+print(f"Prediction probabilities: {predictions}")
+```
+
+</hfoption>
+
+<hfoption id="AutoModel (w/quantization)">
+
+```
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+quantization_config = BitsAndBytesConfig(
+    load_in_8bit=True,
+)
+
+tokenizer = AutoTokenizer.from_pretrained("jhu-clsp/ettin-decoder-1b")
+model = AutoModelForCausalLM.from_pretrained(
+    "jhu-clsp/ettin-decoder-1b",
+    torch_dtype=torch.float16,
+    device_map="auto",
+    quantization_config=quantization_config
+)
+
+prompt = "The future of artificial intelligence is"
+inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
+
+with torch.no_grad():
+    outputs = model.generate(
+        **inputs,
+        max_length=50,
+        num_return_sequences=1,
+        temperature=0.7,
+        do_sample=True,
+        pad_token_id=tokenizer.eos_token_id
+    )
+
+generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
+print(f"Generated text: {generated_text}")
+```
+</hfoption>
+
+<hfoption id="transformers CLI">
+
+```bash
+echo "The future of artificial intelligence is" | transformers run --task text-generation --model jhu-clsp/ettin-decoder-17m --device 0
+```
+
+</hfoption>
+</hfoptions>
+
+
+## ModernBertDecoderConfig
+
+[[autodoc]] ModernBertDecoderConfig
+
+<frameworkcontent>
+<pt>
+
+## ModernBertDecoderModel
+
+[[autodoc]] ModernBertDecoderModel
+    - forward
+
+## ModernBertDecoderForCausalLM
+
+[[autodoc]] ModernBertDecoderForCausalLM
+    - forward
+
+## ModernBertDecoderForSequenceClassification
+
+[[autodoc]] ModernBertDecoderForSequenceClassification
+    - forward
+
+</pt>
+</frameworkcontent>
--- a/docs/source/en/model_doc/olmoe.md
+++ b/docs/source/en/model_doc/olmoe.md
@ -14,27 +14,89 @@ rendered properly in your Markdown viewer.

 -->

-# OLMoE
-
+<div style="float: right;">
 <div class="flex flex-wrap space-x-1">
 <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
 <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
 <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
 </div>
+</div>

-## Overview
+# OLMoE

-The OLMoE model was proposed in [OLMoE: Open Mixture-of-Experts Language Models](https://huggingface.co/papers/2409.02060) by Niklas Muennighoff, Luca Soldaini, Dirk Groeneveld, Kyle Lo, Jacob Morrison, Sewon Min, Weijia Shi, Pete Walsh, Oyvind Tafjord, Nathan Lambert, Yuling Gu, Shane Arora, Akshita Bhagia, Dustin Schwenk, David Wadden, Alexander Wettig, Binyuan Hui, Tim Dettmers, Douwe Kiela, Ali Farhadi, Noah A. Smith, Pang Wei Koh, Amanpreet Singh, Hannaneh Hajishirzi.
+[OLMoE](https://huggingface.co/papers/2409.02060) is a sparse Mixture-of-Experts (MoE) language model with 7B parameters but only 1B parameters are used per input token. It has similar inference costs as dense models but trains ~3x faster. OLMoE uses fine-grained routing with 64 small experts in each layer and uses a dropless token-based routing algorithm.

-OLMoE is a series of **O**pen **L**anguage **Mo**dels using sparse **M**ixture-**o**f-**E**xperts designed to enable the science of language models. We release all code, checkpoints, logs, and details involved in training these models.
+You can find all the original OLMoE checkpoints under the [OLMoE](https://huggingface.co/collections/allenai/olmoe-november-2024-66cf678c047657a30c8cd3da) collection.

-The abstract from the paper is the following:
+> [!TIP]
+> This model was contributed by [Muennighoff](https://hf.co/Muennighoff).
+>
+> Click on the OLMoE models in the right sidebar for more examples of how to apply OLMoE to different language tasks.

-*We introduce OLMoE, a fully open, state-of-the-art language model leveraging sparse Mixture-of-Experts (MoE). OLMoE-1B-7B has 7 billion (B) parameters but uses only 1B per input token. We pretrain it on 5 trillion tokens and further adapt it to create OLMoE-1B-7B-Instruct. Our models outperform all available models with similar active parameters, even surpassing larger ones like Llama2-13B-Chat and DeepSeekMoE-16B. We present various experiments on MoE training, analyze routing in our model showing high specialization, and open-source all aspects of our work: model weights, training data, code, and logs.*
+The example below demonstrates how to generate text with [`Pipeline`] or the [`AutoModel`] class.

-This model was contributed by [Muennighoff](https://hf.co/Muennighoff).
-The original code can be found [here](https://github.com/allenai/OLMoE).
+<hfoptions id="usage">
+<hfoption id="Pipeline">

+```py
+import torch
+from transformers import pipeline
+
+pipe = pipeline(
+    task="text-generation",
+    model="allenai/OLMoE-1B-7B-0125",
+    torch_dtype=torch.float16,
+    device=0,
+)
+
+result = pipe("Dionysus is the god of")
+print(result)
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+model = AutoModelForCausalLM.from_pretrained("allenai/OLMoE-1B-7B-0924", attn_implementation="sdpa", torch_dtype="auto", device_map="auto").to(device)
+tokenizer = AutoTokenizer.from_pretrained("allenai/OLMoE-1B-7B-0924")
+
+inputs = tokenizer("Bitcoin is", return_tensors="pt")
+inputs = {k: v.to(device) for k, v in inputs.items()}
+output = model.generate(**inputs, max_length=64)
+print(tokenizer.decode(output[0]))
+```
+
+## Quantization
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+The example below uses [bitsandbytes](../quantization/bitsandbytes) to only quantize the weights to 4-bits.
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+quantization_config = BitsAndBytesConfig(
+   load_in_4bit=True,
+   bnb_4bit_compute_dtype=torch.float16,
+   bnb_4bit_use_double_quant=True,
+   bnb_4bit_quant_type="nf4"
+)
+
+model = AutoModelForCausalLM.from_pretrained("allenai/OLMoE-1B-7B-0924", attn_implementation="sdpa", torch_dtype="auto", device_map="auto", quantization_config=quantization_config).to(device)
+tokenizer = AutoTokenizer.from_pretrained("allenai/OLMoE-1B-7B-0924")
+
+inputs = tokenizer("Bitcoin is", return_tensors="pt")
+inputs = {k: v.to(device) for k, v in inputs.items()}
+output = model.generate(**inputs, max_length=64)
+print(tokenizer.decode(output[0]))
+```

 ## OlmoeConfig

--- a/docs/source/en/model_doc/oneformer.md
+++ b/docs/source/en/model_doc/oneformer.md
@ -38,7 +38,7 @@ This model was contributed by [Jitesh Jain](https://huggingface.co/praeclarumjj3

 ## Usage tips

-  OneFormer requires two inputs during inference: *image* and *task token*. 
+-  OneFormer requires two inputs during inference: *image* and *task token*.
 - During training, OneFormer only uses panoptic annotations.
 - If you want to train the model in a distributed environment across multiple nodes, then one should update the
  `get_num_masks` function inside in the `OneFormerLoss` class of `modeling_oneformer.py`. When training on multiple nodes, this should be
@ -69,7 +69,14 @@ The resource should ideally demonstrate something new instead of duplicating an

 [[autodoc]] OneFormerImageProcessor
    - preprocess
-    - encode_inputs
+    - post_process_semantic_segmentation
+    - post_process_instance_segmentation
+    - post_process_panoptic_segmentation
+
+## OneFormerImageProcessorFast
+
+[[autodoc]] OneFormerImageProcessorFast
+    - preprocess
    - post_process_semantic_segmentation
    - post_process_instance_segmentation
    - post_process_panoptic_segmentation
@ -87,4 +94,3 @@ The resource should ideally demonstrate something new instead of duplicating an

 [[autodoc]] OneFormerForUniversalSegmentation
    - forward
-    
--- a/docs/source/en/model_doc/opt.md
+++ b/docs/source/en/model_doc/opt.md
@ -1,194 +1,101 @@
-<!--Copyright 2022 The HuggingFace Team. All rights reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
-the License. You may obtain a copy of the License at
-
-http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
-an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
-specific language governing permissions and limitations under the License.
-
-⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
-rendered properly in your Markdown viewer.
-
-->
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+           <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+           <img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
+           <img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N9lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFmsnSos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQsKKaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScSKSBqKCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD82gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtbREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG23nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
+">
+           <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+           <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>

 # OPT

-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
-<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
-">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
+[OPT](https://huggingface.co/papers/2205.01068) is a suite of open-source decoder-only pre-trained transformers whose parameters range from 125M to 175B. OPT models are designed for casual language modeling and aim to enable responsible and reproducible research at scale. OPT-175B is comparable in performance to GPT-3 with only 1/7th the carbon footprint.

-## Overview
+You can find all the original OPT checkpoints under the [OPT](https://huggingface.co/collections/facebook/opt-66ed00e15599f02966818844) collection.

-The OPT model was proposed in [Open Pre-trained Transformer Language Models](https://huggingface.co/papers/2205.01068) by Meta AI.
-OPT is a series of open-sourced large causal language models which perform similar in performance to GPT3.
+> [!TIP]
+> This model was contributed by [ArthurZ](https://huggingface.co/ArthurZ), [ybelkada](https://huggingface.co/ybelkada), and [patrickvonplaten](https://huggingface.co/patrickvonplaten).
+>
+> Click on the OPT models in the right sidebar for more examples of how to apply OPT to different language tasks.

-The abstract from the paper is the following:
+The example below demonstrates how to generate text with [`Pipeline`], [`AutoModel`], and from the command line.

-*Large language models, which are often trained for hundreds of thousands of compute days, have shown remarkable capabilities for zero- and few-shot learning. Given their computational cost, these models are difficult to replicate without significant capital. For the few that are available through APIs, no access is granted to the full model weights, making them difficult to study. We present Open Pre-trained Transformers (OPT), a suite of decoder-only pre-trained transformers ranging from 125M to 175B parameters, which we aim to fully and responsibly share with interested researchers. We show that OPT-175B is comparable to GPT-3, while requiring only 1/7th the carbon footprint to develop. We are also releasing our logbook detailing the infrastructure challenges we faced, along with code for experimenting with all of the released models.*

-This model was contributed by [Arthur Zucker](https://huggingface.co/ArthurZ), [Younes Belkada](https://huggingface.co/ybelkada), and [Patrick Von Platen](https://huggingface.co/patrickvonplaten).
-The original code can be found [here](https://github.com/facebookresearch/metaseq).
+<hfoptions id="usage">
+<hfoption id="Pipeline">
+  
+```py  
+import torch
+from transformers import pipeline

-Tips:
- OPT has the same architecture as [`BartDecoder`].
- Contrary to GPT2, OPT adds the EOS token `</s>` to the beginning of every prompt.
+pipeline = pipeline(task="text-generation", model="facebook/opt-125m", torch_dtype=torch.float16, device=0)
+pipeline("Once upon a time, in a land far, far away,", max_length=50, num_return_sequences=1)
+```

-> [!NOTE]
-> The `head_mask` argument is ignored when using all attention implementation other than "eager". If you have a `head_mask` and want it to have effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
+</hfoption>
+<hfoption id="AutoModel">
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+device = "cuda"
+
+model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16, attn_implementation="sdpa")
+tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
+
+prompt = ("Once upon a time, in a land far, far away, ")
+
+model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
+model.to(device)
+
+generated_ids = model.generate(**model_inputs, max_new_tokens=30, do_sample=False)
+tokenizer.batch_decode(generated_ids)[0]
+```
+</hfoption>
+<hfoption id="transformers CLI">
+
+```py
+echo -e "Plants create energy through a process known as" | transformers run --task text-generation --model facebook/opt-125m --device 0
+```
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes](..quantization/bitsandbytes) to quantize the weights to 8-bits.
+
+```py
+import torch
+from transformers import BitsAndBytesConfig, AutoTokenizer, AutoModelForCausalLM
+
+device = "cuda"
+
+bnb_config = BitsAndBytesConfig(load_in_8bit=True)
+model = AutoModelForCausalLM.from_pretrained("facebook/opt-13b", torch_dtype=torch.float16, attn_implementation="sdpa", quantization_config=bnb_config)
+tokenizer = AutoTokenizer.from_pretrained("facebook/opt-13b")
+
+prompt = ("Once upon a time, in a land far, far away, ")
+
+model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
+model.to(device)
+
+generated_ids = model.generate(**model_inputs, max_new_tokens=30, do_sample=False)
+tokenizer.batch_decode(generated_ids)[0]
+```
+
+## Notes
+
+- OPT adds an `EOS` token `</s>` to the beginning of every prompt.
+
+- The `head_mask` argument is ignored if the attention implementation isn't `"eager"`. Set `attn_implementation="eager"` to enable the `head_mask`.

 ## Resources

-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with OPT. If you're
-interested in submitting a resource to be included here, please feel free to open a Pull Request and we will review it.
-The resource should ideally demonstrate something new instead of duplicating an existing resource.
-
-<PipelineTag pipeline="text-generation" />
-
- A notebook on [fine-tuning OPT with PEFT, bitsandbytes, and Transformers](https://colab.research.google.com/drive/1jCkpikz0J2o20FBQmYmAGdiKmJGOMo-o?usp=sharing). 🌎
- A blog post on [decoding strategies with OPT](https://huggingface.co/blog/introducing-csearch#62-example-two---opt).
- [Causal language modeling](https://huggingface.co/course/en/chapter7/6?fw=pt#training-a-causal-language-model-from-scratch) chapter of the 🤗 Hugging Face Course.
- [`OPTForCausalLM`] is supported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling#gpt-2gpt-and-causal-language-modeling) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling.ipynb).
- [`TFOPTForCausalLM`] is supported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/tensorflow/language-modeling#run_clmpy) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/language_modeling-tf.ipynb).
- [`FlaxOPTForCausalLM`] is supported by this [causal language modeling example script](https://github.com/huggingface/transformers/tree/main/examples/flax/language-modeling#causal-language-modeling).
-
-<PipelineTag pipeline="text-classification" />
-
- [Text classification task guide](sequence_classification.md)
- [`OPTForSequenceClassification`] is supported by this [example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/text-classification) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb).
-
-<PipelineTag pipeline="question-answering" />
-
- [`OPTForQuestionAnswering`] is supported by this [question answering example script](https://github.com/huggingface/transformers/tree/main/examples/pytorch/question-answering) and [notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/question_answering.ipynb).
- [Question answering](https://huggingface.co/course/chapter7/7?fw=pt) chapter
-  of the 🤗 Hugging Face Course.
-
-⚡️ Inference
-
- A blog post on [How 🤗 Accelerate runs very large models thanks to PyTorch](https://huggingface.co/blog/accelerate-large-models) with OPT.
-
-
-## Combining OPT and Flash Attention 2
-
-First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature.
-
-```bash
-pip install -U flash-attn --no-build-isolation
-```
-
-Make also sure that you have a hardware that is compatible with Flash-Attention 2. Read more about it in the official documentation of flash-attn repository. Make also sure to load your model in half-precision (e.g. `torch.float16``)
-
-To load and run a model using Flash Attention 2, refer to the snippet below:
-
-```python
->>> import torch
->>> from transformers import OPTForCausalLM, GPT2Tokenizer
->>> device = "cuda" # the device to load the model onto
-
->>> model = OPTForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16, attn_implementation="flash_attention_2")
->>> tokenizer = GPT2Tokenizer.from_pretrained("facebook/opt-350m")
-
->>> prompt = ("A chat between a curious human and the Statue of Liberty.\n\nHuman: What is your name?\nStatue: I am the "
-              "Statue of Liberty.\nHuman: Where do you live?\nStatue: New York City.\nHuman: How long have you lived "
-              "there?")
-
->>> model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
->>> model.to(device)
-
->>> generated_ids = model.generate(**model_inputs, max_new_tokens=30, do_sample=False)
->>> tokenizer.batch_decode(generated_ids)[0]
-'</s>A chat between a curious human and the Statue of Liberty.\n\nHuman: What is your name?\nStatue: I am the Statue of Liberty.\nHuman: Where do you live?\nStatue: New York City.\nHuman: How long have you lived there?\nStatue: I have lived here for about a year.\nHuman: What is your favorite place to eat?\nStatue: I love'
-```
-
-### Expected speedups
-
-Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using `facebook/opt-2.7b` checkpoint and the Flash Attention 2 version of the model using two different sequence lengths.
-
-<div style="text-align: center">
-<img src="https://user-images.githubusercontent.com/49240599/281101546-d2fca6d2-ee44-48f3-9534-ba8d5bee4531.png">
-</div>
-
-Below is an expected speedup diagram that compares pure inference time between the native implementation in transformers using `facebook/opt-350m` checkpoint and the Flash Attention 2 version of the model using two different sequence lengths.
-
-<div style="text-align: center">
-<img src="https://user-images.githubusercontent.com/49240599/281101682-d1144e90-0dbc-46f4-8fc8-c6206cb793c9.png">
-</div>
-
-
-### Using Scaled Dot Product Attention (SDPA)
-PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function
-encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the
-[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)
-or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
-page for more information.
-
-SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set
-`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
-
-```python
-from transformers import OPTForCausalLM
-model = OPTForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.float16, attn_implementation="sdpa")
-...
-```
-
-For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
-
-On a local benchmark (L40S-45GB, PyTorch 2.4.0, OS Debian GNU/Linux 11) using `float16` with
-[facebook/opt-350m](https://huggingface.co/facebook/opt-350m), we saw the
-following speedups during training and inference.
-
-### Training
-
-|    batch_size |    seq_len |  Time per batch (eager - s)   |    Time per batch (sdpa - s) |  Speedup (%)   |  Eager peak mem (MB)   |    sdpa peak mem (MB) |  Mem saving (%)   |
-|--------------:|-----------:|:------------------------------|-----------------------------:|:---------------|:-----------------------|----------------------:|:------------------|
-|             1 |        128 | 0.047                         |                        0.037 | 26.360         | 1474.611               |               1474.32 | 0.019             |
-|             1 |        256 | 0.046                         |                        0.037 | 24.335         | 1498.541               |               1499.49 | -0.063            |
-|             1 |        512 | 0.046                         |                        0.037 | 24.959         | 1973.544               |               1551.35 | 27.215            |
-|             1 |       1024 | 0.062                         |                        0.038 | 65.135         | 4867.113               |               1698.35 | 186.578           |
-|             1 |       2048 | 0.230                         |                        0.039 | 483.933        | 15662.224              |               2715.75 | 476.718           |
-|             2 |        128 | 0.045                         |                        0.037 | 20.455         | 1498.164               |               1499.49 | -0.089            |
-|             2 |        256 | 0.046                         |                        0.037 | 24.027         | 1569.367               |               1551.35 | 1.161             |
-|             2 |        512 | 0.045                         |                        0.037 | 20.965         | 3257.074               |               1698.35 | 91.778            |
-|             2 |       1024 | 0.122                         |                        0.038 | 225.958        | 9054.405               |               2715.75 | 233.403           |
-|             2 |       2048 | 0.464                         |                        0.067 | 593.646        | 30572.058              |               4750.55 | 543.548           |
-|             4 |        128 | 0.045                         |                        0.037 | 21.918         | 1549.448               |               1551.35 | -0.123            |
-|             4 |        256 | 0.044                         |                        0.038 | 18.084         | 2451.768               |               1698.35 | 44.361            |
-|             4 |        512 | 0.069                         |                        0.037 | 84.421         | 5833.180               |               2715.75 | 114.791           |
-|             4 |       1024 | 0.262                         |                        0.062 | 319.475        | 17427.842              |               4750.55 | 266.860           |
-|             4 |       2048 | OOM                           |                        0.062 | Eager OOM      | OOM                    |               4750.55 | Eager OOM         |
-|             8 |        128 | 0.044                         |                        0.037 | 18.436         | 2049.115               |               1697.78 | 20.694            |
-|             8 |        256 | 0.048                         |                        0.036 | 32.887         | 4222.567               |               2715.75 | 55.484            |
-|             8 |        512 | 0.153                         |                        0.06  | 154.862        | 10985.391              |               4750.55 | 131.245           |
-|             8 |       1024 | 0.526                         |                        0.122 | 330.697        | 34175.763              |               8821.18 | 287.428           |
-|             8 |       2048 | OOM                           |                        0.122 | Eager OOM      | OOM                    |               8821.18 | Eager OOM         |
-
-### Inference
-
-|    batch_size |    seq_len |    Per token latency eager (ms) |    Per token latency SDPA (ms) |    Speedup (%) |    Mem eager (MB) |    Mem BT (MB) |    Mem saved (%) |
-|--------------:|-----------:|--------------------------------:|-------------------------------:|---------------:|------------------:|---------------:|-----------------:|
-|             1 |        128 |                          11.634 |                          8.647 |         34.546 |           717.676 |        717.674 |            0     |
-|             1 |        256 |                          11.593 |                          8.86  |         30.851 |           742.852 |        742.845 |            0.001 |
-|             1 |        512 |                          11.515 |                          8.816 |         30.614 |           798.232 |        799.593 |           -0.17  |
-|             1 |       1024 |                          11.556 |                          8.915 |         29.628 |           917.265 |        895.538 |            2.426 |
-|             2 |        128 |                          12.724 |                         11.002 |         15.659 |           762.434 |        762.431 |            0     |
-|             2 |        256 |                          12.704 |                         11.063 |         14.83  |           816.809 |        816.733 |            0.009 |
-|             2 |        512 |                          12.757 |                         10.947 |         16.535 |           917.383 |        918.339 |           -0.104 |
-|             2 |       1024 |                          13.018 |                         11.018 |         18.147 |          1162.65  |       1114.81  |            4.291 |
-|             4 |        128 |                          12.739 |                         10.959 |         16.243 |           856.335 |        856.483 |           -0.017 |
-|             4 |        256 |                          12.718 |                         10.837 |         17.355 |           957.298 |        957.674 |           -0.039 |
-|             4 |        512 |                          12.813 |                         10.822 |         18.393 |          1158.44  |       1158.45  |           -0.001 |
-|             4 |       1024 |                          13.416 |                         11.06  |         21.301 |          1653.42  |       1557.19  |            6.18  |
-|             8 |        128 |                          12.763 |                         10.891 |         17.193 |          1036.13  |       1036.51  |           -0.036 |
-|             8 |        256 |                          12.89  |                         11.104 |         16.085 |          1236.98  |       1236.87  |            0.01  |
-|             8 |        512 |                          13.327 |                         10.939 |         21.836 |          1642.29  |       1641.78  |            0.031 |
-|             8 |       1024 |                          15.181 |                         11.175 |         35.848 |          2634.98  |       2443.35  |            7.843 |
+- Refer to this [notebook](https://colab.research.google.com/drive/1jCkpikz0J2o20FBQmYmAGdiKmJGOMo-o?usp=sharing) for an example of fine-tuning OPT with PEFT, bitsandbytes, and Transformers.
+- The [How 🤗 Accelerate runs very large models thanks to PyTorch](https://huggingface.co/blog/accelerate-large-models) blog post demonstrates how to run OPT for inference.

 ## OPTConfig

--- a/docs/source/en/model_doc/owlv2.md
+++ b/docs/source/en/model_doc/owlv2.md
@ -106,6 +106,13 @@ Usage of OWLv2 is identical to [OWL-ViT](owlvit) with a new, updated image proce
    - post_process_object_detection
    - post_process_image_guided_detection

+## Owlv2ImageProcessorFast
+
+[[autodoc]] Owlv2ImageProcessorFast
+    - preprocess
+    - post_process_object_detection
+    - post_process_image_guided_detection
+
 ## Owlv2Processor

 [[autodoc]] Owlv2Processor
--- a/docs/source/en/model_doc/perception_lm.md
+++ b/docs/source/en/model_doc/perception_lm.md
@ -0,0 +1,68 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# PerceptionLM
+
+## Overview
+
+The PerceptionLM model was proposed in [PerceptionLM: Open-Access Data and Models for Detailed Visual Understanding](https://ai.meta.com/research/publications/perceptionlm-open-access-data-and-models-for-detailed-visual-understanding/) by Jang Hyun Cho et al. It's a fully open, reproducible model for transparent research in image and video understanding. PLM consists of
+a vision encoder with a small scale (<8B parameters) LLM decoder.
+
+The abstract from the paper is the following:
+
+*Vision-language models are integral to computer vision research, yet many high-performing models
+remain closed-source, obscuring their data, design and training recipe. The research community
+has responded by using distillation from black-box models to label training data, achieving strong
+benchmark results, at the cost of measurable scientific progress. However, without knowing the details
+of the teacher model and its data sources, scientific progress remains difficult to measure. In this
+paper, we study building a Perception Language Model (PLM) in a fully open and reproducible
+framework for transparent research in image and video understanding. We analyze standard training
+pipelines without distillation from proprietary models and explore large-scale synthetic data to identify
+critical data gaps, particularly in detailed video understanding. To bridge these gaps, we release 2.8M
+human-labeled instances of fine-grained video question-answer pairs and spatio-temporally grounded
+video captions. Additionally, we introduce PLM–VideoBench, a suite for evaluating challenging video
+understanding tasks focusing on the ability to reason about “what”, “where”, “when”, and “how” of a
+video. We make our work fully reproducible by providing data, training recipes, code & models.*
+
+
+This model was contributed by [shumingh](https://huggingface.co/shumingh).
+The original code can be found [here](https://github.com/facebookresearch/perception_models).
+
+
+## PerceptionLMConfig
+
+[[autodoc]] PerceptionLMConfig
+
+## PerceptionLMProcessor
+
+[[autodoc]] PerceptionLMProcessor
+
+## PerceptionLMImageProcessorFast
+
+[[autodoc]] PerceptionLMImageProcessorFast
+
+## PerceptionLMVideoProcessor
+
+[[autodoc]] PerceptionLMVideoProcessor
+
+## PerceptionLMModel
+
+[[autodoc]] PerceptionLMModel
+
+## PerceptionLMForConditionalGeneration
+
+[[autodoc]] PerceptionLMForConditionalGeneration
+    - forward
--- a/docs/source/en/model_doc/phi4_multimodal.md
+++ b/docs/source/en/model_doc/phi4_multimodal.md
@ -9,44 +9,53 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.
 -->

-# Phi4 Multimodal
+<div style="float: right;">
+  <div class="flex flex-wrap space-x-1">
+    <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-EE4C2C?logo=pytorch&logoColor=white&style=flat">
+  </div>
+</div>

-## Overview
+## Phi4 Multimodal

-Phi4 Multimodal is a lightweight open multimodal foundation model that leverages the language, vision, and speech research and datasets used for Phi-3.5 and 4.0 models. The model processes text, image, and audio inputs, generating text outputs, and comes with 128K token context length. The model underwent an enhancement process, incorporating both supervised fine-tuning, direct preference optimization and RLHF (Reinforcement Learning from Human Feedback) to support precise instruction adherence and safety measures. The languages that each modal supports are the following:
+[Phi4 Multimodal](https://huggingface.co/papers/2503.01743) is a multimodal model capable of text, image, and speech and audio inputs or any combination of these. It features a mixture of LoRA adapters for handling different inputs, and each input is routed to the appropriate encoder.

- Text: Arabic, Chinese, Czech, Danish, Dutch, English, Finnish, French, German, Hebrew, Hungarian, Italian, Japanese, Korean, Norwegian, Polish, Portuguese, Russian, Spanish, Swedish, Thai, Turkish, Ukrainian
- Vision: English
- Audio: English, Chinese, German, French, Italian, Japanese, Spanish, Portuguese
+You can find all the original Phi4 Multimodal checkpoints under the [Phi4](https://huggingface.co/collections/microsoft/phi-4-677e9380e514feb5577a40e4) collection.

-This model was contributed by [Cyril Vallez](https://huggingface.co/cyrilvallez). The most recent code can be
-found [here](https://github.com/huggingface/transformers/blob/main/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py).
+> [!TIP]
+> This model was contributed by [cyrilvallez](https://huggingface.co/cyrilvallez).
+>
+> Click on the Phi-4 Multimodal in the right sidebar for more examples of how to apply Phi-4 Multimodal to different tasks.

+The example below demonstrates how to generate text based on an image with [`Pipeline`] or the [`AutoModel`] class.

-## Usage tips
+<hfoptions id="usage">
+<hfoption id="Pipeline">

-`Phi4-multimodal-instruct` can be found on the [Huggingface Hub](https://huggingface.co/microsoft/Phi-4-multimodal-instruct)
+```python
+from transformers import pipeline
+generator = pipeline("text-generation", model="microsoft/Phi-4-multimodal-instruct", torch_dtype="auto", device=0)

-In the following, we demonstrate how to use it for inference depending on the input modalities (text, image, audio).
+prompt = "Explain the concept of multimodal AI in simple terms."
+
+result = generator(prompt, max_length=50)
+print(result[0]['generated_text'])
+```
+
+</hfoption>
+<hfoption id="AutoModel">

 ```python
 import torch
 from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig

-
-# Define model path
 model_path = "microsoft/Phi-4-multimodal-instruct"
 device = "cuda:0"

-# Load model and processor
 processor = AutoProcessor.from_pretrained(model_path)
-model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device,  torch_dtype=torch.float16)
+model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device, torch_dtype=torch.float16)

-# Optional: load the adapters (note that without them, the base model will very likely not work well)
-model.load_adapter(model_path, adapter_name="speech", device_map=device, adapter_kwargs={"subfolder": 'speech-lora'})
 model.load_adapter(model_path, adapter_name="vision", device_map=device, adapter_kwargs={"subfolder": 'vision-lora'})

-# Part : Image Processing
 messages = [
    {
        "role": "user",
@ -57,7 +66,7 @@ messages = [
    },
 ]

-model.set_adapter("vision") # if loaded, activate the vision adapter
+model.set_adapter("vision")
 inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
@ -66,7 +75,6 @@ inputs = processor.apply_chat_template(
    return_tensors="pt",
 ).to(device)

-# Generate response
 generate_ids = model.generate(
    **inputs,
    max_new_tokens=1000,
@ -77,10 +85,27 @@ response = processor.batch_decode(
    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
 )[0]
 print(f'>>> Response\n{response}')
+```

+</hfoption>
+</hfoptions>

-# Part 2: Audio Processing
-model.set_adapter("speech") # if loaded, activate the speech adapter
+## Notes
+
+The example below demonstrates inference with an audio and text input.
+
+```py
+import torch
+from transformers import AutoModelForCausalLM, AutoProcessor, GenerationConfig
+
+model_path = "microsoft/Phi-4-multimodal-instruct"
+device = "cuda:0"
+
+processor = AutoProcessor.from_pretrained(model_path)
+model = AutoModelForCausalLM.from_pretrained(model_path, device_map=device,  torch_dtype=torch.float16)
+
+model.load_adapter(model_path, adapter_name="speech", device_map=device, adapter_kwargs={"subfolder": 'speech-lora'})
+model.set_adapter("speech")
 audio_url = "https://upload.wikimedia.org/wikipedia/commons/b/b0/Barbara_Sahakian_BBC_Radio4_The_Life_Scientific_29_May_2012_b01j5j24.flac"
 messages = [
    {
@ -110,6 +135,7 @@ response = processor.batch_decode(
    generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False
 )[0]
 print(f'>>> Response\n{response}')
+
 ```

 ## Phi4MultimodalFeatureExtractor
--- a/docs/source/en/model_doc/pixtral.md
+++ b/docs/source/en/model_doc/pixtral.md
@ -86,6 +86,10 @@ output = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up

 [[autodoc]] PixtralVisionConfig

+## MistralCommonTokenizer
+
+[[autodoc]] MistralCommonTokenizer
+
 ## PixtralVisionModel

 [[autodoc]] PixtralVisionModel
--- a/docs/source/en/model_doc/sam.md
+++ b/docs/source/en/model_doc/sam.md
@ -25,7 +25,7 @@ rendered properly in your Markdown viewer.

 SAM (Segment Anything Model) was proposed in [Segment Anything](https://huggingface.co/papers/2304.02643v1.pdf) by Alexander Kirillov, Eric Mintun, Nikhila Ravi, Hanzi Mao, Chloe Rolland, Laura Gustafson, Tete Xiao, Spencer Whitehead, Alex Berg, Wan-Yen Lo, Piotr Dollar, Ross Girshick.

-The model can be used to predict segmentation masks of any object of interest given an input image. 
+The model can be used to predict segmentation masks of any object of interest given an input image.

 ![example image](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/sam-output.png)

@ -37,9 +37,9 @@ Tips:

 - The model predicts binary masks that states the presence or not of the object of interest given an image.
 - The model predicts much better results if input 2D points and/or input bounding boxes are provided
- You can prompt multiple points for the same image, and predict a single mask. 
+- You can prompt multiple points for the same image, and predict a single mask.
 - Fine-tuning the model is not supported yet
- According to the paper, textual input should be also supported. However, at this time of writing this seems not to be supported according to [the official repository](https://github.com/facebookresearch/segment-anything/issues/4#issuecomment-1497626844). 
+- According to the paper, textual input should be also supported. However, at this time of writing this seems not to be supported according to [the official repository](https://github.com/facebookresearch/segment-anything/issues/4#issuecomment-1497626844).


 This model was contributed by [ybelkada](https://huggingface.co/ybelkada) and [ArthurZ](https://huggingface.co/ArthurZ).
@ -149,6 +149,11 @@ alt="drawing" width="900"/>
 [[autodoc]] SamImageProcessor


+## SamImageProcessorFast
+
+[[autodoc]] SamImageProcessorFast
+
+
 ## SamVisionModel

 [[autodoc]] SamVisionModel
--- a/docs/source/en/model_doc/superglue.md
+++ b/docs/source/en/model_doc/superglue.md
@ -10,40 +10,31 @@ specific language governing permissions and limitations under the License.
 ⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
 rendered properly in your Markdown viewer.

-
 -->

+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white" >
+    </div>
+</div>
+
 # SuperGlue

-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
+[SuperGlue](https://huggingface.co/papers/1911.11763) is a neural network that matches two sets of local features by jointly finding correspondences and rejecting non-matchable points. Assignments are estimated by solving a differentiable optimal transport problem, whose costs are predicted by a graph neural network. SuperGlue introduces a flexible context aggregation mechanism based on attention, enabling it to reason about the underlying 3D scene and feature assignments jointly. Paired with the [SuperPoint model](https://huggingface.co/magic-leap-community/superpoint), it can be used to match two images and estimate the pose between them. This model is useful for tasks such as image matching, homography estimation, etc.

-## Overview
+You can find all the original SuperGlue checkpoints under the [Magic Leap Community](https://huggingface.co/magic-leap-community) organization.

-The SuperGlue model was proposed in [SuperGlue: Learning Feature Matching with Graph Neural Networks](https://huggingface.co/papers/1911.11763) by Paul-Edouard Sarlin, Daniel DeTone, Tomasz Malisiewicz and Andrew Rabinovich.
+> [!TIP]
+> This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
+>
+> Click on the SuperGlue models in the right sidebar for more examples of how to apply SuperGlue to different computer vision tasks.

-This model consists of matching two sets of interest points detected in an image. Paired with the 
-[SuperPoint model](https://huggingface.co/magic-leap-community/superpoint), it can be used to match two images and 
-estimate the pose between them. This model is useful for tasks such as image matching, homography estimation, etc.
+The example below demonstrates how to match keypoints between two images with the [`AutoModel`] class.

-The abstract from the paper is the following:
+<hfoptions id="usage">
+<hfoption id="AutoModel">

-*This paper introduces SuperGlue, a neural network that matches two sets of local features by jointly finding correspondences 
-and rejecting non-matchable points. Assignments are estimated by solving a differentiable optimal transport problem, whose costs 
-are predicted by a graph neural network. We introduce a flexible context aggregation mechanism based on attention, enabling 
-SuperGlue to reason about the underlying 3D scene and feature assignments jointly. Compared to traditional, hand-designed heuristics, 
-our technique learns priors over geometric transformations and regularities of the 3D world through end-to-end training from image 
-pairs. SuperGlue outperforms other learned approaches and achieves state-of-the-art results on the task of pose estimation in 
-challenging real-world indoor and outdoor environments. The proposed method performs matching in real-time on a modern GPU and 
-can be readily integrated into modern SfM or SLAM systems. The code and trained weights are publicly available at this [URL](https://github.com/magicleap/SuperGluePretrainedNetwork).*
-
-## How to use
-
-Here is a quick example of using the model. Since this model is an image matching model, it requires pairs of images to be matched. 
-The raw outputs contain the list of keypoints detected by the keypoint detector as well as the list of matches with their corresponding 
-matching scores.
-```python
+```py
 from transformers import AutoImageProcessor, AutoModel
 import torch
 from PIL import Image
@ -52,7 +43,7 @@ import requests
 url_image1 = "https://raw.githubusercontent.com/magicleap/SuperGluePretrainedNetwork/refs/heads/master/assets/phototourism_sample_images/united_states_capitol_98169888_3347710852.jpg"
 image1 = Image.open(requests.get(url_image1, stream=True).raw)
 url_image2 = "https://raw.githubusercontent.com/magicleap/SuperGluePretrainedNetwork/refs/heads/master/assets/phototourism_sample_images/united_states_capitol_26757027_6717084061.jpg"
-image_2 = Image.open(requests.get(url_image2, stream=True).raw)
+image2 = Image.open(requests.get(url_image2, stream=True).raw)

 images = [image1, image2]

@ -62,67 +53,97 @@ model = AutoModel.from_pretrained("magic-leap-community/superglue_outdoor")
 inputs = processor(images, return_tensors="pt")
 with torch.no_grad():
    outputs = model(**inputs)
-```

-You can use the `post_process_keypoint_matching` method from the `SuperGlueImageProcessor` to get the keypoints and matches in a more readable format:
-
-```python
+# Post-process to get keypoints and matches
 image_sizes = [[(image.height, image.width) for image in images]]
-outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
-for i, output in enumerate(outputs):
-    print("For the image pair", i)
-    for keypoint0, keypoint1, matching_score in zip(
-            output["keypoints0"], output["keypoints1"], output["matching_scores"]
-    ):
-        print(
-            f"Keypoint at coordinate {keypoint0.numpy()} in the first image matches with keypoint at coordinate {keypoint1.numpy()} in the second image with a score of {matching_score}."
+processed_outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
+```
+
+</hfoption>
+</hfoptions>
+
+## Notes
+
+- SuperGlue performs feature matching between two images simultaneously, requiring pairs of images as input.
+
+    ```python
+    from transformers import AutoImageProcessor, AutoModel
+    import torch
+    from PIL import Image
+    import requests
+    
+    processor = AutoImageProcessor.from_pretrained("magic-leap-community/superglue_outdoor")
+    model = AutoModel.from_pretrained("magic-leap-community/superglue_outdoor")
+    
+    # SuperGlue requires pairs of images
+    images = [image1, image2]
+    inputs = processor(images, return_tensors="pt")
+    outputs = model(**inputs)
+    
+    # Extract matching information
+    keypoints0 = outputs.keypoints0  # Keypoints in first image
+    keypoints1 = outputs.keypoints1  # Keypoints in second image
+    matches = outputs.matches        # Matching indices
+    matching_scores = outputs.matching_scores  # Confidence scores
+    ```
+
+- The model outputs matching indices, keypoints, and confidence scores for each match.
+- For better visualization and analysis, use the [`SuperGlueImageProcessor.post_process_keypoint_matching`] method to get matches in a more readable format.
+
+    ```py
+    # Process outputs for visualization
+    image_sizes = [[(image.height, image.width) for image in images]]
+    processed_outputs = processor.post_process_keypoint_matching(outputs, image_sizes, threshold=0.2)
+    
+    for i, output in enumerate(processed_outputs):
+        print(f"For the image pair {i}")
+        for keypoint0, keypoint1, matching_score in zip(
+                output["keypoints0"], output["keypoints1"], output["matching_scores"]
+        ):
+            print(f"Keypoint at {keypoint0.numpy()} matches with keypoint at {keypoint1.numpy()} with score {matching_score}")
+    ```
+
+- The example below demonstrates how to visualize matches between two images.
+
+    ```py
+    import matplotlib.pyplot as plt
+    import numpy as np
+
+    # Create side by side image
+    merged_image = np.zeros((max(image1.height, image2.height), image1.width + image2.width, 3))
+    merged_image[: image1.height, : image1.width] = np.array(image1) / 255.0
+    merged_image[: image2.height, image1.width :] = np.array(image2) / 255.0
+    plt.imshow(merged_image)
+    plt.axis("off")
+
+    # Retrieve the keypoints and matches
+    output = processed_outputs[0]
+    keypoints0 = output["keypoints0"]
+    keypoints1 = output["keypoints1"]
+    matching_scores = output["matching_scores"]
+
+    # Plot the matches
+    for keypoint0, keypoint1, matching_score in zip(keypoints0, keypoints1, matching_scores):
+        plt.plot(
+            [keypoint0[0], keypoint1[0] + image1.width],
+            [keypoint0[1], keypoint1[1]],
+            color=plt.get_cmap("RdYlGn")(matching_score.item()),
+            alpha=0.9,
+            linewidth=0.5,
        )
+        plt.scatter(keypoint0[0], keypoint0[1], c="black", s=2)
+        plt.scatter(keypoint1[0] + image1.width, keypoint1[1], c="black", s=2)

-```
+    plt.savefig("matched_image.png", dpi=300, bbox_inches='tight')
+    ```

-From the outputs, you can visualize the matches between the two images using the following code:
-```python
-import matplotlib.pyplot as plt
-import numpy as np
+<div class="flex justify-center">
+    <img src="https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/01ZYaLB1NL5XdA8u7yCo4.png">
+</div>

-# Create side by side image
-merged_image = np.zeros((max(image1.height, image2.height), image1.width + image2.width, 3))
-merged_image[: image1.height, : image1.width] = np.array(image1) / 255.0
-merged_image[: image2.height, image1.width :] = np.array(image2) / 255.0
-plt.imshow(merged_image)
-plt.axis("off")
+## Resources

-# Retrieve the keypoints and matches
-output = outputs[0]
-keypoints0 = output["keypoints0"]
-keypoints1 = output["keypoints1"]
-matching_scores = output["matching_scores"]
-keypoints0_x, keypoints0_y = keypoints0[:, 0].numpy(), keypoints0[:, 1].numpy()
-keypoints1_x, keypoints1_y = keypoints1[:, 0].numpy(), keypoints1[:, 1].numpy()
-
-# Plot the matches
-for keypoint0_x, keypoint0_y, keypoint1_x, keypoint1_y, matching_score in zip(
-        keypoints0_x, keypoints0_y, keypoints1_x, keypoints1_y, matching_scores
-):
-    plt.plot(
-        [keypoint0_x, keypoint1_x + image1.width],
-        [keypoint0_y, keypoint1_y],
-        color=plt.get_cmap("RdYlGn")(matching_score.item()),
-        alpha=0.9,
-        linewidth=0.5,
-    )
-    plt.scatter(keypoint0_x, keypoint0_y, c="black", s=2)
-    plt.scatter(keypoint1_x + image1.width, keypoint1_y, c="black", s=2)
-
-# Save the plot
-plt.savefig("matched_image.png", dpi=300, bbox_inches='tight')
-plt.close()
-```
-
-![image/png](https://cdn-uploads.huggingface.co/production/uploads/632885ba1558dac67c440aa8/01ZYaLB1NL5XdA8u7yCo4.png)
-
-This model was contributed by [stevenbucaille](https://huggingface.co/stevenbucaille).
-The original code can be found [here](https://github.com/magicleap/SuperGluePretrainedNetwork).
+- Refer to the [original SuperGlue repository](https://github.com/magicleap/SuperGluePretrainedNetwork) for more examples and implementation details.

 ## SuperGlueConfig

@ -133,10 +154,15 @@ The original code can be found [here](https://github.com/magicleap/SuperGluePret
 [[autodoc]] SuperGlueImageProcessor

 - preprocess
+- post_process_keypoint_matching

+<frameworkcontent>
+<pt>
 ## SuperGlueForKeypointMatching

 [[autodoc]] SuperGlueForKeypointMatching

 - forward
- post_process_keypoint_matching
+
+</pt>
+</frameworkcontent>
--- a/docs/source/en/model_doc/switch_transformers.md
+++ b/docs/source/en/model_doc/switch_transformers.md
@ -14,35 +14,90 @@ rendered properly in your Markdown viewer.

 -->

-# SwitchTransformers
-
-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
 </div>

-## Overview
+# Switch Transformers

-The SwitchTransformers model was proposed in [Switch Transformers: Scaling to Trillion Parameter Models with Simple and Efficient Sparsity](https://huggingface.co/papers/2101.03961) by William Fedus, Barret Zoph, Noam Shazeer.
+[Switch Transformers](https://huggingface.co/papers/2101.03961) is a sparse T5 model where the MLP layer is replaced by a Mixture-of-Experts (MoE). A routing mechanism associates each token with an expert and each expert is a dense MLP. Sparsity enables better scaling and the routing mechanism allows the model to select relevant weights on the fly which increases model capacity.

-The Switch Transformer model uses a sparse T5 encoder-decoder architecture, where the MLP are replaced by a Mixture of Experts (MoE). A routing mechanism (top 1 in this case) associates each token to one of the expert, where each expert is a dense MLP. While switch transformers have a lot more weights than their equivalent dense models, the sparsity allows better scaling and better finetuning performance at scale.
-During a forward pass, only a fraction of the weights are used. The routing mechanism allows the model to select relevant weights on the fly which increases the model capacity without increasing the number of operations.
+You can find all the original Switch Transformers checkpoints under the [Switch Transformer](https://huggingface.co/collections/google/switch-transformers-release-6548c35c6507968374b56d1f) collection.

-The abstract from the paper is the following:

-*In deep learning, models typically reuse the same parameters for all inputs. Mixture of Experts (MoE) defies this and instead selects different parameters for each incoming example. The result is a sparsely-activated model -- with outrageous numbers of parameters -- but a constant computational cost. However, despite several notable successes of MoE, widespread adoption has been hindered by complexity, communication costs and training instability -- we address these with the Switch Transformer. We simplify the MoE routing algorithm and design intuitive improved models with reduced communication and computational costs. Our proposed training techniques help wrangle the instabilities and we show large sparse models may be trained, for the first time, with lower precision (bfloat16) formats. We design models based off T5-Base and T5-Large to obtain up to 7x increases in pre-training speed with the same computational resources. These improvements extend into multilingual settings where we measure gains over the mT5-Base version across all 101 languages. Finally, we advance the current scale of language models by pre-training up to trillion parameter models on the "Colossal Clean Crawled Corpus" and achieve a 4x speedup over the T5-XXL model.*
+> [!TIP]
+> This model was contributed by [ybelkada](https://huggingface.co/ybelkada) and [ArthurZ](https://huggingface.co/ArthurZ).
+>
+> Click on the Switch Transformers models in the right sidebar for more examples of how to apply Switch Transformers to different natural language tasks.

-This model was contributed by [Younes Belkada](https://huggingface.co/ybelkada) and [Arthur Zucker](https://huggingface.co/ArthurZ).
-The original code can be found [here](https://github.com/google/flaxformer/tree/main/flaxformer/architectures/moe).
+The example below demonstrates how to predict the masked token with [`Pipeline`], [`AutoModel`], and from the command line.

-## Usage tips
+<hfoptions id="usage">
+<hfoption id="Pipeline">

- SwitchTransformers uses the [`T5Tokenizer`], which can be loaded directly from each model's repository.
- The released weights are pretrained on English [Masked Language Modeling](https://moon-ci-docs.huggingface.co/docs/transformers/pr_19323/en/glossary#general-terms) task, and should be finetuned.
+```python
+import torch
+from transformers import pipeline

-## Resources
+pipeline = pipeline(
+    task="text2text-generation", 
+    model="google/switch-base-8",
+    torch_dtype=torch.float16,
+    device=0
+)
+print(pipeline("The capital of France is <extra_id_0>."))
+```
+
+</hfoption>
+<hfoption id="AutoModel">
+
+```python
+import torch
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("google/switch-base-8")
+model = AutoModelForSeq2SeqLM.from_pretrained("google/switch-base-8", device_map="auto", torch_dtype=torch.float16)
+
+input_text = "The capital of France is <extra_id_0>."
+input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(0)
+
+outputs = model.generate(input_ids)
+print(tokenizer.decode(outputs[0]))
+```
+
+</hfoption>
+<hfoption id="transformers CLI">
+
+```bash
+echo -e "The capital of France is <extra_id_0>." | transformers run --task text2text-generation --model google/switch-base-8 --device 0
+# [{'generated_text': 'Paris.'}]
+```
+
+</hfoption>
+</hfoptions>
+
+Quantization reduces the memory burden of large models by representing the weights in a lower precision. Refer to the [Quantization](../quantization/overview) overview for more available quantization backends.
+
+The example below uses [bitsandbytes](../quantization/bitsandbytes/) to only quantize the weights to 8-bits.
+
+```py
+# pip install bitsandbytes
+import torch
+from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, BitsAndBytesConfig
+
+tokenizer = AutoTokenizer.from_pretrained("google/switch-base-8")
+quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+model = AutoModelForSeq2SeqLM.from_pretrained("google/switch-base-8", device_map="auto", quantization_config=quantization_config)
+
+input_text = "The capital of France is <extra_id_0>."
+input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(0)
+
+outputs = model.generate(input_ids)
+print(tokenizer.decode(outputs[0]))
+```

- [Translation task guide](../tasks/translation)
- [Summarization task guide](../tasks/summarization)

 ## SwitchTransformersConfig

--- a/docs/source/en/model_doc/t5gemma.md
+++ b/docs/source/en/model_doc/t5gemma.md
@ -24,7 +24,7 @@ rendered properly in your Markdown viewer.

 # T5Gemma

-T5Gemma (aka encoder-decoder Gemma) was proposed in a [research paper](https://arxiv.org/abs/2504.06225) by Google. It is a family of encoder-decoder large langauge models, developed by adapting pretrained decoder-only models into encoder-decoder. T5Gemma includes pretrained and instruction-tuned variants. The architecture is based on transformer encoder-decoder design following T5, with improvements from Gemma 2: GQA, RoPE, GeGLU activation, RMSNorm, and interleaved local/global attention.
+T5Gemma (aka encoder-decoder Gemma) was proposed in a [research paper](https://arxiv.org/abs/2504.06225) by Google. It is a family of encoder-decoder large language models, developed by adapting pretrained decoder-only models into encoder-decoder. T5Gemma includes pretrained and instruction-tuned variants. The architecture is based on transformer encoder-decoder design following T5, with improvements from Gemma 2: GQA, RoPE, GeGLU activation, RMSNorm, and interleaved local/global attention.

 T5Gemma has two groups of model sizes: 1) [Gemma 2](https://ai.google.dev/gemma/docs/core/model_card_2) sizes (2B-2B, 9B-2B, and 9B-9B), which are based on the offical Gemma 2 models (2B and 9B); and 2) [T5](https://arxiv.org/abs/1910.10683) sizes (Small, Base, Large, and XL), where are pretrained under the Gemma 2 framework following T5 configuration. In addition, we also provide a model at ML size (medium large, ~2B in total), which is in-between T5 Large and T5 XL.

--- a/docs/source/en/model_doc/timesfm.md
+++ b/docs/source/en/model_doc/timesfm.md
@ -37,6 +37,7 @@ The original code can be found [here](https://github.com/google-research/timesfm
 To use the model:

 ```python
+import numpy as np
 import torch
 from transformers import TimesFmModelForPrediction

--- a/docs/source/en/model_doc/voxtral.md
+++ b/docs/source/en/model_doc/voxtral.md
@ -0,0 +1,351 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Voxtral
+
+Voxtral is an upgrade of [Ministral 3B and Mistral Small 3B](https://mistral.ai/news/ministraux), extending its language capabilities with audio input support. It is designed to handle tasks such as speech transcription, translation, and audio understanding.
+
+You can read more in Mistral's [realease blog post](https://mistral.ai/news/voxtral).
+
+The model is available in two checkpoints:
+- 3B: [mistralai/Voxtral-Mini-3B-2507](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507)
+- 24B: [mistralai/Voxtral-Small-24B-2507](https://huggingface.co/mistralai/Voxtral-Small-24B-2507)
+
+## Key Features
+
+Voxtral builds on Ministral-3B by adding audio processing capabilities:
+
+- **Transcription mode**: Includes a dedicated mode for speech transcription. By default, Voxtral detects the spoken language and transcribes it accordingly.  
+- **Long-form context**: With a 32k token context window, Voxtral can process up to 30 minutes of audio for transcription or 40 minutes for broader audio understanding.  
+- **Integrated Q&A and summarization**: Supports querying audio directly and producing structured summaries without relying on separate ASR and language models.  
+- **Multilingual support**: Automatically detects language and performs well across several widely spoken languages, including English, Spanish, French, Portuguese, Hindi, German, Dutch, and Italian.  
+- **Function calling via voice**: Can trigger functions or workflows directly from spoken input based on detected user intent.  
+- **Text capabilities**: Maintains the strong text processing performance of its Ministral-3B foundation.
+
+## Usage
+
+### Audio Instruct Mode
+
+The model supports audio-text instructions, including multi-turn and multi-audio interactions, all processed in batches.
+
+➡️ audio + text instruction
+```python
+from transformers import VoxtralForConditionalGeneration, AutoProcessor
+import torch
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+processor = AutoProcessor.from_pretrained(repo_id)
+model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
+conversation = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "audio",
+                "url": "https://huggingface.co/datasets/eustlb/audio-samples/resolve/main/dude_where_is_my_car.wav",
+            },
+            {"type": "text", "text": "What can you tell me about this audio?"},
+        ],
+    }
+]
+
+inputs = processor.apply_chat_template(conversation)
+inputs = inputs.to(device, dtype=torch.bfloat16)
+
+outputs = model.generate(**inputs, max_new_tokens=500)
+decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+
+print("\nGenerated response:")
+print("=" * 80)
+print(decoded_outputs[0])
+print("=" * 80)
+```
+
+➡️ multi-audio + text instruction 
+```python
+from transformers import VoxtralForConditionalGeneration, AutoProcessor
+import torch
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+processor = AutoProcessor.from_pretrained(repo_id)
+model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
+conversation = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "audio",
+                "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/mary_had_lamb.mp3",
+            },
+            {
+                "type": "audio",
+                "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/winning_call.mp3",
+            },
+            {"type": "text", "text": "What sport and what nursery rhyme are referenced?"},
+        ],
+    }
+]
+
+inputs = processor.apply_chat_template(conversation)
+inputs = inputs.to(device, dtype=torch.bfloat16)
+
+outputs = model.generate(**inputs, max_new_tokens=500)
+decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+
+print("\nGenerated response:")
+print("=" * 80)
+print(decoded_outputs[0])
+print("=" * 80)
+```
+
+➡️ multi-turn:
+```python
+from transformers import VoxtralForConditionalGeneration, AutoProcessor
+import torch
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+processor = AutoProcessor.from_pretrained(repo_id)
+model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
+conversation = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "audio",
+                "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3",
+            },
+            {
+                "type": "audio",
+                "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3",
+            },
+            {"type": "text", "text": "Describe briefly what you can hear."},
+        ],
+    },
+    {
+        "role": "assistant",
+        "content": "The audio begins with the speaker delivering a farewell address in Chicago, reflecting on his eight years as president and expressing gratitude to the American people. The audio then transitions to a weather report, stating that it was 35 degrees in Barcelona the previous day, but the temperature would drop to minus 20 degrees the following day.",
+    },
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "audio",
+                "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
+            },
+            {"type": "text", "text": "Ok, now compare this new audio with the previous one."},
+        ],
+    },
+]
+
+inputs = processor.apply_chat_template(conversation)
+inputs = inputs.to(device, dtype=torch.bfloat16)
+
+outputs = model.generate(**inputs, max_new_tokens=500)
+decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+
+print("\nGenerated response:")
+print("=" * 80)
+print(decoded_outputs[0])
+print("=" * 80)
+```
+
+➡️ text only:
+```python
+from transformers import VoxtralForConditionalGeneration, AutoProcessor
+import torch
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+processor = AutoProcessor.from_pretrained(repo_id)
+model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
+conversation = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "text",
+                "text": "What if a cyber brain could possibly generate its own ghost, and create a soul all by itself?",
+            },
+        ],
+    }
+]
+
+inputs = processor.apply_chat_template(conversation)
+inputs = inputs.to(device, dtype=torch.bfloat16)
+
+outputs = model.generate(**inputs, max_new_tokens=500)
+decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+
+print("\nGenerated response:")
+print("=" * 80)
+print(decoded_outputs[0])
+print("=" * 80)
+```
+
+➡️ audio only:
+```python
+from transformers import VoxtralForConditionalGeneration, AutoProcessor
+import torch
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+processor = AutoProcessor.from_pretrained(repo_id)
+model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
+conversation = [
+    {
+        "role": "user",
+        "content": [
+            {
+                "type": "audio",
+                "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/dude_where_is_my_car.wav",
+            },
+        ],
+    }
+]
+
+inputs = processor.apply_chat_template(conversation)
+inputs = inputs.to(device, dtype=torch.bfloat16)
+
+outputs = model.generate(**inputs, max_new_tokens=500)
+decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+
+print("\nGenerated response:")
+print("=" * 80)
+print(decoded_outputs[0])
+print("=" * 80)
+```
+
+➡️ batched inference!
+```python
+from transformers import VoxtralForConditionalGeneration, AutoProcessor
+import torch
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+processor = AutoProcessor.from_pretrained(repo_id)
+model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
+conversations = [
+    [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "audio",
+                    "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3",
+                },
+                {
+                    "type": "audio",
+                    "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/bcn_weather.mp3",
+                },
+                {
+                    "type": "text",
+                    "text": "Who's speaking in the speach and what city's weather is being discussed?",
+                },
+            ],
+        }
+    ],
+    [
+        {
+            "role": "user",
+            "content": [
+                {
+                    "type": "audio",
+                    "path": "https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/winning_call.mp3",
+                },
+                {"type": "text", "text": "What can you tell me about this audio?"},
+            ],
+        }
+    ],
+]
+
+inputs = processor.apply_chat_template(conversations)
+inputs = inputs.to(device, dtype=torch.bfloat16)
+
+outputs = model.generate(**inputs, max_new_tokens=500)
+decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+
+print("\nGenerated responses:")
+print("=" * 80)
+for decoded_output in decoded_outputs:
+    print(decoded_output)
+    print("=" * 80)
+```
+
+### Transcription Mode
+
+Use the model to transcribe audio (supports English, Spanish, French, Portuguese, Hindi, German, Dutch, Italian)!
+
+```python
+from transformers import VoxtralForConditionalGeneration, AutoProcessor
+import torch
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+repo_id = "mistralai/Voxtral-Mini-3B-2507"
+
+processor = AutoProcessor.from_pretrained(repo_id)
+model = VoxtralForConditionalGeneration.from_pretrained(repo_id, torch_dtype=torch.bfloat16, device_map=device)
+
+inputs = processor.apply_transcription_request(language="en", audio="https://huggingface.co/datasets/hf-internal-testing/dummy-audio-samples/resolve/main/obama.mp3", model_id=repo_id)
+inputs = inputs.to(device, dtype=torch.bfloat16)
+
+outputs = model.generate(**inputs, max_new_tokens=500)
+decoded_outputs = processor.batch_decode(outputs[:, inputs.input_ids.shape[1]:], skip_special_tokens=True)
+
+print("\nGenerated responses:")
+print("=" * 80)
+for decoded_output in decoded_outputs:
+    print(decoded_output)
+    print("=" * 80)
+```
+
+This model was contributed by [Eustache Le Bihan](https://huggingface.co/eustlb).
+
+## VoxtralConfig
+
+[[autodoc]] VoxtralConfig
+
+## VoxtralEncoderConfig
+
+[[autodoc]] VoxtralEncoderConfig
+
+## VoxtralProcessor
+
+[[autodoc]] VoxtralProcessor
+
+## VoxtralEncoder
+
+[[autodoc]] VoxtralEncoder
+    - forward
+
+## VoxtralForConditionalGeneration
+
+[[autodoc]] VoxtralForConditionalGeneration
+    - forward
--- a/docs/source/en/model_doc/yolos.md
+++ b/docs/source/en/model_doc/yolos.md
@ -13,76 +13,95 @@ specific language governing permissions and limitations under the License.
 rendered properly in your Markdown viewer.

 -->
+<div style="float: right;">
+    <div class="flex flex-wrap space-x-1">
+        <img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
+        <img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
+        <img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
+    </div>
+</div>

 # YOLOS

-<div class="flex flex-wrap space-x-1">
-<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
-<img alt="FlashAttention" src="https://img.shields.io/badge/%E2%9A%A1%EF%B8%8E%20FlashAttention-eae0c8?style=flat">
-<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
-</div>
+[YOLOS](https://huggingface.co/papers/2106.00666) uses a [Vision Transformer (ViT)](./vit) for object detection with minimal modifications and region priors. It can achieve performance comparable to specialized object detection models and frameworks with knowledge about 2D spatial structures.

-## Overview

-The YOLOS model was proposed in [You Only Look at One Sequence: Rethinking Transformer in Vision through Object Detection](https://huggingface.co/papers/2106.00666) by Yuxin Fang, Bencheng Liao, Xinggang Wang, Jiemin Fang, Jiyang Qi, Rui Wu, Jianwei Niu, Wenyu Liu.
-YOLOS proposes to just leverage the plain [Vision Transformer (ViT)](vit) for object detection, inspired by DETR. It turns out that a base-sized encoder-only Transformer can also achieve 42 AP on COCO, similar to DETR and much more complex frameworks such as Faster R-CNN.
+You can find all the original YOLOS checkpoints under the [HUST Vision Lab](https://huggingface.co/hustvl/models?search=yolos) organization.

-The abstract from the paper is the following:
-
-*Can Transformer perform 2D object- and region-level recognition from a pure sequence-to-sequence perspective with minimal knowledge about the 2D spatial structure? To answer this question, we present You Only Look at One Sequence (YOLOS), a series of object detection models based on the vanilla Vision Transformer with the fewest possible modifications, region priors, as well as inductive biases of the target task. We find that YOLOS pre-trained on the mid-sized ImageNet-1k dataset only can already achieve quite competitive performance on the challenging COCO object detection benchmark, e.g., YOLOS-Base directly adopted from BERT-Base architecture can obtain 42.0 box AP on COCO val. We also discuss the impacts as well as limitations of current pre-train schemes and model scaling strategies for Transformer in vision through YOLOS.*
-
-<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/yolos_architecture.png"
-alt="drawing" width="600"/>
+<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/yolos_architecture.png" alt="drawing" width="600"/>

 <small> YOLOS architecture. Taken from the <a href="https://huggingface.co/papers/2106.00666">original paper</a>.</small>

-This model was contributed by [nielsr](https://huggingface.co/nielsr). The original code can be found [here](https://github.com/hustvl/YOLOS).

-## Using Scaled Dot Product Attention (SDPA)
+> [!TIP]
+> This model wasa contributed by [nielsr](https://huggingface.co/nielsr).
+> Click on the YOLOS models in the right sidebar for more examples of how to apply YOLOS to different object detection tasks.

-PyTorch includes a native scaled dot-product attention (SDPA) operator as part of `torch.nn.functional`. This function 
-encompasses several implementations that can be applied depending on the inputs and the hardware in use. See the 
-[official documentation](https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html) 
-or the [GPU Inference](https://huggingface.co/docs/transformers/main/en/perf_infer_gpu_one#pytorch-scaled-dot-product-attention)
-page for more information.
+The example below demonstrates how to detect objects with [`Pipeline`] or the [`AutoModel`] class.

-SDPA is used by default for `torch>=2.1.1` when an implementation is available, but you may also set 
-`attn_implementation="sdpa"` in `from_pretrained()` to explicitly request SDPA to be used.
+<hfoptions id="usage">
+<hfoption id="Pipeline">

-```
-from transformers import AutoModelForObjectDetection
-model = AutoModelForObjectDetection.from_pretrained("hustvl/yolos-base", attn_implementation="sdpa", torch_dtype=torch.float16)
-...
+```py
+import torch
+from transformers import pipeline
+
+detector = pipeline(
+    task="object-detection",
+    model="hustvl/yolos-base",
+    torch_dtype=torch.float16,
+    device=0
+)
+detector("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
 ```

-For the best speedups, we recommend loading the model in half-precision (e.g. `torch.float16` or `torch.bfloat16`).
+</hfoption>
+<hfoption id="Automodel">

-On a local benchmark (A100-40GB, PyTorch 2.3.0, OS Ubuntu 22.04) with `float32` and `hustvl/yolos-base` model, we saw the following speedups during inference.
+```py
+import torch
+from PIL import Image
+import requests
+from transformers import AutoImageProcessor, AutoModelForObjectDetection

-|   Batch size |   Average inference time (ms), eager mode |   Average inference time (ms), sdpa model |   Speed up, Sdpa / Eager (x) |
-|--------------|-------------------------------------------|-------------------------------------------|------------------------------|
-|            1 |                                       106 |                                        76 |                      1.39 |
-|            2 |                                       154 |                                        90 |                      1.71 |
-|            4 |                                       222 |                                       116 |                      1.91 |
-|            8 |                                       368 |                                       168 |                      2.19 |
+processor = AutoImageProcessor.from_pretrained("hustvl/yolos-base")
+model = AutoModelForObjectDetection.from_pretrained("hustvl/yolos-base", torch_dtype=torch.float16, attn_implementation="sdpa").to("cuda")
+
+url = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png"
+image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
+inputs = processor(images=image, return_tensors="pt").to("cuda")
+
+with torch.no_grad():
+    outputs = model(**inputs)
+logits = outputs.logits.softmax(-1)
+scores, labels = logits[..., :-1].max(-1)
+boxes = outputs.pred_boxes
+
+threshold = 0.3
+keep = scores[0] > threshold
+
+filtered_scores = scores[0][keep]
+filtered_labels = labels[0][keep]
+filtered_boxes  = boxes[0][keep]
+
+width, height = image.size
+pixel_boxes = filtered_boxes * torch.tensor([width, height, width, height], device=boxes.device)
+
+for score, label, box in zip(filtered_scores, filtered_labels, pixel_boxes):
+    x0, y0, x1, y1 = box.tolist()
+    print(f"Label {model.config.id2label[label.item()]}: {score:.2f} at [{x0:.0f}, {y0:.0f}, {x1:.0f}, {y1:.0f}]")
+```
+
+</hfoption>
+</hfoptions>
+
+
+## Notes
+- Use [`YolosImageProcessor`] for preparing images (and optional targets) for the model. Contrary to [DETR](./detr), YOLOS doesn't require a `pixel_mask`.

 ## Resources

-A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with YOLOS.
-
-<PipelineTag pipeline="object-detection"/>
-
- All example notebooks illustrating inference + fine-tuning [`YolosForObjectDetection`] on a custom dataset can be found [here](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/YOLOS).
- Scripts for finetuning [`YolosForObjectDetection`] with [`Trainer`] or [Accelerate](https://huggingface.co/docs/accelerate/index) can be found [here](https://github.com/huggingface/transformers/tree/main/examples/pytorch/object-detection).
- See also: [Object detection task guide](../tasks/object_detection)
-
-If you're interested in submitting a resource to be included here, please feel free to open a Pull Request and we'll review it! The resource should ideally demonstrate something new instead of duplicating an existing resource.
-
-<Tip>
-
-Use [`YolosImageProcessor`] for preparing images (and optional targets) for the model. Contrary to [DETR](detr), YOLOS doesn't require a `pixel_mask` to be created.
-
-</Tip>
+- Refer to these [notebooks](https://github.com/NielsRogge/Transformers-Tutorials/tree/master/YOLOS) for inference and fine-tuning with [`YolosForObjectDetection`] on a custom dataset.

 ## YolosConfig

--- a/docs/source/en/model_sharing.md
+++ b/docs/source/en/model_sharing.md
@ -28,7 +28,7 @@ To share a model to the Hub, you need a Hugging Face [account](https://hf.co/joi
 <hfoption id="huggingface-CLI">

 ```bash
-huggingface-cli login
+hf auth login
 ```

 </hfoption>
--- a/docs/source/en/modular_transformers.md
+++ b/docs/source/en/modular_transformers.md
@ -94,7 +94,7 @@ ValueError: You defined `RobertaEmbeddings` in the modular_roberta.py, it should

 ## Implementing a modular file

-The easiest way to start is by browsing Transformers for a model similar to yours in order to inherit from it. Some good starting points are [Mistral](./model_doc/mistral), [Qwen2](./model_doc/qwen2), [Cohere](./model_doc/cohere) and [Cohere](./model_doc/cohere2), and [Llama](./model_doc/llama). Refer to the table below for components your model might be using and where you can inherit from.
+The easiest way to start is by browsing Transformers for a model similar to yours in order to inherit from it. Some good starting points are [Mistral](./model_doc/mistral), [Qwen2](./model_doc/qwen2), [Cohere](./model_doc/cohere) and [Cohere2](./model_doc/cohere2), and [Llama](./model_doc/llama). Refer to the table below for components your model might be using and where you can inherit from.

 | Component | Model |
 |---|---|
--- a/docs/source/en/optimizers.md
+++ b/docs/source/en/optimizers.md
@ -164,7 +164,7 @@ args = TrainingArguments(
    output_dir="./test-schedulefree",
    max_steps=1000,
    per_device_train_batch_size=4,
-+   optim="schedule_free_radamw,
+   optim="schedule_free_radamw",
 +   lr_scheduler_type="constant",
    gradient_checkpointing=True,
    logging_strategy="steps",
@ -174,3 +174,29 @@ args = TrainingArguments(
    run_name="sfo",
 )
 ```
+
+## StableAdamW
+
+```bash
+pip install torch-optimi
+```
+
+[StableAdamW](https://arxiv.org/pdf/2304.13013) is a hybrid between AdamW and AdaFactor. It ports AdaFactor's update clipping into AdamW, which removes the need for gradient clipping. Otherwise, it behaves as a drop-in replacement for AdamW.
+
+> [!TIP]
+> If training on large batch sizes or still observing training loss spikes, consider reducing beta_2 between [0.95, 0.99].
+
+```diff
+args = TrainingArguments(
+    output_dir="./test-stable-adamw",
+    max_steps=1000,
+    per_device_train_batch_size=4,
+   optim="stable_adamw",
+    gradient_checkpointing=True,
+    logging_strategy="steps",
+    logging_steps=1,
+    learning_rate=2e-6,
+    save_strategy="no",
+    run_name="stable-adamw",
+)
+```
--- a/docs/source/en/perf_hardware.md
+++ b/docs/source/en/perf_hardware.md
@ -15,7 +15,7 @@ rendered properly in your Markdown viewer.

 # Build your own machine

-One of the most important consideration when building a machine for deep learning is the GPU choice. GPUs are the standard workhorse for deep learning owing to their tensor cores for performing very efficient matrix multiplication and high memory bandwidth. To train large models, you either need a more powerful GPU, multiple GPUs, or take advantage of techniques that offload some of the load to the CPU or NVMe.
+One of the most important considerations when building a machine for deep learning is the GPU choice. GPUs are the standard workhorse for deep learning owing to their tensor cores for performing very efficient matrix multiplication and high memory bandwidth. To train large models, you either need a more powerful GPU, multiple GPUs, or take advantage of techniques that offload some of the load to the CPU or NVMe.

 This guide provides some practical tips for setting up a GPU for deep learning. For a more detailed discussion and comparison of GPUs, take a look at the [Which GPU(s) to Get for Deep Learning](https://timdettmers.com/2023/01/30/which-gpu-for-deep-learning/) blog post.

@ -25,11 +25,11 @@ High-end consumer GPUs may have two or three PCIe 8-pin power sockets, and you s

 Each PCIe 8-pin power cable should be connected to a 12V rail on the power supply unit (PSU) and can deliver up to 150W. Other GPUs may use a PCIe 12-pin connector which can deliver up to 500-600W. Lower-end GPUs may only use a PCIe 6-pin connector which supplies up to 75W.

-It is important the PSU has stable voltage otherwise it may not be able to supply the GPU with enough power to function properly during peak usage.
+It is important that the PSU maintains stable voltage; otherwise, it may fail to supply the GPU with enough power during peak usage.

 ## Cooling

-An overheated GPU throttles its performance and can even shutdown if it's too hot to prevent damage. Keeping the GPU temperature low, anywhere between 158 - 167F, is essential for delivering full performance and maintaining its lifespan. Once temperatures reach 183 - 194F, the GPU may begin to throttle performance.
+An overheated GPU throttles its performance and can even shutdown if it's too hot to prevent damage. Keeping the GPU temperature low, anywhere between 158–167°F, is essential for delivering full performance and maintaining its lifespan. Once temperatures reach 183 - 194°F, the GPU may begin to throttle performance.

 ## Multi-GPU connectivity

--- a/docs/source/en/perf_infer_gpu_one.md
+++ b/docs/source/en/perf_infer_gpu_one.md
@ -177,10 +177,16 @@ There are three supported implementations available.

 SDPA is used by default for PyTorch v2.1.1. and greater when an implementation is available. You could explicitly enable SDPA by setting `attn_implementation="sdpa"` in [`~PreTrainedModel.from_pretrained`] though. Certain attention parameters, such as `head_mask` and `output_attentions=True`, are unsupported and returns a warning that Transformers will fall back to the (slower) eager implementation.

+Refer to the [AttentionInterface](./attention_interface) guide to learn how to change the attention implementation after loading a model.
+
 ```py
 from transformers import AutoModelForCausalLM

 model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_map="auto", attn_implementation="sdpa")
+
+# Change the model's attention dynamically after loading it
+model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", device_map="auto")
+model.set_attention_implementation("sdpa")
 ```

 SDPA selects the most performant implementation available, but you can also explicitly select an implementation with [torch.nn.attention.sdpa_kernel](https://pytorch.org/docs/master/backends.html#torch.backends.cuda.sdp_kernel) as a context manager. The example below shows how to enable the FlashAttention2 implementation with `enable_flash=True`.
@ -234,7 +240,7 @@ FlashAttention2 support is currently limited to Instinct MI210, Instinct MI250 a
 </hfoption>
 </hfoptions>

-Enable FlashAttention2 by setting `attn_implementation="flash_attention_2"` in [`~PreTrainedModel.from_pretrained`]. FlashAttention2 is only supported for models with the fp16 or bf16 torch type. Make sure to cast your model to the appropriate data type first.
+Enable FlashAttention2 by setting `attn_implementation="flash_attention_2"` in [`~PreTrainedModel.from_pretrained`] or by setting `model.set_attention_implementation("flash_attention_2")` to dynamically update the [attention interface](./attention_interface). FlashAttention2 is only supported for models with the fp16 or bf16 torch type. Make sure to cast your model to the appropriate data type first.

 ```py
 from transformers import AutoModelForCausalLM
--- a/docs/source/en/quantization/fp_quant.md
+++ b/docs/source/en/quantization/fp_quant.md
@ -0,0 +1,66 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# FP-Quant
+
+[FP-Quant](https://github.com/IST-DASLab/FP-Quant) is a family of quantization algorithms tailored for the Blackwell generation of Nvidia GPUs. The goal is to allow for efficient post-training quantization (PTQ) and quantization-aware trainin (QAT) of LLMs in the [MXFP4 and NVFP4 data-types](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf).
+
+Currently, only PTQ with MXFP4 is supported. Models can either be quantized on the fly with `quantization_config=FPQuantConfig()`:
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer, FPQuantConfig
+import torch
+
+model = AutoModelForCausalLM.from_pretrained(
+    "qwen/Qwen3-8B",
+    quantization_config=FPQuantConfig(),
+    device_map="cuda",
+    torch_dtype=torch.bfloat16,
+)
+```
+
+or pre-processed with GPTQ for better quality (see [FP Format Quantization Harness](https://github.com/IST-DASLab/FP-Quant)).
+
+A **Blackwell-generation GPU is required** to run the kernels. Runtime support for FP-Quant is implemented through the [QuTLASS](https://github.com/IST-DASLab/qutlass) library and a lightweight PyTorch interface lib [`fp_quant`](https://github.com/IST-DASLab/FP-Quant/tree/master/inference_lib). We recommend installing the former **from source** and the latter with  `pip install fp_quant`.
+
+Users **without a Blackwell-generation GPU** , can use the method with `quantization_config=FPQuantConfig(pseudoquant=True)` without having to install [QuTLASS](https://github.com/IST-DASLab/qutlass). This would provide no speedups but would fully emulate the effect of quantization.
+
+> [!TIP]
+> Find models pre-quantized with FP-Quant in the official ISTA-DASLab [collection](https://huggingface.co/collections/ISTA-DASLab/fp-quant-6877c186103a21d3a02568ee).
+
+## torch.compile
+
+FP-Quant is fully compatible with [torch.compile](https://pytorch.org/tutorials/intermediate/torch_compile_tutorial.html).
+
+```python
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, FPQuantConfig
+
+model = AutoModelForCausalLM.from_pretrained(
+    "qwen/Qwen3-8B",
+    quantization_config=FPQuantConfig(),
+    device_map="cuda",
+    torch_dtype=torch.bfloat16,
+)
+
+model.forward = torch.compile(model.forward, mode="max-autotune", fullgraph=True)
+```
+
+## Speedups
+
+FP-Quant currently performs best for very large batch size processing.
+
+See [QuTLASS README](https://github.com/IST-DASLab/qutlass/blob/main/README.md) for speedups.
--- a/docs/source/en/quantization/overview.md
+++ b/docs/source/en/quantization/overview.md
@ -30,6 +30,7 @@ Use the Space below to help you pick a quantization method depending on your har
 | [bitsandbytes](./bitsandbytes)            | 🟢                   | 🟡 |     🟢     | 🟡 | 🔴                    | 🟡 | 🟢 | 4/8          | 🟢               | 🟢                          | 🟢                      | https://github.com/bitsandbytes-foundation/bitsandbytes |
 | [compressed-tensors](./compressed_tensors) | 🔴                   | 🟢              |     🟢     | 🟢        | 🔴                                 | 🔴              | 🔴              | 1/8          | 🟢               | 🟢                          | 🟢                      | https://github.com/neuralmagic/compressed-tensors |
 | [EETQ](./eetq)                            | 🟢                   | 🔴              | 🟢        | 🔴        | 🔴                                 | 🔴              | ?               | 8            | 🟢               | 🟢                          | 🟢                      | https://github.com/NetEase-FuXi/EETQ        |
+| [FP-Quant](./fp_quant)                          | 🟢                   | 🔴              | 🟢        | 🔴        | 🔴                                 | 🔴              | 🟢              | 4           | 🔴               | 🟢                          | 🟢                      | https://github.com/IST-DASLab/FP-Quant      |
 | [GGUF / GGML (llama.cpp)](../gguf)        | 🟢                   | 🟢              | 🟢        | 🔴        | 🟢                                 | 🔴              | 🔴              | 1/8          | 🔴               | [See Notes](../gguf)     | [See Notes](../gguf) | https://github.com/ggerganov/llama.cpp      |
 | [GPTQModel](./gptq)                       | 🔴                   | 🟢 | 🟢        | 🟢        | 🟢                                 | 🟢 | 🔴              | 2/3/4/8      | 🟢               | 🟢                          | 🟢                      | https://github.com/ModelCloud/GPTQModel        |
 | [AutoGPTQ](./gptq)                        | 🔴                   | 🔴              | 🟢        | 🟢        | 🔴                                 | 🔴              | 🔴              | 2/3/4/8      | 🟢               | 🟢                          | 🟢                      | https://github.com/AutoGPTQ/AutoGPTQ        |
--- a/docs/source/en/quicktour.md
+++ b/docs/source/en/quicktour.md
@ -49,7 +49,7 @@ notebook_login()
 Make sure the [huggingface_hub[cli]](https://huggingface.co/docs/huggingface_hub/guides/cli#getting-started) package is installed and run the command below. Paste your User Access Token when prompted to log in.

 ```bash
-huggingface-cli login
+hf auth login
 ```

 </hfoption>
--- a/docs/source/en/serving.md
+++ b/docs/source/en/serving.md
@ -16,62 +16,22 @@ rendered properly in your Markdown viewer.

 # Serving

-Transformer models can be efficiently deployed using libraries such as vLLM, Text Generation Inference (TGI), and others. These libraries are designed for production-grade user-facing services, and can scale to multiple servers and millions of concurrent users.
+Transformer models can be efficiently deployed using libraries such as vLLM, Text Generation Inference (TGI), and others. These libraries are designed for production-grade user-facing services, and can scale to multiple servers and millions of concurrent users. Refer to [Transformers as Backend for Inference Servers](./transformers_as_backends) for usage examples.

-You can also serve transformer models easily using the `transformers serve` CLI. This is ideal for experimentation purposes, or to run models locally for personal and private use.
-
-## TGI
-
-[TGI](https://huggingface.co/docs/text-generation-inference/index) can serve models that aren't [natively implemented](https://huggingface.co/docs/text-generation-inference/supported_models) by falling back on the Transformers implementation of the model. Some of TGIs high-performance features aren't available in the Transformers implementation, but other features like continuous batching and streaming are still supported.
-
-> [!TIP]
-> Refer to the [Non-core model serving](https://huggingface.co/docs/text-generation-inference/basic_tutorials/non_core_models) guide for more details.
-
-Serve a Transformers implementation the same way you'd serve a TGI model.
-
-```docker
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id gpt2
-```
-
-Add `--trust-remote_code` to the command to serve a custom Transformers model.
-
-```docker
-docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id <CUSTOM_MODEL_ID> --trust-remote-code
-```
-
-## vLLM
-
-[vLLM](https://docs.vllm.ai/en/latest/index.html) can also serve a Transformers implementation of a model if it isn't [natively implemented](https://docs.vllm.ai/en/latest/models/supported_models.html#list-of-text-only-language-models) in vLLM.
-
-Many features like quantization, LoRA adapters, and distributed inference and serving are supported for the Transformers implementation.
-
-> [!TIP]
-> Refer to the [Transformers fallback](https://docs.vllm.ai/en/latest/models/supported_models.html#transformers-fallback) section for more details.
-
-By default, vLLM serves the native implementation and if it doesn't exist, it falls back on the Transformers implementation. But you can also set `--model-impl transformers` to explicitly use the Transformers model implementation.
-
-```shell
-vllm serve Qwen/Qwen2.5-1.5B-Instruct \
-    --task generate \
-    --model-impl transformers
-```
-
-Add the `trust-remote-code` parameter to enable loading a remote code model.
-
-```shell
-vllm serve Qwen/Qwen2.5-1.5B-Instruct \
-    --task generate \
-    --model-impl transformers \
-    --trust-remote-code
-```
+Apart from that you can also serve transformer models easily using the `transformers serve` CLI. This is ideal for experimentation purposes, or to run models locally for personal and private use.

 ## Serve CLI

 > [!WARNING]
 > This section is experimental and subject to change in future versions

-<!-- TODO: LLMs -> models, after we add audio/image input/output support -->
-You can serve LLMs supported by `transformers` with the `transformers serve` CLI. It spawns a local server that offers a chat Completions API compatible with the OpenAI SDK, which is the _de facto_ standard for LLM conversations. This way, you can use the server from many third party applications, or test it using the `transformers chat` CLI ([docs](conversations.md#chat-cli)).
+You can serve models of diverse modalities supported by `transformers` with the `transformers serve` CLI. It spawns a local server that offers compatibility with the OpenAI SDK, which is the _de facto_ standard for LLM conversations and other related tasks. This way, you can use the server from many third party applications, or test it using the `transformers chat` CLI ([docs](conversations.md#chat-cli)).
+
+The server supports the following REST APIs:
+- `/v1/chat/completions`
+- `/v1/responses`
+- `/v1/audio/transcriptions`
+- `/v1/models`

 To launch a server, simply use the `transformers serve` CLI command:

@ -109,7 +69,7 @@ The server is also an MCP client, so it can interact with MCP tools in agentic u
 <!-- TODO: example with a minimal python example, and explain that it is possible to pass a full generation config in the request -->


-### Usage example 1: apps with local requests (feat. Jan)
+### Usage example 1: chat with local requests (feat. Jan)

 This example shows how to use `transformers serve` as a local LLM provider for the [Jan](https://jan.ai/) app. Jan is a ChatGPT-alternative graphical interface, fully running on your machine. The requests to `transformers serve` come directly from the local app -- while this section focuses on Jan, you can extrapolate some instructions to other apps that make local requests.

@ -139,17 +99,17 @@ ssh -N -f -L 8000:localhost:8000 your_server_account@your_server_IP -p port_to_s
 Port forwarding is not Jan-specific: you can use it to connect `transformers serve` running in a different machine with an app of your choice.


-### Usage example 2: apps with external requests (feat. Cursor)
+### Usage example 2: chat with external requests (feat. Cursor)

 This example shows how to use `transformers serve` as a local LLM provider for [Cursor](https://cursor.com/), the popular IDE. Unlike in the previous example, requests to `transformers serve` will come from an external IP (Cursor's server IPs), which requires some additional setup. Furthermore, some of Cursor's requests require [CORS](https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/CORS), which is disabled by default for security reasons.

-To launch our server with CORS enabled, run
+To launch a server with CORS enabled, run

 ```shell
 transformers serve --enable-cors
 ```

-We'll also need to expose our server to external IPs. A potential solution is to use [`ngrok`](https://ngrok.com/), which has a permissive free tier. After setting up your `ngrok` account and authenticating on your server machine, you run
+You'll also need to expose your server to external IPs. A potential solution is to use [`ngrok`](https://ngrok.com/), which has a permissive free tier. After setting up your `ngrok` account and authenticating on your server machine, you run

 ```shell
 ngrok http [port]
@ -161,7 +121,7 @@ where `port` is the port used by `transformers serve` (`8000` by default). On th
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_serve_ngrok.png"/>
 </h3>

-We're now ready to set things up on the app side! In Cursor, while we can't set a new provider, we can change the endpoint for OpenAI requests in the model selection settings. First, navigate to "Settings" > "Cursor Settings", "Models" tab, and expand the "API Keys" collapsible. To set our `transformers serve` endpoint, follow this order:
+You're now ready to set things up on the app side! In Cursor, while you can't set a new provider, you can change the endpoint for OpenAI requests in the model selection settings. First, navigate to "Settings" > "Cursor Settings", "Models" tab, and expand the "API Keys" collapsible. To set your `transformers serve` endpoint, follow this order:
 1. Unselect ALL models in the list above (e.g. `gpt4`, ...);
 2. Add and select the model you want to use (e.g. `Qwen/Qwen3-4B`)
 3. Add some random text to OpenAI API Key. This field won't be used, but it can’t be empty;
@ -225,3 +185,26 @@ Image URL: https://evalstate-flux1-schnell.hf.space/gradio_api/file=/tmp/gradio/

 I have generated an image of a cat on the moon using the Flux 1 Schnell Image Generator. The image is 1024x1024 pixels and was created with 4 inference steps. Let me know if you would like to make any changes or need further assistance!
 ```
+
+### Usage example 4: speech to text transcription (feat. Open WebUI)
+
+This guide shows how to do audio transcription for chat purposes, using `transformers serve` and [Open WebUI](https://openwebui.com/). This guide assumes you have Open WebUI installed on your machine and ready to run. Please refer to the examples above to use the text functionalities of `transformer serve` with Open WebUI -- the instructions are the same.
+
+To start, let's launch the server. Some of Open WebUI's requests require [CORS](https://developer.mozilla.org/en-US/docs/Web/HTTP/Guides/CORS), which is disabled by default for security reasons, so you need to enable it:
+
+```shell
+transformers serve --enable-cors
+```
+
+Before you can speak into Open WebUI, you need to update its settings to use your server for speech to text (STT) tasks. Launch Open WebUI, and navigate to the audio tab inside the admin settings. If you're using Open WebUI with the default ports, [this link (default)](http://localhost:3000/admin/settings/audio) or [this link (python deployment)](http://localhost:8080/admin/settings/audio) will take you there. Do the following changes there:
+1. Change the type of "Speech-to-Text Engine" to "OpenAI";
+2. Update the address to your server's address -- `http://localhost:8000/v1` by default;
+3. Type your model of choice into the "STT Model" field, e.g. `openai/whisper-large-v3` ([available models](https://huggingface.co/models?pipeline_tag=automatic-speech-recognition&sort=trending)).
+
+If you've done everything correctly, the audio tab should look like this
+
+<h3 align="center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers_openwebui_stt_settings.png"/>
+</h3>
+
+You're now ready to speak! Open a new chat, utter a few words after hitting the microphone button, and you should see the corresponding text on the chat input after the model transcribes it.
--- a/docs/source/en/tasks/semantic_segmentation.md
+++ b/docs/source/en/tasks/semantic_segmentation.md
@ -289,7 +289,7 @@ You could also create and use your own dataset if you prefer to train with the [
          }
     )

-     # step 3: push to Hub (assumes you have ran the huggingface-cli login command in a terminal/notebook)
+     # step 3: push to Hub (assumes you have ran the hf auth login command in a terminal/notebook)
     dataset.push_to_hub("your-name/dataset-repo")

     # optionally, you can push to a private repo on the Hub
--- a/docs/source/en/trainer.md
+++ b/docs/source/en/trainer.md
@ -187,13 +187,13 @@ from torch import nn
 from transformers import Trainer

 class CustomTrainer(Trainer):
-    def compute_losss(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], return_outputs: bool = False num_items_in_batch: Optional[torch.Tensor] = None):
+    def compute_loss(self, model: nn.Module, inputs: dict[str, Union[torch.Tensor, Any]], return_outputs: bool = False num_items_in_batch: Optional[torch.Tensor] = None):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss for 3 labels with different weights
-        reduction = "mean" if num_items_in_batch is not None else "sum"
+        reduction = "sum" if num_items_in_batch is not None else "mean"
        loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 2.0, 3.0], device=model.device, reduction=reduction))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        if num_items_in_batch is not None:
--- a/docs/source/en/training.md
+++ b/docs/source/en/training.md
@ -74,7 +74,7 @@ model = AutoModelForSequenceClassification.from_pretrained("google-bert/bert-bas
 ```

 > [!TIP]
-> The message above is a reminder that the models pretrained head is discarded and replaced with a randomly initialized classification head. The randomly initialized head needs to be fine-tuned on your specific task to output meanginful predictions.
+> The message above is a reminder that the models pretrained head is discarded and replaced with a randomly initialized classification head. The randomly initialized head needs to be fine-tuned on your specific task to output meaningful predictions.

 With the model loaded, set up your training hyperparameters in [`TrainingArguments`]. Hyperparameters are variables that control the training process - such as the learning rate, batch size, number of epochs - which in turn impacts model performance. Selecting the correct hyperparameters is important and you should experiment with them to find the best configuration for your task.

--- a/docs/source/en/transformers_as_backend.md
+++ b/docs/source/en/transformers_as_backend.md
@ -0,0 +1,254 @@
+<!--Copyright 2025 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Inference server backends
+
+Transformers' models are compatible with different inference servers like vLLM and SGLang. Instead of implementing a model for each inference server, you only need one model, which can be plugged into any inference server. It simplifies maintenance and makes it easy for users to use different inference servers for different use cases.
+
+With Transformers as a backend, you can also serve any model - including custom and Hub-hosted models - without waiting for native support.
+
+This guide shows how to use Transformers' models as a backend to some popular inference servers and how to build a model that supports all inference servers.
+
+## vLLM
+
+[vLLM](https://github.com/vllm-project/vllm) is a high-performance inference engine optimized for serving LLMs at scale. It supports many Transformers' models, including all decoder-only LLMs and several vision-language models (VLMs). VLMs currently support image inputs only, with video support planned.
+
+vLLM automatically selects the best backend, and if a model isn’t natively supported, it falls back to the Transformers model. To explicitly use a Transformers' model, set `model_impl="transformers"`.
+
+```python
+from vllm import LLM
+llm = LLM(model="meta-llama/Llama-3.2-1B", model_impl="transformers")
+```
+Add `--model-impl transformers` to `vllm serve` to launch a server with a Transformers' model.
+
+```bash
+vllm serve meta-llama/Llama-3.2-1B \
+    --task generate \
+    --model-impl transformers
+```
+
+Refer to the [vLLM docs](https://docs.vllm.ai/en/latest/models/supported_models.html#transformers) for more usage examples and tips on using a Transformers as the backend.
+
+
+## SGLang
+
+[SGLang](https://github.com/InternLM/sglang) is a high-performance, OpenAI-compatible server and runtime designed for chat-based LLMs. It offers fast inference, role-based conversation handling, and support for custom pipelines, making it great for building real-world LLM apps.
+
+SGLang automatically falls back to the Transformers backend if a model isn’t natively supported. To explicitly use a Transformers' model, set `impl="transformers"`.
+
+```python
+import sglang as sgl
+
+llm = sgl.Engine("meta-llama/Llama-3.2-1B-Instruct", impl="transformers")
+print(llm.generate(["The capital of France is"], {"max_new_tokens": 20})[0])
+```
+
+Add `impl transformers` to `sglang.launch_server` to launch a server with a Transformers' model.
+          
+      
+    
+    
+  
+
+
+```bash
+python3 -m sglang.launch_server \
+  --model-path kyutai/helium-1-preview-2b \
+  --impl transformers \
+  --host 0.0.0.0 \
+  --port 30000
+```
+
+Refer to the [SGLang docs](https://docs.sglang.ai/supported_models/transformers_fallback.html) for more usage examples and tips on using a Transformers as the backend.
+
+## TGI
+
+[TGI](https://huggingface.co/docs/text-generation-inference/index) can serve models that aren't [natively implemented](https://huggingface.co/docs/text-generation-inference/supported_models) by falling back on the Transformers implementation of the model. Some of TGIs high-performance features aren't available in the Transformers implementation, but other features like continuous batching and streaming are still supported.
+
+> [!TIP]
+> Refer to the [Non-core model serving](https://huggingface.co/docs/text-generation-inference/basic_tutorials/non_core_models) guide for more details.
+
+Serve a Transformers implementation the same way you'd serve a TGI model.
+
+```docker
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id gpt2
+```
+
+Add `--trust-remote_code` to the command to serve a custom Transformers model.
+
+```docker
+docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id <CUSTOM_MODEL_ID> --trust-remote-code
+```
+
+## Building a compatible model backend
+
+To ensure a model is compatible as a backend to any inference server, make sure it is compatible with Transformers and supports the [AttentionInterface](./attention_interface) class.
+
+1. A model must be Transformers-compatible following the model [contribution guidelines](./add_new_model) or the [custom model contribution guidelines](./custom_models). Make sure the model has a valid `config.json` in its directory and a valid `auto_map` field pointing to the model class in the config.
+
+2. A model's attentions needs to be configurable with the [AttentionInterface](./attention_interface) to allow custom and optimized attention functions. This is important for enabling the performance features of the different inference servers.
+   Use `ALL_ATTENTION_FUNCTIONS` when defining the attention layer and propagate `**kwargs**` from the base `MyModel` class to the attention layers. Set `_supports_attention_backend` to `True` in [`PreTrainedModel`]. Expand the code below for an example.
+
+<details>
+<summary>modeling_my_model.py</summary>
+
+```python
+
+from transformers import PreTrainedModel
+from torch import nn
+
+class MyAttention(nn.Module):
+
+    def forward(self, hidden_states, **kwargs):
+        ...
+        attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            **kwargs,
+        )
+        ...
+
+class MyModel(PreTrainedModel):
+    _supports_attention_backend = True
+```
+
+</details>
+
+3. This step is optional, but if you want to support tensor parallel and/or pipeline parallel features, add the following keys to the config.
+    * `base_model_tp_plan` enables [tensor parallelism](./perf_infer_gpu_multi) by mapping fully qualified layer name patterns to tensor parallel styles. Only the `"colwise"` and `"rowwise"` partitioning strategies are currently supported.
+    * `base_model_pp_plan` enables pipeline parallelism by mapping direct child layer names to tuples of lists of strings. The list in the first element of the tuple contains the names of the input arguments. The list in the last element of the tuple contains the names of the variables the layer outputs to in the modeling code.
+ 
+ Expand the code below for an example.
+
+<details>
+<summary>configuration_my_model.py</summary>
+
+```python
+
+from transformers import PretrainedConfig
+
+class MyConfig(PretrainedConfig):
+    base_model_tp_plan = {
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.gate_proj": "colwise",
+        "layers.*.mlp.up_proj": "colwise",
+        "layers.*.mlp.down_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+```
+</details>
+
+### Multimodal models
+
+For multimodal models, you need to include a few more changes on top of the general recommendations. These rules ensure that your model integrates properly with multimodal data.
+
+1. A multimodal model requires a base `MyMultiModalModel` class to handle multimodal fusion without a language modeling head and a separate generative class that adds a head.
+
+    The base model needs to implement the `get_image_features()` method to accept image pixel values and return encoded outputs. These are later merged with the language embeddings and don't require any postprocessing. The shape of the returned features must match the number of input images. If a vision encoder returns variable-length outputs (patch-based), return a list of 2D tensors of size `(image_seq_len, image_dim)` for each image.
+
+Expand the code below for an example.
+
+<details>
+<summary>modeling_my_multimodal_model.py</summary>
+
+```python
+from transformers.generation import GenerationMixin
+
+class MyMultimodalModel(MyMultimodalPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.language_model = AutoModel.from_config(config.text_config)
+        self.vision_tower = AutoModel.from_config(config.vision_config)
+        self.multimodal_projection = nn.Linear(vision_dim, text_dim)
+    
+    def get_image_features(self, pixel_values):
+        return self.vision_tower(pixel_values).last_hidden_states
+    
+    def forward(self, input_ids, pixel_values, **kwargs):
+        # process your inputs
+        return MyModelOutputWithPast(
+            last_hidden_state=last_hidden_state,
+            image_hidden_states=image_features,
+            [...]
+        )
+
+class MyMultimodalModelForConditionalGeneration(MyMultimodalPreTrainedModel, GenerationMixin):
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = MyMultimodalModel(config)
+        self.lm_head = nn.Linear(hidden_dim, vocab_size)
+```
+</details>
+
+
+2. A multimodal model config must be nested with the following fields.
+    * text_config: decoder language model config
+    * vision_config: vision encoder config
+    * image_token_id: ID of the image placeholder token used in the input to indicate image position
+
+3. A multimodal model's processing class must have the `self.image_token` and `self.image_token_ids` attributes. These are placeholder tokens used to indicate image positions in the input. The placeholder token is the same token used in the input prompt and to mask scatter image features.
+
+   The processing class also needs ` self._get_num_multimodal_tokens` method to compute the number of placeholder tokens needed for multimodal inputs with given sizes and to return a [`MultiModalData`] object. The placeholder for row and column tokens don't count as image placeholders. Only the tokens that are actually replaced by image features are computed.
+
+Finally, when `return_mm_token_type_ids=True`, the class has to return `mm_token_type_ids` to indicate whether each position is a text token (`0`) or image placeholder token (`1`). Each image's token type IDs must be contiguous with no breaks between consecutive ones.
+
+Expand the code below for an example.
+
+<details>
+<summary>processing_my_multimodal_model.py</summary>
+
+```python
+class MyMultimodalProcessor(ProcessorMixin):
+
+    def __call__(self, images=None, text=None, **kwargs):
+        if return_mm_token_type_ids:
+            mm_token_type_ids = np.zeros_like(input_ids)
+            mm_token_type_ids[input_ids == self.image_token_id] = 1
+            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
+
+    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+        Args:
+            image_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (height, width) per each image.
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
+        """
+        vision_data = {}
+        if image_sizes is not None:
+            num_image_tokens = [256] * len(image_sizes) # 256 placeholder tokens for each image always
+            num_image_patches = [1] * len(image_sizes) # no patching, thus each image is processed as a single base image
+            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+        return MultiModalData(**vision_data)
+```
+</details>
+
+## Resources
+
+* Read the [Transformers backend integration in vLLM](https://blog.vllm.ai/2025/04/11/transformers-backend.html) blog post for more details about the Transformers backend in vLLM.
+* Read the [Transformers backend integration in SGLang](https://huggingface.co/blog/transformers-backend-sglang) blog post for more details about the Transformers backend in SGLang.
--- a/docs/source/es/_toctree.yml
+++ b/docs/source/es/_toctree.yml
@ -38,6 +38,8 @@
    sections:
    - local: tasks/asr
      title: Reconocimiento automático del habla
+    - local: tasks/audio_classification
+      title: Clasificación de audio
    title: Audio
  - isExpanded: false
    sections:
--- a/docs/source/es/custom_models.md
+++ b/docs/source/es/custom_models.md
@ -285,7 +285,7 @@ resnet50d.model.load_state_dict(pretrained_model.state_dict())
 Ahora, para enviar el modelo al Hub, asegúrate de haber iniciado sesión. Ejecuta en tu terminal:

 ```bash
-huggingface-cli login
+hf auth login
 ```

 o desde un _notebook_:
--- a/docs/source/es/model_sharing.md
+++ b/docs/source/es/model_sharing.md
@ -56,7 +56,7 @@ Los archivos son editados fácilmente dentro de un repositorio. Incluso puedes o
 Antes de compartir un modelo al Hub necesitarás tus credenciales de Hugging Face. Si tienes acceso a una terminal ejecuta el siguiente comando en el entorno virtual donde 🤗 Transformers esté instalado. Esto guardará tu token de acceso dentro de tu carpeta cache de Hugging Face (~/.cache/ by default):

 ```bash
-huggingface-cli login
+hf auth login
 ```

 Si usas un notebook como Jupyter o Colaboratory, asegúrate de tener instalada la biblioteca [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library). Esta biblioteca te permitirá interactuar por código con el Hub.
--- a/docs/source/es/run_scripts.md
+++ b/docs/source/es/run_scripts.md
@ -324,7 +324,7 @@ python examples/pytorch/summarization/run_summarization.py
 Todos los scripts pueden cargar tu modelo final en el [Model Hub](https://huggingface.co/models). Asegúrate de haber iniciado sesión en Hugging Face antes de comenzar:

 ```bash
-huggingface-cli login
+hf auth login
 ```

 Luego agrega el argumento `push_to_hub` al script. Este argumento creará un repositorio con tu nombre de usuario Hugging Face y el nombre de la carpeta especificado en `output_dir`.
--- a/docs/source/es/tasks/audio_classification.md
+++ b/docs/source/es/tasks/audio_classification.md
@ -0,0 +1,323 @@
+<!--Copyright 2022 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Clasificación de audio
+
+[[open-in-colab]]
+
+<Youtube id="KWwzcmG98Ds"/>
+
+Clasificación de audio - al igual que con texto — asigna una etiqueta de clase como salida desde las entradas de datos. La diferencia única es en vez de entrada de texto, tiene formas de onda de audio. Algunas aplicaciones prácticas de clasificación incluye identificar la intención del hablante, identificación del idioma, y la clasificación de animales por sus sonidos.
+
+En esta guía te mostraremos como: 
+
+1. Hacer fine-tuning al modelo [Wav2Vec2](https://huggingface.co/facebook/wav2vec2-base) en el dataset [MInDS-14](https://huggingface.co/datasets/PolyAI/minds14) para clasificar la intención del hablante.
+2. Usar tu modelo ajustado para tareas de inferencia.
+
+
+<Tip>
+
+Consulta la [página de la tarea](https://huggingface.co/tasks/audio-classification) de clasificación de audio para acceder a más información sobre los modelos, datasets, y métricas asociados.
+
+</Tip>
+
+Antes de comenzar, asegúrate de haber instalado todas las librerías necesarias:
+
+```bash
+pip install transformers datasets evaluate
+```
+
+Te aconsejamos iniciar sesión con tu cuenta de Hugging Face para que puedas subir tu modelo y compartirlo con la comunidad. Cuando se te solicite, ingresa tu token para iniciar sesión:
+
+```py
+>>> from huggingface_hub import notebook_login
+
+>>> notebook_login()
+```
+
+## Carga el dataset MInDS-14
+
+Comencemos cargando el dataset MInDS-14 con la biblioteca de 🤗 Datasets:
+
+```py
+>>> from datasets import load_dataset, Audio
+
+>>> minds = load_dataset("PolyAI/minds14", name="en-US", split="train")
+```
+
+Divide el conjunto de `train` (entrenamiento) en un conjunto de entrenamiento y prueba mas pequeño con el método [`~datasets.Dataset.train_test_split`]. De esta forma, tendrás la oportunidad para experimentar y asegúrate de que todo funcióne antes de invertir más tiempo entrenando con el dataset entero.
+
+```py
+>>> minds = minds.train_test_split(test_size=0.2)
+```
+
+Ahora échale un vistazo al dataset:
+
+```py
+>>> minds
+DatasetDict({
+    train: Dataset({
+        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
+        num_rows: 450
+    })
+    test: Dataset({
+        features: ['path', 'audio', 'transcription', 'english_transcription', 'intent_class', 'lang_id'],
+        num_rows: 113
+    })
+})
+```
+
+Aunque el dataset contiene mucha información útil, como los campos `land_id` (identificador del lenguaje) y `english_transcription` (transcripción al inglés), en esta guía nos enfocaremos en los campos `audio` y `intent_class` (clase de intención). Puedes quitar las otras columnas con cel método [`~datasets.Dataset.remove_columns`]:
+
+```py
+>>> minds = minds.remove_columns(["path", "transcription", "english_transcription", "lang_id"])
+```
+
+Aquí está un ejemplo:
+
+```py
+>>> minds["train"][0]
+{'audio': {'array': array([ 0.        ,  0.        ,  0.        , ..., -0.00048828,
+         -0.00024414, -0.00024414], dtype=float32),
+  'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602b9a5fbb1e6d0fbce91f52.wav',
+  'sampling_rate': 8000},
+ 'intent_class': 2}
+```
+
+Hay dos campos:
+
+- `audio`: un `array` (arreglo) unidimensional de la señal de voz que se obtiene al cargar y volver a muestrear el archivo de audio.
+- `intent_class`: representa el identificador de la clase de la intención del hablante.
+
+Crea un diccionario que asigne el nombre de la etiqueta a un número entero y viceversa para facilitar la obtención del nombre de la etiqueta a partir de su identificador.
+
+```py
+>>> labels = minds["train"].features["intent_class"].names
+>>> label2id, id2label = dict(), dict()
+>>> for i, label in enumerate(labels):
+...     label2id[label] = str(i)
+...     id2label[str(i)] = label
+```
+
+Ahora puedes convertir el identificador de la etiqueta a un nombre de etiqueta:
+
+```py
+>>> id2label[str(2)]
+'app_error'
+```
+
+## Preprocesamiento
+
+Seguidamente carga el feature extractor (función de extracción de características) de Wav2Vec para procesar la señal de audio:
+
+```py
+>>> from transformers import AutoFeatureExtractor
+
+>>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-base")
+```
+
+El dataset MInDS-14 tiene una tasa de muestreo de 8kHz (puedes encontrar esta información en su [tarjeta de dataset](https://huggingface.co/datasets/PolyAI/minds14)), lo que significa que tendrás que volver a muestrear el dataset a 16kHZ para poder usar el modelo Wav2Vec2 preentranado:
+
+```py
+>>> minds = minds.cast_column("audio", Audio(sampling_rate=16_000))
+>>> minds["train"][0]
+{'audio': {'array': array([ 2.2098757e-05,  4.6582241e-05, -2.2803260e-05, ...,
+         -2.8419291e-04, -2.3305941e-04, -1.1425107e-04], dtype=float32),
+  'path': '/root/.cache/huggingface/datasets/downloads/extracted/f14948e0e84be638dd7943ac36518a4cf3324e8b7aa331c5ab11541518e9368c/en-US~APP_ERROR/602b9a5fbb1e6d0fbce91f52.wav',
+  'sampling_rate': 16000},
+ 'intent_class': 2}
+```
+
+Ahora vamos a crear una función de preprocesamiento:
+
+1. Invoque la columna `audio` para cargar, y si es necesario, volver a muestrear al archivo de audio.
+2. Comprueba si la frecuencia de muestreo del archivo de audio coincide con la frecuencia de muestreo de los datos de audio con los que se entrenó previamente el modelo. Puedes encontrar esta información en la [tarjeta de modelo](https://huggingface.co/facebook/wav2vec2-base) de Wav2Vec2.
+3. Establece una longitud de entrada máxima para agrupar entradas más largas sin truncarlas.
+
+```py
+>>> def preprocess_function(examples):
+...     audio_arrays = [x["array"] for x in examples["audio"]]
+...     inputs = feature_extractor(
+...         audio_arrays, sampling_rate=feature_extractor.sampling_rate, max_length=16000, truncation=True
+...     )
+...     return inputs
+```
+
+Para aplicar la función de preprocesamiento a todo el dataset, puedes usar la función [`~datasets.Dataset.map`] de 🤗 Datasets. Acelera la función `map` haciendo `batched=True` para procesar varios elementos del dataset a la vez. Quitas las columnas que no necesites con el método `[~datasets.Dataset.remove_columns]` y cambia el nombre de `intent_class` a `label`, como requiere el modelo.
+
+```py
+>>> encoded_minds = minds.map(preprocess_function, remove_columns="audio", batched=True)
+>>> encoded_minds = encoded_minds.rename_column("intent_class", "label")
+```
+
+## Evaluación
+A menudo es útil incluir una métrica durante el entrenamiento para evaluar el rendimiento de tu modelo. Puedes cargar un método de evaluación rapidamente con la biblioteca de 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index). Para esta tarea, puedes usar la métrica de [exactitud](https://huggingface.co/spaces/evaluate-metric/accuracy) (accuracy). Puedes ver la [guía rápida](https://huggingface.co/docs/evaluate/a_quick_tour) de 🤗 Evaluate para aprender más de cómo cargar y computar una métrica:
+
+```py
+>>> import evaluate
+
+>>> accuracy = evaluate.load("accuracy")
+```
+
+Ahora crea una función que le pase tus predicciones y etiquetas a [`~evaluate.EvaluationModule.compute`] para calcular la exactitud:
+
+```py
+>>> import numpy as np
+
+
+>>> def compute_metrics(eval_pred):
+...     predictions = np.argmax(eval_pred.predictions, axis=1)
+...     return accuracy.compute(predictions=predictions, references=eval_pred.label_ids)
+```
+
+Ahora tu función `compute_metrics` (computar métricas) está lista y podrás usarla cuando estés preparando tu entrenamiento.
+
+## Entrenamiento
+
+<frameworkcontent>
+<pt>
+<Tip>
+
+¡Si no tienes experiencia haciéndo *fine-tuning* a un modelo con el [`Trainer`], échale un vistazo al tutorial básico [aquí](../training#train-with-pytorch-trainer)!
+
+</Tip>
+
+¡Ya puedes empezar a entrenar tu modelo! Carga Wav2Vec2 con [`AutoModelForAudioClassification`] junto con el especifica el número de etiquetas, y pasa al modelo los *mappings* entre el número entero de etiqueta y la clase de etiqueta.
+
+```py
+>>> from transformers import AutoModelForAudioClassification, TrainingArguments, Trainer
+
+>>> num_labels = len(id2label)
+>>> model = AutoModelForAudioClassification.from_pretrained(
+...     "facebook/wav2vec2-base", num_labels=num_labels, label2id=label2id, id2label=id2label
+... )
+```
+
+Al llegar a este punto, solo quedan tres pasos:
+
+1. Define tus hiperparámetros de entrenamiento en [`TrainingArguments`]. El único parámetro obligatorio es `output_dir` (carpeta de salida), el cual especifica dónde guardar tu modelo. Puedes subir este modelo al Hub haciendo `push_to_hub=True` (debes haber iniciado sesión en Hugging Face para subir tu modelo). Al final de cada época, el [`Trainer`] evaluará la exactitud y guardará el punto de control del entrenamiento.
+2. Pásale los argumentos del entrenamiento al [`Trainer`] junto con el modelo, el dataset, el tokenizer, el data collator y la función `compute_metrics`.
+3. Llama el método [`~Trainer.train`] para hacerle fine-tuning a tu modelo.
+
+```py
+>>> training_args = TrainingArguments(
+...     output_dir="my_awesome_mind_model",
+...     eval_strategy="epoch",
+...     save_strategy="epoch",
+...     learning_rate=3e-5,
+...     per_device_train_batch_size=32,
+...     gradient_accumulation_steps=4,
+...     per_device_eval_batch_size=32,
+...     num_train_epochs=10,
+...     warmup_ratio=0.1,
+...     logging_steps=10,
+...     load_best_model_at_end=True,
+...     metric_for_best_model="accuracy",
+...     push_to_hub=True,
+... )
+
+>>> trainer = Trainer(
+...     model=model,
+...     args=training_args,
+...     train_dataset=encoded_minds["train"],
+...     eval_dataset=encoded_minds["test"],
+...     processing_class=feature_extractor,
+...     compute_metrics=compute_metrics,
+... )
+
+>>> trainer.train()
+```
+
+Una vez que el entrenamiento haya sido completado, comparte tu modelo en el Hub con el método [`~transformers.Trainer.push_to_hub`] para que todo el mundo puede usar tu modelo.
+
+```py
+>>> trainer.push_to_hub()
+```
+</pt>
+</frameworkcontent>
+
+<Tip>
+
+Para ver un ejemplo más detallado de comó hacerle fine-tuning a un modelo para clasificación, échale un vistazo al correspondiente [PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/audio_classification.ipynb).
+
+</Tip>
+
+## Inference
+
+¡Genial, ahora que le has hecho *fine-tuned* a un modelo, puedes usarlo para hacer inferencia!
+
+Carga el archivo de audio para hacer inferencia. Recuerda volver a muestrear la tasa de muestreo del archivo de audio para que sea la misma del modelo si es necesario.
+
+```py
+>>> from datasets import load_dataset, Audio
+
+>>> dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")
+>>> dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
+>>> sampling_rate = dataset.features["audio"].sampling_rate
+>>> audio_file = dataset[0]["audio"]["path"]
+```
+
+La manera más simple de probar tu modelo para hacer inferencia es usarlo en un [`pipeline`]. Puedes instanciar un `pipeline` para clasificación de audio con tu modelo y pasarle tu archivo de audio:
+
+```py
+>>> from transformers import pipeline
+
+>>> classifier = pipeline("audio-classification", model="stevhliu/my_awesome_minds_model")
+>>> classifier(audio_file)
+[
+    {'score': 0.09766869246959686, 'label': 'cash_deposit'},
+    {'score': 0.07998877018690109, 'label': 'app_error'},
+    {'score': 0.0781070664525032, 'label': 'joint_account'},
+    {'score': 0.07667109370231628, 'label': 'pay_bill'},
+    {'score': 0.0755252093076706, 'label': 'balance'}
+]
+```
+
+También puedes replicar de forma manual los resultados del `pipeline` si lo deseas:
+
+<frameworkcontent>
+<pt>
+Carga el feature extractor para preprocesar el archivo de audio y devuelve el `input` como un tensor de PyTorch:
+
+```py
+>>> from transformers import AutoFeatureExtractor
+
+>>> feature_extractor = AutoFeatureExtractor.from_pretrained("stevhliu/my_awesome_minds_model")
+>>> inputs = feature_extractor(dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
+```
+
+Pásale tus entradas al modelo y devuelve los logits:
+
+```py
+>>> from transformers import AutoModelForAudioClassification
+
+>>> model = AutoModelForAudioClassification.from_pretrained("stevhliu/my_awesome_minds_model")
+>>> with torch.no_grad():
+...     logits = model(**inputs).logits
+```
+
+Obtén los identificadores de los clases con mayor probabilidad y usa el *mapping* `id2label` del modelo para convertirle a una etiqueta:
+
+```py
+>>> import torch
+
+>>> predicted_class_ids = torch.argmax(logits).item()
+>>> predicted_label = model.config.id2label[predicted_class_ids]
+>>> predicted_label
+'cash_deposit'
+```
+</pt>
+</frameworkcontent>
--- a/docs/source/fr/run_scripts_fr.md
+++ b/docs/source/fr/run_scripts_fr.md
@ -327,7 +327,7 @@ python examples/pytorch/summarization/run_summarization.py
 Tous les scripts peuvent télécharger votre modèle final sur le Model Hub. Assurez-vous que vous êtes connecté à Hugging Face avant de commencer :

 ```bash
-huggingface-cli login
+hf auth login
 ```

 Ensuite, ajoutez l'argument `push_to_hub` au script. Cet argument créera un dépôt avec votre nom d'utilisateur Hugging Face et le nom du dossier spécifié dans `output_dir`.
--- a/docs/source/it/custom_models.md
+++ b/docs/source/it/custom_models.md
@ -285,7 +285,7 @@ resnet50d.model.load_state_dict(pretrained_model.state_dict())
 Adesso, per inviare il modello all'Hub, assicurati di aver effettuato l'accesso. Lancia dal tuo terminale:

 ```bash
-huggingface-cli login
+hf auth login
 ```

 O da un notebook:
--- a/docs/source/it/model_sharing.md
+++ b/docs/source/it/model_sharing.md
@ -56,7 +56,7 @@ Anche i file possono essere modificati facilmente in un repository ed è possibi
 Prima di condividere un modello nell'Hub, hai bisogno delle tue credenziali di Hugging Face. Se hai accesso ad un terminale, esegui il seguente comando nell'ambiente virtuale in cui è installata la libreria 🤗 Transformers. Questo memorizzerà il tuo token di accesso nella cartella cache di Hugging Face (di default `~/.cache/`):

 ```bash
-huggingface-cli login
+hf auth login
 ```

 Se stai usando un notebook come Jupyter o Colaboratory, assicurati di avere la libreria [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) installata. Questa libreria ti permette di interagire in maniera programmatica con l'Hub.
--- a/docs/source/it/run_scripts.md
+++ b/docs/source/it/run_scripts.md
@ -324,7 +324,7 @@ python examples/pytorch/summarization/run_summarization.py
 Tutti gli script possono caricare il tuo modello finale al [Model Hub](https://huggingface.co/models). Prima di iniziare, assicurati di aver effettuato l'accesso su Hugging Face:

 ```bash
-huggingface-cli login
+hf auth login
 ```

 Poi, aggiungi l'argomento `push_to_hub` allo script. Questo argomento consentirà di creare un repository con il tuo username Hugging Face e la cartella specificata in `output_dir`.
--- a/docs/source/ja/custom_models.md
+++ b/docs/source/ja/custom_models.md
@ -270,7 +270,7 @@ resnet50d.model.load_state_dict(pretrained_model.state_dict())
 モデルをHubに送信するには、ログインしていることを確認してください。ターミナルで次のコマンドを実行します：

 ```bash
-huggingface-cli login
+hf auth login
 ```

 またはノートブックから：
--- a/docs/source/ja/model_sharing.md
+++ b/docs/source/ja/model_sharing.md
@ -56,7 +56,7 @@ Model Hubの組み込みバージョニングはgitおよび[git-lfs](https://gi
 モデルをHubに共有する前に、Hugging Faceの認証情報が必要です。ターミナルへのアクセス権がある場合、🤗 Transformersがインストールされている仮想環境で以下のコマンドを実行します。これにより、アクセストークンがHugging Faceのキャッシュフォルダに保存されます（デフォルトでは `~/.cache/` に保存されます）：

 ```bash
-huggingface-cli login
+hf auth login
 ```

 JupyterやColaboratoryのようなノートブックを使用している場合、[`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library)ライブラリがインストールされていることを確認してください。
--- a/docs/source/ja/run_scripts.md
+++ b/docs/source/ja/run_scripts.md
@ -337,7 +337,7 @@ python examples/pytorch/summarization/run_summarization.py
 すべてのスクリプトは、最終的なモデルを [Model Hub](https://huggingface.co/models) にアップロードできます。開始する前に Hugging Face にログインしていることを確認してください。

 ```bash
-huggingface-cli login
+hf auth login
 ```

 次に、スクリプトに `push_to_hub` 引数を追加します。この引数は、Hugging Face のユーザー名と `output_dir` で指定したフォルダ名でリポジトリを作成します。
--- a/docs/source/ko/_toctree.yml
+++ b/docs/source/ko/_toctree.yml
--- a/docs/source/ko/custom_models.md
+++ b/docs/source/ko/custom_models.md
@ -277,7 +277,7 @@ resnet50d.model.load_state_dict(pretrained_model.state_dict())
 터미널에서 다음 코드를 실행해 확인할 수 있습니다:

 ```bash
-huggingface-cli login
+hf auth login
 ```

 주피터 노트북의 경우에는 다음과 같습니다:
--- a/docs/source/ko/main_classes/executorch.md
+++ b/docs/source/ko/main_classes/executorch.md
--- a/docs/source/ko/internal/generation_utils.md
+++ b/docs/source/ko/internal/generation_utils.md
@ -342,66 +342,92 @@ generation_output[:2]

 ## 캐시 (Caches) [[transformers.Cache]]

-[[autodoc]] Cache
-    - update
-
-[[autodoc]] CacheConfig
-    - update
-
-[[autodoc]] QuantizedCacheConfig
-    - validate
-
-[[autodoc]] DynamicCache
+[[autodoc]] CacheLayerMixin
    - update
    - get_seq_length
+    - get_mask_sizes
+    - get_max_cache_shape
+    - reset
    - reorder_cache
+
+[[autodoc]] DynamicLayer
+    - update
+    - crop
+    - batch_repeat_interleave
+    - batch_select_indices
+
+[[autodoc]] StaticLayer
+    - update
+
+[[autodoc]] SlidingWindowLayer
+    - update
+
+[[autodoc]] CacheProcessor
+    - pre_update
+    - post_update
+
+[[autodoc]] OffloadedCacheProcessor
+    - pre_update
+
+[[autodoc]] QuantizedCacheProcessor
+    - post_update
+
+[[autodoc]] QuantoQuantizedCacheProcessor
+    - post_update
+
+[[autodoc]] HQQQuantizedCacheProcessor
+    - post_update
+
+[[autodoc]] Cache
+    - update
+    - get_seq_length
+    - get_mask_sizes
+    - get_max_cache_shape
+    - reset
+    - reorder_cache
+    - crop
+    - batch_repeat_interleave
+    - batch_select_indices
+
+[[autodoc]] DynamicCache
    - to_legacy_cache
    - from_legacy_cache

 [[autodoc]] QuantizedCache
-    - update
-    - get_seq_length

 [[autodoc]] QuantoQuantizedCache

+[[autodoc]] QuantoQuantizedCacheProcessor
+
 [[autodoc]] HQQQuantizedCache

+[[autodoc]] HQQQuantizedCacheProcessor
+
 [[autodoc]] OffloadedCache
-    - update
-    - prefetch_layer
-    - evict_previous_layer

 [[autodoc]] StaticCache
-    - update
-    - get_seq_length
-    - reset

 [[autodoc]] OffloadedStaticCache
-    - update
-    - get_seq_length
-    - reset

 [[autodoc]] HybridCache
-    - update
-    - get_seq_length
-    - reset
+
+[[autodoc]] HybridChunkedCache

 [[autodoc]] SlidingWindowCache
-    - update
-    - reset

 [[autodoc]] EncoderDecoderCache
-    - get_seq_length
    - to_legacy_cache
    - from_legacy_cache
-    - reset
-    - reorder_cache

 [[autodoc]] MambaCache
    - update_conv_state
    - update_ssm_state
    - reset

+[[autodoc]] CacheConfig
+
+[[autodoc]] QuantizedCacheConfig
+
 ## 워터마크 유틸리티 (Watermark Utils) [[transformers.WatermarkDetector]]

 [[autodoc]] WatermarkDetector
--- a/docs/source/ko/model_sharing.md
+++ b/docs/source/ko/model_sharing.md
@ -56,7 +56,7 @@ picture-in-picture" allowfullscreen></iframe>
 모델을 허브에 공유하기 전에 Hugging Face 자격 증명이 필요합니다. 터미널에 액세스할 수 있는 경우, 🤗 Transformers가 설치된 가상 환경에서 다음 명령을 실행합니다. 그러면 Hugging Face 캐시 폴더(기본적으로 `~/.cache/`)에 액세스 토큰을 저장합니다:

 ```bash
-huggingface-cli login
+hf auth login
 ```

 Jupyter 또는 Colaboratory와 같은 노트북을 사용 중인 경우, [`huggingface_hub`](https://huggingface.co/docs/hub/adding-a-library) 라이브러리가 설치되었는지 확인하세요. 이 라이브러리를 사용하면 API로 허브와 상호 작용할 수 있습니다.
--- a/docs/source/ko/perf_infer_gpu_multi.md
+++ b/docs/source/ko/perf_infer_gpu_multi.md
@ -0,0 +1,311 @@
+<!--Copyright 2024 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# 분산 추론[[distributed-inference]]
+
+모델이 단일 GPU에 올라가지 않는 경우, [텐서 병렬 처리](./perf_train_gpu_many#tensor-parallelism)를 사용한 분산 추론이 도움이 될 수 있습니다. 텐서 병렬화는 모델을 여러 가속기(CUDA GPU, Intel XPU 등)에 분할하여 행렬 곱셈과 같은 계산을 병렬화합니다. 이를 통해 더 큰 모델을 메모리에 올릴 수 있으며, 각 가속기가 텐서의 일부를 처리하므로 추론 속도가 향상됩니다.
+
+그러나 텐서 병렬화는 통신 오버헤드를 발생시키므로, 빠른 노드 내 통신을 활용할 수 있는 다중 가속기 환경에서 사용하는 것이 가장 효과적입니다. 다중 노드 학습 환경에서는 사용 사례에 따라 파이프라인 병렬화나 데이터 병렬화를 사용하는 것이 더 효율적일 수 있습니다.
+
+> [!TIP]
+> 텐서 병렬화에 대해 더 자세히 알아보려면 [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism)의 텐서 병렬화 섹션을 참조하세요.
+
+아래 목록에서 텐서 병렬 처리를 기본적으로 지원하는 모델을 확인할 수 있습니다. 새로운 모델에 대한 지원을 추가하려면 GitHub 이슈나 풀 리퀘스트를 열어주세요.
+
+<details>
+<summary>지원되는 모델 보기</summary>
+
+* [Cohere](./model_doc/cohere) 및 [Cohere 2](./model_doc/cohere2)
+* [Gemma](./model_doc/gemma) 및 [Gemma 2](./model_doc/gemma2)
+* [GLM](./model_doc/glm)
+* [Granite](./model_doc/granite)
+* [Llama](./model_doc/llama)
+* [Mistral](./model_doc/mistral)
+* [Mixtral](./model_doc/mixtral)
+* [OLMo](./model_doc/olmo) 및 [OLMo2](./model_doc/olmo2)
+* [Phi](./model_doc/phi) 및 [Phi-3](./model_doc/phi3)
+* [Qwen2](./model_doc/qwen2), [Qwen2Moe](./model_doc/qwen2_moe), 및 [Qwen2-VL](./model_doc/qwen2_5_vl)
+* [Starcoder2](./model_doc/starcoder2)
+
+</details>
+
+이 가이드는 Transformers에서 다양한 분할 전략을 사용하여 텐서 병렬화를 활성화하는 방법을 설명합니다.
+
+## 모델 분할[[partitioning-a-model]]
+
+Transformers는 `tp_plan`매개변수를 활용할 수 있는 모델에 대해 텐서 병렬 처리를 지원합니다. 모델 분할 방식은 두 가지가 있습니다.
+
+- `auto` 텐서 병렬화 계획은 사전 정의된 구성을 기반으로 모델(위에 언급된 지원 모델)을 자동으로 분할합니다.
+- 사용자 지정 분할 계획을 직접 정의하여 [~PreTrainedModel.from_pretrained] 메소드의 `tp_plan` 매개변수로 전달할 수 있습니다.
+
+<hfoptions id="sharding">
+<hfoption id="auto plan">
+
+```py
+import os
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct" # 모든 가능한 전략을 시각화하기에 더 좋음
+model_id = "meta-llama/Meta-Llama-3-8B-Instruct"  # 적은 수의 GPU에 더 좋음
+
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan="auto")
+print(model._tp_plan)
+
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
+prompt = "Can I help"
+inputs = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)
+
+# 분산 실행
+outputs = model(inputs)
+```
+
+위의 추론 스크립트를 GPU당 4개 프로세스로 [torchrun](https://pytorch.org/docs/stable/elastic/run.html)에서 실행하세요.
+
+```bash
+torchrun --nproc-per-node 4 demo.py
+```
+
+</hfoption>
+<hfoption id="manual plan">
+
+각 레이어에 대한 텐서 병렬 계획을 `tp_plan`에 정의한 후 [`~PreTrainedModel.from_pretrained`]에 전달하세요. 아래 예시는 열 및 행 분할을 조합하여 사용합니다. 지원되는 다른 분할 전략은 [분할 전략](#partitioning-strategies) 섹션을 참고하세요.
+
+> [!WARNING]
+> 사용자 지정 분할 계획을 수동으로 지정하려면 모델 아키텍처와 분할 전략이 함께 상호 작용하는 방식에 대한 충분한 이해가 필요합니다. 분할 전략을 잘못 설정하면 모델이 매우 느려지거나, 오류가 발생하거나, 부정확한 결과를 낼 수 있습니다. 자세히 알아보려면 [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism)을 참고하세요.
+
+```py
+from transformers import AutoModelForCausalLM
+
+tp_plan = {
+    "model.layers.*.self_attn.q_proj": "colwise",
+    "model.layers.*.self_attn.k_proj": "colwise",
+    "model.layers.*.self_attn.v_proj": "colwise",
+    "model.layers.*.self_attn.o_proj": "rowwise",
+    ...
+}
+
+model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan=tp_plan)
+print(model._tp_plan)
+```
+
+</hfoption>
+</hfoptions>
+
+## 분할 전략[[partitioning-strategies]]
+
+모든 분할 전략은 문자열을 전략 구현에 매핑하는 [`ParallelInterface`] 클래스에서 정의됩니다. 모든 전략은 [`~PreTrainedModel.from_pretrained`]의 `tp_plan`을 통해 설정되므로 이 클래스와 직접 상호 작용할 필요는 없지만, 어떤 전략을 사용할 수 있는지 확인할 때 유용합니다.
+
+```py
+class ParallelInterface(MutableMapping):
+    """
+    허용된 어텐션 함수를 추적하는 딕셔너리 같은 객체입니다. `register()` 호출로 새로운 어텐션 함수를 쉽게 추가할 수 있습니다. 
+    모델이 기존 어텐션 함수(예: `sdpa`)를 로컬에서 덮어쓰려면 `modeling_<model>.py` 내부에서 이 클래스의 새 인스턴스를 선언하고 
+    해당 인스턴스에서 선언해야 합니다.
+    """
+    _global_mapping = {
+        "colwise": ColwiseParallel(),
+        "rowwise": RowwiseParallel(),
+        "colwise_rep": ColwiseParallel(output_layouts=Replicate()),
+        "rowwise_rep": RowwiseParallel(input_layouts=Replicate()),
+        "local_colwise": ColwiseParallel(use_dtensor=False),
+        "local_rowwise": RowwiseParallel(use_dtensor=False),
+        "local": IsolatedParallel(),
+        "gather": GatherParallel(),
+        "local_packed_rowwise": PackedRowwiseParallel(use_dtensor=False),
+        "sequence_parallel": SequenceParallel(),
+        "replicate": ReplicateParallel(),
+    }
+```
+
+각 전략에 대해 자세히 알아보려면 아래 표를 참고하세요.
+
+| 전략 | 설명 |
+|---|---|
+| `ColwiseParallel` | 가중치와 편향의 열 방향 분할. |
+| `RowwiseParallel` | 가중치와 편향의 행 방향 분할. `nn.Embedding` 모듈 분할도 지원. |
+| `SequenceParallel` | `LayerNorm`과 `Dropout` 레이어를 지원하는 시퀀스 병렬 구현. [RMSNorm](https://github.com/facebookresearch/llama/blob/main/llama/model.py#L34)의 Python 구현도 지원. |
+| `PackedColwiseParallel` | 패킹된 가중치를 지원하는 `ColwiseParallel`의 변형(예: `up_proj`와 `gate_proj`를 함께 패킹). 자세한 내용은 [코드](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108)를 참조하세요. |
+| `PackedRowwiseParallel` | 패킹된 가중치를 지원하는 `RowwiseParallel`의 변형([코드](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108) 참조). |
+| `GatherParallel` | 기기 간 모듈의 출력을 수집. |
+| `IsolatedParallel` | Mixture-of-Experts(MoE) 레이어의 전문가에 사용되어 다른 기기로부터 모듈을 격리. |
+| `ReplicateParallel` | 부분적으로 분할된 모델로 인해 `torch.distributed` API가 중단되는 것을 방지하기 위해 모든 기기에 모듈을 복제. |
+
+### 패킹된 전략[[packed-strategies]]
+
+가중치 패킹은 여러 선형 레이어를 하나의 더 큰 레이어로 합치는 기법입니다. 패킹된 전략인 `PackedColwiseParallel`과 `PackedRowwiseParallel`은 패킹된 가중치를 분할하는 데 사용됩니다. 기본적인 `ColwiseParallel`이나 `RowwiseParallel`은 패킹된 가중치를 올바르게 분할하지 못합니다.
+
+아래 예시는 `up_proj`와 `gate_proj`를 단일 `gate_up_proj` 모듈로 패킹하고 `gate_up_proj`를 분할하기 위해 `PackedRowwiseParallel` 전략이 필요합니다.
+
+```python
+class Llama4TextExperts(nn.Module):
+    ...
+    self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
+```
+
+배치 행렬 곱셈을 `forward` 패스에서 사용하여 `gate_up_proj` 모듈의 출력을 계산할 수 있습니다.
+
+```python
+def forward(self, hidden_states):
+    ...
+    gate_up = torch.bmm(hidden_states, self.gate_up_proj) # gate_up_proj 모듈의 출력 계산
+    gate, up = gate_up.chunk(2, dim=-1) # 출력을 gate와 up으로 분할
+```
+
+> [!TIP]
+> `Packed*`를 사용해야 하는 이유에 대한 시각적 표현은 [이 주석](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108)을 참고하세요.
+
+### 로컬 전략[[local-strategies]]
+
+로컬 전략(`local_colwise`, `local_rowwise`, `local_packed_rowwise`)은 [torch.chunk](https://docs.pytorch.org/docs/stable/generated/torch.chunk.html)와 같은 일부 연산에서 지원되지 않기 때문에 [DTensor](https://docs.pytorch.org/docs/stable/distributed.tensor.html)를 사용하지 않습니다. 대신 로컬 전략은 기본 [torch.Tensor](https://docs.pytorch.org/docs/stable/tensors.html)를 사용하고 일부 분산 로직을 수동으로 수행합니다.
+
+<!--
+Readd this when I get the exact error message
+> [!TIP]
+> 사용자 정의 분할 전략을 사용하는데 `... is not supported` 오류로 작동하지 않는 경우, `local*` 전략을 사용해서 더 잘 작동하는지 시도해보세요.
+-->
+
+## 사용자 정의 분할 전략[[custom-partitioning-strategies]]
+
+사용자 정의 분할 전략은 [`TensorParallelLayer`](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py)를 상속하고 `partition_tensor`, `_prepare_input_fn`, `_prepare_output_fn`을 구현해야 합니다.
+
+그런 다음 `tp_plan`에서 해당 전략을 지정했을 때 디스패칭 로직이 찾을 수 있도록 `ParallelInterface` 매핑에 등록해야 합니다.
+
+아래 예시는 이 워크플로우로 `ColwiseParallel`을 구현하는 방법을 보여줍니다.
+
+1. `TensorParallelLayer`를 상속합니다. `__init__` 메소드에서 입력 및 출력 텐서가 기기에 어떻게 배치되어야 하는지 설명하는 `input_layouts`과 `output_layouts`을 정의합니다. `desired_input_layouts` 속성은 입력이 기기에 어떻게 배치*되어야만* 하는지를 명시하는 데 사용됩니다.
+
+    ```python
+    class ColwiseParallel(TensorParallelLayer):
+        def __init__(
+            self,
+            *,
+            input_layouts: Optional[Placement] = None, # 이전 레이어에서 오는 입력 레이아웃
+            output_layouts: Optional[Placement] = None, # 달성하고자 하는 출력 레이아웃
+            use_local_output: bool = True, # 로컬 출력 사용 여부
+            use_dtensor=True, # DTensor 사용 여부
+        ):
+            self.input_layouts = (input_layouts or Replicate(),) # 이전 레이어에서 오는 입력 분할
+            self.output_layouts = (output_layouts or Shard(-1),) # 원하는 출력 분할
+            self.desired_input_layouts = (Replicate(),) # 원하는 입력 분할, 입력은 GPU 간에 복제되어야 함
+            self.use_local_output = use_local_output
+            self.use_dtensor = use_dtensor
+    ```
+
+2. `partition_tensor`, `_prepare_input_fn`, `_prepare_output_fn` 메서드를 구현합니다.
+
+    `partition_tensor` 메소드는 텐서를 분할하고 분할된 텐서로 `empty_param`을 채웁니다. 유틸리티 함수 `get_tensor_shard`를 사용하여 주어진 랭크에 대한 원본 매개변수의 올바른 분할을 얻고, 패킹된 가중치에 대해서는 `get_packed_weights`를 사용하세요.
+
+    ```python
+    def partition_tensor(
+        self,
+        param, # 매개변수의 전체 텐서
+        empty_param, # 매개변수의 빈 텐서, 분할된 텐서로 채워짐
+        param_type, # 매개변수 유형, `bias` 또는 `weight`
+        param_casting_dtype, # 매개변수를 캐스팅할 유형
+        to_contiguous, # 텐서를 연속적인 메모리 레이아웃으로 변환할지 여부
+        rank, # 현재 기기의 랭크
+        device_mesh, # 기기 메시
+    ) -> nn.Parameter: # 분할된 매개변수 반환
+        ...
+    ```
+
+    `_prepare_input_fn`과 `_prepare_output_fn` 메소드는 [사전 포워드](https://docs.pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_pre_hook.html) 및 [포워드](https://docs.pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_hook.html) 훅에서 사용됩니다. `__init__`에서 지정된 대로 입력과 출력을 원하는 레이아웃으로 재분배합니다.
+
+    ```python
+    def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh):
+        ...
+        # 사용자 정의 로직 수행, DTensor로 캐스팅 등.
+        ...
+        return inputs.redistribute(placements=desired_input_layouts, device_mesh=device_mesh)
+    def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
+        ...
+        # 사용자 정의 로직 수행, DTensor로 캐스팅 등.
+        ...
+        return outputs.redistribute(placements=output_layouts, device_mesh=device_mesh)
+    ```
+
+3. `tp_plan`과 함께 사용할 수 있도록 전략을 [`ParallelInterface`]에 등록합니다.
+
+    ```python
+    from transformers.integrations.tensor_parallel import ParallelInterface
+
+    ParallelInterface.register_strategy("colwise_custom", ColwiseParallel)
+    tp_plan = {
+        "model.layers.*.self_attn.q_proj": "colwise_custom",
+        ...
+    }
+    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan=tp_plan)
+    ```
+
+## 벤치마크[[benchmarks]]
+
+텐서 병렬화는 특히 큰 배치 크기나 긴 시퀀스를 가진 입력에 대한 추론 속도를 크게 향상시킬 수 있습니다.
+
+시퀀스 길이가 512인 [Llama](./model_doc/llama)에서 단일 포워드 패스에 대한 예상 속도 향상 수치는 아래 차트를 참조하세요.
+
+<div style="text-align: center">
+    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Meta-Llama-3-8B-Instruct%2C%20seqlen%20%3D%20512%2C%20python%2C%20w_%20compile.png">
+</div>
+
+## 설계 구현[[design-implementation]]
+
+Transformers 텐서 병렬화 구현은 프레임워크에 구애받지 않지만, 구체적인 구현을 위해서는 [DeviceMesh](https://docs.pytorch.org/tutorials/recipes/distributed_device_mesh.html)와 [torch.distributed](https://docs.pytorch.org/tutorials/beginner/dist_overview.html)의 [DTensor](https://docs.pytorch.org/docs/stable/distributed.tensor.html)에 의존하여 간단하고 확장 가능한 인터페이스를 제공합니다.
+
+### DeviceMesh[[devicemesh]]
+
+`DeviceMesh`를 함께 통신하는 기기들의 다차원 그리드로 상상해보세요. 병렬 처리 전략마다 각기 다른 통신 패턴이 필요하므로, 여러 하위 메시를 가진 `DeviceMesh`를 만들 수 있습니다.
+
+```python
+from torch.distributed.device_mesh import init_device_mesh
+
+# 4개 GPU의 1D 메시 생성
+device_mesh = init_device_mesh("cuda", (4,), mesh_dim_names=["tp"])
+```
+
+`torch.distributed`에서 정의된 대부분의 병렬화 전략은 메시 자체나 하위 메시에 적용할 수 있으며, 자동으로 통신 패턴을 처리합니다.
+
+### DTensor[[dtensor]]
+
+`DTensor`(분산 텐서)는 일반적인 텐서 연산 위에 분산 로직을 처리하는 텐서 하위 클래스입니다. 텐서 병렬화의 대부분의 모델 가중치는 `DTensor` 형태로 저장됩니다.
+
+DTensor의 가장 중요한 부분은 `placement` 속성입니다. 이는 PyTorch에게 텐서가 `DeviceMesh`의 기기에 어떻게 배치되는지 알려주기 때문입니다. `placement` 속성은 다음 값을 가질 수 있습니다.
+
+- `Shard(dimension)` - `DTensor`가 구성된 `DeviceMesh`에서 주어진 차원에 걸쳐 어떻게 분할되는지 나타냅니다. 아래 예시는 열 방향 분할을 위해 다양한 차원에 걸쳐 가중치를 분할하는 방법을 보여줍니다.
+
+    ```python
+    weight = ...
+    weight = DTensor.from_local(weight, device_mesh["tp"], placements=[Shard(0)]) # 첫 번째(열 방향) 차원에 걸쳐 분할
+    bias = ...
+    bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Shard(-1)]) # 유일한 차원에 걸쳐 분할
+    ```
+
+    이 예시는 행 방향 분할을 위해 여러 차원에 걸쳐 가중치를 분할하는 방법을 보여줍니다.
+
+    ```python
+    weight = ...
+    weight = DTensor.from_local(weight, device_mesh["tp"], placements=[Shard(1)]) # 두 번째(행 방향) 차원에 걸쳐 분할
+    bias = ...
+    bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Replicate()]) # 모든 GPU에 편향 복제
+    ```
+
+- `Replicate()` - `DTensor`가 `DeviceMesh`에 걸쳐 복제됨을 나타냅니다. 각 기기에 텐서의 전체 사본만 생성합니다.
+
+    ```py
+    bias = ...
+    bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Replicate()]) # 모든 GPU에 편향 복제
+    ```
+
+- `Partial()` - 텐서가 감소 연산을 기다리고 있는 상태임을 나타냅니다 (일반적으로 Transformers에서의 사용 사례와는 직접적인 관련이 적습니다).
--- a/docs/source/ko/quantization/quark.md
+++ b/docs/source/ko/quantization/quark.md
@ -0,0 +1,85 @@
+<!--Copyright 2025 Advanced Micro Devices, Inc. and The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+specific language governing permissions and limitations under the License.
+
+⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
+rendered properly in your Markdown viewer.
+
+-->
+
+# Quark[[quark]]
+
+[Quark](https://quark.docs.amd.com/latest/)는 특정 데이터 타입, 알고리즘, 하드웨어에 구애받지 않도록 설계된 딥러닝 양자화 툴킷입니다. Quark에서는 다양한 전처리 전략, 알고리즘, 데이터 타입을 조합하여 사용할 수 있습니다.
+
+🤗 Transformers를 통해 통합된 PyTorch 지원은 주로 AMD CPU 및 GPU를 대상으로 하며, 주로 평가 목적으로 사용됩니다. 예를 들어, [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness)를 🤗 Transformers 백엔드와 함께 사용하여 Quark로 양자화된 다양한 모델을 원활하게 평가할 수 있습니다.
+
+Quark에 관심이 있는 사용자는 [문서](https://quark.docs.amd.com/latest/)를 참고하여 모델 양자화를 시작하고 지원되는 오픈 소스 라이브러리에서 사용할 수 있습니다!
+
+Quark는 자체 체크포인트/[설정 포맷](https://huggingface.co/amd/Llama-3.1-8B-Instruct-FP8-KV-Quark-test/blob/main/config.json#L26)를 가지고 있지만, 다른 양자화/런타임 구현체 ([AutoAWQ](https://huggingface.co/docs/transformers/quantization/awq), [네이티브 fp8](https://huggingface.co/docs/transformers/quantization/finegrained_fp8))와 호환되는 직렬화 레이아웃으로 모델을 생성하는 것도 지원합니다.
+
+Transformer에서 Quark 양자화 모델을 로드하려면 먼저 라이브러리를 설치해야 합니다:
+
+```bash
+pip install amd-quark
+```
+
+## 지원 매트릭스[[Support matrix]]
+
+Quark를 통해 양자화된 모델은 함께 조합할 수 있는 광범위한 기능을 지원합니다. 구성에 관계없이 모든 양자화된 모델은 `PretrainedModel.from_pretrained`를 통해 원활하게 다시 로드할 수 있습니다.
+
+아래 표는 Quark에서 지원하는 몇 가지 기능을 보여줍니다:
+
+| **기능**                        | **Quark에서 지원하는 항목**                                                                             |   |
+|---------------------------------|-----------------------------------------------------------------------------------------------------------|---|
+| 데이터 타입                     | int8, int4, int2, bfloat16, float16, fp8_e5m2, fp8_e4m3, fp6_e3m2, fp6_e2m3, fp4, OCP MX, MX6, MX9, bfp16 |   |
+| 양자화 전 모델 변환 | SmoothQuant, QuaRot, SpinQuant, AWQ                                                                       |   |
+| 양자화 알고리즘                 | GPTQ                                                                                                      |   |
+| 지원 연산자                     | ``nn.Linear``, ``nn.Conv2d``, ``nn.ConvTranspose2d``, ``nn.Embedding``, ``nn.EmbeddingBag``               |   |
+| 세분성(Granularity)             | per-tensor, per-channel, per-block, per-layer, per-layer type                                             |   |
+| KV 캐시                         | fp8                                                                                                       |   |
+| 활성화 캘리브레이션             | MinMax / Percentile / MSE                                                                                 |   |
+| 양자화 전략                     | weight-only, static, dynamic, with or without output quantization                                         |   |
+
+## Hugging Face Hub의 모델[[Models on Hugging Face Hub]]
+
+Quark 네이티브 직렬화를 사용하는 공개 모델은 https://huggingface.co/models?other=quark 에서 찾을 수 있습니다.
+
+Quark는 [`quant_method="fp8"`을 이용하는 모델](https://huggingface.co/models?other=fp8)과 [`quant_method="awq"`을 사용하는 모델](https://huggingface.co/models?other=awq)도 지원하지만, Transformers는 이러한 모델을 [AutoAWQ](https://huggingface.co/docs/transformers/quantization/awq)를 통해 불러오거나 
+[🤗 Transformers의 네이티브 fp8 지원](https://huggingface.co/docs/transformers/quantization/finegrained_fp8)을 사용합니다.
+
+## Transformers에서 Quark모델 사용하기[[Using Quark models in Transformers]]
+
+다음은 Transformers에서 Quark 모델을 불러오는 방법의 예시입니다:
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model_id = "EmbeddedLLM/Llama-3.1-8B-Instruct-w_fp8_per_channel_sym"
+model = AutoModelForCausalLM.from_pretrained(model_id)
+model = model.to("cuda")
+
+print(model.model.layers[0].self_attn.q_proj)
+# QParamsLinear(
+#   (weight_quantizer): ScaledRealQuantizer()
+#   (input_quantizer): ScaledRealQuantizer()
+#   (output_quantizer): ScaledRealQuantizer()
+# )
+
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+inp = tokenizer("Where is a good place to cycle around Tokyo?", return_tensors="pt")
+inp = inp.to("cuda")
+
+res = model.generate(**inp, min_new_tokens=50, max_new_tokens=100)
+
+print(tokenizer.batch_decode(res)[0])
+# <|begin_of_text|>Where is a good place to cycle around Tokyo? There are several places in Tokyo that are suitable for cycling, depending on your skill level and interests. Here are a few suggestions:
+# 1. Yoyogi Park: This park is a popular spot for cycling and has a wide, flat path that's perfect for beginners. You can also visit the Meiji Shrine, a famous Shinto shrine located in the park.
+# 2. Imperial Palace East Garden: This beautiful garden has a large, flat path that's perfect for cycling. You can also visit the
+```
--- a/docs/source/ko/run_scripts.md
+++ b/docs/source/ko/run_scripts.md
@ -347,7 +347,7 @@ python examples/pytorch/summarization/run_summarization.py
 모든 스크립트는 최종 모델을 [Model Hub](https://huggingface.co/models)에 업로드할 수 있습니다.
 시작하기 전에 Hugging Face에 로그인했는지 확인하세요:
 ```bash
-huggingface-cli login
+hf auth login
 ```

 그런 다음 스크립트에 `push_to_hub` 인수를 추가합니다.
--- a/docs/source/pt/custom_models.md
+++ b/docs/source/pt/custom_models.md
@ -284,7 +284,7 @@ resnet50d.model.load_state_dict(pretrained_model.state_dict())
 Agora para enviar o modelo para o Hub, certifique-se de estar logado. Ou execute no seu terminal:

 ```bash
-huggingface-cli login
+hf auth login
 ```

 ou a partir do notebook:
--- a/docs/source/pt/run_scripts.md
+++ b/docs/source/pt/run_scripts.md
@ -327,7 +327,7 @@ python examples/pytorch/summarization/run_summarization.py
 Todos os scripts podem enviar seu modelo final para o [Model Hub](https://huggingface.co/models). Certifique-se de estar conectado ao Hugging Face antes de começar:

 ```bash
-huggingface-cli login
+hf auth login
 ```

 Em seguida, adicione o argumento `push_to_hub` ao script. Este argumento criará um repositório com seu nome de usuário do Hugging Face e o nome da pasta especificado em `output_dir`.
--- a/Show More
+++ b/Show More