Mirror of https://github.com/huggingface/transformers.git (synced 2025-10-20 17:13:56 +08:00)
Compare commits
134 Commits
v4.51.3-CS
...
v4.52.0
Author | SHA1 | Date | |
---|---|---|---|
113424bcd5 | |||
f834d368f6 | |||
2edb0e4b4d | |||
390f153469 | |||
2a79471318 | |||
9661896083 | |||
1c2f36b480 | |||
b591d925be | |||
3f0b7d0fac | |||
9cde2f5d42 | |||
856f034f45 | |||
bb3c6426d8 | |||
2ad152f84c | |||
de70c8426e | |||
8ea61c4530 | |||
d34e21e7dd | |||
183fb3637c | |||
f022bf9322 | |||
0a52bd2403 | |||
555715f418 | |||
7a611f0afd | |||
3bd1c20149 | |||
dbc4b91db4 | |||
46a4b7c909 | |||
9ecee14378 | |||
f524439cc5 | |||
6e738411e1 | |||
9c500015c5 | |||
6f9da7649f | |||
7c9b0ca08c | |||
04282a9ef5 | |||
aef12349b6 | |||
9644acb7cb | |||
7d93f93f83 | |||
47f8578d96 | |||
6c6302817d | |||
003deb16f1 | |||
dbb9813dff | |||
656e2eab3f | |||
6bb6821d93 | |||
40a493c7ed | |||
ea29f61ed9 | |||
a4389494c7 | |||
0ba95564b7 | |||
d69945e5fc | |||
7b5e327c6e | |||
120935234f | |||
91f6fa00f4 | |||
5036ec8872 | |||
7f28da2850 | |||
01ad9f4b49 | |||
3ab47b6ce3 | |||
1e921a3a9c | |||
57a79f51b2 | |||
44fa04ae8d | |||
34c1e29cdd | |||
0f77ca72ca | |||
27ef46e846 | |||
fe9426f12d | |||
7caa57e85e | |||
b11b28cc4e | |||
0e0e5c1044 | |||
955e61b0da | |||
0173a99e73 | |||
e5a48785d9 | |||
4005e30c80 | |||
aa27fa75cd | |||
e021bf6bf8 | |||
ef27b2bc22 | |||
4a2decd192 | |||
935bbbc711 | |||
1b00966395 | |||
fe918d13b9 | |||
aaf224d570 | |||
9b5ce556aa | |||
b311a3f506 | |||
b499a14b17 | |||
e0f225cb10 | |||
342961f669 | |||
8771766a70 | |||
582d5e0e11 | |||
a5cc7a67d7 | |||
67b3d45eb6 | |||
07feaad8fb | |||
e40f301f1f | |||
e27d230ddd | |||
ab65ba47ad | |||
8fb60bf6be | |||
3ad35d0bca | |||
e3b70b0d1c | |||
4143f94d51 | |||
a63cb7578e | |||
e387821a96 | |||
f0e975c6cf | |||
31791b16a1 | |||
8ea72d12a2 | |||
5c85018072 | |||
7eaa90b87b | |||
4220039b29 | |||
8efe3a9d77 | |||
a5c6172c81 | |||
a31fa218ad | |||
716819b830 | |||
8f08318769 | |||
87e971e14d | |||
aaed2f5577 | |||
7f1a97bae3 | |||
9f9020fed3 | |||
23d79cea75 | |||
774dc274ac | |||
0010b41524 | |||
d498528800 | |||
66e696ee15 | |||
a72cb31434 | |||
1dfad4beb2 | |||
121f7037c7 | |||
5f5ccfdc54 | |||
d231f5a7d4 | |||
b3db4ddb22 | |||
c7c2f08994 | |||
d23aae2b8c | |||
e296c63cd4 | |||
1c65aef923 | |||
f2909e024c | |||
f2b59c6173 | |||
4279057d70 | |||
3390534f36 | |||
9f8fffed3c | |||
06c16de3d3 | |||
f6664ee713 | |||
015b6dfbf8 | |||
5c47d08b0d | |||
17742bd9c8 | |||
3fa8d9c20e |
@ -7,6 +7,18 @@ parameters:
|
||||
nightly:
|
||||
type: boolean
|
||||
default: false
|
||||
GHA_Actor:
|
||||
type: string
|
||||
default: ""
|
||||
GHA_Action:
|
||||
type: string
|
||||
default: ""
|
||||
GHA_Event:
|
||||
type: string
|
||||
default: ""
|
||||
GHA_Meta:
|
||||
type: string
|
||||
default: ""
|
||||
|
||||
jobs:
|
||||
# Ensure running with CircleCI/huggingface
|
||||
@ -31,8 +43,10 @@ jobs:
|
||||
parallelism: 1
|
||||
steps:
|
||||
- checkout
|
||||
- run: if [[ "$CIRCLE_PULL_REQUEST" == "" && "$CIRCLE_BRANCH" != "main" && "$CIRCLE_BRANCH" != *-release ]]; then echo "Not a PR, not the main branch and not a release branch, skip test!"; circleci-agent step halt; fi
|
||||
- run: 'curl -L -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/${CIRCLE_PULL_REQUEST##*/} >> github.txt'
|
||||
- run: python3 utils/extract_pr_number_from_circleci.py > pr_number.txt
|
||||
- run: echo $(cat pr_number.txt)
|
||||
- run: if [[ "$(cat pr_number.txt)" == "" && "$CIRCLE_BRANCH" != "main" && "$CIRCLE_BRANCH" != *-release ]]; then echo "Not a PR, not the main branch and not a release branch, skip test!"; circleci-agent step halt; fi
|
||||
- run: 'curl -L -H "Accept: application/vnd.github+json" -H "X-GitHub-Api-Version: 2022-11-28" https://api.github.com/repos/$CIRCLE_PROJECT_USERNAME/$CIRCLE_PROJECT_REPONAME/pulls/$(cat pr_number.txt) >> github.txt'
|
||||
- run: cat github.txt
|
||||
- run: (python3 -c 'import json; from datetime import datetime; fp = open("github.txt"); data = json.load(fp); fp.close(); f = "%Y-%m-%dT%H:%M:%SZ"; created = datetime.strptime(data["created_at"], f); updated = datetime.strptime(data["updated_at"], f); s = (updated - created).total_seconds(); print(int(s))' || true) > elapsed.txt
|
||||
- run: if [ "$(cat elapsed.txt)" == "" ]; then echo 60 > elapsed.txt; fi
|
||||
|
@ -110,6 +110,7 @@ class CircleCIJob:
|
||||
print(f"Using {self.docker_image} docker image")
|
||||
if self.install_steps is None:
|
||||
self.install_steps = ["uv venv && uv pip install ."]
|
||||
self.install_steps.append("uv venv && uv pip install git+https://github.com/ydshieh/pytest.git@8.3.5-ydshieh git+https://github.com/ydshieh/pluggy.git@1.5.0-ydshieh")
|
||||
if self.pytest_options is None:
|
||||
self.pytest_options = {}
|
||||
if isinstance(self.tests_to_run, str):
|
||||
@ -397,7 +398,12 @@ def create_circleci_config(folder=None):
|
||||
"parameters": {
|
||||
# Only used to accept the parameters from the trigger
|
||||
"nightly": {"type": "boolean", "default": False},
|
||||
"tests_to_run": {"type": "string", "default": ''},
|
||||
# Only used to accept the parameters from GitHub Actions trigger
|
||||
"GHA_Actor": {"type": "string", "default": ""},
|
||||
"GHA_Action": {"type": "string", "default": ""},
|
||||
"GHA_Event": {"type": "string", "default": ""},
|
||||
"GHA_Meta": {"type": "string", "default": ""},
|
||||
"tests_to_run": {"type": "string", "default": ""},
|
||||
**{j.job_name + "_test_list":{"type":"string", "default":''} for j in jobs},
|
||||
**{j.job_name + "_parallelism":{"type":"integer", "default":1} for j in jobs},
|
||||
},
|
||||
|
.github/workflows/change_pr_to_draft.yml (vendored, deleted): 25 lines removed

@ -1,25 +0,0 @@
name: Change PR to draft

on:
  pull_request_target:
    types: [opened, reopened]

jobs:
  convert_pr_to_draft:
    runs-on: ubuntu-22.04
    name: Convert PR to draft
    permissions:
      pull-requests: write
      contents: write
    if: github.event.pull_request.draft == false
    steps:
      - name: Convert PR to draft
        shell: bash
        env:
          PR_NUMBER: ${{ github.event.number }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          REPO: ${{ github.repository }}
        run: |
          echo $PR_NUMBER
          gh pr ready $PR_NUMBER --repo $REPO --undo
          gh pr comment $PR_NUMBER --repo $REPO --body "Hi 👋, thank you for opening this pull request! The pull request is converted to draft by default. The CI will be paused while the PR is in draft mode. When it is ready for review, please click the \`Ready for review\` button (at the bottom of the PR page). This will assign reviewers and trigger CI."
@ -59,7 +59,7 @@ jobs:
              "type": "section",
              "text": {
                "type": "mrkdwn",
                "text": "<https://github.com/huggingface/transformers/commit/${{ env.COMMIT_SHA }}|New model: ${{ env.NEW_MODEL }}> GH_ArthurZucker, GH_lysandrejik, GH_ydshieh"
                "text": "<https://github.com/huggingface/transformers/commit/${{ env.COMMIT_SHA }}|New model: ${{ env.NEW_MODEL }}> GH_ArthurZucker, GH_lysandrejik, GH_ydshieh\ncommit SHA: ${{ env.COMMIT_SHA }}"
              }
            }
          ]
.github/workflows/pr-style-bot.yml (vendored, new file): 19 lines added

@ -0,0 +1,19 @@
# To run this bot, comment "@bot /style" on a PR
name: Style Bot

on:
  issue_comment:
    types: [created]

permissions:
  contents: write
  pull-requests: write

jobs:
  style:
    uses: huggingface/huggingface_hub/.github/workflows/style-bot-action.yml@main
    with:
      python_quality_dependencies: "[quality]"
      style_command_type: "default"
    secrets:
      bot_token: ${{ secrets.GITHUB_TOKEN }}
.github/workflows/self-comment-ci.yml (vendored): 4 lines changed
@ -29,7 +29,7 @@ jobs:
|
||||
runs-on: ubuntu-22.04
|
||||
name: Get PR number
|
||||
# For security: only allow team members to run
|
||||
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
|
||||
if: ${{ github.event.issue.state == 'open' && contains(fromJSON('["ydshieh", "ArthurZucker", "zucchini-nlp", "qubvel", "molbap", "gante", "LysandreJik", "Cyrilvallez", "Rocketknight1", "SunMarc", "muellerzr", "eustlb", "MekkCyber", "manueldeprada"]'), github.actor) && (startsWith(github.event.comment.body, 'run-slow') || startsWith(github.event.comment.body, 'run slow') || startsWith(github.event.comment.body, 'run_slow')) }}
|
||||
outputs:
|
||||
PR_NUMBER: ${{ steps.set_pr_number.outputs.PR_NUMBER }}
|
||||
steps:
|
||||
@ -145,7 +145,7 @@ jobs:
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
MODELS: ${{ needs.get-tests.outputs.models }}
|
||||
BODY: "This comment contains run-slow, running the specified jobs:\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
|
||||
BODY: "\n\nmodels: ${{ needs.get-tests.outputs.models }}\nquantizations: ${{ needs.get-tests.outputs.quantizations }}"
|
||||
run: |
|
||||
gh api \
|
||||
--method POST \
|
||||
|
README.md: 14 lines changed

@ -78,7 +78,6 @@ Create and activate a virtual environment with [venv](https://docs.python.org/3/
# venv
python -m venv .my-env
source .my-env/bin/activate

# uv
uv venv .my-env
source .my-env/bin/activate
@ -88,10 +87,10 @@ Install Transformers in your virtual environment.

```py
# pip
pip install transformers
pip install "transformers[torch]"

# uv
uv pip install transformers
uv pip install "transformers[torch]"
```

Install Transformers from source if you want the latest changes in the library or are interested in contributing. However, the *latest* version may not be stable. Feel free to open an [issue](https://github.com/huggingface/transformers/issues) if you encounter an error.
@ -99,7 +98,12 @@ Install Transformers from source if you want the latest changes in the library o
```shell
git clone https://github.com/huggingface/transformers.git
cd transformers
pip install .

# pip
pip install .[torch]

# uv
uv pip install .[torch]
```

## Quickstart
@ -121,7 +125,7 @@ To chat with a model, the usage pattern is the same. The only difference is you
> [!TIP]
> You can also chat with a model directly from the command line.
> ```shell
> transformers chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct
> transformers chat Qwen/Qwen2.5-0.5B-Instruct
> ```

```py
@ -71,6 +71,9 @@ RUN python3 -m pip install --no-cache-dir g2p-en
# For Some bitsandbytes tests
RUN python3 -m pip install --no-cache-dir einops

# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
RUN python3 -m pip uninstall -y kernels

# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop

@ -45,6 +45,9 @@ RUN python3 -m pip uninstall -y deepspeed
# TODO: Find out why test fail.
RUN DS_BUILD_CPU_ADAM=1 DS_BUILD_FUSED_ADAM=1 python3 -m pip install deepspeed --global-option="build_ext" --global-option="-j8" --no-cache -v --disable-pip-version-check 2>&1

# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
RUN python3 -m pip uninstall -y kernels

# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop

@ -57,6 +57,9 @@ RUN python3 -m pip uninstall -y deepspeed
#RUN git clone https://github.com/pytorch/TensorRT.git
#RUN cd TensorRT/py && python3 setup.py install --fx-only

# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
RUN python3 -m pip uninstall -y kernels

# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop

@ -28,6 +28,9 @@ RUN python3 -m pip uninstall -y tensorflow flax
RUN python3 -m pip install --no-cache-dir git+https://github.com/facebookresearch/detectron2.git pytesseract
RUN python3 -m pip install -U "itsdangerous<2.1.0"

# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
RUN python3 -m pip uninstall -y kernels

# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop

@ -90,6 +90,9 @@ RUN python3 -m pip install --no-cache-dir "auto-round>=0.5.0"
# Add transformers in editable mode
RUN python3 -m pip install --no-cache-dir -e ./transformers[dev-torch]

# `kernels` may give different outputs (within 1e-5 range) even with the same model (weights) and the same inputs
RUN python3 -m pip uninstall -y kernels

# When installing in editable mode, `transformers` is not recognized as a package.
# this line must be added in order for python to be aware of transformers.
RUN cd transformers && python3 setup.py develop
@ -21,6 +21,8 @@
    title: Adding a new model to Transformers
  - local: modular_transformers
    title: Modular Transformers
  - local: auto_docstring
    title: Document your models
  - local: task_summary
    title: What 🤗 Transformers can do
  - local: tasks_explained
@ -37,6 +39,8 @@
    title: Tokenizers
  - local: image_processors
    title: Image processors
  - local: video_processors
    title: Video processors
  - local: backbones
    title: Backbones
  - local: feature_extractors
@ -360,7 +364,9 @@
    title: Feature Extractor
  - local: main_classes/image_processor
    title: Image Processor
  title: Main classes
  - local: main_classes/video_processor
    title: Video Processor
  title: Main Classes
- sections:
  - sections:
    - local: model_doc/albert
docs/source/en/auto_docstring.md (new file): 279 lines added
@ -0,0 +1,279 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Utilizing the @auto_docstring Decorator
|
||||
|
||||
The `@auto_docstring` decorator in the Hugging Face Transformers library helps generate docstrings for model classes and their methods, which will be used to build the documentation for the library. It aims to improve consistency and reduce boilerplate by automatically including standard argument descriptions and allowing for targeted overrides and additions.
|
||||
|
||||
---
|
||||
|
||||
## 📜 How it Works
|
||||
|
||||
The `@auto_docstring` decorator constructs docstrings by:
|
||||
|
||||
1. **Signature Inspection:** It inspects the signature (arguments, types, defaults) of the decorated class's `__init__` method or the decorated function.
|
||||
2. **Centralized Docstring Fetching:** It retrieves predefined docstrings for common arguments (e.g., `input_ids`, `attention_mask`) from internal library sources (like `ModelArgs` or `ImageProcessorArgs` in `utils/args_doc.py`).
|
||||
3. **Overriding or Adding Arguments Descriptions:**
|
||||
* **Direct Docstring Block:** It incorporates custom docstring content from an `r""" """` (or `""" """`) block below the method signature or within the `__init__` docstring. This is for documenting new arguments or overriding standard descriptions.
|
||||
* **Decorator Arguments (`custom_args`):** A `custom_args` docstring block can be passed to the decorator to provide docstrings for specific arguments directly in the decorator call. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
|
||||
4. **Adding Classes and Functions Introduction:**
|
||||
* **`custom_intro` argument:** Allows prepending a custom introductory paragraph to a class or function docstring.
|
||||
* **Automatic Introduction Generation:** For model classes with standard naming patterns (like `ModelForCausalLM`) or belonging to a pipeline, the decorator automatically generates an appropriate introductory paragraph using `ClassDocstring` in `utils/args_doc.py` as the source.
|
||||
5. **Templating:** The decorator uses a templating system, allowing predefined docstrings to include dynamic information deduced from the `auto_modules` of the library, such as `{{processor_class}}` or `{{config_class}}`.
|
||||
6. **Deducing Relevant Examples:** The decorator attempts to find appropriate usage examples based on the model's task or pipeline compatibility. It extracts checkpoint information from the model's configuration class to provide concrete examples with real model identifiers.
|
||||
7. **Adding Return Value Documentation:** For methods like `forward`, the decorator can automatically generate the "Returns" section based on the method's return type annotation. For example, for a method returning a `ModelOutput` subclass, it will extract field descriptions from that class's docstring to create a comprehensive return value description. A custom `Returns` section can also be manually specified in the function docstring block.
|
||||
8. **Unrolling Kwargs Typed With Unpack Operator:** For specific methods (defined in `UNROLL_KWARGS_METHODS`) or classes (defined in `UNROLL_KWARGS_CLASSES`), the decorator processes `**kwargs` parameters that are typed with `Unpack[KwargsTypedDict]`. It extracts the documentation from the TypedDict and adds each parameter to the function's docstring. Currently, this functionality is only supported for `FastImageProcessorKwargs`.
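To make these mechanics concrete, the snippet below is a minimal, illustrative sketch of how a decorator along these lines could merge centralized argument descriptions with a locally supplied docstring. It is not the library's implementation; `COMMON_ARG_DOCS` and the formatting are simplified stand-ins for the sources in `utils/args_doc.py`.

```python
import inspect

# Simplified stand-in for the centralized descriptions in utils/args_doc.py
COMMON_ARG_DOCS = {
    "input_ids": "Indices of input sequence tokens in the vocabulary.",
    "attention_mask": "Mask to avoid performing attention on padding token indices.",
}

def auto_docstring_sketch(func):
    """Toy decorator: builds a docstring from the signature plus known argument docs."""
    local_doc = inspect.getdoc(func) or ""
    lines = ["Args:"]
    for name in inspect.signature(func).parameters:
        if name == "self":
            continue
        # Standard arguments come from the central table; local docs take precedence.
        if name in COMMON_ARG_DOCS and name not in local_doc:
            lines.append(f"    {name}: {COMMON_ARG_DOCS[name]}")
    func.__doc__ = "\n".join(lines) + ("\n\n" + local_doc if local_doc else "")
    return func

@auto_docstring_sketch
def forward(self, input_ids=None, attention_mask=None, labels=None):
    """labels: Optional labels used to compute the loss."""
    ...

print(forward.__doc__)  # standard args filled in, the local `labels` description kept
```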
|
||||
|
||||
|
||||
---
|
||||
|
||||
## 🚀 How to Use @auto_docstring
|
||||
|
||||
### 1. Importing the Decorator
|
||||
Import the decorator into your modeling file:
|
||||
|
||||
```python
|
||||
from ...utils import auto_docstring
|
||||
```
|
||||
|
||||
### 2. Applying to Classes
|
||||
Place `@auto_docstring` directly above the class definition. It uses the `__init__` method's signature and its docstring for parameter descriptions.
|
||||
|
||||
```python
|
||||
from transformers.modeling_utils import PreTrainedModel
|
||||
from ...utils import auto_docstring
|
||||
|
||||
@auto_docstring
|
||||
class MyAwesomeModel(PreTrainedModel):
|
||||
def __init__(self, config, custom_parameter: int = 10, another_custom_arg: str = "default"):
|
||||
r"""
|
||||
custom_parameter (`int`, *optional*, defaults to 10):
|
||||
Description of the custom_parameter for MyAwesomeModel.
|
||||
another_custom_arg (`str`, *optional*, defaults to "default"):
|
||||
Documentation for another unique argument.
|
||||
"""
|
||||
super().__init__(config)
|
||||
self.custom_parameter = custom_parameter
|
||||
self.another_custom_arg = another_custom_arg
|
||||
# ... rest of your init
|
||||
|
||||
# ... other methods
|
||||
```
|
||||
|
||||
#### Advanced Class Decoration:
|
||||
|
||||
Arguments can be passed directly to `@auto_docstring` for more control:
|
||||
|
||||
```python
|
||||
@auto_docstring(
|
||||
custom_intro="""This model performs specific synergistic operations.
|
||||
It builds upon the standard Transformer architecture with unique modifications.""",
|
||||
custom_args="""
|
||||
custom_parameter (`type`, *optional*, defaults to `default_value`):
|
||||
A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
|
||||
internal_helper_arg (`type`, *optional*, defaults to `default_value`):
|
||||
A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
|
||||
"""
|
||||
)
|
||||
class MySpecialModel(PreTrainedModel):
|
||||
def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
|
||||
# ...
|
||||
```
|
||||
|
||||
Or:
|
||||
|
||||
```python
|
||||
@auto_docstring(
|
||||
custom_intro="""This model performs specific synergistic operations.
|
||||
It builds upon the standard Transformer architecture with unique modifications.""",
|
||||
)
|
||||
class MySpecialModel(PreTrainedModel):
|
||||
def __init__(self, config: ConfigType, custom_parameter: "type" = "default_value", internal_helper_arg=None):
|
||||
r"""
|
||||
custom_parameter (`type`, *optional*, defaults to `default_value`):
|
||||
A concise description for custom_parameter if not defined or overriding the description in `args_doc.py`.
|
||||
internal_helper_arg (`type`, *optional*, defaults to `default_value`):
|
||||
A concise description for internal_helper_arg if not defined or overriding the description in `args_doc.py`.
|
||||
"""
|
||||
# ...
|
||||
```
|
||||
|
||||
### 3. Applying to Functions (e.g., `forward` method)
|
||||
Apply the decorator above method definitions, such as the `forward` method.
|
||||
|
||||
```python
|
||||
@auto_docstring
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.Tensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
new_custom_argument: Optional[torch.Tensor] = None,
|
||||
arg_documented_in_args_doc: Optional[torch.Tensor] = None,
|
||||
# ... other arguments
|
||||
) -> Union[Tuple, ModelOutput]: # The description of the return value will automatically be generated from the ModelOutput class docstring.
|
||||
r"""
|
||||
new_custom_argument (`torch.Tensor`, *optional*):
|
||||
Description of this new custom argument and its expected shape or type.
|
||||
"""
|
||||
# ...
|
||||
```
|
||||
|
||||
#### Advanced Function Decoration:
|
||||
|
||||
Arguments can be passed directly to `@auto_docstring` for more control. `Returns` and `Examples` sections can also be manually specified:
|
||||
|
||||
```python
|
||||
MODEL_COMMON_CUSTOM_ARGS = r"""
|
||||
common_arg_1 (`torch.Tensor`, *optional*, defaults to `default_value`):
|
||||
Description of common_arg_1
|
||||
common_arg_2 (`torch.Tensor`, *optional*, defaults to `default_value`):
|
||||
Description of common_arg_2
|
||||
...
|
||||
"""
|
||||
|
||||
class MyModel(PreTrainedModel):
|
||||
# ...
|
||||
@auto_docstring(
|
||||
custom_intro="""
|
||||
This is a custom introduction for the function.
|
||||
"""
|
||||
custom_args=MODEL_COMMON_CUSTOM_ARGS
|
||||
)
|
||||
def forward(
|
||||
self,
|
||||
input_ids: Optional[torch.Tensor] = None,
|
||||
attention_mask: Optional[torch.Tensor] = None,
|
||||
common_arg_1: Optional[torch.Tensor] = None,
|
||||
common_arg_2: Optional[torch.Tensor] = None,
|
||||
#...
|
||||
function_specific_argument: Optional[torch.Tensor] = None,
|
||||
# ... other arguments
|
||||
) -> torch.Tensor:
|
||||
r"""
|
||||
function_specific_argument (`torch.Tensor`, *optional*):
|
||||
Description of an argument specific to this function
|
||||
|
||||
Returns:
|
||||
`torch.Tensor`: For a function returning a generic type, a custom "Returns" section can be specified.
|
||||
|
||||
Example:
|
||||
|
||||
(To override the default example with a custom one or to add an example for a model class that does not have a pipeline)
|
||||
|
||||
```python
|
||||
...
|
||||
```
|
||||
"""
|
||||
# ...
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
### ✍️ Documenting Arguments: Approach & Priority
|
||||
|
||||
1. **Standard Arguments (e.g., `input_ids`, `attention_mask`, `pixel_values`, `encoder_hidden_states` etc.):**
|
||||
* `@auto_docstring` retrieves descriptions from a central source. Do not redefine these locally if their description and shape are the same as in `args_doc.py`.
|
||||
|
||||
2. **New or Custom Arguments:**
|
||||
* **Primary Method:** Document these within an `r""" """` docstring block following the signature (for functions) or in the `__init__` method's docstring (for class parameters).
|
||||
* **Format:**
|
||||
```
|
||||
argument_name (`type`, *optional*, defaults to `X`):
|
||||
Description of the argument.
|
||||
Explain its purpose, expected shape/type if complex, and default behavior.
|
||||
This can span multiple lines.
|
||||
```
|
||||
* Include `type` in backticks.
|
||||
* Add "*optional*" if the argument is not required (has a default value).
|
||||
* Add "defaults to `X`" if it has a default value (no need to specify "defaults to `None`" if the default value is `None`).
|
||||
|
||||
3. **Overriding Standard Arguments:**
|
||||
* If a standard argument behaves differently (e.g., different expected shape, model-specific behavior), provide its complete description in the local `r""" """` docstring. This local definition takes precedence.
|
||||
* The `labels` argument is often customized per model and typically requires a specific docstring.
|
||||
|
||||
4. **Using Decorator Arguments for Overrides or New Arguments (`custom_args`):**
|
||||
* New or custom arguments docstrings can also be passed to `@auto_docstring` as a `custom_args` argument. This can be used to define the docstring block for new arguments once if they are repeated in multiple places in the modeling file.
|
||||
|
||||
---
|
||||
|
||||
### Usage with [modular files](./modular_transformers)
|
||||
|
||||
When working with modular files, follow these guidelines for applying the `@auto_docstring` decorator:
|
||||
|
||||
- **For standalone models in modular files:**
|
||||
Apply the `@auto_docstring` decorator just as you would in regular modeling files.
|
||||
|
||||
- **For models inheriting from other library models:**
|
||||
- When inheriting from a parent model, decorators (including `@auto_docstring`) are automatically carried over to the generated modeling file without needing to add them in your modular file.
|
||||
- If you need to modify the `@auto_docstring` behavior, apply the customized decorator in your modular file, making sure to *include all other decorators* that were present on the original function/class.
|
||||
|
||||
> **Warning**: When overriding any decorator in a modular file, you must include ALL decorators that were applied to that function/class in the parent model. If you only override some decorators, the others won't be included in the generated modeling file.
|
||||
|
||||
|
||||
**Note**: The `check_auto_docstrings` tool doesn't check modular files directly, but it will check (and modify when using `--fix_and_overwrite`) the generated modeling files. If issues are found in the generated files, you'll need to update your modular files accordingly.
|
||||
|
||||
---
|
||||
|
||||
## ✅ Checking Your Docstrings with `check_auto_docstrings`
|
||||
|
||||
The library includes a utility script to validate docstrings. This check is typically run during Continuous Integration (CI).
|
||||
|
||||
#### What it Checks:
|
||||
|
||||
* **Decorator Presence:** Ensures `@auto_docstring` is applied to relevant model classes and public methods. (TODO)
|
||||
* **Argument Completeness & Consistency:**
|
||||
* Flags arguments in the signature that are not known standard arguments and lack a local description.
|
||||
* Ensures documented arguments exist in the signature. (TODO)
|
||||
* Verifies that types and default values in the docstring match the signature. (TODO)
|
||||
* **Placeholder Detection:** Reminds you to complete placeholders like `<fill_type>` or `<fill_docstring>`.
|
||||
* **Formatting:** Adherence to the expected docstring style.
|
||||
|
||||
#### Running the Check Locally:
|
||||
|
||||
Run this check locally before committing. The common command is:
|
||||
|
||||
```bash
|
||||
make fix-copies
|
||||
```
|
||||
|
||||
Alternatively, to only perform docstrings and auto-docstring checks, you can use:
|
||||
|
||||
```bash
|
||||
python utils/check_docstrings.py # to only check files included in the diff without fixing them
|
||||
# Or: python utils/check_docstrings.py --fix_and_overwrite # to fix and overwrite the files in the diff
|
||||
# Or: python utils/check_docstrings.py --fix_and_overwrite --check_all # to fix and overwrite all files
|
||||
```
|
||||
|
||||
#### Workflow with the Checker:
|
||||
|
||||
1. Add `@auto_docstring(...)` to the class or method.
|
||||
2. For new, custom, or overridden arguments, add descriptions in an `r""" """` block.
|
||||
3. Run `make fix-copies` (or the `check_docstrings.py` utility).
|
||||
* For unrecognized arguments lacking documentation, the utility will create placeholder entries.
|
||||
4. Manually edit these placeholders with accurate types and descriptions.
|
||||
5. Re-run the check to ensure all issues are resolved.
|
||||
|
||||
---
|
||||
|
||||
## 🔑 Key Takeaways & Best Practices
|
||||
|
||||
* Use `@auto_docstring` for new PyTorch model classes (`PreTrainedModel` subclasses) and their primary methods (e.g., `forward`, `get_text_features`, etc.).
|
||||
* For classes, the `__init__` method's docstring is the main source for parameter descriptions when using `@auto_docstring` on the class.
|
||||
* Rely on standard docstrings; do not redefine common arguments unless their behavior is different in your specific model.
|
||||
* Document new or custom arguments clearly.
|
||||
* Run `check_docstrings` locally and iteratively.
|
||||
|
||||
By following these guidelines, you help maintain consistent and informative documentation for the Hugging Face Transformers library 🤗.
|
@ -27,7 +27,7 @@ This guide shows you how to quickly start chatting with Transformers from the co

## transformers CLI

Chat with a model directly from the command line as shown below. It launches an interactive session with a model. Enter `clear` to reset the conversation, `exit` to terminate the session, and `help` to display all the command options.
After you've [installed Transformers](./installation.md), chat with a model directly from the command line as shown below. It launches an interactive session with a model, with a few base commands listed at the start of the session.

```bash
transformers chat Qwen/Qwen2.5-0.5B-Instruct
@ -37,6 +37,12 @@ transformers chat Qwen/Qwen2.5-0.5B-Instruct
<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/transformers-chat-cli.png"/>
</div>

You can launch the CLI with arbitrary `generate` flags, with the format `arg_1=value_1 arg_2=value_2 ...`

```bash
transformers chat Qwen/Qwen2.5-0.5B-Instruct do_sample=False max_new_tokens=10
```

For a full list of options, run the command below.

```bash
@ -20,11 +20,15 @@ A decoding strategy informs how a model should select the next generated token.
|
||||
|
||||
This guide will help you understand the different decoding strategies available in Transformers and how and when to use them.
|
||||
|
||||
## Greedy search
|
||||
## Basic decoding methods
|
||||
|
||||
Greedy search is the default decoding strategy. It selects the next most likely token at each step. Unless specified in [`GenerationConfig`], this strategy generates a maximum of 20 tokens.
|
||||
These are well established decoding methods, and should be your starting point for text generation tasks.
|
||||
|
||||
Greedy search works well for tasks with relatively short outputs. However, it breaks down when generating longer sequences because it begins to repeat itself.
|
||||
### Greedy search
|
||||
|
||||
Greedy search is the default decoding strategy. It selects the next most likely token at each step. Unless specified in [`GenerationConfig`], this strategy generates a maximum of 20 new tokens.
|
||||
|
||||
Greedy search works well for tasks with relatively short outputs where creativity is not a priority. However, it breaks down when generating longer sequences because it begins to repeat itself.
|
||||
|
||||
```py
|
||||
import torch
|
||||
@ -40,11 +44,11 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
'Hugging Face is an open-source company that provides a suite of tools and services for building, deploying, and maintaining natural language processing'
|
||||
```
|
||||
|
||||
## Contrastive search
|
||||
### Sampling
|
||||
|
||||
[Contrastive search](https://huggingface.co/papers/2202.06417) is a decoding strategy that aims to reduce repetition even while generating longer sequences. This strategy compares how similar a generated token is against previous tokens, and if they're more similar, a penalty is applied.
|
||||
Sampling, or multinomial sampling, randomly selects a token based on the probability distribution over the entire model's vocabulary (as opposed to the most likely token, as in greedy search). This means every token with a non-zero probability has a chance to be selected. Sampling strategies reduce repetition and can generate more creative and diverse outputs.
|
||||
|
||||
Enable contrastive search with the `penalty_alpha` and `top_k` parameters. The `penalty_alpha` manages the penalty applied and `top_k` is the number of most likely tokens to return.
|
||||
Enable multinomial sampling with `do_sample=True` and `num_beams=1`.
|
||||
|
||||
```py
|
||||
import torch
|
||||
@ -55,14 +59,14 @@ inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt"
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
|
||||
# explicitly set to 100 because Llama2 generation length is 4096
|
||||
outputs = model.generate(**inputs, max_new_tokens=100, penalty_alpha=0.6, top_k=4)
|
||||
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, num_beams=1)
|
||||
tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
'Hugging Face is an open-source company that provides a platform for building and deploying AI models.\nHugging Face is an open-source company that provides a platform for building and deploying AI models. The platform allows developers to build and deploy AI models, as well as collaborate with other developers.\nHugging Face was founded in 2019 by Thibault Wittemberg and Clément Delangue. The company is based in Paris, France.\nHugging Face has'
|
||||
'Hugging Face is an open-source company 🤗\nWe are open-source and believe that open-source is the best way to build technology. Our mission is to make AI accessible to everyone, and we believe that open-source is the best way to achieve that.'
|
||||
```
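Sampling can be sharpened or flattened with the usual `generate` parameters such as `temperature`, `top_k`, and `top_p`. The values below are illustrative, not recommendations.

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")

# lower temperature and nucleus/top-k filtering make sampling less random
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, temperature=0.7, top_k=50, top_p=0.9)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
```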
|
||||
|
||||
## Beam search
|
||||
### Beam search
|
||||
|
||||
Beam search keeps track of several generated sequences (beams) at each time step. After a certain number of steps, it selects the sequence with the highest *overall* probability. Unlike greedy search, this strategy can "look ahead" and pick a sequence with a higher probability overall even if the initial tokens have a lower probability.
|
||||
Beam search keeps track of several generated sequences (beams) at each time step. After a certain number of steps, it selects the sequence with the highest *overall* probability. Unlike greedy search, this strategy can "look ahead" and pick a sequence with a higher probability overall even if the initial tokens have a lower probability. It is best suited for input-grounded tasks, like describing an image or speech recognition. You can also use `do_sample=True` with beam search to sample at each step, but beam search will still greedily prune out low probability sequences between steps.
|
||||
|
||||
> [!TIP]
|
||||
> Check out the [beam search visualizer](https://huggingface.co/spaces/m-ric/beam_search_visualizer) to see how beam search works.
|
||||
@ -83,66 +87,11 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
"['Hugging Face is an open-source company that develops and maintains the Hugging Face platform, which is a collection of tools and libraries for building and deploying natural language processing (NLP) models. Hugging Face was founded in 2018 by Thomas Wolf']"
|
||||
```
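As noted above, beam search can also be combined with sampling by passing both flags; a minimal sketch:

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")

# each beam samples its next token, but low-probability beams are still pruned between steps
outputs = model.generate(**inputs, max_new_tokens=50, num_beams=4, do_sample=True)
tokenizer.batch_decode(outputs, skip_special_tokens=True)
```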
|
||||
|
||||
## Diverse beam search
|
||||
## Advanced decoding methods
|
||||
|
||||
[Diverse beam search](https://hf.co/papers/1610.02424) is a variant of beam search that produces more diverse output candidates to choose from. This strategy measures the dissimilarity of sequences and a penalty is applied if sequences are too similar. To avoid high computation costs, the number of beams is divided into groups.
|
||||
Advanced decoding methods aim at either tackling specific generation quality issues (e.g. repetition) or at improving the generation throughput in certain situations. These techniques are more complex, and may not work correctly with all models.
|
||||
|
||||
Enable diverse beam search with the `num_beams`, `num_beam_groups` and `diversity_penalty` parameters (the `num_beams` parameter should be divisible by `num_beam_groups`).
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
|
||||
# explicitly set to 100 because Llama2 generation length is 4096
|
||||
outputs = model.generate(**inputs, max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, do_sample=False)
|
||||
tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
'Hugging Face is an open-source company 🤗\nWe are an open-source company. Our mission is to democratize AI and make it accessible to everyone. We believe that AI should be used for the benefit of humanity, not for the benefit of a'
|
||||
```
|
||||
|
||||
## Multinomial sampling
|
||||
|
||||
Search methods selects the most likely tokens. Sampling, or multinomial sampling, randomly selects a token based on the probability distribution over the entire models vocabulary. This means every token with a non-zero probability has a chance to be selected. Sampling strategies reduce repetition and can generate more creative and diverse outputs.
|
||||
|
||||
Enable multinomial sampling with `do_sample=True` and `num_beams=1`.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
|
||||
# explicitly set to 100 because Llama2 generation length is 4096
|
||||
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, num_beams=1)
|
||||
tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
'Hugging Face is an open-source company 🤗\nWe are open-source and believe that open-source is the best way to build technology. Our mission is to make AI accessible to everyone, and we believe that open-source is the best way to achieve that.'
|
||||
```
|
||||
|
||||
## Beam search multinomial sampling
|
||||
|
||||
This decoding strategy is a combination of beam search and multinomial sampling. It generates multiple beams and uses a sampling strategy for each beam.
|
||||
|
||||
Enable beam search multinomial sampling by setting `num_beams` to a value greater than 1 and `do_sample=True`.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
|
||||
# explicitly set to 100 because Llama2 generation length is 4096
|
||||
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, num_beams=4)
|
||||
'Hugging Face is an open-source company 100% dedicated to making AI more accessible. We believe that AI should be available to everyone, and we’re working hard to make that a reality.\nWe’re a team of passionate engineers, designers,'
|
||||
```
|
||||
|
||||
## Speculative decoding
|
||||
### Speculative decoding
|
||||
|
||||
[Speculative](https://hf.co/papers/2211.17192) or assistive decoding isn't a search or sampling strategy. Instead, speculative decoding adds a second smaller model to generate candidate tokens. The main model verifies the candidate tokens in a single `forward` pass, which speeds up the decoding process overall. This method is especially useful for LLMs where it can be more costly and slower to generate tokens. Refer to the [speculative decoding](./llm_optims#speculative-decoding) guide to learn more.
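As a minimal sketch of assisted generation, pass the smaller model through `assistant_model` (the model pair below is an arbitrary choice; any main/assistant pair with a shared tokenizer works):

```py
from transformers import AutoModelForCausalLM, AutoTokenizer

# Arbitrary example pair: a larger main model and a smaller assistant from the same family
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-1.5B-Instruct", device_map="auto")
assistant = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto")

inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, assistant_model=assistant, max_new_tokens=50)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```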
|
||||
|
||||
@ -203,7 +152,7 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### Prompt lookup decoding
|
||||
#### Prompt lookup decoding
|
||||
|
||||
[Prompt lookup decoding](./llm_optims#prompt-lookup-decoding) is a variant of speculative decoding that uses overlapping n-grams as the candidate tokens. It works well for input-grounded tasks such as summarization. Refer to the [prompt lookup decoding](./llm_optims#prompt-lookup-decoding) guide to learn more.
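Enable it by setting `prompt_lookup_num_tokens`, which controls the length of the n-gram candidates; a minimal sketch (the model choice is arbitrary):

```py
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto")

inputs = tokenizer("Summarize: Hugging Face builds open-source tools for machine learning.", return_tensors="pt").to(model.device)
# candidate tokens are drawn from n-grams already present in the prompt
outputs = model.generate(**inputs, prompt_lookup_num_tokens=3, max_new_tokens=40)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```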
|
||||
|
||||
@ -245,7 +194,7 @@ outputs = model.generate(**inputs, assistant_early_exit=4, do_sample=False, max_
|
||||
tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
```
|
||||
|
||||
### Universal assisted decoding
|
||||
#### Universal assisted decoding
|
||||
|
||||
Universal assisted decoding (UAD) enables the main and assistant models to use different tokenizers. The main model's input tokens are re-encoded into assistant model tokens. Candidate tokens are generated in the assistant encoding, and then re-encoded into main model candidate tokens. The candidate tokens are verified as explained in [speculative decoding](#speculative-decoding).
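A minimal sketch of UAD is shown below, assuming your installed version supports the `tokenizer` and `assistant_tokenizer` arguments of [`~GenerationMixin.generate`]; the model pair is arbitrary and only chosen because the two use different tokenizers.

```py
from transformers import AutoModelForCausalLM, AutoTokenizer

# Main and assistant models with *different* tokenizers (arbitrary example pair)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", device_map="auto")

assistant_tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
assistant = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto")

inputs = tokenizer("Alice and Bob are sitting in a bar.", return_tensors="pt").to(model.device)
# passing both tokenizers lets generate re-encode candidates between the two vocabularies
outputs = model.generate(
    **inputs,
    assistant_model=assistant,
    tokenizer=tokenizer,
    assistant_tokenizer=assistant_tokenizer,
    max_new_tokens=20,
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True)[0])
```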
|
||||
|
||||
@ -269,7 +218,27 @@ tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
['Alice and Bob are sitting in a bar. Alice is drinking a beer and Bob is drinking a']
|
||||
```
|
||||
|
||||
## DoLa
|
||||
### Contrastive search
|
||||
|
||||
[Contrastive search](https://huggingface.co/papers/2202.06417) is a decoding strategy that aims to reduce repetition even while generating longer sequences. This strategy compares how similar a generated token is against previous tokens, and if they're more similar, a penalty is applied.
|
||||
|
||||
Enable contrastive search with the `penalty_alpha` and `top_k` parameters. The `penalty_alpha` manages the penalty applied and `top_k` is the number of most likely tokens to return.
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
|
||||
# explicitly set to 100 because Llama2 generation length is 4096
|
||||
outputs = model.generate(**inputs, max_new_tokens=100, penalty_alpha=0.6, top_k=4)
|
||||
tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
'Hugging Face is an open-source company that provides a platform for building and deploying AI models.\nHugging Face is an open-source company that provides a platform for building and deploying AI models. The platform allows developers to build and deploy AI models, as well as collaborate with other developers.\nHugging Face was founded in 2019 by Thibault Wittemberg and Clément Delangue. The company is based in Paris, France.\nHugging Face has'
|
||||
```
|
||||
|
||||
### DoLa
|
||||
|
||||
[Decoding by Contrasting Layers (DoLa)](https://hf.co/papers/2309.03883) is a contrastive decoding strategy for improving factuality and reducing hallucination. This strategy works by contrasting the logit differences between the final and early layers. As a result, factual knowledge localized to particular layers is amplified. DoLa is not recommended for smaller models like GPT-2.
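A minimal sketch is shown below; the `dola_layers` value is illustrative, and DoLa is commonly paired with a repetition penalty.

```py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")

inputs = tokenizer("What is the tallest mountain on Earth?", return_tensors="pt").to("cuda")
# contrast the final layer against higher ("high") or lower ("low") early layers
outputs = model.generate(**inputs, do_sample=False, max_new_tokens=50, dola_layers="high", repetition_penalty=1.2)
print(tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[-1]:], skip_special_tokens=True)[0])
```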
|
||||
|
||||
@ -325,6 +294,210 @@ tokenizer.batch_decode(outputs[:, inputs.input_ids.shape[-1]:], skip_special_tok
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### Diverse beam search
|
||||
|
||||
[Diverse beam search](https://hf.co/papers/1610.02424) is a variant of beam search that produces more diverse output candidates to choose from. This strategy measures the dissimilarity of sequences and a penalty is applied if sequences are too similar. To avoid high computation costs, the number of beams is divided into groups.
|
||||
|
||||
Enable diverse beam search with the `num_beams`, `num_beam_groups` and `diversity_penalty` parameters (the `num_beams` parameter should be divisible by `num_beam_groups`).
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
|
||||
inputs = tokenizer("Hugging Face is an open-source company", return_tensors="pt").to("cuda")
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf", torch_dtype=torch.float16).to("cuda")
|
||||
# explicitly set to 100 because Llama2 generation length is 4096
|
||||
outputs = model.generate(**inputs, max_new_tokens=50, num_beams=6, num_beam_groups=3, diversity_penalty=1.0, do_sample=False)
|
||||
tokenizer.batch_decode(outputs, skip_special_tokens=True)
|
||||
'Hugging Face is an open-source company 🤗\nWe are an open-source company. Our mission is to democratize AI and make it accessible to everyone. We believe that AI should be used for the benefit of humanity, not for the benefit of a'
|
||||
```
|
||||
|
||||
|
||||
## Custom decoding methods
|
||||
|
||||
Custom decoding methods enable specialized generation behavior such as the following:
|
||||
- have the model continue thinking if it is uncertain;
|
||||
- roll back generation if the model gets stuck;
|
||||
- handle special tokens with custom logic;
|
||||
- enhance input preparation for advanced models.
|
||||
|
||||
We enable custom decoding methods through model repositories, assuming a specific model tag and file structure (see the subsection below). This feature is an extension of [custom modeling code](./models.md#custom-models) and, like it, requires setting `trust_remote_code=True`.
|
||||
|
||||
If a model repository holds a custom decoding method, the easiest way to try it out is to load the model and generate with it:
|
||||
|
||||
<!-- TODO before merging: 1) better repo name (use a `generate-community` org?) 2) prettify the repo -->
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
# `transformers-community/custom_generate_example` holds a copy of `Qwen/Qwen2.5-0.5B-Instruct`, but
|
||||
# with custom generation code -> calling `generate` uses the custom decoding method!
|
||||
tokenizer = AutoTokenizer.from_pretrained("transformers-community/custom_generate_example")
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
"transformers-community/custom_generate_example", device_map="auto", trust_remote_code=True
|
||||
)
|
||||
|
||||
inputs = tokenizer(["The quick brown"], return_tensors="pt").to(model.device)
|
||||
# The custom decoding method is a minimal greedy decoding implementation. It also prints a custom message at run time.
|
||||
gen_out = model.generate(**inputs)
|
||||
# you should now see its custom message, "✨ using a custom generation method ✨"
|
||||
print(tokenizer.batch_decode(gen_out, skip_special_tokens=True))
|
||||
'The quick brown fox jumps over a lazy dog, and the dog is a type of animal. Is'
|
||||
```
|
||||
|
||||
Model repositories with custom decoding methods have a special property: their decoding method can be loaded from **any** model through [`~GenerationMixin.generate`]'s `custom_generate` argument. This means anyone can create and share their custom generation method to potentially work with any Transformers model, without requiring users to install additional Python packages.
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
|
||||
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", device_map="auto")
|
||||
|
||||
inputs = tokenizer(["The quick brown"], return_tensors="pt").to(model.device)
|
||||
# `custom_generate` replaces the original `generate` by the custom decoding method defined in
|
||||
# `transformers-community/custom_generate_example`
|
||||
gen_out = model.generate(**inputs, custom_generate="transformers-community/custom_generate_example", trust_remote_code=True)
|
||||
print(tokenizer.batch_decode(gen_out, skip_special_tokens=True)[0])
|
||||
'The quick brown fox jumps over a lazy dog, and the dog is a type of animal. Is'
|
||||
```
|
||||
|
||||
You should read the `README.md` file of the repository containing the custom generation strategy to see what the new arguments and output type differences are, if they exist. Otherwise, you can assume it works like the base [`~GenerationMixin.generate`] method.
|
||||
|
||||
> [!TIP]
|
||||
> You can find all custom decoding methods by searching for their custom tag, [`custom_generate`](https://huggingface.co/models?other=custom_generate).
|
||||
|
||||
Consider the Hub repository [transformers-community/custom_generate_example](https://huggingface.co/transformers-community/custom_generate_example) as an example. The `README.md` states that it has an additional input argument, `left_padding`, which adds a number of padding tokens before the prompt.
|
||||
|
||||
```py
|
||||
gen_out = model.generate(
|
||||
**inputs, custom_generate="transformers-community/custom_generate_example", trust_remote_code=True, left_padding=5
|
||||
)
|
||||
print(tokenizer.batch_decode(gen_out)[0])
|
||||
'<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>The quick brown fox jumps over the lazy dog.\n\nThe sentence "The quick'
|
||||
```
|
||||
|
||||
If the custom method has pinned Python requirements that your environment doesn't meet, you'll get an exception about missing requirements. For instance, [transformers-community/custom_generate_bad_requirements](https://huggingface.co/transformers-community/custom_generate_bad_requirements) has an impossible set of requirements defined in its `custom_generate/requirements.txt` file, and you'll see the error message below if you try to run it.
|
||||
|
||||
```
|
||||
ImportError: Missing requirements in your local environment for `transformers-community/custom_generate_bad_requirements`:
|
||||
foo (installed: None)
|
||||
bar==0.0.0 (installed: None)
|
||||
torch>=99.0 (installed: 2.6.0)
|
||||
```
|
||||
|
||||
Updating your Python requirements accordingly will remove this error message.
|
||||
|
||||
### Creating a custom decoding method
|
||||
|
||||
To create a new decoding method, you need to create a new [**Model**](https://huggingface.co/new) repository and push a few files into it.
|
||||
1. The model you've designed your decoding method with.
|
||||
2. `custom_generate/generate.py`, which contains all the logic for your custom decoding method.
|
||||
3. `custom_generate/requirements.txt`, used to optionally add new Python requirements and/or lock specific versions to correctly use your method.
|
||||
4. `README.md`, where you should add the `custom_generate` tag and document any new arguments or output type differences of your custom method here.
|
||||
|
||||
After you've added all required files, your repository should look like this
|
||||
|
||||
```
|
||||
your_repo/
|
||||
├── README.md # include the 'custom_generate' tag
|
||||
├── config.json
|
||||
├── ...
|
||||
└── custom_generate/
|
||||
├── generate.py
|
||||
└── requirements.txt
|
||||
```
|
||||
|
||||
#### Adding the base model
|
||||
|
||||
The starting point for your custom decoding method is a model repository just like any other. The model to add to this repository should be the model you've designed your method with, and it is meant to be part of a working self-contained model-generate pair. When the model in this repository is loaded, your custom decoding method will override `generate`. Don't worry -- your decoding method can still be loaded with any other Transformers model, as explained in the section above.
|
||||
|
||||
If you simply want to copy an existing model, you can do
|
||||
|
||||
```py
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("source/model_repo")
|
||||
model = AutoModelForCausalLM.from_pretrained("source/model_repo")
|
||||
tokenizer.save_pretrained("your/decoding_method", push_to_hub=True)
|
||||
model.save_pretrained("your/decoding_method", push_to_hub=True)
|
||||
```
|
||||
|
||||
#### generate.py
|
||||
|
||||
This is the core of your decoding method. It *must* contain a method named `generate`, and this method *must* contain a `model` argument as its first argument. `model` is the model instance, which means you have access to all attributes and methods in the model, including the ones defined in [`GenerationMixin`] (like the base `generate` method).
|
||||
|
||||
> [!WARNING]
|
||||
> `generate.py` must be placed in a folder named `custom_generate`, and not at the root level of the repository. The file paths for this feature are hardcoded.
|
||||
|
||||
Under the hood, when the base [`~GenerationMixin.generate`] method is called with a `custom_generate` argument, it first checks its Python requirements (if any), then locates the custom `generate` method in `generate.py`, and finally calls the custom `generate`. All received arguments and `model` are forwarded to your custom `generate` method.
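Conceptually, the dispatch can be pictured with the illustrative sketch below. This is not the actual Transformers implementation, just a simplified picture of the flow described above.

```py
# Illustrative sketch only: fetch a repo's custom_generate/generate.py and call it.
import importlib.util
from huggingface_hub import snapshot_download

def load_custom_generate(repo_id: str):
    repo_path = snapshot_download(repo_id)  # downloads the repo, including custom_generate/
    # (the real implementation also validates custom_generate/requirements.txt at this point)
    spec = importlib.util.spec_from_file_location("custom_generate", f"{repo_path}/custom_generate/generate.py")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module.generate  # must exist, with `model` as its first argument

# custom_fn = load_custom_generate("transformers-community/custom_generate_example")
# gen_out = custom_fn(model, **tokenizer(["The quick brown"], return_tensors="pt"))
```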
|
||||
|
||||
This means your `generate` can have a mix of original and custom arguments (as well as a different output type) as shown below.
|
||||
|
||||
```py
|
||||
import torch
|
||||
|
||||
def generate(model, input_ids, generation_config=None, left_padding=None, **kwargs):
|
||||
generation_config = generation_config or model.generation_config # default to the model generation config
|
||||
cur_length = input_ids.shape[1]
|
||||
max_length = generation_config.max_length or cur_length + generation_config.max_new_tokens
|
||||
|
||||
# Example of custom argument: add `left_padding` (integer) pad tokens before the prompt
|
||||
if left_padding is not None:
|
||||
if not isinstance(left_padding, int) or left_padding < 0:
|
||||
raise ValueError(f"left_padding must be an integer larger than 0, but is {left_padding}")
|
||||
|
||||
pad_token = kwargs.pop("pad_token", None) or generation_config.pad_token_id or model.config.pad_token_id
|
||||
if pad_token is None:
|
||||
raise ValueError("pad_token is not defined")
|
||||
batch_size = input_ids.shape[0]
|
||||
pad_tensor = torch.full(size=(batch_size, left_padding), fill_value=pad_token).to(input_ids.device)
|
||||
input_ids = torch.cat((pad_tensor, input_ids), dim=1)
|
||||
cur_length = input_ids.shape[1]
|
||||
|
||||
# Simple greedy decoding loop
|
||||
while cur_length < max_length:
|
||||
logits = model(input_ids).logits
|
||||
next_token_logits = logits[:, -1, :]
|
||||
next_tokens = torch.argmax(next_token_logits, dim=-1)
|
||||
input_ids = torch.cat((input_ids, next_tokens[:, None]), dim=-1)
|
||||
cur_length += 1
|
||||
|
||||
return input_ids
|
||||
```
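
Users can then trigger your method through the base [`~GenerationMixin.generate`] API and pass any custom arguments it accepts, such as `left_padding` above. The snippet below is a minimal sketch that reuses the example repository name from this guide; it assumes the repository also contains a tokenizer, and `trust_remote_code=True` is passed because custom code is executed.

```py
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("your/decoding_method")
model = AutoModelForCausalLM.from_pretrained("your/decoding_method")
inputs = tokenizer(["The quick brown"], return_tensors="pt")

# `custom_generate` points to the Hub repository holding `custom_generate/generate.py`
gen_out = model.generate(**inputs, custom_generate="your/decoding_method", trust_remote_code=True, left_padding=2)
print(tokenizer.batch_decode(gen_out, skip_special_tokens=True))
```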
|
||||
|
||||
Follow the recommended practices below to ensure your custom decoding method works as expected.
|
||||
- Feel free to reuse the logic for validation and input preparation in the original [`~GenerationMixin.generate`].
|
||||
- Pin the `transformers` version in the requirements if you use any private method/attribute in `model`.
|
||||
- You can add other files in the `custom_generate` folder, and use relative imports.
|
||||
- Consider adding model validation, input validation, or even a separate test file to help users sanity-check your code in their environment (see the sketch below).
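
For instance, a hedged sketch of a lightweight validation helper you could keep next to `generate` in `generate.py` (the checks and the supported model families are illustrative assumptions, not requirements of the feature):

```py
def validate_environment(model, input_ids):
    # Illustrative checks only -- adapt them to whatever your method actually assumes
    if input_ids.ndim != 2:
        raise ValueError(f"`input_ids` must have shape (batch_size, seq_len), but has shape {tuple(input_ids.shape)}")
    if model.config.model_type not in ("llama", "mistral"):  # hypothetical supported families
        raise ValueError(f"This decoding method was only tested on Llama/Mistral models, got {model.config.model_type}")
```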
|
||||
|
||||
#### requirements.txt
|
||||
|
||||
You can optionally specify additional Python requirements in a `requirements.txt` file inside the `custom_generate` folder. These are checked at runtime and an exception will be thrown if they're missing, nudging users to update their environment accordingly.
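
For example, a hypothetical `custom_generate/requirements.txt` pinning the versions a method was tested with could look like this (the package names and version bounds are placeholders):

```
transformers>=4.52,<5.0
torch>=2.3
```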
|
||||
|
||||
#### README.md
|
||||
|
||||
The root level `README.md` in the model repository usually describes the model therein. However, since the focus of the repository is the custom decoding method, we highly recommend shifting its focus towards describing the custom decoding method. In addition to a description of the method, we recommend documenting any input and/or output differences from the original [`~GenerationMixin.generate`]. This way, users can focus on what's new and rely on the Transformers docs for generic implementation details.
|
||||
|
||||
For discoverability, we highly recommend adding the `custom_generate` tag to your repository. To do so, the top of your `README.md` file should look like the example below. After you push the file, you should see the tag in your repository!
|
||||
|
||||
```
|
||||
---
|
||||
library_name: transformers
|
||||
tags:
|
||||
- custom_generate
|
||||
---
|
||||
|
||||
(your markdown content here)
|
||||
```
|
||||
|
||||
Recommended practices:
|
||||
- Document input and output differences in [`~GenerationMixin.generate`].
|
||||
- Add self-contained examples to enable quick experimentation.
|
||||
- Describe soft requirements, such as the method only working well with a certain family of models.
|
||||
|
||||
|
||||
## Resources
|
||||
|
||||
Read the [How to generate text: using different decoding methods for language generation with Transformers](https://huggingface.co/blog/how-to-generate) blog post for an explanation of how common decoding strategies work.
|
||||
|
@ -90,11 +90,6 @@ class SamVisionAttentionSplit(SamVisionAttention, nn.Module):
|
||||
|
||||
attn_weights = (query * self.scale) @ key.transpose(-2, -1)
|
||||
|
||||
if self.use_rel_pos:
|
||||
attn_weights = self.add_decomposed_rel_pos(
|
||||
attn_weights, query, self.rel_pos_h, self.rel_pos_w, (height, width), (height, width)
|
||||
)
|
||||
|
||||
attn_weights = torch.nn.functional.softmax(attn_weights, dtype=torch.float32, dim=-1).to(query.dtype)
|
||||
attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
|
||||
attn_output = (attn_probs @ value).reshape(batch_size, self.num_attention_heads, height, width, -1)
|
||||
@ -114,13 +109,14 @@ Load the model with [`~PreTrainedModel.from_pretrained`].
|
||||
|
||||
```py
|
||||
from transformers import SamModel
|
||||
from transformers.models.sam import modeling_sam
|
||||
|
||||
# replace the attention class in the modeling_sam module
|
||||
modeling_sam.SamVisionAttention = SamVisionAttentionSplit
|
||||
|
||||
# load the pretrained SAM model
|
||||
model = SamModel.from_pretrained("facebook/sam-vit-base")
|
||||
|
||||
# replace the attention class in the vision_encoder module
|
||||
for layer in model.vision_encoder.layers:
|
||||
if hasattr(layer, "attn"):
|
||||
layer.attn = SamVisionAttentionSplit(model.config.vision_config, model.config.vision_config.window_size)
|
||||
```
|
||||
|
||||
## LoRA
|
||||
@ -138,7 +134,7 @@ config = LoraConfig(
|
||||
# apply LoRA to q and v
|
||||
target_modules=["q", "v"],
|
||||
lora_dropout=0.1,
|
||||
task_type="mask-generation"
|
||||
task_type="FEATURE_EXTRACTION"
|
||||
)
|
||||
```
|
||||
|
||||
@ -152,5 +148,5 @@ Call [print_trainable_parameters](https://huggingface.co/docs/peft/package_refer
|
||||
|
||||
```py
|
||||
model.print_trainable_parameters()
|
||||
"trainable params: 608,256 || all params: 94,343,728 || trainable%: 0.6447"
|
||||
"trainable params: 589,824 || all params: 94,274,096 || trainable%: 0.6256"
|
||||
```
|
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
# Image processors
|
||||
|
||||
Image processors converts images into pixel values, tensors that represent image colors and size. The pixel values are inputs to a vision or video model. To ensure a pretrained model receives the correct input, an image processor can perform the following operations to make sure an image is exactly like the images a model was pretrained on.
|
||||
Image processors convert images into pixel values, tensors that represent image colors and size. The pixel values are inputs to a vision model. To ensure a pretrained model receives the correct input, an image processor can perform the following operations to make sure an image is exactly like the images a model was pretrained on.
|
||||
|
||||
- [`~BaseImageProcessor.center_crop`] to resize an image
|
||||
- [`~BaseImageProcessor.normalize`] or [`~BaseImageProcessor.rescale`] pixel values
|
||||
|
@ -84,6 +84,19 @@ class Trainer:
|
||||
|
||||
Backends that can be added here are all the backends that are available in the `import_utils.py` module.
|
||||
|
||||
Additionally, specific versions can be specified in each backend. For example, this is how you would specify
|
||||
a requirement on torch>=2.6 on the `Trainer` class:
|
||||
|
||||
```python
|
||||
from .utils.import_utils import requires
|
||||
|
||||
@requires(backends=("torch>=2.6", "accelerate"))
|
||||
class Trainer:
|
||||
...
|
||||
```
|
||||
|
||||
You can specify the following operators: `==`, `>`, `>=`, `<`, `<=`, `!=`.
|
||||
|
||||
## Methods
|
||||
|
||||
[[autodoc]] utils.import_utils.define_import_structure
|
||||
|
@ -20,9 +20,13 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
Text generation is the most popular application for large language models (LLMs). A LLM is trained to generate the next word (token) given some initial text (prompt) along with its own generated outputs up to a predefined length or when it reaches an end-of-sequence (`EOS`) token.
|
||||
|
||||
In Transformers, the [`~GenerationMixin.generate`] API handles text generation, and it is available for all models with generative capabilities.
|
||||
In Transformers, the [`~GenerationMixin.generate`] API handles text generation, and it is available for all models with generative capabilities. This guide will show you the basics of text generation with [`~GenerationMixin.generate`] and some common pitfalls to avoid.
|
||||
|
||||
This guide will show you the basics of text generation with [`~GenerationMixin.generate`] and some common pitfalls to avoid.
|
||||
> [!TIP]
|
||||
> You can also chat with a model directly from the command line. ([reference](./conversations.md#transformers-cli))
|
||||
> ```shell
|
||||
> transformers chat Qwen/Qwen2.5-0.5B-Instruct
|
||||
> ```
|
||||
|
||||
## Default generate
|
||||
|
||||
@ -134,6 +138,20 @@ outputs = model.generate(**inputs, generation_config=generation_config)
|
||||
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
|
||||
```
|
||||
|
||||
## Common Options
|
||||
|
||||
[`~GenerationMixin.generate`] is a powerful tool that can be heavily customized. This can be daunting for new users. This section contains a list of popular generation options that you can define in most text generation tools in Transformers: [`~GenerationMixin.generate`], [`GenerationConfig`], `pipelines`, the `chat` CLI, and more. An example call combining several of these options follows the table.
|
||||
|
||||
| Option name | Type | Simplified description |
|
||||
|---|---|---|
|
||||
| `max_new_tokens` | `int` | Controls the maximum generation length. Be sure to define it, as it usually defaults to a small value. |
|
||||
| `do_sample` | `bool` | Defines whether generation will sample the next token (`True`), or is greedy instead (`False`). Most use cases should set this flag to `True`. Check [this guide](./generation_strategies.md) for more information. |
|
||||
| `temperature` | `float` | How unpredictable the next selected token will be. High values (`>0.8`) are good for creative tasks, low values (e.g. `<0.4`) for tasks that require "thinking". Requires `do_sample=True`. |
|
||||
| `num_beams` | `int` | When set to `>1`, activates the beam search algorithm. Beam search is good on input-grounded tasks. Check [this guide](./generation_strategies.md) for more information. |
|
||||
| `repetition_penalty` | `float` | Set it to `>1.0` if you're seeing the model repeat itself often. Larger values apply a larger penalty. |
|
||||
| `eos_token_id` | `List[int]` | The token(s) that will cause generation to stop. The default value is usually good, but you can specify a different token. |
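
The sketch below combines several of these options in a single [`~GenerationMixin.generate`] call. The values are illustrative, not recommendations, and `model`, `tokenizer`, and `inputs` are assumed to be prepared as in the earlier examples.

```py
outputs = model.generate(
    **inputs,
    max_new_tokens=256,      # don't rely on the small default length
    do_sample=True,          # sample instead of greedy decoding
    temperature=0.8,         # higher -> more unpredictable next tokens
    repetition_penalty=1.1,  # discourage the model from repeating itself
)
print(tokenizer.batch_decode(outputs, skip_special_tokens=True))
```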
|
||||
|
||||
|
||||
## Pitfalls
|
||||
|
||||
The section below covers some common issues you may encounter during text generation and how to solve them.
|
||||
@ -286,4 +304,4 @@ Take a look below for some more specific and specialized text generation librari
|
||||
- [SynCode](https://github.com/uiuc-focal-lab/syncode): a library for context-free grammar guided generation (JSON, SQL, Python).
|
||||
- [Text Generation Inference](https://github.com/huggingface/text-generation-inference): a production-ready server for LLMs.
|
||||
- [Text generation web UI](https://github.com/oobabooga/text-generation-webui): a Gradio web UI for text generation.
|
||||
- [logits-processor-zoo](https://github.com/NVIDIA/logits-processor-zoo): additional logits processors for controlling text generation.
|
||||
- [logits-processor-zoo](https://github.com/NVIDIA/logits-processor-zoo): additional logits processors for controlling text generation.
|
||||
|
55
docs/source/en/main_classes/video_processor.md
Normal file
@ -0,0 +1,55 @@
|
||||
<!--Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
|
||||
# Video Processor
|
||||
|
||||
A **Video Processor** is a utility responsible for preparing input features for video models, as well as handling the post-processing of their outputs. It provides transformations such as resizing, normalization, and conversion into PyTorch tensors.
|
||||
|
||||
The video processor extends the functionality of image processors by allowing Vision Large Language Models (VLMs) to handle videos with a distinct set of arguments compared to images. It serves as the bridge between raw video data and the model, ensuring that input features are optimized for the VLM.
|
||||
|
||||
When adding a new VLM or updating an existing one to enable distinct video preprocessing, saving and reloading the processor configuration will store the video-related arguments in a dedicated file named `video_preprocessing_config.json`. Don't worry if you haven't updated your VLM; the processor will try to load video-related configurations from a file named `preprocessing_config.json`.
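
As a quick illustration of the paragraph above, saving and then reloading a video processor round-trips the video-related arguments through that dedicated configuration file (the output directory name is just an example):

```python
from transformers import AutoVideoProcessor

processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
processor.save_pretrained("./my-video-processor")  # the video-related arguments are written to their own config file
reloaded = AutoVideoProcessor.from_pretrained("./my-video-processor")
```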
|
||||
|
||||
|
||||
### Usage Example
|
||||
Here's an example of how to load a video processor with [`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf) model:
|
||||
|
||||
```python
|
||||
from transformers import AutoVideoProcessor
|
||||
|
||||
processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
|
||||
```
|
||||
|
||||
Currently, if using a base image processor for videos, video data is processed by treating each frame as an individual image and applying transformations frame-by-frame. While functional, this approach is not highly efficient. Using `AutoVideoProcessor` allows us to take advantage of **fast video processors**, leveraging the [torchvision](https://pytorch.org/vision/stable/index.html) library. Fast processors handle the whole batch of videos at once, without iterating over each video or frame. These updates introduce GPU acceleration and significantly enhance processing speed, especially for tasks requiring high throughput.
|
||||
|
||||
Fast video processors are available for all models and are loaded by default when an `AutoVideoProcessor` is initialized. When using a fast video processor, you can also set the `device` argument to specify the device on which the processing should be done. By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise. For an even bigger speed improvement, you can compile the processor with `torch.compile` when using `"cuda"` as the device.
|
||||
|
||||
```python
|
||||
import torch
|
||||
from transformers.video_utils import load_video
|
||||
from transformers import AutoVideoProcessor
|
||||
|
||||
video = load_video("video.mp4")
|
||||
processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda")
|
||||
processor = torch.compile(processor)
|
||||
processed_video = processor(video, return_tensors="pt")
|
||||
```
|
||||
|
||||
|
||||
## BaseVideoProcessor
|
||||
|
||||
[[autodoc]] video_processing_utils.BaseVideoProcessor
|
||||
|
@ -57,6 +57,7 @@ This model was contributed by [lysandre](https://huggingface.co/lysandre). This
|
||||
- Embedding size E is different from hidden size H because the embeddings are context independent (one embedding vector represents one token), whereas hidden states are context dependent (one hidden state represents a sequence of tokens), so it's more logical to have H >> E. Also, the embedding matrix is large since it's V x E (V being the vocab size). If E < H, it has fewer parameters.
|
||||
- Layers are split in groups that share parameters (to save memory).
|
||||
- Next sentence prediction is replaced by a sentence ordering prediction: in the inputs, we have two sentences A and B (that are consecutive) and we either feed A followed by B or B followed by A. The model must predict if they have been swapped or not.
|
||||
- The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have an effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
|
||||
|
||||
### Using Scaled Dot Product Attention (SDPA)
|
||||
|
||||
|
@ -102,6 +102,10 @@ response = processor.decode(output_ids, skip_special_tokens=True)
|
||||
|
||||
[[autodoc]] AriaTextModel
|
||||
|
||||
## AriaModel
|
||||
|
||||
[[autodoc]] AriaModel
|
||||
|
||||
## AriaTextForCausalLM
|
||||
|
||||
[[autodoc]] AriaTextForCausalLM
|
||||
|
@ -74,6 +74,10 @@ Likewise, if your `NewModel` is a subclass of [`PreTrainedModel`], make sure its
|
||||
|
||||
[[autodoc]] AutoImageProcessor
|
||||
|
||||
## AutoVideoProcessor
|
||||
|
||||
[[autodoc]] AutoVideoProcessor
|
||||
|
||||
## AutoProcessor
|
||||
|
||||
[[autodoc]] AutoProcessor
|
||||
|
@ -237,6 +237,10 @@ for i, output in enumerate(batch_outputs):
|
||||
|
||||
[[autodoc]] AyaVisionConfig
|
||||
|
||||
## AyaVisionModel
|
||||
|
||||
[[autodoc]] AyaVisionModel
|
||||
|
||||
## AyaVisionForConditionalGeneration
|
||||
|
||||
[[autodoc]] AyaVisionForConditionalGeneration
|
||||
|
@ -39,7 +39,7 @@ Checkout all Bamba-9B model checkpoints [here](https://github.com/foundation-mod
|
||||
<!---
|
||||
## Usage Tips
|
||||
|
||||
Tips:
|
||||
Tips:
|
||||
|
||||
- The architecture is based on Mamba-2 models.
|
||||
|
||||
@ -63,7 +63,35 @@ response = model.generate(**inputs, max_new_tokens=64)
|
||||
print(tokenizer.batch_decode(response, skip_special_tokens=True)[0])
|
||||
```
|
||||
|
||||
|
||||
## Padding-Free Training
|
||||
|
||||
Bamba supports padding-free training in which distinct training examples can be concatenated
|
||||
together while nevertheless processing the inputs as though they belonged to separate batches. When
|
||||
the examples are of varying lengths, padding-free training can provide significant speed ups and
|
||||
memory savings compared to batching the examples together and using padding, as the unnecessary
|
||||
compute and memory due to padding is avoided entirely. The performance gains depend on factors such
|
||||
as the model and the data distribution, but throughput gains up to [~2x are commonly
|
||||
seen](https://github.com/huggingface/transformers/pull/35861#issue-2807873129).
|
||||
|
||||
Using padding-free training with Bamba requires the `flash-attn`, `mamba-ssm`, and `causal-conv1d`
|
||||
packages, and the following arguments must be passed to the model in addition to `input_ids` and
|
||||
`labels`:
|
||||
* `position_ids: torch.LongTensor`: the position index of each token in each sequence.
|
||||
* `seq_idx: torch.IntTensor`: the index of each sequence in the batch.
|
||||
* Each of the [`FlashAttentionKwargs`]
|
||||
* `cu_seq_lens_q: torch.LongTensor`: The cumulative sequence lengths of all queries.
|
||||
* `cu_seq_lens_k: torch.LongTensor`: The cumulative sequence lengths of all keys.
|
||||
* `max_length_q: int`: the longest query length in the batch.
|
||||
* `max_length_k: int`: the longest key length in the batch.
|
||||
|
||||
The `attention_mask` inputs should not be provided. The [`DataCollatorWithFlattening`] can be used
|
||||
to programmatically generate the above set of additional arguments using `return_seq_idx=True` and
|
||||
`return_flash_attn_kwargs=True`. See [this blog post](https://huggingface.co/blog/packing-with-FA2)
|
||||
for additional information.
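
The sketch below shows one way to wire this together with [`Trainer`]. The checkpoint path and dataset are placeholders, and it assumes the padding-free requirements above (`flash-attn`, `mamba-ssm`, `causal-conv1d`) are installed.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorWithFlattening, Trainer, TrainingArguments

model = AutoModelForCausalLM.from_pretrained("path/to/bamba-checkpoint", attn_implementation="flash_attention_2")
tokenizer = AutoTokenizer.from_pretrained("path/to/bamba-checkpoint")

# Emits `position_ids`, `seq_idx` and the FlashAttention kwargs instead of an `attention_mask`
data_collator = DataCollatorWithFlattening(return_seq_idx=True, return_flash_attn_kwargs=True)

trainer = Trainer(
    model=model,
    args=TrainingArguments(output_dir="bamba-padding-free", per_device_train_batch_size=4),
    train_dataset=tokenized_dataset,  # assumed: a tokenized dataset with `input_ids` and `labels`
    data_collator=data_collator,
)
trainer.train()
```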
|
||||
|
||||
|
||||
[[autodoc]] BambaForCausalLM
|
||||
- forward
|
||||
|
||||
This HF implementation is contributed by [ani300](https://github.com/ani300) and [fabianlim](https://github.com/fabianlim).
|
||||
This HF implementation is contributed by [ani300](https://github.com/ani300) and [fabianlim](https://github.com/fabianlim).
|
||||
|
@ -55,6 +55,7 @@ This model was contributed by [sshleifer](https://huggingface.co/sshleifer). The
|
||||
* mask a span of k tokens with a single mask token (a span of 0 tokens is an insertion of a mask token)
|
||||
* permute sentences
|
||||
* rotate the document to make it start at a specific token
|
||||
- The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have an effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
|
||||
|
||||
## Implementation Notes
|
||||
|
||||
|
@ -150,6 +150,7 @@ If you're interested in submitting a resource to be included here, please feel f
|
||||
[[autodoc]] BeitImageProcessor
|
||||
- preprocess
|
||||
- post_process_semantic_segmentation
|
||||
|
||||
## BeitImageProcessorFast
|
||||
|
||||
[[autodoc]] BeitImageProcessorFast
|
||||
|
@ -36,6 +36,7 @@ This model was contributed by [kamalkraj](https://huggingface.co/kamalkraj). The
|
||||
- BioGPT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than the left.
|
||||
- BioGPT was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next token in a sequence. Leveraging this feature allows BioGPT to generate syntactically coherent text as it can be observed in the run_generation.py example script.
|
||||
- The model can take the `past_key_values` (for PyTorch) as input, which is the previously computed key/value attention pairs. Using this (past_key_values or past) value prevents the model from re-computing pre-computed values in the context of text generation. For PyTorch, see past_key_values argument of the BioGptForCausalLM.forward() method for more information on its usage.
|
||||
- The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have an effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
|
||||
|
||||
### Using Scaled Dot Product Attention (SDPA)
|
||||
|
||||
|
@ -53,6 +53,7 @@ The original code for vision can be found [here](https://github.com/facebookrese
|
||||
- For Data2VecAudio, preprocessing is identical to [`Wav2Vec2Model`], including feature extraction
|
||||
- For Data2VecText, preprocessing is identical to [`RobertaModel`], including tokenization.
|
||||
- For Data2VecVision, preprocessing is identical to [`BeitModel`], including feature extraction.
|
||||
- The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have an effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
|
||||
|
||||
### Using Scaled Dot Product Attention (SDPA)
|
||||
|
||||
|
@ -174,6 +174,10 @@ for i, image in enumerate(images['pixel_values']):
|
||||
[[autodoc]] Emu3TextModel
|
||||
- forward
|
||||
|
||||
## Emu3Model
|
||||
|
||||
[[autodoc]] Emu3Model
|
||||
|
||||
## Emu3ForCausalLM
|
||||
|
||||
[[autodoc]] Emu3ForCausalLM
|
||||
|
@ -103,6 +103,10 @@ The `LlamaTokenizer` is used as it is a standard wrapper around sentencepiece.
|
||||
|
||||
[[autodoc]] FuyuConfig
|
||||
|
||||
## FuyuModel
|
||||
|
||||
[[autodoc]] FuyuModel
|
||||
|
||||
## FuyuForCausalLM
|
||||
|
||||
[[autodoc]] FuyuForCausalLM
|
||||
|
@ -254,6 +254,10 @@ visualizer("<img>What is shown in this image?")
|
||||
[[autodoc]] Gemma3TextModel
|
||||
- forward
|
||||
|
||||
## Gemma3Model
|
||||
|
||||
[[autodoc]] Gemma3Model
|
||||
|
||||
## Gemma3ForCausalLM
|
||||
|
||||
[[autodoc]] Gemma3ForCausalLM
|
||||
|
@ -277,6 +277,10 @@ alt="drawing" width="600"/>
|
||||
|
||||
[[autodoc]] GotOcr2Processor
|
||||
|
||||
## GotOcr2Model
|
||||
|
||||
[[autodoc]] GotOcr2Model
|
||||
|
||||
## GotOcr2ForConditionalGeneration
|
||||
|
||||
[[autodoc]] GotOcr2ForConditionalGeneration
|
||||
|
@ -46,8 +46,12 @@ The main differences compared to GPT2.
|
||||
- Merge the key and value caches into one (this changes the format of layer_past/ present, does it risk creating problems?)
|
||||
- Use the memory layout `(self.num_heads, 3, self.head_dim)` instead of `(3, self.num_heads, self.head_dim)` for the QKV tensor with MHA (prevents an overhead with the merged key and values, but makes the checkpoints incompatible with the original openai-community/gpt2 model).
|
||||
|
||||
|
||||
You can read more about the optimizations in the [original pull request](https://github.com/huggingface/transformers/pull/22575)
|
||||
|
||||
> [!NOTE]
|
||||
> The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have an effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
|
||||
|
||||
## Combining Starcoder and Flash Attention 2
|
||||
|
||||
First, make sure to install the latest version of Flash Attention 2 to include the sliding window attention feature.
|
||||
|
@ -50,7 +50,7 @@ This model was contributed by [patrickvonplaten](https://huggingface.co/patrickv
|
||||
- Hubert is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
|
||||
- Hubert model was fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded
|
||||
using [`Wav2Vec2CTCTokenizer`].
|
||||
|
||||
- The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have an effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
|
||||
|
||||
## Using Flash Attention 2
|
||||
|
||||
|
@ -69,6 +69,10 @@ The attributes can be obtained from model config, as `model.config.num_query_tok
|
||||
[[autodoc]] InstructBlipQFormerModel
|
||||
- forward
|
||||
|
||||
## InstructBlipModel
|
||||
|
||||
[[autodoc]] InstructBlipModel
|
||||
|
||||
## InstructBlipForConditionalGeneration
|
||||
|
||||
[[autodoc]] InstructBlipForConditionalGeneration
|
||||
|
@ -58,6 +58,12 @@ The attributes can be obtained from model config, as `model.config.num_query_tok
|
||||
|
||||
[[autodoc]] InstructBlipVideoProcessor
|
||||
|
||||
|
||||
## InstructBlipVideoVideoProcessor
|
||||
|
||||
[[autodoc]] InstructBlipVideoVideoProcessor
|
||||
- preprocess
|
||||
|
||||
## InstructBlipVideoImageProcessor
|
||||
|
||||
[[autodoc]] InstructBlipVideoImageProcessor
|
||||
@ -73,6 +79,10 @@ The attributes can be obtained from model config, as `model.config.num_query_tok
|
||||
[[autodoc]] InstructBlipVideoQFormerModel
|
||||
- forward
|
||||
|
||||
## InstructBlipVideoModel
|
||||
[[autodoc]] InstructBlipVideoModel
|
||||
- forward
|
||||
|
||||
## InstructBlipVideoForConditionalGeneration
|
||||
|
||||
[[autodoc]] InstructBlipVideoForConditionalGeneration
|
||||
|
@ -340,6 +340,11 @@ This example showcases how to handle a batch of chat conversations with interlea
|
||||
[[autodoc]] InternVLVisionModel
|
||||
- forward
|
||||
|
||||
## InternVLModel
|
||||
|
||||
[[autodoc]] InternVLModel
|
||||
- forward
|
||||
|
||||
## InternVLForConditionalGeneration
|
||||
|
||||
[[autodoc]] InternVLForConditionalGeneration
|
||||
@ -348,3 +353,7 @@ This example showcases how to handle a batch of chat conversations with interlea
|
||||
## InternVLProcessor
|
||||
|
||||
[[autodoc]] InternVLProcessor
|
||||
|
||||
## InternVLVideoProcessor
|
||||
|
||||
[[autodoc]] InternVLVideoProcessor
|
||||
|
@ -256,6 +256,10 @@ A list of official Hugging Face and community (indicated by 🌎) resources to h
|
||||
|
||||
[[autodoc]] LlavaProcessor
|
||||
|
||||
## LlavaModel
|
||||
|
||||
[[autodoc]] LlavaModel
|
||||
|
||||
## LlavaForConditionalGeneration
|
||||
|
||||
[[autodoc]] LlavaForConditionalGeneration
|
||||
|
@ -315,6 +315,10 @@ model = AutoModelForImageTextToText.from_pretrained(
|
||||
|
||||
[[autodoc]] LlavaNextProcessor
|
||||
|
||||
## LlavaNextModel
|
||||
|
||||
[[autodoc]] LlavaNextModel
|
||||
|
||||
## LlavaNextForConditionalGeneration
|
||||
|
||||
[[autodoc]] LlavaNextForConditionalGeneration
|
||||
|
@ -262,6 +262,14 @@ model = LlavaNextVideoForConditionalGeneration.from_pretrained(
|
||||
|
||||
[[autodoc]] LlavaNextVideoImageProcessor
|
||||
|
||||
## LlavaNextVideoVideoProcessor
|
||||
|
||||
[[autodoc]] LlavaNextVideoVideoProcessor
|
||||
|
||||
## LlavaNextVideoModel
|
||||
|
||||
[[autodoc]] LlavaNextVideoModel
|
||||
|
||||
## LlavaNextVideoForConditionalGeneration
|
||||
|
||||
[[autodoc]] LlavaNextVideoForConditionalGeneration
|
||||
|
@ -303,6 +303,7 @@ model = LlavaOnevisionForConditionalGeneration.from_pretrained(
|
||||
## LlavaOnevisionImageProcessor
|
||||
|
||||
[[autodoc]] LlavaOnevisionImageProcessor
|
||||
- preprocess
|
||||
|
||||
## LlavaOnevisionImageProcessorFast
|
||||
|
||||
@ -313,6 +314,14 @@ model = LlavaOnevisionForConditionalGeneration.from_pretrained(
|
||||
|
||||
[[autodoc]] LlavaOnevisionVideoProcessor
|
||||
|
||||
## LlavaOnevisionVideoProcessor
|
||||
|
||||
[[autodoc]] LlavaOnevisionVideoProcessor
|
||||
|
||||
## LlavaOnevisionModel
|
||||
|
||||
[[autodoc]] LlavaOnevisionModel
|
||||
|
||||
## LlavaOnevisionForConditionalGeneration
|
||||
|
||||
[[autodoc]] LlavaOnevisionForConditionalGeneration
|
||||
|
@ -51,6 +51,9 @@ multilingual it expects the sequences in a certain format: A special language id
|
||||
source and target text. The source text format is `[lang_code] X [eos]`, where `lang_code` is source language
|
||||
id for source text and target language id for target text, with `X` being the source or target text.
|
||||
|
||||
> [!NOTE]
|
||||
> The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have an effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
|
||||
|
||||
The [`M2M100Tokenizer`] depends on `sentencepiece` so be sure to install it before running the
|
||||
examples. To install `sentencepiece` run `pip install sentencepiece`.
|
||||
|
||||
|
@ -35,6 +35,9 @@ You can find all the original mBART checkpoints under the [AI at Meta](https://h
|
||||
> [!TIP]
|
||||
> Click on the mBART models in the right sidebar for more examples of applying mBART to different language tasks.
|
||||
|
||||
> [!NOTE]
|
||||
> The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have an effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
|
||||
|
||||
The example below demonstrates how to translate text with [`Pipeline`] or the [`AutoModel`] class.
|
||||
|
||||
<hfoptions id="usage">
|
||||
|
@ -227,6 +227,9 @@ This example also how to use `BitsAndBytes` to load the model in 4bit quantizati
|
||||
|
||||
[[autodoc]] Mistral3Config
|
||||
|
||||
## Mistral3Model
|
||||
|
||||
[[autodoc]] Mistral3Model
|
||||
|
||||
## Mistral3ForConditionalGeneration
|
||||
|
||||
|
@ -130,6 +130,10 @@ print(processor.decode(output[0], skip_special_tokens=True))
|
||||
[[autodoc]] MllamaTextModel
|
||||
- forward
|
||||
|
||||
## MllamaModel
|
||||
|
||||
[[autodoc]] MllamaModel
|
||||
|
||||
## MllamaForCausalLM
|
||||
|
||||
[[autodoc]] MllamaForCausalLM
|
||||
|
@ -62,6 +62,9 @@ python src/transformers/models/musicgen/convert_musicgen_transformers.py \
|
||||
--checkpoint small --pytorch_dump_folder /output/path --safe_serialization
|
||||
```
|
||||
|
||||
> [!NOTE]
|
||||
> The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have an effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
|
||||
|
||||
## Generation
|
||||
|
||||
MusicGen is compatible with two generation modes: greedy and sampling. In practice, sampling leads to significantly
|
||||
|
@ -44,6 +44,9 @@ There are two key differences with MusicGen:
|
||||
1. The audio prompt is used here as a conditional signal for the generated audio sample, whereas it's used for audio continuation in [MusicGen](https://huggingface.co/docs/transformers/main/en/model_doc/musicgen).
|
||||
2. Conditional text and audio signals are concatenated to the decoder's hidden states instead of being used as a cross-attention signal, as in MusicGen.
|
||||
|
||||
> [!NOTE]
|
||||
> The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have an effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
|
||||
|
||||
## Generation
|
||||
|
||||
MusicGen Melody is compatible with two generation modes: greedy and sampling. In practice, sampling leads to significantly better results than greedy, thus we encourage sampling mode to be used where possible. Sampling is enabled by default, and can be explicitly specified by setting `do_sample=True` in the call to [`MusicgenMelodyForConditionalGeneration.generate`], or by overriding the model's generation config (see below).
|
||||
|
@ -41,6 +41,9 @@ Tips:
|
||||
- OPT has the same architecture as [`BartDecoder`].
|
||||
- Contrary to GPT2, OPT adds the EOS token `</s>` to the beginning of every prompt.
|
||||
|
||||
> [!NOTE]
|
||||
> The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have an effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
|
||||
|
||||
## Resources
|
||||
|
||||
A list of official Hugging Face and community (indicated by 🌎) resources to help you get started with OPT. If you're
|
||||
|
@ -174,6 +174,10 @@ visualizer("<img> What is in this image?")
|
||||
|
||||
[[autodoc]] PaliGemmaProcessor
|
||||
|
||||
## PaliGemmaModel
|
||||
|
||||
[[autodoc]] PaliGemmaModel
|
||||
|
||||
## PaliGemmaForConditionalGeneration
|
||||
|
||||
[[autodoc]] PaliGemmaForConditionalGeneration
|
||||
|
@ -240,6 +240,10 @@ model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
||||
|
||||
[[autodoc]] Qwen2_5_VLProcessor
|
||||
|
||||
## Qwen2_5_VLTextModel
|
||||
|
||||
[[autodoc]] Qwen2_5_VLTextModel
|
||||
- forward
|
||||
|
||||
## Qwen2_5_VLModel
|
||||
|
||||
|
@ -40,6 +40,9 @@ The abstract from the paper is the following:
|
||||
|
||||
`Qwen2-Audio-7B` and `Qwen2-Audio-7B-Instruct` can be found on the [Huggingface Hub](https://huggingface.co/Qwen)
|
||||
|
||||
> [!NOTE]
|
||||
> The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have an effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
|
||||
|
||||
### Inference
|
||||
|
||||
```python
|
||||
|
@ -287,6 +287,11 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
|
||||
[[autodoc]] Qwen2VLImageProcessor
|
||||
- preprocess
|
||||
|
||||
## Qwen2VLVideoProcessor
|
||||
|
||||
[[autodoc]] Qwen2VLVideoProcessor
|
||||
- preprocess
|
||||
|
||||
## Qwen2VLImageProcessorFast
|
||||
|
||||
[[autodoc]] Qwen2VLImageProcessorFast
|
||||
@ -296,6 +301,11 @@ model = Qwen2VLForConditionalGeneration.from_pretrained(
|
||||
|
||||
[[autodoc]] Qwen2VLProcessor
|
||||
|
||||
## Qwen2VLTextModel
|
||||
|
||||
[[autodoc]] Qwen2VLTextModel
|
||||
- forward
|
||||
|
||||
## Qwen2VLModel
|
||||
|
||||
[[autodoc]] Qwen2VLModel
|
||||
|
@ -23,6 +23,7 @@ rendered properly in your Markdown viewer.
|
||||
">
|
||||
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
</div>
|
||||
|
||||
## Overview
|
||||
|
||||
The RoBERTa model was proposed in [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, [Myle Ott](https://huggingface.co/myleott), Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer
|
||||
|
@ -43,8 +43,8 @@ import requests
|
||||
from transformers import SamHQModel, SamHQProcessor
|
||||
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
model = SamHQModel.from_pretrained("sushmanth/sam_hq_vit_b").to(device)
|
||||
processor = SamHQProcessor.from_pretrained("sushmanth/sam_hq_vit_b")
|
||||
model = SamHQModel.from_pretrained("syscv-community/sam-hq-vit-base").to(device)
|
||||
processor = SamHQProcessor.from_pretrained("syscv-community/sam-hq-vit-base")
|
||||
|
||||
img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
|
||||
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
|
||||
@ -69,8 +69,8 @@ import requests
|
||||
from transformers import SamHQModel, SamHQProcessor
|
||||
|
||||
device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
model = SamHQModel.from_pretrained("sushmanth/sam_hq_vit_b").to(device)
|
||||
processor = SamHQProcessor.from_pretrained("sushmanth/sam_hq_vit_b")
|
||||
model = SamHQModel.from_pretrained("syscv-community/sam-hq-vit-base").to(device)
|
||||
processor = SamHQProcessor.from_pretrained("syscv-community/sam-hq-vit-base")
|
||||
|
||||
img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
|
||||
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
|
||||
|
@ -46,6 +46,9 @@ This model was contributed by [anton-l](https://huggingface.co/anton-l).
|
||||
- SEWForCTC is fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded using
|
||||
[`Wav2Vec2CTCTokenizer`].
|
||||
|
||||
> [!NOTE]
|
||||
> The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have an effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
|
||||
|
||||
## Resources
|
||||
|
||||
- [Audio classification task guide](../tasks/audio_classification)
|
||||
|
@ -197,6 +197,9 @@ print(generated_texts[0])
|
||||
[[autodoc]] SmolVLMImageProcessor
|
||||
- preprocess
|
||||
|
||||
## SmolVLMVideoProcessor
|
||||
[[autodoc]] SmolVLMVideoProcessor
|
||||
- preprocess
|
||||
|
||||
## SmolVLMProcessor
|
||||
[[autodoc]] SmolVLMProcessor
|
||||
|
@ -50,6 +50,11 @@ A demo Space for image super-resolution with SwinSR can be found [here](https://
|
||||
[[autodoc]] Swin2SRImageProcessor
|
||||
- preprocess
|
||||
|
||||
## Swin2SRImageProcessorFast
|
||||
|
||||
[[autodoc]] Swin2SRImageProcessorFast
|
||||
- preprocess
|
||||
|
||||
## Swin2SRConfig
|
||||
|
||||
[[autodoc]] Swin2SRConfig
|
||||
|
@ -54,6 +54,9 @@ found [here](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech-SAT).
|
||||
decoded using [`Wav2Vec2CTCTokenizer`].
|
||||
- UniSpeechSat performs especially well on speaker verification, speaker identification, and speaker diarization tasks.
|
||||
|
||||
> [!NOTE]
|
||||
> The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have an effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
|
||||
|
||||
## Resources
|
||||
|
||||
- [Audio classification task guide](../tasks/audio_classification)
|
||||
|
@ -49,6 +49,9 @@ found [here](https://github.com/microsoft/UniSpeech/tree/main/UniSpeech).
|
||||
- UniSpeech model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be
|
||||
decoded using [`Wav2Vec2CTCTokenizer`].
|
||||
|
||||
> [!NOTE]
|
||||
> The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have an effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
|
||||
|
||||
## Resources
|
||||
|
||||
- [Audio classification task guide](../tasks/audio_classification)
|
||||
|
@ -211,10 +211,19 @@ model = VideoLlavaForConditionalGeneration.from_pretrained(
|
||||
|
||||
[[autodoc]] VideoLlavaImageProcessor
|
||||
|
||||
|
||||
## VideoLlavaVideoProcessor
|
||||
|
||||
[[autodoc]] VideoLlavaVideoProcessor
|
||||
|
||||
## VideoLlavaProcessor
|
||||
|
||||
[[autodoc]] VideoLlavaProcessor
|
||||
|
||||
## VideoLlavaModel
|
||||
|
||||
[[autodoc]] VideoLlavaModel
|
||||
|
||||
## VideoLlavaForConditionalGeneration
|
||||
|
||||
[[autodoc]] VideoLlavaForConditionalGeneration
|
||||
|
@ -72,6 +72,11 @@ This model was contributed by [nielsr](https://huggingface.co/nielsr). The origi
|
||||
[[autodoc]] ViltImageProcessor
|
||||
- preprocess
|
||||
|
||||
## ViltImageProcessorFast
|
||||
|
||||
[[autodoc]] ViltImageProcessorFast
|
||||
- preprocess
|
||||
|
||||
## ViltProcessor
|
||||
|
||||
[[autodoc]] ViltProcessor
|
||||
|
@ -101,6 +101,10 @@ A chat between a curious human and an artificial intelligence assistant. The ass
|
||||
|
||||
[[autodoc]] VipLlavaConfig
|
||||
|
||||
## VipLlavaModel
|
||||
|
||||
[[autodoc]] VipLlavaModel
|
||||
|
||||
## VipLlavaForConditionalGeneration
|
||||
|
||||
[[autodoc]] VipLlavaForConditionalGeneration
|
||||
|
@ -50,6 +50,9 @@ Note: Meta (FAIR) released a new version of [Wav2Vec2-BERT 2.0](https://huggingf
|
||||
- Wav2Vec2 model was trained using connectionist temporal classification (CTC) so the model output has to be decoded
|
||||
using [`Wav2Vec2CTCTokenizer`].
|
||||
|
||||
> [!NOTE]
|
||||
> The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have an effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
|
||||
|
||||
## Using Flash Attention 2
|
||||
|
||||
Flash Attention 2 is a faster, optimized version of the model.
|
||||
|
@ -32,6 +32,9 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
You can find all the original Whisper checkpoints under the [Whisper](https://huggingface.co/collections/openai/whisper-release-6501bba2cf999715fd953013) collection.
|
||||
|
||||
> [!NOTE]
|
||||
> The `head_mask` argument is ignored when using any attention implementation other than "eager". If you have a `head_mask` and want it to have an effect, load the model with `XXXModel.from_pretrained(model_id, attn_implementation="eager")`
|
||||
|
||||
> [!TIP]
|
||||
> Click on the Whisper models in the right sidebar for more examples of how to apply Whisper to different audio tasks.
|
||||
|
||||
|
@ -54,8 +54,8 @@ For each model type, there is a separate class for each machine learning framewo
|
||||
from transformers import AutoModelForCausalLM, MistralForCausalLM
|
||||
|
||||
# load with AutoClass or model-specific class
|
||||
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", , torch_dtype="auto", device_map="auto")
|
||||
model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", , torch_dtype="auto", device_map="auto")
|
||||
model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype="auto", device_map="auto")
|
||||
model = MistralForCausalLM.from_pretrained("mistralai/Mistral-7B-v0.1", torch_dtype="auto", device_map="auto")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
@ -272,6 +272,7 @@ Explicitly set the [torch_dtype](https://pytorch.org/docs/stable/tensor_attribut
|
||||
<hfoption id="specific dtype">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
gemma = AutoModelForCausalLM.from_pretrained("google/gemma-7b", torch_dtype=torch.float16)
|
||||
|
@ -13,9 +13,15 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
# Distributed GPU inference
|
||||
# Tensor parallelism in transformers
|
||||
|
||||
[Tensor parallelism](./perf_train_gpu_many#tensor-parallelism) shards a model onto multiple GPUs and parallelizes computations such as matrix multiplication. It enables fitting larger model sizes into memory and is faster because each GPU can process a tensor slice.
|
||||
This document assumes that you are already familiar with the basics of tensor parallelism. If you are not, please refer to the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) section on tensor parallelism.
|
||||
|
||||
> [!TIP]
|
||||
> Tensor parallelism is very communication intensive, therefore it is recommended to use it on a single machine with multiple GPUs, utilizing fast intra-node communication. For multi-node training, methods such as pipeline or data parallelism are more efficient (depending on your use case).
|
||||
|
||||
Tensor parallelism requires slight changes to the model parameters; therefore, in Transformers, we support some of the popular models out of the box.
|
||||
|
||||
> [!TIP]
|
||||
> Expand the list below to see which models support tensor parallelism. Open a GitHub issue or pull request to add support for a model not currently listed below.
|
||||
@ -37,9 +43,218 @@ rendered properly in your Markdown viewer.
|
||||
|
||||
</details>
|
||||
|
||||
Set `tp_plan="auto"` in [`~AutoModel.from_pretrained`] to enable tensor parallelism for inference.
|
||||
## Using 🤗 transformers
|
||||
|
||||
```py
|
||||
Transformers provides a simple interface to use for tensor parallelism. We provide multiple classes implementing different partitioning
|
||||
strategies and a simple entrypoint to parallelize an `nn.Module` instance. You won't have to interact with this interface directly; everything is done for you in the `PreTrainedModel.from_pretrained` method. This section will first talk about the partitioning strategies
|
||||
we support, then the user interface you will be interacting with, and finally it will teach you how to extend it with your own partitioning
|
||||
strategies.
|
||||
|
||||
### Partitioning strategies
|
||||
|
||||
In Transformers, partitioning strategies reside in the `ParallelInterface` class, which works like a mapping from a string to the strategy implementation.
|
||||
|
||||
|
||||
```python
|
||||
class ParallelInterface(MutableMapping):
|
||||
"""
|
||||
Dict-like object keeping track of allowed attention functions. You can easily add a new attention function
|
||||
with a call to `register()`. If a model needs to locally overwrite an existing attention function, say `sdpa`,
|
||||
it needs to declare a new instance of this class inside the `modeling_<model>.py`, and declare it on that instance.
|
||||
"""
|
||||
_global_mapping = {
|
||||
"colwise": ColwiseParallel(),
|
||||
"rowwise": RowwiseParallel(),
|
||||
"colwise_rep": ColwiseParallel(output_layouts=Replicate()),
|
||||
"rowwise_rep": RowwiseParallel(input_layouts=Replicate()),
|
||||
"local_colwise": ColwiseParallel(use_dtensor=False),
|
||||
"local_rowwise": RowwiseParallel(use_dtensor=False),
|
||||
"local": IsolatedParallel(),
|
||||
"gather": GatherParallel(),
|
||||
"local_packed_rowwise": PackedRowwiseParallel(use_dtensor=False),
|
||||
"sequence_parallel": SequenceParallel(),
|
||||
"replicate": ReplicateParallel(),
|
||||
}
|
||||
```
|
||||
|
||||
We support the following strategies:
|
||||
|
||||
- `ColwiseParallel` - A simple column-wise partitioning that can handle both weights and biases; it does exactly what we've discussed before.
|
||||
- `RowwiseParallel` - Row-wise partitioning as discussed before; supports weights and biases, and on top of that also supports `nn.Embedding` modules.
|
||||
- `SequenceParallel` - Sequence parallel implementation, for support of `LayerNorm` and `Dropout` layers. Also supports Python implementation of `RMSNorm` (see [this](https://github.com/facebookresearch/llama/blob/main/llama/model.py#L34))
|
||||
- `PackedColwiseParallel` - A variant of column-wise partitioning, however it works on packed weights (i.e. `up_proj` and `gate_proj` being packed together). For more details, see [this comment](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108)
|
||||
- `PackedRowwiseParallel` - A variant of row-wise partitioning that works on packed weights; for more details, check the comment linked above.
|
||||
- `GatherParallel` - A very simple class that only gathers the outputs of the module across devices.
|
||||
- `IsolatedParallel` - This is a special case where we want to *isolate* the module from the rest of the devices (the world). This is used for experts in MoE layers, essentially creating a form of expert parallelism.
|
||||
- `ReplicateParallel` - Many `torch.distributed` APIs break if the model is partially sharded, so this class is used to replicate the module across all devices.
|
||||
|
||||
### Sharding a model
|
||||
|
||||
We provide two ways to shard a model. The first one is to use the `auto` tensor parallelism plan, which automatically shards the model based on a predefined configuration. This requires the model to have a predefined tensor parallel plan in Transformers.
|
||||
|
||||
```python
|
||||
import torch
from transformers import AutoModelForCausalLM
|
||||
|
||||
# model_id = "meta-llama/Meta-Llama-3-8B-Instruct" # better for smaller number of GPUs
|
||||
model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct" # better to visualize all the possible strategies
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan="auto")
|
||||
|
||||
print(model._tp_plan)
|
||||
```
|
||||
|
||||
> [!TIP]
|
||||
> For a list of models that support tensor parallelism, see the [Supported models](#supported-models) section above.
|
||||
|
||||
The second way is to manually specify your own partitioning plan.
|
||||
|
||||
```python
|
||||
import torch
from transformers import AutoModelForCausalLM

model_id = "meta-llama/Llama-4-Scout-17B-16E-Instruct"  # same checkpoint as the previous example
|
||||
|
||||
tp_plan = {
|
||||
"model.layers.*.self_attn.q_proj": "colwise",
|
||||
"model.layers.*.self_attn.k_proj": "colwise",
|
||||
"model.layers.*.self_attn.v_proj": "colwise",
|
||||
"model.layers.*.self_attn.o_proj": "rowwise",
|
||||
...
|
||||
}
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan=tp_plan)
|
||||
|
||||
print(model._tp_plan)
|
||||
```
|
||||
|
||||
You might have noticed that there are some special cases in the `ParallelInterface` mapping. Let's now talk about them; this will help you understand their purpose and will help when extending the interface with other strategies.
|
||||
|
||||
### PackedRowwiseParallel
|
||||
This class is a special case of `RowwiseParallel` used to shard packed weights. Weight packing is a common technique where multiple linear layers are packed into a single, bigger one.
|
||||
|
||||
For example, in the `Llama4` model, we pack `up_proj` and `gate_proj` into a single `gate_up_proj` module.
|
||||
```python
|
||||
class Llama4TextExperts(nn.Module):
|
||||
...
|
||||
self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_size, 2 * self.expert_dim))
|
||||
```
|
||||
|
||||
Then in forward, we can use batch matrix multiplication to compute the output of the `gate_up_proj` module.
|
||||
|
||||
```python
|
||||
def forward(self, hidden_states):
|
||||
...
|
||||
gate_up = torch.bmm(hidden_states, self.gate_up_proj) # Compute the output of the gate_up_proj module
|
||||
gate, up = gate_up.chunk(2, dim=-1) # Split the output into gate and up
|
||||
```
|
||||
|
||||
In this case, we need to use the `PackedRowwiseParallel` strategy to shard the `gate_up_proj` module, as using a simple `RowwiseParallel` would shard the packed layers incorrectly.
|
||||
|
||||
> [!TIP]
|
||||
> If this is a bit difficult to wrap your head around, check out [this comment](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py#L79-#L108) for an amazing visual representation of why `Packed*` needs to be used.
|
||||
|
||||
|
||||
### `local*` strategies
|
||||
|
||||
You may have noticed that there are `local*` strategies, which use the same layer logic as their `*` counterparts but don't use `DTensor` at all.
|
||||
This is because `DTensor` is not supported for some operations, such as `torch.chunk`. Therefore, we sometimes need the `local*` strategies, which use vanilla `torch.Tensor` and handle some of the distributed logic manually.
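
As a sketch, opting into the `local*` variants is just a matter of swapping the strings in your plan; the layer names below follow the earlier hypothetical example.

```python
tp_plan = {
    "model.layers.*.self_attn.q_proj": "local_colwise",
    "model.layers.*.self_attn.k_proj": "local_colwise",
    "model.layers.*.self_attn.v_proj": "local_colwise",
    "model.layers.*.self_attn.o_proj": "local_rowwise",
}
```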
|
||||
|
||||
<!---
|
||||
Readd this when I get the exact error message
|
||||
> [!TIP]
|
||||
> If you are using a custom partitioning strategy, and it's not working with `... is not supported` error, try using the `local*` strategies to see if they work better.
|
||||
-->
|
||||
|
||||
> [!WARNING]
|
||||
> Manually specifying your own partitioning plan requires a good understanding of the model architecture and how the partitioning strategies interact together. If you are not sure about this, the resulting model can be very slow, or even fail or be incorrect. Again, refer to the [Ultra-Scale Playbook](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=tensor_parallelism) which can teach you everything required.
|
||||
|
||||
### Extending the interface with your own partitioning strategies
|
||||
|
||||
This is a very advanced topic, which requires a good understanding of distributed collectives and the model architecture.
|
||||
Your custom partitioning strategy should inherit from `TensorParallelLayer` defined in [integrations/tensor_parallel.py](https://github.com/huggingface/transformers/blob/main/src/transformers/integrations/tensor_parallel.py) and implement: `partition_tensor`, `_prepare_input_fn` and `_prepare_output_fn`. Then it should be registered in the `ParallelInterface` mapping, so our dispatching logic can find it when specified in the `tp_plan`.
|
||||
|
||||
Let's go through this workflow step by step, on an already existing example: `ColwiseParallel`.
|
||||
|
||||
1. Inherit from `TensorParallelLayer` and initialization
|
||||
|
||||
```python
|
||||
class ColwiseParallel(TensorParallelLayer):
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
input_layouts: Optional[Placement] = None, # The input layout coming from the previous layer
|
||||
output_layouts: Optional[Placement] = None, # The output layout we want to achieve
|
||||
use_local_output: bool = True, # Whether to use local output or not
|
||||
use_dtensor=True, # Whether to use DTensor or not
|
||||
):
|
||||
self.input_layouts = (input_layouts or Replicate(),) # The input sharding coming from the previous layer
|
||||
self.output_layouts = (output_layouts or Shard(-1),) # Desired output sharding
|
||||
self.desired_input_layouts = (Replicate(),) # Desired input sharding, inputs should be replicated across GPUs
|
||||
self.use_local_output = use_local_output
|
||||
self.use_dtensor = use_dtensor
|
||||
```
|
||||
|
||||
In the `__init__` method, we define these attributes, where `input_layouts` and `output_layouts` describe how the input and output tensors should be placed on the devices. `desired_input_layouts` is used to specify how the input *SHOULD* be placed on the devices.
|
||||
|
||||
2a. Implement `partition_tensor` method
|
||||
|
||||
```python
|
||||
def partition_tensor(
|
||||
self,
|
||||
param, # Full tensor of the parameter
|
||||
empty_param, # Empty tensor of the parameter, will be filled with the partitioned tensor
|
||||
param_type, # Type of the parameter, `bias` or `weight`
|
||||
param_casting_dtype, # The type to cast the parameter to
|
||||
to_contiguous, # Whether to convert the tensor to a contiguous memory layout
|
||||
rank, # The rank of the current device
|
||||
device_mesh, # The device mesh
|
||||
) -> nn.Parameter: # Return the partitioned parameter
|
||||
...
|
||||
```
|
||||
|
||||
This method is used to partition the tensor, and fill the `empty_param` with the partitioned tensor.
|
||||
We provide some utility functions to help you with this, such as `get_tensor_shard`, which will get you the correct shard of the original parameter for this rank, or `get_packed_weights` to help with packed weights.
|
||||
|
||||
2b. Implement `_prepare_input_fn` and `_prepare_output_fn` methods
|
||||
|
||||
These methods are used as [`pre-forward`](https://docs.pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_pre_hook.html) and [`forward`](https://docs.pytorch.org/docs/stable/generated/torch.nn.modules.module.register_module_forward_hook.html) hooks respectively. Their purpose is to re-distribute the inputs and outputs to the desired layout, passed in the `__init__` method.
|
||||
|
||||
```python
|
||||
def _prepare_input_fn(input_layouts, desired_input_layouts, mod, inputs, device_mesh):
|
||||
...
|
||||
# Do some custom logic, cast to DTensor etc.
|
||||
...
|
||||
return inputs.redistribute(placements=desired_input_layouts, device_mesh=device_mesh)
|
||||
|
||||
def _prepare_output_fn(output_layouts, use_local_output, mod, outputs, device_mesh):
|
||||
...
|
||||
# Do some custom logic, cast to DTensor etc.
|
||||
...
|
||||
return outputs.redistribute(placements=output_layouts, device_mesh=device_mesh)
|
||||
```
|
3. Register the strategy

Congratulations! You've implemented your own partitioning strategy. Now, to use it with your own `tp_plan`, you need to register it in the `ParallelInterface` mapping.

```python
from transformers.integrations.tensor_parallel import ParallelInterface

ParallelInterface.register_strategy("colwise_custom", ColwiseParallel)
```

And now you can use it in your `tp_plan` as follows:

```python
tp_plan = {
    "model.layers.*.self_attn.q_proj": "colwise_custom",
    ...
}

model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan=tp_plan)
```
## Full example

Let's go through a full example of inference with tensor parallelism.

```python
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
```

@ -66,17 +281,49 @@ Launch the inference script above on [torchrun](https://pytorch.org/docs/stable/

```
torchrun --nproc-per-node 4 demo.py
```
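The hunk above elides the body of the example script, so it is not reproduced here. As a rough idea of what such a `demo.py` could look like (a hedged sketch under the assumption that the checkpoint ships a predefined `tp_plan`, not the original file), tensor-parallel inference boils down to:

```python
# demo.py -- illustrative sketch, not the original example script
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "meta-llama/Meta-Llama-3-8B-Instruct"  # assumed checkpoint with a predefined tp_plan

tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.bfloat16, tp_plan="auto")

inputs = tokenizer("Tensor parallelism splits each layer across GPUs,", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=20)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```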
For CPU, please bind a different socket on each rank. For example, if you are using Intel 4th Gen Xeon:
```bash
export OMP_NUM_THREADS=56
numactl -C 0-55 -m 0 torchrun --nnodes=2 --node_rank=0 --master_addr="127.0.0.1" --master_port=29500 --nproc-per-node 1 demo.py & numactl -C 56-111 -m 1 torchrun --nnodes=2 --node_rank=1 --master_addr="127.0.0.1" --master_port=29500 --nproc-per-node 1 demo.py & wait
```
The CPU benchmark data will be released soon.
You can benefit from considerable speed ups for inference, especially for inputs with large batch size or long sequences.

For a single forward pass on [Llama](./model_doc/llama) with a sequence length of 512 and various batch sizes, you can expect the following speed ups.

<div style="text-align: center">
    <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/Meta-Llama-3-8B-Instruct%2C%20seqlen%20%3D%20512%2C%20python%2C%20w_%20compile.png">
</div>
## Tensor parallelism in-depth

Our implementation of tensor parallelism is framework-agnostic in design, but the specific implementations we've developed rely on the `torch.distributed` package. We heavily utilize abstractions such as `DeviceMesh` and `DTensor` to provide a simple and extensible interface to the user.

### DeviceMesh

Imagine `DeviceMesh` as a multi-dimensional grid of devices that communicate together. Different parallelization strategies require different types of communication patterns, so we can create a `DeviceMesh` with multiple submeshes:
```python
from torch.distributed.device_mesh import init_device_mesh

# Create a 1D mesh of 4 GPUs
device_mesh = init_device_mesh("cuda", (4,), mesh_dim_names=["tp"])
```
Then, most of the parallelization strategies defined in `torch.distributed` can be applied to the mesh itself or to one of its submeshes, with the communication patterns handled automatically.
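The example above creates a 1D mesh; as a brief sketch of the multi-submesh case mentioned here, a 2D mesh with separate data-parallel and tensor-parallel dimensions could look like this (the dimension names and sizes are illustrative):

```python
from torch.distributed.device_mesh import init_device_mesh

# 2D mesh over 4 GPUs: 2-way data parallel x 2-way tensor parallel
device_mesh = init_device_mesh("cuda", (2, 2), mesh_dim_names=("dp", "tp"))

# Each named dimension can be pulled out as a 1D submesh and handed to a different strategy.
dp_mesh = device_mesh["dp"]
tp_mesh = device_mesh["tp"]
```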
### DTensor

Short for Distributed Tensor, `DTensor` is a tensor subclass that handles the distributed logic on top of the usual tensor operations. Most of the model weights in the case of tensor parallelism are stored as `DTensor`s (with some exceptions, more on that later).
The most important concept to understand about `DTensor` is the `placement` attribute, which tells PyTorch how the tensor is placed on the devices of the `DeviceMesh`.

It can have the following values:

- `Shard(dimension)` - Annotates that this `DTensor` is sharded across a given dimension, over the `DeviceMesh` it was constructed under. For example, if we would like to shard weights for column-wise partitioning, we would do:
```python
weight = ...
weight = DTensor.from_local(weight, device_mesh["tp"], placements=[Shard(0)])  # Shard across the 1st (column-wise) dimension
bias = ...
bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Shard(-1)])  # Shard across the ONLY dimension
```

To give another example, for row-wise partitioning, we would do:
```python
weight = ...
weight = DTensor.from_local(weight, device_mesh["tp"], placements=[Shard(1)])  # Shard across the 2nd (row-wise) dimension
bias = ...
bias = DTensor.from_local(bias, device_mesh["tp"], placements=[Replicate()])  # Replicate bias across all GPUs
```

- `Replicate()` - Annotates that this `DTensor` is replicated across the `DeviceMesh`. Very straightforward, it only creates a full copy of the tensor on each device.
- `Partial()` - This placement is mostly of no interest to us; it's used to annotate that this tensor is pending a reduction operation.
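To make `Partial()` slightly less abstract, here is a hedged sketch of where it typically appears: the per-rank output of a row-wise sharded matmul is a partial sum, and redistributing it to `Replicate()` performs the pending all-reduce. The import path is the one used by recent PyTorch releases (`torch.distributed.tensor`) and may differ in older versions.

```python
import torch
from torch.distributed.tensor import DTensor, Partial, Replicate

# `partial_out` holds this rank's contribution to a row-wise matmul.
partial_out = torch.randn(4, 8)
dtensor = DTensor.from_local(partial_out, device_mesh["tp"], placements=[Partial()])

# Going from Partial to Replicate triggers the pending all-reduce,
# so every rank ends up with the full summed output.
full_out = dtensor.redistribute(placements=[Replicate()])
```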
@ -106,6 +106,8 @@ dataset[0]["text"]

Remember to resample the audio to match the pretrained model's required sampling rate.

```py
from datasets import Audio

dataset = dataset.cast_column("audio", Audio(sampling_rate=16000))
```
@ -16,7 +16,7 @@ rendered properly in your Markdown viewer.

# TorchScript

[TorchScript](https://pytorch.org/docs/stable/jit.html) serializes PyTorch models into programs that can be executed in non-Python processes. This is especially advantageous in production environments where Python may the most performant choice.
[TorchScript](https://pytorch.org/docs/stable/jit.html) serializes PyTorch models into programs that can be executed in non-Python processes. This is especially advantageous in production environments where Python may not be the most performant choice.

Transformers can export a model to TorchScript by:
@ -372,14 +372,14 @@ accelerate launch \

### torch.compile

[torch.compile](./perf_torch_compile) can significantly speed up training and reduce computational overhead. Configure your torch.compile settings in [`TrainingArguments`]. Set `torch.compile` to `True`, and select a backend and compile mode.
[torch.compile](./perf_torch_compile) can significantly speed up training and reduce computational overhead. Configure your torch.compile settings in [`TrainingArguments`]. Set `torch_compile` to `True`, and select a backend and compile mode.

```py
from transformers import TrainingArguments

training_args = TrainingArguments(
    torch.compile=True,
    torch.compile_backend="inductor",
    torch_compile=True,
    torch_compile_backend="inductor",
    torch_compile_mode="default",
    ...,
)
```
docs/source/en/video_processors.md (new file, 49 lines)
@ -0,0 +1,49 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# Video Processor

A **Video Processor** is a utility responsible for preparing input features for video models, as well as handling the post-processing of their outputs. It provides transformations such as resizing, normalization, and conversion into PyTorch tensors.

The video processor extends the functionality of image processors by allowing the models to handle videos with a distinct set of arguments compared to images. It serves as the bridge between raw video data and the model, ensuring that input features are optimized for the VLM.

Use [`~BaseVideoProcessor.from_pretrained`] to load a video processor's configuration (image size, whether to normalize and rescale, etc.) from a video model on the Hugging Face [Hub](https://hf.co) or a local directory. The configuration for each pretrained model should be saved in a `video_preprocessor_config.json` file, but older models might have the config saved in a [preprocessor_config.json](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf/blob/main/preprocessor_config.json) file. Note that the latter is less preferred and will be removed in the future.

### Usage Example
Here's an example of how to load a video processor with the [`llava-hf/llava-onevision-qwen2-0.5b-ov-hf`](https://huggingface.co/llava-hf/llava-onevision-qwen2-0.5b-ov-hf) model:

```python
from transformers import AutoVideoProcessor

processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
```

Currently, if you use a base image processor for videos, it processes video data by treating each frame as an individual image and applying transformations frame-by-frame. While functional, this approach is not highly efficient. Using `AutoVideoProcessor` allows us to take advantage of **fast video processors**, leveraging the [torchvision](https://pytorch.org/vision/stable/index.html) library. Fast processors handle the whole batch of videos at once, without iterating over each video or frame. These updates introduce GPU acceleration and significantly enhance processing speed, especially for tasks requiring high throughput.

Fast video processors are available for all models and are loaded by default when an `AutoVideoProcessor` is initialized. When using a fast video processor, you can also set the `device` argument to specify the device on which the processing should be done. By default, the processing is done on the same device as the inputs if the inputs are tensors, or on the CPU otherwise. For an even greater speed improvement, we can compile the processor when using `"cuda"` as the device.

```python
import torch
from transformers.video_utils import load_video
from transformers import AutoVideoProcessor

video = load_video("video.mp4")
processor = AutoVideoProcessor.from_pretrained("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", device="cuda")
processor = torch.compile(processor)
processed_video = processor(video, return_tensors="pt")
```
@ -105,6 +105,7 @@ BEiT の使用を開始するのに役立つ公式 Hugging Face およびコミ

[[autodoc]] BeitImageProcessor
    - preprocess
    - post_process_semantic_segmentation

## BeitImageProcessorFast
@ -157,5 +157,8 @@
title: 通用工具
- local: internal/time_series_utils
title: 时序数据工具
- sections:
- local: model_doc/bert
title: BERT
title: 内部辅助工具
title: 应用程序接口 (API)
title: 应用程序接口 (API)
docs/source/zh/model_doc/bert.md (new file, 258 lines)
@ -0,0 +1,258 @@
<!--Copyright 2020 The HuggingFace Team. All rights reserved.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
|
||||
the License. You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
|
||||
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
|
||||
specific language governing permissions and limitations under the License.
|
||||
|
||||
⚠️ Note that this file is in Markdown but contain specific syntax for our doc-builder (similar to MDX) that may not be
|
||||
rendered properly in your Markdown viewer.
|
||||
|
||||
-->
|
||||
|
||||
<div style="float: right;">
|
||||
<div class="flex flex-wrap space-x-1">
|
||||
<img alt="PyTorch" src="https://img.shields.io/badge/PyTorch-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
<img alt="TensorFlow" src="https://img.shields.io/badge/TensorFlow-FF6F00?style=flat&logo=tensorflow&logoColor=white">
|
||||
<img alt="Flax" src="https://img.shields.io/badge/Flax-29a79b.svg?style=flat&logo=data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAC0AAAAtCAMAAAANxBKoAAAC7lBMVEUAAADg5vYHPVgAoJH+/v76+v39/f9JbLP///9+AIgAnY3///+mcqzt8fXy9fgkXa3Ax9709fr+///9/f8qXq49qp5AaLGMwrv8/P0eW60VWawxYq8yqJzG2dytt9Wyu9elzci519Lf3O3S2efY3OrY0+Xp7PT///////+dqNCexMc6Z7AGpJeGvbenstPZ5ejQ1OfJzOLa7ejh4+/r8fT29vpccbklWK8PVa0AS6ghW63O498vYa+lsdKz1NDRt9Kw1c672tbD3tnAxt7R6OHp5vDe7OrDyuDn6vLl6/EAQKak0MgATakkppo3ZK/Bz9y8w9yzu9jey97axdvHzeG21NHH4trTwthKZrVGZLSUSpuPQJiGAI+GAI8SWKydycLL4d7f2OTi1+S9xNzL0ePT6OLGzeEAo5U0qJw/aLEAo5JFa7JBabEAp5Y4qZ2QxLyKmsm3kL2xoMOehrRNb7RIbbOZgrGre68AUqwAqZqNN5aKJ5N/lMq+qsd8kMa4pcWzh7muhLMEV69juq2kbKqgUaOTR5uMMZWLLZSGAI5VAIdEAH+ovNDHuNCnxcy3qcaYx8K8msGplrx+wLahjbYdXrV6vbMvYK9DrZ8QrZ8tqJuFms+Sos6sw8ecy8RffsNVeMCvmb43aLltv7Q4Y7EZWK4QWa1gt6meZKUdr6GOAZVeA4xPAISyveLUwtivxtKTpNJ2jcqfvcltiMiwwcfAoMVxhL+Kx7xjdrqTe60tsaNQs6KaRKACrJ6UTZwkqpqTL5pkHY4AloSgsd2ptNXPvNOOncuxxsqFl8lmg8apt8FJcr9EbryGxLqlkrkrY7dRa7ZGZLQ5t6iXUZ6PPpgVpZeJCJFKAIGareTa0+KJod3H0deY2M+esM25usmYu8d2zsJOdcBVvrCLbqcAOaaHaKQAMaScWqKBXqCXMJ2RHpiLF5NmJZAdAHN2kta11dKu1M+DkcZLdb+Mcql3TppyRJdzQ5ZtNZNlIY+DF4+voCOQAAAAZ3RSTlMABAT+MEEJ/RH+/TP+Zlv+pUo6Ifz8+fco/fz6+evr39S9nJmOilQaF/7+/f38+smmoYp6b1T+/v7++vj189zU0tDJxsGzsrKSfv34+Pf27dDOysG9t6+n/vv6+vr59uzr1tG+tZ6Qg9Ym3QAABR5JREFUSMeNlVVUG1EQhpcuxEspXqS0SKEtxQp1d3d332STTRpIQhIISQgJhODu7lAoDoUCpe7u7u7+1puGpqnCPOyZvffbOXPm/PsP9JfQgyCC+tmTABTOcbxDz/heENS7/1F+9nhvkHePG0wNDLbGWwdXL+rbLWvpmZHXD8+gMfBjTh+aSe6Gnn7lwQIOTR0c8wfX3PWgv7avbdKwf/ZoBp1Gp/PvuvXW3vw5ib7emnTW4OR+3D4jB9vjNJ/7gNvfWWeH/TO/JyYrsiKCRjVEZA3UB+96kON+DxOQ/NLE8PE5iUYgIXjFnCOlxEQMaSGVxjg4gxOnEycGz8bptuNjVx08LscIgrzH3umcn+KKtiBIyvzOO2O99aAdR8cF19oZalnCtvREUw79tCd5sow1g1UKM6kXqUx4T8wsi3sTjJ3yzDmmhenLXLpo8u45eG5y4Vvbk6kkC4LLtJMowkSQxmk4ggVJEG+7c6QpHT8vvW9X7/o7+3ELmiJi2mEzZJiz8cT6TBlanBk70cB5GGIGC1gRDdZ00yADLW1FL6gqhtvNXNG5S9gdSrk4M1qu7JAsmYshzDS4peoMrU/gT7qQdqYGZaYhxZmVbGJAm/CS/HloWyhRUlknQ9KYcExTwS80d3VNOxUZJpITYyspl0LbhArhpZCD9cRWEQuhYkNGMHToQ/2Cs6swJlb39CsllxdXX6IUKh/H5jbnSsPKjgmoaFQ1f8wRLR0UnGE/RcDEjj2jXG1WVTwUs8+zxfcrVO+vSsuOpVKxCfYZiQ0/aPKuxQbQ8lIz+DClxC8u+snlcJ7Yr1z1JPqUH0V+GDXbOwAib931Y4Imaq0NTIXPXY+N5L18GJ37SVWu+hwXff8l72Ds9XuwYIBaXPq6Shm4l+Vl/5QiOlV+uTk6YR9PxKsI9xNJny31ygK1e+nIRC1N97EGkFPI+jCpiHe5PCEy7oWqWSwRrpOvhFzcbTWMbm3ZJAOn1rUKpYIt/lDhW/5RHHteeWFN60qo98YJuoq1nK3uW5AabyspC1BcIEpOhft+SZAShYoLSvnmSfnYADUERP5jJn2h5XtsgCRuhYQqAvwTwn33+YWEKUI72HX5AtfSAZDe8F2DtPPm77afhl0EkthzuCQU0BWApgQIH9+KB0JhopMM7bJrdTRoleM2JAVNMyPF+wdoaz+XJpGoVAQ7WXUkcV7gT3oUZyi/ISIJAVKhgNp+4b4veCFhYVJw4locdSjZCp9cPUhLF9EZ3KKzURepMEtCDPP3VcWFx4UIiZIklIpFNfHpdEafIF2aRmOcrUmjohbT2WUllbmRvgfbythbQO3222fpDJoufaQPncYYuqoGtUEsCJZL6/3PR5b4syeSjZMQG/T2maGANlXT2v8S4AULWaUkCxfLyW8iW4kdka+nEMjxpL2NCwsYNBp+Q61PF43zyDg9Bm9+3NNySn78jMZUUkumqE4Gp7JmFOdP1vc8PpRrzj9+wPinCy8K1PiJ4aYbnTYpCCbDkBSbzhu2QJ1Gd82t8jI8TH51+OzvXoWbnXUOBkNW+0mWFwGcGOUVpU81/n3TOHb5oMt2FgYGjzau0Nif0Ss7Q3XB33hjjQHjHA5E5aOyIQc8CBrLdQSs3j92VG+3nNEjbkbdbBr9zm04ruvw37vh0QKOdeGIkckc80fX3KH/h7PT4BOjgCty8VZ5ux1MoO5Cf5naca2LAsEgehI+drX8o/0Nu+W0m6K/I9gGPd/dfx/EN/wN62AhsBWuAAAAAElFTkSuQmCC
|
||||
">
|
||||
<img alt="SDPA" src="https://img.shields.io/badge/SDPA-DE3412?style=flat&logo=pytorch&logoColor=white">
|
||||
</div>
|
||||
</div>
|
||||
|
||||
# BERT

[BERT](https://huggingface.co/papers/1810.04805) is a bidirectional transformer pretrained on unlabeled text to predict masked tokens in a sentence and to predict whether one sentence follows another. The main idea is that by randomly masking some tokens during pretraining, the model can use the left and right context to predict them, which gives it a more thorough understanding of language. BERT is also very versatile: the language representations it learns can be fine-tuned with additional layers or heads to adapt it to other downstream NLP tasks.

You can find all of the original BERT checkpoints under the [BERT](https://huggingface.co/collections/google/bert-release-64ff5e7a4be99045d1896dbc) collection.

> [!TIP]
> Click on the BERT models in the right sidebar for more examples of applying BERT to different language tasks.

The examples below demonstrate how to predict the `[MASK]` token with [`Pipeline`], [`AutoModel`], and from the command line.

<hfoptions id="usage">
|
||||
<hfoption id="Pipeline">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import pipeline
|
||||
|
||||
pipeline = pipeline(
|
||||
task="fill-mask",
|
||||
model="google-bert/bert-base-uncased",
|
||||
torch_dtype=torch.float16,
|
||||
device=0
|
||||
)
|
||||
pipeline("Plants create [MASK] through a process known as photosynthesis.")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="AutoModel">
|
||||
|
||||
```py
|
||||
import torch
|
||||
from transformers import AutoModelForMaskedLM, AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
"google-bert/bert-base-uncased",
|
||||
)
|
||||
model = AutoModelForMaskedLM.from_pretrained(
|
||||
"google-bert/bert-base-uncased",
|
||||
torch_dtype=torch.float16,
|
||||
device_map="auto",
|
||||
attn_implementation="sdpa"
|
||||
)
|
||||
inputs = tokenizer("Plants create [MASK] through a process known as photosynthesis.", return_tensors="pt").to("cuda")
|
||||
|
||||
with torch.no_grad():
|
||||
outputs = model(**inputs)
|
||||
predictions = outputs.logits
|
||||
|
||||
masked_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
|
||||
predicted_token_id = predictions[0, masked_index].argmax(dim=-1)
|
||||
predicted_token = tokenizer.decode(predicted_token_id)
|
||||
|
||||
print(f"The predicted token is: {predicted_token}")
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="transformers-cli">
|
||||
|
||||
```bash
|
||||
echo -e "Plants create [MASK] through a process known as photosynthesis." | transformers-cli run --task fill-mask --model google-bert/bert-base-uncased --device 0
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Notes

- Inputs should be padded on the right because BERT uses absolute position embeddings.
||||
## BertConfig
|
||||
|
||||
[[autodoc]] BertConfig
|
||||
- all
|
||||
|
||||
## BertTokenizer
|
||||
|
||||
[[autodoc]] BertTokenizer
|
||||
- build_inputs_with_special_tokens
|
||||
- get_special_tokens_mask
|
||||
- create_token_type_ids_from_sequences
|
||||
- save_vocabulary
|
||||
|
||||
## BertTokenizerFast
|
||||
|
||||
[[autodoc]] BertTokenizerFast
|
||||
|
||||
## BertModel
|
||||
|
||||
[[autodoc]] BertModel
|
||||
- forward
|
||||
|
||||
## BertForPreTraining
|
||||
|
||||
[[autodoc]] BertForPreTraining
|
||||
- forward
|
||||
|
||||
## BertLMHeadModel
|
||||
|
||||
[[autodoc]] BertLMHeadModel
|
||||
- forward
|
||||
|
||||
## BertForMaskedLM
|
||||
|
||||
[[autodoc]] BertForMaskedLM
|
||||
- forward
|
||||
|
||||
## BertForNextSentencePrediction
|
||||
|
||||
[[autodoc]] BertForNextSentencePrediction
|
||||
- forward
|
||||
|
||||
## BertForSequenceClassification
|
||||
|
||||
[[autodoc]] BertForSequenceClassification
|
||||
- forward
|
||||
|
||||
## BertForMultipleChoice
|
||||
|
||||
[[autodoc]] BertForMultipleChoice
|
||||
- forward
|
||||
|
||||
## BertForTokenClassification
|
||||
|
||||
[[autodoc]] BertForTokenClassification
|
||||
- forward
|
||||
|
||||
## BertForQuestionAnswering
|
||||
|
||||
[[autodoc]] BertForQuestionAnswering
|
||||
- forward
|
||||
|
||||
## TFBertTokenizer
|
||||
|
||||
[[autodoc]] TFBertTokenizer
|
||||
|
||||
## TFBertModel
|
||||
|
||||
[[autodoc]] TFBertModel
|
||||
- call
|
||||
|
||||
## TFBertForPreTraining
|
||||
|
||||
[[autodoc]] TFBertForPreTraining
|
||||
- call
|
||||
|
||||
## TFBertLMHeadModel
|
||||
|
||||
[[autodoc]] TFBertLMHeadModel
|
||||
- call
|
||||
|
||||
## TFBertForMaskedLM
|
||||
|
||||
[[autodoc]] TFBertForMaskedLM
|
||||
- call
|
||||
|
||||
## TFBertForNextSentencePrediction
|
||||
|
||||
[[autodoc]] TFBertForNextSentencePrediction
|
||||
- call
|
||||
|
||||
## TFBertForSequenceClassification
|
||||
|
||||
[[autodoc]] TFBertForSequenceClassification
|
||||
- call
|
||||
|
||||
## TFBertForMultipleChoice
|
||||
|
||||
[[autodoc]] TFBertForMultipleChoice
|
||||
- call
|
||||
|
||||
## TFBertForTokenClassification
|
||||
|
||||
[[autodoc]] TFBertForTokenClassification
|
||||
- call
|
||||
|
||||
## TFBertForQuestionAnswering
|
||||
|
||||
[[autodoc]] TFBertForQuestionAnswering
|
||||
- call
|
||||
|
||||
## FlaxBertModel
|
||||
|
||||
[[autodoc]] FlaxBertModel
|
||||
- __call__
|
||||
|
||||
## FlaxBertForPreTraining
|
||||
|
||||
[[autodoc]] FlaxBertForPreTraining
|
||||
- __call__
|
||||
|
||||
## FlaxBertForCausalLM
|
||||
|
||||
[[autodoc]] FlaxBertForCausalLM
|
||||
- __call__
|
||||
|
||||
## FlaxBertForMaskedLM
|
||||
|
||||
[[autodoc]] FlaxBertForMaskedLM
|
||||
- __call__
|
||||
|
||||
## FlaxBertForNextSentencePrediction
|
||||
|
||||
[[autodoc]] FlaxBertForNextSentencePrediction
|
||||
- __call__
|
||||
|
||||
## FlaxBertForSequenceClassification
|
||||
|
||||
[[autodoc]] FlaxBertForSequenceClassification
|
||||
- __call__
|
||||
|
||||
## FlaxBertForMultipleChoice
|
||||
|
||||
[[autodoc]] FlaxBertForMultipleChoice
|
||||
- __call__
|
||||
|
||||
## FlaxBertForTokenClassification
|
||||
|
||||
[[autodoc]] FlaxBertForTokenClassification
|
||||
- __call__
|
||||
|
||||
## FlaxBertForQuestionAnswering
|
||||
|
||||
[[autodoc]] FlaxBertForQuestionAnswering
|
||||
- __call__
|
||||
|
||||
## Bert specific outputs
|
||||
|
||||
[[autodoc]] models.bert.modeling_bert.BertForPreTrainingOutput
|
||||
|
||||
[[autodoc]] models.bert.modeling_tf_bert.TFBertForPreTrainingOutput
|
||||
|
||||
[[autodoc]] models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput
|
examples/3D_parallel.py (new file, 422 lines)
@ -0,0 +1,422 @@
""":
|
||||
This script is used to test training a model using Tensor Parallelism and Data Parallelism.
|
||||
|
||||
Usage:
|
||||
export CUDA_VISIBLE_DEVICES=0,1,2,3
|
||||
export CUDA_VISIBLE_DEVICES=4,5,6,7
|
||||
export CUDA_VISIBLE_DEVICES=5,6,7
|
||||
TP_SIZE=2 DP_SIZE=2 torchrun --nproc_per_node=4 --rdzv_endpoint=localhost:29503 examples/3D_parallel.py
|
||||
CP_SIZE=2 DP_SIZE=2 torchrun --nproc_per_node=4 examples/3D_parallel.py
|
||||
CP_SIZE=2 TP_SIZE=2 torchrun --nproc_per_node=4 examples/3D_parallel.py
|
||||
DP_SIZE=2 CP_SIZE=2 TP_SIZE=2 torchrun --nproc_per_node=8 examples/3D_parallel.py
|
||||
|
||||
TP_SIZE=1 CP_SIZE=4 torchrun --nproc_per_node=4 examples/3D_parallel.py
|
||||
TP_SIZE=1 DP_SIZE=4 torchrun --nproc_per_node=4 examples/3D_parallel.py
|
||||
TP_SIZE=4 DP_SIZE=1 torchrun --nproc_per_node=4 --rdzv_endpoint=localhost:29503 examples/3D_parallel.py
|
||||
IGNORE_SANITY=1 CP_SIZE=1 TP_SIZE=1 DP_SIZE=1 torchrun --nproc_per_node=1 --rdzv_endpoint=localhost:29504 examples/3D_parallel.py
|
||||
ocalhost:29504 test_train.py
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from contextlib import nullcontext
|
||||
from typing import Iterable
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.distributed.checkpoint as dcp
|
||||
import torch.optim as optim
|
||||
import wandb
|
||||
from datasets import load_dataset
|
||||
from torch.distributed.checkpoint.state_dict import get_state_dict, set_state_dict
|
||||
from torch.distributed.checkpoint.stateful import Stateful
|
||||
from torch.distributed.device_mesh import DeviceMesh
|
||||
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
|
||||
from torch.distributed.fsdp import ShardingStrategy
|
||||
from torch.distributed.tensor import DTensor
|
||||
from torch.distributed.tensor.experimental import context_parallel
|
||||
from torch.nn.attention import SDPBackend, sdpa_kernel
|
||||
from torch.utils.data import DataLoader
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
|
||||
# torch.use_deterministic_algorithms(True)
|
||||
torch.backends.cudnn.deterministic = True
|
||||
|
||||
# Set up logging
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO,
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# from torch.distributed.tensor.experimental._attention import set_rotate_method
|
||||
|
||||
# set_rotate_method("alltoall") # CP rotate shards using all-to-all
|
||||
|
||||
|
||||
def main():
|
||||
tp_size = int(os.environ.get("TP_SIZE", 1))
|
||||
dp_size = int(os.environ.get("DP_SIZE", 1))
|
||||
cp_size = int(os.environ.get("CP_SIZE", 1)) # Add CP size configuration
|
||||
sdpa_backend = SDPBackend.FLASH_ATTENTION # For CP
|
||||
# sdpa_backend = SDPBackend.MATH # For CP
|
||||
global_batch_size = 8 # Desired global batch size
|
||||
seq_len = 1024 # Sequence length
|
||||
num_train_steps = 10000 # Number of training steps
|
||||
LR = 1e-5
|
||||
model_name = "HuggingFaceTB/SmolLM2-1.7B"
|
||||
# model_name = "unsloth/Llama-3.2-1B"
|
||||
|
||||
CHECKPOINT_DIR = f"checkpoint_tp{tp_size}_dp{dp_size}_cp{cp_size}"
|
||||
|
||||
# Initialize distributed environment
|
||||
if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
|
||||
dist.init_process_group("nccl")
|
||||
rank = dist.get_rank()
|
||||
world_size = dist.get_world_size()
|
||||
local_rank = int(os.environ["LOCAL_RANK"])
|
||||
torch.cuda.set_device(local_rank)
|
||||
|
||||
assert world_size == tp_size * dp_size * cp_size, (
|
||||
f"World size ({world_size}) must equal TP size ({tp_size}) * DP size ({dp_size}) * CP size ({cp_size})"
|
||||
)
|
||||
|
||||
mesh = torch.arange(world_size).reshape(dp_size, tp_size, cp_size)
|
||||
world_mesh = DeviceMesh(device_type="cuda", mesh=mesh, mesh_dim_names=("dp", "tp", "cp"))
|
||||
tp_mesh = world_mesh["tp"]
|
||||
dp_mesh = world_mesh["dp"]
|
||||
cp_mesh = world_mesh["cp"]
|
||||
world_mesh["dp", "cp"]._flatten(mesh_dim_name="dp_cp")
|
||||
logger.info(f"Created DeviceMesh: {world_mesh}")
|
||||
logger.info(
|
||||
f"Distributed setup - Rank: {rank}, World size: {world_size}, Local rank: {local_rank}, DP: {dp_mesh.get_local_rank()}, TP: {tp_mesh.get_local_rank()}, CP: {cp_mesh.get_local_rank()}"
|
||||
)
|
||||
|
||||
if dist.get_rank() == 0:
|
||||
wandb.init(
|
||||
project="tp_dp_test",
|
||||
config={
|
||||
"tp_size": tp_size,
|
||||
"dp_size": dp_size,
|
||||
"cp_size": cp_size,
|
||||
"global_batch_size": global_batch_size,
|
||||
"model_name": model_name,
|
||||
"dataset": "roneneldan/TinyStories-1M",
|
||||
"seq_len": seq_len,
|
||||
"lr": LR,
|
||||
"weight_decay": 0.1,
|
||||
},
|
||||
name=f"llama_tp{tp_size}_dp{dp_size}_cp{cp_size}"
|
||||
if model_name == "unsloth/Llama-3.2-1B"
|
||||
else f"tp{tp_size}_dp{dp_size}_cp{cp_size}",
|
||||
)
|
||||
logger.info("Wandb initialized.")
|
||||
# Log the current file to wandb
|
||||
wandb.save("test_train.py")
|
||||
|
||||
# Load model and tokenizer
|
||||
logger.info(f"Loading model and tokenizer from {model_name}")
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
if tokenizer.pad_token is None:
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
logger.info(f"Set pad_token to eos_token: {tokenizer.pad_token}")
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_name,
|
||||
device_mesh=tp_mesh if dist.is_initialized() else None,
|
||||
tp_plan="auto",
|
||||
torch_dtype=torch.bfloat16,
|
||||
)
|
||||
logger.info(f"Model loaded onto device mesh: {tp_mesh}")
|
||||
device = torch.device(f"cuda:{local_rank}")
|
||||
logger.info(f"Using device: {device} for non-model tensors")
|
||||
use_ddp = False
|
||||
if dist.is_initialized() and dp_mesh.size() > 1:
|
||||
model = FSDP(model, device_mesh=dp_mesh, sharding_strategy=ShardingStrategy.NO_SHARD)
|
||||
use_ddp = True
|
||||
pass
|
||||
|
||||
model.train()
|
||||
|
||||
logger.info("Loading TinyStories dataset...")
|
||||
raw_dataset = load_dataset("roneneldan/TinyStories", split="train[:1%]") # Use 1% for faster testing
|
||||
|
||||
def tokenize_function(examples):
|
||||
# Tokenize the text without padding
|
||||
tokenized_batch = tokenizer(
|
||||
examples["text"], padding=False, truncation=True, max_length=seq_len, return_tensors=None
|
||||
)
|
||||
# Set labels to be the same as input_ids for Causal LM
|
||||
tokenized_batch["labels"] = tokenized_batch["input_ids"].copy()
|
||||
return tokenized_batch
|
||||
|
||||
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
|
||||
logger.info(f"Dataset loaded and tokenized. Size: {len(tokenized_dataset)}")
|
||||
|
||||
# Create packed sequences
|
||||
def create_packed_sequences(examples):
|
||||
# Flatten all sequences
|
||||
all_tokens = []
|
||||
for input_ids in examples["input_ids"]:
|
||||
all_tokens.extend(input_ids)
|
||||
|
||||
# Split into sequences of seq_len + 1 (for input + label)
|
||||
num_sequences = len(all_tokens) // (seq_len + 1)
|
||||
packed_input_ids = []
|
||||
packed_labels = []
|
||||
|
||||
for i in range(num_sequences):
|
||||
start_idx = i * (seq_len + 1)
|
||||
end_idx = start_idx + (seq_len + 1)
|
||||
# Get the full sequence
|
||||
full_sequence = all_tokens[start_idx:end_idx]
|
||||
# For input_ids, remove the last token
|
||||
packed_input_ids.append(full_sequence[:-1])
|
||||
# For labels, remove the first token
|
||||
packed_labels.append(full_sequence[1:])
|
||||
|
||||
return {"input_ids": packed_input_ids, "labels": packed_labels}
|
||||
|
||||
# Apply packing to the dataset
|
||||
packed_dataset = tokenized_dataset.map(
|
||||
create_packed_sequences,
|
||||
batched=True,
|
||||
remove_columns=tokenized_dataset.column_names,
|
||||
batch_size=1000, # Process in batches for efficiency
|
||||
num_proc=60,
|
||||
)
|
||||
logger.info(f"Dataset packed. New size: {len(packed_dataset)}")
|
||||
|
||||
# Shuffle the packed dataset
|
||||
packed_dataset = packed_dataset.shuffle(seed=42)
|
||||
logger.info("Packed dataset shuffled")
|
||||
|
||||
# Calculate local batch size
|
||||
if dist.is_initialized():
|
||||
assert global_batch_size % dp_mesh.size() == 0, (
|
||||
f"Global batch size ({global_batch_size}) must be divisible by DP size ({dp_mesh.size()})"
|
||||
)
|
||||
local_batch_size = global_batch_size // dp_mesh.size()
|
||||
else:
|
||||
local_batch_size = global_batch_size
|
||||
|
||||
logger.info(
|
||||
f"Global batch size: {global_batch_size}, DP size: {dp_size if dist.is_initialized() else 1}, Local batch size: {local_batch_size}"
|
||||
)
|
||||
|
||||
# Simple collate function since sequences are already packed
|
||||
def collate_fn(batch):
|
||||
input_ids = torch.tensor([item["input_ids"] for item in batch], dtype=torch.long)
|
||||
labels = torch.tensor([item["labels"] for item in batch], dtype=torch.long)
|
||||
return {"input_ids": input_ids, "labels": labels}
|
||||
|
||||
if dist.is_initialized():
|
||||
sampler = DistributedSampler(
|
||||
packed_dataset, num_replicas=dp_mesh.size(), rank=dp_mesh.get_local_rank(), shuffle=False
|
||||
)
|
||||
else:
|
||||
sampler = None
|
||||
|
||||
dataloader = DataLoader(
|
||||
packed_dataset,
|
||||
batch_size=local_batch_size,
|
||||
sampler=sampler,
|
||||
shuffle=False,
|
||||
collate_fn=collate_fn,
|
||||
pin_memory=True,
|
||||
)
|
||||
logger.info(f"DataLoader created. Distributed: {dist.is_initialized()}")
|
||||
|
||||
optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=0.1)
|
||||
|
||||
# Training loop
|
||||
logger.info(f"Starting training for {num_train_steps} steps...")
|
||||
model.train()
|
||||
step = 0
|
||||
while step < num_train_steps:
|
||||
for batch in dataloader:
|
||||
if step >= num_train_steps:
|
||||
break # Exit loop if max steps reached
|
||||
|
||||
# Move batch to appropriate device
|
||||
batch = {k: v.to(device) for k, v in batch.items()}
|
||||
optimizer.zero_grad()
|
||||
|
||||
# Add position_ids to batch before CP sharding
|
||||
batch_size = batch["input_ids"].shape[0]
|
||||
position_ids = torch.arange(0, seq_len, dtype=torch.long, device=device)
|
||||
position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
|
||||
batch["position_ids"] = position_ids
|
||||
from torch.distributed.tensor.experimental._attention import _cp_options
|
||||
|
||||
_cp_options.enable_load_balance = False
|
||||
|
||||
with sdpa_kernel(sdpa_backend): # TODO: ideally move this to attention implementation
|
||||
cp_context = (
|
||||
nullcontext()
|
||||
if cp_mesh.size() == 1
|
||||
else context_parallel(
|
||||
cp_mesh,
|
||||
buffers=[
|
||||
batch["input_ids"],
|
||||
batch["labels"],
|
||||
batch["position_ids"],
|
||||
],
|
||||
buffer_seq_dims=[1, 1, 1],
|
||||
)
|
||||
)
|
||||
with cp_context:
|
||||
# Pop labels from batch before model forward pass
|
||||
labels = batch.pop("labels")
|
||||
outputs = model(**batch) # [mbs, seq_len/cp]
|
||||
loss = outputs.loss
|
||||
logits = outputs.logits
|
||||
|
||||
# Compute loss with shifted labels
|
||||
loss = model.loss_function(
|
||||
logits=logits, labels=None, shift_labels=labels, vocab_size=model.config.vocab_size
|
||||
)
|
||||
loss.backward()
|
||||
|
||||
# all reduce grads across dp_cp if applicable
|
||||
all_reduce_grads(model, world_mesh, use_ddp=use_ddp)
|
||||
|
||||
if hasattr(model, "clip_grad_norm_"):
|
||||
gradnorm = model.clip_grad_norm_(max_norm=1.0, norm_type=2.0) # TODO: fix reported gradnorm
|
||||
else:
|
||||
# only works with FSDP's NO_SHARD otherwise we should use FSDP's clip_grad_norm_
|
||||
assert len(list(model.parameters())) > 5, "No parameters found in model. Probably DDP bug.."
|
||||
gradnorm = clip_grad_norm_(model.parameters(), max_norm=1.0, norm_type=2.0, foreach=True)
|
||||
|
||||
optimizer.step()
|
||||
# allreduce loss across cp_dp before logging
|
||||
if dist.is_initialized() and (cp_mesh.size() > 1 or dp_mesh.size() > 1):
|
||||
dist.all_reduce(loss, group=world_mesh["dp_cp"].get_group(), op=dist.ReduceOp.AVG)
|
||||
current_loss = loss.item()
|
||||
|
||||
# Log loss and gradnorm to wandb (only on rank 0 of dp group)
|
||||
if not dist.is_initialized() or dist.get_rank() == 0:
|
||||
logger.info(
|
||||
f"Step: {step} | GBS: {global_batch_size} | DP: {dp_mesh.size()} | TP: {tp_mesh.size()} | CP: {cp_mesh.size()} | Loss: {current_loss} | Gradnorm: {gradnorm} | lr: {LR}"
|
||||
)
|
||||
wandb.log(
|
||||
{
|
||||
"train/loss": current_loss,
|
||||
"train/gradnorm": gradnorm,
|
||||
"step": step,
|
||||
"lr": LR,
|
||||
"GBS": global_batch_size,
|
||||
}
|
||||
)
|
||||
|
||||
step += 1 # Increment step count
|
||||
|
||||
logger.info("Training loop finished.")
|
||||
|
||||
# Save model using DCP (only if distributed)
|
||||
if dist.is_initialized():
|
||||
state_dict = {"app": AppState(model, optimizer)}
|
||||
dcp.save(
|
||||
state_dict=state_dict,
|
||||
checkpoint_id=CHECKPOINT_DIR,
|
||||
)
|
||||
logger.info(f"Saved checkpoint to {CHECKPOINT_DIR}")
|
||||
else:
|
||||
# Fallback to regular save for non-distributed case
|
||||
save_dir = "test_model_nondist"
|
||||
model.save_pretrained(save_dir, safe_serialization=False)
|
||||
tokenizer.save_pretrained(save_dir) # Save tokenizer too
|
||||
logger.info(f"Saved model to {save_dir}")
|
||||
|
||||
dist.destroy_process_group()
|
||||
logger.info("Cleaned up distributed process group")
|
||||
# Finish wandb run on rank 0
|
||||
if dist.get_rank() == 0:
|
||||
wandb.finish()
|
||||
logger.info("Wandb run finished.")
|
||||
|
||||
|
||||
def all_reduce_grads(model, world_mesh, use_ddp):
|
||||
"""All reduce gradients across dp_cp if applicable."""
|
||||
cp_mesh = world_mesh["cp"]
|
||||
if use_ddp:
|
||||
# DDP/FSDP takes care of syncing grads
|
||||
mesh = cp_mesh
|
||||
else:
|
||||
mesh = world_mesh["dp", "cp"]._flatten(mesh_dim_name="dp_cp")
|
||||
if dist.is_initialized() and mesh.size() > 1:
|
||||
for name, param in model.named_parameters():
|
||||
if param.grad is not None:
|
||||
# Workaround for cross-mesh communication limitation with DTensor gradients
|
||||
if isinstance(param.grad, DTensor):
|
||||
local_grad = param.grad.to_local()
|
||||
# Ensure grad requires grad for inplace modification checks (might not be needed)
|
||||
# local_grad = local_grad.detach().requires_grad_(True)
|
||||
torch.distributed.all_reduce(local_grad, op=torch.distributed.ReduceOp.SUM, group=mesh.get_group())
|
||||
local_grad = local_grad / mesh.size()
|
||||
# Assign averaged grad back - need careful handling if DTensor structure is complex
|
||||
# This simple assignment might work if the grad structure matches param structure
|
||||
param.grad = DTensor.from_local(
|
||||
local_grad, device_mesh=param.grad.device_mesh, placements=param.grad.placements
|
||||
)
|
||||
else:
|
||||
# Handle regular tensors if any exist (e.g. buffers not converted to DTensor)
|
||||
torch.distributed.all_reduce(param.grad, op=torch.distributed.ReduceOp.AVG, group=mesh.get_group())
|
||||
|
||||
|
||||
class AppState(Stateful):
|
||||
"""Wrapper for checkpointing the Application State including model and optimizer."""
|
||||
|
||||
def __init__(self, model, optimizer=None):
|
||||
self.model = model
|
||||
self.optimizer = optimizer
|
||||
|
||||
def state_dict(self):
|
||||
model_state_dict, optimizer_state_dict = get_state_dict(self.model, self.optimizer)
|
||||
return {"model": model_state_dict, "optim": optimizer_state_dict}
|
||||
|
||||
def load_state_dict(self, state_dict):
|
||||
set_state_dict(
|
||||
self.model, self.optimizer, model_state_dict=state_dict["model"], optim_state_dict=state_dict["optim"]
|
||||
)
|
||||
|
||||
|
||||
def clip_grad_norm_(
|
||||
parameters: Iterable[torch.Tensor],
|
||||
max_norm: float,
|
||||
norm_type: float = 2.0,
|
||||
error_if_nonfinite: bool = False,
|
||||
foreach: bool | None = None,
|
||||
) -> torch.Tensor:
|
||||
"""
|
||||
Clip the gradient norm of an iterable of parameters.
|
||||
"""
|
||||
# Filter out parameters with no gradients
|
||||
parameters = [p for p in parameters if p.grad is not None]
|
||||
assert len(parameters) > 0, "No parameters with gradients found"
|
||||
|
||||
# Calculate total norm
|
||||
if norm_type == float("inf"):
|
||||
total_norm = max(p.grad.detach().abs().max() for p in parameters)
|
||||
else:
|
||||
total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type) for p in parameters]), norm_type)
|
||||
|
||||
# Convert DTensor to local tensor if needed
|
||||
if isinstance(total_norm, DTensor):
|
||||
total_norm = total_norm.full_tensor()
|
||||
|
||||
# Clip gradients
|
||||
clip_coef = max_norm / (total_norm + 1e-6)
|
||||
if clip_coef < 1:
|
||||
for p in parameters:
|
||||
p.grad.detach().mul_(clip_coef)
|
||||
|
||||
return total_norm
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
@ -60,7 +60,7 @@ from transformers.utils import check_min_version, send_example_telemetry

logger = logging.getLogger(__name__)

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

Array = Any
Dataset = datasets.arrow_dataset.Dataset

@ -59,7 +59,7 @@ from transformers.utils.versions import require_version


# Will error if the minimal version of Transformers is not installed. Remove at your own risk.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

require_version("datasets>=2.14.0", "To fix: pip install -r examples/flax/speech-recognition/requirements.txt")

@ -55,7 +55,7 @@ from transformers.utils import check_min_version, send_example_telemetry

logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

Array = Any
Dataset = datasets.arrow_dataset.Dataset

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version

logger = logging.getLogger(__name__)
# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")
examples/pytorch/3d_parallel_checks.py (new file, 780 lines)
@ -0,0 +1,780 @@
""":
|
||||
This script is used to test training a model using Tensor Parallelism and Data Parallelism.
|
||||
|
||||
Usage:
|
||||
export CUDA_VISIBLE_DEVICES=0,1,2,3
|
||||
export CUDA_VISIBLE_DEVICES=4,5,6,7
|
||||
export CUDA_VISIBLE_DEVICES=5,6,7
|
||||
TP_SIZE=2 DP_SIZE=2 torchrun --nproc_per_node=4 --rdzv_endpoint=localhost:29503 test_train.py
|
||||
CP_SIZE=2 DP_SIZE=2 torchrun --nproc_per_node=4 test_train.py
|
||||
CP_SIZE=2 TP_SIZE=2 torchrun --nproc_per_node=4 test_train.py
|
||||
|
||||
TP_SIZE=1 CP_SIZE=4 torchrun --nproc_per_node=4 test_train.py
|
||||
TP_SIZE=1 DP_SIZE=4 torchrun --nproc_per_node=4 test_train.py
|
||||
TP_SIZE=4 DP_SIZE=1 torchrun --nproc_per_node=4 --rdzv_endpoint=localhost:29503 test_train.py
|
||||
IGNORE_SANITY=1 CP_SIZE=1 TP_SIZE=1 DP_SIZE=1 torchrun --nproc_per_node=1 --rdzv_endpoint=l
|
||||
ocalhost:29504 test_train.py
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
from contextlib import nullcontext
|
||||
from typing import Dict, Iterable, Optional
|
||||
|
||||
import torch
|
||||
import torch.distributed as dist
|
||||
import torch.distributed.checkpoint as dcp
|
||||
import torch.nn as nn
|
||||
import torch.optim as optim
|
||||
import wandb
|
||||
from datasets import load_dataset
|
||||
from torch.distributed.checkpoint.state_dict import get_state_dict, set_state_dict
|
||||
from torch.distributed.checkpoint.stateful import Stateful
|
||||
from torch.distributed.device_mesh import DeviceMesh
|
||||
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
|
||||
from torch.distributed.fsdp import ShardingStrategy
|
||||
from torch.distributed.tensor import DTensor
|
||||
from torch.distributed.tensor.experimental import context_parallel
|
||||
from torch.nn.attention import SDPBackend, sdpa_kernel
|
||||
from torch.utils.data import DataLoader, default_collate
|
||||
from torch.utils.data.distributed import DistributedSampler
|
||||
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
|
||||
ignore_sanity_checks = int(os.environ.get("IGNORE_SANITY", 0)) == 1
|
||||
# torch.use_deterministic_algorithms(True)
|
||||
torch.backends.cudnn.deterministic = True
|
||||
|
||||
# Set up logging
|
||||
logging.basicConfig(
|
||||
format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
|
||||
datefmt="%m/%d/%Y %H:%M:%S",
|
||||
level=logging.INFO,
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# from torch.distributed.tensor.experimental._attention import set_rotate_method
|
||||
|
||||
# set_rotate_method("alltoall") # rotate shards using all-to-all
|
||||
|
||||
|
||||
def main():
|
||||
tp_size = int(os.environ.get("TP_SIZE", 1))
|
||||
dp_size = int(os.environ.get("DP_SIZE", 4))
|
||||
cp_size = int(os.environ.get("CP_SIZE", 1)) # Add CP size configuration
|
||||
sdpa_backend = SDPBackend.FLASH_ATTENTION # For CP
|
||||
# sdpa_backend = SDPBackend.MATH # For CP
|
||||
global_batch_size = 8 # Desired global batch size
|
||||
seq_len = 1024 # Sequence length
|
||||
num_train_steps = 10000 # Number of training steps
|
||||
LR = 1e-5
|
||||
model_name = "HuggingFaceTB/SmolLM2-1.7B"
|
||||
# model_name = "unsloth/Llama-3.2-1B"
|
||||
|
||||
CHECKPOINT_DIR = f"checkpoint_tp{tp_size}_dp{dp_size}_cp{cp_size}"
|
||||
|
||||
# Initialize distributed environment
|
||||
if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
|
||||
dist.init_process_group("nccl")
|
||||
rank = dist.get_rank()
|
||||
world_size = dist.get_world_size()
|
||||
local_rank = int(os.environ["LOCAL_RANK"])
|
||||
torch.cuda.set_device(local_rank)
|
||||
|
||||
assert world_size == tp_size * dp_size * cp_size, (
|
||||
f"World size ({world_size}) must equal TP size ({tp_size}) * DP size ({dp_size}) * CP size ({cp_size})"
|
||||
)
|
||||
|
||||
mesh = torch.arange(world_size).reshape(dp_size, tp_size, cp_size)
|
||||
world_mesh = DeviceMesh(device_type="cuda", mesh=mesh, mesh_dim_names=("dp", "tp", "cp"))
|
||||
tp_mesh = world_mesh["tp"]
|
||||
dp_mesh = world_mesh["dp"]
|
||||
cp_mesh = world_mesh["cp"]
|
||||
world_mesh["dp", "cp"]._flatten(mesh_dim_name="dp_cp")
|
||||
logger.info(f"Created DeviceMesh: {world_mesh}")
|
||||
logger.info(
|
||||
f"Distributed setup - Rank: {rank}, World size: {world_size}, Local rank: {local_rank}, DP: {dp_mesh.get_local_rank()}, TP: {tp_mesh.get_local_rank()}, CP: {cp_mesh.get_local_rank()}"
|
||||
)
|
||||
|
||||
if dist.get_rank() == 0:
|
||||
wandb.init(
|
||||
project="tp_dp_test",
|
||||
config={
|
||||
"tp_size": tp_size,
|
||||
"dp_size": dp_size,
|
||||
"cp_size": cp_size,
|
||||
"global_batch_size": global_batch_size,
|
||||
"model_name": model_name,
|
||||
"dataset": "roneneldan/TinyStories-1M",
|
||||
"seq_len": seq_len,
|
||||
"lr": LR,
|
||||
"weight_decay": 0.1,
|
||||
},
|
||||
name=f"llama_tp{tp_size}_dp{dp_size}_cp{cp_size}"
|
||||
if model_name == "unsloth/Llama-3.2-1B"
|
||||
else f"tp{tp_size}_dp{dp_size}_cp{cp_size}",
|
||||
)
|
||||
logger.info(f"ignore_sanity_checks is set to: {ignore_sanity_checks}")
|
||||
logger.info("Wandb initialized.")
|
||||
# Log the current file to wandb
|
||||
wandb.save("test_train.py")
|
||||
|
||||
else:
|
||||
logger.info("Running in non-distributed mode. DeviceMesh not applicable.")
|
||||
rank = 0
|
||||
world_size = 1
|
||||
local_rank = 0
|
||||
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
wandb.init(
|
||||
project="tp_dp_test",
|
||||
config={
|
||||
"tp_size": 1,
|
||||
"dp_size": 1,
|
||||
"global_batch_size": global_batch_size,
|
||||
"model_name": model_name,
|
||||
"dataset": "roneneldan/TinyStories-1M",
|
||||
"seq_len": seq_len,
|
||||
},
|
||||
name="llama_tp1_dp1_nondist" if model_name == "unsloth/Llama-3.2-1B" else "tp1_dp1_nondist",
|
||||
)
|
||||
logger.info("Wandb initialized for non-distributed run.")
|
||||
|
||||
# Load model and tokenizer
|
||||
logger.info(f"Loading model and tokenizer from {model_name}")
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
if tokenizer.pad_token is None:
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
logger.info(f"Set pad_token to eos_token: {tokenizer.pad_token}")
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(
|
||||
model_name,
|
||||
device_mesh=tp_mesh if dist.is_initialized() else None,
|
||||
tp_plan="auto",
|
||||
torch_dtype=torch.bfloat16,
|
||||
)
|
||||
logger.info(f"Model loaded onto device mesh: {tp_mesh}")
|
||||
|
||||
if dist.is_initialized():
|
||||
assert model.config.num_key_value_heads % tp_mesh.size() == 0, (
|
||||
f"num_key_value_heads={model.config.num_key_value_heads} must be divisible by tp_size={tp_mesh.size()}"
|
||||
)
|
||||
device = torch.device(f"cuda:{local_rank}")
|
||||
else:
|
||||
model = model.to(device)
|
||||
|
||||
logger.info(f"Using device: {device} for non-model tensors")
|
||||
use_ddp = False
|
||||
if dist.is_initialized() and dp_mesh.size() > 1:
|
||||
# FSDP1
|
||||
model = FSDP(model, device_mesh=dp_mesh, sharding_strategy=ShardingStrategy.NO_SHARD)
|
||||
# FSDP2
|
||||
# for transformer_block in model.model.layers:
|
||||
# fully_shard(transformer_block, mesh=dp_mesh, reshard_after_forward=False)
|
||||
# fully_shard(model.model, mesh=dp_mesh, reshard_after_forward=False)
|
||||
# DDP
|
||||
# replicate(model, device_mesh=dp_mesh, bucket_cap_mb=100)
|
||||
# assert len(list(model.parameters()))>5, "No parameters found in model. Probably DDP/FSDP bug.." # TODO: we should be cautious abt using model.parameters()
|
||||
use_ddp = True
|
||||
|
||||
model.train()
|
||||
assert len(list(model.parameters())) > 0, "No parameters found in model. Probably DDP bug.."
|
||||
assert len([p for p in model.parameters() if p.requires_grad]) > 0, (
|
||||
"No gradients found in model. Probably DDP bug.."
|
||||
)
|
||||
|
||||
if dist.is_initialized() and not ignore_sanity_checks:
|
||||
# assert model is replicated across all dp
|
||||
for name, param in model.named_parameters():
|
||||
sanity_check_tensor_sync(param, dp_mesh)
|
||||
|
||||
# assert model is different across tp (only for sharded params)
|
||||
for name, param in model.named_parameters():
|
||||
if isinstance(param, DTensor) and param.placements[0].is_shard():
|
||||
# Only check sharded parameters for non-sync across TP
|
||||
sanity_check_tensor_sync(param, tp_mesh, not_sync=True)
|
||||
elif isinstance(param, DTensor) and param.placements[0].is_replicate():
|
||||
# Replicated parameters should be the same across TP
|
||||
sanity_check_tensor_sync(param, tp_mesh)
|
||||
|
||||
# assert model is replicated across cp
|
||||
for name, param in model.named_parameters():
|
||||
sanity_check_tensor_sync(param, cp_mesh)
|
||||
|
||||
# Load and preprocess TinyStories dataset
|
||||
logger.info("Loading TinyStories dataset...")
|
||||
raw_dataset = load_dataset("roneneldan/TinyStories", split="train[:1%]") # Use 1% for faster testing
|
||||
|
||||
def tokenize_function(examples):
|
||||
# Tokenize the text without padding
|
||||
tokenized_batch = tokenizer(
|
||||
examples["text"], padding=False, truncation=True, max_length=seq_len, return_tensors=None
|
||||
)
|
||||
# Set labels to be the same as input_ids for Causal LM
|
||||
tokenized_batch["labels"] = tokenized_batch["input_ids"].copy()
|
||||
return tokenized_batch
|
||||
|
||||
tokenized_dataset = raw_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
|
||||
logger.info(f"Dataset loaded and tokenized. Size: {len(tokenized_dataset)}")
|
||||
|
||||
# Create packed sequences
|
||||
def create_packed_sequences(examples):
|
||||
# Flatten all sequences
|
||||
all_tokens = []
|
||||
for input_ids in examples["input_ids"]:
|
||||
all_tokens.extend(input_ids)
|
||||
|
||||
# Split into sequences of seq_len + 1 (for input + label)
|
||||
num_sequences = len(all_tokens) // (seq_len + 1)
|
||||
packed_input_ids = []
|
||||
packed_labels = []
|
||||
|
||||
for i in range(num_sequences):
|
||||
start_idx = i * (seq_len + 1)
|
||||
end_idx = start_idx + (seq_len + 1)
|
||||
# Get the full sequence
|
||||
full_sequence = all_tokens[start_idx:end_idx]
|
||||
# For input_ids, remove the last token
|
||||
packed_input_ids.append(full_sequence[:-1])
|
||||
# For labels, remove the first token
|
||||
packed_labels.append(full_sequence[1:])
|
||||
|
||||
return {"input_ids": packed_input_ids, "labels": packed_labels}
|
||||
|
||||
# Apply packing to the dataset
|
||||
packed_dataset = tokenized_dataset.map(
|
||||
create_packed_sequences,
|
||||
batched=True,
|
||||
remove_columns=tokenized_dataset.column_names,
|
||||
batch_size=1000, # Process in batches for efficiency
|
||||
num_proc=60,
|
||||
)
|
||||
logger.info(f"Dataset packed. New size: {len(packed_dataset)}")
|
||||
|
||||
# Shuffle the packed dataset
|
||||
packed_dataset = packed_dataset.shuffle(seed=42)
|
||||
logger.info("Packed dataset shuffled")
|
||||
|
||||
# Calculate local batch size
|
||||
if dist.is_initialized():
|
||||
assert global_batch_size % dp_mesh.size() == 0, (
|
||||
f"Global batch size ({global_batch_size}) must be divisible by DP size ({dp_mesh.size()})"
|
||||
)
|
||||
local_batch_size = global_batch_size // dp_mesh.size()
|
||||
else:
|
||||
local_batch_size = global_batch_size
|
||||
|
||||
logger.info(
|
||||
f"Global batch size: {global_batch_size}, DP size: {dp_size if dist.is_initialized() else 1}, Local batch size: {local_batch_size}"
|
||||
)
|
||||
|
||||
# Simple collate function since sequences are already packed
|
||||
def collate_fn(batch):
|
||||
input_ids = torch.tensor([item["input_ids"] for item in batch], dtype=torch.long)
|
||||
labels = torch.tensor([item["labels"] for item in batch], dtype=torch.long)
|
||||
return {"input_ids": input_ids, "labels": labels}
|
||||
|
||||
if dist.is_initialized():
|
||||
sampler = DistributedSampler(
|
||||
packed_dataset, num_replicas=dp_mesh.size(), rank=dp_mesh.get_local_rank(), shuffle=False
|
||||
)
|
||||
else:
|
||||
sampler = None
|
||||
|
||||
dataloader = DataLoader(
|
||||
packed_dataset,
|
||||
batch_size=local_batch_size,
|
||||
sampler=sampler,
|
||||
shuffle=False,
|
||||
collate_fn=collate_fn,
|
||||
)
|
||||
logger.info(f"DataLoader created. Distributed: {dist.is_initialized()}")
|
||||
|
||||
optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=0.1)
|
||||
|
||||
# Training loop
|
||||
logger.info(f"Starting training for {num_train_steps} steps...")
|
||||
model.train()
|
||||
step = 0
|
||||
while step < num_train_steps:
|
||||
for batch in dataloader:
|
||||
if step >= num_train_steps:
|
||||
break # Exit loop if max steps reached
|
||||
|
||||
# Move batch to appropriate device
|
||||
batch = {k: v.to(device) for k, v in batch.items()}
|
||||
|
||||
# Sanity checks for batch distribution (only if distributed)
|
||||
if dist.is_initialized() and not ignore_sanity_checks:
|
||||
# check batch is same across all tp
|
||||
sanity_check_tensor_sync(batch["input_ids"], tp_mesh)
|
||||
# check batch is different across dp
|
||||
sanity_check_tensor_sync(batch["input_ids"], dp_mesh, not_sync=True)
|
||||
|
||||
optimizer.zero_grad()
|
||||
|
||||
# Add position_ids to batch before CP sharding
|
||||
batch_size = batch["input_ids"].shape[0]
|
||||
position_ids = torch.arange(0, seq_len, dtype=torch.long, device=device)
|
||||
position_ids = position_ids.unsqueeze(0).expand(batch_size, -1)
|
||||
batch["position_ids"] = position_ids
|
||||
from torch.distributed.tensor.experimental._attention import _cp_options
|
||||
|
||||
_cp_options.enable_load_balance = False
|
||||
|
||||
with sdpa_kernel(sdpa_backend): # TODO: ideally move this to attention implementation
|
||||
cp_context = (
|
||||
nullcontext()
|
||||
if cp_mesh.size() == 1
|
||||
else context_parallel(
|
||||
cp_mesh,
|
||||
buffers=[
|
||||
batch["input_ids"],
|
||||
batch["labels"],
|
||||
batch["position_ids"],
|
||||
], # TODO: need to add attention mask
|
||||
buffer_seq_dims=[1, 1, 1],
|
||||
)
|
||||
)
|
||||
with cp_context:
|
||||
# Pop labels from batch before model forward pass
|
||||
labels = batch.pop("labels")
|
||||
outputs = model(**batch) # [mbs, seq_len/cp]
|
||||
loss = outputs.loss
|
||||
logits = outputs.logits
|
||||
|
||||
# Compute loss with shifted labels
|
||||
loss = model.loss_function(
|
||||
logits=logits, labels=None, shift_labels=labels, vocab_size=model.config.vocab_size
|
||||
)
|
||||
|
||||
# Sanity checks for logits
|
||||
if dist.is_initialized() and not ignore_sanity_checks:
|
||||
# sanity_check_tensor_sync(logits, tp_mesh) # TODO: only true without sequence parallel
|
||||
sanity_check_tensor_sync(logits, dp_mesh, not_sync=True)
|
||||
sanity_check_tensor_sync(logits, cp_mesh, not_sync=True)
|
||||
|
||||
loss.backward()
|
||||
|
||||
# all reduce grads across dp_cp if applicable
|
||||
all_reduce_grads(model, world_mesh, use_ddp=use_ddp)
|
||||
|
||||
# Sanity checks for gradients (only if distributed)
|
||||
if dist.is_initialized() and not ignore_sanity_checks:
|
||||
# check grads are not same across all tp (for sharded grads)
|
||||
for name, param in model.named_parameters():
|
||||
if param.grad is not None and isinstance(param.grad, DTensor):
|
||||
if param.grad.placements[0].is_shard():
|
||||
sanity_check_tensor_sync(param.grad, tp_mesh, not_sync=True)
|
||||
elif param.grad.placements[0].is_replicate():
|
||||
sanity_check_tensor_sync(param.grad, tp_mesh)
|
||||
# check grads are same across dp
|
||||
for name, param in model.named_parameters():
|
||||
if param.grad is not None and dp_mesh.size() > 1:
|
||||
sanity_check_tensor_sync(param.grad, dp_mesh)
|
||||
# check grads are same across cp
|
||||
for name, param in model.named_parameters():
|
||||
if param.grad is not None and cp_mesh.size() > 1:
|
||||
sanity_check_tensor_sync(param.grad, cp_mesh)
|
||||
|
||||
# Calculate gradient norm and clip gradients
|
||||
if hasattr(model, "clip_grad_norm_"):
|
||||
# when using FSDP or DDP, model.parameters() doesn't work
|
||||
gradnorm = model.clip_grad_norm_(max_norm=1.0, norm_type=2.0)
|
||||
else:
|
||||
            assert len(list(model.parameters())) > 2, "No parameters found in model. Probably a DDP bug."
            assert len([p for p in model.parameters() if p.requires_grad]) > 2, (
                "No trainable parameters found in model. Probably a DDP bug."
            )
            assert len([p for p in model.parameters() if p.grad is not None]) > 2, (
                "No gradients found in model. Probably a DDP bug."
            )
            # only works with FSDP's NO_SHARD otherwise we should use FSDP's clip_grad_norm_
            gradnorm = clip_grad_norm_(model.parameters(), max_norm=1.0, norm_type=2.0, foreach=True)

        optimizer.step()
        # Sanity checks for updated model parameters (only if distributed)
        if dist.is_initialized() and not ignore_sanity_checks:
            # check updated model is different across all tp (for sharded params)
            for name, param in model.named_parameters():
                if isinstance(param, DTensor):
                    if param.placements[0].is_shard():
                        sanity_check_tensor_sync(param, tp_mesh, not_sync=True)
                    elif param.placements[0].is_replicate():
                        sanity_check_tensor_sync(param, tp_mesh)
            # check updated model is same across dp
            for name, param in model.named_parameters():
                sanity_check_tensor_sync(param, dp_mesh)
            # check updated model is same across cp
            for name, param in model.named_parameters():
                sanity_check_tensor_sync(param, cp_mesh)

        # allreduce loss across cp_dp before logging
        if dist.is_initialized() and (cp_mesh.size() > 1 or dp_mesh.size() > 1):
            dist.all_reduce(loss, group=world_mesh["dp_cp"].get_group(), op=dist.ReduceOp.AVG)
        current_loss = loss.item()

        # Log loss and gradnorm to wandb (only on rank 0 of dp group)
        if not dist.is_initialized() or dist.get_rank() == 0:
            logger.info(
                f"Step: {step} | GBS: {global_batch_size} | DP: {dp_mesh.size()} | TP: {tp_mesh.size()} | CP: {cp_mesh.size()} | Loss: {current_loss} | Gradnorm: {gradnorm} | lr: {LR}"
            )
            wandb.log(
                {
                    "train/loss": current_loss,
                    "train/gradnorm": gradnorm,
                    "step": step,
                    "lr": LR,
                    "GBS": global_batch_size,
                }
            )

        step += 1  # Increment step count

    logger.info("Training loop finished.")

    # Save model using DCP (only if distributed)
    if dist.is_initialized():
        state_dict = {"app": AppState(model, optimizer)}
        dcp.save(
            state_dict=state_dict,
            checkpoint_id=CHECKPOINT_DIR,
        )
        logger.info(f"Saved checkpoint to {CHECKPOINT_DIR}")
    else:
        # Fallback to regular save for non-distributed case
        save_dir = "test_model_nondist"
        model.save_pretrained(save_dir, safe_serialization=False)
        tokenizer.save_pretrained(save_dir)  # Save tokenizer too
        logger.info(f"Saved model to {save_dir}")

    # Example of loading the checkpoint (only if distributed)
    if dist.is_initialized():
        # Create a new model instance
        logger.info("Creating new model instance for verification")
        new_model = AutoModelForCausalLM.from_pretrained(
            model_name,
            device_mesh=tp_mesh,
            torch_dtype=torch.bfloat16,  # Use same dtype
        )
        new_optimizer = optim.AdamW(new_model.parameters(), lr=LR)

        # Load checkpoint into new model
        state_dict = {"app": AppState(new_model, new_optimizer)}
        dcp.load(
            state_dict=state_dict,
            checkpoint_id=CHECKPOINT_DIR,
        )
        logger.info("Loaded checkpoint into new model")

        # Verify model weights match
        logger.info("Verifying model weights match...")
        for (name1, param1), (name2, param2) in zip(model.named_parameters(), new_model.named_parameters()):
            torch.testing.assert_close(
                param1.to_local(),
                param2.to_local(),
                rtol=1e-3,
                atol=1e-3,
                msg=f"Weights mismatch in {name1} vs {name2}",
            )

        # Verify optimizer states match
        logger.info("Verifying optimizer states match...")
        for name1, state1 in optimizer.state_dict().items():
            state2 = new_optimizer.state_dict()[name1]
            if name1 == "state":
                # Compare state dictionaries for each parameter
                for param_id, param_state1 in state1.items():
                    param_state2 = state2[param_id]
                    # Compare each state component (step, exp_avg, exp_avg_sq)
                    for key, value1 in param_state1.items():
                        value2 = param_state2[key]
                        if isinstance(value1, DTensor):
                            # Convert DTensors to local tensors for comparison
                            torch.testing.assert_close(
                                value1.to_local(),
                                value2.to_local(),
                                rtol=1e-5,
                                atol=1e-5,
                                msg=f"Optimizer state mismatch in state[{param_id}][{key}]",
                            )
                        else:
                            torch.testing.assert_close(
                                value1,
                                value2,
                                rtol=1e-5,
                                atol=1e-5,
                                msg=f"Optimizer state mismatch in state[{param_id}][{key}]",
                            )
            elif name1 == "param_groups":
                # Compare param_groups (excluding the actual params list)
                for i, (group1, group2) in enumerate(zip(state1, state2)):
                    for key in group1:
                        if key != "params":  # Skip comparing the params list
                            assert group1[key] == group2[key], f"Param group mismatch in param_groups[{i}][{key}]"

        # Run a forward pass with both models to verify outputs match
        logger.info("Running forward pass verification...")
        with torch.no_grad():
            # Use the last batch for verification
            batch = {k: v.to(device) for k, v in batch.items()}  # Ensure batch is on correct device
            original_outputs = model(**batch)
            new_outputs = new_model(**batch)
            torch.testing.assert_close(
                original_outputs.logits.to_local(),
                new_outputs.logits.to_local(),
                rtol=1e-3,
                atol=1e-3,
                msg="Model outputs do not match!",
            )  # Increased tolerance slightly for bf16

    # Clean up distributed environment and finish wandb run
    if dist.is_initialized():
        dist.destroy_process_group()
        logger.info("Cleaned up distributed process group")
        # Finish wandb run on rank 0
        if dist.get_rank() == 0:
            wandb.finish()
            logger.info("Wandb run finished.")
    else:
        wandb.finish()
        logger.info("Wandb run finished.")

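The world_mesh, dp_mesh, tp_mesh, and cp_mesh handles used throughout the loop above are created earlier in the script, outside this excerpt. A minimal sketch of that setup, with the mesh sizes as placeholders rather than values taken from the diff, could look like this:

from torch.distributed.device_mesh import init_device_mesh

# hypothetical sizes; dp * tp * cp must equal the number of launched processes
dp, tp, cp = 2, 2, 2
world_mesh = init_device_mesh("cuda", (dp, tp, cp), mesh_dim_names=("dp", "tp", "cp"))
dp_mesh, tp_mesh, cp_mesh = world_mesh["dp"], world_mesh["tp"], world_mesh["cp"]
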
def all_reduce_grads(model, world_mesh, use_ddp):
    """All reduce gradients across dp_cp if applicable."""
    cp_mesh = world_mesh["cp"]
    if use_ddp:
        # DDP takes care of syncing grads
        mesh = cp_mesh
    else:
        mesh = world_mesh["dp", "cp"]._flatten(mesh_dim_name="dp_cp")
    if dist.is_initialized() and mesh.size() > 1:
        for name, param in model.named_parameters():
            if param.grad is not None:
                # Workaround for cross-mesh communication limitation with DTensor gradients
                if isinstance(param.grad, DTensor):
                    local_grad = param.grad.to_local()
                    # Ensure grad requires grad for inplace modification checks (might not be needed)
                    # local_grad = local_grad.detach().requires_grad_(True)
                    torch.distributed.all_reduce(local_grad, op=torch.distributed.ReduceOp.SUM, group=mesh.get_group())
                    local_grad = local_grad / mesh.size()
                    # Assign averaged grad back - need careful handling if DTensor structure is complex
                    # This simple assignment might work if the grad structure matches param structure
                    param.grad = DTensor.from_local(
                        local_grad, device_mesh=param.grad.device_mesh, placements=param.grad.placements
                    )
                else:
                    # Handle regular tensors if any exist (e.g. buffers not converted to DTensor)
                    torch.distributed.all_reduce(param.grad, op=torch.distributed.ReduceOp.AVG, group=mesh.get_group())

class ContextParallelCollator:
    """Collator for context parallel training that splits sequences into chunks."""

    def __init__(self, cp_mesh: Optional[DeviceMesh] = None):
        self.cp_mesh = cp_mesh

    def __call__(self, batch: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
        batch = default_collate(batch)
        if self.cp_mesh is not None and self.cp_mesh.size() > 1:
            # Get sequence length from the input batch
            seq_len = batch["input_ids"].shape[1]
            assert seq_len % self.cp_mesh.size() == 0, (
                f"Sequence length {seq_len} must be divisible by CP size {self.cp_mesh.size()}"
            )
            chunk_size = seq_len // self.cp_mesh.size()
            cp_rank = self.cp_mesh.get_local_rank()
            start_idx = cp_rank * chunk_size
            end_idx = start_idx + chunk_size

            # Keep only the local chunk of the sequence
            batch["input_ids"] = batch["input_ids"][:, start_idx:end_idx]
            batch["attention_mask"] = batch["attention_mask"][:, start_idx:end_idx]
            batch["labels"] = batch["labels"][:, start_idx:end_idx]

        return batch

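How the collator is wired into the input pipeline is not visible in this excerpt; a minimal sketch under the assumption of a standard DataLoader setup (dataset and micro_batch_size are placeholder names, not taken from the diff) would be:

from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

# shard examples across dp ranks, then let the collator slice each sequence
# down to the local CP chunk
sampler = DistributedSampler(dataset, num_replicas=dp_mesh.size(), rank=dp_mesh.get_local_rank(), shuffle=True)
dataloader = DataLoader(
    dataset,
    batch_size=micro_batch_size,
    sampler=sampler,
    collate_fn=ContextParallelCollator(cp_mesh),
)
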
class AppState(Stateful):
    """Wrapper for checkpointing the Application State including model and optimizer."""

    def __init__(self, model, optimizer=None):
        self.model = model
        self.optimizer = optimizer

    def state_dict(self):
        model_state_dict, optimizer_state_dict = get_state_dict(self.model, self.optimizer)
        return {"model": model_state_dict, "optim": optimizer_state_dict}

    def load_state_dict(self, state_dict):
        set_state_dict(
            self.model, self.optimizer, model_state_dict=state_dict["model"], optim_state_dict=state_dict["optim"]
        )

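Stateful, get_state_dict, and set_state_dict are not defined in this excerpt; they presumably come from PyTorch's distributed checkpointing API, with imports along these lines near the top of the script:

import torch.distributed.checkpoint as dcp
from torch.distributed.checkpoint.state_dict import get_state_dict, set_state_dict
from torch.distributed.checkpoint.stateful import Stateful
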
def sanity_check_tensor_sync(
    tensor: torch.Tensor, mesh: DeviceMesh, rtol: float = 1e-4, atol: float = 1e-4, not_sync: bool = False
) -> None:
    """
    Verify that a tensor is synchronized (or not synchronized) across all processes in the mesh's process group.
    Handles both regular tensors and DTensors.

    Args:
        tensor (torch.Tensor): The tensor to check for synchronization (can be DTensor)
        mesh (DeviceMesh): The device mesh containing the process group
        rtol (float): Relative tolerance for comparison
        atol (float): Absolute tolerance for comparison
        not_sync (bool): If True, asserts that tensors are NOT synchronized. If False, asserts they are synchronized.
    """
    if not dist.is_initialized() or mesh.size() == 1:
        return  # No need to check in non-distributed mode

    # Get the process group from the mesh
    pg = mesh.get_group()

    # Convert DTensor to local tensor if needed
    if hasattr(tensor, "to_local"):
        local_tensor = tensor.to_local()
    else:
        local_tensor = tensor

    # Gather tensors from all processes
    world_size = dist.get_world_size(pg)
    gathered_tensors = [torch.empty_like(local_tensor) for _ in range(world_size)]
    dist.all_gather(gathered_tensors, local_tensor, group=pg)

    # Compare each tensor with the first one
    for i in range(1, world_size):
        try:
            torch.testing.assert_close(gathered_tensors[0], gathered_tensors[i], rtol=rtol, atol=atol)
        except AssertionError as e:
            if not_sync:
                continue
            # # Add detailed debugging for logit synchronization issues
            # print(f"\nLogit synchronization error between rank 0 and rank {i}:")
            # print(f"Tensor shape: {gathered_tensors[0].shape}")
            # print(f"Number of mismatched elements: {(gathered_tensors[0] != gathered_tensors[i]).sum()}")
            # print(f"Percentage of mismatched elements: {((gathered_tensors[0] != gathered_tensors[i]).sum() / gathered_tensors[0].numel() * 100):.2f}%")

            # # Find the first few mismatches
            # mismatches = torch.nonzero(gathered_tensors[0] != gathered_tensors[i])
            # print("\nFirst few mismatches:")
            # for idx in mismatches[:5]:
            # idx = tuple(idx.tolist())
            # print(f"Index {idx}:")
            # print(f"Rank 0 value: {gathered_tensors[0][idx]}")
            # print(f"Rank {i} value: {gathered_tensors[i][idx]}")
            # print(f"Absolute difference: {abs(gathered_tensors[0][idx] - gathered_tensors[i][idx])}")
            # print(f"Relative difference: {abs(gathered_tensors[0][idx] - gathered_tensors[i][idx]) / max(abs(gathered_tensors[0][idx]), abs(gathered_tensors[i][idx]))}")

            # # Check if differences are systematic (e.g., all positive or negative)
            # diff = gathered_tensors[0] - gathered_tensors[i]
            # print(f"\nDifference statistics:")
            # print(f"Mean difference: {diff.mean()}")
            # print(f"Std difference: {diff.std()}")
            # print(f"Max positive difference: {diff.max()}")
            # print(f"Max negative difference: {diff.min()}")
            raise e

def clip_grad_norm_(
    parameters: Iterable[torch.Tensor],
    max_norm: float,
    norm_type: float = 2.0,
    error_if_nonfinite: bool = False,
    foreach: bool | None = None,
) -> torch.Tensor:
    """
    Clip the gradient norm of an iterable of parameters.
    """
    # Filter out parameters with no gradients
    parameters = [p for p in parameters if p.grad is not None]
    assert len(parameters) > 0, "No parameters with gradients found"

    # Calculate total norm
    if norm_type == float("inf"):
        total_norm = max(p.grad.detach().abs().max() for p in parameters)
    else:
        total_norm = torch.norm(torch.stack([torch.norm(p.grad.detach(), norm_type) for p in parameters]), norm_type)

    # Convert DTensor to local tensor if needed
    if isinstance(total_norm, DTensor):
        total_norm = total_norm.full_tensor()

    # Clip gradients
    clip_coef = max_norm / (total_norm + 1e-6)
    if clip_coef < 1:
        for p in parameters:
            p.grad.detach().mul_(clip_coef)

    return total_norm

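The full_tensor() call above is the DTensor-specific piece: when gradients are sharded, torch.norm returns a DTensor, and materializing it yields the global norm. A standalone illustration, assuming a recent PyTorch and a distributed launch (the tensor shape is arbitrary and not taken from the diff):

import os

import torch
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import Shard, distribute_tensor

# assumes the script was started with a distributed launcher that sets WORLD_SIZE
mesh = init_device_mesh("cuda", (int(os.environ["WORLD_SIZE"]),))
grad = distribute_tensor(torch.randn(1024), device_mesh=mesh, placements=[Shard(0)])
norm = torch.norm(grad, 2.0)      # still a DTensor
global_norm = norm.full_tensor()  # plain tensor holding the global norm, as used above
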
def check_params_sync(model_params, original_params):
    """
    Check if original_params are being updated in sync with model parameters.

    Args:
        model_params: Iterator of model parameters after update
        original_params: List of original parameters before DDP wrapping
    """
    for mp, op in zip(model_params, original_params):
        if isinstance(mp, DTensor):
            mp = mp.to_local()
        if isinstance(op, DTensor):
            op = op.to_local()
        if not torch.allclose(mp.data, op.data, rtol=0, atol=0):
            raise RuntimeError(f"Parameters out of sync: model param {mp.data} != original param {op.data}")
    return True


def get_parameters(model: nn.Module) -> Iterable[torch.Tensor]:
    """
    Get all parameters from a model by iterating over its modules.
    This is an alternative to model.parameters() that works with DTensor models.

    Args:
        model (nn.Module): The model to get parameters from

    Returns:
        Iterable[torch.Tensor]: An iterator over all parameters in the model
    """
    for name, module in model._modules.items():
        # Look for parameters in module attributes
        for attr_name, attr in module.__dict__.items():
            if isinstance(attr, torch.Tensor) and attr.requires_grad:
                yield attr
        # Recursively get parameters from submodules
        for param in get_parameters(module):
            yield param


def update_model_parameters(model: nn.Module) -> None:
    """
    Update model._parameters using named_modules() to ensure all parameters are properly tracked.

    Args:
        model (nn.Module): The model to update parameters for
    """
    # Clear existing parameters
    model._parameters = {}

    # Add parameters from named_modules
    for name, module in model.named_modules():
        # Skip the root module itself
        if name == "":
            continue

        # Get the parameter name by removing 'module.' prefix if it exists
        param_name = name.replace("module.", "")

        # Add weight and bias parameters if they exist
        if hasattr(module, "weight") and module.weight is not None:
            model._parameters[f"{param_name}.weight"] = module.weight
        if hasattr(module, "bias") and module.bias is not None:
            model._parameters[f"{param_name}.bias"] = module.bias


if __name__ == "__main__":
    main()
@ -44,7 +44,7 @@ from transformers.utils.versions import require_version

logger = logging.getLogger(__name__)

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt")

examples/pytorch/context_parallel.py (new file, 94 lines)
@ -0,0 +1,94 @@
# Copyright 2024 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor.experimental import context_parallel
from torch.nn.attention import SDPBackend, sdpa_kernel
from torch.nn.parallel import DistributedDataParallel as DDP

from transformers import AutoModelForCausalLM
from transformers.loss.loss_utils import ForCausalLMLoss


world_size = int(os.environ.get("WORLD_SIZE", "1"))
cp_mesh = init_device_mesh("cuda", (world_size,))
rank = torch.distributed.get_node_local_rank()

device = "cuda"
dtype = torch.bfloat16
sdpa_backend = SDPBackend.FLASH_ATTENTION

# prepare inputs
batch_size = 1
seq_len = 128

input_ids = torch.randint(low=8, high=64, size=(batch_size, seq_len), device=device)

ignore_index = -100
# When using CP, we need to use `shift_labels`
shift_labels = torch.nn.functional.pad(input_ids, (0, 1), value=ignore_index)
shift_labels = shift_labels[..., 1:].contiguous()

position_ids = (
    torch.cumsum(torch.ones(size=input_ids.size(), dtype=input_ids.dtype, device=input_ids.device), dim=1) - 1
)

# sync inputs across ranks, since they are created randomly
dist.broadcast(input_ids, src=0)
dist.broadcast(shift_labels, src=0)
dist.broadcast(position_ids, src=0)

# model and optimizer
repo_id = "Qwen/Qwen2.5-Coder-0.5B-Instruct"
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype=dtype, device_map=device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

model.train()
model.zero_grad()
optimizer.zero_grad()

# For loss
vocab_size = model.config.vocab_size

# wrap in DDP so training stays in sync across ranks
model = DDP(model, device_ids=[rank])

# prepare for CP
buffers = (input_ids, shift_labels, position_ids)
buffer_seq_dims = (1, 1, 1)
# `no_restore_buffers=set(buffers)` is required if `loss.backward` is outside `context_parallel`.
# no_restore_buffers = set(buffers)
no_restore_buffers = None

# run with CP
with sdpa_kernel(sdpa_backend):
    with context_parallel(
        cp_mesh,
        buffers=buffers,
        buffer_seq_dims=buffer_seq_dims,
        no_restore_buffers=no_restore_buffers,
    ):
        outputs = model(input_ids, shift_labels=shift_labels, position_ids=position_ids)
        print(outputs.logits.shape)

        # For now, we need to compute the `loss` outside `model.forward` when using `shift_labels`
        # loss = outputs.loss
        loss = ForCausalLMLoss(logits=outputs.logits, labels=None, shift_labels=shift_labels, vocab_size=vocab_size)

        # This could be outside the `context_parallel` context if `no_restore_buffers` is specified
        loss.backward()
        optimizer.step()
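The new example reads WORLD_SIZE from the environment, so it is presumably meant to be launched with torchrun, e.g. torchrun --nproc-per-node 4 examples/pytorch/context_parallel.py (the GPU count here is an assumption, not from the diff). It also hard-codes CUDA, bfloat16, and the FLASH_ATTENTION SDPA backend, so it needs a GPU setup that supports all three.
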
@ -53,7 +53,7 @@ from transformers.utils.versions import require_version

logger = logging.getLogger(__name__)

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt")

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version

logger = logging.getLogger(__name__)

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt")

@ -48,7 +48,7 @@ from transformers.utils.versions import require_version

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

logger = get_logger(__name__)

@ -42,7 +42,7 @@ from transformers.utils.versions import require_version

logger = logging.getLogger(__name__)

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

@ -47,7 +47,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.

logger = logging.getLogger(__name__)

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

@ -52,7 +52,7 @@ Any model supported by the AutoModelForMaskedImageModeling API can be used.

logger = logging.getLogger(__name__)

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/image-pretraining/requirements.txt")

@ -46,7 +46,7 @@ from transformers.utils.versions import require_version

logger = logging.getLogger(__name__)

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")

@ -52,7 +52,7 @@ from transformers.utils.versions import require_version

logger = logging.getLogger(__name__)

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

require_version("datasets>=2.0.0", "To fix: pip install -r examples/pytorch/instance-segmentation/requirements.txt")

@ -54,7 +54,7 @@ from transformers.utils.versions import require_version

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

logger = get_logger(__name__)

@ -57,7 +57,7 @@ from transformers.utils.versions import require_version

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

@ -59,7 +59,7 @@ from transformers.utils.versions import require_version

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

logger = get_logger(__name__)

@ -53,7 +53,7 @@ from transformers.utils.versions import require_version

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

@ -56,7 +56,7 @@ from transformers.utils.versions import require_version

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

logger = get_logger(__name__)
require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

@ -46,7 +46,7 @@ from transformers.utils.versions import require_version

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

@ -45,7 +45,7 @@ from transformers.utils import check_min_version, send_example_telemetry

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

logger = logging.getLogger(__name__)

@ -53,7 +53,7 @@ from transformers.utils import check_min_version, send_example_telemetry

# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
check_min_version("4.52.0.dev0")
check_min_version("4.52.0")

logger = get_logger(__name__)
# You should update this to your particular problem to have better documentation of `model_type`

Some files were not shown because too many files have changed in this diff.