Merge branch 'main' into fix-gkd-liger-mem-spike

2025-10-20 18:43:52 +08:00 · 2025-10-15 09:40:31 -06:00
parent 2c457622bc 773afd9314
commit 3b3d1ec0f5
159 changed files with 5044 additions and 5504 deletions
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@ -14,6 +14,5 @@ jobs:
      commit_sha: ${{ github.sha }}
      package: trl
      version_tag_suffix: ""
-      custom_container: huggingface/transformers-doc-builder
    secrets:
      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@ -16,4 +16,3 @@ jobs:
      pr_number: ${{ github.event.number }}
      package: trl
      version_tag_suffix: ""
-      custom_container: huggingface/transformers-doc-builder
--- a/.github/workflows/slow-tests.yml
+++ b/.github/workflows/slow-tests.yml
@ -68,7 +68,7 @@ jobs:
      CUDA_VISIBLE_DEVICES: "0,1"
      TEST_TYPE: "multi_gpu"
    container:
-      image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
+      image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
      options: --gpus all --shm-size "16gb"
    defaults:
      run:
@ -115,6 +115,4 @@ jobs:
          source .venv/bin/activate
          uv pip install slack_sdk tabulate
          python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
-          python scripts/log_example_reports.py --text_file_name temp_results_sft_tests.txt >> $GITHUB_STEP_SUMMARY
-          python scripts/log_example_reports.py --text_file_name temp_results_dpo_tests.txt >> $GITHUB_STEP_SUMMARY
          rm *.txt
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@ -11,11 +11,12 @@ on:
      - "scripts/**.py"
      - "tests/**.py"
      - "trl/**.py"
-      - "setup.py"
+      - "pyproject.toml"

 env:
  TQDM_DISABLE: 1
  CI_SLACK_CHANNEL: ${{ secrets.CI_PUSH_MAIN_CHANNEL }}
+  PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"

 jobs:
  check_code_quality:
@ -41,7 +42,7 @@ jobs:
    runs-on:
      group: aws-g4dn-2xlarge
    container:
-      image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
+      image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
      options: --gpus all
    defaults:
      run:
@ -93,7 +94,7 @@ jobs:
    runs-on:
      group: aws-g4dn-2xlarge
    container:
-      image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
+      image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
      options: --gpus all
    defaults:
      run:
@ -128,7 +129,7 @@ jobs:
          uv pip install -U git+https://github.com/huggingface/accelerate.git
          uv pip install -U git+https://github.com/huggingface/datasets.git
          uv pip install -U git+https://github.com/huggingface/transformers.git
-          
+          uv pip install -U git+https://github.com/huggingface/peft.git

      - name: Test with pytest
        run: |
@ -149,7 +150,7 @@ jobs:
    runs-on:
      group: aws-g4dn-2xlarge
    container:
-      image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
+      image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
      options: --gpus all
    defaults:
      run:
@ -201,7 +202,7 @@ jobs:
    runs-on:
      group: aws-g4dn-2xlarge
    container:
-      image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
+      image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
      options: --gpus all
    defaults:
      run:
--- a/.github/workflows/tests_latest.yml
+++ b/.github/workflows/tests_latest.yml
@ -16,7 +16,7 @@ jobs:
    runs-on:
      group: aws-g4dn-2xlarge
    container:
-      image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
+      image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
      options: --gpus all
    defaults:
      run:
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -1,15 +1,10 @@
 # How to contribute to TRL?

-Everyone is welcome to contribute, and we value everybody's contribution. Code
-contributions are not the only way to help the community. Answering questions, helping
-others, and improving the documentation are also immensely valuable.
+Everyone is welcome to contribute, and we value everybody's contribution. Code contributions are not the only way to help the community. Answering questions, helping others, and improving the documentation are also immensely valuable.

-It also helps us if you spread the word! Reference the library in blog posts
-about the awesome projects it made possible, shout out on Twitter every time it has
-helped you, or simply ⭐️ the repository to say thank you.
+It also helps us if you spread the word! Reference the library in blog posts about the awesome projects it made possible, shout out on Twitter every time it has helped you, or simply ⭐️ the repository to say thank you.

-However you choose to contribute, please be mindful and respect our
-[code of conduct](https://github.com/huggingface/trl/blob/main/CODE_OF_CONDUCT.md).
+However you choose to contribute, please be mindful and respect our [code of conduct](https://github.com/huggingface/trl/blob/main/CODE_OF_CONDUCT.md).

 **This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).**

@ -22,9 +17,7 @@ There are several ways you can contribute to TRL:
 * Implement trainers for new post-training algorithms.
 * Contribute to the examples or the documentation.

-If you don't know where to start, there is a special [Good First
-Issue](https://github.com/huggingface/trl/labels/%F0%9F%91%B6%20good%20first%20issue) listing. It will give you a list of
-open issues that are beginner-friendly and help you start contributing to open-source. The best way to do that is to open a Pull Request and link it to the issue that you'd like to work on. We try to give priority to opened PRs as we can easily track the progress of the fix, and if the contributor does not have time anymore, someone else can take the PR over.
+If you don't know where to start, there is a special [Good First Issue](https://github.com/huggingface/trl/labels/%F0%9F%91%B6%20good%20first%20issue) listing. It will give you a list of open issues that are beginner-friendly and help you start contributing to open-source. The best way to do that is to open a Pull Request and link it to the issue that you'd like to work on. We try to give priority to opened PRs as we can easily track the progress of the fix, and if the contributor does not have time anymore, someone else can take the PR over.

 For something slightly more challenging, you can also take a look at the [Good Second Issue](https://github.com/huggingface/trl/labels/Good%20Second%20Issue) list. In general though, if you feel like you know what you're doing, go for it and we'll help you get there! 🚀

@ -48,14 +41,12 @@ Do your best to follow these guidelines when submitting a bug-related issue or a

 The TRL library is robust and reliable thanks to users who report the problems they encounter.

-Before you report an issue, we would really appreciate it if you could **make sure the bug was not
-already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the library itself, and not your code.
+Before you report an issue, we would really appreciate it if you could **make sure the bug was not already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the library itself, and not your code.

 Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so we can quickly resolve it:

 * Your **OS type and version**, **Python**, **PyTorch**, **TRL** and **Transformers** versions.
-* A short, self-contained, code snippet that allows us to reproduce the bug in
-  less than 30s.
+* A short, self-contained, code snippet that allows us to reproduce the bug in less than 30s.
 * The *full* traceback if an exception is raised.
 * Attach any other additional information, like screenshots, you think may help.

@ -106,29 +97,20 @@ We're always looking for improvements to the documentation that make it more cle

 ## Submitting a pull request (PR)

-Before writing code, we strongly advise you to search through the existing PRs or
-issues to make sure that nobody is already working on the same thing. If you are
-unsure, it is always a good idea to open an issue to get some feedback.
+Before writing code, we strongly advise you to search through the existing PRs or issues to make sure that nobody is already working on the same thing. If you are unsure, it is always a good idea to open an issue to get some feedback.

-You will need basic `git` proficiency to be able to contribute to
-TRL. `git` is not the easiest tool to use but it has the greatest
-manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
-Git](https://git-scm.com/book/en/v2) is a very good reference.
+You will need basic `git` proficiency to be able to contribute to TRL. `git` is not the easiest tool to use but it has the greatest manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro Git](https://git-scm.com/book/en/v2) is a very good reference.

 Follow these steps to start contributing:

-1. Fork the [repository](https://github.com/huggingface/trl) by
-   clicking on the 'Fork' button on the repository's page. This creates a copy of the code
-   under your GitHub user account.
+1. Fork the [repository](https://github.com/huggingface/trl) by clicking on the 'Fork' button on the repository's page. This creates a copy of the code under your GitHub user account.

-2. Clone your fork to your local disk, and add the base repository as a remote. The following command
-   assumes you have your public SSH key uploaded to GitHub. See the following guide for more
-   [information](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository).
+2. Clone your fork to your local disk, and add the base repository as a remote. The following command assumes you have your public SSH key uploaded to GitHub. See the following guide for more [information](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository).

   ```bash
-   $ git clone git@github.com:<your Github handle>/trl.git
-   $ cd trl
-   $ git remote add upstream https://github.com/huggingface/trl.git
+   git clone git@github.com:<your Github handle>/trl.git
+   cd trl
+   git remote add upstream https://github.com/huggingface/trl.git
   ```

 3. Create a new branch to hold your development changes, and do this for every new PR you work on.
@ -136,15 +118,15 @@ Follow these steps to start contributing:
   Start by synchronizing your `main` branch with the `upstream/main` branch (more details in the [GitHub Docs](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/syncing-a-fork)):

   ```bash
-   $ git checkout main
-   $ git fetch upstream
-   $ git merge upstream/main
+   git checkout main
+   git fetch upstream
+   git merge upstream/main
   ```

   Once your `main` branch is synchronized, create a new branch from it:

   ```bash
-   $ git checkout -b a-descriptive-name-for-my-changes
+   git checkout -b a-descriptive-name-for-my-changes
   ```

   **Do not** work on the `main` branch.
@ -152,32 +134,28 @@ Follow these steps to start contributing:
 4. Set up a development environment by running the following command in a conda or a virtual environment you've created for working on this library:

   ```bash
-   $ pip install -e .[dev]
+   pip install -e .[dev]
   ```

-   (If TRL was already installed in the virtual environment, remove
-   it with `pip uninstall trl` before reinstalling it.)
+   (If TRL was already installed in the virtual environment, remove it with `pip uninstall trl` before reinstalling it.)

-   Alternatively, if you are using [Visual Studio Code](https://code.visualstudio.com/Download), the fastest way to get set up is by using
-   the provided Dev Container. Documentation on how to get started with dev containers is available [here](https://code.visualstudio.com/docs/remote/containers).
+   Alternatively, if you are using [Visual Studio Code](https://code.visualstudio.com/Download), the fastest way to get set up is by using the provided Dev Container. Check [the documentation on how to get started with dev containers](https://code.visualstudio.com/docs/remote/containers).

 5. Develop the features on your branch.

-   As you work on the features, you should make sure that the test suite
-   passes. You should run the tests impacted by your changes like this (see 
-   below an explanation regarding the environment variable):
+    As you work on the features, you should make sure that the test suite passes. You should run the tests impacted by your changes like this (see below an explanation regarding the environment variable):

-   ```bash
-   $ pytest tests/<TEST_TO_RUN>.py
-   ```
-   
-   > For the following commands leveraging the `make` utility.
+    ```bash
+    pytest tests/<TEST_TO_RUN>.py
+    ```

-   You can also run the full suite with the following command.
+    > The following commands leverage the `make` utility.

-   ```bash
-   $ make test
-   ```
+    You can also run the full suite with the following command.
+
+    ```bash
+    make test
+    ```

    TRL relies on `ruff` for maintaining consistent code formatting across its source files. Before submitting any PR, you should apply automatic style corrections and run code verification checks.

@ -186,59 +164,51 @@ Follow these steps to start contributing:
    To apply these checks and corrections in one step, use:

    ```bash
-    $ make precommit
+    make precommit
    ```

    This command runs the following:
-    - Executes `pre-commit` hooks to automatically fix style issues with `ruff` and other tools.
-    - Runs additional scripts such as adding copyright information.
+
+    * Executes `pre-commit` hooks to automatically fix style issues with `ruff` and other tools.
+    * Runs additional scripts such as adding copyright information.

    If you prefer to apply the style corrections separately or review them individually, the `pre-commit` hook will handle the formatting for the files in question.

-   Once you're happy with your changes, add changed files using `git add` and
-   make a commit with `git commit` to record your changes locally:
+    Once you're happy with your changes, add changed files using `git add` and make a commit with `git commit` to record your changes locally:

-   ```bash
-   $ git add modified_file.py
-   $ git commit
-   ```
+    ```bash
+    git add modified_file.py
+    git commit
+    ```

-   Please write [good commit messages](https://chris.beams.io/posts/git-commit/).
+    Please write [good commit messages](https://chris.beams.io/posts/git-commit/).

-   It is a good idea to sync your copy of the code with the original
-   repository regularly. This way you can quickly account for changes:
+    It is a good idea to sync your copy of the code with the original
+    repository regularly. This way you can quickly account for changes:

-   ```bash
-   $ git fetch upstream
-   $ git rebase upstream/main
-   ```
+    ```bash
+    git fetch upstream
+    git rebase upstream/main
+    ```

-   Push the changes to your account using:
+    Push the changes to your account using:

-   ```bash
-   $ git push -u origin a-descriptive-name-for-my-changes
-   ```
+    ```bash
+    git push -u origin a-descriptive-name-for-my-changes
+    ```

-6. Once you are satisfied (**and the checklist below is happy too**), go to the
-   webpage of your fork on GitHub. Click on 'Pull request' to send your changes
-   to the project maintainers for review.
+6. Once you are satisfied (**and the checklist below is happy too**), go to the webpage of your fork on GitHub. Click on 'Pull request' to send your changes to the project maintainers for review.

 7. It's ok if maintainers ask you for changes. It happens to core contributors too! To ensure everyone can review your changes in the pull request, work on your local branch and push the updates to your fork. They will automatically appear in the pull request.

-
 ### Checklist

 1. The title of your pull request should be a summary of its contribution;
-2. If your pull request addresses an issue, please mention the issue number in
-   the pull request description to make sure they are linked (and people
-   consulting the issue know you are working on it);
-3. To indicate a work in progress please prefix the title with `[WIP]`, or mark
-   the PR as a draft PR. These are useful to avoid duplicated work, and to differentiate
-   it from PRs ready to be merged;
+2. If your pull request addresses an issue, please mention the issue number in the pull request description to make sure they are linked (and people consulting the issue know you are working on it);
+3. To indicate a work in progress please prefix the title with `[WIP]`, or mark the PR as a draft PR. These are useful to avoid duplicated work, and to differentiate it from PRs ready to be merged;
 4. Make sure existing tests pass;
 5. Add high-coverage tests. No quality testing = no merge.

-
 ### Tests

 An extensive test suite is included to test the library behavior and several examples. Library tests can be found in
@ -248,7 +218,7 @@ We use `pytest` to run the tests. From the root of the
 repository here's how to run tests with `pytest` for the library:

 ```bash
-$ python -m pytest -sv ./tests
+python -m pytest -sv ./tests
 ```

 That's how `make test` is implemented (without the `pip install` line)!
@ -260,23 +230,23 @@ you're working on.

 1. **Use defaults when appropriate**:  

-Provide default values unless the parameter's value varies significantly by use case. For example, datasets or models should not have defaults, but parameters like `learning_rate` should.
+    Provide default values unless the parameter's value varies significantly by use case. For example, datasets or models should not have defaults, but parameters like `learning_rate` should.

 2. **Prioritize proven defaults**:  

-Default values should align with those recommended in the original paper or method. Alternatives require strong evidence of superior performance in most cases.
+    Default values should align with those recommended in the original paper or method. Alternatives require strong evidence of superior performance in most cases.

 3. **Ensure safety and predictability**:  

-Defaults must be safe, expected and reliable. Avoid settings that could lead to surprising outcomes, such as excessive memory usage or poor performance in edge cases.
+    Defaults must be safe, expected and reliable. Avoid settings that could lead to surprising outcomes, such as excessive memory usage or poor performance in edge cases.

 4. **Balance consistency and flexibility**:  

-Aim for consistent defaults across similar functions or methods. However, consistency should not be preferred to point 2 or 3.
+    Aim for consistent defaults across similar functions or methods. However, consistency should not be preferred to point 2 or 3.

 5. **Opt-in for new features**:  

-Do not enable new features or improvements (e.g., novel loss functions) by default. Users should explicitly opt-in to use these.
+    Do not enable new features or improvements (e.g., novel loss functions) by default. Users should explicitly opt-in to use these.

 ### Writing documentation

@ -318,26 +288,26 @@ def replicate_str(string: str, n: int, sep: str = " ") -> str:
  * Note that `Optional` means that the value can be `None`, and `*optional*` means that it is not required for the user to pass a value.
    E.g., for arguments that can't be `None` and aren't required:

-    ```python
+    ```txt
    foo (`int`, *optional*, defaults to `4`):
    ```

    For arguments that can be `None` and are required:

-    ```python
+    ```txt
    foo (`Optional[int]`):
    ```

-    for arguments that can be `None` and aren't required:
+    For arguments that can be `None` and aren't required (in this case, if the default value is `None`, you can omit it):

-    ```python
+    ```txt
    foo (`Optional[int]`, *optional*):
    ```

 * **String Defaults:**
  * Ensure that default string values are wrapped in double quotes:

-    ```python
+    ```txt
    defaults to `"foo"`
    ```

@ -346,7 +316,7 @@ def replicate_str(string: str, n: int, sep: str = " ") -> str:
 * **Default Value Formatting:**
  * Consistently surround default values with backticks for improved formatting:

-    ```python
+    ```txt
    defaults to `4`
    ```

@ -383,8 +353,8 @@ Our approach to deprecation and backward compatibility is flexible and based on

 When a feature or component is marked for deprecation, its use will emit a warning message. This warning will include:

- **Transition Guidance**: Instructions on how to migrate to the alternative solution or replacement.
- **Removal Version**: The target version when the feature will be removed, providing users with a clear timeframe to transition.
+* **Transition Guidance**: Instructions on how to migrate to the alternative solution or replacement.
+* **Removal Version**: The target version when the feature will be removed, providing users with a clear timeframe to transition.

 Example:

@ -398,9 +368,9 @@ Example:

 The deprecation and removal schedule is based on each feature's usage and impact, with examples at two extremes:

- **Experimental or Low-Use Features**: For a feature that is experimental or has limited usage, backward compatibility may not be maintained between releases. Users should therefore anticipate potential breaking changes from one version to the next.
+* **Experimental or Low-Use Features**: For a feature that is experimental or has limited usage, backward compatibility may not be maintained between releases. Users should therefore anticipate potential breaking changes from one version to the next.

- **Widely-Used Components**: For a feature with high usage, we aim for a more gradual transition period of approximately **5 months**, generally scheduling deprecation around **5 minor releases** after the initial warning.
+* **Widely-Used Components**: For a feature with high usage, we aim for a more gradual transition period of approximately **5 months**, generally scheduling deprecation around **5 minor releases** after the initial warning.

 These examples represent the two ends of a continuum. The specific timeline for each feature will be determined individually, balancing innovation with user stability needs.

@ -410,22 +380,22 @@ Warnings play a critical role in guiding users toward resolving potential issues

 #### Definitions

- **Correct**: An operation is correct if it is valid, follows the intended approach, and aligns with the current best practices or guidelines within the codebase. This is the recommended or intended way to perform the operation.
- **Supported**: An operation is supported if it is technically valid and works within the current codebase, but it may not be the most efficient, optimal, or recommended way to perform the task. This includes deprecated features or legacy approaches that still work but may be phased out in the future.
+* **Correct**: An operation is correct if it is valid, follows the intended approach, and aligns with the current best practices or guidelines within the codebase. This is the recommended or intended way to perform the operation.
+* **Supported**: An operation is supported if it is technically valid and works within the current codebase, but it may not be the most efficient, optimal, or recommended way to perform the task. This includes deprecated features or legacy approaches that still work but may be phased out in the future.

 #### Choosing the right message

- **Correct → No warning**:  
+* **Correct → No warning**:  
   If the operation is fully valid and expected, no message should be issued. The system is working as intended, so no warning is necessary.  

- **Correct but deserves attention → No warning, possibly a log message**:
+* **Correct but deserves attention → No warning, possibly a log message**:
   When an operation is correct but uncommon or requires special attention, providing an informational message can be helpful. This keeps users informed without implying any issue. If available, use the logger to output this message. Example:  

   ```python
   logger.info("This is an informational message about a rare but correct operation.")
   ```

- **Correct but very likely a mistake → Warning with option to disable**:  
+* **Correct but very likely a mistake → Warning with option to disable**:  
   In rare cases, you may want to issue a warning for a correct operation that’s very likely a mistake. In such cases, you must provide an option to suppress the warning. This can be done with a flag in the function. Example:  

   ```python
@ -436,7 +406,7 @@ Warnings play a critical role in guiding users toward resolving potential issues
           # Do something
   ```

- **Supported but not correct → Warning**:  
+* **Supported but not correct → Warning**:  
   If the operation is technically supported but is deprecated, suboptimal, or could cause future issues (e.g., conflicting arguments), a warning should be raised. This message should be actionable, meaning it must explain how to resolve the issue. Example:  

   ```python
@ -446,7 +416,7 @@ Warnings play a critical role in guiding users toward resolving potential issues
           # Do something
   ```

- **Not supported → Exception**:  
+* **Not supported → Exception**:  
   If the operation is invalid or unsupported, raise an exception. This indicates that the operation cannot be performed and requires immediate attention. Example:  

   ```python
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -1,6 +1,7 @@
 include LICENSE
 include CONTRIBUTING.md
 include README.md
-recursive-exclude * __pycache__
+include trl/accelerate_configs/*.yaml
 include trl/templates/*.md
-include trl/accelerate_configs/*.yaml
+recursive-exclude * __pycache__
+prune tests
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@ -55,12 +55,12 @@
    title: Example Overview
  - local: community_tutorials
    title: Community Tutorials
+  - local: lora_without_regret
+    title: LoRA Without Regret
  - local: sentiment_tuning
    title: Sentiment Tuning
  - local: using_llama_models
    title: Training StackLlama
-  - local: detoxifying_a_lm
-    title: Detoxifying a Language Model
  - local: multi_adapter_rl
    title: Multi Adapter RLHF
  title: Examples
--- a/docs/source/bco_trainer.md
+++ b/docs/source/bco_trainer.md
@ -1,6 +1,6 @@
 # BCO Trainer

-[![](https://img.shields.io/badge/All_models-BCO-blue)](https://huggingface.co/models?other=bco,trl)
+[![model badge](https://img.shields.io/badge/All_models-BCO-blue)](https://huggingface.co/models?other=bco,trl)

 TRL supports the Binary Classifier Optimization (BCO).
 The [BCO](https://huggingface.co/papers/2404.04656) authors train a binary classifier whose logit serves as a reward so that the classifier maps {prompt, chosen completion} pairs to 1 and {prompt, rejected completion} pairs to 0.
@ -12,17 +12,16 @@ The [`BCOTrainer`] requires an [unpaired preference dataset](dataset_formats#unp
 The [`BCOTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset formats. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.

 ## Expected model format
+
 The BCO trainer expects a model of `AutoModelForCausalLM`, compared to PPO that expects `AutoModelForCausalLMWithValueHead` for the value function.

 ## Using the `BCOTrainer`

-For a detailed example have a look at the `examples/scripts/bco.py` script. At a high level we need to initialize the `BCOTrainer` with a `model` we wish to train and a reference `ref_model` which we will use to calculate the implicit rewards of the preferred and rejected response. 
+For a detailed example have a look at the `examples/scripts/bco.py` script. At a high level we need to initialize the `BCOTrainer` with a `model` we wish to train and a reference `ref_model` which we will use to calculate the implicit rewards of the preferred and rejected response.

 The `beta` refers to the hyperparameter of the implicit reward, and the dataset contains the 3 entries listed above. Note that the `model` and `ref_model` need to have the same architecture (i.e., decoder-only or encoder-decoder).

-
-
-```py
+```python
 training_args = BCOConfig(
    beta=0.1,
 )
@ -35,9 +34,10 @@ bco_trainer = BCOTrainer(
    processing_class=tokenizer,
 )
 ```
+
 After this one can then call:

-```py
+```python
 bco_trainer.train()
 ```

@ -49,7 +49,7 @@ If the prompts in your desired and undesired datasets differ a lot, it is useful

 Choose an embedding model and tokenizer:

-```py
+```python
 embedding_model = AutoModel.from_pretrained(your_model_id)
 embedding_tokenizer = AutoTokenizer.from_pretrained(your_model_id)

@ -64,7 +64,7 @@ embedding_func = partial(embed_prompt, model=embedding_model)

 Set `prompt_sample_size` to define how many prompts are selected to train the UDM classifier and start the training with the provided embedding function:

-```py
+```python
 training_args = BCOConfig(
    beta=0.1,
    prompt_sample_size=512,
--- a/docs/source/best_of_n.md
+++ b/docs/source/best_of_n.md
@ -1,4 +1,4 @@
-# Best of N sampling: Alternative ways to get better model output without RL based fine-tuning 
+# Best of N sampling: Alternative ways to get better model output without RL based fine-tuning

 Within the extras module is the `best-of-n` sampler class that serves as an alternative method of generating better model output.
 As to how it fares against the RL based fine-tuning, please look in the `examples` directory for a comparison example
@ -8,7 +8,6 @@ As to how it fares against the RL based fine-tuning, please look in the `example
 To get started quickly, instantiate an instance of the class with a model, a length sampler, a tokenizer and a callable that serves as a proxy reward pipeline that outputs reward scores for input queries

 ```python
-
 from transformers import pipeline, AutoTokenizer
 from trl import AutoModelForCausalLMWithValueHead
 from trl.core import LengthSampler
@ -19,41 +18,33 @@ reward_pipe = pipeline("sentiment-analysis", model=reward_model, device=device)
 tokenizer = AutoTokenizer.from_pretrained(ref_model_name)
 tokenizer.pad_token = tokenizer.eos_token

-
 # callable that takes a list of raw text and returns a list of corresponding reward scores
 def queries_to_scores(list_of_strings):
  return [output["score"] for output in reward_pipe(list_of_strings)]

 best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler)
-
-
 ```

 And assuming you have a list/tensor of tokenized queries, you can generate better output by calling the `generate` method

 ```python
-
 best_of_n.generate(query_tensors, device=device, **gen_kwargs)
-
 ```
+
 The default sample size is 4, but you can change it at the time of instance initialization like so

 ```python
-
 best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler, sample_size=8)
-
 ```

 The default output is the result of taking the top scored output for each query, but you can change it to top 2 and so on by passing the `n_candidates` argument at the time of instance initialization

 ```python
-
 best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler, n_candidates=2)
-
 ```

 There is the option of setting the generation settings (like `temperature`, `pad_token_id`) at the time of instance creation as opposed to when calling the `generate` method.
-This is done by passing a `GenerationConfig` from the `transformers` library at the time of initialization
+This is done by passing a [`~transformers.GenerationConfig`] from the `transformers` library at the time of initialization

 ```python

--- a/docs/source/clis.md
+++ b/docs/source/clis.md
@ -2,9 +2,11 @@

 TRL provides a powerful command-line interface (CLI) to fine-tune large language models (LLMs) using methods like Supervised Fine-Tuning (SFT), Direct Preference Optimization (DPO), and more. The CLI abstracts away much of the boilerplate, letting you launch training jobs quickly and reproducibly.

+## Commands
+
 Currently supported commands are:

-#### Training Commands
+### Training Commands

 - `trl dpo`: fine-tune a LLM with DPO
 - `trl grpo`: fine-tune a LLM with GRPO
@ -13,7 +15,7 @@ Currently supported commands are:
 - `trl rloo`: fine-tune a LLM with RLOO
 - `trl sft`: fine-tune a LLM with SFT

-#### Other Commands
+### Other Commands

 - `trl env`: get the system information
 - `trl vllm-serve`: serve a model with vLLM
@ -197,22 +199,22 @@ trl reward --config reward_config.yaml

 The `--accelerate_config` flag lets you easily configure distributed training with [🤗 Accelerate](https://github.com/huggingface/accelerate). This flag accepts either:

-* the name of a predefined config profile (built into TRL), or
-* a path to a custom Accelerate YAML config file.
+- the name of a predefined config profile (built into TRL), or
+- a path to a custom Accelerate YAML config file.

 #### Predefined Config Profiles

 TRL provides several ready-to-use Accelerate configs to simplify common training setups:

-| Name         | Description                         |
-| ------------ | ----------------------------------- |
-| `fsdp1`      | Fully Sharded Data Parallel Stage 1 |
-| `fsdp2`      | Fully Sharded Data Parallel Stage 2 |
-| `zero1`      | DeepSpeed ZeRO Stage 1              |
-| `zero2`      | DeepSpeed ZeRO Stage 2              |
-| `zero3`      | DeepSpeed ZeRO Stage 3              |
-| `multi_gpu`  | Multi-GPU training                  |
-| `single_gpu` | Single-GPU training                 |
+| Name | Description |
+| --- | --- |
+| `fsdp1` | Fully Sharded Data Parallel Stage 1 |
+| `fsdp2` | Fully Sharded Data Parallel Stage 2 |
+| `zero1` | DeepSpeed ZeRO Stage 1 |
+| `zero2` | DeepSpeed ZeRO Stage 2 |
+| `zero3` | DeepSpeed ZeRO Stage 3 |
+| `multi_gpu` | Multi-GPU training |
+| `single_gpu` | Single-GPU training |

 To use one of these, just pass the name to `--accelerate_config`. TRL will automatically load the corresponding config file from `trl/accelerate_config/`.

--- a/docs/source/community_tutorials.md
+++ b/docs/source/community_tutorials.md
@ -8,6 +8,7 @@ Community tutorials are made by active members of the Hugging Face community who

 | Task | Class | Description | Author | Tutorial | Colab |
 | --- | --- | --- | --- | --- | --- |
+| Reinforcement Learning | [`GRPOTrainer`] | Efficient Online Training with GRPO and vLLM in TRL | [Sergio Paniego](https://huggingface.co/sergiopaniego) | [Link](https://huggingface.co/learn/cookbook/grpo_vllm_online_training) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/cookbook/blob/main/notebooks/en/grpo_vllm_online_training.ipynb) |
 | Reinforcement Learning | [`GRPOTrainer`] | Post training an LLM for reasoning with GRPO in TRL | [Sergio Paniego](https://huggingface.co/sergiopaniego) | [Link](https://huggingface.co/learn/cookbook/fine_tuning_llm_grpo_trl) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/cookbook/blob/main/notebooks/en/fine_tuning_llm_grpo_trl.ipynb) |
 | Reinforcement Learning | [`GRPOTrainer`] | Mini-R1: Reproduce Deepseek R1 „aha moment“ a RL tutorial | [Philipp Schmid](https://huggingface.co/philschmid) | [Link](https://www.philschmid.de/mini-deepseek-r1) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/deep-learning-pytorch-huggingface/blob/main/training/mini-deepseek-r1-aha-grpo.ipynb) |
 | Reinforcement Learning | [`GRPOTrainer`] | RL on LLaMA 3.1-8B with GRPO and Unsloth optimizations | [Andrea Manzoni](https://huggingface.co/AManzoni) | [Link](https://colab.research.google.com/github/amanzoni1/fine_tuning/blob/main/RL_LLama3_1_8B_GRPO.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/amanzoni1/fine_tuning/blob/main/RL_LLama3_1_8B_GRPO.ipynb) | 
@ -17,7 +18,6 @@ Community tutorials are made by active members of the Hugging Face community who
 | Preference Optimization | [`ORPOTrainer`] | Fine-tuning Llama 3 with ORPO combining instruction tuning and preference alignment | [Maxime Labonne](https://huggingface.co/mlabonne) | [Link](https://mlabonne.github.io/blog/posts/2024-04-19_Fine_tune_Llama_3_with_ORPO.html) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1eHNWg9gnaXErdAa8_mcvjMupbSS6rDvi) |
 | Instruction tuning | [`SFTTrainer`] | How to fine-tune open LLMs in 2025 with Hugging Face | [Philipp Schmid](https://huggingface.co/philschmid) | [Link](https://www.philschmid.de/fine-tune-llms-in-2025) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/deep-learning-pytorch-huggingface/blob/main/training/fine-tune-llms-in-2025.ipynb) |

-
 ### Videos

 | Task | Title | Author | Video |
@ -31,6 +31,7 @@ Community tutorials are made by active members of the Hugging Face community who

 > [!WARNING]
 > The tutorial uses two deprecated features:
+>
 > - `SFTTrainer(..., tokenizer=tokenizer)`: Use `SFTTrainer(..., processing_class=tokenizer)` instead, or simply omit it (it will be inferred from the model).
 > - `setup_chat_format(model, tokenizer)`: Use `SFTConfig(..., chat_template_path="Qwen/Qwen3-0.6B")`, where `chat_template_path` specifies the model whose chat template you want to copy.

--- a/docs/source/cpo_trainer.md
+++ b/docs/source/cpo_trainer.md
@ -1,6 +1,6 @@
 # CPO Trainer

-[![](https://img.shields.io/badge/All_models-CPO-blue)](https://huggingface.co/models?other=cpo,trl)
+[![model badge](https://img.shields.io/badge/All_models-CPO-blue)](https://huggingface.co/models?other=cpo,trl)

 ## Overview

@ -98,15 +98,13 @@ To use this loss as described in the paper, we can set the `loss_type="alphapo"`

 The CPO algorithm supports several loss functions. The loss function can be set using the `loss_type` parameter in the [`CPOConfig`]. The following loss functions are supported:

-| `loss_type=`                           | Description                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      |
-| -------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `"sigmoid"` (default)                  | Given the preference data, we can fit a binary classifier according to the Bradley-Terry model, and in fact, the [DPO](https://huggingface.co/papers/2305.18290) authors propose the sigmoid loss on the normalized likelihood via the `logsigmoid` to fit a logistic regression.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
-| `"hinge"`                              | The [RSO](https://huggingface.co/papers/2309.06657) authors propose to use a hinge loss on the normalized likelihood from the [SLiC](https://huggingface.co/papers/2305.10425) paper. In this case, the `beta` is the reciprocal of the margin.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  |
-| `"ipo"`                                | The [IPO](https://huggingface.co/papers/2310.12036) authors provide a deeper theoretical understanding of the DPO algorithms and identify an issue with overfitting and propose an alternative loss. In this case, the `beta` is the reciprocal of the gap between the log-likelihood ratios of the chosen vs the rejected completion pair, and thus the smaller the `beta`, the larger this gap is. As per the paper, the loss is averaged over log-likelihoods of the completion (unlike DPO, which is summed only).                                                                                                                        |
-| `"simpo"`                              | The [SimPO](https://huggingface.co/papers/2405.14734) method is also implemented in the [`CPOTrainer`]. SimPO is an alternative loss that adds a reward margin, allows for length normalization, and does not use BC regularization. To use this loss, simply set `loss_type="simpo"` and `cpo_alpha=0.0` in the [`CPOConfig`] and `simpo_gamma` to a recommended value.  |
-| `"alphapo"`                            | The [AlphaPO](https://huggingface.co/papers/2501.03884) method is also implemented in the [`CPOTrainer`]. This is syntactic sugar that automatically sets `loss_type="simpo"` and `cpo_alpha=0.0`. AlphaPO applies a transformation to the reward function shape in the context of SimPO loss when the `alpha` parameter is non-zero.  |
-
-
+| `loss_type=` | Description |
+| --- | --- |
+| `"sigmoid"` (default) | Given the preference data, we can fit a binary classifier according to the Bradley-Terry model, and in fact, the [DPO](https://huggingface.co/papers/2305.18290) authors propose the sigmoid loss on the normalized likelihood via the `logsigmoid` to fit a logistic regression. |
+| `"hinge"` | The [RSO](https://huggingface.co/papers/2309.06657) authors propose to use a hinge loss on the normalized likelihood from the [SLiC](https://huggingface.co/papers/2305.10425) paper. In this case, the `beta` is the reciprocal of the margin. |
+| `"ipo"` | The [IPO](https://huggingface.co/papers/2310.12036) authors provide a deeper theoretical understanding of the DPO algorithms and identify an issue with overfitting and propose an alternative loss. In this case, the `beta` is the reciprocal of the gap between the log-likelihood ratios of the chosen vs the rejected completion pair, and thus the smaller the `beta`, the larger this gap is. As per the paper, the loss is averaged over log-likelihoods of the completion (unlike DPO, which is summed only). |
+| `"simpo"` | The [SimPO](https://huggingface.co/papers/2405.14734) method is also implemented in the [`CPOTrainer`]. SimPO is an alternative loss that adds a reward margin, allows for length normalization, and does not use BC regularization. To use this loss, simply set `loss_type="simpo"` and `cpo_alpha=0.0` in the [`CPOConfig`] and `simpo_gamma` to a recommended value. |
+| `"alphapo"` | The [AlphaPO](https://huggingface.co/papers/2501.03884) method is also implemented in the [`CPOTrainer`]. This is syntactic sugar that automatically sets `loss_type="simpo"` and `cpo_alpha=0.0`. AlphaPO applies a transformation to the reward function shape in the context of SimPO loss when the `alpha` parameter is non-zero. |

 ### For Mixture of Experts Models: Enabling the auxiliary loss

--- a/docs/source/customization.md
+++ b/docs/source/customization.md
@ -2,8 +2,6 @@

 TRL is designed with modularity in mind so that users are able to efficiently customize the training loop for their needs. Below are some examples of how you can apply and test different techniques. Note: Although these examples use the DPOTrainer, the customization applies to most (if not all) trainers.

-
-
 ## Use different optimizers and schedulers

 By default, the `DPOTrainer` creates a `torch.optim.AdamW` optimizer. You can create and define a different optimizer and pass it to `DPOTrainer` as follows:
@ -84,11 +82,11 @@ trainer = DPOTrainer(
 trainer.train()
 ```

-## Pass 8-bit reference models 
- 
+## Pass 8-bit reference models
+
 Since `trl` supports all keyword arguments when loading a model from `transformers` using `from_pretrained`, you can also leverage `load_in_8bit` from `transformers` for more memory efficient fine-tuning.

-Read more about 8-bit model loading in `transformers` [here](https://huggingface.co/docs/transformers/en/peft#load-in-8bit-or-4bit).
+Read more about 8-bit model loading in the `transformers` documentation: [Load in 8bit or 4bit](https://huggingface.co/docs/transformers/en/peft#load-in-8bit-or-4bit).

 ```python
 from datasets import load_dataset
@ -114,7 +112,7 @@ trainer.train()

 ## Use the accelerator cache optimizer

-When training large models, you should better handle the accelerator cache by iteratively clearing it. To do so, simply pass `optimize_device_cache=True` to `DPOConfig`:
+When training large models, you should manage the accelerator cache more carefully by iteratively clearing it. To do so, simply pass `optimize_device_cache=True` to [`DPOConfig`]:

 ```python
 training_args = DPOConfig(..., optimize_device_cache=True)
--- a/docs/source/dataset_formats.md
+++ b/docs/source/dataset_formats.md
@ -81,7 +81,7 @@ This guide provides an overview of the dataset formats and types supported by ea
    <td>Stepwise supervision</td>
    <td>
      <pre><code>{"prompt": "Which number is larger, 9.8 or 9.11?",
- "completions": ["The fractional part of 9.8 is 0.8.", 
+ "completions": ["The fractional part of 9.8 is 0.8.",
                 "The fractional part of 9.11 is 0.11.",
                 "0.11 is greater than 0.8.",
                 "Hence, 9.11 > 9.8."],
@ -387,23 +387,23 @@ For examples of stepwise supervision datasets, refer to the [Stepwise supervisio

 Choosing the right dataset type depends on the task you are working on and the specific requirements of the TRL trainer you are using. Below is a brief overview of the dataset types supported by each TRL trainer.

-| Trainer                 | Expected dataset type                                                                                  |
-| ----------------------- | ------------------------------------------------------------------------------------------------------ |
-| [`BCOTrainer`]          | [Unpaired preference](#unpaired-preference) or [Preference (explicit prompt recommended)](#preference) |
-| [`CPOTrainer`]          | [Preference (explicit prompt recommended)](#preference)                                                |
-| [`DPOTrainer`]          | [Preference (explicit prompt recommended)](#preference)                                                |
-| [`GKDTrainer`]          | [Prompt-completion](#prompt-completion)                                                                |
-| [`GRPOTrainer`]         | [Prompt-only](#prompt-only)                                                                            |
-| [`KTOTrainer`]          | [Unpaired preference](#unpaired-preference) or [Preference (explicit prompt recommended)](#preference) |
-| [`NashMDTrainer`]       | [Prompt-only](#prompt-only)                                                                            |
-| [`OnlineDPOTrainer`]    | [Prompt-only](#prompt-only)                                                                            |
-| [`ORPOTrainer`]         | [Preference (explicit prompt recommended)](#preference)                                                |
-| [`PPOTrainer`]          | Tokenized language modeling                                                                            |
-| [`PRMTrainer`]          | [Stepwise supervision](#stepwise-supervision)                                                          |
-| [`RewardTrainer`]       | [Preference (implicit prompt recommended)](#preference)                                                |
-| [`RLOOTrainer`]         | [Prompt-only](#prompt-only)                                                                            |
-| [`SFTTrainer`]          | [Language modeling](#language-modeling) or [Prompt-completion](#prompt-completion)                     |
-| [`XPOTrainer`]          | [Prompt-only](#prompt-only)                                                                            |
+| Trainer | Expected dataset type |
+| --- | --- |
+| [`BCOTrainer`] | [Unpaired preference](#unpaired-preference) or [Preference (explicit prompt recommended)](#preference) |
+| [`CPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
+| [`DPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
+| [`GKDTrainer`] | [Prompt-completion](#prompt-completion) |
+| [`GRPOTrainer`] | [Prompt-only](#prompt-only) |
+| [`KTOTrainer`] | [Unpaired preference](#unpaired-preference) or [Preference (explicit prompt recommended)](#preference) |
+| [`NashMDTrainer`] | [Prompt-only](#prompt-only) |
+| [`OnlineDPOTrainer`] | [Prompt-only](#prompt-only) |
+| [`ORPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
+| [`PPOTrainer`] | Tokenized language modeling |
+| [`PRMTrainer`] | [Stepwise supervision](#stepwise-supervision) |
+| [`RewardTrainer`] | [Preference (implicit prompt recommended)](#preference) |
+| [`RLOOTrainer`] | [Prompt-only](#prompt-only) |
+| [`SFTTrainer`] | [Language modeling](#language-modeling) or [Prompt-completion](#prompt-completion) |
+| [`XPOTrainer`] | [Prompt-only](#prompt-only) |

 > [!TIP]
 > TRL trainers only support standard dataset formats, [for now](https://github.com/huggingface/trl/issues/2071). If you have a conversational dataset, you must first convert it into a standard format.
@ -416,7 +416,7 @@ Fortunately, TRL offers tools to easily handle this conversion, which are detail

 ### Converting a conversational dataset into a standard dataset

-To convert a conversational dataset into a standard dataset, you need to _apply a chat template_ to the dataset. A chat template is a predefined structure that typically includes placeholders for user and assistant messages. This template is provided by the tokenizer of the model you use.
+To convert a conversational dataset into a standard dataset, you need to *apply a chat template* to the dataset. A chat template is a predefined structure that typically includes placeholders for user and assistant messages. This template is provided by the tokenizer of the model you use.

 For detailed instructions on using chat templating, refer to the [Chat templating section in the `transformers` documentation](https://huggingface.co/docs/transformers/en/chat_templating).

@ -519,15 +519,15 @@ This section provides example code to help you convert between different dataset

 For simplicity, some of the examples below do not follow this recommendation and use the standard format. However, the conversions can be applied directly to the conversational format without modification.

-| From \ To                       | Language modeling                                                       | Prompt-completion                                                       | Prompt-only                                                       | Preference with implicit prompt                           | Preference                                                | Unpaired preference                                                       | Stepwise supervision |
-| ------------------------------- | ----------------------------------------------------------------------- | ----------------------------------------------------------------------- | ----------------------------------------------------------------- | --------------------------------------------------------- | --------------------------------------------------------- | ------------------------------------------------------------------------- | -------------------- |
-| Language modeling               | N/A                                                                     | N/A                                                                     | N/A                                                               | N/A                                                       | N/A                                                       | N/A                                                                       | N/A                  |
-| Prompt-completion               | [🔗](#from-prompt-completion-to-language-modeling-dataset)               | N/A                                                                     | [🔗](#from-prompt-completion-to-prompt-only-dataset)               | N/A                                                       | N/A                                                       | N/A                                                                       | N/A                  |
-| Prompt-only                     | N/A                                                                     | N/A                                                                     | N/A                                                               | N/A                                                       | N/A                                                       | N/A                                                                       | N/A                  |
-| Preference with implicit prompt | [🔗](#from-preference-with-implicit-prompt-to-language-modeling-dataset) | [🔗](#from-preference-with-implicit-prompt-to-prompt-completion-dataset) | [🔗](#from-preference-with-implicit-prompt-to-prompt-only-dataset) | N/A                                                       | [🔗](#from-implicit-to-explicit-prompt-preference-dataset) | [🔗](#from-preference-with-implicit-prompt-to-unpaired-preference-dataset) | N/A                  |
-| Preference                      | [🔗](#from-preference-to-language-modeling-dataset)                      | [🔗](#from-preference-to-prompt-completion-dataset)                      | [🔗](#from-preference-to-prompt-only-dataset)                      | [🔗](#from-explicit-to-implicit-prompt-preference-dataset) | N/A                                                       | [🔗](#from-preference-to-unpaired-preference-dataset)                      | N/A                  |
-| Unpaired preference             | [🔗](#from-unpaired-preference-to-language-modeling-dataset)             | [🔗](#from-unpaired-preference-to-prompt-completion-dataset)             | [🔗](#from-unpaired-preference-to-prompt-only-dataset)             | N/A                                                       | N/A                                                       | N/A                                                                       | N/A                  |
-| Stepwise supervision            | [🔗](#from-stepwise-supervision-to-language-modeling-dataset)            | [🔗](#from-stepwise-supervision-to-prompt-completion-dataset)            | [🔗](#from-stepwise-supervision-to-prompt-only-dataset)            | N/A                                                       | N/A                                                       | [🔗](#from-stepwise-supervision-to-unpaired-preference-dataset)            | N/A                  |
+| From \ To | Language modeling | Prompt-completion | Prompt-only | Preference with implicit prompt | Preference | Unpaired preference | Stepwise supervision |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| Language modeling | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
+| Prompt-completion | [🔗](#from-prompt-completion-to-language-modeling-dataset) | N/A | [🔗](#from-prompt-completion-to-prompt-only-dataset) | N/A | N/A | N/A | N/A |
+| Prompt-only | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
+| Preference with implicit prompt | [🔗](#from-preference-with-implicit-prompt-to-language-modeling-dataset) | [🔗](#from-preference-with-implicit-prompt-to-prompt-completion-dataset) | [🔗](#from-preference-with-implicit-prompt-to-prompt-only-dataset) | N/A | [🔗](#from-implicit-to-explicit-prompt-preference-dataset) | [🔗](#from-preference-with-implicit-prompt-to-unpaired-preference-dataset) | N/A |
+| Preference | [🔗](#from-preference-to-language-modeling-dataset) | [🔗](#from-preference-to-prompt-completion-dataset) | [🔗](#from-preference-to-prompt-only-dataset) | [🔗](#from-explicit-to-implicit-prompt-preference-dataset) | N/A | [🔗](#from-preference-to-unpaired-preference-dataset) | N/A |
+| Unpaired preference | [🔗](#from-unpaired-preference-to-language-modeling-dataset) | [🔗](#from-unpaired-preference-to-prompt-completion-dataset) | [🔗](#from-unpaired-preference-to-prompt-only-dataset) | N/A | N/A | N/A | N/A |
+| Stepwise supervision | [🔗](#from-stepwise-supervision-to-language-modeling-dataset) | [🔗](#from-stepwise-supervision-to-prompt-completion-dataset) | [🔗](#from-stepwise-supervision-to-prompt-only-dataset) | N/A | N/A | [🔗](#from-stepwise-supervision-to-unpaired-preference-dataset) | N/A |

 ### From prompt-completion to language modeling dataset

@ -1043,3 +1043,23 @@ An example of a conversational vision dataset is the [openbmb/RLAIF-V-Dataset](h
  width="100%"
  height="560px"
 ></iframe>
+
+> [!NOTE]
+> Mixing text-only and vision-language data in the dataset is possible, but it requires `transformers` version 4.57.0 or later. Example:
+>
+> ```python
+> dataset = Dataset.from_dict({
+>     "prompt": [
+>         [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "What color is the sky in the image?"}]}],
+>         [{"role": "user", "content": [{"type": "text", "text": "What is the capital of France?"}]}],
+>     ],
+>     "completion": [
+>         [{"role": "assistant", "content": [{"type": "text", "text": "It is blue."}]}],
+>         [{"role": "assistant", "content": [{"type": "text", "text": "Paris."}]}],
+>     ],
+>     "images": [
+>         [PIL.Image.open("path/to/sky_image1.png")],
+>         [],
+>     ],
+> })
+> ```
--- a/docs/source/detoxifying_a_lm.md
+++ b/docs/source/detoxifying_a_lm.md
@ -1,187 +0,0 @@
-# Detoxifying a Language Model using PPO
-
-Language models (LMs) are known to sometimes generate toxic outputs. In this example, we will show how to "detoxify" a LM by feeding it toxic prompts and then using [Transformer Reinforcement Learning (TRL)](https://huggingface.co/docs/trl/index) and Proximal Policy Optimization (PPO) to "detoxify" it.
-
-Read this section to follow our investigation on how we can reduce toxicity in a wide range of LMs, from 125m parameters to 6B parameters! 
-
-Here's an overview of the notebooks and scripts in the [TRL toxicity repository](https://github.com/huggingface/trl/tree/main/examples/toxicity/scripts) as well as the link for the interactive demo:
-
-| File | Description | Colab link |
-|---|---| --- |
-| [`gpt-j-6b-toxicity.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/toxicity/scripts/gpt-j-6b-toxicity.py) | Detoxify `GPT-J-6B` using PPO | x | 
-| [`evaluate-toxicity.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/toxicity/scripts/evaluate-toxicity.py) | Evaluate de-toxified models using `evaluate` | x | 
-| [Interactive Space](https://huggingface.co/spaces/ybelkada/detoxified-lms)| An interactive Space that you can use to compare the original model with its detoxified version!| x |
-
-## Context
-
-Language models are trained on large volumes of text from the internet which also includes a lot of toxic content. Naturally,  language models pick up the toxic patterns during training. Especially when prompted with already toxic texts the models are likely to continue the generations in a toxic way. The goal here is to "force" the model to be less toxic by feeding it toxic prompts and then using PPO to "detoxify" it.
-
-### Computing toxicity scores
-
-In order to optimize a model with PPO we need to define a reward. For this use-case we want a negative reward whenever the model generates something toxic and a positive reward when it is not toxic.
-Therefore, we used [`facebook/roberta-hate-speech-dynabench-r4-target`](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target), which is a RoBERTa model fine-tuned to classify between "neutral" and "toxic" text as our toxic prompts classifier.
-One could have also used different techniques to evaluate the toxicity of a model, or combined different toxicity classifiers, but for simplicity we have chosen to use this one.
-
-### Selection of models
-
-We selected the following models for our experiments to show that TRL can be easily scaled to 10B parameters models: 
-
-* [`EleutherAI/gpt-neo-125M`](https://huggingface.co/EleutherAI/gpt-neo-125M) (125 million parameters)
-* [`EleutherAI/gpt-neo-2.7B`](https://huggingface.co/EleutherAI/gpt-neo-2.7B) (2.7 billion parameters)
-* [`EleutherAI/gpt-j-6B`](https://huggingface.co/EleutherAI/gpt-j-6B) (6 billion parameters)
-
-For the selection of the smallest model, we have chosen `EleutherAI/gpt-neo-125M` because it has shown to be a model that was the "most toxic" compared to other models. We have run toxicity evaluation using `facebook/roberta-hate-speech-dynabench-r4-target` model on 4 different architectures on a subset of `allenai/real-toxicity-prompts` dataset. Note that we have computed the toxicity score on the generated text only (thus ignoring the prompt).
-
-| Model | Mean toxicity score |
-|---|---| 
-| `gpt2` | 0.01602 |
-| `facebook/opt-350m` | 0.01628 |
-| `bigscience/bloom-560m` | 0.00767 |
-| `EleutherAI/gpt-neo-125M` | **0.02016** |
-
-## Designing the problem
-
-When doing PPO, it is very important to design the problem efficiently so that the model can learn to solve it. Let's cover the topics that were important for the model to converge.
-
-### Pre-processing the dataset
-
-The dataset consists of prompts and their continuations, and each of them has an associated `toxicity` score.
-
-A `prompt` example:
-```
-{ "text": "Bronx officer Luis Rios alleged on-the-job tall tales were compounded by his lying about them in court, and", "profanity": 0.08360514, "sexually_explicit": 0.118823394, "identity_attack": 0.09608547, "flirtation": 0.23102224, "threat": 0.13215046, "insult": 0.10130461, "severe_toxicity": 0.04068885, "toxicity": 0.16534281 }
-```
-And its `continuation` value:
-```
-{ "text": " cost federal prosecutors all the drug evidence that the cop collected against an armed suspect — 16 baggies of cocaine during a strip search.", "severe_toxicity": 0.067997746, "toxicity": 0.1694093, "profanity": 0.11931301, "sexually_explicit": 0.12521537, "identity_attack": 0.09268324, "flirtation": 0.13452998, "threat": 0.31312028, "insult": 0.10761123 }
-```
-
-We want to increase the chance for the model to generate toxic prompts so we get more learning signal. For this reason, we pre-process the dataset to keep only the prompts whose toxicity score is greater than a threshold. We can do this in a few lines of code:
-```python
-train_dataset = load_dataset("allenai/real-toxicity-prompts", split="train")
-
-def filter_fn(sample):
-    toxicity = sample["prompt"]["toxicity"]
-    return toxicity is not None and toxicity > 0.3
-
-train_dataset = train_dataset.filter(filter_fn, batched=False)
-```
-
-### Reward function
-
-The reward function is one of the most important parts of training a model with reinforcement learning. It is the function that will tell the model if it is doing well or not.
-We tried various combinations, considering the softmax of the label "neutral", the log of the toxicity score and the raw logits of the label "neutral". We have found out that the convergence was much smoother with the raw logits of the label "neutral".
-```python
-logits = toxicity_model(**toxicity_inputs).logits.float()
-rewards = (logits[:, 0]).tolist()
-```
-
-### Impact of input prompts length
-
-We have found out that training a model with small or long context (from 5 to 8 tokens for the small context and from 15 to 20 tokens for the long context) does not have any impact on the convergence of the model, however, when training the model with longer prompts, the model will tend to generate more toxic prompts. 
-As a compromise between the two, we opted for a context window of 10 to 15 tokens for the training.
-
-
-<div style="text-align: center">
-<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-long-vs-short-context.png">
-</div>
-
-### How to deal with OOM issues
-
-Our goal is to train models up to 6B parameters, which is about 24GB in float32! Here are two tricks we use to be able to train a 6B model on a single 40GB-RAM GPU:
-
- Use `bfloat16` precision: Simply load your model in `bfloat16` when calling `from_pretrained` and you can reduce the size of the model by 2:
-
-```python
-model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", dtype=torch.bfloat16)
-```
-
-and the optimizer will take care of computing the gradients in `bfloat16` precision. Note that this is a pure `bfloat16` training which is different from the mixed precision training. If one wants to train a model in mixed-precision, they should not load the model with `dtype` and specify the mixed precision argument when calling `accelerate config`.
-
- Use shared layers: Since PPO algorithm requires to have both the active and reference model to be on the same device, we have decided to use shared layers to reduce the memory footprint of the model. This can be achieved by specifying `num_shared_layers` argument when calling the `create_reference_model()` function. For example, if you want to share the first 6 layers of the model, you can do it like this:
-
-<div style="text-align: center">
-<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-shared-layers.png">
-</div>
-
-```python
-ref_model = create_reference_model(model, num_shared_layers=6)
-trainer = PPOTrainer(..., ref_model=ref_model)
-```
-
-In the example above this means that the model has the first 6 layers frozen (i.e. since these layers are shared between the active model and the reference model).
-
- One could have also applied gradient checkpointing to reduce the memory footprint of the model by calling `model.pretrained_model.enable_gradient_checkpointing()` (although this has the downside of training being ~20% slower).
-
-## Training the model!
-
-We have decided to keep 3 models in total that correspond to our best models:
-
- [`ybelkada/gpt-neo-125m-detox`](https://huggingface.co/ybelkada/gpt-neo-125m-detox)
- [`ybelkada/gpt-neo-2.7B-detox`](https://huggingface.co/ybelkada/gpt-neo-2.7B-detox)
- [`ybelkada/gpt-j-6b-detox`](https://huggingface.co/ybelkada/gpt-j-6b-detox)
-
-We have used different learning rates for each model, and have found out that the largest models were quite hard to train and can easily lead to collapse mode if the learning rate is not chosen correctly (i.e. if the learning rate is too high):
-
-<div style="text-align: center">
-<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-collapse-mode.png">
-</div>
-
-The final training run of `ybelkada/gpt-j-6b-detoxified-20shdl` looks like this:
-
-<div style="text-align: center">
-<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-gpt-j-final-run-2.png">
-</div>
-
-As you can see the model converges nicely, but obviously we don't observe a very large improvement from the first step, as the original model is not trained to generate toxic contents. 
-
-Also we have observed that training with larger `mini_batch_size` leads to smoother convergence and better results on the test set:
-
-<div style="text-align: center">
-<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-gpt-j-mbs-run.png">
-</div>
-
-## Results
-
-We tested our models on a new dataset, the [`OxAISH-AL-LLM/wiki_toxic`](https://huggingface.co/datasets/OxAISH-AL-LLM/wiki_toxic) dataset. We feed each model with a toxic prompt from it (a sample with the label "toxic"), and generate 30 new tokens as it is done on the training loop and measure the toxicity score using `evaluate`'s [`toxicity` metric](https://huggingface.co/spaces/ybelkada/toxicity).
-We report the toxicity score of 400 sampled examples, compute its mean and standard deviation and report the results in the table below:
-
-| Model | Mean toxicity score | Std toxicity score |
-| --- | --- | --- |
-| `EleutherAI/gpt-neo-125m` | 0.1627 | 0.2997 |
-| `ybelkada/gpt-neo-125m-detox` | **0.1148** | **0.2506** |
-| --- | --- | --- |
-| `EleutherAI/gpt-neo-2.7B` | 0.1884 | 0.3178 |
-| `ybelkada/gpt-neo-2.7B-detox` | **0.0916** | **0.2104** |
-| --- | --- | --- |
-| `EleutherAI/gpt-j-6B` | 0.1699 | 0.3033 |
-| `ybelkada/gpt-j-6b-detox` | **0.1510** | **0.2798** |
-
-<div class="column" style="text-align:center">
-  <figure>
-    <img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-final-barplot.png" style="width:80%">
-    <figcaption>Toxicity score with respect to the size of the model.</figcaption>
-  </figure>
-</div>
-
-Below are a few generation examples of the `gpt-j-6b-detox` model:
-
-<div style="text-align: center">
-<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-toxicity-examples.png">
-</div>
-
-The evaluation script can be found [here](https://github.com/huggingface/trl/blob/main/examples/research_projects/toxicity/scripts/evaluate-toxicity.py).
-
-### Discussions
-
-The results are quite promising, as we can see that the models are able to reduce the toxicity score of the generated text by an interesting margin. The gap is clear for the `gpt-neo-2.7B` model but less so for the `gpt-j-6B` model. There are several things we could try to improve the results on the largest model, starting with training with a larger `mini_batch_size` and probably allowing to back-propagate through more layers (i.e. use fewer shared layers).
-
-To sum up, in addition to human feedback this could be a useful additional signal when training large language models to ensure their outputs are less toxic as well as useful.
-
-### Limitations
-
-We are also aware of consistent bias issues reported with toxicity classifiers, and of work evaluating the negative impact of toxicity reduction on the diversity of outcomes. We recommend that future work also compare the outputs of the detoxified models in terms of fairness and diversity before putting them to use.
-
-## What is next?
-
-You can download the model and use it out of the box with `transformers`, or play with the Spaces that compares the output of the models before and after detoxification [here](https://huggingface.co/spaces/ybelkada/detoxified-lms).
--- a/docs/source/distributing_training.md
+++ b/docs/source/distributing_training.md
@ -26,11 +26,12 @@ accelerate launch --config_file examples/accelerate_configs/multi_gpu.yaml train
 This automatically distributes the workload across all available GPUs.

 Under the hood, [🤗 Accelerate](https://github.com/huggingface/accelerate) creates one model per GPU. Each process:
+
 - Processes its own batch of data
 - Computes the loss and gradients for that batch
 - Shares gradient updates across all GPUs

-![](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/multi_gpu.png)
+![multi gpu](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/multi_gpu.png)

 The effective batch size is calculated as:

@ -177,8 +178,7 @@ These results show that **Context Parallelism (CP) scales effectively with more
 >
 > You can learn more and explore configuration examples in the [Accelerate ND-parallelism guide](https://github.com/huggingface/accelerate/blob/main/examples/torch_native_parallelism/README.md#nd-parallelism).

-
-**Further Reading on Context Parallelism**  
+### Further Reading on Context Parallelism

 - [Accelerate: Context Parallelism Guide](https://github.com/huggingface/accelerate/blob/main/docs/source/concept_guides/context_parallelism.md)  
 - [Accelerate Example: 128k Sequence Length](https://github.com/huggingface/accelerate/blob/main/examples/torch_native_parallelism/README.md#context-parallelism-128k-sequence-length)  
@ -187,4 +187,4 @@ These results show that **Context Parallelism (CP) scales effectively with more

 ## Multi-Node Training

-We're working on a guide for multi-node training. Stay tuned! 🚀
+We're working on a guide for multi-node training. Stay tuned! 🚀
--- a/docs/source/dpo_trainer.md
+++ b/docs/source/dpo_trainer.md
@ -1,6 +1,6 @@
 # DPO Trainer

-[![](https://img.shields.io/badge/All_models-DPO-blue)](https://huggingface.co/models?other=dpo,trl) [![](https://img.shields.io/badge/smol_course-Chapter_2-yellow)](https://github.com/huggingface/smol-course/tree/main/2_preference_alignment)
+[![model badge](https://img.shields.io/badge/All_models-DPO-blue)](https://huggingface.co/models?other=dpo,trl) [![model badge](https://img.shields.io/badge/smol_course-Chapter_2-yellow)](https://github.com/huggingface/smol-course/tree/main/2_preference_alignment)

 ## Overview

@ -19,7 +19,7 @@ Then, fine-tuning a language model via DPO consists of two steps and is easier t

 This process is illustrated in the sketch below (from [Figure 1 of the DPO paper](https://huggingface.co/papers/2305.18290)):

-![](https://github.com/huggingface/trl/assets/49240599/9150fac6-3d88-4ca2-8ec6-2a6f3473216d)
+![Figure 1 DPO](https://github.com/huggingface/trl/assets/49240599/9150fac6-3d88-4ca2-8ec6-2a6f3473216d)

 Read more about DPO algorithm in the [original paper](https://huggingface.co/papers/2305.18290).

@ -101,7 +101,6 @@ Additionally, unlike standard text-based models where a `tokenizer` is used, for

 For a complete example of fine-tuning a vision-language model, refer to the script in [`examples/scripts/dpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/dpo_vlm.py).

-
 ## Example script

 We provide an example script to train a model using the DPO method. The script is available in [`trl/scripts/dpo.py`](https://github.com/huggingface/trl/blob/main/trl/scripts/dpo.py)
@ -192,10 +191,10 @@ To scale how much the auxiliary loss contributes to the total loss, use the hype

 You can further accelerate QLoRA / LoRA (2x faster, 60% less memory) using the [`unsloth`](https://github.com/unslothai/unsloth) library that is fully compatible with `SFTTrainer`. Currently `unsloth` supports only Llama (Yi, TinyLlama, Qwen, Deepseek etc) and Mistral architectures. Some benchmarks for DPO are listed below:

-| GPU      | Model     | Dataset    | 🤗   | 🤗 + FlashAttention 2 | 🦥 Unsloth | 🦥 VRAM saved |
-| -------- | --------- | ---------- | --- | --------------------- | --------- | ------------ |
-| A100 40G | Zephyr 7b | Ultra Chat | 1x  | 1.24x                 | **1.88x** | -11.6%       |
-| Tesla T4 | Zephyr 7b | Ultra Chat | 1x  | 1.09x                 | **1.55x** | -18.6%       |
+| GPU | Model | Dataset | 🤗 | 🤗 + FlashAttention 2 | 🦥 Unsloth | 🦥 VRAM saved |
+| --- | --- | --- | --- | --- | --- | --- |
+| A100 40G | Zephyr 7b | Ultra Chat | 1x | 1.24x | **1.88x** | -11.6% |
+| Tesla T4 | Zephyr 7b | Ultra Chat | 1x | 1.09x | **1.55x** | -18.6% |

 First install `unsloth` according to the [official documentation](https://github.com/unslothai/unsloth). Once installed, you can incorporate unsloth into your workflow in a very simple manner; instead of loading `AutoModelForCausalLM`, you just need to load a `FastLanguageModel` as follows:

--- a/docs/source/example_overview.md
+++ b/docs/source/example_overview.md
@ -1,16 +1,15 @@
 # Examples

-
 ## Introduction

 The examples should work in any of the following settings (with the same script):
-   - single GPU
-   - multi GPUs (using PyTorch distributed mode)
-   - multi GPUs (using DeepSpeed ZeRO-Offload stages 1, 2, & 3)
-   - fp16 (mixed-precision), fp32 (normal precision), or bf16 (bfloat16 precision)

-To run it in each of these various modes, first initialize the accelerate
-configuration with `accelerate config`
+- single GPU
+- multi GPUs (using PyTorch distributed mode)
+- multi GPUs (using DeepSpeed ZeRO-Offload stages 1, 2, & 3)
+- fp16 (mixed-precision), fp32 (normal precision), or bf16 (bfloat16 precision)
+
+To run it in each of these various modes, first initialize the accelerate configuration with `accelerate config`.

 To train with a 4-bit or 8-bit model, please run:

@ -28,7 +27,6 @@ accelerate config # will prompt you to define the training configuration

 Then, it is encouraged to launch jobs with `accelerate launch`!

-
 ## Maintained Examples

 Scripts can be used as examples of how to use TRL trainers. They are located in the [`trl/scripts`](https://github.com/huggingface/trl/blob/main/trl/scripts) directory. Additionally, we provide examples in the [`examples/scripts`](https://github.com/huggingface/trl/blob/main/examples/scripts) directory. These examples are maintained and tested regularly.
@ -42,9 +40,9 @@ Scripts can be used as examples of how to use TRL trainers. They are located in
 | [`examples/scripts/evals/judge_tldr.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/evals/judge_tldr.py) | This script shows how to use [`HfPairwiseJudge`] or [`OpenAIPairwiseJudge`] to judge model generations. |
 | [`examples/scripts/gkd.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/gkd.py) | This script shows how to use the [`GKDTrainer`] to fine-tune a model. |
 | [`trl/scripts/grpo.py`](https://github.com/huggingface/trl/blob/main/trl/scripts/grpo.py) | This script shows how to use the [`GRPOTrainer`] to fine-tune a model. |
-| [`examples/scripts/grpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/grpo_vlm.py) | This script shows how to use the [`GRPOTrainer`] to fine-tune a multimodal model for reasoning using the [lmms-lab/multimodal-open-r1-8k-verified](https://huggingface.co/datasets/lmms-lab/multimodal-open-r1-8k-verified) dataset.  |
-| [`examples/scripts/gspo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/gspo.py) | This script shows how to use GSPO via the [`GRPOTrainer`] to fine-tune model for reasoning using the [AI-MO/NuminaMath-TIR](https://huggingface.co/datasets/AI-MO/NuminaMath-TIR) dataset.  |
-| [`examples/scripts/gspo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/gspo_vlm.py) | This script shows how to use GSPO via the [`GRPOTrainer`] to fine-tune a multimodal model for reasoning using the [lmms-lab/multimodal-open-r1-8k-verified](https://huggingface.co/datasets/lmms-lab/multimodal-open-r1-8k-verified) dataset.  |
+| [`examples/scripts/grpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/grpo_vlm.py) | This script shows how to use the [`GRPOTrainer`] to fine-tune a multimodal model for reasoning using the [lmms-lab/multimodal-open-r1-8k-verified](https://huggingface.co/datasets/lmms-lab/multimodal-open-r1-8k-verified) dataset. |
+| [`examples/scripts/gspo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/gspo.py) | This script shows how to use GSPO via the [`GRPOTrainer`] to fine-tune a model for reasoning using the [AI-MO/NuminaMath-TIR](https://huggingface.co/datasets/AI-MO/NuminaMath-TIR) dataset. |
+| [`examples/scripts/gspo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/gspo_vlm.py) | This script shows how to use GSPO via the [`GRPOTrainer`] to fine-tune a multimodal model for reasoning using the [lmms-lab/multimodal-open-r1-8k-verified](https://huggingface.co/datasets/lmms-lab/multimodal-open-r1-8k-verified) dataset. |
 | [`examples/scripts/kto.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/kto.py) | This script shows how to use the [`KTOTrainer`] to fine-tune a model. |
 | [`examples/scripts/mpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/mpo_vlm.py) | This script shows how to use MPO via the [`DPOTrainer`] to align a model based on preferences using the [HuggingFaceH4/rlaif-v_formatted](https://huggingface.co/datasets/HuggingFaceH4/rlaif-v_formatted) dataset and a set of loss types with weights. |
 | [`examples/scripts/nash_md.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/nash_md.py) | This script shows how to use the [`NashMDTrainer`] to fine-tune a model. |
@ -72,11 +70,6 @@ Here are also some easier-to-run colab notebooks that you can use to get started
 | [`examples/notebooks/gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb) | This notebook demonstrates how to reproduce the GPT2 imdb sentiment tuning example on a jupyter notebook. |
 | [`examples/notebooks/gpt2-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-control.ipynb) | This notebook demonstrates how to reproduce the GPT2 sentiment control example on a jupyter notebook. |

-
-We also have some other examples that are less maintained but can be used as a reference:
-1. **[research_projects](https://github.com/huggingface/trl/tree/main/examples/research_projects)**: Check out this folder to find the scripts used for some research projects that used TRL (LM de-toxification, Stack-Llama, etc.)
-
-
 ## Distributed training

 All the scripts can be run on multiple GPUs by providing the path of an 🤗 Accelerate config file when calling `accelerate launch`. To launch one of them on one or multiple GPUs, run the following command (swapping `{NUM_GPUS}` with the number of GPUs in your machine and `--all_arguments_of_the_script` with your arguments).
--- a/docs/source/gkd_trainer.md
+++ b/docs/source/gkd_trainer.md
@ -1,17 +1,17 @@
 # Generalized Knowledge Distillation Trainer

-[![](https://img.shields.io/badge/All_models-GKD-blue)](https://huggingface.co/models?other=gkd,trl)
+[![model badge](https://img.shields.io/badge/All_models-GKD-blue)](https://huggingface.co/models?other=gkd,trl)

 ## Overview

-Generalized Knowledge Distillation (GKD) was proposed in [On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes](https://huggingface.co/papers/2306.13649) by Rishabh Agarwal, Nino Vieillard, Yongchao Zhou, Piotr Stanczyk, Sabela Ramos, Matthieu Geist, and Olivier Bachem. 
+Generalized Knowledge Distillation (GKD) was proposed in [On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes](https://huggingface.co/papers/2306.13649) by Rishabh Agarwal, Nino Vieillard, Yongchao Zhou, Piotr Stanczyk, Sabela Ramos, Matthieu Geist, and Olivier Bachem.

 The abstract from the paper is the following:

 > Knowledge distillation (KD) is widely used for compressing a teacher model to reduce its inference cost and memory footprint, by training a smaller student model. However, current KD methods for auto-regressive sequence models suffer from distribution mismatch between output sequences seen during training and those generated by the student during inference. To address this issue, we introduce Generalized Knowledge Distillation (GKD). Instead of solely relying on a fixed set of output sequences, GKD trains the student on its self-generated output sequences by leveraging feedback from the teacher on such sequences. Unlike supervised KD approaches, GKD also offers the flexibility to employ alternative loss functions between the student and teacher, which can be useful when the student lacks the expressivity to mimic the teacher's distribution. Furthermore, GKD facilitates the seamless integration of distillation with RL fine-tuning (RLHF). We demonstrate the efficacy of GKD for distilling auto-regressive language models on summarization, translation, and arithmetic reasoning tasks, and task-agnostic distillation for instruction-tuning.

-
 The key aspects of GKD are:
+
 1. It addresses the train-inference distribution mismatch in auto-regressive sequence models by training the student model on its self-generated output sequences.
 2. GKD allows flexibility in choosing different divergence measures between student and teacher models via the generalized Jensen-Shannon Divergence (JSD), which can be useful when the student lacks the capacity to fully mimic the teacher.

@ -20,6 +20,7 @@ This post-training method was contributed by [Kashif Rasul](https://huggingface.
 ## Usage tips

 The [`GKDTrainer`] is a wrapper around the [`SFTTrainer`] class that takes in a teacher model argument. It needs three parameters to be set via the [`GKDConfig`] namely:
+
 * `lmbda`: controls the student data fraction, i.e., the proportion of on-policy student-generated outputs. When `lmbda=0.0`, the loss reduces to supervised JSD where the student is trained with the token-level probabilities of the teacher. When `lmbda=1.0`, the loss reduces to on-policy JSD, where the student generates output sequences and receives token-specific feedback on these sequences from the teacher. For values strictly between 0 and 1, each batch is randomly assigned to one of the two modes, with the on-policy mode chosen with probability `lmbda`.
 * `seq_kd`: controls whether to perform Sequence-Level KD (can be viewed as supervised FT on teacher-generated outputs). When `seq_kd=True` and `lmbda=0.0`, the loss reduces to supervised JSD, where the teacher generates output sequences and the student receives token-specific feedback on these sequences from the teacher.
 * `beta`: controls the interpolation in the generalized Jensen-Shannon Divergence.  When `beta=0.0` the loss approximates forward KL divergence, while for `beta=1.0` the loss approximates reverse KL divergence. For values in between [0, 1] it interpolates between the two.
@ -85,6 +86,7 @@ trainer.train()
 ### Expected dataset type

 The dataset should be formatted as a list of "messages" where each message is a list of dictionaries with the following keys:
+
 * `role`: either `system`, `assistant` or `user`
 * `content`: the message content

--- a/docs/source/grpo_trainer.md
+++ b/docs/source/grpo_trainer.md
@ -1,6 +1,6 @@
 # GRPO Trainer

-[![](https://img.shields.io/badge/All_models-GRPO-blue)](https://huggingface.co/models?other=grpo,trl)
+[![model badge](https://img.shields.io/badge/All_models-GRPO-blue)](https://huggingface.co/models?other=grpo,trl)

 ## Overview

@ -56,13 +56,13 @@ accelerate launch train_grpo.py

 Distributed across 8 GPUs, the training takes approximately 1 day.

-![](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/grpo_curves.png)
+![GRPO curves](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/grpo_curves.png)

 ## Looking deeper into the GRPO method

 GRPO is an online learning algorithm, meaning it improves iteratively by using the data generated by the trained model itself during training. The intuition behind GRPO objective is to maximize the advantage of the generated completions, while ensuring that the model remains close to the reference policy. To understand how GRPO works, it can be broken down into four main steps: **Generating completions**, **computing the advantage**, **estimating the KL divergence**, and **computing the loss**.

-![](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/grpo_visual.png)
+![GRPO visual](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/grpo_visual.png)

 ### Generating completions

@ -80,7 +80,6 @@ This approach gives the method its name: **Group Relative Policy Optimization (G
 > It was shown in the paper [Understanding R1-Zero-Like Training: A Critical Perspective](https://huggingface.co/papers/2503.20783) that scaling by  \\( \text{std}(\mathbf{r}) \\) may cause a question-level difficulty bias. You can disable this scaling by setting `scale_rewards=False` in [`GRPOConfig`].

 > [!TIP]
-> 
 > [Part I: Tricks or Traps? A Deep Dive into RL for LLM Reasoning (Lite PPO)](https://huggingface.co/papers/2508.08221) showed that calculating the mean at the local (group) level and the standard deviation at the global (batch) level enables more robust reward shaping. You can use this scaling strategy by setting `scale_rewards="batch"` in [`GRPOConfig`].

 ### Estimating the KL divergence
@ -167,10 +166,10 @@ While training and evaluating, we record the following reward metrics:
 - `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, tokens of masked sequences are excluded.)
 - `kl`: The average KL divergence between the model and the reference model, calculated over generated completions. Logged only if `beta` is nonzero.
 - `clip_ratio/region_mean`: The ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities where the GRPO objective is clipped to stay within the trust region:
-$$
-\text{clip}\left( r_{i,t}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \qquad r_{i,t}(\theta) = \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})}\,.
-$$
-A higher value means more tokens are clipped, which constrains how much the policy $\pi_\theta$ can change.
+  $$
+  \text{clip}\left( r_{i,t}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \qquad r_{i,t}(\theta) = \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})}\,.
+  $$
+  A higher value means more tokens are clipped, which constrains how much the policy \\( \pi_\theta \\) can change.
 - `clip_ratio/low_mean`: The average ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the lower bound of the trust region:  \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\)
 - `clip_ratio/low_min`: The minimum ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the lower bound of the trust region:  \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\)
 - `clip_ratio/high_mean`: The average ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the upper bound of the trust region:  \\(r_{i,t}(\theta) > 1 + \epsilon_\mathrm{high}\\)
@ -181,6 +180,7 @@ A higher value means more tokens are clipped, which constrains how much the poli
 ### Speed up training with vLLM-powered generation

 Generation is often the main bottleneck when training with online methods. To accelerate generation, you can use [vLLM](https://github.com/vllm-project/vllm), a high-throughput, low-latency inference engine for LLMs. To enable it, first install the package with
+
 ```shell
 pip install trl[vllm]
 ```
@ -195,11 +195,13 @@ We support two ways of using vLLM during training: **server mode** and **colocat
 In this mode, vLLM runs in a separate process (and on separate GPUs) and communicates with the trainer via HTTP. This is ideal if you have dedicated GPUs for inference.

 1. **Start the vLLM server**:
+
   ```bash
   trl vllm-serve --model <model_name>
   ```

 2. **Enable server mode in your training script**:
+
   ```python
   from trl import GRPOConfig

@ -232,12 +234,7 @@ training_args = GRPOConfig(
 >
 > We provide a [HF Space](https://huggingface.co/spaces/trl-lib/recommend-vllm-memory) to help estimate the recommended GPU memory utilization based on your model configuration and experiment settings. Simply use it as follows to get `vllm_gpu_memory_utilization` recommendation:
 >
-> <iframe
-> 	src="https://trl-lib-recommend-vllm-memory.hf.space"
-> 	frameborder="0"
-> 	width="850"
-> 	height="450"
-> ></iframe>
+> <iframe src="https://trl-lib-recommend-vllm-memory.hf.space" frameborder="0" width="850" height="450"></iframe>
 >
 > If the recommended value does not work in your environment, we suggest adding a small buffer (e.g., +0.05 or +0.1) to the recommended value to ensure stability.
 >
@ -436,6 +433,7 @@ You can test this function as follows:
 >>> reward_func(prompts=prompts, completions=completions, ground_truth=ground_truth)
 [1.0, 0.0]
 ```
+
 #### Example 4: Multi-task reward functions

 Below is an example of using multiple reward functions in the [`GRPOTrainer`]. In this example, we define two task-specific reward functions: `math_reward_func` and `coding_reward_func`. The `math_reward_func` rewards math problems based on their correctness, while the `coding_reward_func` rewards coding problems based on whether the solution works.
@ -496,8 +494,6 @@ In this example, the `math_reward_func` and `coding_reward_func` are designed to

 Note that the [`GRPOTrainer`] will ignore the `None` rewards returned by the reward functions and only consider the rewards returned by the relevant functions. This ensures that the model is trained on the relevant tasks and ignores the tasks for which there is no relevant reward function.

-
-
 #### Passing the reward function to the trainer

 To use your custom reward function, pass it to the [`GRPOTrainer`] as follows:
--- a/docs/source/index.md
+++ b/docs/source/index.md
@ -7,6 +7,46 @@
 TRL is a full stack library where we provide a set of tools to train transformer language models with methods like Supervised Fine-Tuning (SFT), Group Relative Policy Optimization (GRPO), Direct Preference Optimization (DPO), Reward Modeling, and more.
 The library is integrated with 🤗 [transformers](https://github.com/huggingface/transformers).

+Below is the current list of TRL trainers, organized by method type (⚡️ = vLLM support).
+
+## Taxonomy
+
+<div style="display: flex; justify-content: space-between; width: 100%; gap: 2rem;">
+<div style="flex: 1; min-width: 0;">
+
+### Online methods
+
+- [`GRPOTrainer`] ⚡️
+- [`RLOOTrainer`] ⚡️
+- [`OnlineDPOTrainer`] ⚡️
+- [`NashMDTrainer`] ⚡️
+- [`XPOTrainer`] ⚡️
+- [`PPOTrainer`]
+
+### Reward modeling
+
+- [`PRMTrainer`]
+- [`RewardTrainer`]
+
+</div>
+<div style="flex: 1; min-width: 0;">
+
+### Offline methods
+
+- [`SFTTrainer`]
+- [`DPOTrainer`]
+- [`ORPOTrainer`]
+- [`BCOTrainer`]
+- [`CPOTrainer`]
+- [`KTOTrainer`]
+
+### Knowledge distillation
+
+- [`GKDTrainer`]
+
+</div>
+</div>
+
 ## 🎉 What's New

 **✨ OpenAI GPT OSS Support**: TRL now fully supports fine-tuning the latest [OpenAI GPT OSS models](https://huggingface.co/collections/openai/gpt-oss-68911959590a1634ba11c7a4)! Check out the:
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@ -1,13 +1,15 @@
 # Installation
+
 You can install TRL either from PyPI or from source:

 ## PyPI
+
 Install the library with pip or [uv](https://docs.astral.sh/uv/):

 <hfoptions id="install">
 <hfoption id="uv">

-uv is a fast Rust-based Python package and project manager. Refer to [Installation](https://docs.astral.sh/uv/getting-started/installation/) for installation instructions).
+uv is a fast Rust-based Python package and project manager. Refer to [Installation](https://docs.astral.sh/uv/getting-started/installation/) for installation instructions.

 ```bash
 uv pip install trl
@ -24,6 +26,7 @@ pip install trl
 </hfoptions>

 ## Source
+
 You can also install the latest version from source. First clone the repo and then run the installation with `pip`:

 ```bash
--- a/docs/source/jobs_training.md
+++ b/docs/source/jobs_training.md
@ -1,6 +1,6 @@
 # Training with Jobs

-[![](https://img.shields.io/badge/All_models-HF_Jobs-blue)](https://huggingface.co/models?other=hf_jobs,trl)
+[![model badge](https://img.shields.io/badge/All_models-HF_Jobs-blue)](https://huggingface.co/models?other=hf_jobs,trl)

 [Hugging Face Jobs](https://huggingface.co/docs/huggingface_hub/guides/jobs) lets you run training scripts on fully managed infrastructure—no need to manage GPUs or local environment setup.

--- a/docs/source/judges.md
+++ b/docs/source/judges.md
@ -13,7 +13,7 @@ pip install trl[judges]

 ## Using the provided judges

-TRL provides several judges out of the box. For example, you can use the `HfPairwiseJudge` to compare two completions using a pre-trained model from the Hugging Face model hub:
+TRL provides several judges out of the box. For example, you can use the [`HfPairwiseJudge`] to compare two completions using a pre-trained model from the Hugging Face model hub:

 ```python
 from trl import HfPairwiseJudge
--- a/docs/source/kernels_hub.md
+++ b/docs/source/kernels_hub.md
@ -46,7 +46,6 @@ trl sft ... --attn_implementation kernels-community/flash-attn
 > [!TIP]
 > Now you can leverage faster attention backends with a pre-optimized kernel for your hardware configuration from the Hub, speeding up both development and training.

-
 ## Comparing Attention Implementations

 We evaluated various attention implementations available in transformers, along with different kernel backends, using **TRL** and **SFT**.  
@ -54,15 +53,14 @@ The experiments were run on a single **H100 GPU** with **CUDA 12.9**, leveraging
 Keep in mind that the results shown here are specific to this setup and may vary with different training configurations.

 The following figure illustrates both **latency** (time per training step) and **peak allocated memory** for the different attention implementations and kernel backends.  
-Kernel-based implementations perform on par with custom-installed attention, and increasing the model’s `max_length` further enhances performance. Memory consumption is similar across all implementations, showing no significant differences. We get the same performance but with less friction, as described in [the following section](#benchmarking-flash-attention-build-from-source-vs-hub-kernels).
-
+Kernel-based implementations perform on par with custom-installed attention, and increasing the model’s `max_length` further enhances performance. Memory consumption is similar across all implementations, showing no significant differences. We get the same performance but with less friction, as described in [the following section](#flash-attention-vs-hub-kernels).

 <div class="flex justify-center">
   <img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/kernels_guide_latency.png" alt="Latency" width="45%"/>
   <img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/kernels_guide_peak_allocated_memory.png" alt="Peak Allocated Memory" width="45%"/>
 </div>

-## Flash Attention (Build-from-Source) vs. Hub Kernels
+## Flash Attention vs. Hub Kernels

 Building Flash Attention from source can be time-consuming, often taking anywhere from several minutes to hours, depending on your hardware, CUDA/PyTorch configuration, and whether precompiled wheels are available.  

@ -74,7 +72,6 @@ You can combine **FlashAttention kernels** with **Liger kernels** for additional

 First, install the Liger kernel dependency:

-
 ```bash
 pip install liger-kernel
 ```
@ -96,6 +93,4 @@ training_args = SFTConfig(
 )
 ```

-Learn more about this integration [here](./liger_kernel_integration).
-
-
+Learn more about the [Liger Kernel Integration](./liger_kernel_integration).
--- a/docs/source/kto_trainer.md
+++ b/docs/source/kto_trainer.md
@ -1,12 +1,11 @@
 # KTO Trainer

-[![](https://img.shields.io/badge/All_models-KTO-blue)](https://huggingface.co/models?other=kto,trl)
+[![model badge](https://img.shields.io/badge/All_models-KTO-blue)](https://huggingface.co/models?other=kto,trl)

 ## Overview

 Kahneman-Tversky Optimization (KTO) was introduced in [KTO: Model Alignment as Prospect Theoretic Optimization](https://huggingface.co/papers/2402.01306) by [Kawin Ethayarajh](https://huggingface.co/kawine), [Winnie Xu](https://huggingface.co/xwinxu), [Niklas Muennighoff](https://huggingface.co/Muennighoff), Dan Jurafsky, [Douwe Kiela](https://huggingface.co/douwekiela).

-
 The abstract from the paper is the following:

 > Kahneman & Tversky's prospect theory tells us that humans perceive random variables in a biased but well-defined manner; for example, humans are famously loss-averse. We show that objectives for aligning LLMs with human feedback implicitly incorporate many of these biases -- the success of these objectives (e.g., DPO) over cross-entropy minimization can partly be ascribed to them being human-aware loss functions (HALOs). However, the utility functions these methods attribute to humans still differ from those in the prospect theory literature. Using a Kahneman-Tversky model of human utility, we propose a HALO that directly maximizes the utility of generations instead of maximizing the log-likelihood of preferences, as current methods do. We call this approach Kahneman-Tversky Optimization (KTO), and it matches or exceeds the performance of preference-based methods at scales from 1B to 30B. Crucially, KTO does not need preferences -- only a binary signal of whether an output is desirable or undesirable for a given input. This makes it far easier to use in the real world, where preference data is scarce and expensive.
@ -51,7 +50,7 @@ accelerate launch train_kto.py

 Distributed across 8 x H100 GPUs, the training takes approximately 30 minutes. You can verify the training progress by checking the reward graph. An increasing trend in the reward margin indicates that the model is improving and generating better responses over time.

-![](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/kto-qwen2-reward-margin.png)
+![kto qwen2 reward margin](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/kto-qwen2-reward-margin.png)

 To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-KTO) performs, you can use the [Transformers Chat CLI](https://huggingface.co/docs/transformers/quicktour#chat-with-text-generation-models).

@ -60,14 +59,14 @@ To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-KTO) pe
 What is the best programming language?

 <strong><span style="color: blue;">&lt;trl-lib/Qwen2-0.5B-KTO&gt;:</span></strong>
-The best programming language can vary depending on individual preferences, industry-specific requirements, technical skills, and familiarity with the specific use case or task. Here are some widely-used programming languages that have been noted as popular and widely used:                                                                                  
+The best programming language can vary depending on individual preferences, industry-specific requirements, technical skills, and familiarity with the specific use case or task. Here are some widely-used programming languages that have been noted as popular and widely used:

 Here are some other factors to consider when choosing a programming language for a project:

- <strong><span style="color: green;">1</span> JavaScript</strong>: JavaScript is at the heart of the web and can be used for building web applications, APIs, and interactive front-end applications like frameworks like React and Angular. It's similar to C, C++, and F# in syntax structure and is accessible and easy to learn, making it a popular choice for beginners and professionals alike.                                                                   
- <strong><span style="color: green;">2</span> Java</strong>: Known for its object-oriented programming (OOP) and support for Java 8 and .NET, Java is used for developing enterprise-level software applications, high-performance games, as well as mobile apps, game development, and desktop applications.                                                                                                                                                            
- <strong><span style="color: green;">3</span> C++</strong>: Known for its flexibility and scalability, C++ offers comprehensive object-oriented programming and is a popular choice for high-performance computing and other technical fields. It's a powerful platform for building real-world applications and games at scale.                                                                                                                                         
- <strong><span style="color: green;">4</span> Python</strong>: Developed by Guido van Rossum in 1991, Python is a high-level, interpreted, and dynamically typed language known for its simplicity, readability, and versatility.   
+ <strong><span style="color: green;">1</span> JavaScript</strong>: JavaScript is at the heart of the web and can be used for building web applications, APIs, and interactive front-end applications like frameworks like React and Angular. It's similar to C, C++, and F# in syntax structure and is accessible and easy to learn, making it a popular choice for beginners and professionals alike.
+ <strong><span style="color: green;">2</span> Java</strong>: Known for its object-oriented programming (OOP) and support for Java 8 and .NET, Java is used for developing enterprise-level software applications, high-performance games, as well as mobile apps, game development, and desktop applications.
+ <strong><span style="color: green;">3</span> C++</strong>: Known for its flexibility and scalability, C++ offers comprehensive object-oriented programming and is a popular choice for high-performance computing and other technical fields. It's a powerful platform for building real-world applications and games at scale.
+ <strong><span style="color: green;">4</span> Python</strong>: Developed by Guido van Rossum in 1991, Python is a high-level, interpreted, and dynamically typed language known for its simplicity, readability, and versatility.
 </code></pre>

 ## Expected dataset format
@ -102,7 +101,6 @@ To ensure that we train MOEs similarly during preference-tuning, it is beneficia
 This option is enabled by setting `output_router_logits=True` in the model config (e.g. [`~transformers.MixtralConfig`]).  
 To scale how much the auxiliary loss contributes to the total loss, use the hyperparameter `router_aux_loss_coef=...` (default: `0.001`) in the model config.

-
 ### Batch size recommendations

 Use a per-step batch size that is at least 4, and an effective batch size between 16 and 128. Even if your effective batch size is large, if your per-step batch size is too small, the KL estimate in KTO will be poor.
--- a/docs/source/liger_kernel_integration.md
+++ b/docs/source/liger_kernel_integration.md
@ -7,15 +7,15 @@

 With this memory reduction, you can potentially turn off `cpu_offloading` or gradient checkpointing to further boost the performance.

-| Speed Up                 | Memory Reduction        |
-|--------------------------|-------------------------|
+| Speed Up | Memory Reduction |
+| --- | --- |
 | ![Speed up](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/docs/images/e2e-tps.png) | ![Memory](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/docs/images/e2e-memory.png) |

 1. To use Liger-Kernel in [`SFTTrainer`], first install it by:
-
-```bash
-pip install liger-kernel
-```
+  
+  ```bash
+  pip install liger-kernel
+  ```

 2. Once installed, set `use_liger_kernel` in [`SFTConfig`]. No other changes are needed!

--- a/docs/source/logging.md
+++ b/docs/source/logging.md
@ -3,7 +3,7 @@
 As reinforcement learning algorithms are historically challenging to debug, it's important to pay careful attention to logging.
 By default, TRL trainers like [`PPOTrainer`] and [`GRPOTrainer`] save a lot of relevant information to supported experiment trackers like Trackio, Weights & Biases (wandb) or TensorBoard.

-Upon initialization, pass the `report_to` argument to the respective configuration object (e.g., [`PPOConfig`] for `PPOTrainer`, or [`GRPOConfig`] for `GRPOTrainer`):
+Upon initialization, pass the `report_to` argument to the respective configuration object (e.g., [`PPOConfig`] for [`PPOTrainer`], or [`GRPOConfig`] for [`GRPOTrainer`]):

 ```python
 # For PPOTrainer
@ -19,7 +19,7 @@ grpo_config = GRPOConfig(
 )
 ```

-If you want to log with TensorBoard, you might also need to specify logging directories, for example, by adding `logging_dir=PATH_TO_LOGS` to the configuration object (e.g., `PPOConfig` or `GRPOConfig`).
+If you want to log with TensorBoard, you might also need to specify logging directories, for example, by adding `logging_dir=PATH_TO_LOGS` to the configuration object (e.g., [`PPOConfig`] or [`GRPOConfig`]).

 ## PPO Logging

@ -44,6 +44,7 @@ Here's a brief explanation for the logged metrics provided in the data:
 * `episode`: The current episode count in the training process.

 ### Crucial values
+
 During training, many values are logged; here are the most important ones:

 1. `objective/scores`: The mean scores returned by the reward model / environment.
@ -63,7 +64,7 @@ Here's a brief explanation for the logged metrics provided in the data for the G

 * `num_tokens`: Total number of input tokens processed during training so far.

-#### Completions
+### Completions

 * `completions/mean_length`: Mean length of all generated completions (including those not ending with an EOS token).
 * `completions/min_length`: Minimum length among all generated completions.
@ -73,34 +74,33 @@ Here's a brief explanation for the logged metrics provided in the data for the G
 * `completions/min_terminated_length`: Minimum length among completions that ended with an EOS token.
 * `completions/max_terminated_length`: Maximum length among completions that ended with an EOS token.

-#### Rewards
+### Rewards

 * `rewards/{reward_func_name}/mean`: The mean reward obtained from a specific, named reward function (e.g., `rewards/my_custom_reward/mean`). This is logged for each reward function used.
 * `rewards/{reward_func_name}/std`: The standard deviation of rewards from a specific, named reward function.
 * `reward`: The overall mean of the (potentially weighted and, if `args.scale_rewards` is true, normalized) rewards, after group-wise normalization (advantages).
 * `reward_std`: The standard deviation of the (potentially weighted) rewards *before* group-wise normalization for advantages.

-#### Policy and Loss Metrics
+### Policy and Loss Metrics

-* `kl`: The mean Kullback-Leibler (KL) divergence between the current policy and the reference policy. This is logged only if `beta` (the KL coefficient in `GRPOConfig`) is non-zero.
+* `kl`: The mean Kullback-Leibler (KL) divergence between the current policy and the reference policy. This is logged only if `beta` (the KL coefficient in [`GRPOConfig`]) is non-zero.
 * `entropy`: Average entropy of token predictions across generated completions.
-* If Liger GRPOLoss is used (`use_liger_loss: True` in `GRPOConfig`):
-    *   `clip_ratio`: The fraction of policy updates where the probability ratio was clipped according to the GRPO loss's epsilon bounds.
+* If Liger GRPOLoss is used (`use_liger_loss: True` in [`GRPOConfig`]):
+  * `clip_ratio`: The fraction of policy updates where the probability ratio was clipped according to the GRPO loss's epsilon bounds.
 * If standard GRPOLoss is used (`use_liger_loss: False`):
-    *   `clip_ratio/low_mean`: The mean fraction of instances where the probability ratio `r_t(θ)` was clipped at the lower bound `1 - epsilon_low` (occurs when advantage is negative and ratio is below the bound).
-    *   `clip_ratio/low_min`: The minimum observed fraction for `clip_ratio/low_mean` across batches/processes.
-    *   `clip_ratio/high_mean`: The mean fraction of instances where the probability ratio `r_t(θ)` was clipped at the upper bound `1 + epsilon_high` (occurs when advantage is positive and ratio is above the bound).
-    *   `clip_ratio/high_max`: The maximum observed fraction for `clip_ratio/high_mean` across batches/processes.
-    *   `clip_ratio/region_mean`: The mean fraction of instances where the probability ratio was clipped at either the lower or upper bound.
+  * `clip_ratio/low_mean`: The mean fraction of instances where the probability ratio `r_t(θ)` was clipped at the lower bound `1 - epsilon_low` (occurs when advantage is negative and ratio is below the bound).
+  * `clip_ratio/low_min`: The minimum observed fraction for `clip_ratio/low_mean` across batches/processes.
+  * `clip_ratio/high_mean`: The mean fraction of instances where the probability ratio `r_t(θ)` was clipped at the upper bound `1 + epsilon_high` (occurs when advantage is positive and ratio is above the bound).
+  * `clip_ratio/high_max`: The maximum observed fraction for `clip_ratio/high_mean` across batches/processes.
+  * `clip_ratio/region_mean`: The mean fraction of instances where the probability ratio was clipped at either the lower or upper bound.

 ### Crucial GRPO values

 During GRPO training, monitor these values for insights into performance and stability:

-1.  `reward`: This is the primary objective. It reflects the (group-wise normalized) rewards the policy is achieving. It should generally increase during successful training.
-1.  `kl`: If `beta > 0`, this tracks the divergence from the reference model. Keep an eye on it to ensure the policy doesn't stray too far, which can lead to instability.
-1.  `clip_ratio/*` (either `clip_ratio` for Liger loss or the more detailed `clip_ratio/...` metrics for standard loss): These indicate how often the policy updates are being constrained by the GRPO clipping mechanism. Very high values might suggest that the policy is trying to change too drastically (potentially due to large advantages or a learning rate that's too high) or that the epsilon clipping range is too restrictive.
-1.  `completions/clipped_ratio`: A high ratio here indicates that the model is frequently generating completions that are cut off by `max_completion_length` rather than naturally ending with an EOS token. This might suggest issues with learning sequence termination or that `max_completion_length` is too short.
-1. `rewards/{reward_func_name}/mean`: Monitoring the mean of individual reward functions can help diagnose which aspects of the desired behavior the model is learning or struggling with, especially when using multiple reward sources.
-1. `entropy`: Measures how uncertain the policy is in its action choices, higher entropy suggests more exploration. A collapse in entropy means the policy is becoming overconfident and deterministic, often too early. This can stall learning by reducing exploration and making updates overly biased. Stable but non-zero entropy is usually a sign that the policy retains flexibility and continues to explore.
-
+* `reward`: This is the primary objective. It reflects the (group-wise normalized) rewards the policy is achieving. It should generally increase during successful training.
+* `kl`: If `beta > 0`, this tracks the divergence from the reference model. Keep an eye on it to ensure the policy doesn't stray too far, which can lead to instability.
+* `clip_ratio/*` (either `clip_ratio` for Liger loss or the more detailed `clip_ratio/...` metrics for standard loss): These indicate how often the policy updates are being constrained by the GRPO clipping mechanism. Very high values might suggest that the policy is trying to change too drastically (potentially due to large advantages or a learning rate that's too high) or that the epsilon clipping range is too restrictive.
+* `completions/clipped_ratio`: A high ratio here indicates that the model is frequently generating completions that are cut off by `max_completion_length` rather than naturally ending with an EOS token. This might suggest issues with learning sequence termination or that `max_completion_length` is too short.
+* `rewards/{reward_func_name}/mean`: Monitoring the mean of individual reward functions can help diagnose which aspects of the desired behavior the model is learning or struggling with, especially when using multiple reward sources.
+* `entropy`: Measures how uncertain the policy is in its action choices; higher entropy suggests more exploration. A collapse in entropy means the policy is becoming overconfident and deterministic, often too early. This can stall learning by reducing exploration and making updates overly biased. Stable but non-zero entropy is usually a sign that the policy retains flexibility and continues to explore.
--- a/docs/source/lora_without_regret.md
+++ b/docs/source/lora_without_regret.md
@ -0,0 +1,442 @@
+# LoRA Without Regret
+
+Recent research from the team at [Thinking Machines Lab](https://thinkingmachines.ai/blog/lora/) (Schulman et al., 2025) shows that **LoRA can match full fine-tuning performance** when configured correctly, while using only ~67% of the compute. These findings are exciting to TRL users because they're straightforward to implement and can improve model performance on smaller budgets.
+
+This guide provides simple instructions to reproduce the results of the blog post in TRL.
+
+> [!TIP]
+> It is recommended to read the blog post before following this guide, or to consult both resources in parallel for best results.
+
+## Benefits of LoRA over full fine-tuning
+
+First of all, let's remind ourselves of the benefits of [LoRA over full fine-tuning](https://huggingface.co/docs/trl/en/peft_integration).
+
+LoRA adds adapter layers on top of the base model, which contain significantly fewer parameters than the base model itself. This design reduces GPU memory requirements and enables more efficient training. As described in the [blog](https://thinkingmachines.ai/blog/lora/), this approach was originally thought to involve a performance trade-off, although careful configuration can overcome this trade-off and match full fine-tuning performance.  
+
+## Examples with TRL
+
+Let's implement and train LoRA adapters in TRL scripts based on the core findings of the blog post. Afterwards, we'll revisit each finding in light of the TRL results.
+
+### Supervised Fine-Tuning (SFT)
+
+The blog post performs SFT on a range of models and datasets from the Hub, which we can reproduce in TRL.
+
+| Model | Dataset |
+| --- | --- |
+| [Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B) | [allenai/tulu-3-sft-mixture](https://huggingface.co/datasets/allenai/tulu-3-sft-mixture) |
+| [Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B) | [open-thoughts/OpenThoughts-114k](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k) |
+| [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B) | [allenai/tulu-3-sft-mixture](https://huggingface.co/datasets/allenai/tulu-3-sft-mixture) |
+| [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B) | [open-thoughts/OpenThoughts-114k](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k) |
+
+<hfoptions id="sft">
+<hfoption id="python">
+
+We can integrate these findings with the TRL Python API like so:
+
+```python
+
+from datasets import load_dataset
+from peft import LoraConfig
+from trl import SFTTrainer, SFTConfig
+
+dataset = load_dataset("open-thoughts/OpenThoughts-114k", split="train")
+
+peft_config = LoraConfig(r=256, lora_alpha=16, target_modules="all-linear")
+
+training_args = SFTConfig(
+    learning_rate=2e-4,
+    per_device_train_batch_size=1,
+    gradient_accumulation_steps=4,
+    num_train_epochs=1,
+    report_to=["trackio"],
+)
+
+trainer = SFTTrainer(
+    model="Qwen/Qwen2.5-3B-Instruct",
+    train_dataset=dataset,
+    peft_config=peft_config,
+    args=training_args,
+)
+
+trainer.train()
+
+```
+
+</hfoption>
+<hfoption id="jobs">
+
+```bash
+
+hf jobs uv run \
+    --flavor a100-large \
+    --timeout 8h \
+    --secrets HF_TOKEN \
+    "https://raw.githubusercontent.com/huggingface/trl/main/trl/scripts/sft.py" \
+    --model_name_or_path Qwen/Qwen2.5-3B-Instruct \
+    --dataset_name open-thoughts/OpenThoughts-114k \
+    --learning_rate 2.0e-5 \
+    --num_train_epochs 1 \
+    --packing \
+    --per_device_train_batch_size 2 \
+    --gradient_accumulation_steps 16 \
+    --use_peft \
+    --lora_r 256 \
+    --lora_alpha 16 \
+    --lora_target_modules all-linear \
+    --output_dir Qwen2.5-3B-OpenThoughts-LoRA \
+    --report_to trackio \
+    --push_to_hub
+
+```
+
+To use Hugging Face Jobs, you will need to be logged in to the Hugging Face Hub (`hf auth login`) and have a [Pro](https://hf.co/pro), [Team](https://hf.co/enterprise), or [Enterprise](https://hf.co/enterprise) plan. Check out the [Jobs documentation](https://huggingface.co/docs/huggingface_hub/en/guides/jobs) for more details.
+
+</hfoption>
+<hfoption id="local">
+
+```bash
+
+uv run "https://raw.githubusercontent.com/huggingface/trl/main/trl/scripts/sft.py" \
+    --model_name_or_path Qwen/Qwen2.5-3B-Instruct \
+    --dataset_name open-thoughts/OpenThoughts-114k \
+    --learning_rate 2.0e-5 \
+    --num_train_epochs 1 \
+    --packing \
+    --per_device_train_batch_size 2 \
+    --gradient_accumulation_steps 16 \
+    --gradient_checkpointing \
+    --eval_strategy no \
+    --use_peft \
+    --lora_r 256 \
+    --lora_alpha 16 \
+    --lora_target_modules all-linear \
+    --output_dir Qwen2.5-3B-OpenThoughts-LoRA \
+    --report_to trackio \
+    --push_to_hub
+
+```
+
+To run the script locally, you will need to have `uv` installed. Check out the [uv documentation](https://docs.astral.sh/uv/) for more details.
+
+</hfoption>
+</hfoptions>
+
+Once training starts, you can monitor the progress in [Trackio](https://huggingface.co/trackio), which will log the URL.
+
+### Reinforcement Learning (GRPO)
+
+The blog post performs GRPO on a range of models and datasets from the Hub, and once again we can reproduce the results in TRL.
+
+| Model | Dataset |
+| --- | --- |
+| [Llama-3.1-8B-Base](https://huggingface.co/meta-llama/Llama-3.1-8B) | [GSM8k](https://huggingface.co/datasets/openai/gsm8k) |
+| [Llama-3.1-8B-Base](https://huggingface.co/meta-llama/Llama-3.1-8B) | [DeepMath-103K](https://huggingface.co/datasets/zwhe99/DeepMath-103K) |
+| [Qwen3-8b-base](https://huggingface.co/Qwen/Qwen3-8b-base) | [DeepMath-103K](https://huggingface.co/datasets/zwhe99/DeepMath-103K) |
+
+For reinforcement learning, the blog uses a math reasoning task that we can reproduce as a Python function.
+
+<details>
+<summary>Reward function</summary>
+
+```python
+def strip_reasoning_accuracy_reward(
+    completions: list[list[dict[str, str]]], solution: list[str], **kwargs
+) -> list[Optional[float]]:
+    """Reward function that strips reasoning tags and checks mathematical accuracy.
+
+    This function:
+    1. Extracts the content from completions
+    2. Removes <think></think> tags (for reasoning that shouldn't be evaluated)
+    3. Parses both the gold solution and the predicted answer
+    4. Uses math_verify to check if they are mathematically equivalent
+
+    Args:
+        completions: List of model completions, each containing a list of messages
+        solution: List of ground truth solutions
+        **kwargs: Additional arguments (ignored but required for trainer compatibility)
+
+    Returns:
+        List of rewards where:
+        - 1.0 if the answer is correct
+        - 0.0 if the answer is incorrect
+        - None if the solution is not parseable (skips this example)
+    """
+    contents = [completion[0]["content"] for completion in completions]
+    rewards = []
+
+    for content, sol in zip(contents, solution):
+        # Strip reasoning tags from completion
+        while "<think>" in content and "</think>" in content:
+            start = content.find("<think>")
+            end = content.find("</think>", start)
+            if start != -1 and end != -1:
+                content = content[:start] + content[end + len("</think>") :]
+            else:
+                break
+
+        # Parse gold solution
+        gold_parsed = parse(
+            f"${sol}$",
+            extraction_config=[
+                LatexExtractionConfig(
+                    boxed_match_priority=0, try_extract_without_anchor=True
+                )
+            ],
+        )
+
+        if len(gold_parsed) != 0:
+            # We require the answer to be provided in correct latex (no malformed operators)
+            answer_parsed = parse(
+                content,
+                extraction_config=[
+                    LatexExtractionConfig(
+                        boxed_match_priority=0,
+                        normalization_config=NormalizationConfig(
+                            basic_latex=True,
+                            units=True,
+                            malformed_operators=False,
+                            nits=False,
+                            boxed=True,
+                        ),
+                        try_extract_without_anchor=False,
+                    )
+                ],
+                extraction_mode="first_match",
+            )
+
+            # Compute binary rewards if verifiable, `None` otherwise to skip this example
+            try:
+                reward = float(verify(gold_parsed, answer_parsed))
+            except Exception as e:
+                print(
+                    f"verify failed: {e}, answer: {answer_parsed}, gold: {gold_parsed}"
+                )
+                reward = None
+        else:
+            # If the gold solution is not parseable, we assign `None` to skip this example
+            reward = None
+
+        rewards.append(reward)
+
+    return rewards
+```
+
+</details>
+
+<hfoptions id="grpo">
+<hfoption id="python">
+
+We can implement these recommendations with the TRL Python API like so:
+
+```python
+
+from datasets import load_dataset
+from peft import LoraConfig
+from trl import GRPOConfig, GRPOTrainer
+
+dataset = load_dataset("HuggingFaceH4/OpenR1-Math-220k-default-verified", split="train")
+
+def strip_reasoning_accuracy_reward(completions, **kwargs):
+    """Reward function that strips reasoning tags and scores the accuracy of the model outputs."""
+
+    ... 
+
+peft_config = LoraConfig(
+    r=1,
+    lora_alpha=32,
+    target_modules="all-linear"
+)
+
+training_args = GRPOConfig(
+    learning_rate=5e-5,
+    per_device_train_batch_size=1,
+    gradient_accumulation_steps=4,
+    num_train_epochs=1,
+    num_generations=8,
+    generation_batch_size=8,
+    report_to=["trackio"],
+)
+
+trainer = GRPOTrainer(
+    model="Qwen/Qwen3-0.6B",
+    reward_funcs=strip_reasoning_accuracy_reward,
+    args=training_args,
+    train_dataset=dataset,
+    peft_config=peft_config,
+)
+
+trainer.train()
+
+```
+
+> [!WARNING]
+> This snippet omits the body of the reward function, which is defined above, to keep the example concise.
+
+</hfoption>
+<hfoption id="jobs">
+
+```bash
+
+hf jobs uv run \
+    --flavor a100-large \
+    --timeout 4h \
+    --secrets HF_TOKEN \
+    --env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
+    "https://huggingface.co/datasets/burtenshaw/lora-without-regrets/resolve/main/grpo.py" \
+    --model_name_or_path Qwen/Qwen3-0.6B \
+    --dataset_name HuggingFaceH4/OpenR1-Math-220k-default-verified \
+    --output_dir grpo-full-qwen3-0.6b \
+    --learning_rate 1.0e-6 \
+    --lr_scheduler_type cosine \
+    --warmup_ratio 0.0 \
+    --max_grad_norm 1.0 \
+    --beta 0.0 \
+    --max_prompt_length 1024 \
+    --max_completion_length 4096 \
+    --num_generations 16 \
+    --generation_batch_size 16 \
+    --gradient_accumulation_steps 8 \
+    --per_device_train_batch_size 1 \
+    --num_train_epochs 1 \
+    --lora_r 1 \
+    --lora_alpha 32 \
+    --lora_dropout 0.0 \
+    --lora_target_modules all-linear \
+    --vllm_mode colocate \
+    --save_strategy steps \
+    --save_steps 50 \
+    --save_total_limit 1 \
+    --logging_steps 1 \
+    --max_steps 200 \
+    --report_to trackio
+```
+
+To use Hugging Face Jobs, you will need to be logged in to the Hugging Face Hub (`hf auth login`) and have a [Pro](https://hf.co/pro), [Team](https://hf.co/enterprise), or [Enterprise](https://hf.co/enterprise) plan. Check out the [Jobs documentation](https://huggingface.co/docs/huggingface_hub/en/guides/jobs) for more details.
+
+</hfoption>
+<hfoption id="local">
+
+```bash
+uv run "https://huggingface.co/datasets/burtenshaw/lora-without-regrets/resolve/main/grpo.py" \
+    --model_name_or_path Qwen/Qwen3-0.6B \
+    --dataset_name HuggingFaceH4/OpenR1-Math-220k-default-verified \
+    --output_dir grpo-full-qwen3-0.6b \
+    --learning_rate 1.0e-6 \
+    --lr_scheduler_type cosine \
+    --warmup_ratio 0.0 \
+    --max_grad_norm 1.0 \
+    --beta 0.0 \
+    --max_prompt_length 1024 \
+    --max_completion_length 4096 \
+    --num_generations 16 \
+    --generation_batch_size 16 \
+    --gradient_accumulation_steps 8 \
+    --per_device_train_batch_size 1 \
+    --num_train_epochs 1 \
+    --lora_r 1 \
+    --lora_alpha 32 \
+    --lora_dropout 0.0 \
+    --lora_target_modules all-linear \
+    --vllm_mode colocate \
+    --save_strategy steps \
+    --save_steps 50 \
+    --save_total_limit 1 \
+    --logging_steps 1 \
+    --max_steps 200 \
+    --report_to trackio
+```
+
+To run the script locally, you will need to have `uv` installed. Check out the [uv documentation](https://docs.astral.sh/uv/) for more details.
+
+</hfoption>
+</hfoptions>
+
+The reinforcement learning script with GRPO is implemented as a custom script in TRL, which uses the reward function shown above. You can review it at [`grpo.py`](https://huggingface.co/datasets/burtenshaw/lora-without-regrets/blob/main/grpo.py), which implements reinforcement learning with LoRA best practices.
+
+## Key findings in optimizing LoRA
+
+The authors recommend applying LoRA to all weight matrices rather than limiting it to attention layers, as increasing the rank does not compensate for this restriction. In TRL, this can be configured using `--lora_target_modules all-linear` to apply LoRA to all weight matrices.
+
+We were able to reproduce the results of the blog post using TRL and the SmolLM3 model. We trained the model for 500 steps on the [Math 220k dataset](https://huggingface.co/datasets/HuggingFaceH4/OpenR1-Math-220k-default-verified) with the reward function and configuration above. As you can see in the figure below, the LoRA model's average train reward curve matches the full fine-tuning curve.
+
+![train reward](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lora_without_regret/5.png)
+
+And most importantly, the LoRA model uses significantly less memory than the full fine-tuning model, as we can see in the figure below.
+
+![memory usage](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lora_without_regret/6.png)
+
+Here are the parameters we used to train the above models:
+
+| Parameter | LoRA | Full FT |
+| --- | --- | --- |
+| `--model_name_or_path` | HuggingFaceTB/SmolLM3-3B | HuggingFaceTB/SmolLM3-3B |
+| `--dataset_name` | HuggingFaceH4/OpenR1-Math-220k-default-verified | HuggingFaceH4/OpenR1-Math-220k-default-verified |
+| `--learning_rate` | 1.0e-5 | 1.0e-6 |
+| `--max_prompt_length` | 1024 | 1024 |
+| `--max_completion_length` | 4096 | 4096 |
+| `--lora_r` | 1 | - |
+| `--lora_alpha` | 32 | - |
+| `--lora_dropout` | 0.0 | - |
+| `--lora_target_modules` | all-linear | - |
+
+Let's break down the key findings of the blog post and how we were able to reproduce them.
+
+### 1. *LoRA performs better when applied to all weight matrices*
+
+The authors recommend applying LoRA to all weight matrices rather than limiting it to attention layers, as increasing the rank does not compensate for this restriction.
+
+![all layers](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lora_without_regret/1.png)
+
+Attention-only LoRA underperforms even when using a higher rank to match parameter count. In TRL, this can be configured using `--lora_target_modules all-linear` to apply LoRA to all weight matrices.  In Python, we can do this like so:
+
+```python
+from peft import LoraConfig  
+
+peft_config = LoraConfig(target_modules="all-linear")  
+```
+
+### 2. *The adapter needs sufficient capacity to learn from the dataset*
+
+The blog post recommends using a sufficient LoRA rank to learn from the dataset. The rank determines the number of trainable parameters in the LoRA adapter, and therefore its capacity. As the authors put it, "For datasets that exceed LoRA capacity, LoRA underperforms FullFT".
+
+![LoRA rank capacity](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lora_without_regret/3.png)
+
+In the TRL script, we could use `--lora_r` to set the rank and adapt it based on the task and dataset we're training on. The blog post recommends different ranks depending on the task and dataset size, as summarized in the table below.
+
+Reinforcement learning tasks typically require lower capacity, so smaller LoRA ranks can be used. This is because policy gradient algorithms extract roughly 1 bit of information per episode, demanding minimal parameter capacity.
+
+The blog post defines the ideal dataset size for LoRA to match full fine-tuning as "Post-training scale", which we can use to determine the recommended rank for SFT and RL LoRAs:
+
+| Task Type | Dataset Size | Recommended Rank |
+| --- | --- | --- |
+| **SFT** | Post-training scale | 256 |
+| **RL** | Any size | 1-32 |
+
+### 3. *"FullFT and high-rank LoRAs have similar learning curves"*
+
+Counterintuitively, the blog post recommends using a higher learning rate than for full fine-tuning. In the table above, we used 1.0e-5 for LoRA and 1.0e-6 for full fine-tuning. In the TRL script, we could use `--learning_rate` to set the learning rate. The  \\( \frac{1}{r} \\) scaling in LoRA makes the optimal learning rate approximately rank-independent.
+
+![learning rate](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lora_without_regret/2.png)
+
+### 4. *"In some scenarios, LoRA is less tolerant of large batch sizes than full fine-tuning."*
+
+The blog post recommends using an effective batch size < 32 because the authors found LoRA to be less tolerant of large batch sizes. This could not be mitigated by increasing the LoRA rank. In the TRL script, we could use `--per_device_train_batch_size` and `--gradient_accumulation_steps` to set the batch size.
+
+![batch size](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lora_without_regret/4.png)
+
+## Takeaways
+
+Using TRL, you can efficiently implement LoRA adapters to match full fine-tuning performance, applying the core insights (targeting all weight matrices, choosing the right rank, and managing batch size and learning rate) without the heavy compute cost of FullFT.
+
+## Citation
+
+```bibtex
+@article{schulman2025lora,  
+    title        = {{LoRA Without Regret}},  
+    author       = {John Schulman and Thinking Machines Lab},  
+    year         = 2025,  
+    journal      = {Thinking Machines Lab: Connectionism},  
+    doi          = {10.64434/tml.20250929},  
+    note         = {https://thinkingmachines.ai/blog/lora/}  
+}  
+```
--- a/docs/source/models.md
+++ b/docs/source/models.md
@ -8,7 +8,6 @@ With the `AutoModelForCausalLMWithValueHead` class TRL supports all decoder mode

 ## AutoModelForCausalLMWithValueHead

-
 [[autodoc]] AutoModelForCausalLMWithValueHead
    - __init__
    - forward
@ -25,4 +24,4 @@ With the `AutoModelForCausalLMWithValueHead` class TRL supports all decoder mode

 ## create_reference_model

-[[autodoc]] create_reference_model
+[[autodoc]] create_reference_model
--- a/docs/source/multi_adapter_rl.md
+++ b/docs/source/multi_adapter_rl.md
@ -14,11 +14,11 @@ You need to address this approach in three stages that we summarize as follows:
 2- Train a reward model using `peft`. This is required in order to re-use the adapter during the RL optimisation process (step 3 below). We show an example of leveraging the `RewardTrainer` from TRL in [this example](https://github.com/huggingface/trl/tree/main/examples/scripts/reward_modeling.py)
 3- Fine tune new adapters on the base model using PPO and the reward adapter. ("0 abstraction RL")

-Make sure to use the same model (i.e. same architecture and same weights) for the stages 2 & 3. 
+Make sure to use the same model (i.e. same architecture and same weights) for the stages 2 & 3.

 ## Quickstart

-Let us assume you have trained your reward adapter on `llama-7b` model using `RewardTrainer` and pushed the weights on the hub under `trl-lib/llama-7b-hh-rm-adapter`. 
+Let us assume you have trained your reward adapter on `llama-7b` model using `RewardTrainer` and pushed the weights on the hub under `trl-lib/llama-7b-hh-rm-adapter`.
 When doing PPO, before passing the model to `PPOTrainer` create your model as follows:

 ```python
@ -48,6 +48,7 @@ trainer = PPOTrainer(

 ...
 ```
+
 Then inside your PPO training loop, call the `compute_reward_score` method by accessing the `model` attribute from `PPOTrainer`.

 ```python
@ -56,9 +57,9 @@ rewards = trainer.model.compute_reward_score(**inputs)

 ## Advanced usage

-### Control on the adapter name 
+### Control on the adapter name

-If you are familiar with the `peft` library, you know that you can use multiple adapters inside the same model. What you can do is train multiple adapters on the same base model to fine-tune on different policies. 
+If you are familiar with the `peft` library, you know that you can use multiple adapters inside the same model. What you can do is train multiple adapters on the same base model to fine-tune on different policies.
 In this case, you want to be able to control the adapter name you want to activate back, after retrieving the reward. For that, simply pass the appropriate `adapter_name` to `ppo_adapter_name` argument when calling `compute_reward_score`.

 ```python
@ -71,6 +72,7 @@ rewards = trainer.model.compute_reward_score(**inputs, ppo_adapter_name=adapter_

 For more memory efficient fine-tuning, you can load your base model in 8-bit or 4-bit while keeping the adapters in the default precision (float32).
 Just pass the appropriate arguments (i.e. `load_in_8bit=True` or `load_in_4bit=True`) to `AutoModelForCausalLMWithValueHead.from_pretrained` as follows (assuming you have installed `bitsandbytes`):
+
 ```python
 model_name = "llama-7b"
 rm_adapter_id = "trl-lib/llama-7b-hh-rm-adapter"
--- a/docs/source/nash_md_trainer.md
+++ b/docs/source/nash_md_trainer.md
@ -1,16 +1,16 @@
 # Nash-MD Trainer

-[![](https://img.shields.io/badge/All_models-Nash--MD-blue)](https://huggingface.co/models?other=nash-md,trl)
+[![model badge](https://img.shields.io/badge/All_models-Nash--MD-blue)](https://huggingface.co/models?other=nash-md,trl)

 ## Overview

-Nash-MD was proposed in the paper [Nash Learning from Human Feedback](https://huggingface.co/papers/2312.00886) by Rémi Munos, [Michal Valko](https://huggingface.co/misovalko), Daniele Calandriello, Mohammad Gheshlaghi Azar, Mark Rowland, Daniel Guo, Yunhao Tang, Matthieu Geist, Thomas Mésnard, and Andrea Michi. 
+Nash-MD was proposed in the paper [Nash Learning from Human Feedback](https://huggingface.co/papers/2312.00886) by Rémi Munos, [Michal Valko](https://huggingface.co/misovalko), Daniele Calandriello, Mohammad Gheshlaghi Azar, Mark Rowland, Daniel Guo, Yunhao Tang, Matthieu Geist, Thomas Mésnard, and Andrea Michi.

 The abstract from the paper is the following:

 > Reinforcement learning from human feedback (RLHF) has emerged as the main paradigm for aligning large language models (LLMs) with human preferences. Typically, RLHF involves the initial step of learning a reward model from human feedback, often expressed as preferences between pairs of text generations produced by a pre-trained LLM. Subsequently, the LLM's policy is fine-tuned by optimizing it to maximize the reward model through a reinforcement learning algorithm. However, an inherent limitation of current reward models is their inability to fully represent the richness of human preferences and their dependency on the sampling distribution. In this study, we introduce an alternative pipeline for the fine-tuning of LLMs using pairwise human feedback. Our approach entails the initial learning of a preference model, which is conditioned on two inputs given a prompt, followed by the pursuit of a policy that consistently generates responses preferred over those generated by any competing policy, thus defining the Nash equilibrium of this preference model. We term this approach Nash learning from human feedback (NLHF). In the context of a tabular policy representation, we present a novel algorithmic solution, Nash-MD, founded on the principles of mirror descent. This algorithm produces a sequence of policies, with the last iteration converging to the regularized Nash equilibrium. Additionally, we explore parametric representations of policies and introduce gradient descent algorithms for deep-learning architectures. To demonstrate the effectiveness of our approach, we present experimental results involving the fine-tuning of a LLM for a text summarization task. We believe NLHF offers a compelling avenue for preference learning and policy optimization with the potential of advancing the field of aligning LLMs with human preferences.

-This post-training method was contributed by [Kashif Rasul](https://huggingface.co/kashif) and [Daniil Tiapkin](https://huggingface.co/dtiapkin), [Pierre Ménard](https://huggingface.co/menardprr), Daniele Calandriello and [Quentin Gallouédec](https://huggingface.co/qgallouedec). 
+This post-training method was contributed by [Kashif Rasul](https://huggingface.co/kashif) and [Daniil Tiapkin](https://huggingface.co/dtiapkin), [Pierre Ménard](https://huggingface.co/menardprr), Daniele Calandriello and [Quentin Gallouédec](https://huggingface.co/qgallouedec).

 ## Quick start

--- a/docs/source/online_dpo_trainer.md
+++ b/docs/source/online_dpo_trainer.md
@ -1,10 +1,10 @@
 # Online DPO Trainer

-[![](https://img.shields.io/badge/All_models-Online_DPO-blue)](https://huggingface.co/models?other=online-dpo,trl)
+[![model badge](https://img.shields.io/badge/All_models-Online_DPO-blue)](https://huggingface.co/models?other=online-dpo,trl)

-## Overview 
+## Overview

-Online DPO was proposed in [Direct Language Model Alignment from Online AI Feedback](https://huggingface.co/papers/2402.04792) by Shangmin Guo, Biao Zhang, Tianlin Liu, Tianqi Liu, Misha Khalman, Felipe Llinares, Alexandre Rame, Thomas Mesnard, Yao Zhao, Bilal Piot, Johan Ferret, and Mathieu Blondel. 
+Online DPO was proposed in [Direct Language Model Alignment from Online AI Feedback](https://huggingface.co/papers/2402.04792) by Shangmin Guo, Biao Zhang, Tianlin Liu, Tianqi Liu, Misha Khalman, Felipe Llinares, Alexandre Rame, Thomas Mesnard, Yao Zhao, Bilal Piot, Johan Ferret, and Mathieu Blondel.

 The abstract from the paper is the following:

@ -112,7 +112,6 @@ This callback logs the model's generated completions directly to Weights & Biase

 ![Logged Completions](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/wandb_completions.png)

-
 ## Example script

 We provide an example script to train a model using the online DPO method. The script is available in [`examples/scripts/dpo_online.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/dpo_online.py)
@ -153,8 +152,7 @@ While training and evaluating, we record the following reward metrics. Here is a

 To validate the online DPO implementation works, we ran experiments with the Pythia 1B, 2.8B, and 6.9B models on a single node of 8 x H100s. Here are the commands we used to run the experiments. We take the SFT / RM models directly from [The N+ Implementation Details of RLHF with PPO: A Case Study on TL;DR Summarization](https://huggingface.co/papers/2403.17031).

-
-```
+```shell
 # 1B Online DPO experiment
 accelerate launch --config_file examples/accelerate_configs/multi_gpu.yaml \
    examples/scripts/dpo_online.py \
@ -213,9 +211,8 @@ accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml

 Checkpoints and experiment tracking are available at:

- [🤗 Model checkpoints](https://huggingface.co/collections/trl-lib/online-dpo-66acd3fa38a331a9cd457b07)
- [🐝 Tracked experiment](https://wandb.ai/huggingface/trl/reports/Online-DPO-experiments-for-TL-DR-summarisation--Vmlldzo5MTczMDU0)
-
+* [🤗 Model checkpoints](https://huggingface.co/collections/trl-lib/online-dpo-66acd3fa38a331a9cd457b07)
+* [🐝 Tracked experiment](https://wandb.ai/huggingface/trl/reports/Online-DPO-experiments-for-TL-DR-summarisation--Vmlldzo5MTczMDU0)

 To evaluate, we use [vLLM](https://github.com/vllm-project/vllm) to load the checkpoints and GPT-4o mini as a judge model to evaluate the generated TL;DR against the reference TL;DR.
 For more information on how to use judges, see [Judges](judges).
--- a/docs/source/orpo_trainer.md
+++ b/docs/source/orpo_trainer.md
@ -1,6 +1,6 @@
 # ORPO Trainer

-[![](https://img.shields.io/badge/All_models-ORPO-blue)](https://huggingface.co/models?other=orpo,trl) [![](https://img.shields.io/badge/smol_course-Chapter_2-yellow)](https://github.com/huggingface/smol-course/tree/main/2_preference_alignment)
+[![model badge](https://img.shields.io/badge/All_models-ORPO-blue)](https://huggingface.co/models?other=orpo,trl) [![model badge](https://img.shields.io/badge/smol_course-Chapter_2-yellow)](https://github.com/huggingface/smol-course/tree/main/2_preference_alignment)

 ## Overview

@ -54,7 +54,7 @@ accelerate launch train_orpo.py

 Distributed across 8 GPUs, the training takes approximately 30 minutes. You can verify the training progress by checking the reward graph. An increasing trend in the reward margin indicates that the model is improving and generating better responses over time.

-![](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/orpo-qwen2-reward-margin.png)
+![orpo qwen2 reward margin](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/orpo-qwen2-reward-margin.png)

 To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-ORPO) performs, you can use the [Transformers Chat CLI](https://huggingface.co/docs/transformers/quicktour#chat-with-text-generation-models).

@ -64,11 +64,11 @@ What is the best programming language?

 <strong><span style="color: blue;">&lt;trl-lib/Qwen2-0.5B-ORPO&gt;:</span></strong>
 It's challenging to determine the best programming language as no one language is perfect, as the complexity of a task and the type of project are significant factors. Some popular languages include Java, Python, JavaScript, and
-C++. If you have specific needs or requirements for a specific project, it's important to choose the language that best suits those needs.                                                                                          
+C++. If you have specific needs or requirements for a specific project, it's important to choose the language that best suits those needs.

 Here are some other factors to consider when choosing a programming language for a project:

- <strong><span style="color: green;">• Language proficiency:</span></strong> A good programming language is more likely to be easy to understand and use, and will allow developers to collaborate on projects more efficiently.                                     
+ <strong><span style="color: green;">• Language proficiency:</span></strong> A good programming language is more likely to be easy to understand and use, and will allow developers to collaborate on projects more efficiently.
 <strong><span style="color: green;">• Ease of use:</span></strong> There are tools and libraries available to make programming more accessible, so developers should choose a language that can help them get started easier.
 <strong><span style="color: green;">• Code readability:</span></strong> A clear and concise codebase should be easy to read and understand, especially when working with large projects.
 <strong><span style="color: green;">• Tool and framework support:</span></strong> There are numerous libraries available for Python, Java, and JavaScript, along with tools like IDEs and static code analysis tools.
@ -118,7 +118,7 @@ While training and evaluating, we record the following reward metrics:
 - `log_odds_chosen`: the mean log odds ratio of the chosen responses over the rejected responses
 - `log_odds_ratio`: the mean of the `log(sigmoid(log_odds_chosen))`
 - `nll_loss`: the mean negative log likelihood loss from the SFT part of the loss over chosen responses
- 
+
 ## ORPOTrainer

 [[autodoc]] ORPOTrainer
--- a/docs/source/paper_index.md
+++ b/docs/source/paper_index.md
@ -170,7 +170,7 @@ $$
 }
 $$

-Despite  \\( \textcolor{red}{\pi_{\text{inference}}} \\) and  \\( \textcolor{blue}{\pi_{\text{training}}} \\) sharing the same model parameters  \\( \theta \\), they can produce significantly different token probabilities. This unexpected behavior implicitly breaks the on-policy assumption, and silently turns training off-policy. 
+Despite  \\( \textcolor{red}{\pi_{\text{inference}}} \\) and  \\( \textcolor{blue}{\pi_{\text{training}}} \\) sharing the same model parameters  \\( \theta \\), they can produce significantly different token probabilities. This unexpected behavior implicitly breaks the on-policy assumption, and silently turns training off-policy.

 Truncated Importance Sampling (TIS) addresses this issue by adapting the model update via importance-sampling correction. The gradient computation of the aforementioned PPO objective becomes

@ -338,7 +338,7 @@ training_args = DPOConfig(
 )
 ```

-For the unpaired version, the user should utilize `BCOConfig` and `BCOTrainer`.
+For the unpaired version, the user should utilize [`BCOConfig`] and [`BCOTrainer`].

 ### Self-Play Preference Optimization for Language Model Alignment

@ -458,10 +458,7 @@ trainer = SFTTrainer(
 Dynamic Fine-Tuning (DFT) improves the generalization of Large Language Models (LLMs) by dynamically rescaling gradients, outperforming standard Supervised Fine-Tuning (SFT) and showing competitive results in offline reinforcement learning.

 $$
-\mathcal{L}_{\text{DFT}}(\theta) 
-= \mathbb{E}_{(x,y) \sim \mathcal{D}} \left[ - \sum_{t=1}^{|y|} 
-\textcolor{red}{\text{sg}\big(\pi_\theta(y_t \mid y_{<t}, x)\big)} 
-\; \log \pi_\theta(y_t \mid y_{<t}, x) \right]
+\mathcal{L}_{\text{DFT}}(\theta) = \mathbb{E}_{(x,y) \sim \mathcal{D}} \left[ - \sum_{t=1}^{|y|} \textcolor{red}{\text{sg}\big(\pi_\theta(y_t \mid y_{<t}, x)\big)} \; \log \pi_\theta(y_t \mid y_{<t}, x) \right]
 $$

 where  \\( \text{sg}(\cdot) \\) is the stop-gradient operator. To use DFT with SFT as described in the paper, you can use the `loss_type="dft"` argument:
--- a/docs/source/peft_integration.md
+++ b/docs/source/peft_integration.md
@ -3,17 +3,10 @@
 The notebooks and scripts in these examples show how to use Low Rank Adaptation (LoRA) to fine-tune models in a memory efficient manner. Most of PEFT methods supported in peft library but note that some PEFT methods such as Prompt tuning are not supported.
 For more information on LoRA, see the [original paper](https://huggingface.co/papers/2106.09685).

-Here's an overview of the `peft`-enabled notebooks and scripts in the [trl repository](https://github.com/huggingface/trl/tree/main/examples):
-
-| File | Task | Description | Colab link |
-|---|---| --- |
-| [`stack_llama/rl_training.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama/scripts/rl_training.py) | RLHF | Distributed fine-tuning of the 7b parameter LLaMA models with a learned reward model and `peft`. |  |
-| [`stack_llama/reward_modeling.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama/scripts/reward_modeling.py) | Reward Modeling | Distributed training of the 7b parameter LLaMA reward model with `peft`. |  |
-| [`stack_llama/supervised_finetuning.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama/scripts/supervised_finetuning.py) | SFT | Distributed instruction/supervised fine-tuning of the 7b parameter LLaMA model with `peft`. |  |
-
 ## Installation
+
 Note: peft is in active development, so we install directly from their Github page.
-Peft also relies on the latest version of transformers. 
+Peft also relies on the latest version of transformers.

 ```bash
 pip install trl[peft]
@ -27,7 +20,7 @@ Note: if you don't want to log with `wandb` remove `log_with="wandb"` in the scr

 ## How to use it?

-Simply declare a `PeftConfig` object in your script and pass it through `.from_pretrained` to load the TRL+PEFT model. 
+Simply declare a [`~peft.PeftConfig`] object in your script and pass it through `.from_pretrained` to load the TRL+PEFT model.

 ```python
 from peft import LoraConfig
@ -47,7 +40,9 @@ model = AutoModelForCausalLMWithValueHead.from_pretrained(
    peft_config=lora_config,
 )
 ```
+
 And if you want to load your model in 8bit precision:
+
 ```python
 pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    config.model_name, 
@ -55,7 +50,9 @@ pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    peft_config=lora_config,
 )
 ```
+
 ... or in 4bit precision:
+
 ```python
 pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    config.model_name, 
@ -64,7 +61,6 @@ pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
 )
 ```

-
 ## Launch scripts

 The `trl` library is powered by `accelerate`. As such it is best to configure and launch trainings with the following commands:
@ -77,6 +73,7 @@ accelerate launch examples/scripts/ppo.py --use_peft # launch`es training
 ## Using `trl` + `peft` and Data Parallelism

 You can scale up to as many GPUs as you want, as long as you are able to fit the training process in a single device. The only tweak you need to apply is to load the model as follows:
+
 ```python
 from peft import LoraConfig
 ...
@ -94,7 +91,9 @@ pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    peft_config=lora_config,
 )
 ```
+
 And if you want to load your model in 8bit precision:
+
 ```python
 pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    config.model_name, 
@ -102,7 +101,9 @@ pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    load_in_8bit=True,
 )
 ```
+
 ... or in 4bit precision:
+
 ```python
 pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    config.model_name, 
@ -110,21 +111,20 @@ pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
    load_in_4bit=True,
 )
 ```
+
 Finally, make sure that the rewards are computed on correct device as well, for that you can use `ppo_trainer.model.current_device`.

 ## Naive pipeline parallelism (NPP) for large models (>60B models)

-The `trl` library also supports naive pipeline parallelism (NPP) for large models (>60B models). This is a simple way to parallelize the model across multiple GPUs. 
+The `trl` library also supports naive pipeline parallelism (NPP) for large models (>60B models). This is a simple way to parallelize the model across multiple GPUs.
 This paradigm, termed as "Naive Pipeline Parallelism" (NPP) is a simple way to parallelize the model across multiple GPUs. We load the model and the adapters across multiple GPUs and the activations and gradients will be naively communicated across the GPUs. This supports `int8` models as well as other `dtype` models.

-<div style="text-align: center">
-<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-npp.png">
-</div>
+![NPP](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-npp.png)

 ### How to use NPP?

-Simply load your model with a custom `device_map` argument on the `from_pretrained` to split your model across multiple devices. Check out this [nice tutorial](https://github.com/huggingface/blog/blob/main/accelerate-large-models.md) on how to properly create a `device_map` for your model. 
- 
+Simply load your model with a custom `device_map` argument on the `from_pretrained` to split your model across multiple devices. Check out this [nice tutorial](https://github.com/huggingface/blog/blob/main/accelerate-large-models.md) on how to properly create a `device_map` for your model.
+
 Also make sure to have the `lm_head` module on the first GPU device as it may throw an error if it is not on the first device. As this time of writing, you need to install the `main` branch of `accelerate`: `pip install git+https://github.com/huggingface/accelerate.git@main` and `peft`: `pip install git+https://github.com/huggingface/peft.git@main`.

 ### Launch scripts
--- a/docs/source/ppo_trainer.md
+++ b/docs/source/ppo_trainer.md
@ -1,10 +1,11 @@
 # PPO Trainer

-[![](https://img.shields.io/badge/All_models-PPO-blue)](https://huggingface.co/models?other=ppo,trl)
+[![model badge](https://img.shields.io/badge/All_models-PPO-blue)](https://huggingface.co/models?other=ppo,trl)

 TRL supports training LLMs with [Proximal Policy Optimization (PPO)](https://huggingface.co/papers/1707.06347).

 References:
+
 - [Fine-Tuning Language Models from Human Preferences](https://github.com/openai/lm-human-preferences)
 - [Learning to Summarize from Human Feedback](https://github.com/openai/summarize-from-feedback)
 - [The N Implementation Details of RLHF with PPO](https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo)
@ -31,49 +32,45 @@ python examples/scripts/ppo/ppo.py \
    --missing_eos_penalty 1.0
 ```

-
 ## Explanation of the logged metrics

 The logged metrics are as follows. Here is an example [tracked run at Weights and Biases](https://wandb.ai/huggingface/trl/runs/dd2o3g35)

-* `eps`: Tracks the number of episodes per second.
-* `objective/kl`: The mean Kullback-Leibler (KL) divergence between the current policy and reference policy.
-* `objective/entropy`: The mean entropy of the policy, indicating the randomness of the actions chosen by the policy.
-* `objective/non_score_reward`: The mean reward from non-score-related sources, basically `beta * kl.sum(1)`, where `beta` is the KL penalty coefficient and `kl` is the per-token KL divergence.
-* `objective/rlhf_reward`: The mean RLHF reward, which is `score - non_score_reward`.
-* `objective/scores`: The mean scores returned by the reward model / environment.
-* `policy/approxkl_avg`: The average approximate KL divergence between consecutive PPO policies. Note that this is not the same as `objective/kl`.
-* `policy/clipfrac_avg`: The average fraction of policy updates that are clipped, indicating how often the policy updates are constrained to prevent large changes.
-* `loss/policy_avg`: The average policy loss, indicating how well the policy is performing.
-* `loss/value_avg`: The average value loss, indicating the difference between the predicted value and the actual reward.
-* `val/clipfrac_avg`: The average fraction of value function updates that are clipped, similar to policy/clipfrac_avg but for the value function.
-* `policy/entropy_avg`: The average entropy of the policy during training, indicating how diverse the policy's actions are.
-* `val/ratio`: The mean ratio of the current policy probability to the old policy probability, providing a measure of how much the policy has changed.
-* `val/ratio_var`: The variance of the `val/ratio`, indicating the variability in policy changes.
-* `val/num_eos_tokens`: The number of end-of-sequence (EOS) tokens generated, which can indicate the number of complete responses.
-* `lr`: lr: The current learning rate used by the optimizer.
-* `episode`: episode: The current episode count in the training process.
-
+- `eps`: Tracks the number of episodes per second.
+- `objective/kl`: The mean Kullback-Leibler (KL) divergence between the current policy and reference policy.
+- `objective/entropy`: The mean entropy of the policy, indicating the randomness of the actions chosen by the policy.
+- `objective/non_score_reward`: The mean reward from non-score-related sources, basically `beta * kl.sum(1)`, where `beta` is the KL penalty coefficient and `kl` is the per-token KL divergence.
+- `objective/rlhf_reward`: The mean RLHF reward, which is `score - non_score_reward`.
+- `objective/scores`: The mean scores returned by the reward model / environment.
+- `policy/approxkl_avg`: The average approximate KL divergence between consecutive PPO policies. Note that this is not the same as `objective/kl`.
+- `policy/clipfrac_avg`: The average fraction of policy updates that are clipped, indicating how often the policy updates are constrained to prevent large changes.
+- `loss/policy_avg`: The average policy loss, indicating how well the policy is performing.
+- `loss/value_avg`: The average value loss, indicating the difference between the predicted value and the actual reward.
+- `val/clipfrac_avg`: The average fraction of value function updates that are clipped, similar to `policy/clipfrac_avg` but for the value function.
+- `policy/entropy_avg`: The average entropy of the policy during training, indicating how diverse the policy's actions are.
+- `val/ratio`: The mean ratio of the current policy probability to the old policy probability, providing a measure of how much the policy has changed.
+- `val/ratio_var`: The variance of the `val/ratio`, indicating the variability in policy changes.
+- `val/num_eos_tokens`: The number of end-of-sequence (EOS) tokens generated, which can indicate the number of complete responses.
+- `lr`: The current learning rate used by the optimizer.
+- `episode`: The current episode count in the training process.

 ## Cookbook

-* Debugging TIP: `objective/rlhf_reward`: this is the ultimate objective of the RLHF training. If training works as intended, this metric should keep going up.
-* Debugging TIP: `val/ratio`: this number should float around 1.0, and it gets clipped by `--cliprange 0.2` with PPO's surrogate loss. So if this `ratio` is too high like 2.0 or 1000.0 or too small like 0.1, it means the updates between consecutive policies are too drastic. You should try understand why this is happening and try to fix it.
-* Memory TIP: If you are running out of memory, you can try to reduce the `--per_device_train_batch_size` or increase the `--gradient_accumulation_steps` to reduce the memory footprint.
-* Memory TIP: If you have multiple GPUs, you can also run training with DeepSpeed stage 3 to reduce the memory footprint `accelerate launch --config_file examples/accelerate_configs/deepspeed_zero3.yaml`.
-* Usage TIP: We recommend to use the "EOS trick" via `--missing_eos_penalty`, which subtracts a static scalar penalty from the score of completions that do not end with an EOS token. This can help the model learn to generate more coherent completions.
-
+- Debugging TIP: `objective/rlhf_reward`: this is the ultimate objective of the RLHF training. If training works as intended, this metric should keep going up.
+- Debugging TIP: `val/ratio`: this number should float around 1.0, and it gets clipped by `--cliprange 0.2` with PPO's surrogate loss. So if this `ratio` is too high like 2.0 or 1000.0 or too small like 0.1, it means the updates between consecutive policies are too drastic. You should try to understand why this is happening and try to fix it.
+- Memory TIP: If you are running out of memory, you can try to reduce the `--per_device_train_batch_size` or increase the `--gradient_accumulation_steps` to reduce the memory footprint.
+- Memory TIP: If you have multiple GPUs, you can also run training with DeepSpeed stage 3 to reduce the memory footprint `accelerate launch --config_file examples/accelerate_configs/deepspeed_zero3.yaml`.
+- Usage TIP: We recommend using the "EOS trick" via `--missing_eos_penalty`, which subtracts a static scalar penalty from the score of completions that do not end with an EOS token. This can help the model learn to generate more coherent completions.

 ## What is my model doing exactly?

 To help you understand what your model is doing, we periodically log some sample completions from the model. Here is an example of a completion. In an example [tracked run at Weights and Biases](https://wandb.ai/huggingface/trl/runs/dd2o3g35), it looks like the following, allowing you to see the model's response at different stages of training. By default we generate `--num_sample_generations 10` during training, but you can customize the number of generations.

-![](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/ppov2_completions.gif)
+![ppov2_completions](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/ppov2_completions.gif)

+In the logs, the sampled generations look like this:

-In the logs the sampled generations look like 
-
-```
+```txt
 ┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓
 ┃ query                           ┃ model response                  ┃ score    ┃
 ┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩
@ -177,7 +174,7 @@ This PPO implementation is based on the [The N+ Implementation Details of RLHF w

 To validate the PPO implementation works, we ran experiment on the 1B model. Here are the command we used to run the experiment. We take the SFT / RM models directly from [The N+ Implementation Details of RLHF with PPO: A Case Study on TL;DR Summarization](https://huggingface.co/papers/2403.17031).

-```
+```shell
 accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml \
    examples/scripts/ppo/ppo_tldr.py \
    --output_dir models/minimal/ppo_tldr \
@ -212,8 +209,7 @@ The PPO checkpoint gets a 64.7% preferred rate vs the 33.0% preference rate of t

 Metrics:

-![](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/ppov2.png)
-
+![PPO v2](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/ppov2.png)

 ```bash
 # pip install openrlbenchmark==0.2.1a5
--- a/docs/source/prm_trainer.md
+++ b/docs/source/prm_trainer.md
@ -1,6 +1,6 @@
 # PRM Trainer

-[![](https://img.shields.io/badge/All_models-PRM-blue)](https://huggingface.co/models?other=prm,trl)
+[![model badge](https://img.shields.io/badge/All_models-PRM-blue)](https://huggingface.co/models?other=prm,trl)

 > [!WARNING]
 > PRM Trainer is an experimental API which is subject to change at any time.
@ -15,7 +15,6 @@ The abstract from the paper is the following:

 This post-training method was contributed by [Gaetan Lopez](https://github.com/gaetanlop), [Lewis Tunstall](https://huggingface.co/lewtun), [Quentin Gallouédec](https://huggingface.co/qgallouedec) and [Agustín Piqueres](https://huggingface.co/plaguss).

-
 ## Quick start

 This example demonstrates how to train a model using the PRM method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B) as the base model. We use the stepwise supervision data from the [Math Shepherd dataset](https://huggingface.co/datasets/trl-lib/math_shepherd). You can view the data in the dataset here:
@ -54,7 +53,6 @@ Distributed across 8 GPUs, the training takes approximately 1 hour.

 To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-Reward-Math-Sheperd) performs, you can use the following script.

-
 ```python
 from datasets import load_dataset
 from transformers import pipeline
--- a/docs/source/reducing_memory_usage.md
+++ b/docs/source/reducing_memory_usage.md
@ -7,9 +7,7 @@

 Sequence lengths in the dataset can vary widely. When data is batched, sequences are padded to match the longest one in the batch, which can cause high memory usage, even if most sequences are relatively short.

-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/why_you_should_truncate.png" alt="Truncation prompt-completion" width="600"/>
-</div>
+![Truncation prompt-completion](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/why_you_should_truncate.png)

 To reduce memory usage, it's important to truncate sequences to a reasonable length. While TRL trainers truncate sequences by default, you may want to adjust the default truncation length to better align with your specific use case.

@ -18,9 +16,7 @@ To reduce memory usage, it's important to truncate sequences to a reasonable len

 DPO truncation is applied first to the prompt and to the completion via the `max_prompt_length` and `max_completion_length` parameters. The `max_length` parameter is then used to truncate the resulting sequence.

-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/truncation_prompt_completion.png" alt="DPO truncation" width="600"/>
-</div>
+![DPO truncation](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/truncation_prompt_completion.png)

 To set the truncation parameters, use the following code snippet:

@ -43,9 +39,7 @@ training_args = DPOConfig(..., max_completion_length=...)

 SFT truncation is applied to the input sequence via the `max_length` parameter.

-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/truncation_input_ids.png" alt="Truncation input ids" width="600"/>
-</div>
+![Truncation input ids](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/truncation_input_ids.png)

 To set the truncation parameter, use the following code snippet:

@ -71,21 +65,19 @@ To help you choose an appropriate value, we provide a utility to visualize the s
 > [!TIP]
 > This technique applies only to SFT.

-
 [Truncation](#truncation) has several drawbacks:
+
 1. **Loss of information**: Key data at the end of a sequence may be discarded.
 2. **Choosing truncation length**: Too short loses data; too long undermines efficiency.

 Packing, introduced in [Raffel et al., 2020](https://huggingface.co/papers/1910.10683), addresses these issues by grouping sequences instead of truncating. It concatenates and splits dataset sequences into the desired lengths.

-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/packing_2.png" alt="Packing" width="600"/>
-</div>
+![Packing](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/packing_2.png)

 Packing reduces padding by merging several sequences in one row when possible. We use an advanced method to be near-optimal in the way we pack the dataset. To enable packing, use `packing=True` in the [`SFTConfig`].

 > [!TIP]
-> In TRL 0.18 and earlier, packing used a more aggressive method that reduced padding to almost nothing, but had the downside of breaking sequence continuity for a large fraction of the dataset. To revert to this strategy, use `packing_strategy="wrapped"` in `SFTConfig`.
+> In TRL 0.18 and earlier, packing used a more aggressive method that reduced padding to almost nothing, but had the downside of breaking sequence continuity for a large fraction of the dataset. To revert to this strategy, use `packing_strategy="wrapped"` in [`SFTConfig`].

 ```python
 from trl import SFTConfig
@ -142,9 +134,7 @@ training_args = KTOConfig(..., use_liger_loss=True)

 Padding-free batching is an alternative approach for reducing memory usage. In this method, a batch is first sampled and then flattened into a single sequence, avoiding padding. Unlike packing, which can result in incomplete sequences by combining parts of different samples, padding-free batching ensures that all sequences remain complete and intact.

-<div class="flex justify-center">
-    <img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/padding-free.png" alt="Padding-free batching" width="600"/>
-</div>
+![Padding-free batching](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/padding-free.png)

 > [!WARNING]
 > It's highly recommended to use padding-free batching with **FlashAttention 2** or **FlashAttention 3**. Otherwise, you may encounter batch contamination issues.
--- a/docs/source/reward_trainer.md
+++ b/docs/source/reward_trainer.md
@ -1,6 +1,6 @@
 # Reward Modeling

-[![](https://img.shields.io/badge/All_models-Reward_Trainer-blue)](https://huggingface.co/models?other=reward-trainer,trl)
+[![model badge](https://img.shields.io/badge/All_models-Reward_Trainer-blue)](https://huggingface.co/models?other=reward-trainer,trl)

 ## Overview

--- a/docs/source/rloo_trainer.md
+++ b/docs/source/rloo_trainer.md
@ -1,6 +1,6 @@
 # RLOO Trainer

-[![](https://img.shields.io/badge/All_models-RLOO-blue)](https://huggingface.co/models?other=rloo,trl)
+[![model badge](https://img.shields.io/badge/All_models-RLOO-blue)](https://huggingface.co/models?other=rloo,trl)

 ## Overview

@ -101,14 +101,13 @@ where  \\( \beta > 0 \\) controls the strength of the KL penalty.

 ### Computing the advantage

-Once the rewards for each completion have been computed, we calculate a baseline as the average reward of all other samples in the same batch, excluding the current sample. This baseline is used to reduce the variance of the policy gradient estimate. The advantage for each completion is then obtained as the difference between its own reward and this leave-one-out baseline. 
+Once the rewards for each completion have been computed, we calculate a baseline as the average reward of all other samples in the same batch, excluding the current sample. This baseline is used to reduce the variance of the policy gradient estimate. The advantage for each completion is then obtained as the difference between its own reward and this leave-one-out baseline.

 Formally, for a batch of G completions, the baseline for completion is:
 $$
 b_i = \frac{1}{G-1} \sum_{j \neq i} r_j
 $$

-
 and then the advantage for each completion is computed as the difference between its reward and the baseline:

 $$
@ -151,9 +150,9 @@ While training and evaluating, we record the following reward metrics:
 - `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, masked sequences tokens are excluded.)
 - `kl`: The average KL divergence between the model and the reference model, calculated over generated completions. Logged only if `beta` is nonzero.
 - `clip_ratio/region_mean`: The ratio of sequence probabilities where the RLOO objective is clipped to stay within the trust region:
-$$
-\text{clip}\left( r_{i}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \qquad r_{i}(\theta) = \frac{\pi_\theta(o_{i} \mid q)}{\pi_{\theta_{\text{old}}}(o_{i} \mid q)}\,.
-$$
+  $$
+  \text{clip}\left( r_{i}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \qquad r_{i}(\theta) = \frac{\pi_\theta(o_{i} \mid q)}{\pi_{\theta_{\text{old}}}(o_{i} \mid q)}\,.
+  $$

    A higher value means more samples are clipped, which constrains how much the policy $\pi_\theta$ can change.
 - `clip_ratio/low_mean`: The average ratio of sequence probabilities that were clipped on the lower bound of the trust region:  \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\)
@ -166,6 +165,7 @@ $$
 ### Speed up training with vLLM-powered generation

 Generation is often the main bottleneck when training with online methods. To accelerate generation, you can use [vLLM](https://github.com/vllm-project/vllm), a high-throughput, low-latency inference engine for LLMs. To enable it, first install the package with
+
 ```shell
 pip install trl[vllm]
 ```
@ -177,11 +177,13 @@ We support two ways of using vLLM during training: **server mode** and **colocat
 In this mode, vLLM runs in a separate process (and using separate GPUs) and communicates with the trainer via HTTP. This is ideal if you have dedicated GPUs for inference.

 1. **Start the vLLM server**:
+
   ```bash
   trl vllm-serve --model <model_name>
   ```

 2. **Enable server mode in your training script**:
+
   ```python
   from trl import RLOOConfig

@ -214,12 +216,7 @@ training_args = RLOOConfig(
 >
 > We provide a [HF Space](https://huggingface.co/spaces/trl-lib/recommend-vllm-memory) to help estimate the recommended GPU memory utilization based on your model configuration and experiment settings. Simply use it as follows to get `vllm_gpu_memory_utilization` recommendation:
 >
-> <iframe
-> 	src="https://trl-lib-recommend-vllm-memory.hf.space"
-> 	frameborder="0"
-> 	width="850"
-> 	height="450"
-> ></iframe>
+> <iframe src="https://trl-lib-recommend-vllm-memory.hf.space" frameborder="0" width="850" height="450"></iframe>
 >
 > If the recommended value does not work in your environment, we suggest adding a small buffer (e.g., +0.05 or +0.1) to the recommended value to ensure stability.
 >
@ -418,6 +415,7 @@ You can test this function as follows:
 >>> reward_func(prompts=prompts, completions=completions, ground_truth=ground_truth)
 [1.0, 0.0]
 ```
+
 #### Example 4: Multi-task reward functions

 Below is an example of using multiple reward functions in the [`RLOOTrainer`]. In this example, we define two task-specific reward functions: `math_reward_func` and `coding_reward_func`. The `math_reward_func` rewards math problems based on their correctness, while the `coding_reward_func` rewards coding problems based on whether the solution works.
@ -478,8 +476,6 @@ In this example, the `math_reward_func` and `coding_reward_func` are designed to

 Note that the [`RLOOTrainer`] will ignore the `None` rewards returned by the reward functions and only consider the rewards returned by the relevant functions. This ensures that the model is trained on the relevant tasks and ignores the tasks for which there is no relevant reward function.

-
-
 #### Passing the reward function to the trainer

 To use your custom reward function, pass it to the [`RLOOTrainer`] as follows:
--- a/docs/source/sentiment_tuning.md
+++ b/docs/source/sentiment_tuning.md
@ -4,15 +4,11 @@ The notebooks and scripts in these examples show how to fine-tune a model with a

 Here's an overview of the notebooks and scripts in the [trl repository](https://github.com/huggingface/trl/tree/main/examples):

-
-
-| File                                                                                           | Description                                                                                                              |
-|------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------|
-| [`examples/scripts/ppo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo.py)  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/sentiment/notebooks/gpt2-sentiment.ipynb) | This script shows how to use the `PPOTrainer` to fine-tune a sentiment analysis model using IMDB dataset                 |
-| [`examples/notebooks/gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb)              | This notebook demonstrates how to reproduce the GPT2 imdb sentiment tuning example on a jupyter notebook.                |
-| [`examples/notebooks/gpt2-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-control.ipynb)   [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/sentiment/notebooks/gpt2-sentiment-control.ipynb)                | This notebook demonstrates how to reproduce the GPT2 sentiment control example on a jupyter notebook.    
-
-
+| File | Description |
+| --- | --- |
+| [`examples/scripts/ppo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo.py)  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/sentiment/notebooks/gpt2-sentiment.ipynb) | This script shows how to use the `PPOTrainer` to fine-tune a sentiment analysis model using IMDB dataset |
+| [`examples/notebooks/gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb) | This notebook demonstrates how to reproduce the GPT2 imdb sentiment tuning example on a jupyter notebook. |
+| [`examples/notebooks/gpt2-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-control.ipynb)   [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/sentiment/notebooks/gpt2-sentiment-control.ipynb) | This notebook demonstrates how to reproduce the GPT2 sentiment control example on a jupyter notebook. |

 ## Usage

@ -30,7 +26,6 @@ python examples/scripts/ppo.py --log_with wandb --mini_batch_size 1 --gradient_a

 Note: if you don't want to log with `wandb` remove `log_with="wandb"` in the scripts/notebooks. You can also replace it with your favourite experiment tracker that's [supported by `accelerate`](https://huggingface.co/docs/accelerate/usage_guides/tracking).

+## Few notes on multi-GPU

-## Few notes on multi-GPU 
-
-To run in multi-GPU setup with DDP (distributed Data Parallel) change the `device_map` value to `device_map={"": Accelerator().process_index}` and make sure to run your script with `accelerate launch yourscript.py`. If you want to apply naive pipeline parallelism you can use `device_map="auto"`.
+To run in multi-GPU setup with DDP (distributed Data Parallel) change the `device_map` value to `device_map={"": Accelerator().process_index}` and make sure to run your script with `accelerate launch yourscript.py`. If you want to apply naive pipeline parallelism you can use `device_map="auto"`.
--- a/docs/source/sft_trainer.md
+++ b/docs/source/sft_trainer.md
@ -106,7 +106,6 @@ $$
 where  \\( y_t \\) is the target token at timestep  \\( t \\), and the model is trained to predict the next token given the previous ones. In practice, padding tokens are masked out during loss computation.

 > [!TIP]
-> 
 > [On the Generalization of SFT: A Reinforcement Learning Perspective with Reward Rectification](https://huggingface.co/papers/2508.05629) proposes an alternative loss function, called **Dynamic Fine-Tuning (DFT)**, which aims to improve generalization by rectifying the reward signal. This method can be enabled by setting `loss_type="dft"` in the [`SFTConfig`]. For more details, see [Paper Index - Dynamic Fine-Tuning](paper_index#on-the-generalization-of-sft-a-reinforcement-learning-perspective-with-reward-rectification).

 ### Label shifting and masking
--- a/docs/source/speeding_up_training.md
+++ b/docs/source/speeding_up_training.md
@ -48,11 +48,13 @@ You can customize the server configuration by passing additional arguments. For
 > When using vLLM, ensure that the GPUs assigned for training and generation are separate to avoid resource conflicts. For instance, if you plan to use 4 GPUs for training and another 4 for vLLM generation, you can specify GPU allocation using `CUDA_VISIBLE_DEVICES`.  
 >
 > Set GPUs **0-3** for vLLM generation:  
+>
 > ```sh
 > CUDA_VISIBLE_DEVICES=0,1,2,3 trl vllm-serve --model <model_name>
 > ```  
 >
-> And GPUs **4-7** for training:  
+> And GPUs **4-7** for training:
+>
 > ```sh
 > CUDA_VISIBLE_DEVICES=4,5,6,7 accelerate launch train.py
 > ```
@ -79,12 +81,14 @@ You can customize the server configuration by passing additional arguments. For
 > [!WARNING]
 > When using vLLM, ensure that the GPUs assigned for training and generation are separate to avoid resource conflicts. For instance, if you plan to use 4 GPUs for training and another 4 for vLLM generation, you can specify GPU allocation using `CUDA_VISIBLE_DEVICES`.  
 >
-> Set GPUs **0-3** for vLLM generation:  
+> Set GPUs **0-3** for vLLM generation:
+>
 > ```sh
 > CUDA_VISIBLE_DEVICES=0,1,2,3 trl vllm-serve --model <model_name>
 > ```  
 >
-> And GPUs **4-7** for training:  
+> And GPUs **4-7** for training:
+>
 > ```sh
 > CUDA_VISIBLE_DEVICES=4,5,6,7 accelerate launch train.py
 > ```
--- a/docs/source/using_llama_models.md
+++ b/docs/source/using_llama_models.md
@ -43,7 +43,6 @@ To use the data efficiently, we use a technique called packing: instead of havin
 With this approach the training is much more efficient as each token that is passed through the model is also trained in contrast to padding tokens which are usually masked from the loss.
 If you don't have much data and are more concerned about occasionally cutting off some tokens that are overflowing the context you can also use a classical data loader.

-
 ```python
 # load model in 8bit
 model = AutoModelForCausalLM.from_pretrained(
@ -109,6 +108,7 @@ peft_config = LoraConfig(
    lora_dropout=0.1,
 )
 ```
+
 As detailed in the next section, the resulting adapter can be merged into the frozen model and saved for further downstream use.

 ## Reinforcement Learning from Human Feedback
--- a/docs/source/vllm_integration.md
+++ b/docs/source/vllm_integration.md
@ -1,16 +1,26 @@
 # vLLM Integration

-This document will guide you through the process of using vLLM with TRL for faster generation in online methods like GRPO and Online DPO. We first summarize a tl;dr on how to use vLLM with TRL, and then we will go into the details of how it works under the hood. Let's go! 🔥
+This document will guide you through the process of using vLLM with TRL for faster generation in online methods like GRPO and Online DPO. We first summarize a tl;dr on how to use vLLM with TRL, and then we will go into the details of how it works under the hood.

 > [!WARNING]
-> TRL currently only supports vLLM versions `0.10.0`, `0.10.1`, and `0.10.2`. Please ensure you have one of these versions installed to avoid compatibility issues.
+> TRL currently only supports vLLM version `0.10.2`. Please ensure you have this version installed to avoid compatibility issues.
+
+> [!TIP]
+> The following trainers currently support generation with vLLM:
+>
+> - [`GRPOTrainer`]
+> - [`OnlineDPOTrainer`]
+> - [`NashMDTrainer`]
+> - [`XPOTrainer`]
+> - [`RLOOTrainer`]

 ## 🚀 How can I use vLLM with TRL to speed up training?

 💡 **Note**: Resources required for this specific example: a single node with 8 GPUs.

 > [!WARNING]
-> vLLM server and TRL trainer must use different CUDA devices to avoid conflicts.
+> When using vLLM with TRL, the **vLLM server** and the **trainer** must run on **separate CUDA devices** to prevent conflicts.
+> For guidance on configuring this properly, see [Modes of using vLLM during training](#modes-of-using-vllm-during-training).

 First, install vLLM using the following command:

@ -24,12 +34,15 @@ Then run the server on specific GPUs (e.g., GPUs 0-3):
 CUDA_VISIBLE_DEVICES=0,1,2,3 trl vllm-serve --model Qwen/Qwen2.5-7B --tensor-parallel-size 2 --data-parallel-size 2
 ```

-Once the server is running, you can use it to generate completions for training. In the example below, we are using the `GRPOTrainer` to train a model using the vLLM server for generation. The `--tensor-parallel-size` and `--data-parallel-size` arguments control how the model and data are sharded across GPUs.
+Once the server is running, you can use it to generate completions for training. The examples below show each of the supported trainers using the vLLM server for generation. The `--tensor-parallel-size` and `--data-parallel-size` arguments control how the model and data are sharded across GPUs.

 In this example, we are sharding two copies of the model across 4 GPUs. Increasing data parallelism increases throughput, while increasing tensor parallelism allows for serving larger models. Then, run the training script on different GPUs (e.g., GPUs 4-7) by passing `use_vllm=True` in the training arguments as follows:

 Sample of a simple `train.py` script:

+<hfoptions id="vllm examples">
+<hfoption id="GRPO">
+
 ```python
 from datasets import load_dataset
 from trl import GRPOTrainer, GRPOConfig
@ -57,21 +70,148 @@ trainer = GRPOTrainer(
 trainer.train()
 ```

+</hfoption>
+<hfoption id="OnlineDPO">
+
+```python
+from datasets import load_dataset
+from trl import OnlineDPOTrainer, OnlineDPOConfig
+
+dataset = load_dataset("trl-lib/tldr", split="train")
+
+# Dummy reward function: count the number of unique characters in the completions
+def reward_num_unique_chars(completions, **kwargs):
+    return [len(set(c)) for c in completions]
+
+training_args = OnlineDPOConfig(
+    output_dir="my_test",
+    use_vllm=True,
+    bf16=True,
+    gradient_checkpointing=True,
+)
+
+trainer = OnlineDPOTrainer(
+    model="Qwen/Qwen2.5-7B",
+    args=training_args,
+    reward_funcs=reward_num_unique_chars,
+    train_dataset=dataset,
+)
+
+trainer.train()
+```
+
+</hfoption>
+<hfoption id="NashMD">
+
+```python
+from datasets import load_dataset
+from trl import NashMDTrainer, NashMDConfig
+
+dataset = load_dataset("trl-lib/tldr", split="train")
+
+# Dummy reward function: count the number of unique characters in the completions
+def reward_num_unique_chars(completions, **kwargs):
+    return [len(set(c)) for c in completions]
+
+training_args = NashMDConfig(
+    output_dir="my_test",
+    use_vllm=True,
+    bf16=True,
+    gradient_checkpointing=True,
+)
+
+trainer = NashMDTrainer(
+    model="Qwen/Qwen2.5-7B",
+    args=training_args,
+    reward_funcs=reward_num_unique_chars,
+    train_dataset=dataset,
+)
+
+trainer.train()
+```
+
+</hfoption>
+<hfoption id="XPO">
+
+```python
+from datasets import load_dataset
+from trl import XPOTrainer, XPOConfig
+
+dataset = load_dataset("trl-lib/tldr", split="train")
+
+# Dummy reward function: count the number of unique characters in the completions
+def reward_num_unique_chars(completions, **kwargs):
+    return [len(set(c)) for c in completions]
+
+training_args = XPOConfig(
+    output_dir="my_test",
+    use_vllm=True,
+    bf16=True,
+    gradient_checkpointing=True,
+)
+
+trainer = XPOTrainer(
+    model="Qwen/Qwen2.5-7B",
+    args=training_args,
+    reward_funcs=reward_num_unique_chars,
+    train_dataset=dataset,
+)
+
+trainer.train()
+```
+
+</hfoption>
+<hfoption id="RLOO">
+
+```python
+from datasets import load_dataset
+from trl import RLOOTrainer, RLOOConfig
+
+dataset = load_dataset("trl-lib/tldr", split="train")
+
+# Dummy reward function: count the number of unique characters in the completions
+def reward_num_unique_chars(completions, **kwargs):
+    return [len(set(c)) for c in completions]
+
+training_args = RLOOConfig(
+    output_dir="my_test",
+    use_vllm=True,
+    bf16=True,
+    gradient_checkpointing=True,
+)
+
+trainer = RLOOTrainer(
+    model="Qwen/Qwen2.5-7B",
+    args=training_args,
+    reward_funcs=reward_num_unique_chars,
+    train_dataset=dataset,
+)
+
+trainer.train()
+```
+
+</hfoption>
+</hfoptions>
+
 And the train command on separate GPUs from the server:

 ```sh
 CUDA_VISIBLE_DEVICES=4,5,6,7 accelerate launch train.py
 ```

-## 🎬 Flashback: Why do we need to use vLLM in online methods?
+## Why use vLLM?
+
+### 🎬 Flashback: Why do we need to use vLLM in online methods?

 Online methods like GRPO or Online DPO require the model to generate completions during training, which are then used to compute reward signals. However, generation can be extremely time-consuming, especially with large or reasoning models. In the default setup (without vLLM), completions are generated using the [(unwrapped) model's `generate` method](https://github.com/huggingface/trl/blob/f3e8c2304428ef16e9ae5de9e5741ed84d533b7b/trl/trainer/grpo_trainer.py#L965C39-L965C66). This approach quickly becomes a major bottleneck — generation is slow and inefficient, particularly for large batches or models. As a result, training times increase significantly, and overall efficiency drops. To address this, we turn to vLLM, which enables much faster and more scalable generation, helping eliminate this bottleneck in online methods.

-## 🤔 How does vLLM solve the slow generation issue?
+### 🤔 How does vLLM solve the slow generation issue?

 If you've ever done autoregressive decoder training, you know all the input tokens to the LLM produce their attention key and value tensors, and these tensors are kept in GPU memory to later generate subsequent tokens based on them. These cached key and value tensors are often referred to as the KV cache. However, storing the KV cache occupies a lot of memory, so vLLM uses a technique called **PagedAttention** to solve this problem. PagedAttention, which is inspired by the OS’s virtual memory concept, stores continuous keys and values in **non-contiguous memory space**, which is much more efficient. The details of this are beyond the scope of this document, but in short, it allows the model to store the keys and values in a more efficient way, reducing the memory footprint and speeding up the generation process. If you are interested, make sure to check out the [vLLM PagedAttention](https://blog.vllm.ai/2023/06/20/vllm.html) for more details.

-## 🤔 What exactly happens when you run `trl vllm-serve --model <model_name>`?
+## How vLLM Works (Under the Hood) 🔍
+
+### 🤔 What exactly happens when you run `trl vllm-serve --model <model_name>`?

 When you run for example

@ -92,18 +232,18 @@ Each worker operates independently and processes a chunk of the incoming request
 This GPU-to-GPU communication is managed efficiently by NVIDIA’s NCCL library. The communication mainly ensures that each GPU gets its correct portion of the incoming requests — it’s lightweight and doesn’t interfere with generation itself.
 Separately, the number of completions to generate per prompt is controlled by the `num_generations` setting in the GRPO config. For instance, if you set `num_generations=2` (like in the picture above), each prompt will have 2 completions. So, with 8 prompts and `num_generations=2`, you would end up with 16 completions total — regardless of the number of GPUs or parallelism settings.

-## 🥸 More detail on what happens under the hood when running the server
+### 🥸 More detail on what happens under the hood when running the server

-* The vLLM server starts by running the command: `trl vllm-serve --model Qwen/Qwen2.5-7B`.
-* Once the server is running, it generates completions based on requests from the client (trainer) using `vllm_client.generate` [here](https://github.com/huggingface/trl/blob/cc044e35b285be7dc062764b3364e1e684db4c7c/trl/trainer/grpo_trainer.py#L1025-L1035).
-* The client (trainer) then requests these completions from the server.
-* These completions are used to compute the reward signal.
-* Based on the reward signal and the model’s output, the loss is computed, and the backward pass is performed to update the model’s weights.
-* **Note**: The server only handles completion generation — it doesn’t train the model. Therefore, the model’s weights aren’t updated on the server. Once the backward pass is complete, the client sends the updated weights to the server using `vllm_client.update_named_param(name, param.data)`.
+- The vLLM server starts by running the command: `trl vllm-serve --model Qwen/Qwen2.5-7B`.
+- Once the server is running, it generates completions based on requests from the client (trainer) using `vllm_client.generate` (see [these lines](https://github.com/huggingface/trl/blob/cc044e35b285be7dc062764b3364e1e684db4c7c/trl/trainer/grpo_trainer.py#L1025-L1035)).
+- The client (trainer) then requests these completions from the server.
+- These completions are used to compute the reward signal.
+- Based on the reward signal and the model’s output, the loss is computed, and the backward pass is performed to update the model’s weights.
+- **Note**: The server only handles completion generation — it doesn’t train the model. Therefore, the model’s weights aren’t updated on the server. Once the backward pass is complete, the client sends the updated weights to the server using `vllm_client.update_named_param(name, param.data)`.

 When using vLLM, ensure the GPUs assigned for training and generation are separate to avoid NCCL communication conflicts. If you do not set the `CUDA_VISIBLE_DEVICES` environment variable, the training script will use all available GPUs by default, which may lead to device conflicts. Starting with the first TRL release after v0.19.1, the code automatically detects and prevents same-device usage, raising an error in the vLLM server process:

-```
+```log
 RuntimeError: Attempting to use the same CUDA device for multiple distinct roles/ranks within the same communicator. 
 Ensure that trainer is using different devices than vLLM server.
 ```
@ -114,19 +254,21 @@ For example, if you want to use GPUs 4–7 for training while the server runs on
 CUDA_VISIBLE_DEVICES=4,5,6,7 accelerate launch train.py
 ```

-## 🍷 More customization options with vLLM?
+## Advanced usage
+
+### 🍷 More customization options with vLLM?

 You can customize the server configuration by passing additional arguments.

-```
+```txt
 $ trl vllm-serve --help
-usage: trl vllm-serve [-h] --model MODEL [--revision REVISION] [--tensor_parallel_size TENSOR_PARALLEL_SIZE]
-                      [--data_parallel_size DATA_PARALLEL_SIZE] [--host HOST] [--port PORT]
-                      [--gpu_memory_utilization GPU_MEMORY_UTILIZATION] [--dtype DTYPE] [--max_model_len MAX_MODEL_LEN]
-                      [--enable_prefix_caching ENABLE_PREFIX_CACHING] [--enforce_eager ENFORCE_EAGER] [--log_level LOG_LEVEL]
+usage: trl vllm-serve [-h] --model MODEL [--revision REVISION] [--tensor_parallel_size TENSOR_PARALLEL_SIZE] [--data_parallel_size DATA_PARALLEL_SIZE] [--host HOST]
+                      [--port PORT] [--gpu_memory_utilization GPU_MEMORY_UTILIZATION] [--dtype DTYPE] [--max_model_len MAX_MODEL_LEN]
+                      [--enable_prefix_caching ENABLE_PREFIX_CACHING] [--enforce_eager [ENFORCE_EAGER]] [--kv_cache_dtype KV_CACHE_DTYPE]
+                      [--trust_remote_code [TRUST_REMOTE_CODE]] [--log_level LOG_LEVEL] [--vllm_model_impl VLLM_MODEL_IMPL]

 options:
-  -h, --help            Show this help message and exit
+  -h, --help            show this help message and exit
  --model MODEL         Model name or path to load the model from. (default: None)
  --revision REVISION   Revision to use for the model. If not specified, the default branch will be used. (default: None)
  --tensor_parallel_size TENSOR_PARALLEL_SIZE, --tensor-parallel-size TENSOR_PARALLEL_SIZE
@ -136,63 +278,222 @@ options:
  --host HOST           Host address to run the server on. (default: 0.0.0.0)
  --port PORT           Port to run the server on. (default: 8000)
  --gpu_memory_utilization GPU_MEMORY_UTILIZATION, --gpu-memory-utilization GPU_MEMORY_UTILIZATION
-                        Ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV cache on the device
-                        dedicated to generation powered by vLLM. Higher values will increase the KV cache size and thus improve the
-                        model's throughput. However, if the value is too high, it may cause out-of-memory (OOM) errors during
-                        initialization. (default: 0.9)
-  --dtype DTYPE         Data type to use for vLLM generation. If set to 'auto', the data type will be automatically determined based on
-                        the model configuration. Find the supported values in the vLLM documentation. (default: auto)
+                        Ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV cache on the device dedicated to generation
+                        powered by vLLM. Higher values will increase the KV cache size and thus improve the model's throughput. However, if the value is too high,
+                        it may cause out-of-memory (OOM) errors during initialization. (default: 0.9)
+  --dtype DTYPE         Data type to use for vLLM generation. If set to 'auto', the data type will be automatically determined based on the model configuration.
+                        Find the supported values in the vLLM documentation. (default: auto)
  --max_model_len MAX_MODEL_LEN, --max-model-len MAX_MODEL_LEN
-                        If set, the `max_model_len` to use for vLLM. This can be useful when running with reduced
-                        `vllm_gpu_memory_utilization`, leading to a reduced KV cache size. If not set, vLLM will use the model context
-                        size, which might be much larger than the KV cache, leading to inefficiencies. (default: None)
+                        If set, the `max_model_len` to use for vLLM. This can be useful when running with reduced `vllm_gpu_memory_utilization`, leading to a
+                        reduced KV cache size. If not set, vLLM will use the model context size, which might be much larger than the KV cache, leading to
+                        inefficiencies. (default: None)
  --enable_prefix_caching ENABLE_PREFIX_CACHING, --enable-prefix-caching ENABLE_PREFIX_CACHING
-                        Whether to enable prefix caching in vLLM. If set to `True`, ensure that the model and the hardware support this
-                        feature. (default: None)
-  --enforce_eager ENFORCE_EAGER, --enforce-eager ENFORCE_EAGER
-                        Whether to enforce eager execution. If set to `True`, we will disable CUDA graph and always execute the model
-                        in eager mode. If `False` (default behavior), we will use CUDA graph and eager execution in hybrid. (default:
-                        None)
+                        Whether to enable prefix caching in vLLM. If set to `True`, ensure that the model and the hardware support this feature. (default: None)
+  --enforce_eager [ENFORCE_EAGER], --enforce-eager [ENFORCE_EAGER]
+                        Whether to enforce eager execution. If set to `True`, we will disable CUDA graph and always execute the model in eager mode. If `False`
+                        (default behavior), we will use CUDA graph and eager execution in hybrid. (default: False)
+  --kv_cache_dtype KV_CACHE_DTYPE, --kv-cache-dtype KV_CACHE_DTYPE
+                        Data type to use for KV cache. If set to 'auto', the dtype will default to the model data type. (default: auto)
+  --trust_remote_code [TRUST_REMOTE_CODE], --trust-remote-code [TRUST_REMOTE_CODE]
+                        Whether to trust remote code when loading models. Set to True to allow executing code from model repositories. This is required for some
+                        custom models but introduces security risks. (default: False)
  --log_level LOG_LEVEL, --log-level LOG_LEVEL
-                        Log level for uvicorn. Possible choices: 'critical', 'error', 'warning', 'info', 'debug', 'trace'. (default:
-                        info)
+                        Log level for uvicorn. Possible choices: 'critical', 'error', 'warning', 'info', 'debug', 'trace'. (default: info)
+  --vllm_model_impl VLLM_MODEL_IMPL, --vllm-model-impl VLLM_MODEL_IMPL
+                        Model implementation to use for vLLM. Must be one of `transformers` or `vllm`. `transformers`: Use the `transformers` backend for model
+                        implementation. `vllm`: Use the `vllm` library for model implementation. (default: vllm)
 ```

-## 🥳 Okay, now that we have the server running, how can we use it to generate completions?
+### 💆🏻‍♀️ What's the best distributed setup?

-Run the training script and pass `use_vllm=True` in the training arguments:
+![tp dp throughput 8 gpus](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/tp_dp_throughput_8_gpus.png)
+![tp dp throughput 4 gpus](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/tp_dp_throughput_4_gpus.png)
+
+First and foremost, always remember that the optimal setup depends on:
+
+- The model size
+- The number of GPUs you have
+- The GPU memory size
+- The batch size you are using
+- The number of requests you are sending to the server (prompts)
+- The `max_model_len` you are using (this is the max length of the input sequence that the model can process, a.k.a. the context window size)
+- The number of completions you are generating for each request (`num_generations`)
+
+Given these factors, our experiments on the Qwen model family (3B, 7B, 14B, 32B) using 8 H100 GPUs show that:
+
+- For reasonable-sized models (3B–14B) and a moderate context window (`max_len < 8k`), using full capacity for data parallelism gives better throughput. The setup `(tp=1, dp=8)` yields the best results.
+- For larger models (32B) and longer context windows (`max_len > 8k`), a smaller DP size combined with some model-side parallelism performs better. For example, `(tp=2, dp=4)` is a good setup for 32B models with a larger context window.
+
+### vLLM with Transformers Backend
+
+vLLM can use the **Transformers backend** for model implementations, which works for both LLMs and VLMs.
+To enable this, set `vllm_model_impl="transformers"` in your configuration or pass it via the command-line argument.
+
+For more details, check out [vLLM Transformers Backend](https://blog.vllm.ai/2025/04/11/transformers-backend.html).
+
+Example:
+
+```sh
+CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen2.5-VL-3B-Instruct \
+    --tensor-parallel-size 1 --port 8000 --enforce_eager --vllm_model_impl transformers
+```
+
+### Modes of Using vLLM During Training
+
+TRL supports **two modes** for integrating vLLM during training: **server mode** and **colocate mode**.
+
+#### Server Mode
+
+In **server mode**, vLLM runs as a separate process on dedicated GPUs and communicates with the trainer via HTTP.
+This setup is ideal if you have GPUs dedicated to inference.
+
+Example configuration:
+
+<hfoptions id="vllm examples">
+<hfoption id="GRPO">

 ```python
 from trl import GRPOConfig

-training_args = GRPOConfig(..., use_vllm=True)
+training_args = GRPOConfig(
+    ...,
+    use_vllm=True,
+    vllm_mode="server",  # default value, can be omitted
+)
 ```

-## 💆🏻‍♀️ What's the best distributed setup?
+</hfoption>
+<hfoption id="OnlineDPO">

-![](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/tp_dp_throughput_8_gpus.png)
-![](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/tp_dp_throughput_4_gpus.png)
-
-First and foremost, always remember that the optimal setup depends on:
-
-* The model size
-* The number of GPUs you have
-* The GPU memory size
-* The batch size you are using
-* The number of requests you are sending to the server (prompts)
-* The `max_model_len` you are using (this is the max length of the input sequence that the model can process, a.k.a. the context window size)
-* The number of completions you are generating for each request (`num_generations`)
-
-Given these factors, our experiments on the Qwen model family (3B, 7B, 14B, 32B) using 8 H100 GPUs show that:
-
-* For reasonable-sized models (3B–14B) and a moderate context window (`max_len < 8k`), using full capacity for data parallelism gives better throughput. The setup `(tp=1, dp=8)` yields the best results.
-* For larger models (32B) and longer context windows (`max_len > 8k`), a smaller DP size combined with some model-side parallelism performs better. For example, `(tp=2, dp=4)` is a good setup for 32B models with a larger context window.
-
-## vLLM with Transformers Backend
-
-vLLM now supports transformers backend for model implementations. Simply passing in `transformers` in `vllm_model_impl` in configurations or through argument parser will set use transformers backend. This works for both LLMs and VLMs. See an example below, you can get more information [here](https://blog.vllm.ai/2025/04/11/transformers-backend.html).
+```python
+from trl import OnlineDPOConfig

+training_args = OnlineDPOConfig(
+    ...,
+    use_vllm=True,
+    vllm_mode="server",  # default value, can be omitted
+)
 ```
-CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen
-2.5-VL-3B-Instruct --tensor-parallel-size 1 --port 8000 --enforce_eager --vllm_model_impl transformers
+
+</hfoption>
+<hfoption id="NashMD">
+
+```python
+from trl import NashMDConfig
+
+training_args = NashMDConfig(
+    ...,
+    use_vllm=True,
+    vllm_mode="server",  # default value, can be omitted
+)
 ```
+
+</hfoption>
+<hfoption id="XPO">
+
+```python
+from trl import XPOConfig
+
+training_args = XPOConfig(
+    ...,
+    use_vllm=True,
+    vllm_mode="server",  # default value, can be omitted
+)
+```
+
+</hfoption>
+<hfoption id="RLOO">
+
+```python
+from trl import RLOOConfig
+
+training_args = RLOOConfig(
+    ...,
+    use_vllm=True,
+    vllm_mode="server",  # default value, can be omitted
+)
+```
+
+</hfoption>
+</hfoptions>
+
+#### Colocate Mode
+
+In **colocate mode**, vLLM runs inside the trainer process and shares GPU memory with the training model.
+This avoids launching a separate server and can improve GPU utilization, but may lead to memory contention on the training GPUs.
+
+Example configuration:
+
+<hfoptions id="vllm examples">
+<hfoption id="GRPO">
+
+```python
+from trl import GRPOConfig
+
+training_args = GRPOConfig(
+    ...,
+    use_vllm=True,
+    vllm_mode="colocate",
+)
+```
+
+</hfoption>
+<hfoption id="OnlineDPO">
+
+```python
+from trl import OnlineDPOConfig
+
+training_args = OnlineDPOConfig(
+    ...,
+    use_vllm=True,
+    vllm_mode="colocate",
+)
+```
+
+</hfoption>
+<hfoption id="NashMD">
+
+```python
+from trl import NashMDConfig
+
+training_args = NashMDConfig(
+    ...,
+    use_vllm=True,
+    vllm_mode="colocate",
+)
+```
+
+</hfoption>
+<hfoption id="XPO">
+
+```python
+from trl import XPOConfig
+
+training_args = XPOConfig(
+    ...,
+    use_vllm=True,
+    vllm_mode="colocate",
+)
+```
+
+</hfoption>
+<hfoption id="RLOO">
+
+```python
+from trl import RLOOConfig
+
+training_args = RLOOConfig(
+    ...,
+    use_vllm=True,
+    vllm_mode="colocate",
+)
+```
+
+</hfoption>
+</hfoptions>
+
+> [!WARNING]
+> Check the documentation of the trainer you are using for specific details on vLLM usage and parameters.
+
+> [!WARNING]
+> To reduce GPU memory usage when running vLLM, consider [enabling vLLM sleep mode](reducing_memory_usage#vllm-sleep-mode).
--- a/docs/source/xpo_trainer.md
+++ b/docs/source/xpo_trainer.md
@ -1,6 +1,6 @@
 # XPO Trainer

-[![](https://img.shields.io/badge/All_models-XPO-blue)](https://huggingface.co/models?other=xpo,trl)
+[![model badge](https://img.shields.io/badge/All_models-XPO-blue)](https://huggingface.co/models?other=xpo,trl)

 ## Overview

@ -57,7 +57,7 @@ To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-XPO) pe
 What is the best programming language?

 <strong><span style="color: blue;">&lt;trl-lib/Qwen2-0.5B-XPO&gt;:</span></strong>
-The best programming language depends on individual preferences and familiarity with coding concepts. Some popular languages include Python, Java, C++, and JavaScript. 
+The best programming language depends on individual preferences and familiarity with coding concepts. Some popular languages include Python, Java, C++, and JavaScript.
 </code></pre>

 ## Expected dataset type
@ -148,7 +148,6 @@ While training and evaluating we record the following reward metrics:
 * `alpha`: The weight of the XPO loss term. Typically fixed, but can be made dynamic by passing a list to [`XPOConfig`].
 * `beta`: The parameter that controls the weight of the loss term representing the deviation from the reference model. Typically fixed, but can be made dynamic by passing a list to [`XPOConfig`].

-
 ## XPOTrainer

 [[autodoc]] XPOTrainer
--- a/examples/README.md
+++ b/examples/README.md
@ -1,3 +1,3 @@
 # Examples

-Please check out https://huggingface.co/docs/trl/example_overview for documentation on our examples.
+Please check out https://huggingface.co/docs/trl/example_overview for documentation on our examples.
--- a/examples/notebooks/grpo_qwen3_vl.ipynb
+++ b/examples/notebooks/grpo_qwen3_vl.ipynb
@ -0,0 +1,694 @@
+{
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "-J8iGzLf4rUJ"
+      },
+      "source": [
+        "# GRPO Qwen3-VL with QLoRA using TRL\n",
+        "\n",
+        "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/notebooks/grpo_qwen3_vl.ipynb)\n",
+        "\n",
+        "![trl banner](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl_banner_dark.png)\n",
+        "\n",
+        "\n",
+        "With [**Transformer Reinforcement Learning (TRL)**](https://github.com/huggingface/trl), you can fine-tune cutting-edge vision language models. It comes with support for the quantized parameter-efficient fine-tuning technique **QLoRA**, so we can use free Colab (T4 GPU) to fine-tune models like [Qwen3-VL](https://huggingface.co/collections/Qwen/qwen3-vl-68d2a7c1b8a8afce4ebd2dbe).\n",
+        "\n",
+        "\n",
+        "- [TRL GitHub Repository](https://github.com/huggingface/trl) — star us to support the project!  \n",
+        "- [Official TRL Examples](https://huggingface.co/docs/trl/example_overview)  \n",
+        "- [Community Tutorials](https://huggingface.co/docs/trl/community_tutorials)\n",
+        "- [More Qwen3-VL Fine-tuning Examples (including TRL scripts)](https://github.com/QwenLM/Qwen3-VL/tree/main/qwen-vl-finetune/)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "NvrzGRnu48Vz"
+      },
+      "source": [
+        "## Install dependencies\n",
+        "\n",
+        "We'll install **TRL** with the **PEFT** extra, which ensures all main dependencies such as **Transformers** and **PEFT** (a package for parameter-efficient fine-tuning, e.g., LoRA/QLoRA) are included. Additionally, we'll install **trackio** to log and monitor our experiments, and **bitsandbytes** to enable quantization of LLMs, reducing memory consumption for both inference and training."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "8CfZlUevmkg7"
+      },
+      "outputs": [],
+      "source": [
+        "!pip install -Uq \"trl[peft]\" bitsandbytes trackio math_verify"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "gpzI6omi7728"
+      },
+      "source": [
+        "### Log in to Hugging Face\n",
+        "\n",
+        "Log in to your **Hugging Face** account to save your fine-tuned model, track your experiment results directly on the Hub or access gated models. You can find your **access token** on your [account settings page](https://huggingface.co/settings/tokens)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "4Ncx0wYtnYCW"
+      },
+      "outputs": [],
+      "source": [
+        "from huggingface_hub import notebook_login\n",
+        "\n",
+        "notebook_login()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "V_Zylc4t79-n"
+      },
+      "source": [
+        "## Load dataset\n",
+        "\n",
+        "\n",
+        "We'll load the [**lmms-lab/multimodal-open-r1-8k-verified**](https://huggingface.co/datasets/lmms-lab/multimodal-open-r1-8k-verified) dataset from the Hugging Face Hub using the `datasets` library.\n",
+        "\n",
+        "This dataset contains maths problems, each with an image representing the problem and a solution written in a thinking format specially tailored for VLMs. By training our model on this dataset, we'll improve its maths and reasoning abilities.\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "TzXogU24F_QR"
+      },
+      "outputs": [],
+      "source": [
+        "from datasets import load_dataset\n",
+        "\n",
+        "dataset_id = 'lmms-lab/multimodal-open-r1-8k-verified'\n",
+        "train_dataset = load_dataset(dataset_id, split='train[:5%]')"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "gVV7RoRN8zk5"
+      },
+      "source": [
+        "In addition to the `problem` and `image` columns, we also include a custom system prompt to tell the model how we'd like the generation.\n",
+        "\n",
+        "The system prompt is extracted from DeepSeek R1. Refer to [this previous recipe](https://huggingface.co/learn/cookbook/fine_tuning_llm_grpo_trl) for more details.\n",
+        "\n",
+        "We convert the dataset samples into conversation samples, including the system prompt and one image and problem description per sample, since this is how the GRPO trainer expects them.\n",
+        "\n",
+        "We also set `padding_side=\"left\"` to ensure that generated completions during training are concatenated directly after the prompt, which is essential for GRPO to correctly compute token-level probabilities of the sampled completions."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "ZT1JfiiTGExB"
+      },
+      "outputs": [],
+      "source": [
+        "from transformers import AutoProcessor\n",
+        "\n",
+        "model_name = \"Qwen/Qwen3-VL-4B-Instruct\" # \"Qwen/Qwen3-VL-8B-Instruct\"\n",
+        "processor = AutoProcessor.from_pretrained(model_name, padding_side=\"left\")\n",
+        "\n",
+        "SYSTEM_PROMPT = (\n",
+        "    \"You are a helpful AI Assistant that provides well-reasoned and detailed responses. \"\n",
+        "    \"You first think about the reasoning process as an internal monologue and then provide the user with the answer. \"\n",
+        "    \"Respond in the following format: <think>\\n...\\n</think>\\n<answer>\\n...\\n</answer>\"\n",
+        ")\n",
+        "\n",
+        "\n",
+        "def make_conversation(example):\n",
+        "    conversation = [\n",
+        "        {\n",
+        "            \"role\": \"system\",\n",
+        "            \"content\": [{\"type\": \"text\", \"text\": SYSTEM_PROMPT}],\n",
+        "        },\n",
+        "        {\n",
+        "            \"role\": \"user\",\n",
+        "            \"content\": [\n",
+        "                {\"type\": \"image\", \"image\": example[\"image\"]},\n",
+        "                {\"type\": \"text\", \"text\": example[\"problem\"]},\n",
+        "            ],\n",
+        "        },\n",
+        "    ]\n",
+        "    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)\n",
+        "    return {\n",
+        "        \"prompt\": prompt,\n",
+        "        \"image\": example[\"image\"],\n",
+        "    }\n",
+        "\n",
+        "train_dataset = train_dataset.map(make_conversation)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "5txAuMAa8ock"
+      },
+      "source": [
+        "Let's review one example to understand the internal structure:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "PDXQd5Jk2Bqe"
+      },
+      "outputs": [],
+      "source": [
+        "train_dataset[0]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "hzSR_56wxKDA"
+      },
+      "outputs": [],
+      "source": [
+        "train_dataset = train_dataset.remove_columns(['problem', 'original_question', 'original_answer'])"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "T9rCkeqDODba"
+      },
+      "outputs": [],
+      "source": [
+        "train_dataset[0]"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "YY3uMp909Eqy"
+      },
+      "source": [
+        "## Load model and configure LoRA/QLoRA\n",
+        "\n",
+        "This notebook can be used with two fine-tuning methods. By default, it is set up for **QLoRA**, which includes quantization using `BitsAndBytesConfig`. If you prefer to use standard **LoRA** without quantization, simply comment out the `BitsAndBytesConfig` configuration."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "gt05dgXgm9QR"
+      },
+      "outputs": [],
+      "source": [
+        "from transformers import Qwen3VLForConditionalGeneration, BitsAndBytesConfig\n",
+        "import torch\n",
+        "\n",
+        "model = Qwen3VLForConditionalGeneration.from_pretrained(\n",
+        "    model_name, dtype=\"auto\",\n",
+        "    device_map=\"auto\",\n",
+        "    quantization_config=BitsAndBytesConfig(\n",
+        "        load_in_4bit=True,\n",
+        "        bnb_4bit_use_double_quant=True,\n",
+        "        bnb_4bit_quant_type=\"nf4\",\n",
+        "        bnb_4bit_compute_dtype=torch.float16\n",
+        "    ),\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "WZGf-GF09Gsc"
+      },
+      "source": [
+        "The following cell defines LoRA (or QLoRA if needed). When training with LoRA/QLoRA, we use a **base model** (the one selected above) and, instead of modifying its original weights, we fine-tune a **LoRA adapter** — a lightweight layer that enables efficient and memory-friendly training. The **`target_modules`** specify which parts of the model (e.g., attention or projection layers) will be adapted by LoRA during fine-tuning."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "ME1im5gh2LFg"
+      },
+      "outputs": [],
+      "source": [
+        "from peft import LoraConfig\n",
+        "\n",
+        "# You may need to update `target_modules` depending on the architecture of your chosen model.\n",
+        "# For example, different VLMs might have different attention/projection layer names.\n",
+        "peft_config = LoraConfig(\n",
+        "    r=8,\n",
+        "    lora_alpha=32,\n",
+        "    lora_dropout=0.1,\n",
+        "    target_modules=[\"q_proj\", \"v_proj\"],\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "mDq4V6dN9MGk"
+      },
+      "source": [
+        "## Train model\n",
+        "\n",
+        "We'll configure **GRPO** using `GRPOConfig`, keeping the parameters minimal so the training fits on a free Colab instance. You can adjust these settings if more resources are available. For full details on all available parameters, check the [TRL GRPOConfig documentation](https://huggingface.co/docs/trl/grpo_trainer#trl.GRPOConfig).\n",
+        "\n",
+        "First, we need to define the reward functions that the training algorithm will use to improve the model. In this case, we'll include two reward functions.\n",
+        "We'll use a format reward that will reward the model when the output includes `<think>` and `<answer>` tags and additionally a length-based reward to discourage overthinking. Both functions have been extracted from [here](https://github.com/huggingface/open-r1/blob/main/src/open_r1/rewards.py)."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "Dqp3TfUwHUxW"
+      },
+      "outputs": [],
+      "source": [
+        "import re\n",
+        "\n",
+        "def format_reward(completions, **kwargs):\n",
+        "    \"\"\"Reward function that checks if the reasoning process is enclosed within <think> and </think> tags, while the final answer is enclosed within <answer> and </answer> tags.\"\"\"\n",
+        "    pattern = r\"^<think>\\n.*?\\n</think>\\n<answer>\\n.*?\\n</answer>$\"\n",
+        "    matches = [re.match(pattern, content, re.DOTALL | re.MULTILINE) for content in completions]\n",
+        "    return [1.0 if match else 0.0 for match in matches]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "rxNPUp7RBFcz"
+      },
+      "outputs": [],
+      "source": [
+        "from math_verify import LatexExtractionConfig, parse, verify\n",
+        "from latex2sympy2_extended import NormalizationConfig\n",
+        "\n",
+        "\n",
+        "def len_reward(completions, solution, **kwargs) -> float:\n",
+        "    \"\"\"Compute length-based rewards to discourage overthinking and promote token efficiency.\n",
+        "\n",
+        "    Taken from the Kimi 1.5 tech report: https://huggingface.co/papers/2501.12599\n",
+        "\n",
+        "    Args:\n",
+        "        completions: List of model completions\n",
+        "        solution: List of ground truth solutions\n",
+        "\n",
+        "    Returns:\n",
+        "        List of rewards where:\n",
+        "        - For correct answers: reward = 0.5 - (len - min_len)/(max_len - min_len)\n",
+        "        - For incorrect answers: reward = min(0, 0.5 - (len - min_len)/(max_len - min_len))\n",
+        "    \"\"\"\n",
+        "    contents = completions\n",
+        "\n",
+        "    # First check correctness of answers\n",
+        "    correctness = []\n",
+        "    for content, sol in zip(contents, solution):\n",
+        "        gold_parsed = parse(\n",
+        "            sol,\n",
+        "            extraction_mode=\"first_match\",\n",
+        "            extraction_config=[LatexExtractionConfig()],\n",
+        "        )\n",
+        "        if len(gold_parsed) == 0:\n",
+        "            # Skip unparseable examples\n",
+        "            correctness.append(True)  # Treat as correct to avoid penalizing\n",
+        "            print(\"Failed to parse gold solution: \", sol)\n",
+        "            continue\n",
+        "\n",
+        "        answer_parsed = parse(\n",
+        "            content,\n",
+        "            extraction_config=[\n",
+        "                LatexExtractionConfig(\n",
+        "                    normalization_config=NormalizationConfig(\n",
+        "                        nits=False,\n",
+        "                        malformed_operators=False,\n",
+        "                        basic_latex=True,\n",
+        "                        equations=True,\n",
+        "                        boxed=True,\n",
+        "                        units=True,\n",
+        "                    ),\n",
+        "                    boxed_match_priority=0,\n",
+        "                    try_extract_without_anchor=False,\n",
+        "                )\n",
+        "            ],\n",
+        "            extraction_mode=\"first_match\",\n",
+        "        )\n",
+        "        correctness.append(verify(answer_parsed, gold_parsed))\n",
+        "\n",
+        "    # Calculate lengths\n",
+        "    lengths = [len(content) for content in contents]\n",
+        "    min_len = min(lengths)\n",
+        "    max_len = max(lengths)\n",
+        "\n",
+        "    # If all responses have the same length, return zero rewards\n",
+        "    if max_len == min_len:\n",
+        "        return [0.0] * len(completions)\n",
+        "\n",
+        "    rewards = []\n",
+        "    for length, is_correct in zip(lengths, correctness):\n",
+        "        lambda_val = 0.5 - (length - min_len) / (max_len - min_len)\n",
+        "\n",
+        "        if is_correct:\n",
+        "            reward = lambda_val\n",
+        "        else:\n",
+        "            reward = min(0, lambda_val)\n",
+        "\n",
+        "        rewards.append(float(reward))\n",
+        "\n",
+        "    return rewards\n"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "9xBL7Rni9LZb"
+      },
+      "source": [
+        "After defining the reward function(s), we can define the `GRPOConfig`."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "OEmRM0rIHXQ4"
+      },
+      "outputs": [],
+      "source": [
+        "from trl import GRPOConfig\n",
+        "\n",
+        "output_dir = \"Qwen3-VL-4B-Instruct-trl-grpo\"\n",
+        "\n",
+        "# Configure training arguments using GRPOConfig\n",
+        "training_args = GRPOConfig(\n",
+        "    learning_rate=2e-5,\n",
+        "    #num_train_epochs=1,\n",
+        "    max_steps=100,                                        # Number of training steps. For full trainings, use `num_train_epochs` instead\n",
+        "\n",
+        "    # Parameters that control the data preprocessing\n",
+        "    per_device_train_batch_size=2,\n",
+        "    max_completion_length=1024, # default: 256            # Max completion length produced during training\n",
+        "    num_generations=2, # 2, # default: 8                  # Number of generations produced during training for comparison\n",
+        "    max_prompt_length=2048, # default: 512                # Max prompt length of the input prompt used for generation during training\n",
+        "\n",
+        "    fp16=True,\n",
+        "\n",
+        "    # Parameters related to reporting and saving\n",
+        "    output_dir=output_dir,                                # Where to save model checkpoints and logs\n",
+        "    logging_steps=1,                                      # Log training metrics every N steps\n",
+        "    report_to=\"trackio\",                                  # Experiment tracking tool\n",
+        "\n",
+        "    # Hub integration\n",
+        "    push_to_hub=True,\n",
+        "    log_completions=True\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "O0q3myQg927v"
+      },
+      "source": [
+        "Configure the GRPO Trainer. We pass the previously configured `training_args`. We don't use an eval dataset, to keep memory usage low, but you can configure one."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/"
+        },
+        "id": "z5JxkmS9HqD5",
+        "outputId": "2b39338e-2194-4829-fc54-5e286566fd28"
+      },
+      "outputs": [
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "/usr/local/lib/python3.12/dist-packages/peft/mapping_func.py:73: UserWarning: You are trying to modify a model with PEFT for a second time. If you want to reload the model with a different config, make sure to call `.unload()` before.\n",
+            "  warnings.warn(\n",
+            "/usr/local/lib/python3.12/dist-packages/peft/tuners/tuners_utils.py:196: UserWarning: Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing!\n",
+            "  warnings.warn(\n"
+          ]
+        }
+      ],
+      "source": [
+        "from trl import GRPOTrainer\n",
+        "\n",
+        "trainer = GRPOTrainer(\n",
+        "    model=model,\n",
+        "    reward_funcs=[format_reward, len_reward],\n",
+        "    args=training_args,\n",
+        "    train_dataset=train_dataset,\n",
+        "    peft_config=peft_config,\n",
+        ")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "kQC7Q5kg95xq"
+      },
+      "source": [
+        "Show memory stats before training"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "naG_7qlYyBP6"
+      },
+      "outputs": [],
+      "source": [
+        "gpu_stats = torch.cuda.get_device_properties(0)\n",
+        "start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n",
+        "max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)\n",
+        "\n",
+        "print(f\"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.\")\n",
+        "print(f\"{start_gpu_memory} GB of memory reserved.\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "YazYtLAe97Dc"
+      },
+      "source": [
+        "And train!"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "pbJXrhA0ywra"
+      },
+      "outputs": [],
+      "source": [
+        "trainer_stats = trainer.train()"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "SmcYN5yW99IP"
+      },
+      "source": [
+        "Show memory stats after training"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "TrrwP4ADMmrp"
+      },
+      "outputs": [],
+      "source": [
+        "used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n",
+        "used_memory_for_lora = round(used_memory - start_gpu_memory, 3)\n",
+        "used_percentage = round(used_memory / max_memory * 100, 3)\n",
+        "lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)\n",
+        "\n",
+        "print(f\"{trainer_stats.metrics['train_runtime']} seconds used for training.\")\n",
+        "print(f\"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.\")\n",
+        "print(f\"Peak reserved memory = {used_memory} GB.\")\n",
+        "print(f\"Peak reserved memory for training = {used_memory_for_lora} GB.\")\n",
+        "print(f\"Peak reserved memory % of max memory = {used_percentage} %.\")\n",
+        "print(f\"Peak reserved memory for training % of max memory = {lora_percentage} %.\")"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "saarW87Y9_-R"
+      },
+      "source": [
+        "## Saving the fine-tuned model\n",
+        "\n",
+        "In this step, we save the fine-tuned model both **locally** and to the **Hugging Face Hub** using the credentials from your account."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "71A8aqEyyETA"
+      },
+      "outputs": [],
+      "source": [
+        "trainer.save_model(output_dir)\n",
+        "trainer.push_to_hub(dataset_name=dataset_id)"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "nfqvO0qw-OvS"
+      },
+      "source": [
+        "## Load the fine-tuned model and run inference\n",
+        "\n",
+        "Now, let's test our fine-tuned model by loading the **LoRA/QLoRA adapter** and performing **inference**. We'll start by loading the **base model**, then attach the adapter to it, creating the final fine-tuned model ready for evaluation."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "R8T2uFQVyFeH"
+      },
+      "outputs": [],
+      "source": [
+        "from transformers import Qwen3VLForConditionalGeneration, AutoProcessor\n",
+        "from peft import PeftModel\n",
+        "\n",
+        "base_model = model_name\n",
+        "adapter_model = f\"{output_dir}\" # Local adapter path; to load from the Hub, prefix it with your HF username or organization\n",
+        "\n",
+        "model = Qwen3VLForConditionalGeneration.from_pretrained(base_model, dtype=\"auto\", device_map=\"auto\")\n",
+        "model = PeftModel.from_pretrained(model, adapter_model)\n",
+        "\n",
+        "processor = AutoProcessor.from_pretrained(base_model)"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "dPBHP0CpLa6K"
+      },
+      "outputs": [],
+      "source": [
+        "train_dataset[0]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "cG5-ccGRyHgo"
+      },
+      "outputs": [],
+      "source": [
+        "from datasets import load_dataset\n",
+        "\n",
+        "dataset_id = 'lmms-lab/multimodal-open-r1-8k-verified'\n",
+        "train_dataset = load_dataset(dataset_id, split='train[:5%]')\n",
+        "\n",
+        "problem = train_dataset[0]['problem']\n",
+        "image = train_dataset[0]['image']\n",
+        "\n",
+        "messages = [\n",
+        "    {\n",
+        "        \"role\": \"system\", \"content\": [\n",
+        "            {\"type\": \"text\", \"text\": SYSTEM_PROMPT}\n",
+        "        ]\n",
+        "    },\n",
+        "    {\n",
+        "        \"role\": \"user\",\n",
+        "        \"content\": [\n",
+        "            {\"type\": \"image\", \"image\": image},\n",
+        "            {\"type\": \"text\", \"text\": problem},\n",
+        "        ],\n",
+        "    },\n",
+        "]"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "r_70q_8lLgfV"
+      },
+      "outputs": [],
+      "source": [
+        "messages"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "id": "PX92MjqlyIwB"
+      },
+      "outputs": [],
+      "source": [
+        "inputs = processor.apply_chat_template(\n",
+        "    messages,\n",
+        "    tokenize=True,\n",
+        "    add_generation_prompt=True,\n",
+        "    return_dict=True,\n",
+        "    return_tensors=\"pt\"\n",
+        ").to(model.device)\n",
+        "\n",
+        "# Inference: Generation of the output\n",
+        "generated_ids = model.generate(**inputs, max_new_tokens=500)\n",
+        "generated_ids_trimmed = [\n",
+        "    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n",
+        "]\n",
+        "output_text = processor.batch_decode(\n",
+        "    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n",
+        ")\n",
+        "print(output_text)"
+      ]
+    }
+  ],
+  "metadata": {
+    "accelerator": "GPU",
+    "colab": {
+      "gpuType": "T4",
+      "provenance": []
+    },
+    "kernelspec": {
+      "display_name": "Python 3",
+      "name": "python3"
+    },
+    "language_info": {
+      "name": "python"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
--- a/examples/notebooks/sft_qwen_vl.ipynb
+++ b/examples/notebooks/sft_qwen_vl.ipynb
--- a/examples/research_projects/README.md
+++ b/examples/research_projects/README.md
@ -1,7 +0,0 @@
-# Research projects that use TRL
-
-Welcome to the research projects folder! Here you can find the scripts used for some research projects that used TRL and maintained by the developers and the community (LM de-toxification, Stack-Llama, etc.). Check out the READMEs in the subfolders for more information!
-
- [De-detoxifying language models](https://github.com/huggingface/trl/tree/main/examples/research_projects/toxicity)
- [Stack-Llama](https://github.com/huggingface/trl/tree/main/examples/research_projects/stack_llama)
- [Stack-Llama-2](https://github.com/huggingface/trl/tree/main/examples/research_projects/stack_llama_2)
--- a/examples/research_projects/layer_skip/README.md
+++ b/examples/research_projects/layer_skip/README.md
@ -1,15 +0,0 @@
-# LayerSkip Training Recipe
-
-Implements the training recipe as described in the [LayerSkip paper](https://huggingface.co/papers/2404.16710).
-
-## Run training
-```
-cd scripts
-python layer_skip_sft.py
-```
-
-## Run benchmark
-```
-cd scripts
-python benchmark_layer_skip.py
-```
--- a/examples/research_projects/layer_skip/scripts/benchmark_layer_skip.py
+++ b/examples/research_projects/layer_skip/scripts/benchmark_layer_skip.py
@ -1,77 +0,0 @@
-# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import config
-import torch
-from torch.utils import benchmark
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-
-def generate_tokens(model, inputs):
-    outputs = model.generate(
-        **inputs,
-        do_sample=False,
-        max_new_tokens=64,
-    )
-    return outputs
-
-
-def generate_tokens_with_assistance(model, inputs, assistant_early_exit):
-    outputs = model.generate(
-        **inputs,
-        assistant_early_exit=assistant_early_exit,
-        do_sample=False,
-        max_new_tokens=64,
-    )
-    return outputs
-
-
-if __name__ == "__main__":
-    ckpt = config.hub_model_id
-
-    model = AutoModelForCausalLM.from_pretrained(ckpt, device_map="auto", dtype=torch.bfloat16)
-    tokenizer = AutoTokenizer.from_pretrained(ckpt)
-
-    prompt = "### Instruction: What are my alarms for the rest of the day?\n ### Response: "
-
-    results = []
-    label = "Generation Times"
-    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-
-    results.append(
-        benchmark.Timer(
-            stmt="generate_tokens(model, inputs)",
-            setup="from __main__ import generate_tokens",
-            globals={"model": model, "inputs": inputs},
-            num_threads=torch.get_num_threads(),
-            label=label,
-            sub_label="no layer skip",
-            description="generation",
-        ).blocked_autorange()
-    )
-
-    for i in range(1, model.config.num_hidden_layers):
-        results.append(
-            benchmark.Timer(
-                stmt="generate_tokens_with_assistance(model, inputs, assistant_early_exit)",
-                setup="from __main__ import generate_assistant_tokens",
-                globals={"model": model, "assistant_early_exit": i, "inputs": inputs},
-                num_threads=torch.get_num_threads(),
-                label=label,
-                sub_label=f"layer skip {i}",
-                description="generation",
-            ).blocked_autorange()
-        )
-
-    benchmark.Compare(results).print()
--- a/examples/research_projects/layer_skip/scripts/custom_trainer.py
+++ b/examples/research_projects/layer_skip/scripts/custom_trainer.py
@ -1,48 +0,0 @@
-# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from trl import SFTTrainer
-
-
-class LayerSkipSFTTrainer(SFTTrainer):
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.early_exit_layer = 0  # initialize with 0
-        self.always_last_layer = True
-        self.early_exit_loss_scale = 1.0
-
-    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
-        self.early_exit_layer = (
-            self.early_exit_layer % (model.config.num_hidden_layers - 1)
-        ) + 1  # rotates between [1, num_hidden_layers-1]
-        bs, seqlen = inputs.input_ids.shape
-
-        labels = inputs.pop("labels")
-        outputs = model(**inputs, output_hidden_states=True)
-
-        hidden_state = outputs["hidden_states"][self.early_exit_layer].to(model.dtype)
-        if self.early_exit_layer != model.config.num_hidden_layers:
-            hidden_state = model.model.norm(hidden_state)
-        logits = model.lm_head(hidden_state)
-        loss_early = model.loss_function(logits=logits, labels=labels, vocab_size=model.vocab_size)
-
-        if self.always_last_layer:
-            loss_last = model.loss_function(logits=outputs["logits"], labels=labels, vocab_size=model.vocab_size)
-            loss = self.early_exit_loss_scale * loss_early.to(loss_last.device) + 1.0 * loss_last
-            # normalize loss scales
-            loss = loss / (1.0 + self.early_exit_loss_scale)
-        else:
-            loss = loss_early
-
-        return loss
--- a/examples/research_projects/layer_skip/scripts/layer_skip_sft.py
+++ b/examples/research_projects/layer_skip/scripts/layer_skip_sft.py
@ -1,90 +0,0 @@
-# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import config
-import torch
-from custom_trainer import LayerSkipSFTTrainer
-from datasets import load_dataset
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from trl import DataCollatorForCompletionOnlyLM, SFTConfig
-
-
-def formatting_prompts_func(example):
-    text = f"### Instruction: {example['utterance']}\n ### Response: {example['semantic_parse']}"
-
-    # Inject eos_token as a string before tokenization, because they are not always added
-    # See: https://github.com/huggingface/transformers/issues/22794 and
-    # https://github.com/huggingface/trl/issues/1623
-    if tokenizer.eos_token:  # usually something like "</s>" for GPT2 or "<|endoftext|>"
-        text += f"{tokenizer.eos_token}"
-
-    return text
-
-
-if __name__ == "__main__":
-    # load the dataset
-    print("[INFO] loading the dataset...")
-    train_dataset = load_dataset(config.dataset_name, split="train")
-
-    print(f"output_root_dir: {config.output_root_dir}")
-    print(f"hub_model_id: {config.hub_model_id}")
-
-    # load the model and tokenizer
-    print("[INFO] loading the model and tokenizer...")
-    model = AutoModelForCausalLM.from_pretrained(config.model_name, device_map="auto", dtype=torch.bfloat16)
-    tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name, add_eos_token=True)
-
-    # adding pad and eos tokens if not provided in the tokenizer
-    if tokenizer.pad_token is None:
-        # Add '[PAD]' token if it doesn't exist
-        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
-        model.resize_token_embeddings(len(tokenizer))
-        model.config.pad_token_id = tokenizer.pad_token_id
-
-    if tokenizer.eos_token is None or tokenizer.eos_token == tokenizer.bos_token:
-        # Add '[EOS]' token if it doesn't exist
-        tokenizer.add_special_tokens({"eos_token": "[EOS]"})
-        model.resize_token_embeddings(len(tokenizer))
-        model.config.eos_token_id = tokenizer.eos_token_id
-
-    response_template = " ### Response:"
-    collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
-
-    args = SFTConfig(
-        do_train=True,
-        bf16=True,
-        max_seq_length=None,
-        per_device_train_batch_size=config.per_device_train_batch_size,
-        gradient_accumulation_steps=config.gradient_accumulation_steps,
-        learning_rate=config.learning_rate,
-        packing=False,
-        num_train_epochs=1.0,
-        report_to="none",
-        push_to_hub=True,
-        hub_model_id=config.hub_model_id,
-        output_dir=config.output_dir,
-        save_steps=1000,
-        save_total_limit=2,
-    )
-
-    trainer = LayerSkipSFTTrainer(
-        model,
-        train_dataset=train_dataset,
-        args=args,
-        formatting_func=formatting_prompts_func,
-        data_collator=collator,
-    )
-
-    trainer.train()
--- a/examples/research_projects/stack_llama/scripts/README.md
+++ b/examples/research_projects/stack_llama/scripts/README.md
@ -1,18 +0,0 @@
-# RLHF pipeline for the creation of StackLLaMa: a Stack exchange llama-7b model.
-There were three main steps to the training process:
-1. Supervised fine-tuning of the base llama-7b model to create llama-7b-se:
-    - `torchrun --nnodes 1  --nproc_per_node 8 examples/research_projects/stack_llama/scripts/supervised_finetuning.py --model_path=<LLAMA_MODEL_PATH> --streaming --learning_rate 1e-5 --max_steps 5000 --output_dir ./llama-se`
-2. Reward modeling using dialog pairs from the SE dataset using the llama-7b-se to create llama-7b-se-rm:
-    - `torchrun --nnodes 1  --nproc_per_node 8 examples/research_projects/stack_llama/scripts/reward_modeling.py --model_name=<LLAMA_SE_MODEL>`
-3. RL fine-tuning of llama-7b-se with the llama-7b-se-rm reward model:
-    - `accelerate launch --multi_gpu --num_machines 1  --num_processes 8 examples/research_projects/stack_llama/scripts/rl_training.py --log_with=wandb --model_name=<LLAMA_SE_MODEL> --reward_model_name=<LLAMA_SE_RM_MODEL> --adafactor=False --tokenizer_name=<LLAMA_TOKENIZER> --save_freq=100 --output_max_length=128 --batch_size=8 --gradient_accumulation_steps=8 --batched_gen=True --ppo_epochs=4 --seed=0 --learning_rate=1.4e-5 --early_stopping=True --output_dir=llama-se-rl-finetune-128-8-8-1.4e-5_adam`
-
-
-LoRA layers were using at all stages to reduce memory requirements. 
-At each stage the peft adapter layers were merged with the base model, using: 
-```shell
-python examples/research_projects/stack_llama/scripts/merge_peft_adapter.py --adapter_model_name=XXX --base_model_name=YYY --output_name=ZZZ
-```
-Note that this script requires `peft>=0.3.0`.
-
-For access to the base llama-7b model, please see Meta's [release](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) and [request form](https://docs.google.com/forms/d/e/1FAIpQLSfqNECQnMkycAp2jP4Z9TFX0cGR4uf7b_fBxjY_OjhJILlKGA/viewform).
--- a/examples/research_projects/stack_llama/scripts/merge_peft_adapter.py
+++ b/examples/research_projects/stack_llama/scripts/merge_peft_adapter.py
@ -1,60 +0,0 @@
-# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from dataclasses import dataclass, field
-from typing import Optional
-
-import torch
-from peft import PeftConfig, PeftModel
-from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer, HfArgumentParser
-
-
-@dataclass
-class ScriptArguments:
-    """
-    The input names representing the Adapter and Base model fine-tuned with PEFT, and the output name representing the
-    merged model.
-    """
-
-    adapter_model_name: Optional[str] = field(default=None, metadata={"help": "the adapter name"})
-    base_model_name: Optional[str] = field(default=None, metadata={"help": "the base model name"})
-    output_name: Optional[str] = field(default=None, metadata={"help": "the merged model name"})
-
-
-parser = HfArgumentParser(ScriptArguments)
-script_args = parser.parse_args_into_dataclasses()[0]
-assert script_args.adapter_model_name is not None, "please provide the name of the Adapter you would like to merge"
-assert script_args.base_model_name is not None, "please provide the name of the Base model"
-assert script_args.output_name is not None, "please provide the output name of the merged model"
-
-peft_config = PeftConfig.from_pretrained(script_args.adapter_model_name)
-if peft_config.task_type == "SEQ_CLS":
-    # The sequence classification task is used for the reward model in PPO
-    model = AutoModelForSequenceClassification.from_pretrained(
-        script_args.base_model_name, num_labels=1, dtype=torch.bfloat16
-    )
-else:
-    model = AutoModelForCausalLM.from_pretrained(script_args.base_model_name, return_dict=True, dtype=torch.bfloat16)
-
-tokenizer = AutoTokenizer.from_pretrained(script_args.base_model_name)
-
-# Load the PEFT model
-model = PeftModel.from_pretrained(model, script_args.adapter_model_name)
-model.eval()
-
-model = model.merge_and_unload()
-
-model.save_pretrained(f"{script_args.output_name}")
-tokenizer.save_pretrained(f"{script_args.output_name}")
-model.push_to_hub(f"{script_args.output_name}", use_temp_dir=False)
--- a/examples/research_projects/stack_llama/scripts/reward_modeling.py
+++ b/examples/research_projects/stack_llama/scripts/reward_modeling.py
@ -1,321 +0,0 @@
-# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from dataclasses import dataclass, field
-from typing import Any, Optional, Union
-
-import evaluate
-import numpy as np
-import torch
-import torch.nn as nn
-from datasets import load_dataset
-from peft import LoraConfig, TaskType, get_peft_model
-from transformers import (
-    AutoModelForSequenceClassification,
-    AutoTokenizer,
-    HfArgumentParser,
-    PreTrainedTokenizerBase,
-    Trainer,
-    TrainerCallback,
-    TrainingArguments,
-    set_seed,
-)
-from transformers.utils import PaddingStrategy
-
-
-# Define and parse arguments.
-@dataclass
-class ScriptArguments:
-    """
-    These arguments vary depending on how many GPUs you have, what their capacity and features are, and what size model you want to train.
-    """
-
-    local_rank: Optional[int] = field(default=-1, metadata={"help": "Used for multi-gpu"})
-    resume_from_checkpoint: Optional[bool] = field(
-        default=False,
-        metadata={"help": "If you want to resume training where it left off."},
-    )
-    deepspeed: Optional[str] = field(
-        default=None,
-        metadata={
-            "help": "Path to deepspeed config if using deepspeed. You may need this if the model that you want to train doesn't fit on a single GPU."
-        },
-    )
-    per_device_train_batch_size: Optional[int] = field(default=4)
-    per_device_eval_batch_size: Optional[int] = field(default=1)
-    gradient_accumulation_steps: Optional[int] = field(default=1)
-    learning_rate: Optional[float] = field(default=2e-5)
-    weight_decay: Optional[float] = field(default=0.001)
-    model_name: Optional[str] = field(
-        default="gpt2",
-        metadata={
-            "help": "The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc."
-        },
-    )
-    tokenizer_name: Optional[str] = field(
-        default=None,
-        metadata={
-            "help": "The tokenizer for your model, if left empty will use the default for your model",
-        },
-    )
-    bf16: Optional[bool] = field(
-        default=True,
-        metadata={
-            "help": "This essentially cuts the training time in half if you want to sacrifice a little precision and have a supported GPU."
-        },
-    )
-    num_train_epochs: Optional[int] = field(
-        default=1,
-        metadata={"help": "The number of training epochs for the reward model."},
-    )
-    train_subset: Optional[int] = field(
-        default=100000,
-        metadata={"help": "The size of the subset of the training data to use"},
-    )
-    eval_subset: Optional[int] = field(
-        default=50000,
-        metadata={"help": "The size of the subset of the eval data to use"},
-    )
-    gradient_checkpointing: Optional[bool] = field(
-        default=False,
-        metadata={"help": "Enables gradient checkpointing."},
-    )
-    optim: Optional[str] = field(
-        default="adamw_hf",
-        metadata={"help": "The optimizer to use."},
-    )
-    lr_scheduler_type: Optional[str] = field(
-        default="linear",
-        metadata={"help": "The lr scheduler"},
-    )
-    max_length: Optional[int] = field(default=512)
-    eval_first_step: Optional[bool] = field(
-        default=False,
-        metadata={"help": "Whether to run eval after the first step"},
-    )
-    seed: Optional[int] = field(
-        default=0, metadata={"help": "Random seed that will be set at the beginning of training."}
-    )
-
-
-parser = HfArgumentParser(ScriptArguments)
-script_args = parser.parse_args_into_dataclasses()[0]
-set_seed(script_args.seed)
-# Load the human stack-exchange-paired dataset for tuning the reward model.
-train_dataset = load_dataset(
-    "lvwerra/stack-exchange-paired", data_dir="data/reward", split="train", verification_mode="no_checks"
-)
-if script_args.train_subset > 0:
-    train_dataset = train_dataset.select(range(script_args.train_subset))
-eval_dataset = load_dataset(
-    "lvwerra/stack-exchange-paired", data_dir="data/evaluation", split="train", verification_mode="no_checks"
-)
-if script_args.eval_subset > 0:
-    eval_dataset = eval_dataset.select(range(script_args.eval_subset))
-# Define the training args. Needs to be done before the model is loaded if you are using deepspeed.
-model_name_split = script_args.model_name.split("/")[-1]
-output_name = (
-    f"{model_name_split}_peft_stack-exchange-paired_rmts__{script_args.train_subset}_{script_args.learning_rate}"
-)
-
-training_args = TrainingArguments(
-    output_dir=output_name,
-    learning_rate=script_args.learning_rate,
-    per_device_train_batch_size=script_args.per_device_train_batch_size,
-    per_device_eval_batch_size=script_args.per_device_eval_batch_size,
-    num_train_epochs=script_args.num_train_epochs,
-    weight_decay=script_args.weight_decay,
-    eval_strategy="steps",
-    eval_steps=500,
-    save_strategy="steps",
-    save_steps=500,
-    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
-    gradient_checkpointing=script_args.gradient_checkpointing,
-    deepspeed=script_args.deepspeed,
-    local_rank=script_args.local_rank,
-    remove_unused_columns=False,
-    label_names=[],
-    bf16=script_args.bf16,
-    logging_strategy="steps",
-    optim=script_args.optim,
-    lr_scheduler_type=script_args.lr_scheduler_type,
-    seed=script_args.seed,
-)
-
-
-# Load the tokenizer (the sequence-classification reward model itself is loaded further below).
-tokenizer_name = script_args.tokenizer_name if script_args.tokenizer_name is not None else script_args.model_name
-tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_auth_token=True)
-tokenizer.pad_token = tokenizer.eos_token
-
-
-peft_config = LoraConfig(
-    task_type=TaskType.SEQ_CLS,
-    inference_mode=False,
-    r=8,
-    lora_alpha=32,
-    lora_dropout=0.1,
-)
-
-model = AutoModelForSequenceClassification.from_pretrained(script_args.model_name, num_labels=1, dtype=torch.bfloat16)
-model = get_peft_model(model, peft_config)
-model.print_trainable_parameters()
-
-# Need to do this for gpt2, because it doesn't have an official pad token.
-tokenizer.pad_token = tokenizer.eos_token
-model.config.pad_token_id = tokenizer.eos_token_id
-model.config.use_cache = not script_args.gradient_checkpointing
-num_proc = 24  # Can adjust to be higher if you have more processors.
-original_columns = train_dataset.column_names
-
-
-# Turn the dataset into pairs of question + answer texts, where the `_j` fields hold the preferred question + answer and the `_k` fields hold the other.
-# Then tokenize the dataset.
-def preprocess_function(examples):
-    new_examples = {
-        "input_ids_j": [],
-        "attention_mask_j": [],
-        "input_ids_k": [],
-        "attention_mask_k": [],
-    }
-    for question, response_j, response_k in zip(examples["question"], examples["response_j"], examples["response_k"]):
-        tokenized_j = tokenizer("Question: " + question + "\n\nAnswer: " + response_j, truncation=True)
-        tokenized_k = tokenizer("Question: " + question + "\n\nAnswer: " + response_k, truncation=True)
-
-        new_examples["input_ids_j"].append(tokenized_j["input_ids"])
-        new_examples["attention_mask_j"].append(tokenized_j["attention_mask"])
-        new_examples["input_ids_k"].append(tokenized_k["input_ids"])
-        new_examples["attention_mask_k"].append(tokenized_k["attention_mask"])
-
-    return new_examples
-
-
-# preprocess the dataset and filter out QAs that are longer than script_args.max_length
-train_dataset = train_dataset.map(
-    preprocess_function,
-    batched=True,
-    num_proc=num_proc,
-    remove_columns=original_columns,
-)
-train_dataset = train_dataset.filter(
-    lambda x: len(x["input_ids_j"]) <= script_args.max_length and len(x["input_ids_k"]) <= script_args.max_length,
-    num_proc=num_proc,
-)
-
-eval_dataset = eval_dataset.map(
-    preprocess_function,
-    batched=True,
-    num_proc=num_proc,
-    remove_columns=original_columns,
-)
-eval_dataset = eval_dataset.filter(
-    lambda x: len(x["input_ids_j"]) <= script_args.max_length and len(x["input_ids_k"]) <= script_args.max_length,
-    num_proc=num_proc,
-)
-
-
-# We need to define a special data collator that batches the data in our j vs k format.
-@dataclass
-class RewardDataCollatorWithPadding:
-    tokenizer: PreTrainedTokenizerBase
-    padding: Union[bool, str, PaddingStrategy] = True
-    pad_to_multiple_of: Optional[int] = None
-    return_tensors: str = "pt"
-
-    def __call__(self, features: list[dict[str, Any]]) -> dict[str, Any]:
-        features_j = []
-        features_k = []
-        for feature in features:
-            features_j.append(
-                {
-                    "input_ids": feature["input_ids_j"],
-                    "attention_mask": feature["attention_mask_j"],
-                }
-            )
-            features_k.append(
-                {
-                    "input_ids": feature["input_ids_k"],
-                    "attention_mask": feature["attention_mask_k"],
-                }
-            )
-        batch_j = self.tokenizer.pad(
-            features_j,
-            padding=self.padding,
-            pad_to_multiple_of=self.pad_to_multiple_of,
-            return_tensors=self.return_tensors,
-        )
-        batch_k = self.tokenizer.pad(
-            features_k,
-            padding=self.padding,
-            pad_to_multiple_of=self.pad_to_multiple_of,
-            return_tensors=self.return_tensors,
-        )
-        batch = {
-            "input_ids_j": batch_j["input_ids"],
-            "attention_mask_j": batch_j["attention_mask"],
-            "input_ids_k": batch_k["input_ids"],
-            "attention_mask_k": batch_k["attention_mask"],
-            "return_loss": True,
-        }
-        return batch
-
-
-# Define the metric that we'll use for validation.
-accuracy = evaluate.load("accuracy")
-
-
-def compute_metrics(eval_pred):
-    predictions, _ = eval_pred
-    # Here, predictions is rewards_j and rewards_k.
-    # We want to see how much of the time rewards_j > rewards_k.
-    predictions = np.argmax(predictions, axis=0)
-    labels = np.zeros(predictions.shape)
-    return accuracy.compute(predictions=predictions, references=labels)
-
-
-class RewardTrainer(Trainer):
-    # Define how to compute the reward loss. We use the InstructGPT pairwise logloss: https://huggingface.co/papers/2203.02155
-    def compute_loss(self, model, inputs, return_outputs=False):
-        rewards_j = model(input_ids=inputs["input_ids_j"], attention_mask=inputs["attention_mask_j"])[0]
-        rewards_k = model(input_ids=inputs["input_ids_k"], attention_mask=inputs["attention_mask_k"])[0]
-        loss = -nn.functional.logsigmoid(rewards_j - rewards_k).mean()
-        if return_outputs:
-            return loss, {"rewards_j": rewards_j, "rewards_k": rewards_k}
-        return loss
-
-
-# Train the model, woohoo.
-trainer = RewardTrainer(
-    model=model,
-    args=training_args,
-    train_dataset=train_dataset,
-    eval_dataset=eval_dataset,
-    compute_metrics=compute_metrics,
-    data_collator=RewardDataCollatorWithPadding(tokenizer=tokenizer),
-)
-
-
-if script_args.eval_first_step:
-
-    class EvaluateFirstStepCallback(TrainerCallback):
-        def on_step_end(self, args, state, control, **kwargs):
-            if state.global_step == 1:
-                control.should_evaluate = True
-
-    trainer.add_callback(EvaluateFirstStepCallback())
-
-trainer.train(script_args.resume_from_checkpoint)
-
-print("Saving last checkpoint of the model")
-model.save_pretrained(output_name + "_peft_last_checkpoint")
--- a/examples/research_projects/stack_llama/scripts/rl_training.py
+++ b/examples/research_projects/stack_llama/scripts/rl_training.py
@ -1,270 +0,0 @@
-# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from dataclasses import dataclass, field
-from typing import Optional
-
-import torch
-from accelerate import Accelerator
-from datasets import load_dataset
-from peft import LoraConfig
-from tqdm import tqdm
-from transformers import Adafactor, AutoTokenizer, HfArgumentParser, pipeline, set_seed
-
-from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer
-from trl.core import LengthSampler
-
-
-tqdm.pandas()
-
-
-@dataclass
-class ScriptArguments:
-    """
-    The name of the Causal LM model we wish to fine-tune with PPO
-    """
-
-    # NOTE: gpt2 models use Conv1D instead of Linear layers which are not yet supported in 8 bit mode
-    # models like gpt-neo* models are more suitable.
-    model_name: Optional[str] = field(default="", metadata={"help": "the model name"})
-    tokenizer_name: Optional[str] = field(default="", metadata={"help": "the tokenizer name"})
-    reward_model_name: Optional[str] = field(default="", metadata={"help": "the reward model name"})
-    log_with: Optional[str] = field(default=None, metadata={"help": "use 'wandb' to log with wandb"})
-    learning_rate: Optional[float] = field(default=1.41e-5, metadata={"help": "the learning rate"})
-    output_max_length: Optional[int] = field(default=128, metadata={"help": "maximum length for generation"})
-    mini_batch_size: Optional[int] = field(default=1, metadata={"help": "the PPO minibatch size"})
-    batch_size: Optional[int] = field(default=32, metadata={"help": "the batch size"})
-    ppo_epochs: Optional[int] = field(default=4, metadata={"help": "the number of ppo epochs"})
-    gradient_accumulation_steps: Optional[int] = field(
-        default=4, metadata={"help": "the number of gradient accumulation steps"}
-    )
-    adafactor: Optional[bool] = field(default=False, metadata={"help": "whether to use the adafactor optimizer"})
-    early_stopping: Optional[bool] = field(default=False, metadata={"help": "whether to early stop"})
-    target_kl: Optional[float] = field(default=0.1, metadata={"help": "kl target for early stopping"})
-    reward_baseline: Optional[float] = field(
-        default=0.0,
-        metadata={"help": "a baseline value that is subtracted from the reward"},
-    )
-    batched_gen: Optional[bool] = field(default=False, metadata={"help": "whether to use the batched text gen"})
-    save_freq: Optional[int] = field(default=None, metadata={"help": "n steps to save the model"})
-    output_dir: Optional[str] = field(default="runs/", metadata={"help": "n steps to save the model"})
-    seed: Optional[int] = field(default=0, metadata={"help": "the seed"})
-    steps: Optional[int] = field(default=20000, metadata={"help": "number of epochs"})
-    init_kl_coef: Optional[float] = field(
-        default=0.2,
-        metadata={"help": "Initial KL penalty coefficient (used for adaptive and linear control)"},
-    )
-
-    adap_kl_ctrl: Optional[bool] = field(default=True, metadata={"help": "Use adaptive KL control, otherwise linear"})
-    load_in_8bit: Optional[bool] = field(default=True, metadata={"help": "whether to load the model in 8bit"})
-
-
-parser = HfArgumentParser(ScriptArguments)
-script_args: ScriptArguments = parser.parse_args_into_dataclasses()[0]
-reward_model_name = script_args.reward_model_name
-dataset_name = "lvwerra/stack-exchange-paired"
-config = PPOConfig(
-    steps=script_args.steps,
-    model_name=script_args.model_name,
-    learning_rate=script_args.learning_rate,
-    log_with=script_args.log_with,
-    batch_size=script_args.batch_size,
-    mini_batch_size=script_args.mini_batch_size,
-    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
-    optimize_device_cache=True,
-    early_stopping=script_args.early_stopping,
-    target_kl=script_args.target_kl,
-    ppo_epochs=script_args.ppo_epochs,
-    seed=script_args.seed,
-    init_kl_coef=script_args.init_kl_coef,
-    adap_kl_ctrl=script_args.adap_kl_ctrl,
-)
-
-train_dataset = load_dataset(
-    "lvwerra/stack-exchange-paired", data_dir="data/rl", split="train", verification_mode="no_checks"
-)
-train_dataset = train_dataset.select(range(100000))
-original_columns = train_dataset.column_names
-
-# We then define the arguments to pass to the sentiment analysis pipeline.
-# We set `return_all_scores` to True to get the score for every label, not only the top one.
-sent_kwargs = {
-    "return_all_scores": True,
-    "function_to_apply": "none",
-    "batch_size": 16,
-    "truncation": True,
-}
-
-tokenizer = AutoTokenizer.from_pretrained(script_args.tokenizer_name)
-# Some tokenizers (e.g. GPT-2's) do not define a pad token. When none is set,
-# fall back to using eos_token as the pad token.
-
-if getattr(tokenizer, "pad_token", None) is None:
-    tokenizer.pad_token = tokenizer.eos_token
-
-
-# Below is an example function to build the dataset. In our case, we use the
-# `lvwerra/stack-exchange-paired` dataset loaded above. One should customize this
-# function to train the model on its own dataset.
-def build_dataset(
-    tokenizer,
-    dataset_name="lvwerra/stack-exchange-paired",
-):
-    """
-    Build dataset for training. This builds the dataset from `load_dataset`, one should
-    customize this function to train the model on its own dataset.
-
-    Args:
-        tokenizer (`transformers.PreTrainedTokenizer`):
-            The tokenizer used for the model.
-        dataset_name (`str`):
-            The name of the dataset to be loaded.
-
-    Returns:
-        dataloader (`torch.utils.data.DataLoader`):
-            The dataloader for the dataset.
-    """
-
-    num_proc = 24
-
-    def preprocess_function(examples):
-        new_examples = {
-            "query": [],
-            "input_ids": [],
-        }
-        for question in examples["question"]:
-            query = "Question: " + question + "\n\nAnswer: "
-            tokenized_question = tokenizer(query, truncation=True)
-            new_examples["query"].append(query)
-            new_examples["input_ids"].append(tokenized_question["input_ids"])
-
-        return new_examples
-
-    ds = train_dataset.map(
-        preprocess_function,
-        batched=True,
-        num_proc=num_proc,
-        remove_columns=original_columns,
-    )
-    ds = ds.filter(lambda x: len(x["input_ids"]) < 512, batched=False, num_proc=num_proc)
-
-    ds.set_format(type="torch")
-    return ds
-
-
-# We retrieve the dataloader by calling the `build_dataset` function.
-dataset = build_dataset(tokenizer)
-
-
-def collator(data):
-    return {key: [d[key] for d in data] for key in data[0]}
-
-
-# set seed before initializing value head for deterministic eval
-set_seed(config.seed)
-
-# Now let's build the model, the reference model, and the tokenizer.
-current_device = Accelerator().local_process_index
-
-lora_config = LoraConfig(
-    r=16,
-    lora_alpha=32,
-    lora_dropout=0.05,
-    bias="none",
-    task_type="CAUSAL_LM",
-)
-model = AutoModelForCausalLMWithValueHead.from_pretrained(
-    config.model_name,
-    load_in_8bit=script_args.load_in_8bit,
-    device_map={"": current_device},
-    peft_config=lora_config,
-)
-
-optimizer = None
-if script_args.adafactor:
-    optimizer = Adafactor(
-        filter(lambda p: p.requires_grad, model.parameters()),
-        scale_parameter=False,
-        relative_step=False,
-        warmup_init=False,
-        lr=config.learning_rate,
-    )
-# We then build the PPOTrainer, passing the model, the reference model, the tokenizer
-ppo_trainer = PPOTrainer(
-    config,
-    model,
-    ref_model=None,
-    tokenizer=tokenizer,
-    dataset=dataset,
-    data_collator=collator,
-    optimizer=optimizer,
-)
-
-# We then build the sentiment analysis pipeline using our reward model, passing the
-# model name and the sentiment analysis pipeline arguments. Let's also make sure to
-# set the device to the same device as the PPOTrainer.
-device = ppo_trainer.accelerator.device
-if ppo_trainer.accelerator.num_processes == 1:
-    device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a ` pipeline` bug
-sentiment_pipe = pipeline(
-    "sentiment-analysis",
-    model=reward_model_name,
-    device_map={"": current_device},
-    model_kwargs={"load_in_8bit": script_args.load_in_8bit},
-    tokenizer=tokenizer,
-    return_token_type_ids=False,
-)
-
-if sentiment_pipe.model.config.pad_token_id is None:
-    sentiment_pipe.model.config.pad_token_id = sentiment_pipe.model.config.eos_token_id
-# We then define the arguments to pass to the `generate` function. These arguments
-# are passed to the `generate` function of the PPOTrainer, which is a wrapper around
-# the `generate` function of the trained model.
-generation_kwargs = {
-    # "min_length": -1,
-    "top_k": 0.0,
-    "top_p": 1.0,
-    "do_sample": True,
-    "pad_token_id": tokenizer.pad_token_id,
-    "eos_token_id": 100_000,
-}
-output_min_length = 32
-output_max_length = script_args.output_max_length
-output_length_sampler = LengthSampler(output_min_length, output_max_length)
-
-for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
-    if epoch >= config.total_ppo_epochs:
-        break
-
-    question_tensors = batch["input_ids"]
-
-    response_tensors = ppo_trainer.generate(
-        question_tensors,
-        return_prompt=False,
-        length_sampler=output_length_sampler,
-        **generation_kwargs,
-    )
-    batch["response"] = tokenizer.batch_decode(response_tensors, skip_special_tokens=True)
-
-    # Compute reward score (using the sentiment analysis pipeline)
-    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
-    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
-    rewards = [torch.tensor(output[0]["score"] - script_args.reward_baseline) for output in pipe_outputs]
-
-    # Run PPO step
-    stats = ppo_trainer.step(question_tensors, response_tensors, rewards)
-    ppo_trainer.log_stats(stats, batch, rewards)
-
-    if script_args.save_freq and epoch and epoch % script_args.save_freq == 0:
-        ppo_trainer.save_pretrained(script_args.output_dir + f"step_{epoch}")
--- a/examples/research_projects/stack_llama/scripts/supervised_finetuning.py
+++ b/examples/research_projects/stack_llama/scripts/supervised_finetuning.py
@ -1,222 +0,0 @@
-# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import os
-
-from accelerate import Accelerator
-from datasets import load_dataset
-from peft import LoraConfig
-from tqdm import tqdm
-from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, logging, set_seed
-
-from trl import SFTTrainer
-from trl.trainer import ConstantLengthDataset
-
-
-"""
-Fine-Tune Llama-7b on SE paired dataset
-"""
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model_path", type=str, default="")
-    parser.add_argument("--dataset_name", type=str, default="lvwerra/stack-exchange-paired")
-    parser.add_argument("--subset", type=str, default="data/finetune")
-    parser.add_argument("--split", type=str, default="train")
-    parser.add_argument("--size_valid_set", type=int, default=4000)
-    parser.add_argument("--streaming", action="store_true")
-    parser.add_argument("--shuffle_buffer", type=int, default=5000)
-
-    parser.add_argument("--seq_length", type=int, default=1024)
-    parser.add_argument("--max_steps", type=int, default=10000)
-    parser.add_argument("--batch_size", type=int, default=4)
-    parser.add_argument("--gradient_accumulation_steps", type=int, default=1)
-    parser.add_argument("--eos_token_id", type=int, default=49152)
-
-    parser.add_argument("--learning_rate", type=float, default=1e-4)
-    parser.add_argument("--lr_scheduler_type", type=str, default="cosine")
-    parser.add_argument("--num_warmup_steps", type=int, default=100)
-    parser.add_argument("--weight_decay", type=float, default=0.05)
-
-    parser.add_argument("--local_rank", type=int, default=0)
-    parser.add_argument("--fp16", action="store_true", default=False)
-    parser.add_argument("--bf16", action="store_true", default=False)
-    parser.add_argument("--gradient_checkpointing", action="store_true", default=False)
-    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--num_workers", type=int, default=None)
-    parser.add_argument("--output_dir", type=str, default="./checkpoints")
-    parser.add_argument("--log_freq", default=1, type=int)
-    parser.add_argument("--eval_freq", default=1000, type=int)
-    parser.add_argument("--save_freq", default=1000, type=int)
-
-    return parser.parse_args()
-
-
-def chars_token_ratio(dataset, tokenizer, nb_examples=400):
-    """
-    Estimate the average number of characters per token in the dataset.
-    """
-    total_characters, total_tokens = 0, 0
-    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
-        text = prepare_sample_text(example)
-        total_characters += len(text)
-        if tokenizer.is_fast:
-            total_tokens += len(tokenizer(text).tokens())
-        else:
-            total_tokens += len(tokenizer.tokenize(text))
-
-    return total_characters / total_tokens
-
-
-def print_trainable_parameters(model):
-    """
-    Prints the number of trainable parameters in the model.
-    """
-    trainable_params = 0
-    all_param = 0
-    for _, param in model.named_parameters():
-        all_param += param.numel()
-        if param.requires_grad:
-            trainable_params += param.numel()
-    print(
-        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
-    )
-
-
-def prepare_sample_text(example):
-    """Prepare the text from a sample of the dataset."""
-    text = f"Question: {example['question']}\n\nAnswer: {example['response_j']}"
-    return text
-
-
-def create_datasets(tokenizer, args):
-    dataset = load_dataset(
-        args.dataset_name,
-        data_dir=args.subset,
-        split=args.split,
-        use_auth_token=True,
-        num_proc=args.num_workers if not args.streaming else None,
-        streaming=args.streaming,
-    )
-    if args.streaming:
-        print("Loading the dataset in streaming mode")
-        valid_data = dataset.take(args.size_valid_set)
-        train_data = dataset.skip(args.size_valid_set)
-        train_data = train_data.shuffle(buffer_size=args.shuffle_buffer, seed=args.seed)
-    else:
-        dataset = dataset.train_test_split(test_size=0.005, seed=args.seed)
-        train_data = dataset["train"]
-        valid_data = dataset["test"]
-        print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")
-
-    chars_per_token = chars_token_ratio(train_data, tokenizer)
-    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")
-
-    train_dataset = ConstantLengthDataset(
-        tokenizer,
-        train_data,
-        formatting_func=prepare_sample_text,
-        infinite=True,
-        seq_length=args.seq_length,
-        chars_per_token=chars_per_token,
-    )
-    valid_dataset = ConstantLengthDataset(
-        tokenizer,
-        valid_data,
-        formatting_func=prepare_sample_text,
-        infinite=False,
-        seq_length=args.seq_length,
-        chars_per_token=chars_per_token,
-    )
-    return train_dataset, valid_dataset
-
-
-def run_training(args, train_data, val_data):
-    print("Loading the model")
-
-    lora_config = LoraConfig(
-        r=16,
-        lora_alpha=32,
-        lora_dropout=0.05,
-        bias="none",
-        task_type="CAUSAL_LM",
-    )
-
-    train_data.start_iteration = 0
-
-    print("Starting main loop")
-
-    training_args = TrainingArguments(
-        output_dir=args.output_dir,
-        dataloader_drop_last=True,
-        eval_strategy="steps",
-        max_steps=args.max_steps,
-        eval_steps=args.eval_freq,
-        save_steps=args.save_freq,
-        logging_steps=args.log_freq,
-        per_device_train_batch_size=args.batch_size,
-        per_device_eval_batch_size=args.batch_size,
-        learning_rate=args.learning_rate,
-        lr_scheduler_type=args.lr_scheduler_type,
-        warmup_steps=args.num_warmup_steps,
-        gradient_accumulation_steps=args.gradient_accumulation_steps,
-        gradient_checkpointing=args.gradient_checkpointing,
-        fp16=args.fp16,
-        bf16=args.bf16,
-        weight_decay=args.weight_decay,
-        run_name="llama-7b-finetuned",
-        report_to="wandb",
-        ddp_find_unused_parameters=False,
-    )
-
-    model = AutoModelForCausalLM.from_pretrained(
-        args.model_path, load_in_8bit=True, device_map={"": Accelerator().process_index}
-    )
-
-    trainer = SFTTrainer(
-        model=model,
-        args=training_args,
-        train_dataset=train_data,
-        eval_dataset=val_data,
-        peft_config=lora_config,
-        packing=True,
-    )
-
-    print_trainable_parameters(trainer.model)
-
-    print("Training...")
-    trainer.train()
-
-    print("Saving last checkpoint of the model")
-    trainer.model.save_pretrained(os.path.join(args.output_dir, "final_checkpoint/"))
-
-
-def main(args):
-    tokenizer = AutoTokenizer.from_pretrained(args.model_path)
-    train_dataset, eval_dataset = create_datasets(tokenizer, args)
-    run_training(args, train_dataset, eval_dataset)
-
-
-if __name__ == "__main__":
-    args = get_args()
-    assert args.model_path != "", "Please provide the llama model path"
-
-    set_seed(args.seed)
-    os.makedirs(args.output_dir, exist_ok=True)
-
-    logging.set_verbosity_error()
-
-    main(args)
--- a/examples/research_projects/stack_llama_2/scripts/README.md
+++ b/examples/research_projects/stack_llama_2/scripts/README.md
@ -1,75 +0,0 @@
-# DPO pipeline for the creation of StackLlaMa 2: a Stack exchange llama-v2-7b model
-
-## Prerequisites
-
-Install all the dependencies in the `requirements.txt`:
-
-```
-$ pip install -U -r requirements.txt
-```
-
-Since we will use `accelerate` for training, make sure to run:
-```
-$ accelerate config
-```
-
-## Training
-
-There were two main steps to the DPO training process:
-1. Supervised fine-tuning of the base llama-v2-7b model to create llama-v2-7b-se:
-
-    ```
-    accelerate launch examples/research_projects/stack_llama_2/scripts/sft_llama2.py \
-        --output_dir="./sft" \
-        --max_steps=500 \
-        --save_steps=10 \
-        --per_device_train_batch_size=4 \
-        --per_device_eval_batch_size=1 \
-        --gradient_accumulation_steps=2 \
-        --gradient_checkpointing=False \
-        --group_by_length=False \
-        --learning_rate=1e-4 \
-        --lr_scheduler_type="cosine" \
-        --warmup_steps=100 \
-        --weight_decay=0.05 \
-        --optim="paged_adamw_32bit" \
-        --bf16=True \
-        --remove_unused_columns=False \
-        --run_name="sft_llama2" \
-        --report_to="wandb"
-    ```
-1. Run the DPO trainer using the model saved by the previous step:
-    ```
-    accelerate launch examples/research_projects/stack_llama_2/scripts/dpo_llama2.py \
-        --model_name_or_path="sft/final_checkpoint" \
-        --output_dir="dpo"
-    ```
-
-
-## Merging the adaptors
-
-To merge the adaptors into the base model we can use the `merge_peft_adapter.py` helper script that comes with TRL:
-
-```
-python examples/research_projects/stack_llama/scripts/merge_peft_adapter.py --base_model_name="meta-llama/Llama-2-7b-hf" --adapter_model_name="dpo/final_checkpoint/" --output_name="stack-llama-2"
-```
-
-which will also push the model to your HuggingFace hub account.
-
-## Running the model
-
-We can load the DPO-trained LoRA adaptors which were saved by the DPO training step and load them via:
-
-```py
-from peft import AutoPeftModelForCausalLM
-
-
-model = AutoPeftModelForCausalLM.from_pretrained(
-    "dpo/final_checkpoint",
-    low_cpu_mem_usage=True,
-    dtype=torch.float16,
-    load_in_4bit=True,
-)
-
-model.generate(...)
-```
--- a/examples/research_projects/stack_llama_2/scripts/dpo_llama2.py
+++ b/examples/research_projects/stack_llama_2/scripts/dpo_llama2.py
@ -1,252 +0,0 @@
-# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# 0. imports
-import os
-from dataclasses import dataclass, field
-from typing import Optional
-
-import torch
-from accelerate import Accelerator
-from datasets import Dataset, load_dataset
-from peft import LoraConfig
-from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, set_seed
-
-from trl import DPOConfig, DPOTrainer
-
-
-# Define and parse arguments.
-@dataclass
-class ScriptArguments:
-    """
-    The arguments for the DPO training script.
-    """
-
-    # data parameters
-    beta: Optional[float] = field(default=0.1, metadata={"help": "the beta parameter for DPO loss"})
-
-    # training parameters
-    model_name_or_path: Optional[str] = field(
-        default="../sft/results/final_checkpoint",
-        metadata={"help": "the location of the SFT model name or path"},
-    )
-    learning_rate: Optional[float] = field(default=5e-4, metadata={"help": "optimizer learning rate"})
-    lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "the lr scheduler type"})
-    warmup_steps: Optional[int] = field(default=100, metadata={"help": "the number of warmup steps"})
-    weight_decay: Optional[float] = field(default=0.05, metadata={"help": "the weight decay"})
-    optimizer_type: Optional[str] = field(default="paged_adamw_32bit", metadata={"help": "the optimizer type"})
-
-    per_device_train_batch_size: Optional[int] = field(default=4, metadata={"help": "train batch size per device"})
-    per_device_eval_batch_size: Optional[int] = field(default=1, metadata={"help": "eval batch size per device"})
-    gradient_accumulation_steps: Optional[int] = field(
-        default=4, metadata={"help": "the number of gradient accumulation steps"}
-    )
-    gradient_checkpointing: Optional[bool] = field(
-        default=True, metadata={"help": "whether to use gradient checkpointing"}
-    )
-
-    gradient_checkpointing_use_reentrant: Optional[bool] = field(
-        default=False, metadata={"help": "whether to use reentrant for gradient checkpointing"}
-    )
-
-    lora_alpha: Optional[float] = field(default=16, metadata={"help": "the lora alpha parameter"})
-    lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"})
-    lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"})
-
-    max_prompt_length: Optional[int] = field(default=512, metadata={"help": "the maximum prompt length"})
-    max_length: Optional[int] = field(default=1024, metadata={"help": "the maximum sequence length"})
-    max_steps: Optional[int] = field(default=1000, metadata={"help": "max number of training steps"})
-    logging_steps: Optional[int] = field(default=10, metadata={"help": "the logging frequency"})
-    save_steps: Optional[int] = field(default=100, metadata={"help": "the saving frequency"})
-    eval_steps: Optional[int] = field(default=100, metadata={"help": "the evaluation frequency"})
-
-    output_dir: Optional[str] = field(default="./results", metadata={"help": "the output directory"})
-    log_freq: Optional[int] = field(default=1, metadata={"help": "the logging frequency"})
-    load_in_4bit: Optional[bool] = field(default=True, metadata={"help": "whether to load the model in 4bit"})
-    model_dtype: Optional[str] = field(
-        default="float16", metadata={"help": "model_dtype[float16, bfloat16, float] for loading."}
-    )
-
-    # instrumentation
-    report_to: Optional[str] = field(
-        default="wandb",
-        metadata={
-            "help": 'The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,'
-            '`"comet_ml"`, `"mlflow"`, `"neptune"`, `"tensorboard"`,`"clearml"` and `"wandb"`. '
-            'Use `"all"` to report to all integrations installed, `"none"` for no integrations.'
-        },
-    )
-    # debug argument for distributed training
-    ignore_bias_buffers: Optional[bool] = field(
-        default=False,
-        metadata={
-            "help": "fix for DDP issues with LM bias/mask buffers - invalid scalar type,`inplace operation. See"
-            "https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992"
-        },
-    )
-    seed: Optional[int] = field(
-        default=0, metadata={"help": "Random seed that will be set at the beginning of training."}
-    )
-
-
-def get_stack_exchange_paired(
-    data_dir: str = "data/rl",
-    cache_dir: Optional[str] = None,
-    num_proc=24,
-) -> Dataset:
-    """Load the stack-exchange-paired dataset from Hugging Face and convert it to the necessary format.
-
-    The dataset is converted to a dictionary with the following structure:
-    {
-        'prompt': list[str],
-        'chosen': list[str],
-        'rejected': list[str],
-    }
-
-    Prompts are structured as follows:
-      "Question: " + <prompt> + "\n\nAnswer: "
-    """
-    dataset = load_dataset(
-        "lvwerra/stack-exchange-paired",
-        split="train",
-        cache_dir=cache_dir,
-        data_dir=data_dir,
-        verification_mode="no_checks",
-    )
-    original_columns = dataset.column_names
-
-    def return_prompt_and_responses(samples) -> dict[str, str]:
-        return {
-            "prompt": ["Question: " + question + "\n\nAnswer: " for question in samples["question"]],
-            "chosen": samples["response_j"],
-            "rejected": samples["response_k"],
-        }
-
-    return dataset.map(
-        return_prompt_and_responses,
-        batched=True,
-        num_proc=num_proc,
-        remove_columns=original_columns,
-    )
-
-
-if __name__ == "__main__":
-    parser = HfArgumentParser(ScriptArguments)
-    script_args = parser.parse_args_into_dataclasses()[0]
-
-    set_seed(script_args.seed)
-
-    # 1. load a pretrained model
-    dtype = torch.float
-    if script_args.model_dtype == "float16":
-        dtype = torch.float16
-    elif script_args.model_dtype == "bfloat16":
-        dtype = torch.bfloat16
-
-    model = AutoModelForCausalLM.from_pretrained(
-        script_args.model_name_or_path,
-        low_cpu_mem_usage=True,
-        dtype=dtype,
-        load_in_4bit=script_args.load_in_4bit,
-        device_map={"": Accelerator().local_process_index},
-    )
-    model.config.use_cache = False
-
-    if script_args.ignore_bias_buffers:
-        # torch distributed hack
-        model._ddp_params_and_buffers_to_ignore = [
-            name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
-        ]
-
-    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
-    tokenizer.pad_token = tokenizer.eos_token
-
-    # 2. Load the Stack-exchange paired dataset
-    train_dataset = get_stack_exchange_paired(data_dir="data/rl")
-    train_dataset = train_dataset.filter(
-        lambda x: len(x["prompt"]) + len(x["chosen"]) <= script_args.max_length
-        and len(x["prompt"]) + len(x["rejected"]) <= script_args.max_length,
-        num_proc=script_args.num_proc,
-    )
-
-    # 3. Load evaluation dataset
-    eval_dataset = get_stack_exchange_paired(data_dir="data/evaluation")
-    eval_dataset = eval_dataset.filter(
-        lambda x: len(x["prompt"]) + len(x["chosen"]) <= script_args.max_length
-        and len(x["prompt"]) + len(x["rejected"]) <= script_args.max_length,
-        num_proc=script_args.num_proc,
-    )
-
-    # 4. initialize training arguments:
-    training_args = DPOConfig(
-        per_device_train_batch_size=script_args.per_device_train_batch_size,
-        per_device_eval_batch_size=script_args.per_device_eval_batch_size,
-        max_steps=script_args.max_steps,
-        logging_steps=script_args.logging_steps,
-        save_steps=script_args.save_steps,
-        gradient_accumulation_steps=script_args.gradient_accumulation_steps,
-        gradient_checkpointing=script_args.gradient_checkpointing,
-        learning_rate=script_args.learning_rate,
-        eval_strategy="steps",
-        eval_steps=script_args.eval_steps,
-        output_dir=script_args.output_dir,
-        report_to=script_args.report_to,
-        lr_scheduler_type=script_args.lr_scheduler_type,
-        warmup_steps=script_args.warmup_steps,
-        optim=script_args.optimizer_type,
-        bf16=True,
-        remove_unused_columns=False,
-        run_name="dpo_llama2",
-        gradient_checkpointing_kwargs=dict(use_reentrant=script_args.gradient_checkpointing_use_reentrant),
-        seed=script_args.seed,
-    )
-
-    peft_config = LoraConfig(
-        r=script_args.lora_r,
-        lora_alpha=script_args.lora_alpha,
-        lora_dropout=script_args.lora_dropout,
-        target_modules=[
-            "q_proj",
-            "v_proj",
-            "k_proj",
-            "out_proj",
-            "fc_in",
-            "fc_out",
-            "wte",
-        ],
-        bias="none",
-        task_type="CAUSAL_LM",
-    )
-
-    # 5. initialize the DPO trainer
-    dpo_trainer = DPOTrainer(
-        model,
-        ref_model=None,
-        args=training_args,
-        beta=script_args.beta,
-        train_dataset=train_dataset,
-        eval_dataset=eval_dataset,
-        processing_class=tokenizer,
-        peft_config=peft_config,
-        max_prompt_length=script_args.max_prompt_length,
-        max_length=script_args.max_length,
-    )
-
-    # 6. train
-    dpo_trainer.train()
-    dpo_trainer.save_model(script_args.output_dir)
-
-    # 7. save
-    output_dir = os.path.join(script_args.output_dir, "final_checkpoint")
-    dpo_trainer.model.save_pretrained(output_dir)
--- a/examples/research_projects/stack_llama_2/scripts/requirements.txt
+++ b/examples/research_projects/stack_llama_2/scripts/requirements.txt
@ -1,7 +0,0 @@
-transformers
-trl
-peft
-accelerate
-datasets
-bitsandbytes
-wandb
--- a/examples/research_projects/stack_llama_2/scripts/sft_llama2.py
+++ b/examples/research_projects/stack_llama_2/scripts/sft_llama2.py
@ -1,212 +0,0 @@
-# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-# Fine-Tune Llama2-7b on SE paired dataset
-import os
-from dataclasses import dataclass, field
-from typing import Optional
-
-import torch
-from accelerate import Accelerator
-from datasets import load_dataset
-from peft import AutoPeftModelForCausalLM, LoraConfig
-from tqdm import tqdm
-from transformers import (
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    BitsAndBytesConfig,
-    HfArgumentParser,
-    is_torch_npu_available,
-    is_torch_xpu_available,
-    set_seed,
-)
-
-from trl import SFTConfig, SFTTrainer
-from trl.trainer import ConstantLengthDataset
-
-
-@dataclass
-class ScriptArguments:
-    model_name: Optional[str] = field(default="meta-llama/Llama-2-7b-hf", metadata={"help": "the model name"})
-    dataset_name: Optional[str] = field(default="lvwerra/stack-exchange-paired", metadata={"help": "the dataset name"})
-    subset: Optional[str] = field(default="data/finetune", metadata={"help": "the subset to use"})
-    split: Optional[str] = field(default="train", metadata={"help": "the split to use"})
-    size_valid_set: Optional[int] = field(default=4000, metadata={"help": "the size of the validation set"})
-    streaming: Optional[bool] = field(default=True, metadata={"help": "whether to stream the dataset"})
-    shuffle_buffer: Optional[int] = field(default=5000, metadata={"help": "the shuffle buffer size"})
-    seq_length: Optional[int] = field(default=1024, metadata={"help": "the sequence length"})
-    num_workers: Optional[int] = field(default=4, metadata={"help": "the number of workers"})
-    use_bnb: Optional[bool] = field(default=True, metadata={"help": "whether to use BitsAndBytes"})
-
-    # LoraConfig
-    lora_alpha: Optional[float] = field(default=16, metadata={"help": "the lora alpha parameter"})
-    lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"})
-    lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"})
-
-
-parser = HfArgumentParser((ScriptArguments, SFTConfig))
-script_args, training_args = parser.parse_args_into_dataclasses()
-peft_config = LoraConfig(
-    r=script_args.lora_r,
-    lora_alpha=script_args.lora_alpha,
-    lora_dropout=script_args.lora_dropout,
-    target_modules=["q_proj", "v_proj"],
-    bias="none",
-    task_type="CAUSAL_LM",
-)
-
-if training_args.group_by_length and training_args.packing:
-    raise ValueError("Cannot use both packing and group by length")
-
-# `gradient_checkpointing` was True by default until `1f3314`, but it's actually not used.
-# `gradient_checkpointing=True` will cause `Variable._execution_engine.run_backward`.
-if training_args.gradient_checkpointing:
-    raise ValueError("gradient_checkpointing not supported")
-
-set_seed(training_args.seed)
-
-
-def chars_token_ratio(dataset, tokenizer, nb_examples=400):
-    """
-    Estimate the average number of characters per token in the dataset.
-    """
-    total_characters, total_tokens = 0, 0
-    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
-        text = prepare_sample_text(example)
-        total_characters += len(text)
-        if tokenizer.is_fast:
-            total_tokens += len(tokenizer(text).tokens())
-        else:
-            total_tokens += len(tokenizer.tokenize(text))
-
-    return total_characters / total_tokens
-
-
-def print_trainable_parameters(model):
-    """
-    Prints the number of trainable parameters in the model.
-    """
-    trainable_params = 0
-    all_param = 0
-    for _, param in model.named_parameters():
-        all_param += param.numel()
-        if param.requires_grad:
-            trainable_params += param.numel()
-    print(
-        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
-    )
-
-
-def prepare_sample_text(example):
-    """Prepare the text from a sample of the dataset."""
-    text = f"Question: {example['question']}\n\nAnswer: {example['response_j']}"
-    return text
-
-
-def create_datasets(tokenizer, args, seed=None):
-    dataset = load_dataset(
-        args.dataset_name,
-        data_dir=args.subset,
-        split=args.split,
-        use_auth_token=True,
-        num_proc=args.num_workers if not args.streaming else None,
-        streaming=args.streaming,
-    )
-    if args.streaming:
-        print("Loading the dataset in streaming mode")
-        valid_data = dataset.take(args.size_valid_set)
-        train_data = dataset.skip(args.size_valid_set)
-        train_data = train_data.shuffle(buffer_size=args.shuffle_buffer, seed=seed)
-    else:
-        dataset = dataset.train_test_split(test_size=0.005, seed=seed)
-        train_data = dataset["train"]
-        valid_data = dataset["test"]
-        print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")
-
-    chars_per_token = chars_token_ratio(train_data, tokenizer)
-    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")
-
-    train_dataset = ConstantLengthDataset(
-        tokenizer,
-        train_data,
-        formatting_func=prepare_sample_text,
-        infinite=True,
-        seq_length=args.seq_length,
-        chars_per_token=chars_per_token,
-    )
-    valid_dataset = ConstantLengthDataset(
-        tokenizer,
-        valid_data,
-        formatting_func=prepare_sample_text,
-        infinite=False,
-        seq_length=args.seq_length,
-        chars_per_token=chars_per_token,
-    )
-    return train_dataset, valid_dataset
-
-
-bnb_config = None
-if script_args.use_bnb:
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=True,
-        bnb_4bit_quant_type="nf4",
-        bnb_4bit_compute_dtype=torch.bfloat16,
-    )
-
-base_model = AutoModelForCausalLM.from_pretrained(
-    script_args.model_name,
-    quantization_config=bnb_config,
-    device_map={"": Accelerator().local_process_index},
-    trust_remote_code=True,
-    use_auth_token=True,
-)
-base_model.config.use_cache = False
-
-
-tokenizer = AutoTokenizer.from_pretrained(script_args.model_name, trust_remote_code=True)
-tokenizer.pad_token = tokenizer.eos_token
-tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training
-
-train_dataset, eval_dataset = create_datasets(tokenizer, script_args, seed=training_args.seed)
-
-trainer = SFTTrainer(
-    model=base_model,
-    train_dataset=train_dataset,
-    eval_dataset=eval_dataset,
-    peft_config=peft_config,
-    max_length=None,
-    formatting_func=prepare_sample_text,
-    processing_class=tokenizer,
-    args=training_args,
-)
-trainer.train()
-trainer.save_model(training_args.output_dir)
-
-output_dir = os.path.join(training_args.output_dir, "final_checkpoint")
-trainer.model.save_pretrained(output_dir)
-
-# Free memory for merging weights
-del base_model
-if is_torch_xpu_available():
-    torch.xpu.empty_cache()
-elif is_torch_npu_available():
-    torch.npu.empty_cache()
-else:
-    torch.cuda.empty_cache()
-
-model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", dtype=torch.bfloat16)
-model = model.merge_and_unload()
-
-output_merged_dir = os.path.join(training_args.output_dir, "final_merged_checkpoint")
-model.save_pretrained(output_merged_dir, safe_serialization=True)
--- a/examples/research_projects/toxicity/README.md
+++ b/examples/research_projects/toxicity/README.md
@ -1,7 +0,0 @@
-# De-detoxifying language models
-
-To run this code, do the following:
-
-```shell
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file {CONFIG} examples/research_projects/toxicity/scripts/gpt-j-6b-toxicity.py --log_with wandb
-```
--- a/examples/research_projects/toxicity/scripts/evaluate-toxicity.py
+++ b/examples/research_projects/toxicity/scripts/evaluate-toxicity.py
@ -1,146 +0,0 @@
-# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import csv
-
-import evaluate
-import numpy as np
-import torch
-from datasets import load_dataset
-from tqdm import tqdm
-from transformers import AutoModelForCausalLM, AutoTokenizer, is_torch_npu_available, is_torch_xpu_available
-
-
-toxicity = evaluate.load("ybelkada/toxicity", "DaNLP/da-electra-hatespeech-detection", module_type="measurement")
-ds = load_dataset("OxAISH-AL-LLM/wiki_toxic", split="test")
-
-parser = argparse.ArgumentParser(description="Evaluate de-toxified models")
-parser.add_argument("--model_type", default="all", type=str, help="Relative path to the source model folder")
-parser.add_argument("--output_file", default="toxicity.csv", type=str, help="Relative path to the source model folder")
-parser.add_argument("--batch_size", default=64, type=int, help="Batch size")
-parser.add_argument("--num_samples", default=400, type=int, help="Number of samples")
-parser.add_argument("--context_length", default=2000, type=int, help="Number of samples")
-parser.add_argument("--max_new_tokens", default=30, type=int, help="Max new tokens for generation")
-args = parser.parse_args()
-
-
-if args.model_type == "all":
-    MODELS_TO_TEST = [
-        "ybelkada/gpt-neo-125m-detox",
-        "EleutherAI/gpt-neo-125M",
-        "EleutherAI/gpt-neo-2.7B",
-        "ybelkada/gpt-neo-2.7B-detox",
-        "ybelkada/gpt-j-6b-sharded-bf16",
-        "ybelkada/gpt-j-6b-detoxs",
-    ]
-elif args.model_type == "gpt-neo":
-    MODELS_TO_TEST = [
-        "ybelkada/gpt-neo-125m-detox",
-        "EleutherAI/gpt-neo-125M",
-        "EleutherAI/gpt-neo-2.7B",
-        "ybelkada/gpt-neo-2.7B-detox",
-    ]
-elif args.model_type == "gpt-j":
-    MODELS_TO_TEST = [
-        "ybelkada/gpt-j-6b-sharded-bf16",
-        "ybelkada/gpt-j-6b-detox",
-    ]
-else:
-    MODELS_TO_TEST = [args.model_type]
-NUM_SAMPLES = args.num_samples
-BATCH_SIZE = args.batch_size
-output_file = args.output_file
-max_new_tokens = args.max_new_tokens
-context_length = args.context_length
-if is_torch_xpu_available():
-    device = torch.xpu.current_device()
-elif is_torch_npu_available():
-    device = torch.npu.current_device()
-else:
-    device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu"
-
-# consider only toxic prompts
-ds = ds.filter(lambda x: x["label"] == 1)
-
-toxicities = {}
-
-# open a csv file
-file = open(f"{output_file}", "w", newline="")
-writer = csv.writer(file)
-# add first rows
-writer.writerow(["model_id", "mean_toxicity", "std_toxicity"])
-
-
-for model_id in tqdm(MODELS_TO_TEST):
-    model = AutoModelForCausalLM.from_pretrained(model_id, device_map={"": device}, dtype=torch.bfloat16)
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    tokenizer.pad_token = tokenizer.eos_token
-    tokenizer.padding_side = "left"
-    input_texts = []
-
-    for i, example in enumerate(ds):
-        # set seed
-        torch.manual_seed(42)
-
-        input_text = example["comment_text"]
-        input_texts.append(input_text[:2000])
-
-        if i > NUM_SAMPLES:
-            break
-
-        if (i + 1) % BATCH_SIZE == 0:
-            inputs = tokenizer(input_texts, return_tensors="pt", padding=True).to(device)
-            inputs.input_ids = inputs.input_ids[:context_length]
-            inputs.attention_mask = inputs.attention_mask[:context_length]
-            outputs = model.generate(**inputs, do_sample=True, max_new_tokens=max_new_tokens, use_cache=True)
-            generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-            generated_texts = [
-                generated_text.replace(input_texts[i], "") for i, generated_text in enumerate(generated_texts)
-            ]
-            toxicity_score = toxicity.compute(predictions=generated_texts)
-            input_texts = []
-
-            if model_id not in toxicities:
-                toxicities[model_id] = []
-            toxicities[model_id].extend(toxicity_score["toxicity"])
-
-    # last batch
-    inputs = tokenizer(input_texts, return_tensors="pt", padding=True).to(device)
-    outputs = model.generate(**inputs, do_sample=True, max_new_tokens=30)
-    generated_texts = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-    generated_texts = [generated_text.replace(input_texts[i], "") for i, generated_text in enumerate(generated_texts)]
-    toxicity_score = toxicity.compute(predictions=generated_texts)
-    toxicities[model_id].extend(toxicity_score["toxicity"])
-
-    # compute mean & std using np
-    mean = np.mean(toxicities[model_id])
-    std = np.std(toxicities[model_id])
-
-    # save to file
-    writer.writerow([model_id, mean, std])
-
-    # print
-    print(f"Model: {model_id} - Mean: {mean} - Std: {std}")
-
-    model = None
-    if is_torch_xpu_available():
-        torch.xpu.empty_cache()
-    elif is_torch_npu_available():
-        torch.npu.empty_cache()
-    else:
-        torch.cuda.empty_cache()
-
-# close file
-file.close()
--- a/examples/research_projects/toxicity/scripts/gpt-j-6b-toxicity.py
+++ b/examples/research_projects/toxicity/scripts/gpt-j-6b-toxicity.py
@ -1,245 +0,0 @@
-# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from dataclasses import dataclass, field
-from typing import Optional
-
-import torch
-from datasets import load_dataset
-from torch.optim import Adam
-from tqdm import tqdm
-from transformers import (
-    AutoModelForCausalLM,
-    AutoTokenizer,
-    HfArgumentParser,
-    RobertaForSequenceClassification,
-    RobertaTokenizer,
-    set_seed,
-)
-
-from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, create_reference_model
-from trl.core import LengthSampler
-
-
-tqdm.pandas()
-
-########################################################################
-# This is a fully working simple example to use trl with accelerate.
-#
-# This example fine-tunes a GPTJ model to generate less toxic contents
-# by using allenai/real-toxicity-prompts dataset. We use PPO
-#  (proximal policy optimization) to optimize the model.
-# in any of the following settings (with the same script):
-#   - single CPU or single GPU
-#   - multi GPUS (using PyTorch distributed mode)
-#   - multi GPUS (using DeepSpeed ZeRO-Offload stages 1 & 2)
-#   - fp16 (mixed-precision) or fp32 (normal precision)
-#
-# To run it in each of these various modes, first initialize the accelerate
-# configuration with `accelerate config`
-#
-########################################################################
-
-
-# We first define the configuration of the experiment, defining the model, the dataset,
-# the training parameters, and the PPO parameters.
-# Check the default arguments in the `PPOConfig` class for more details.
-# If you want to log with tensorboard, add the kwarg
-# `project_kwargs={"logging_dir": PATH_TO_LOGS}` to the PPOConfig.
-@dataclass
-class ScriptArguments:
-    """
-    The name of the Casual LM model we wish to fine-tune with PPO
-    """
-
-    # NOTE: gpt2 models use Conv1D instead of Linear layers which are not yet supported in 8 bit mode
-    # models like gpt-neo* models are more suitable.
-    model_name: Optional[str] = field(default="ybelkada/gpt-j-6b-sharded-bf16", metadata={"help": "the model name"})
-    log_with: Optional[str] = field(default=None, metadata={"help": "use 'wandb' to log with wandb"})
-    learning_rate: Optional[float] = field(default=(1.47e-5) * 2, metadata={"help": "the learning rate"})
-    mini_batch_size: Optional[int] = field(default=4, metadata={"help": "the PPO minibatch size"})
-    batch_size: Optional[int] = field(default=16, metadata={"help": "the batch size"})
-    gradient_accumulation_steps: Optional[int] = field(
-        default=1, metadata={"help": "the number of gradient accumulation steps"}
-    )
-    model_save_path: Optional[str] = field(
-        default="./gpt-j-6B-detoxified-long-context-26-shl-1e4-final",
-        metadata={"help": "the path to save the model"},
-    )
-
-
-parser = HfArgumentParser(ScriptArguments)
-script_args = parser.parse_args_into_dataclasses()[0]
-
-config = PPOConfig(
-    model_name=script_args.model_name,
-    learning_rate=script_args.learning_rate,
-    log_with=script_args.log_with,
-    ppo_epochs=100,
-    mini_batch_size=script_args.mini_batch_size,
-    batch_size=script_args.batch_size,
-    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
-)
-
-
-# Below is an example function to build the dataset. In our case, we use the IMDB dataset
-# from the `datasets` library. One should customize this function to train the model on
-# its own dataset.
-def build_dataset(
-    config, dataset_name="allenai/real-toxicity-prompts", input_min_text_length=5, input_max_text_length=10
-):
-    """
-    Build dataset for training. This builds the dataset from `load_dataset`, one should
-    customize this function to train the model on its own dataset.
-
-    Args:
-        config (`PPOConfig`):
-            The configuration of the PPO training.
-        dataset_name (`str`):
-            The name of the dataset to be loaded.
-        input_min_text_length (`int`, defaults to 5):
-            The minimum length of the input text.
-        input_max_text_length (`int`, defaults to 10):
-            The maximum length of the input text.
-
-    Returns:
-        dataloader (`torch.utils.data.DataLoader`):
-            The dataloader for the dataset.
-    """
-    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
-    tokenizer.pad_token = tokenizer.eos_token
-
-    ds = load_dataset(dataset_name, split="train")
-
-    def filter_fn(sample):
-        toxicity = sample["prompt"]["toxicity"]
-        return toxicity is not None and toxicity > 0.3
-
-    ds = ds.filter(filter_fn, batched=False)
-
-    input_size = LengthSampler(input_min_text_length, input_max_text_length)
-
-    def tokenize(sample):
-        prompt = sample["prompt"]["text"]
-        continuation = sample["continuation"]["text"]
-
-        sample["input_ids"] = tokenizer.encode(prompt + continuation)[: input_size()]
-        sample["query"] = tokenizer.decode(sample["input_ids"])
-        return sample
-
-    ds = ds.map(tokenize, batched=False)
-    ds.set_format(type="torch")
-
-    ds = ds.train_test_split(test_size=0.2, shuffle=False)["train"]
-
-    return ds
-
-
-# We retrieve the dataloader by calling the `build_dataset` function.
-min_input_length = 30
-max_input_length = 40
-dataset = build_dataset(config, input_min_text_length=min_input_length, input_max_text_length=max_input_length)
-
-
-def collator(data):
-    return {key: [d[key] for d in data] for key in data[0]}
-
-
-# set seed before initializing value head for deterministic eval
-set_seed(config.seed)
-
-# Now let's build the model, the reference model, and the tokenizer. We first load the model
-# in bfloat16 to save memory using `transformers`.
-model = AutoModelForCausalLM.from_pretrained(config.model_name, dtype=torch.bfloat16)
-# And then we pass the loaded model to `AutoModelForCausalLMWithValueHead`.
-model = AutoModelForCausalLMWithValueHead.from_pretrained(model)
-
-# We create a reference model by sharing 20 layers
-ref_model = create_reference_model(model, num_shared_layers=20)
-
-# We make sure to use `Adam` optimizer on the model parameters that require gradients.
-optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=config.learning_rate)
-
-# GPT-2 / GPT-J tokenizer has a pad token, but it is not eos_token by default. We need to set it to eos_token.
-# only for this model.
-tokenizer = AutoTokenizer.from_pretrained(config.model_name)
-tokenizer.pad_token = tokenizer.eos_token
-
-# We then build the PPOTrainer, passing the model, the reference model, the tokenizer
-ppo_trainer = PPOTrainer(
-    config,
-    model,
-    ref_model=ref_model,
-    tokenizer=tokenizer,
-    dataset=dataset,
-    data_collator=collator,
-    optimizer=optimizer,
-)
-
-# We then build the reward pipeline, we will use the toxicity model to compute the reward.
-# We first load the toxicity model and tokenizer.
-toxicity_model_id = "facebook/roberta-hate-speech-dynabench-r4-target"
-toxicity_tokenizer = RobertaTokenizer.from_pretrained(toxicity_model_id)
-# We load the toxicity model in fp16 to save memory.
-toxicity_model = RobertaForSequenceClassification.from_pretrained(toxicity_model_id, dtype=torch.float16).to(
-    ppo_trainer.accelerator.device
-)
-
-
-# We then define the arguments to pass to the `generate` function. These arguments
-# are passed to the `generate` function of the PPOTrainer, which is a wrapper around
-# the `generate` function of the trained model.
-generation_kwargs = {
-    "min_length": -1,
-    "top_k": 0.0,
-    "top_p": 1.0,
-    "do_sample": True,
-    "pad_token_id": tokenizer.eos_token_id,
-}
-output_min_length = 20
-output_max_length = 30
-output_length_sampler = LengthSampler(output_min_length, output_max_length)
-
-model_save_path = script_args.model_save_path
-
-for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
-    query_tensors = batch["input_ids"]
-
-    # Get response from the policy model
-    response_tensors = []
-    for query in query_tensors:
-        gen_len = output_length_sampler()
-        generation_kwargs["max_new_tokens"] = gen_len
-        response = ppo_trainer.generate(query, **generation_kwargs)
-        response_tensors.append(response.squeeze()[-gen_len:])
-    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]
-
-    # Compute sentiment score
-    texts = batch["response"]
-    toxicity_inputs = toxicity_tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(
-        ppo_trainer.accelerator.device
-    )
-    logits = toxicity_model(**toxicity_inputs).logits.float()
-    toxicity_labels = (logits[:, 0]).tolist()
-
-    rewards = [torch.tensor(output) for output in toxicity_labels]
-
-    # Run PPO step
-    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
-    ppo_trainer.log_stats(stats, batch, rewards)
-
-    # Save model every 100 epochs
-    if epoch % 100 == 0:
-        if ppo_trainer.accelerator.is_main_process:
-            ppo_trainer.save_pretrained(model_save_path)
--- a/examples/scripts/dpo_vlm.py
+++ b/examples/scripts/dpo_vlm.py
@ -85,7 +85,7 @@ if __name__ == "__main__":
    script_args, training_args, model_args = parser.parse_args_and_config()

    ################
-    # Model & Tokenizer
+    # Model & Processor
    ################
    dtype = model_args.dtype if model_args.dtype in ["auto", None] else getattr(torch, model_args.dtype)

@ -117,7 +117,6 @@ if __name__ == "__main__":
    processor = AutoProcessor.from_pretrained(
        model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code, do_image_splitting=False
    )
-    tokenizer = processor.tokenizer

    # Set up the chat template
    if model.config.model_type == "idefics2":
@ -127,8 +126,6 @@ if __name__ == "__main__":
    elif model.config.model_type == "llava":
        processor.chat_template = """{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{% if message['role'] == 'user' %}USER: {% else %}ASSISTANT: {% endif %}{% for item in message['content'] %}{% if item['type'] == 'text' %}{{ item['text'] }}{% elif item['type'] == 'image' %}<image>{% endif %}{% endfor %}{% if message['role'] == 'user' %} {% else %}{{eos_token}}{% endif %}{% endfor %}{% if add_generation_prompt %}ASSISTANT: {% endif %}"""

-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
    if script_args.ignore_bias_buffers:
        # torch distributed hack
        model._ddp_params_and_buffers_to_ignore = [
@ -153,7 +150,6 @@ if __name__ == "__main__":
        args=training_args,
        train_dataset=dataset[script_args.dataset_train_split],
        eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
-        processing_class=processor,
        peft_config=peft_config,
    )

--- a/examples/scripts/grpo_vlm.py
+++ b/examples/scripts/grpo_vlm.py
@ -94,7 +94,7 @@ if __name__ == "__main__":
    parser = TrlParser((ScriptArguments, GRPOConfig, ModelConfig))
    script_args, training_args, model_args = parser.parse_args_and_config()
    ################
-    # Model & Processor
+    # Model
    ################
    dtype = model_args.dtype if model_args.dtype in ["auto", None] else getattr(torch, model_args.dtype)
    training_args.model_init_kwargs = dict(
--- a/examples/scripts/gspo_vlm.py
+++ b/examples/scripts/gspo_vlm.py
@ -81,7 +81,7 @@ if __name__ == "__main__":
    parser = TrlParser((ScriptArguments, GRPOConfig, ModelConfig))
    script_args, training_args, model_args = parser.parse_args_and_config()
    ################
-    # Model & Processor
+    # Model
    ################
    dtype = model_args.dtype if model_args.dtype in ["auto", None] else getattr(torch, model_args.dtype)
    training_args.model_init_kwargs = dict(
--- a/examples/scripts/mpo_vlm.py
+++ b/examples/scripts/mpo_vlm.py
@ -46,7 +46,7 @@ import os
 import torch
 from datasets import load_dataset
 from PIL import Image
-from transformers import AutoModelForImageTextToText, AutoProcessor
+from transformers import AutoModelForImageTextToText

 from trl import (
    DPOConfig,
@ -97,9 +97,6 @@ if __name__ == "__main__":
        )
    else:
        ref_model = None
-    processor = AutoProcessor.from_pretrained(
-        model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code
-    )

    ################
    # Dataset
@ -135,7 +132,6 @@ if __name__ == "__main__":
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
-        processing_class=processor,
        peft_config=peft_config,
    )

--- a/examples/scripts/reward_modeling.py
+++ b/examples/scripts/reward_modeling.py
@ -57,7 +57,7 @@ import os
 import torch
 from accelerate import logging
 from datasets import load_dataset
-from transformers import AutoModelForSequenceClassification, AutoTokenizer, HfArgumentParser
+from transformers import AutoModelForSequenceClassification, HfArgumentParser

 from trl import (
    ModelConfig,
@ -67,7 +67,6 @@ from trl import (
    get_kbit_device_map,
    get_peft_config,
    get_quantization_config,
-    setup_chat_format,
 )


@ -97,18 +96,9 @@ if __name__ == "__main__":
        model_kwargs["device_map"] = get_kbit_device_map()
        model_kwargs["quantization_config"] = quantization_config

-    tokenizer = AutoTokenizer.from_pretrained(
-        model_args.model_name_or_path, trust_remote_code=model_args.trust_remote_code, use_fast=True
-    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path, num_labels=1, trust_remote_code=model_args.trust_remote_code, **model_kwargs
    )
-    # Align padding tokens between tokenizer and model
-    model.config.pad_token_id = tokenizer.pad_token_id
-
-    # If post-training a base model, use ChatML as the default template
-    if tokenizer.chat_template is None:
-        model, tokenizer = setup_chat_format(model, tokenizer)

    if model_args.use_peft and model_args.lora_task_type != "SEQ_CLS":
        logger.warning(
@ -126,7 +116,6 @@ if __name__ == "__main__":
    ##########
    trainer = RewardTrainer(
        model=model,
-        processing_class=tokenizer,
        args=training_args,
        train_dataset=dataset[script_args.dataset_train_split],
        eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
--- a/examples/scripts/rloo_vlm.py
+++ b/examples/scripts/rloo_vlm.py
@ -94,7 +94,7 @@ if __name__ == "__main__":
    parser = TrlParser((ScriptArguments, RLOOConfig, ModelConfig))
    script_args, training_args, model_args = parser.parse_args_and_config()
    ################
-    # Model & Processor
+    # Model
    ################
    dtype = model_args.dtype if model_args.dtype in ["auto", None] else getattr(torch, model_args.dtype)
    training_args.model_init_kwargs = dict(
--- a/examples/scripts/sft_gpt_oss.py
+++ b/examples/scripts/sft_gpt_oss.py
@ -52,7 +52,7 @@ accelerate launch \
 import os

 from datasets import load_dataset
-from transformers import AutoModelForCausalLM, AutoTokenizer, Mxfp4Config
+from transformers import AutoModelForCausalLM, Mxfp4Config

 from trl import ModelConfig, ScriptArguments, SFTConfig, SFTTrainer, TrlParser, get_peft_config

@ -62,7 +62,7 @@ os.environ.setdefault("TRACKIO_SPACE_ID", "trl-trackio")


 def main(script_args, training_args, model_args):
-    # Load model & tokenizer
+    # Load model
    quantization_config = Mxfp4Config(dequantize=True)
    model_kwargs = dict(
        revision=model_args.model_revision,
@ -75,8 +75,6 @@ def main(script_args, training_args, model_args):

    model = AutoModelForCausalLM.from_pretrained(model_args.model_name_or_path, **model_kwargs)

-    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path)
-
    # Load dataset
    dataset = load_dataset(script_args.dataset_name, name=script_args.dataset_config)

@ -86,7 +84,6 @@ def main(script_args, training_args, model_args):
        args=training_args,
        train_dataset=dataset[script_args.dataset_train_split],
        eval_dataset=dataset[script_args.dataset_test_split] if training_args.eval_strategy != "no" else None,
-        processing_class=tokenizer,
        peft_config=get_peft_config(model_args),
    )

--- a/examples/scripts/sft_video_llm.py
+++ b/examples/scripts/sft_video_llm.py
@ -248,8 +248,6 @@ if __name__ == "__main__":
    trainer.save_model(training_args.output_dir)
    if training_args.push_to_hub:
        trainer.push_to_hub(dataset_name=script_args.dataset_name)
-        if trainer.accelerator.is_main_process:
-            processor.push_to_hub(training_args.hub_model_id)

    # Cleanup
    del model
--- a/examples/scripts/sft_vlm.py
+++ b/examples/scripts/sft_vlm.py
@ -82,7 +82,7 @@ if __name__ == "__main__":
    training_args.max_length = None

    ################
-    # Model, Tokenizer & Processor
+    # Model
    ################
    dtype = model_args.dtype if model_args.dtype in ["auto", None] else getattr(torch, model_args.dtype)
    model_kwargs = dict(
--- a/examples/scripts/sft_vlm_gemma3.py
+++ b/examples/scripts/sft_vlm_gemma3.py
@ -147,7 +147,7 @@ def main():
    training_args.max_length = None

    ################
-    # Model, Tokenizer & Processor
+    # Model
    ################
    dtype = model_args.dtype if model_args.dtype in ["auto", None] else getattr(torch, model_args.dtype)
    model_kwargs = dict(
--- a/pyproject.toml
+++ b/pyproject.toml
@ -1,3 +1,135 @@
+[build-system]
+requires = ["setuptools >= 77.0.3"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "trl"
+description = "Train transformer language models with reinforcement learning."
+authors = [
+    { name = "Leandro von Werra", email = "leandro.vonwerra@gmail.com" }
+]
+readme = { file = "README.md", content-type = "text/markdown" }
+license = "Apache-2.0"
+license-files = ["LICENSE"]
+keywords = [
+    "transformers", "huggingface", "language modeling", "post-training", "rlhf", "sft", "dpo", "grpo"
+]
+classifiers = [
+    "Development Status :: 2 - Pre-Alpha",
+    "Intended Audience :: Developers",
+    "Intended Audience :: Science/Research",
+    "Natural Language :: English",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.9",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13"
+]
+requires-python = ">=3.9"
+dependencies = [
+    "accelerate>=1.4.0",
+    "datasets>=3.0.0",
+    "transformers>=4.56.1",
+    "transformers!=4.57.0; python_version == '3.9'"
+]
+dynamic = ["version"]
+
+[project.urls]
+Homepage = "https://github.com/huggingface/trl"
+
+[project.scripts]
+trl = "trl.cli:main"
+
+[project.optional-dependencies]
+bco = [
+    "scikit-learn",
+    "joblib"
+]
+deepspeed = [
+    "deepspeed>=0.14.4"
+]
+judges = [
+    "openai>=1.23.2",
+    "llm-blender>=0.0.2"
+]
+liger = [
+    "liger-kernel>=0.6.2"
+]
+peft = [
+    "peft>=0.8.0"
+]
+quality = [
+    "pre-commit",
+    "hf-doc-builder"
+]
+quantization = [
+    "bitsandbytes"
+]
+scikit = [
+    "scikit-learn"
+]
+test = [
+    "parameterized",
+    "pytest-cov",
+    "pytest-rerunfailures==15.1",
+    "pytest-xdist",
+    "pytest"
+]
+vllm = [
+    "vllm==0.10.2",
+    "fastapi",
+    "pydantic",
+    "requests",
+    "uvicorn"
+]
+vlm = [
+    "Pillow",
+    "torchvision",
+    "num2words==0.5.14"
+]
+dev = [
+    # bco
+    "scikit-learn",
+    "joblib",
+    # deepspeed
+    "deepspeed>=0.14.4",
+    # judges
+    "openai>=1.23.2",
+    "llm-blender>=0.0.2",
+    # liger
+    "liger-kernel>=0.6.2",
+    # peft
+    "peft>=0.8.0",
+    # quality
+    "pre-commit",
+    "hf-doc-builder",
+    # quantization
+    "bitsandbytes",
+    # scikit: included in bco
+    # test
+    "parameterized",
+    "pytest-cov",
+    "pytest-rerunfailures==15.1",
+    "pytest-xdist",
+    "pytest",
+    # vllm: not included in dev by default due to CUDA error; see GH-4228
+    # vlm
+    "Pillow",
+    "torchvision",
+    "num2words==0.5.14"
+]
+
+[tool.setuptools]
+package-dir = {"trl" = "trl"}
+
+[tool.setuptools.dynamic]
+version = { file = "VERSION" }
+
+[tool.coverage.run]
+branch = true
+
 [tool.ruff]
 target-version = "py39"
 line-length = 119
--- a/scripts/generate_tiny_models.py
+++ b/scripts/generate_tiny_models.py
@ -155,7 +155,6 @@ def init_weights_tiny_model(model):
 for model_id, config_class, model_class, suffix in [
    ("bigscience/bloomz-560m", BloomConfig, BloomForCausalLM, None),
    ("CohereForAI/aya-expanse-8b", CohereConfig, CohereForCausalLM, None),
-    ("databricks/dbrx-instruct", DbrxConfig, DbrxForCausalLM, None),
    ("deepseek-ai/DeepSeek-R1", DeepseekV3Config, DeepseekV3ForCausalLM, None),
    # It's important to have R1-0528 as it doesn't have the same chat template
    ("deepseek-ai/DeepSeek-R1-0528", DeepseekV3Config, DeepseekV3ForCausalLM, "0528"),
@ -209,6 +208,17 @@ for model_id, config_class, model_class, suffix in [
    init_weights_tiny_model(model)
    push_to_hub(model, tokenizer, "tiny", suffix)

+# Special case for databricks/dbrx-instruct as it requires specific changes in the config
+model_id = "databricks/dbrx-instruct"
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+config = DbrxConfig.from_pretrained(model_id, n_layers=2, n_heads=16, d_model=24)
+# transformers mistakenly ignores ffn_config keys when loading from pretrained. We need to set them manually after
+# loading the config
+config.ffn_config.ffn_hidden_size = 24
+config.ffn_config.hidden_size = 24
+model = DbrxForCausalLM(config).to(dtype=torch.bfloat16)
+init_weights_tiny_model(model)
+push_to_hub(model, tokenizer, "tiny")

 # Two slightly bigger models, required for vLLM testing
 tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-32B-Instruct")
--- a/scripts/log_example_reports.py
+++ b/scripts/log_example_reports.py
@ -1,158 +0,0 @@
-# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import logging
-import os
-from datetime import date
-
-from tabulate import tabulate
-
-
-MAX_LEN_MESSAGE = 2900  # slack endpoint has a limit of 3001 characters
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--slack_channel_name", default="trl-push-examples-ci")
-parser.add_argument("--text_file_name", required=True)
-
-
-def main(text_file_name, slack_channel_name=None):
-    logging.basicConfig(level=logging.INFO)
-    logger = logging.getLogger(__name__)
-
-    message = ""
-
-    if os.path.isfile(text_file_name):
-        final_results = {}
-
-        try:
-            with open(text_file_name) as file:
-                for line in file:
-                    result, config_name = line.strip().split(",")
-                    config_name = config_name.split("/")[-1].split(".yaml")[0]
-                    final_results[config_name] = int(result)
-        except Exception as e:
-            logger.error(f"Error reading file {text_file_name}: {str(e)}")
-            final_results = {}
-
-        no_error_payload = {
-            "type": "section",
-            "text": {
-                "type": "plain_text",
-                "text": "🌞 There were no failures on the example tests!"
-                if not len(final_results) == 0
-                else "Something went wrong there is at least one empty file - please check GH action results.",
-                "emoji": True,
-            },
-        }
-
-        total_num_failed = sum(final_results.values())
-    else:
-        no_error_payload = {
-            "type": "section",
-            "text": {
-                "type": "plain_text",
-                "text": "❌ Something is wrong with the workflow please check ASAP!"
-                "Something went wrong there is no text file being produced. Please check ASAP.",
-                "emoji": True,
-            },
-        }
-
-        total_num_failed = 0
-
-    test_type_name = text_file_name.replace(".txt", "").replace("temp_results_", "").replace("_", " ").title()
-
-    payload = [
-        {
-            "type": "header",
-            "text": {
-                "type": "plain_text",
-                "text": "🤗 Results of the {} TRL {} example tests.".format(
-                    os.environ.get("TEST_TYPE", ""), test_type_name
-                ),
-            },
-        },
-    ]
-
-    if total_num_failed > 0:
-        message += f"{total_num_failed} failed tests for example tests!"
-
-        for test_name, failed in final_results.items():
-            failed_table = tabulate(
-                [[test_name, "✅" if not failed else "❌"]],
-                headers=["Test Name", "Status"],
-                showindex="always",
-                tablefmt="grid",
-                maxcolwidths=[12],
-            )
-            message += "\n```\n" + failed_table + "\n```"
-
-        print(f"### {message}")
-    else:
-        payload.append(no_error_payload)
-
-    if os.environ.get("TEST_TYPE", "") != "":
-        try:
-            from slack_sdk import WebClient
-        except ImportError:
-            logger.error("slack_sdk is not installed. Please install it to use Slack integration.")
-            return
-
-        if len(message) > MAX_LEN_MESSAGE:
-            print(f"Truncating long message from {len(message)} to {MAX_LEN_MESSAGE}")
-            message = message[:MAX_LEN_MESSAGE] + "..."
-
-        if len(message) != 0:
-            md_report = {
-                "type": "section",
-                "text": {"type": "mrkdwn", "text": message},
-            }
-            payload.append(md_report)
-            action_button = {
-                "type": "section",
-                "text": {"type": "mrkdwn", "text": "*For more details:*"},
-                "accessory": {
-                    "type": "button",
-                    "text": {"type": "plain_text", "text": "Check Action results", "emoji": True},
-                    "url": f"https://github.com/huggingface/trl/actions/runs/{os.environ['GITHUB_RUN_ID']}",
-                },
-            }
-            payload.append(action_button)
-
-        date_report = {
-            "type": "context",
-            "elements": [
-                {
-                    "type": "plain_text",
-                    "text": f"On Push - main {os.environ.get('TEST_TYPE')} test results for {date.today()}",
-                },
-            ],
-        }
-        payload.append(date_report)
-
-        print(payload)
-
-        try:
-            client = WebClient(token=os.environ.get("SLACK_API_TOKEN"))
-            response = client.chat_postMessage(channel=f"#{slack_channel_name}", text=message, blocks=payload)
-            if response["ok"]:
-                logger.info("Message sent successfully to Slack.")
-            else:
-                logger.error(f"Failed to send message to Slack: {response['error']}")
-        except Exception as e:
-            logger.error(f"Error sending message to Slack: {str(e)}")
-
-    if __name__ == "__main__":
-        args = parser.parse_args()
-        main(args.text_file_name, args.slack_channel_name)
--- a/setup.cfg
+++ b/setup.cfg
@ -1,92 +0,0 @@
-[metadata]
-name = trl
-version = file: VERSION
-description = Train transformer language models with reinforcement learning.
-long_description = file: README.md
-long_description_content_type = text/markdown
-author = Leandro von Werra
-author_email = leandro.vonwerra@gmail.com
-url = https://github.com/huggingface/trl
-keywords = transformers, huggingface, language modeling, post-training, rlhf, sft, dpo, grpo
-license_file = LICENSE
-classifiers =
-    Development Status :: 2 - Pre-Alpha
-    Intended Audience :: Developers
-    Intended Audience :: Science/Research
-    Natural Language :: English
-    Operating System :: OS Independent
-    Programming Language :: Python :: 3
-    Programming Language :: Python :: 3.9
-    Programming Language :: Python :: 3.10
-    Programming Language :: Python :: 3.11
-    Programming Language :: Python :: 3.12
-    Programming Language :: Python :: 3.13
-
-[options]
-packages = find_namespace:
-python_requires = >=3.9
-include_package_data = True
-install_requires =
-    accelerate>=1.4.0
-    datasets>=3.0.0
-    transformers>=4.56.1
-
-[options.packages.find]
-exclude =
-    tests*
-
-[options.extras_require]
-bco =
-    scikit-learn
-    joblib
-deepspeed =
-    deepspeed>=0.14.4
-judges =
-    openai>=1.23.2
-    llm-blender>=0.0.2
-liger =
-    liger-kernel>=0.6.2
-peft =
-    peft>=0.8.0
-quality =
-    pre-commit
-    hf-doc-builder
-quantization =
-    bitsandbytes
-scikit =
-    scikit-learn
-test =
-    parameterized
-    pytest-cov
-    pytest-rerunfailures==15.1
-    pytest-xdist
-    pytest
-vllm =
-    vllm>=0.10.0,<=0.10.2
-    fastapi
-    pydantic
-    requests
-    uvicorn
-
-vlm =
-    Pillow
-    torchvision
-    num2words==0.5.14
-dev =
-    %(bco)s
-    %(deepspeed)s
-    %(judges)s
-    %(liger)s
-    %(peft)s
-    %(quality)s
-    %(quantization)s
-    %(scikit)s
-    %(test)s
-    %(vlm)s
-
-[options.entry_points]
-console_scripts =
-    trl = trl.cli:main
-
-[coverage:run]
-branch = True
--- a/setup.py
+++ b/setup.py
@ -1,18 +0,0 @@
-# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from setuptools import setup
-
-
-setup()
--- a/examples/research_projects/layer_skip/scripts/config.py
+++ b/examples/research_projects/layer_skip/scripts/config.py
@ -12,17 +12,23 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from huggingface_hub import whoami
+import gc
+
+import pytest
+import torch


-model_name = "unsloth/Llama-3.2-3B"
-tokenizer_name = "unsloth/Llama-3.2-3B"
-dataset_name = "WillHeld/top_v2"
+@pytest.fixture(autouse=True)
+def cleanup_gpu():
+    """
+    Automatically clean up GPU memory after each test.

-output_root_dir = "./checkpoints/"
-hub_model_id = f"{whoami()['name']}/layerskip-{model_name.split('/')[1]}-{dataset_name.split('/')[1]}"
-output_dir = f"{output_root_dir}/{hub_model_id}"
-
-per_device_train_batch_size = 8
-gradient_accumulation_steps = 1
-learning_rate = 2e-5
+    This fixture helps prevent CUDA out of memory errors when running tests in parallel with pytest-xdist by ensuring
+    models and tensors are properly garbage collected and GPU memory caches are cleared between tests.
+    """
+    yield
+    # Cleanup after test
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
--- a/tests/slow/test_dpo_slow.py
+++ b/tests/slow/test_dpo_slow.py
@ -21,12 +21,12 @@ from accelerate.utils.memory import release_memory
 from datasets import load_dataset
 from parameterized import parameterized
 from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
-from transformers.testing_utils import backend_empty_cache, require_peft, require_torch_accelerator, torch_device
+from transformers.testing_utils import backend_empty_cache, require_torch_accelerator, torch_device
 from transformers.utils import is_peft_available

 from trl import DPOConfig, DPOTrainer

-from ..testing_utils import TrlTestCase, require_bitsandbytes
+from ..testing_utils import TrlTestCase, require_bitsandbytes, require_peft
 from .testing_constants import DPO_LOSS_TYPES, DPO_PRECOMPUTE_LOGITS, GRADIENT_CHECKPOINTING_KWARGS, MODELS_TO_TEST


@ -37,9 +37,8 @@ if is_peft_available():
@pytest.mark.slow
@require_torch_accelerator
@require_peft
-class DPOTrainerSlowTester(TrlTestCase):
-    def setUp(self):
-        super().setUp()
+class TestDPOTrainerSlow(TrlTestCase):
+    def setup_method(self):
        self.dataset = load_dataset("trl-internal-testing/zen", "standard_preference")
        self.peft_config = LoraConfig(
            lora_alpha=16,
@ -50,11 +49,10 @@ class DPOTrainerSlowTester(TrlTestCase):
        )
        self.max_length = 128

-    def tearDown(self):
+    def teardown_method(self):
+        """
+        Free accelerator memory after each test.
+
+        Runs a garbage collection pass, asks the backend for `torch_device` to empty its cache, then collects again.
+        NOTE(review): the second `gc.collect()` presumably catches objects released by the cache flush - confirm.
+        """
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()
-        super().tearDown()

    @parameterized.expand(list(itertools.product(MODELS_TO_TEST, DPO_LOSS_TYPES, DPO_PRECOMPUTE_LOGITS)))
    def test_dpo_bare_model(self, model_id, loss_type, pre_compute_logits):
@ -151,8 +149,8 @@ class DPOTrainerSlowTester(TrlTestCase):
            peft_config=self.peft_config,
        )

-        self.assertIsInstance(trainer.model, PeftModel)
-        self.assertIsNone(trainer.ref_model)
+        assert isinstance(trainer.model, PeftModel)
+        assert trainer.ref_model is None

        # train the model
        trainer.train()
@ -215,8 +213,8 @@ class DPOTrainerSlowTester(TrlTestCase):
            peft_config=self.peft_config,
        )

-        self.assertIsInstance(trainer.model, PeftModel)
-        self.assertIsNone(trainer.ref_model)
+        assert isinstance(trainer.model, PeftModel)
+        assert trainer.ref_model is None

        # train the model
        trainer.train()
--- a/tests/slow/test_grpo_slow.py
+++ b/tests/slow/test_grpo_slow.py
@ -35,7 +35,6 @@ from transformers.testing_utils import (
    backend_empty_cache,
    require_flash_attn,
    require_liger_kernel,
-    require_peft,
    require_torch_accelerator,
    torch_device,
 )
@ -44,7 +43,7 @@ from transformers.utils import is_peft_available
 from trl import GRPOConfig, GRPOTrainer
 from trl.trainer.utils import get_kbit_device_map

-from ..testing_utils import TrlTestCase, require_bitsandbytes, require_vllm
+from ..testing_utils import TrlTestCase, require_bitsandbytes, require_peft, require_vllm
 from .testing_constants import MODELS_TO_TEST


@ -54,18 +53,16 @@ if is_peft_available():

@pytest.mark.slow
@require_torch_accelerator
-class GRPOTrainerSlowTester(TrlTestCase):
-    def setUp(self):
-        super().setUp()
+class TestGRPOTrainerSlow(TrlTestCase):
+    def setup_method(self):
        self.train_dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")
        self.eval_dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="test")
        self.max_length = 128

-    def tearDown(self):
+    def teardown_method(self):
+        """
+        Free accelerator memory after each test.
+
+        Runs a garbage collection pass, asks the backend for `torch_device` to empty its cache, then collects again.
+        NOTE(review): the second `gc.collect()` presumably catches objects released by the cache flush - confirm.
+        """
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()
-        super().tearDown()

    @parameterized.expand(MODELS_TO_TEST)
    @require_liger_kernel
@ -103,7 +100,7 @@ class GRPOTrainerSlowTester(TrlTestCase):

        for n, param in previous_trainable_params.items():
            new_param = model.get_parameter(n)
-            self.assertFalse(torch.equal(param, new_param), f"Parameter {n} has not changed.")
+            assert not torch.equal(param, new_param), f"Parameter {n} has not changed."

        release_memory(model, trainer)

@ -121,6 +118,7 @@ class GRPOTrainerSlowTester(TrlTestCase):
            max_completion_length=self.max_length,
            report_to="none",
            logging_strategy="no",
+            loss_type="bnpo",  # liger-kernel does not support "dapo" default; see https://github.com/linkedin/Liger-Kernel/issues/620
        )

        model = AutoModelForCausalLM.from_pretrained(model_name)
@ -153,20 +151,20 @@ class GRPOTrainerSlowTester(TrlTestCase):
        # Verify PEFT adapter is properly initialized
        from peft import PeftModel

-        self.assertTrue(isinstance(trainer.model, PeftModel), "Model should be wrapped with PEFT")
+        assert isinstance(trainer.model, PeftModel), "Model should be wrapped with PEFT"

        # Store adapter weights before training
        previous_trainable_params = {
            n: param.clone() for n, param in trainer.model.named_parameters() if param.requires_grad
        }
-        self.assertTrue(len(previous_trainable_params) > 0, "No trainable parameters found in PEFT model")
+        assert len(previous_trainable_params) > 0, "No trainable parameters found in PEFT model"

        trainer.train()

        # Verify adapter weights have changed after training
        for n, param in previous_trainable_params.items():
            new_param = trainer.model.get_parameter(n)
-            self.assertFalse(torch.equal(param, new_param), f"Parameter {n} has not changed.")
+            assert not torch.equal(param, new_param), f"Parameter {n} has not changed."

        release_memory(model, trainer)

@ -199,23 +197,23 @@ class GRPOTrainerSlowTester(TrlTestCase):

        trainer.train()

-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Check that the params have changed
        for n, param in previous_trainable_params.items():
            new_param = model.get_parameter(n)
-            self.assertFalse(torch.equal(param, new_param), f"Parameter {n} has not changed.")
+            assert not torch.equal(param, new_param), f"Parameter {n} has not changed."

        release_memory(model, trainer)

-    @require_flash_attn
-    @require_bitsandbytes
-    @require_peft
    @parameterized.expand(
        [
            ("HuggingFaceTB/SmolVLM-Instruct",),  # Only test the smaller model to avoid OOM
        ]
    )
+    @require_flash_attn
+    @require_bitsandbytes
+    @require_peft
    def test_vlm_training(self, model_name):
        """
        Test VLM training with aggressive memory optimization.
@ -310,13 +308,13 @@ class GRPOTrainerSlowTester(TrlTestCase):
                peft_config=lora_config,
            )

-            self.assertIsInstance(trainer.model, PeftModel)
+            assert isinstance(trainer.model, PeftModel)

            previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}

            trainer.train()

-            self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+            assert trainer.state.log_history[-1]["train_loss"] is not None

            # Check that LoRA parameters have changed
            # For VLM models, we're more permissive about which parameters can change
@ -328,7 +326,7 @@ class GRPOTrainerSlowTester(TrlTestCase):
                        lora_params_changed = True

            # At least some LoRA parameters should have changed during training
-            self.assertTrue(lora_params_changed, "No LoRA parameters were updated during training.")
+            assert lora_params_changed, "No LoRA parameters were updated during training."

        except torch.OutOfMemoryError as e:
            self.skipTest(f"Skipping VLM training test due to insufficient GPU memory: {e}")
@ -378,8 +376,8 @@ class GRPOTrainerSlowTester(TrlTestCase):
        processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM-Instruct", use_fast=True, padding_side="left")

        # Verify processor has both required attributes for VLM detection
-        self.assertTrue(hasattr(processor, "tokenizer"))
-        self.assertTrue(hasattr(processor, "image_processor"))
+        assert hasattr(processor, "tokenizer")
+        assert hasattr(processor, "image_processor")

        def dummy_reward_func(completions, **kwargs):
            return [1.0] * len(completions)
@ -438,16 +436,14 @@ class GRPOTrainerSlowTester(TrlTestCase):
                    )

                    # Should detect VLM processor correctly and allow vLLM
-                    self.assertTrue(trainer.use_vllm, "vLLM should be enabled for VLM processors in colocate mode")
-                    self.assertEqual(trainer.vllm_mode, "colocate", "Should use colocate mode")
+                    assert trainer.use_vllm, "vLLM should be enabled for VLM processors in colocate mode"
+                    assert trainer.vllm_mode == "colocate", "Should use colocate mode"

                    # Check if signature columns were set properly
                    if trainer._signature_columns is not None:
                        # Should include 'image' in signature columns for VLM processors
-                        self.assertIn(
-                            "image",
-                            trainer._signature_columns,
-                            "Should include 'image' in signature columns for VLM",
+                        assert "image" in trainer._signature_columns, (
+                            "Should include 'image' in signature columns for VLM"
                        )

                    # Should not emit any warnings about VLM incompatibility
@ -457,10 +453,8 @@ class GRPOTrainerSlowTester(TrlTestCase):
                        if "does not support VLMs" in str(w_item.message)
                        or "not compatible" in str(w_item.message).lower()
                    ]
-                    self.assertEqual(
-                        len(incompatibility_warnings),
-                        0,
-                        f"Should not emit VLM incompatibility warnings, but got: {incompatibility_warnings}",
+                    assert len(incompatibility_warnings) == 0, (
+                        f"Should not emit VLM incompatibility warnings, but got: {incompatibility_warnings}"
                    )

                    # Test passes if we get this far without exceptions
@ -525,12 +519,12 @@ class GRPOTrainerSlowTester(TrlTestCase):

            trainer.train()

-            self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+            assert trainer.state.log_history[-1]["train_loss"] is not None

            # Check that the params have changed
            for n, param in previous_trainable_params.items():
                new_param = trainer.model.get_parameter(n)
-                self.assertFalse(torch.equal(param, new_param), f"Parameter {n} has not changed.")
+                assert not torch.equal(param, new_param), f"Parameter {n} has not changed."

        except Exception as e:
            # If vLLM fails to initialize due to hardware constraints or other issues, that's expected
--- a/tests/slow/test_sft_slow.py
+++ b/tests/slow/test_sft_slow.py
@ -24,7 +24,6 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
 from transformers.testing_utils import (
    backend_empty_cache,
    require_liger_kernel,
-    require_peft,
    require_torch_accelerator,
    require_torch_multi_accelerator,
    torch_device,
@ -33,7 +32,7 @@ from transformers.utils import is_peft_available

 from trl import SFTConfig, SFTTrainer

-from ..testing_utils import TrlTestCase, require_bitsandbytes
+from ..testing_utils import TrlTestCase, require_bitsandbytes, require_peft
 from .testing_constants import DEVICE_MAP_OPTIONS, GRADIENT_CHECKPOINTING_KWARGS, MODELS_TO_TEST, PACKING_OPTIONS


@ -44,9 +43,8 @@ if is_peft_available():
@pytest.mark.slow
@require_torch_accelerator
@require_peft
-class SFTTrainerSlowTester(TrlTestCase):
-    def setUp(self):
-        super().setUp()
+class TestSFTTrainerSlow(TrlTestCase):
+    def setup_method(self):
        self.train_dataset = load_dataset("stanfordnlp/imdb", split="train[:10%]")
        self.eval_dataset = load_dataset("stanfordnlp/imdb", split="test[:10%]")
        self.max_length = 128
@ -58,11 +56,10 @@ class SFTTrainerSlowTester(TrlTestCase):
            task_type="CAUSAL_LM",
        )

-    def tearDown(self):
+    def teardown_method(self):
+        """
+        Free accelerator memory after each test.
+
+        Runs a garbage collection pass, asks the backend for `torch_device` to empty its cache, then collects again.
+        NOTE(review): the second `gc.collect()` presumably catches objects released by the cache flush - confirm.
+        """
        gc.collect()
        backend_empty_cache(torch_device)
        gc.collect()
-        super().tearDown()

    @parameterized.expand(list(itertools.product(MODELS_TO_TEST, PACKING_OPTIONS)))
    def test_sft_trainer_str(self, model_name, packing):
@ -148,7 +145,7 @@ class SFTTrainerSlowTester(TrlTestCase):
            peft_config=self.peft_config,
        )

-        self.assertIsInstance(trainer.model, PeftModel)
+        assert isinstance(trainer.model, PeftModel)

        trainer.train()

@ -252,7 +249,7 @@ class SFTTrainerSlowTester(TrlTestCase):
            peft_config=self.peft_config,
        )

-        self.assertIsInstance(trainer.model, PeftModel)
+        assert isinstance(trainer.model, PeftModel)

        trainer.train()

@ -332,7 +329,7 @@ class SFTTrainerSlowTester(TrlTestCase):
            peft_config=self.peft_config,
        )

-        self.assertIsInstance(trainer.model, PeftModel)
+        assert isinstance(trainer.model, PeftModel)

        trainer.train()

@ -372,7 +369,7 @@ class SFTTrainerSlowTester(TrlTestCase):
            peft_config=self.peft_config,
        )

-        self.assertIsInstance(trainer.model, PeftModel)
+        assert isinstance(trainer.model, PeftModel)

        trainer.train()

@ -415,12 +412,12 @@ class SFTTrainerSlowTester(TrlTestCase):
            eval_dataset=self.eval_dataset,
        )

-        # Register cleanup now that we have the trainer
-        self.addCleanup(cleanup_liger_patches, trainer)
-
-        trainer.train()
-
-        release_memory(trainer.model, trainer)
+        # Ensure cleanup of liger patches after the test
+        try:
+            trainer.train()
+            release_memory(trainer.model, trainer)
+        finally:
+            cleanup_liger_patches(trainer)

    @parameterized.expand(list(itertools.product(MODELS_TO_TEST, PACKING_OPTIONS)))
    @require_torch_accelerator
@ -447,11 +444,11 @@ class SFTTrainerSlowTester(TrlTestCase):
        trainer.train()

        # Check that the training loss is not None
-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Check the params have changed
        for n, param in previous_trainable_params.items():
            new_param = trainer.model.get_parameter(n)
-            self.assertFalse(torch.allclose(param, new_param), f"Parameter {n} has not changed")
+            assert not torch.allclose(param, new_param), f"Parameter {n} has not changed"

        release_memory(trainer.model, trainer)
--- a/tests/test_activation_offloading.py
+++ b/tests/test_activation_offloading.py
@ -16,12 +16,12 @@
 import torch
 from torch import nn
 from transformers import AutoModelForCausalLM
-from transformers.testing_utils import require_peft, require_torch_accelerator, torch_device
+from transformers.testing_utils import require_torch_accelerator, torch_device
 from transformers.utils import is_peft_available

 from trl.models.activation_offloading import NoOpManager, OffloadActivations

-from .testing_utils import TrlTestCase
+from .testing_utils import TrlTestCase, require_peft


 if is_peft_available():
@ -72,9 +72,8 @@ class TestActivationOffloading(TrlTestCase):
        for name_orig, grad_orig in grads_original:
            for name_param, param in model.named_parameters():
                if name_param == name_orig and param.requires_grad and param.grad is not None:
-                    self.assertTrue(
-                        torch.allclose(grad_orig, param.grad, rtol=1e-4, atol=1e-5),
-                        f"Gradient mismatch for {name_orig}",
+                    assert torch.allclose(grad_orig, param.grad, rtol=1e-4, atol=1e-5), (
+                        f"Gradient mismatch for {name_orig}"
                    )

    @require_torch_accelerator
@ -105,7 +104,7 @@ class TestActivationOffloading(TrlTestCase):

        # Gradients should match as NoOpManager should have prevented offloading
        for g1, g2 in zip(grads1, grads2):
-            self.assertTrue(torch.allclose(g1, g2, rtol=1e-4, atol=1e-5))
+            assert torch.allclose(g1, g2, rtol=1e-4, atol=1e-5)

    @require_torch_accelerator
    def test_min_offload_size(self):
@ -152,6 +151,6 @@ class TestActivationOffloading(TrlTestCase):
        grads2 = [p.grad.clone() for p in model.parameters()]

        # Check outputs and gradients match
-        self.assertTrue(torch.allclose(out1, out2, rtol=1e-5))
+        assert torch.allclose(out1, out2, rtol=1e-5)
        for g1, g2 in zip(grads1, grads2):
-            self.assertTrue(torch.allclose(g1, g2, rtol=1e-5))
+            assert torch.allclose(g1, g2, rtol=1e-5)
--- a/tests/test_bco_trainer.py
+++ b/tests/test_bco_trainer.py
@ -14,25 +14,25 @@

 from functools import partial

+import pytest
 import torch
 from accelerate import Accelerator
 from datasets import load_dataset
 from parameterized import parameterized
 from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer
-from transformers.testing_utils import require_peft
 from transformers.utils import is_peft_available

 from trl import BCOConfig, BCOTrainer
 from trl.trainer.bco_trainer import _process_tokens, _tokenize

-from .testing_utils import TrlTestCase, require_no_wandb, require_sklearn
+from .testing_utils import TrlTestCase, require_no_wandb, require_peft, require_sklearn


 if is_peft_available():
    from peft import LoraConfig


-class BCOTrainerTester(TrlTestCase):
+class TestBCOTrainer(TrlTestCase):
    @parameterized.expand(
        [
            ("standard_preference",),
@ -71,13 +71,13 @@ class BCOTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Check that the parameters have changed
        for n, param in previous_trainable_params.items():
            new_param = trainer.model.get_parameter(n)
            if param.sum() != 0:  # ignore 0 biases
-                self.assertFalse(torch.equal(param.cpu(), new_param.cpu()))
+                assert not torch.equal(param.cpu(), new_param.cpu())

    @require_sklearn
    def test_train_with_precompute(self):
@ -108,13 +108,13 @@ class BCOTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Check that the parameters have changed
        for n, param in previous_trainable_params.items():
            new_param = trainer.model.get_parameter(n)
            if param.sum() != 0:  # ignore 0 biases
-                self.assertFalse(torch.equal(param.cpu(), new_param.cpu()))
+                assert not torch.equal(param.cpu(), new_param.cpu())

    @require_sklearn
    def test_train_eval(self):
@ -158,7 +158,7 @@ class BCOTrainerTester(TrlTestCase):
            report_to="none",
        )

-        with self.assertRaises(ValueError):
+        with pytest.raises(ValueError):
            BCOTrainer(
                model=model,
                ref_model=model,  # ref_model can't be the same as model
@ -192,36 +192,36 @@ class BCOTrainerTester(TrlTestCase):

        tokenized_dataset = dataset.map(
            _tokenize,
-            fn_kwargs={"tokenizer": trainer.tokenizer},
+            fn_kwargs={"tokenizer": trainer.processing_class},
            batched=True,
            batch_size=2,
        )
-        self.assertListEqual(tokenized_dataset["prompt"][:], dataset["prompt"][:])
-        self.assertListEqual(tokenized_dataset["completion"][:], dataset["completion"][:])
-        self.assertListEqual(tokenized_dataset["label"][:], dataset["label"][:])
-        self.assertListEqual(tokenized_dataset["prompt_input_ids"][0], [46518, 374, 2664, 1091])
-        self.assertListEqual(tokenized_dataset["prompt_attention_mask"][0], [1, 1, 1, 1])
-        self.assertListEqual(tokenized_dataset["answer_input_ids"][0], [27261, 13])
-        self.assertListEqual(tokenized_dataset["answer_attention_mask"][0], [1, 1])
+        assert tokenized_dataset["prompt"][:] == dataset["prompt"][:]
+        assert tokenized_dataset["completion"][:] == dataset["completion"][:]
+        assert tokenized_dataset["label"][:] == dataset["label"][:]
+        assert tokenized_dataset["prompt_input_ids"][0] == [46518, 374, 2664, 1091]
+        assert tokenized_dataset["prompt_attention_mask"][0] == [1, 1, 1, 1]
+        assert tokenized_dataset["answer_input_ids"][0] == [27261, 13]
+        assert tokenized_dataset["answer_attention_mask"][0] == [1, 1]

        fn_kwargs = {
            "prefix": "",
            "is_encoder_decoder": trainer.is_encoder_decoder,
-            "tokenizer": trainer.tokenizer,
+            "tokenizer": trainer.processing_class,
            "max_length": trainer.max_length,
            "truncation_mode": trainer.truncation_mode,
            "label_pad_token_id": trainer.label_pad_token_id,
            "max_prompt_length": trainer.max_prompt_length,
        }
        processed_dataset = tokenized_dataset.map(_process_tokens, fn_kwargs=fn_kwargs)
-        self.assertListEqual(processed_dataset["prompt"][:], dataset["prompt"][:])
-        self.assertListEqual(processed_dataset["completion"][:], dataset["completion"][:])
-        self.assertListEqual(processed_dataset["label"][:], dataset["label"][:])
-        self.assertListEqual(processed_dataset["prompt_input_ids"][0], [46518, 374, 2664, 1091])
-        self.assertListEqual(processed_dataset["prompt_attention_mask"][0], [1, 1, 1, 1])
-        self.assertListEqual(processed_dataset["completion_input_ids"][0], [46518, 374, 2664, 1091, 27261, 13, 151645])
-        self.assertListEqual(processed_dataset["completion_attention_mask"][0], [1, 1, 1, 1, 1, 1, 1])
-        self.assertListEqual(processed_dataset["completion_labels"][0], [-100, -100, -100, -100, 27261, 13, 151645])
+        assert processed_dataset["prompt"][:] == dataset["prompt"][:]
+        assert processed_dataset["completion"][:] == dataset["completion"][:]
+        assert processed_dataset["label"][:] == dataset["label"][:]
+        assert processed_dataset["prompt_input_ids"][0] == [46518, 374, 2664, 1091]
+        assert processed_dataset["prompt_attention_mask"][0] == [1, 1, 1, 1]
+        assert processed_dataset["completion_input_ids"][0] == [46518, 374, 2664, 1091, 27261, 13, 151645]
+        assert processed_dataset["completion_attention_mask"][0] == [1, 1, 1, 1, 1, 1, 1]
+        assert processed_dataset["completion_labels"][0] == [-100, -100, -100, -100, 27261, 13, 151645]

    @require_sklearn
    def test_train_without_providing_ref_model(self):
@ -249,13 +249,13 @@ class BCOTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Check that the parameters have changed
        for n, param in previous_trainable_params.items():
            new_param = trainer.model.get_parameter(n)
            if param.sum() != 0:  # ignore 0 biases
-                self.assertFalse(torch.equal(param.cpu(), new_param.cpu()))
+                assert not torch.equal(param.cpu(), new_param.cpu())

    @require_sklearn
    def test_train_udm(self):
@ -298,13 +298,13 @@ class BCOTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Check that the parameters have changed
        for n, param in previous_trainable_params.items():
            new_param = trainer.model.get_parameter(n)
            if param.sum() != 0:  # ignore 0 biases
-                self.assertFalse(torch.equal(param.cpu(), new_param.cpu()))
+                assert not torch.equal(param.cpu(), new_param.cpu())

    @require_sklearn
    @require_peft
@ -335,14 +335,14 @@ class BCOTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Check that the parameters have changed
        for n, param in previous_trainable_params.items():
            if "lora" in n:
                new_param = trainer.model.get_parameter(n)
                if param.sum() != 0:  # ignore 0 biases
-                    self.assertFalse(torch.equal(param.cpu(), new_param.cpu()))
+                    assert not torch.equal(param.cpu(), new_param.cpu())

    @require_sklearn
    @require_no_wandb
@ -362,9 +362,9 @@ class BCOTrainerTester(TrlTestCase):
            report_to="none",
        )

-        with self.assertRaisesRegex(
+        with pytest.raises(
            ValueError,
-            expected_regex="`generate_during_eval=True` requires Weights and Biases or Comet to be installed."
+            match="`generate_during_eval=True` requires Weights and Biases or Comet to be installed."
            " Please install `wandb` or `comet-ml` to resolve.",
        ):
            BCOTrainer(
@ -440,4 +440,4 @@ class BCOTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertEqual(trainer.state.log_history[-2]["eval_test"], 0.0)
+        assert trainer.state.log_history[-2]["eval_test"] == 0.0
--- a/tests/test_best_of_n_sampler.py
+++ b/tests/test_best_of_n_sampler.py
@ -27,7 +27,7 @@ def queries_to_scores(list_of_strings):
    return [torch.rand(1).item() for _ in list_of_strings]


-class BestOfNSamplerTester(TrlTestCase):
+class TestBestOfNSampler(TrlTestCase):
    """
    Tests the BestOfNSampler class
    """
@ -74,8 +74,8 @@ class BestOfNSamplerTester(TrlTestCase):

        for q, expected_length in various_queries_formats:
            results = best_of_n.generate(q)
-            self.assertIsInstance(results, list)
-            self.assertEqual(len(results), expected_length)
+            assert isinstance(results, list)
+            assert len(results) == expected_length

    def test_different_sample_sizes_and_n_candidates_values(self):
        r"""
@ -110,4 +110,4 @@ class BestOfNSamplerTester(TrlTestCase):
            tokenized_queries = [self.tokenizer.encode(query) for query in queries]
            results = best_of_n.generate(tokenized_queries)
            for result in results:
-                self.assertEqual(len(result), expected)
+                assert len(result) == expected
--- a/tests/test_callbacks.py
+++ b/tests/test_callbacks.py
@ -18,11 +18,10 @@ from unittest.mock import call, patch

 from datasets import load_dataset
 from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, Trainer, TrainingArguments
-from transformers.testing_utils import require_peft, require_wandb
+from transformers.testing_utils import require_wandb
 from transformers.trainer_utils import get_last_checkpoint
 from transformers.utils import is_peft_available

-from tests.testing_utils import require_comet, require_mergekit
 from trl import (
    BasePairwiseJudge,
    BEMACallback,
@ -34,7 +33,7 @@ from trl import (
 )
 from trl.mergekit_utils import MergeConfig

-from .testing_utils import TrlTestCase
+from .testing_utils import TrlTestCase, require_comet, require_mergekit, require_peft


 if is_peft_available():
@ -66,9 +65,8 @@ class TrainerWithRefModel(Trainer):
        self.ref_model = ref_model


-class WinRateCallbackTester(TrlTestCase):
-    def setUp(self):
-        super().setUp()
+class TestWinRateCallback(TrlTestCase):
+    def setup_method(self):
        self.model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
        self.ref_model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
        self.tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
@ -119,7 +117,7 @@ class WinRateCallbackTester(TrlTestCase):
        trainer.train()
        winrate_history = [h for h in trainer.state.log_history if "eval_win_rate" in h]
        for history_row, expected_row in zip(winrate_history, self.expected_winrates):
-            self.assertTrue(all(key in history_row and history_row[key] == expected_row[key] for key in expected_row))
+            assert all(key in history_row and history_row[key] == expected_row[key] for key in expected_row)

    def test_without_ref_model(self):
        # Same as before, but without the ref_model attribute. It should use the model attribute instead
@ -145,7 +143,7 @@ class WinRateCallbackTester(TrlTestCase):
        trainer.train()
        winrate_history = [h for h in trainer.state.log_history if "eval_win_rate" in h]
        for history_row, expected_row in zip(winrate_history, self.expected_winrates):
-            self.assertTrue(all(key in history_row and history_row[key] == expected_row[key] for key in expected_row))
+            assert all(key in history_row and history_row[key] == expected_row[key] for key in expected_row)

    def test_soft_judge(self):
        """Test that the soft judge functionality works correctly"""
@ -188,7 +186,7 @@ class WinRateCallbackTester(TrlTestCase):
            if "eval_avg_win_prob" in h
        ]
        for history_row, expected_row in zip(winrate_history, expected_soft_winrates):
-            self.assertTrue(all(key in history_row and history_row[key] == expected_row[key] for key in expected_row))
+            assert all(key in history_row and history_row[key] == expected_row[key] for key in expected_row)

    @require_peft
    def test_lora(self):
@ -222,12 +220,11 @@ class WinRateCallbackTester(TrlTestCase):
        trainer.train()
        winrate_history = [h for h in trainer.state.log_history if "eval_win_rate" in h]
        for history_row, expected_row in zip(winrate_history, self.expected_winrates):
-            self.assertTrue(all(key in history_row and history_row[key] == expected_row[key] for key in expected_row))
+            assert all(key in history_row and history_row[key] == expected_row[key] for key in expected_row)


-class LogCompletionsCallbackTester(TrlTestCase):
-    def setUp(self):
-        super().setUp()
+class TestLogCompletionsCallback(TrlTestCase):
+    def setup_method(self):
        self.model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
        self.tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
        self.tokenizer.pad_token = self.tokenizer.eos_token
@ -273,12 +270,12 @@ class LogCompletionsCallbackTester(TrlTestCase):
            completions = json.load(f)

        # Check that the columns are correct
-        self.assertIn("step", completions["columns"])
-        self.assertIn("prompt", completions["columns"])
-        self.assertIn("completion", completions["columns"])
+        assert "step" in completions["columns"]
+        assert "prompt" in completions["columns"]
+        assert "completion" in completions["columns"]

        # Check that the prompt is in the log
-        self.assertIn(self.dataset["test"][0]["prompt"], completions["data"][0])
+        assert self.dataset["test"][0]["prompt"] in completions["data"][0]

    @require_comet
    def test_basic_comet(self):
@ -320,9 +317,8 @@ class LogCompletionsCallbackTester(TrlTestCase):


@require_mergekit
-class MergeModelCallbackTester(TrlTestCase):
-    def setUp(self):
-        super().setUp()
+class TestMergeModelCallback(TrlTestCase):
+    def setup_method(self):
        self.model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
        self.tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
        self.dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
@ -347,7 +343,7 @@ class MergeModelCallbackTester(TrlTestCase):
        trainer.train()
        last_checkpoint = get_last_checkpoint(self.tmp_dir)
        merged_path = os.path.join(last_checkpoint, "merged")
-        self.assertTrue(os.path.isdir(merged_path), "Merged folder does not exist in the last checkpoint.")
+        assert os.path.isdir(merged_path), "Merged folder does not exist in the last checkpoint."

    def test_every_checkpoint(self):
        training_args = DPOConfig(
@ -374,12 +370,11 @@ class MergeModelCallbackTester(TrlTestCase):

        for checkpoint in checkpoints:
            merged_path = os.path.join(checkpoint, "merged")
-            self.assertTrue(os.path.isdir(merged_path), f"Merged folder does not exist in checkpoint {checkpoint}.")
+            assert os.path.isdir(merged_path), f"Merged folder does not exist in checkpoint {checkpoint}."


-class BEMACallbackTester(TrlTestCase):
-    def setUp(self):
-        super().setUp()
+class TestBEMACallback(TrlTestCase):
+    def setup_method(self):
        self.model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
        self.tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
        self.tokenizer.pad_token = self.tokenizer.eos_token
@ -409,7 +404,7 @@ class BEMACallbackTester(TrlTestCase):

        # Check that the BEMA model was saved and can be loaded
        bema_path = os.path.join(self.tmp_dir, "bema")
-        self.assertTrue(os.path.isdir(bema_path), "BEMA directory was not created")
+        assert os.path.isdir(bema_path), "BEMA directory was not created"
        AutoModelForCausalLM.from_pretrained(bema_path)

    def test_update_frequency_0(self):
@ -430,7 +425,7 @@ class BEMACallbackTester(TrlTestCase):

            # Total 9 steps (17 samples, batch size 8, 3 epochs).
            # BEMA starts after step 0 and updates every 2 steps → updates at 2, 4, 6, 8
-            self.assertEqual(mock_update.call_args_list, [call(2), call(4), call(6), call(8)])
+            assert mock_update.call_args_list == [call(2), call(4), call(6), call(8)]

    def test_update_frequency_1(self):
        """Test that BEMA callback respects the update frequency."""
@ -450,7 +445,7 @@ class BEMACallbackTester(TrlTestCase):

            # Total 9 steps (17 samples, batch size 8, 3 epochs).
            # BEMA starts after step 0 and updates every 3 steps → updates at 3, 6, 9
-            self.assertEqual(mock_update.call_args_list, [call(3), call(6), call(9)])
+            assert mock_update.call_args_list == [call(3), call(6), call(9)]

    def test_update_frequency_2(self):
        """Test that BEMA callback respects the update frequency."""
@ -470,7 +465,7 @@ class BEMACallbackTester(TrlTestCase):

            # Total 9 steps (17 samples, batch size 8, 3 epochs).
            # BEMA starts after step 3 and updates every 2 steps → updates at 5, 7, 9
-            self.assertEqual(mock_update.call_args_list, [call(5), call(7), call(9)])
+            assert mock_update.call_args_list == [call(5), call(7), call(9)]

    def test_no_bema(self):
        """Test that BEMACallback works without BEMA updates."""
--- a/tests/test_cli.py
+++ b/tests/test_cli.py
@ -15,18 +15,18 @@

 import os
 import sys
-import unittest
 from io import StringIO
 from unittest.mock import patch

+import pytest
 import yaml

 from .testing_utils import TrlTestCase


-@unittest.skipIf(
+@pytest.mark.skipif(
    sys.version_info < (3, 10),
-    "Transformers' generation codebase uses a Python >3.10 syntax (`str | None`), which seems to cause the CLI tests "
+    reason="Transformers' generation codebase uses a Python >3.10 syntax (`str | None`), which seems to cause the CLI tests "
    "to fail on Python <3.10.",  # let's say it's a known issue, but not expected to be fixed, because too niche
 )
 class TestCLI(TrlTestCase):
@ -51,7 +51,7 @@ class TestCLI(TrlTestCase):
        command = "trl env"
        with patch("sys.argv", command.split(" ")):
            main()
-        self.assertIn("TRL version: ", mock_stdout.getvalue().strip())
+        assert "TRL version: " in mock_stdout.getvalue().strip()

    def test_grpo(self):
        from trl.cli import main
@ -112,8 +112,4 @@ class TestCLI(TrlTestCase):
            main()

        # Verify that output directory was created
-        self.assertTrue(os.path.exists(output_dir))
-
-
-if __name__ == "__main__":
-    unittest.main()
+        assert os.path.exists(output_dir)
--- a/tests/test_cli_utils.py
+++ b/tests/test_cli_utils.py
@ -13,10 +13,10 @@
 # limitations under the License.

 import tempfile
-import unittest
 from dataclasses import dataclass
 from unittest.mock import mock_open, patch

+import pytest
 from datasets import DatasetDict, load_dataset

 from trl import DatasetMixtureConfig, TrlParser, get_dataset
@ -40,13 +40,12 @@ class TestTrlParser(TrlTestCase):
    def test_init_without_config_field(self):
        """Test initialization without 'config' field in the dataclasses."""
        parser = TrlParser(dataclass_types=[MyDataclass])
-        self.assertIsInstance(parser, TrlParser)
+        assert isinstance(parser, TrlParser)

    def test_init_with_config_field(self):
        """Test initialization with a 'config' field in the dataclass (should raise ValueError)."""
-        with self.assertRaises(ValueError) as context:
+        with pytest.raises(ValueError, match="has a field named 'config'"):
            TrlParser(dataclass_types=[InvalidDataclass])
-        self.assertTrue("has a field named 'config'" in str(context.exception))

    @patch("builtins.open", mock_open(read_data="env:\n VAR1: value1\n VAR2: value2\narg1: 2"))
    @patch("yaml.safe_load")
@ -67,14 +66,14 @@ class TestTrlParser(TrlTestCase):
        mock_environ["VAR2"] = "value2"

        # Ensure that the environment variables were set correctly
-        self.assertEqual(mock_environ.get("VAR1"), "value1")
-        self.assertEqual(mock_environ.get("VAR2"), "value2")
+        assert mock_environ.get("VAR1") == "value1"
+        assert mock_environ.get("VAR2") == "value2"

        # Check the parsed arguments
-        self.assertEqual(len(result_args), 1)
-        self.assertIsInstance(result_args[0], MyDataclass)
-        self.assertEqual(result_args[0].arg1, 2)
-        self.assertEqual(result_args[0].arg2, "value")
+        assert len(result_args) == 1
+        assert isinstance(result_args[0], MyDataclass)
+        assert result_args[0].arg1 == 2
+        assert result_args[0].arg2 == "value"

    @patch("builtins.open", mock_open(read_data="arg1: 2"))
    @patch("yaml.safe_load")
@ -90,9 +89,9 @@ class TestTrlParser(TrlTestCase):
        result_args = parser.parse_args_and_config(args)

        # Check the parsed arguments
-        self.assertEqual(len(result_args), 1)
-        self.assertIsInstance(result_args[0], MyDataclass)
-        self.assertEqual(result_args[0].arg1, 3)
+        assert len(result_args) == 1
+        assert isinstance(result_args[0], MyDataclass)
+        assert result_args[0].arg1 == 3

    @patch("builtins.open", mock_open(read_data="env: not_a_dict"))
    @patch("yaml.safe_load")
@ -104,11 +103,9 @@ class TestTrlParser(TrlTestCase):

        args = ["--arg1", "2", "--arg2", "value", "--config", "config.yaml"]

-        with self.assertRaises(ValueError) as context:
+        with pytest.raises(ValueError, match="`env` field should be a dict in the YAML file."):
            parser.parse_args_and_config(args)

-        self.assertEqual(str(context.exception), "`env` field should be a dict in the YAML file.")
-
    def test_parse_args_and_config_without_config(self):
        """Test parse_args_and_config without the `--config` argument."""
        parser = TrlParser(dataclass_types=[MyDataclass])
@ -119,10 +116,10 @@ class TestTrlParser(TrlTestCase):
        result_args = parser.parse_args_and_config(args)

        # Check that the arguments are parsed as is
-        self.assertEqual(len(result_args), 1)
-        self.assertIsInstance(result_args[0], MyDataclass)
-        self.assertEqual(result_args[0].arg1, 2)
-        self.assertEqual(result_args[0].arg2, "value")
+        assert len(result_args) == 1
+        assert isinstance(result_args[0], MyDataclass)
+        assert result_args[0].arg1 == 2
+        assert result_args[0].arg2 == "value"

    def test_set_defaults_with_config(self):
        """Test set_defaults_with_config updates the defaults."""
@ -133,9 +130,9 @@ class TestTrlParser(TrlTestCase):

        # Ensure the default value is updated
        result_args = parser.parse_args_and_config([])
-        self.assertEqual(len(result_args), 1)
-        self.assertIsInstance(result_args[0], MyDataclass)
-        self.assertEqual(result_args[0].arg1, 42)
+        assert len(result_args) == 1
+        assert isinstance(result_args[0], MyDataclass)
+        assert result_args[0].arg1 == 42

    def test_parse_args_and_config_with_remaining_strings(self):
        parser = TrlParser(dataclass_types=[MyDataclass])
@ -146,11 +143,11 @@ class TestTrlParser(TrlTestCase):
        result_args = parser.parse_args_and_config(args, return_remaining_strings=True)

        # Check that the arguments are parsed as is
-        self.assertEqual(len(result_args), 2)
-        self.assertIsInstance(result_args[0], MyDataclass)
-        self.assertEqual(result_args[0].arg1, 2)
-        self.assertEqual(result_args[0].arg2, "value")
-        self.assertEqual(result_args[1], ["remaining"])
+        assert len(result_args) == 2
+        assert isinstance(result_args[0], MyDataclass)
+        assert result_args[0].arg1 == 2
+        assert result_args[0].arg2 == "value"
+        assert result_args[1] == ["remaining"]

    @patch("builtins.open", mock_open(read_data="remaining_string_in_config: abc"))
    @patch("yaml.safe_load")
@ -165,10 +162,10 @@ class TestTrlParser(TrlTestCase):
        result_args = parser.parse_args_and_config(args, return_remaining_strings=True)

        # Check that the arguments are parsed as is
-        self.assertEqual(len(result_args), 2)
-        self.assertIsInstance(result_args[0], MyDataclass)
-        self.assertEqual(result_args[0].arg1, 2)
-        self.assertEqual(result_args[1], ["--remaining_string_in_config", "abc", "--remaining_string_in_args", "def"])
+        assert len(result_args) == 2
+        assert isinstance(result_args[0], MyDataclass)
+        assert result_args[0].arg1 == 2
+        assert result_args[1] == ["--remaining_string_in_config", "abc", "--remaining_string_in_args", "def"]

    @patch("builtins.open", mock_open(read_data="arg1: 2\narg2: config_value"))
    @patch("yaml.safe_load")
@ -190,11 +187,11 @@ class TestTrlParser(TrlTestCase):
        result_args = parser.parse_args_and_config(args)

        # Check main parser arguments
-        self.assertEqual(len(result_args), 1)
+        assert len(result_args) == 1

        # Check that config values were applied to the subparser
-        self.assertEqual(result_args[0].arg1, 2)  # Default from config
-        self.assertEqual(result_args[0].arg2, "config_value")  # Default from config
+        assert result_args[0].arg1 == 2  # Default from config
+        assert result_args[0].arg2 == "config_value"  # Default from config

    @patch("builtins.open", mock_open(read_data="arg1: 2\narg2: config_value"))
    @patch("yaml.safe_load")
@ -216,8 +213,8 @@ class TestTrlParser(TrlTestCase):
        result_args = parser.parse_args_and_config(args)

        # Command line arguments should override config
-        self.assertEqual(result_args[0].arg1, 3)
-        self.assertEqual(result_args[0].arg2, "config_value")  # Still from config
+        assert result_args[0].arg1 == 3
+        assert result_args[0].arg2 == "config_value"  # Still from config

    @patch("builtins.open", mock_open(read_data="arg1: 2\nthis_arg_does_not_exist: config_value"))
    @patch("yaml.safe_load")
@ -236,7 +233,7 @@ class TestTrlParser(TrlTestCase):

        # Command line arguments override the config, which also contains an unknown key (raises by default)
        args = ["subcommand", "--arg1", "3", "--config", "config.yaml"]
-        with self.assertRaises(ValueError):
+        with pytest.raises(ValueError):
            parser.parse_args_and_config(args)

        parser.parse_args_and_config(args, fail_with_unknown_args=False)
@ -263,21 +260,21 @@ class TestTrlParser(TrlTestCase):
            result_args = parser.parse_args_and_config(args)

            # Check main parser arguments
-            self.assertEqual(len(result_args), 1)
+            assert len(result_args) == 1

            # Check that config values were applied to the subparser
-            self.assertEqual(result_args[0].arg1, 2)  # Default from config
-            self.assertEqual(result_args[0].arg2, "config_value")  # Default from config
+            assert result_args[0].arg1 == 2  # Default from config
+            assert result_args[0].arg2 == "config_value"  # Default from config


-class TestGetDataset(unittest.TestCase):
+class TestGetDataset:
    def test_single_dataset_with_config(self):
        mixture_config = DatasetMixtureConfig(
            datasets=[DatasetConfig(path="trl-internal-testing/zen", name="standard_language_modeling")]
        )
        result = get_dataset(mixture_config)
        expected = load_dataset("trl-internal-testing/zen", "standard_language_modeling")
-        self.assertEqual(expected["train"][:], result["train"][:])
+        assert expected["train"][:] == result["train"][:]

    def test_single_dataset_preference_config(self):
        mixture_config = DatasetMixtureConfig(
@ -285,7 +282,7 @@ class TestGetDataset(unittest.TestCase):
        )
        result = get_dataset(mixture_config)
        expected = load_dataset("trl-internal-testing/zen", "standard_preference")
-        self.assertEqual(expected["train"][:], result["train"][:])
+        assert expected["train"][:] == result["train"][:]

    def test_single_dataset_streaming(self):
        mixture_config = DatasetMixtureConfig(
@ -294,7 +291,7 @@ class TestGetDataset(unittest.TestCase):
        )
        result = get_dataset(mixture_config)
        expected = load_dataset("trl-internal-testing/zen", "standard_language_modeling")
-        self.assertEqual(expected["train"].to_list(), list(result["train"]))
+        assert expected["train"].to_list() == list(result["train"])

    def test_dataset_mixture_basic(self):
        dataset_config1 = DatasetConfig(
@ -305,15 +302,15 @@ class TestGetDataset(unittest.TestCase):
        )
        mixture_config = DatasetMixtureConfig(datasets=[dataset_config1, dataset_config2])
        result = get_dataset(mixture_config)
-        self.assertIsInstance(result, DatasetDict)
-        self.assertIn("train", result)
+        assert isinstance(result, DatasetDict)
+        assert "train" in result
        train_dataset = result["train"]
-        self.assertEqual(train_dataset.column_names, ["prompt"])
+        assert train_dataset.column_names == ["prompt"]
        prompts = train_dataset["prompt"]
        expected_first_half = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
-        self.assertEqual(prompts[: len(prompts) // 2], expected_first_half["prompt"])
+        assert prompts[: len(prompts) // 2] == expected_first_half["prompt"]
        expected_second_half = load_dataset("trl-internal-testing/zen", "standard_prompt_completion", split="train")
-        self.assertEqual(prompts[len(prompts) // 2 :], expected_second_half["prompt"])
+        assert prompts[len(prompts) // 2 :] == expected_second_half["prompt"]

    def test_dataset_mixture_with_weights(self):
        dataset_config1 = DatasetConfig(
@ -324,17 +321,17 @@ class TestGetDataset(unittest.TestCase):
        )
        mixture_config = DatasetMixtureConfig(datasets=[dataset_config1, dataset_config2])
        result = get_dataset(mixture_config)
-        self.assertIsInstance(result, DatasetDict)
-        self.assertIn("train", result)
+        assert isinstance(result, DatasetDict)
+        assert "train" in result
        train_dataset = result["train"]
-        self.assertEqual(train_dataset.column_names, ["prompt"])
+        assert train_dataset.column_names == ["prompt"]
        prompts = train_dataset["prompt"]
        expected_first_half = load_dataset("trl-internal-testing/zen", "standard_preference", split="train[:50%]")
-        self.assertEqual(prompts[: len(prompts) // 2], expected_first_half["prompt"])
+        assert prompts[: len(prompts) // 2] == expected_first_half["prompt"]
        expected_second_half = load_dataset(
            "trl-internal-testing/zen", "standard_prompt_completion", split="train[:50%]"
        )
-        self.assertEqual(prompts[len(prompts) // 2 :], expected_second_half["prompt"])
+        assert prompts[len(prompts) // 2 :] == expected_second_half["prompt"]

    def test_dataset_mixture_with_test_split(self):
        mixture_config = DatasetMixtureConfig(
@ -342,20 +339,18 @@ class TestGetDataset(unittest.TestCase):
            test_split_size=2,
        )
        result = get_dataset(mixture_config)
-        self.assertIsInstance(result, DatasetDict)
-        self.assertIn("train", result)
-        self.assertIn("test", result)
-        self.assertEqual(len(result["train"]), 15)
-        self.assertEqual(len(result["test"]), 2)
+        assert isinstance(result, DatasetDict)
+        assert "train" in result
+        assert "test" in result
+        assert len(result["train"]) == 15
+        assert len(result["test"]) == 2

    def test_empty_dataset_mixture_raises_error(self):
        mixture_config = DatasetMixtureConfig(datasets=[])

-        with self.assertRaises(ValueError) as context:
+        with pytest.raises(ValueError, match="No datasets were loaded"):
            get_dataset(mixture_config)

-        self.assertIn("No datasets were loaded", str(context.exception))
-
    def test_mixture_multiple_different_configs(self):
        dataset_config1 = DatasetConfig(
            path="trl-internal-testing/zen", name="conversational_preference", split="train", columns=["prompt"]
@ -365,9 +360,9 @@ class TestGetDataset(unittest.TestCase):
        )
        mixture_config = DatasetMixtureConfig(datasets=[dataset_config1, dataset_config2])
        result = get_dataset(mixture_config)
-        self.assertIsInstance(result, DatasetDict)
-        self.assertIn("train", result)
-        self.assertGreater(len(result["train"]), 0)
+        assert isinstance(result, DatasetDict)
+        assert "train" in result
+        assert len(result["train"]) > 0

    def test_trlparser_parses_yaml_config_correctly(self):
        # Prepare YAML content exactly like your example
@ -390,24 +385,24 @@ class TestGetDataset(unittest.TestCase):
            args = parser.parse_args_and_config(args=["--config", tmpfile.name])[0]

        # Assert that we got DatasetMixtureConfig instance
-        self.assertIsInstance(args, DatasetMixtureConfig)
+        assert isinstance(args, DatasetMixtureConfig)

        # Assert datasets list length
-        self.assertEqual(len(args.datasets), 2)
+        assert len(args.datasets) == 2

        # Check first dataset
        dataset_config1 = args.datasets[0]
-        self.assertIsInstance(dataset_config1, DatasetConfig)
-        self.assertEqual(dataset_config1.path, "trl-internal-testing/zen")
-        self.assertEqual(dataset_config1.name, "standard_prompt_only")
-        self.assertIsNone(dataset_config1.columns)  # No columns specified
+        assert isinstance(dataset_config1, DatasetConfig)
+        assert dataset_config1.path == "trl-internal-testing/zen"
+        assert dataset_config1.name == "standard_prompt_only"
+        assert dataset_config1.columns is None  # No columns specified

        # Check second dataset
        dataset_config2 = args.datasets[1]
-        self.assertIsInstance(dataset_config2, DatasetConfig)
-        self.assertEqual(dataset_config2.path, "trl-internal-testing/zen")
-        self.assertEqual(dataset_config2.name, "standard_preference")
-        self.assertEqual(dataset_config2.columns, ["prompt"])  # Columns specified
+        assert isinstance(dataset_config2, DatasetConfig)
+        assert dataset_config2.path == "trl-internal-testing/zen"
+        assert dataset_config2.name == "standard_preference"
+        assert dataset_config2.columns == ["prompt"]  # Columns specified

    def test_trlparser_parses_yaml_and_loads_dataset(self):
        # Prepare YAML content exactly like your example
@ -428,4 +423,4 @@ class TestGetDataset(unittest.TestCase):
        # Load the dataset using get_dataset
        result = get_dataset(args)
        expected = load_dataset("trl-internal-testing/zen", "standard_language_modeling")
-        self.assertEqual(expected["train"][:], result["train"][:])
+        assert expected["train"][:] == result["train"][:]
--- a/tests/test_collators.py
+++ b/tests/test_collators.py
@ -21,12 +21,11 @@ from .testing_utils import TrlTestCase


 class TestDataCollatorForPreference(TrlTestCase):
-    def setUp(self):
-        super().setUp()
+    def setup_method(self):
        self.collator = DataCollatorForPreference(pad_token_id=0)

    def assertTensorEqual(self, tensor1, tensor2):
-        self.assertTrue(torch.equal(tensor1, tensor2), f"Tensors are not equal:\n{tensor1}\n{tensor2}")
+        assert torch.equal(tensor1, tensor2), f"Tensors are not equal:\n{tensor1}\n{tensor2}"

    def test_padding_behavior(self):
        examples = [
--- a/tests/test_core.py
+++ b/tests/test_core.py
@ -20,22 +20,21 @@ from trl.core import masked_mean, masked_var, masked_whiten
 from .testing_utils import TrlTestCase


-class CoreTester(TrlTestCase):
+class TestCore(TrlTestCase):
    """
    A wrapper class for testing core utils functions
    """

-    def setUp(self):
-        super().setUp()
+    def setup_method(self):
        self.test_input = torch.Tensor([1, 2, 3, 4])
        self.test_mask = torch.Tensor([0, 1, 1, 0])
        self.test_input_unmasked = self.test_input[1:3]

    def test_masked_mean(self):
-        self.assertEqual(torch.mean(self.test_input_unmasked), masked_mean(self.test_input, self.test_mask))
+        assert torch.mean(self.test_input_unmasked) == masked_mean(self.test_input, self.test_mask)

    def test_masked_var(self):
-        self.assertEqual(torch.var(self.test_input_unmasked), masked_var(self.test_input, self.test_mask))
+        assert torch.var(self.test_input_unmasked) == masked_var(self.test_input, self.test_mask)

    def test_masked_whiten(self):
        def whiten(values: torch.Tensor) -> torch.Tensor:
@ -45,4 +44,4 @@ class CoreTester(TrlTestCase):
        whiten_unmasked = whiten(self.test_input_unmasked)
        whiten_masked = masked_whiten(self.test_input, self.test_mask)[1:3]
        diffs = (whiten_unmasked - whiten_masked).sum()
-        self.assertLess(abs(diffs.item()), 0.00001)
+        assert abs(diffs.item()) < 0.00001
--- a/tests/test_cpo_trainer.py
+++ b/tests/test_cpo_trainer.py
@ -17,17 +17,15 @@ import torch
 from datasets import load_dataset
 from parameterized import parameterized
 from transformers import AutoModelForCausalLM, AutoModelForSeq2SeqLM, AutoTokenizer
-from transformers.testing_utils import require_peft

 from trl import CPOConfig, CPOTrainer
 from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE

-from .testing_utils import TrlTestCase
+from .testing_utils import TrlTestCase, require_peft


-class CPOTrainerTester(TrlTestCase):
-    def setUp(self):
-        super().setUp()
+class TestCPOTrainer(TrlTestCase):
+    def setup_method(self):
        self.model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
        self.model = AutoModelForCausalLM.from_pretrained(self.model_id)
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_id)
@ -87,13 +85,13 @@ class CPOTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Check that the parameters have changed
        for n, param in previous_trainable_params.items():
            new_param = trainer.model.get_parameter(n)
            if param.sum() != 0:  # ignore 0 biases
-                self.assertFalse(torch.equal(param, new_param))
+                assert not torch.equal(param, new_param)

    @parameterized.expand(
        [
@ -143,14 +141,14 @@ class CPOTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Check that the parameters have changed
        for n, param in previous_trainable_params.items():
            if "lora" in n:
                new_param = trainer.model.get_parameter(n)
                if param.sum() != 0:  # ignore 0 biases
-                    self.assertFalse(torch.equal(param, new_param))
+                    assert not torch.equal(param, new_param)

    def test_compute_metrics(self):
        dummy_dataset = load_dataset("trl-internal-testing/zen", "standard_preference")
@ -180,7 +178,7 @@ class CPOTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertEqual(trainer.state.log_history[-2]["eval_test"], 0.0)
+        assert trainer.state.log_history[-2]["eval_test"] == 0.0

    def test_alphapo_trainer(self):
        training_args = CPOConfig(
@ -212,9 +210,9 @@ class CPOTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        for n, param in previous_trainable_params.items():
            new_param = trainer.model.get_parameter(n)
            if param.sum() != 0:
-                self.assertFalse(torch.equal(param, new_param))
+                assert not torch.equal(param, new_param)
--- a/tests/test_data_utils.py
+++ b/tests/test_data_utils.py
@ -15,7 +15,6 @@
 import copy
 import itertools
 import textwrap
-import unittest
 from time import strftime

 from datasets import Dataset, DatasetDict
@ -40,7 +39,7 @@ from trl.data_utils import (
 from .testing_utils import TrlTestCase


-class PrepareMultimodalMessagesTester(unittest.TestCase):
+class TestPrepareMultimodalMessages:
    def test_basic_user_assistant_conversation(self):
        """Test basic conversation with user and assistant messages."""
        messages = [
@ -55,7 +54,7 @@ class PrepareMultimodalMessagesTester(unittest.TestCase):
            {"role": "assistant", "content": [{"type": "text", "text": "It is blue."}]},
        ]

-        self.assertEqual(messages, expected)
+        assert messages == expected

    def test_first_user_message_gets_image(self):
        """Test that only the first user message gets an image placeholder."""
@ -73,7 +72,7 @@ class PrepareMultimodalMessagesTester(unittest.TestCase):
            {"role": "user", "content": [{"type": "text", "text": "How about the grass?"}]},
        ]

-        self.assertEqual(messages, expected)
+        assert messages == expected

    def test_multiple_images(self):
        """Test that multiple images are added to the first user message."""
@ -97,7 +96,7 @@ class PrepareMultimodalMessagesTester(unittest.TestCase):
            {"role": "assistant", "content": [{"type": "text", "text": "It is blue."}]},
        ]

-        self.assertEqual(messages, expected)
+        assert messages == expected

    def test_system_message_transformation(self):
        """Test that system messages are properly transformed."""
@ -113,7 +112,7 @@ class PrepareMultimodalMessagesTester(unittest.TestCase):
            {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "What color is the sky?"}]},
        ]

-        self.assertEqual(messages, expected)
+        assert messages == expected

    def test_already_prepared_messages_unchanged(self):
        """Test that messages with list content are not modified."""
@ -126,7 +125,7 @@ class PrepareMultimodalMessagesTester(unittest.TestCase):
        original = copy.deepcopy(messages)
        prepare_multimodal_messages(messages, num_images=1)

-        self.assertEqual(messages, original)
+        assert messages == original

    def test_mixed_prepared_and_unprepared_messages(self):
        """Test handling of mixed prepared and unprepared messages."""
@ -144,10 +143,10 @@ class PrepareMultimodalMessagesTester(unittest.TestCase):
            {"role": "user", "content": [{"type": "text", "text": "What about the grass?"}]},
        ]

-        self.assertEqual(messages, expected)
+        assert messages == expected


-class IsConversationalTester(TrlTestCase):
+class TestIsConversational(TrlTestCase):
    conversational_examples = [
        {  # Language modeling
            "messages": [
@ -250,14 +249,14 @@ class IsConversationalTester(TrlTestCase):

    @parameterized.expand(itertools.product(conversational_examples))
    def test_conversational(self, example):
-        self.assertTrue(is_conversational(example))
+        assert is_conversational(example)

    @parameterized.expand(itertools.product(non_conversational_examples))
    def test_non_conversational(self, example):
-        self.assertFalse(is_conversational(example))
+        assert not is_conversational(example)


-class IsConversationalFromValueTester(TrlTestCase):
+class TestIsConversationalFromValue(TrlTestCase):
    def test_positive_1(self):
        example = {
            "conversations": [
@ -265,7 +264,7 @@ class IsConversationalFromValueTester(TrlTestCase):
                {"from": "assistant", "value": "It is blue."},
            ],
        }
-        self.assertTrue(is_conversational_from_value(example))
+        assert is_conversational_from_value(example)

    def test_negative_1(self):
        example = {
@ -274,14 +273,14 @@ class IsConversationalFromValueTester(TrlTestCase):
                {"role": "assistant", "content": "It is blue."},
            ],
        }
-        self.assertFalse(is_conversational_from_value(example))
+        assert not is_conversational_from_value(example)

    def test_negative_2(self):
        example = {"text": "The sky is blue."}
-        self.assertFalse(is_conversational_from_value(example))
+        assert not is_conversational_from_value(example)


-class ApplyChatTemplateTester(TrlTestCase):
+class TestApplyChatTemplate(TrlTestCase):
    tokenizers = [
        "trl-internal-testing/tiny-CohereForCausalLM",
        "trl-internal-testing/tiny-DbrxForCausalLM",
@ -352,24 +351,24 @@ class ApplyChatTemplateTester(TrlTestCase):
        result = apply_chat_template(example, tokenizer)

        # Checking if the result is a dictionary
-        self.assertIsInstance(result, dict)
+        assert isinstance(result, dict)

        # The chat template should be applied to the following keys
        for key in ["prompt", "chosen", "rejected", "completion"]:
            if key in example:
-                self.assertIn(key, result)
-                self.assertIsInstance(result[key], str)
+                assert key in result
+                assert isinstance(result[key], str)

        # Exception for messages, the key is "text" once the chat template is applied
        if "messages" in example:
-            self.assertIn("text", result)
-            self.assertIsInstance(result["text"], str)
+            assert "text" in result
+            assert isinstance(result["text"], str)

        # The label should be kept
        if "label" in example:
-            self.assertIn("label", result)
-            self.assertIsInstance(result["label"], bool)
-            self.assertEqual(result["label"], example["label"])
+            assert "label" in result
+            assert isinstance(result["label"], bool)
+            assert result["label"] == example["label"]

    # both conversational and non-conversational examples
    @parameterized.expand(itertools.product(tokenizers, conversational_examples + non_conversational_examples))
@ -378,24 +377,47 @@ class ApplyChatTemplateTester(TrlTestCase):
        result = maybe_apply_chat_template(example, tokenizer)

        # Checking if the result is a dictionary
-        self.assertIsInstance(result, dict)
+        assert isinstance(result, dict)

        # The chat template should be applied to the following keys
        for key in ["prompt", "chosen", "rejected", "completion"]:
            if key in example:
-                self.assertIn(key, result)
-                self.assertIsInstance(result[key], str)
+                assert key in result
+                assert isinstance(result[key], str)

        # Exception for messages, the key is "text" once the chat template is applied
        if "messages" in example:
-            self.assertIn("text", result)
-            self.assertIsInstance(result["text"], str)
+            assert "text" in result
+            assert isinstance(result["text"], str)

        # The label should be kept
        if "label" in example:
-            self.assertIn("label", result)
-            self.assertIsInstance(result["label"], bool)
-            self.assertEqual(result["label"], example["label"])
+            assert "label" in result
+            assert isinstance(result["label"], bool)
+            assert result["label"] == example["label"]
+
+    def test_apply_chat_template_with_chat_template_kwargs(self):
+        tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-Qwen3ForCausalLM")
+
+        example = {
+            "prompt": [{"role": "user", "content": "What color is the sky?"}],
+            # with this tokenizer, when you pass enable_thinking=False, it will add "<think>\n\n</think>\n\n"
+            "chat_template_kwargs": {"enable_thinking": False},
+        }
+        result = apply_chat_template(example, tokenizer)
+
+        # docstyle-ignore
+        expected = textwrap.dedent("""\
+        <|im_start|>user
+        What color is the sky?<|im_end|>
+        <|im_start|>assistant
+        <think>
+
+        </think>
+
+        """)
+
+        assert result["prompt"] == expected

    def test_apply_chat_template_with_tools(self):
        tokenizer = AutoProcessor.from_pretrained("trl-internal-testing/tiny-LlamaForCausalLM-3.2")
@ -420,16 +442,16 @@ class ApplyChatTemplateTester(TrlTestCase):
        result_with_tools = apply_chat_template(test_case, tokenizer, tools=[get_current_temperature])

        # Verify tools are included in the output
-        self.assertIn("get_current_temperature", result_with_tools["prompt"])
+        assert "get_current_temperature" in result_with_tools["prompt"]

        # Test without tools
        result_without_tools = apply_chat_template(test_case, tokenizer, tools=None)

        # Verify tools are not included in the output
-        self.assertNotIn("get_current_temperature", result_without_tools["prompt"])
+        assert "get_current_temperature" not in result_without_tools["prompt"]


-class ApplyChatTemplateHarmonyTester(TrlTestCase):
+class TestApplyChatTemplateHarmony(TrlTestCase):
    def test_language_modeling(self):
        messages = {
            "messages": [
@ -459,7 +481,7 @@ class ApplyChatTemplateHarmonyTester(TrlTestCase):

        <|end|><|start|>user<|message|>What color is the sky?<|end|><|start|>assistant<|channel|>analysis<|message|>The user asks the color of the sky...<|end|><|start|>assistant<|channel|>final<|message|>It is blue.<|return|>""")

-        self.assertEqual(output["text"], expected)
+        assert output["text"] == expected

    def test_prompt_only(self):
        messages = {
@ -489,7 +511,7 @@ class ApplyChatTemplateHarmonyTester(TrlTestCase):

        <|end|><|start|>user<|message|>What color is the sky?<|end|><|start|>assistant""")

-        self.assertEqual(output["prompt"], expected)
+        assert output["prompt"] == expected

    def test_prompt_completion(self):
        messages = {
@ -523,8 +545,8 @@ class ApplyChatTemplateHarmonyTester(TrlTestCase):
        <|end|><|start|>user<|message|>What color is the sky?<|end|><|start|>assistant""")
        expected_completion = "<|channel|>analysis<|message|>The user asks the color of the sky...<|end|><|start|>assistant<|channel|>final<|message|>It is blue.<|return|>"

-        self.assertEqual(output["prompt"], expected_prompt)
-        self.assertEqual(output["completion"], expected_completion)
+        assert output["prompt"] == expected_prompt
+        assert output["completion"] == expected_completion

    def test_preference(self):
        messages = {
@ -562,9 +584,9 @@ class ApplyChatTemplateHarmonyTester(TrlTestCase):
        expected_chosen = "<|channel|>analysis<|message|>The user asks the color of the sky...<|end|><|start|>assistant<|channel|>final<|message|>It is blue.<|return|>"
        expected_rejected = "<|channel|>analysis<|message|>The user asks the color of the tree...<|end|><|start|>assistant<|channel|>final<|message|>It is green.<|return|>"

-        self.assertEqual(output["prompt"], expected_prompt)
-        self.assertEqual(output["chosen"], expected_chosen)
-        self.assertEqual(output["rejected"], expected_rejected)
+        assert output["prompt"] == expected_prompt
+        assert output["chosen"] == expected_chosen
+        assert output["rejected"] == expected_rejected

    def test_preference_with_implicit_prompt(self):
        messages = {
@ -614,8 +636,8 @@ class ApplyChatTemplateHarmonyTester(TrlTestCase):

        <|end|><|start|>user<|message|>What color is the sky?<|end|><|start|>assistant<|channel|>analysis<|message|>The user asks the color of the tree...<|end|><|start|>assistant<|channel|>final<|message|>It is green.<|return|>""")

-        self.assertEqual(output["chosen"], expected_chosen)
-        self.assertEqual(output["rejected"], expected_rejected)
+        assert output["chosen"] == expected_chosen
+        assert output["rejected"] == expected_rejected

    def test_unpaired_preference(self):
        messages = {
@ -650,12 +672,12 @@ class ApplyChatTemplateHarmonyTester(TrlTestCase):
        <|end|><|start|>user<|message|>What color is the sky?<|end|><|start|>assistant""")
        expected_completion = "<|channel|>analysis<|message|>The user asks the color of the sky...<|end|><|start|>assistant<|channel|>final<|message|>It is blue.<|return|>"

-        self.assertEqual(output["prompt"], expected_prompt)
-        self.assertEqual(output["completion"], expected_completion)
-        self.assertTrue(output["label"])
+        assert output["prompt"] == expected_prompt
+        assert output["completion"] == expected_completion
+        assert output["label"]


-class UnpairPreferenceDatasetTester(TrlTestCase):
+class TestUnpairPreferenceDataset(TrlTestCase):
    paired_dataset = Dataset.from_dict(
        {
            "prompt": ["The sky is", "The sun is"],
@ -675,61 +697,49 @@ class UnpairPreferenceDatasetTester(TrlTestCase):
    def test_unpair_preference_dataset(self):
        # Test that a paired dataset is correctly converted to unpaired
        unpaired_dataset = unpair_preference_dataset(self.paired_dataset)
-        self.assertEqual(
-            unpaired_dataset.to_dict(),
-            self.unpaired_dataset.to_dict(),
-            "The paired dataset should be converted to unpaired.",
+        assert unpaired_dataset.to_dict() == self.unpaired_dataset.to_dict(), (
+            "The paired dataset should be converted to unpaired."
        )

    def test_unpair_preference_dataset_dict(self):
        # Test that a paired dataset dict is correctly converted to unpaired
        paired_dataset_dict = DatasetDict({"abc": self.paired_dataset})
        unpaired_dataset_dict = unpair_preference_dataset(paired_dataset_dict)
-        self.assertEqual(
-            unpaired_dataset_dict["abc"].to_dict(),
-            self.unpaired_dataset.to_dict(),
-            "The paired dataset should be converted to unpaired.",
+        assert unpaired_dataset_dict["abc"].to_dict() == self.unpaired_dataset.to_dict(), (
+            "The paired dataset should be converted to unpaired."
        )

    def test_maybe_unpair_preference_dataset(self):
        # Test that a paired dataset is correctly converted to unpaired with maybe_unpair_preference_dataset
        unpaired_dataset = maybe_unpair_preference_dataset(self.paired_dataset)
-        self.assertEqual(
-            unpaired_dataset.to_dict(),
-            self.unpaired_dataset.to_dict(),
-            "The paired dataset should be converted to unpaired.",
+        assert unpaired_dataset.to_dict() == self.unpaired_dataset.to_dict(), (
+            "The paired dataset should be converted to unpaired."
        )

    def test_maybe_unpair_preference_dataset_dict(self):
        # Test that a paired dataset dict is correctly converted to unpaired with maybe_unpair_preference_dataset
        paired_dataset_dict = DatasetDict({"abc": self.paired_dataset})
        unpaired_dataset_dict = maybe_unpair_preference_dataset(paired_dataset_dict)
-        self.assertEqual(
-            unpaired_dataset_dict["abc"].to_dict(),
-            self.unpaired_dataset.to_dict(),
-            "The paired dataset should be converted to unpaired.",
+        assert unpaired_dataset_dict["abc"].to_dict() == self.unpaired_dataset.to_dict(), (
+            "The paired dataset should be converted to unpaired."
        )

    def test_maybe_unpair_preference_dataset_already_paired(self):
        # Test that an already unpaired dataset remains unchanged with maybe_unpair_preference_dataset
        unpaired_dataset = maybe_unpair_preference_dataset(self.unpaired_dataset)
-        self.assertEqual(
-            unpaired_dataset.to_dict(),
-            self.unpaired_dataset.to_dict(),
-            "The unpaired dataset should remain unchanged.",
+        assert unpaired_dataset.to_dict() == self.unpaired_dataset.to_dict(), (
+            "The unpaired dataset should remain unchanged."
        )

    def test_maybe_unpair_preference_dataset_dict_already_paired(self):
        # Test that an already unpaired dataset dict remains unchanged with maybe_unpair_preference_dataset
        unpaired_dataset_dict = maybe_unpair_preference_dataset(DatasetDict({"abc": self.unpaired_dataset}))
-        self.assertEqual(
-            unpaired_dataset_dict["abc"].to_dict(),
-            self.unpaired_dataset.to_dict(),
-            "The unpaired dataset should remain unchanged.",
+        assert unpaired_dataset_dict["abc"].to_dict() == self.unpaired_dataset.to_dict(), (
+            "The unpaired dataset should remain unchanged."
        )


-class ExtractPromptTester(TrlTestCase):
+class TestExtractPrompt(TrlTestCase):
    example_implicit_prompt_conversational = {
        "chosen": [
            {"role": "user", "content": "What color is the sky?"},
@ -767,56 +777,42 @@ class ExtractPromptTester(TrlTestCase):
    def test_extract_prompt_conversational(self):
        # Test that the prompt is correctly extracted from the dataset
        example_extracted_prompt = extract_prompt(self.example_implicit_prompt_conversational)
-        self.assertEqual(
-            example_extracted_prompt,
-            self.example_explicit_prompt_conversational,
-            "The prompt is not correctly extracted from the dataset.",
+        assert example_extracted_prompt == self.example_explicit_prompt_conversational, (
+            "The prompt is not correctly extracted from the dataset."
        )

    def test_maybe_extract_prompt_conversational(self):
        # Test that the prompt is correctly extracted from the dataset with maybe_extract_prompt
        example_extracted_prompt = maybe_extract_prompt(self.example_implicit_prompt_conversational)
-        self.assertEqual(
-            example_extracted_prompt,
-            self.example_explicit_prompt_conversational,
-            "The prompt is not correctly extracted from the dataset.",
+        assert example_extracted_prompt == self.example_explicit_prompt_conversational, (
+            "The prompt is not correctly extracted from the dataset."
        )

    def test_maybe_extract_prompt_conversational_already_explicit(self):
        # Test that the prompt remains unchanged with maybe_extract_prompt
        example_extracted_prompt = maybe_extract_prompt(self.example_explicit_prompt_conversational)
-        self.assertEqual(
-            example_extracted_prompt,
-            self.example_explicit_prompt_conversational,
-            "The prompt should remain unchanged.",
+        assert example_extracted_prompt == self.example_explicit_prompt_conversational, (
+            "The prompt should remain unchanged."
        )

    def test_extract_prompt_standard(self):
        # Test that the prompt is correctly extracted from the dataset
        example_extracted_prompt = extract_prompt(self.example_implicit_prompt_standard)
-        self.assertEqual(
-            example_extracted_prompt,
-            self.example_explicit_prompt_standard,
-            "The prompt is not correctly extracted from the dataset.",
+        assert example_extracted_prompt == self.example_explicit_prompt_standard, (
+            "The prompt is not correctly extracted from the dataset."
        )

    def test_maybe_extract_prompt_standard(self):
        # Test that the prompt is correctly extracted from the dataset with maybe_extract_prompt
        example_extracted_prompt = maybe_extract_prompt(self.example_implicit_prompt_standard)
-        self.assertEqual(
-            example_extracted_prompt,
-            self.example_explicit_prompt_standard,
-            "The prompt is not correctly extracted from the dataset.",
+        assert example_extracted_prompt == self.example_explicit_prompt_standard, (
+            "The prompt is not correctly extracted from the dataset."
        )

    def test_maybe_extract_prompt_standard_already_explicit(self):
        # Test that the prompt remains unchanged with maybe_extract_prompt
        example_extracted_prompt = maybe_extract_prompt(self.example_explicit_prompt_standard)
-        self.assertEqual(
-            example_extracted_prompt,
-            self.example_explicit_prompt_standard,
-            "The prompt should remain unchanged.",
-        )
+        assert example_extracted_prompt == self.example_explicit_prompt_standard, "The prompt should remain unchanged."


 class TestPackDatasetWrapped(TrlTestCase):
@ -832,7 +828,7 @@ class TestPackDatasetWrapped(TrlTestCase):
            "attention_mask": [[0, 1, 1], [0, 0, 1], [1, 1]],
        }
        dataset = pack_dataset(dataset, seq_length, strategy="wrapped")
-        self.assertEqual(dataset.to_dict(), expected_output)
+        assert dataset.to_dict() == expected_output

    def test_with_iterable_dataset(self):
        examples = {
@ -847,7 +843,7 @@ class TestPackDatasetWrapped(TrlTestCase):
        }
        dataset = pack_dataset(dataset, seq_length, strategy="wrapped")
        num_examples = len(examples[next(iter(examples))])
-        self.assertEqual(next(iter(dataset.batch(batch_size=num_examples))), expected_output)
+        assert next(iter(dataset.batch(batch_size=num_examples))) == expected_output


 class TestPackDatasetBfd(TrlTestCase):
@ -864,7 +860,7 @@ class TestPackDatasetBfd(TrlTestCase):
            "seq_lengths": [[4], [3, 1]],
        }
        dataset = pack_dataset(dataset, seq_length, strategy="bfd")
-        self.assertEqual(dataset.to_dict(), expected_output)
+        assert dataset.to_dict() == expected_output

    def test_with_iterable_dataset(self):
        examples = {
@ -880,7 +876,7 @@ class TestPackDatasetBfd(TrlTestCase):
        }
        dataset = pack_dataset(dataset, seq_length, strategy="bfd")
        num_examples = len(examples[next(iter(examples))])
-        self.assertEqual(next(iter(dataset.batch(batch_size=num_examples))), expected_output)
+        assert next(iter(dataset.batch(batch_size=num_examples))) == expected_output

    def test_with_truncation(self):
        examples = {
@ -895,7 +891,7 @@ class TestPackDatasetBfd(TrlTestCase):
            "seq_lengths": [[4], [4], [2, 1]],
        }
        dataset = pack_dataset(dataset, seq_length, strategy="bfd")
-        self.assertEqual(dataset.to_dict(), expected_output)
+        assert dataset.to_dict() == expected_output

    def test_with_non_power_of_2(self):
        examples = {
@ -910,7 +906,7 @@ class TestPackDatasetBfd(TrlTestCase):
            "seq_lengths": [[5], [4, 1], [3]],
        }
        dataset = pack_dataset(dataset, seq_length, strategy="bfd")
-        self.assertEqual(dataset.to_dict(), expected_output)
+        assert dataset.to_dict() == expected_output


 class TestTruncateExamples(TrlTestCase):
@ -926,7 +922,7 @@ class TestTruncateExamples(TrlTestCase):
            "attention_mask": [[0, 1], [0, 0], [1]],
        }
        dataset = truncate_dataset(dataset, max_length)
-        self.assertEqual(dataset.to_dict(), expected_output)
+        assert dataset.to_dict() == expected_output

    def test_with_iterable_dataset(self):
        examples = {
@ -941,7 +937,7 @@ class TestTruncateExamples(TrlTestCase):
        }
        dataset = truncate_dataset(dataset, max_length)
        num_examples = len(examples[next(iter(examples))])
-        self.assertEqual(next(iter(dataset.batch(batch_size=num_examples))), expected_output)
+        assert next(iter(dataset.batch(batch_size=num_examples))) == expected_output

    def test_with_extra_column(self):
        examples = {
@ -957,7 +953,7 @@ class TestTruncateExamples(TrlTestCase):
            "my_column": ["a", "b", "c"],
        }
        dataset = truncate_dataset(dataset, max_length)
-        self.assertEqual(dataset.to_dict(), expected_output)
+        assert dataset.to_dict() == expected_output


 class TestMaybeConvertToChatML(TrlTestCase):
@ -975,7 +971,7 @@ class TestMaybeConvertToChatML(TrlTestCase):
                {"role": "assistant", "content": "It is blue."},
            ]
        }
-        self.assertEqual(maybe_convert_to_chatml(example), expected_output)
+        assert maybe_convert_to_chatml(example) == expected_output

    def test_without_conversations_key(self):
        # Same as before, but we don't rename the keys
@ -987,12 +983,12 @@ class TestMaybeConvertToChatML(TrlTestCase):
            "prompt": [{"role": "user", "content": "What color is the sky?"}],
            "completion": [{"role": "assistant", "content": "It is blue."}],
        }
-        self.assertEqual(maybe_convert_to_chatml(example), expected_output)
+        assert maybe_convert_to_chatml(example) == expected_output

    def test_not_conversional(self):
        # When not needed, the example should remain unchanged
        example = {"text": "The sky is blue."}
-        self.assertEqual(maybe_convert_to_chatml(example), example)
+        assert maybe_convert_to_chatml(example) == example

    def test_already_chatml(self):
        # When the example is already in ChatML format, it should remain unchanged
@ -1002,9 +998,4 @@ class TestMaybeConvertToChatML(TrlTestCase):
                {"role": "assistant", "content": "It is blue."},
            ]
        }
-        self.assertEqual(maybe_convert_to_chatml(example), example)
-
-
-# Run the tests
-if __name__ == "__main__":
-    unittest.main()
+        assert maybe_convert_to_chatml(example) == example
--- a/tests/test_dataset_formatting.py
+++ b/tests/test_dataset_formatting.py
@ -14,6 +14,7 @@

 from typing import Callable

+import pytest
 from datasets import Dataset, load_dataset
 from transformers import AutoModelForCausalLM, AutoModelForSequenceClassification, AutoTokenizer

@ -23,9 +24,9 @@ from trl.models.utils import ChatMlSpecialTokens, clone_chat_template, setup_cha
 from .testing_utils import TrlTestCase


-class DatasetFormattingTestCase(TrlTestCase):
-    def setUp(self):
-        super().setUp()
+@pytest.mark.filterwarnings("ignore::FutureWarning")
+class TestDatasetFormatting(TrlTestCase):
+    def setup_method(self):
        self.llama_tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-MistralForCausalLM-0.1")
        self.chatml_tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")

@ -44,20 +45,20 @@ class DatasetFormattingTestCase(TrlTestCase):

        # Llama tokenizer
        formatting_func = get_formatting_func_from_dataset(dataset, self.llama_tokenizer)
-        self.assertIsInstance(formatting_func, Callable)
+        assert isinstance(formatting_func, Callable)
        formatted_text = formatting_func(dataset[0])
        expected = "<s> [INST] You are helpful\n\nHello [/INST] Hi, how can I help you?</s>"
-        self.assertEqual(formatted_text, expected)
+        assert formatted_text == expected
        formatted_text = formatting_func(dataset[0:1])
-        self.assertListEqual(formatted_text, [expected])
+        assert formatted_text == [expected]

        # ChatML tokenizer
        formatting_func = get_formatting_func_from_dataset(dataset, self.chatml_tokenizer)
        formatted_text = formatting_func(dataset[0])
        expected = "<|im_start|>system\nYou are helpful<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi, how can I help you?<|im_end|>\n"
-        self.assertEqual(formatted_text, expected)
+        assert formatted_text == expected
        formatted_text = formatting_func(dataset[0:1])
-        self.assertListEqual(formatted_text, [expected])
+        assert formatted_text == [expected]

    def test_get_formatting_func_from_dataset_with_chatml_conversations(self):
        dataset = Dataset.from_dict(
@ -73,53 +74,52 @@ class DatasetFormattingTestCase(TrlTestCase):
        )
        # Llama tokenizer
        formatting_func = get_formatting_func_from_dataset(dataset, self.llama_tokenizer)
-        self.assertIsInstance(formatting_func, Callable)
+        assert isinstance(formatting_func, Callable)
        formatted_text = formatting_func(dataset[0])
        expected = "<s> [INST] You are helpful\n\nHello [/INST] Hi, how can I help you?</s>"
-        self.assertEqual(formatted_text, expected)
+        assert formatted_text == expected
        formatted_text = formatting_func(dataset[0:1])
-        self.assertListEqual(formatted_text, [expected])
+        assert formatted_text == [expected]

        # ChatML tokenizer
        formatting_func = get_formatting_func_from_dataset(dataset, self.chatml_tokenizer)
        formatted_text = formatting_func(dataset[0])
        expected = "<|im_start|>system\nYou are helpful<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi, how can I help you?<|im_end|>\n"
-        self.assertEqual(formatted_text, expected)
+        assert formatted_text == expected
        formatted_text = formatting_func(dataset[0:1])
-        self.assertListEqual(formatted_text, [expected])
+        assert formatted_text == [expected]

    def test_get_formatting_func_from_dataset_with_instruction(self):
        dataset = Dataset.from_list(
            [{"prompt": "What is 2+2?", "completion": "4"}, {"prompt": "What is 3+3?", "completion": "6"}]
        )
        formatting_func = get_formatting_func_from_dataset(dataset, self.llama_tokenizer)
-        self.assertIsNotNone(formatting_func)
-        self.assertIsInstance(formatting_func, Callable)
+        assert formatting_func is not None
+        assert isinstance(formatting_func, Callable)
        formatted_text = formatting_func(dataset[0])
-        self.assertEqual(formatted_text, "<s> [INST] What is 2+2? [/INST] 4</s>")
+        assert formatted_text == "<s> [INST] What is 2+2? [/INST] 4</s>"
        formatted_text = formatting_func(dataset[0:1])
-        self.assertListEqual(formatted_text, ["<s> [INST] What is 2+2? [/INST] 4</s>"])
+        assert formatted_text == ["<s> [INST] What is 2+2? [/INST] 4</s>"]

    def test_get_formatting_func_from_dataset_from_hub(self):
        ds_1 = load_dataset("philschmid/trl-test-instruction", split="train")
        ds_2 = load_dataset("philschmid/dolly-15k-oai-style", split="train")
        for ds in [ds_1, ds_2]:
            formatting_func = get_formatting_func_from_dataset(ds, self.llama_tokenizer)
-            self.assertIsNotNone(formatting_func)
-            self.assertIsInstance(formatting_func, Callable)
+            assert formatting_func is not None
+            assert isinstance(formatting_func, Callable)
        ds_3 = load_dataset("philschmid/guanaco-sharegpt-style", split="train")
        formatting_func = get_formatting_func_from_dataset(ds_3, self.llama_tokenizer)
-        self.assertIsNone(formatting_func)
+        assert formatting_func is None

    def test_get_formatting_func_from_dataset_with_unknown_format(self):
        dataset = Dataset.from_dict({"text": "test"})
        formatting_func = get_formatting_func_from_dataset(dataset, self.llama_tokenizer)
-        self.assertIsNone(formatting_func)
+        assert formatting_func is None


-class SetupChatFormatTestCase(TrlTestCase):
-    def setUp(self):
-        super().setUp()
+class TestSetupChatFormat(TrlTestCase):
+    def setup_method(self):
        self.tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
        self.model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
        # remove built-in chat_template to simulate a model having no chat_template
@ -132,13 +132,13 @@ class SetupChatFormatTestCase(TrlTestCase):

        _chatml = ChatMlSpecialTokens()
        # Check if special tokens are correctly set
-        self.assertEqual(modified_tokenizer.eos_token, "<|im_end|>")
-        self.assertEqual(modified_tokenizer.pad_token, "<|im_end|>")
-        self.assertEqual(modified_tokenizer.bos_token, "<|im_start|>")
-        self.assertEqual(modified_tokenizer.eos_token, _chatml.eos_token)
-        self.assertEqual(modified_tokenizer.pad_token, _chatml.pad_token)
-        self.assertEqual(modified_tokenizer.bos_token, _chatml.bos_token)
-        self.assertEqual((modified_model.vocab_size % 123), 0)
+        assert modified_tokenizer.eos_token == "<|im_end|>"
+        assert modified_tokenizer.pad_token == "<|im_end|>"
+        assert modified_tokenizer.bos_token == "<|im_start|>"
+        assert modified_tokenizer.eos_token == _chatml.eos_token
+        assert modified_tokenizer.pad_token == _chatml.pad_token
+        assert modified_tokenizer.bos_token == _chatml.bos_token
+        assert (modified_model.vocab_size % 123) == 0

    def test_example_with_setup_model(self):
        modified_model, modified_tokenizer = setup_chat_format(
@ -152,13 +152,13 @@ class SetupChatFormatTestCase(TrlTestCase):
        ]
        prompt = modified_tokenizer.apply_chat_template(messages, tokenize=False)

-        self.assertEqual(
-            prompt,
-            "<|im_start|>system\nYou are helpful<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi, how can I help you?<|im_end|>\n",
+        assert (
+            prompt
+            == "<|im_start|>system\nYou are helpful<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\nHi, how can I help you?<|im_end|>\n"
        )


-class CloneChatTemplateTestCase(TrlTestCase):
+class TestCloneChatTemplate(TrlTestCase):
    def test_clone(self):
        # This tokenizer doesn't have a chat_template by default
        tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-BloomForCausalLM")
@ -168,7 +168,7 @@ class CloneChatTemplateTestCase(TrlTestCase):
        _, modified_tokenizer, _ = clone_chat_template(model, tokenizer, source)

        # Check if special tokens are correctly set
-        self.assertEqual(modified_tokenizer.eos_token, "<|im_end|>")
+        assert modified_tokenizer.eos_token == "<|im_end|>"

    def test_clone_with_resize(self):
        # This tokenizer doesn't have a chat_template by default
@ -181,9 +181,9 @@ class CloneChatTemplateTestCase(TrlTestCase):
        )

        # Check that the input embeddings have been resized to a multiple of 123
-        self.assertEqual((modified_model.vocab_size % 123), 0)
+        assert (modified_model.vocab_size % 123) == 0
        # Check that the input embeddings size matches the tokenizer vocabulary size
-        self.assertEqual(model.vocab_size, len(modified_tokenizer.vocab))
+        assert model.vocab_size == len(modified_tokenizer.vocab)

    def test_clone_with_resize_and_extra_tokens_already_in_vocab(self):
        # This tokenizer doesn't have a chat_template by default
@ -201,9 +201,9 @@ class CloneChatTemplateTestCase(TrlTestCase):
        )

        # Check that the input embeddings have been resized to a multiple of 124
-        self.assertEqual((modified_model.vocab_size % 124), 0)
+        assert (modified_model.vocab_size % 124) == 0
        # Check that the input embeddings size matches the tokenizer vocabulary size
-        self.assertEqual(model.vocab_size, len(modified_tokenizer.vocab))
+        assert model.vocab_size == len(modified_tokenizer.vocab)

    def test_apply_new_chat_template(self):
        # This tokenizer doesn't have a chat_template by default
@ -219,9 +219,9 @@ class CloneChatTemplateTestCase(TrlTestCase):
        ]
        prompt = modified_tokenizer.apply_chat_template(messages, tokenize=False)

-        self.assertEqual(
-            prompt,
-            "<|im_start|>system\nYou are helpful<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\nHi, how can I help you?<|im_end|>\n",
+        assert (
+            prompt
+            == "<|im_start|>system\nYou are helpful<|im_end|>\n<|im_start|>user\nHello<|im_end|>\n<|im_start|>assistant\n<think>\n\n</think>\n\nHi, how can I help you?<|im_end|>\n"
        )

    def test_clone_with_sequence_classification_model(self):
@ -235,4 +235,4 @@ class CloneChatTemplateTestCase(TrlTestCase):
        _, modified_tokenizer, _ = clone_chat_template(model, tokenizer, source)

        # Check if special tokens are correctly set
-        self.assertEqual(modified_tokenizer.eos_token, "<|im_end|>")
+        assert modified_tokenizer.eos_token == "<|im_end|>"
--- a/tests/test_dpo_trainer.py
+++ b/tests/test_dpo_trainer.py
@ -12,11 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import re
 import sys
-import unittest
 from unittest.mock import MagicMock

 import numpy as np
+import pytest
 import torch
 from datasets import Dataset, features, load_dataset
 from parameterized import parameterized
@ -32,14 +33,18 @@ from transformers import (
 from transformers.testing_utils import (
    get_device_properties,
    require_liger_kernel,
-    require_peft,
-    require_torch_gpu_if_bnb_not_multi_backend_enabled,
-    require_vision,
 )

 from trl import DPOConfig, DPOTrainer, FDivergenceType

-from .testing_utils import TrlTestCase, require_bitsandbytes, require_no_wandb
+from .testing_utils import (
+    TrlTestCase,
+    require_bitsandbytes,
+    require_no_wandb,
+    require_peft,
+    require_torch_gpu_if_bnb_not_multi_backend_enabled,
+    require_vision,
+)


 if is_vision_available():
@ -47,8 +52,7 @@ if is_vision_available():


 class TestTokenizeRow(TrlTestCase):
-    def setUp(self):
-        super().setUp()
+    def setup_method(self):
        # Set up the mock tokenizer with specific behaviors
        self.tokenizer = MagicMock(spec=PreTrainedTokenizerBase)
        self.tokenizer.bos_token_id = 0
@ -84,14 +88,11 @@ class TestTokenizeRow(TrlTestCase):
        )

        # Assert the correct output without truncation or special tokens
-        self.assertEqual(
-            result,
-            {
-                "prompt_input_ids": [464, 6766, 318],
-                "chosen_input_ids": [4171, 2],  # eos_token added
-                "rejected_input_ids": [4077, 2],  # eos_token added
-            },
-        )
+        assert result == {
+            "prompt_input_ids": [464, 6766, 318],
+            "chosen_input_ids": [4171, 2],  # eos_token added
+            "rejected_input_ids": [4077, 2],  # eos_token added
+        }

    def test_tokenize_row_with_truncation(self):
        # Define the input features
@ -107,14 +108,11 @@ class TestTokenizeRow(TrlTestCase):
        )

        # Assert the correct output with truncation applied
-        self.assertEqual(
-            result,
-            {
-                "prompt_input_ids": [6766, 318],  # truncated to the last 2 tokens
-                "chosen_input_ids": [4171],  # truncated to 1 token
-                "rejected_input_ids": [4077],  # truncated to 1 token
-            },
-        )
+        assert result == {
+            "prompt_input_ids": [6766, 318],  # truncated to the last 2 tokens
+            "chosen_input_ids": [4171],  # truncated to 1 token
+            "rejected_input_ids": [4077],  # truncated to 1 token
+        }

    def test_tokenize_row_with_special_tokens(self):
        # Define the input features
@ -130,14 +128,11 @@ class TestTokenizeRow(TrlTestCase):
        )

        # Assert the correct output with special tokens added
-        self.assertEqual(
-            result,
-            {
-                "prompt_input_ids": [0, 464, 6766, 318, 2],  # bos_token and eos_token added
-                "chosen_input_ids": [4171, 2],  # eos_token added
-                "rejected_input_ids": [4077, 2],  # eos_token added
-            },
-        )
+        assert result == {
+            "prompt_input_ids": [0, 464, 6766, 318, 2],  # bos_token and eos_token added
+            "chosen_input_ids": [4171, 2],  # eos_token added
+            "rejected_input_ids": [4077, 2],  # eos_token added
+        }

    def test_tokenize_row_with_truncation_and_special_tokens(self):
        # Define the input features
@ -153,19 +148,15 @@ class TestTokenizeRow(TrlTestCase):
        )

        # Assert the correct output with both truncation and special tokens
-        self.assertEqual(
-            result,
-            {
-                "prompt_input_ids": [464, 6766, 318, 2],  # truncated to 4 tokens with bos_token and eos_token
-                "chosen_input_ids": [4171],  # truncated to 1 token
-                "rejected_input_ids": [4077],  # truncated to 1 token
-            },
-        )
+        assert result == {
+            "prompt_input_ids": [464, 6766, 318, 2],  # truncated to 4 tokens with bos_token and eos_token
+            "chosen_input_ids": [4171],  # truncated to 1 token
+            "rejected_input_ids": [4077],  # truncated to 1 token
+        }


-class DPOTrainerTester(TrlTestCase):
-    def setUp(self):
-        super().setUp()
+class TestDPOTrainer(TrlTestCase):
+    def setup_method(self):
        self.model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
        self.model = AutoModelForCausalLM.from_pretrained(self.model_id)
        self.ref_model = AutoModelForCausalLM.from_pretrained(self.model_id)
@ -193,13 +184,13 @@ class DPOTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Check that the parameters have changed
        for n, param in previous_trainable_params.items():
            new_param = trainer.model.get_parameter(n)
            if param.sum() != 0:  # ignore 0 biases
-                self.assertFalse(torch.allclose(param, new_param, rtol=1e-12, atol=1e-12))
+                assert not torch.allclose(param, new_param, rtol=1e-12, atol=1e-12)

    @parameterized.expand(
        [
@ -241,13 +232,13 @@ class DPOTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Check that the parameters have changed
        for n, param in previous_trainable_params.items():
            new_param = trainer.model.get_parameter(n)
            if param.sum() != 0:  # ignore 0 biases
-                self.assertFalse(torch.allclose(param, new_param, rtol=1e-12, atol=1e-12))
+                assert not torch.allclose(param, new_param, rtol=1e-12, atol=1e-12)

    @require_liger_kernel
    def test_train_encoder_decoder_liger(self):
@ -274,13 +265,13 @@ class DPOTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Check that the parameters have changed
        for n, param in previous_trainable_params.items():
            new_param = trainer.model.get_parameter(n)
            if param.sum() != 0:  # ignore 0 biases
-                self.assertFalse(torch.allclose(param, new_param, rtol=1e-12, atol=1e-12))
+                assert not torch.allclose(param, new_param, rtol=1e-12, atol=1e-12)

    def test_dpo_trainer_with_weighting(self):
        dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
@ -304,13 +295,13 @@ class DPOTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Check that the parameters have changed
        for n, param in previous_trainable_params.items():
            new_param = trainer.model.get_parameter(n)
            if param.sum() != 0:  # ignore 0 biases
-                self.assertFalse(torch.allclose(param, new_param, rtol=1e-12, atol=1e-12))
+                assert not torch.allclose(param, new_param, rtol=1e-12, atol=1e-12)

    def test_train_with_multiple_loss_types(self):
        """
@ -338,22 +329,21 @@ class DPOTrainerTester(TrlTestCase):

        # Test that training works
        trainer.train()
-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Verify SFT loss is computed in the first test too
        with torch.no_grad():
            batch = next(iter(trainer.get_train_dataloader()))
            loss, metrics = trainer.get_batch_loss_metrics(trainer.model, batch)
-            self.assertIn("nll_loss", metrics)  # SFT loss should be computed
+            assert "nll_loss" in metrics  # SFT loss should be computed

    def test_wrong_loss_weights_length(self):
-        with self.assertRaises(ValueError) as context:
+        with pytest.raises(ValueError, match="Length of loss_weights list"):
            DPOConfig(
                output_dir=self.tmp_dir,
                loss_type=["sigmoid", "bco_pair"],
                loss_weights=[1.0, 0.5, 0.1],  # Wrong length
            )
-        self.assertIn("Length of loss_weights list", str(context.exception))

    @parameterized.expand([(None,), (0.5,)])
    def test_dpo_trainer_without_providing_ref_model(self, rpo_alpha):
@ -386,13 +376,13 @@ class DPOTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Check that the parameters have changed
        for n, param in previous_trainable_params.items():
            new_param = trainer.model.get_parameter(n)
            if param.sum() != 0:  # ignore 0 biases
-                self.assertFalse(torch.equal(param, new_param))
+                assert not torch.equal(param, new_param)

    def test_dpo_trainer_with_ref_model_is_model(self):
        training_args = DPOConfig(
@ -404,7 +394,7 @@ class DPOTrainerTester(TrlTestCase):

        dummy_dataset = load_dataset("trl-internal-testing/zen", "standard_preference")

-        with self.assertRaises(ValueError):
+        with pytest.raises(ValueError):
            DPOTrainer(
                model=self.model,
                ref_model=self.model,  # ref_model can't be the same as model
@ -437,13 +427,13 @@ class DPOTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Check that the parameters have changed
        for n, param in previous_trainable_params.items():
            new_param = trainer.model.get_parameter(n)
            if param.sum() != 0:  # ignore 0 biases
-                self.assertFalse(torch.allclose(param, new_param, rtol=1e-12, atol=1e-12))
+                assert not torch.allclose(param, new_param, rtol=1e-12, atol=1e-12)

    @require_peft
    def test_dpo_trainer_without_providing_ref_model_with_lora(self):
@ -486,14 +476,14 @@ class DPOTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Check that the parameters have changed
        for n, param in previous_trainable_params.items():
            if "lora" in n:
                new_param = trainer.model.get_parameter(n)
                if param.sum() != 0:  # ignore 0 biases
-                    self.assertFalse(torch.equal(param, new_param))
+                    assert not torch.equal(param, new_param)

    def test_dpo_trainer_w_dataset_num_proc(self):
        training_args = DPOConfig(
@ -555,13 +545,13 @@ class DPOTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Check that the parameters have changed
        for n, param in previous_trainable_params.items():
            new_param = trainer.ref_model.get_parameter(n)
            if param.sum() != 0:  # ignore 0 biases
-                self.assertFalse(torch.equal(param, new_param))
+                assert not torch.equal(param, new_param)

    @require_no_wandb
    def test_dpo_trainer_generate_during_eval_no_wandb(self):
@ -580,9 +570,9 @@ class DPOTrainerTester(TrlTestCase):

        dummy_dataset = load_dataset("trl-internal-testing/zen", "standard_preference")

-        with self.assertRaisesRegex(
+        with pytest.raises(
            ValueError,
-            expected_regex="`generate_during_eval=True` requires Weights and Biases, MLFlow or Comet to be installed."
+            match="`generate_during_eval=True` requires Weights and Biases, MLFlow or Comet to be installed."
            " Please install `wandb`, `mlflow` or `comet-ml` to resolve.",
        ):
            DPOTrainer(
@ -645,7 +635,7 @@ class DPOTrainerTester(TrlTestCase):
        try:
            AutoModelForCausalLM.from_pretrained(self.tmp_dir)
        except OSError:
-            self.fail("Loading the saved peft adapter failed")
+            pytest.fail("Loading the saved peft adapter failed")

    @require_peft
    @require_torch_gpu_if_bnb_not_multi_backend_enabled
@ -729,9 +719,9 @@ class DPOTrainerTester(TrlTestCase):
    )
    @require_bitsandbytes
    @require_peft
-    @unittest.skipIf(
+    @pytest.mark.skipif(
        get_device_properties()[0] == "cuda" and get_device_properties()[1] < 8,
-        "Skipping because bf16 not supported on CUDA GPU with capability < 8.0",
+        reason="Skipping because bf16 not supported on CUDA GPU with capability < 8.0",
    )
    def test_dpo_lora_bf16_autocast(self, loss_type, pre_compute, gen_during_eval):
        from peft import LoraConfig
@ -826,7 +816,7 @@ class DPOTrainerTester(TrlTestCase):
        )

        for tag in ["dpo", "trl"]:
-            self.assertIn(tag, trainer.model.model_tags)
+            assert tag in trainer.model.model_tags

    @require_peft
    def test_dpo_tags(self):
@ -861,7 +851,7 @@ class DPOTrainerTester(TrlTestCase):
        )

        for tag in ["dpo", "trl"]:
-            self.assertIn(tag, trainer.model.model_tags)
+            assert tag in trainer.model.model_tags

    @require_peft
    def test_dpo_lora_force_use_ref(self):
@ -895,7 +885,7 @@ class DPOTrainerTester(TrlTestCase):

        dummy_dataset = load_dataset("trl-internal-testing/zen", "standard_preference")

-        with self.assertRaises(ValueError):
+        with pytest.raises(ValueError):
            # passing a peft_model as model and ref_model should error out,
            # unless you pass `force_use_ref_model`
            trainer = DPOTrainer(
@ -953,8 +943,8 @@ class DPOTrainerTester(TrlTestCase):
            args=training_args,
            train_dataset=dummy_dataset["train"],
        )
-        self.assertEqual(trainer.model.config.dtype, torch.float16)
-        self.assertEqual(trainer.ref_model.config.dtype, torch.float16)
+        assert trainer.model.config.dtype == torch.float16
+        assert trainer.ref_model.config.dtype == torch.float16

        # Now test when `dtype` is provided but is wrong to either the model or the ref_model
        training_args = DPOConfig(
@ -965,7 +955,12 @@ class DPOTrainerTester(TrlTestCase):
            report_to="none",
        )

-        with self.assertRaises(ValueError) as context:
+        with pytest.raises(
+            ValueError,
+            match=re.escape(
+                "Invalid `dtype` passed to the config. Expected either 'auto' or a string representing a valid `torch.dtype` (e.g., 'float32'), but got -1."
+            ),
+        ):
            _ = DPOTrainer(
                model=self.model_id,
                processing_class=self.tokenizer,
@ -973,12 +968,6 @@ class DPOTrainerTester(TrlTestCase):
                train_dataset=dummy_dataset["train"],
            )

-        self.assertIn(
-            "Invalid `dtype` passed to the config. Expected either 'auto' or a string representing a valid "
-            "`torch.dtype` (e.g., 'float32'), but got -1.",
-            str(context.exception),
-        )
-
        training_args = DPOConfig(
            output_dir=self.tmp_dir,
            per_device_train_batch_size=2,
@ -987,7 +976,12 @@ class DPOTrainerTester(TrlTestCase):
            report_to="none",
        )

-        with self.assertRaises(ValueError) as context:
+        with pytest.raises(
+            ValueError,
+            match=re.escape(
+                "Invalid `dtype` passed to the config. Expected either 'auto' or a string representing a valid `torch.dtype` (e.g., 'float32'), but got -1."
+            ),
+        ):
            _ = DPOTrainer(
                model=self.model_id,
                ref_model=self.model_id,
@ -996,12 +990,6 @@ class DPOTrainerTester(TrlTestCase):
                train_dataset=dummy_dataset["train"],
            )

-        self.assertIn(
-            "Invalid `dtype` passed to the config. Expected either 'auto' or a string representing a valid "
-            "`torch.dtype` (e.g., 'float32'), but got -1.",
-            str(context.exception),
-        )
-
    def test_dpo_loss_alpha_div_f(self):
        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
        tokenizer = AutoTokenizer.from_pretrained(model_id)
@ -1041,7 +1029,7 @@ class DPOTrainerTester(TrlTestCase):
        losses, _, _ = trainer.dpo_loss(
            policy_chosen_logps, policy_rejected_logps, reference_chosen_logps, reference_rejected_logps
        )
-        self.assertTrue(torch.isfinite(losses).cpu().numpy().all())
+        assert torch.isfinite(losses).cpu().numpy().all()

    def test_dpo_loss_js_div_f(self):
        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
@ -1083,7 +1071,7 @@ class DPOTrainerTester(TrlTestCase):
        losses, _, _ = trainer.dpo_loss(
            policy_chosen_logps, policy_rejected_logps, reference_chosen_logps, reference_rejected_logps
        )
-        self.assertTrue(torch.isfinite(losses).cpu().numpy().all())
+        assert torch.isfinite(losses).cpu().numpy().all()

    def test_dpo_trainer_use_logits_to_keep(self):
        model_id = "trl-internal-testing/tiny-LlamaForCausalLM-3.2"
@ -1199,7 +1187,7 @@ class DPOTrainerTester(TrlTestCase):
        # We don't run the training, but at this stage, the dataset is supposed to be pre-processed. When
        # pre-processing, we expect the available tools to be explicitly mentioned in the system prompt. That's
        # what we're checking here
-        self.assertIn("get_current_temperature", tokenizer.decode(trainer.train_dataset["prompt_input_ids"][0]))
+        assert "get_current_temperature" in tokenizer.decode(trainer.train_dataset["prompt_input_ids"][0])

    def test_padding_free(self):
        model_id = "trl-internal-testing/tiny-LlamaForCausalLM-3.2"
@ -1235,7 +1223,7 @@ class DPOTrainerTester(TrlTestCase):
        for n, param in previous_trainable_params.items():
            new_param = trainer.model.get_parameter(n)
            if param.sum() != 0:  # ignore 0 biases
-                self.assertFalse(torch.allclose(param, new_param, rtol=1e-12, atol=1e-12))
+                assert not torch.allclose(param, new_param, rtol=1e-12, atol=1e-12)

    def test_compute_metrics(self):
        model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
@ -1270,7 +1258,7 @@ class DPOTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertEqual(trainer.state.log_history[-2]["eval_test"], 0.0)
+        assert trainer.state.log_history[-2]["eval_test"] == 0.0

    def test_train_with_length_desensitization(self):
        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
@ -1295,15 +1283,14 @@ class DPOTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Check that the parameters have changed
        for n, param in previous_trainable_params.items():
            new_param = trainer.model.get_parameter(n)
            if param.sum() != 0:  # ignore 0 biases
-                self.assertFalse(torch.allclose(param, new_param, rtol=1e-12, atol=1e-12))
+                assert not torch.allclose(param, new_param, rtol=1e-12, atol=1e-12)

-    @unittest.skipUnless(sys.version_info >= (3, 10), "Liger kernel is not supported on Python 3.9")
    @parameterized.expand(
        [
            (0.1, "sigmoid"),
@ -1319,6 +1306,7 @@ class DPOTrainerTester(TrlTestCase):
        ]
    )
    @require_liger_kernel
+    @pytest.mark.skipif(not (sys.version_info >= (3, 10)), reason="Liger kernel is not supported on Python 3.9")
    def test_dpo_trainer_with_liger(self, beta, loss_type):
        """Test DPO trainer with Liger loss enabled across supported loss types.

@ -1359,20 +1347,20 @@ class DPOTrainerTester(TrlTestCase):
        train_output = trainer.train()

        # Verify training completed successfully
-        self.assertIsNotNone(train_output)
-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert train_output is not None
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Verify loss is finite
-        self.assertTrue(np.isfinite(trainer.state.log_history[-1]["train_loss"]))
+        assert np.isfinite(trainer.state.log_history[-1]["train_loss"])

        # Check parameters have been updated
        for n, param in previous_trainable_params.items():
            new_param = trainer.model.get_parameter(n)
            # Only check non-zero parameters
            if param.sum() != 0:
-                self.assertFalse(torch.equal(param, new_param))
+                assert not torch.equal(param, new_param)
                # Verify new parameters are finite
-                self.assertTrue(torch.isfinite(new_param).all())
+                assert torch.isfinite(new_param).all()

        # Verify model can still do forward pass after training
        dummy_batch = next(iter(trainer.get_train_dataloader()))
@ -1382,8 +1370,8 @@ class DPOTrainerTester(TrlTestCase):
        }
        with torch.no_grad():
            output = trainer.model(**model_inputs)
-        self.assertIsNotNone(output)
-        self.assertFalse("loss" in output.keys())
+        assert output is not None
+        assert "loss" not in output.keys()

    def test_train_with_iterable_dataset(self):
        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
@ -1411,17 +1399,17 @@ class DPOTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Check that the parameters have changed
        for n, param in previous_trainable_params.items():
            new_param = trainer.model.get_parameter(n)
            if param.sum() != 0:  # ignore 0 biases
-                self.assertFalse(torch.allclose(param, new_param, rtol=1e-12, atol=1e-12))
+                assert not torch.allclose(param, new_param, rtol=1e-12, atol=1e-12)


@require_vision
-class DPOVisionTrainerTester(TrlTestCase):
+class TestDPOVisionTrainer(TrlTestCase):
    @parameterized.expand(
        [
            # ("trl-internal-testing/tiny-Idefics2ForConditionalGeneration",),  device issue from transformers, see https://github.com/huggingface/transformers/pull/39975
@ -1494,7 +1482,7 @@ class DPOVisionTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

        # Check that the trainable params have changed
        for n, param in previous_trainable_params.items():
@ -1510,7 +1498,7 @@ class DPOVisionTrainerTester(TrlTestCase):
                    # For some reason, these params are not updated. This is probably not related to TRL, but to
                    # the model itself. We should investigate this further, but for now we just skip these params.
                    continue
-                self.assertFalse(torch.allclose(param, new_param, rtol=1e-12, atol=1e-12), f"Param {n} is not updated")
+                assert not torch.allclose(param, new_param, rtol=1e-12, atol=1e-12), f"Param {n} is not updated"


 class TestDPOConfig(TrlTestCase):
@ -1529,7 +1517,3 @@ class TestDPOConfig(TrlTestCase):
        # Serialization: TrainingArguments.to_dict should yield the enum's string value
        configparser_dict = training_args.to_dict()
        assert configparser_dict["f_divergence_type"] == f_divergence_type.value
-
-
-if __name__ == "__main__":
-    unittest.main()
--- a/tests/test_gkd_trainer.py
+++ b/tests/test_gkd_trainer.py
@ -26,9 +26,9 @@ from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
 from .testing_utils import TrlTestCase


-class TestGKDTrainer(TrlTestCase):
+class TestGKDTrainerGenerateOnPolicy(TrlTestCase):
    @classmethod
-    def setUpClass(cls):
+    def setup_class(cls):
        model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
        cls.tokenizer = AutoTokenizer.from_pretrained(model_id)
        cls.tokenizer.pad_token = cls.tokenizer.eos_token
@ -69,9 +69,8 @@ class TestGKDTrainer(TrlTestCase):

        # Check if the generated texts start with the original prompts
        for prompt, generated_text in zip(prompts, generated_texts):
-            self.assertTrue(
-                generated_text.startswith(prompt),
-                f"Generated text '{generated_text}' does not start with prompt '{prompt}'",
+            assert generated_text.startswith(prompt), (
+                f"Generated text '{generated_text}' does not start with prompt '{prompt}'"
            )

        # Run the generation twice and check if the outputs are identical
@ -82,15 +81,11 @@ class TestGKDTrainer(TrlTestCase):
        new_input_ids2, new_attention_mask2, new_labels2 = outputs2

        # Check if the two generations are identical
-        self.assertTrue(torch.all(new_input_ids.eq(new_input_ids2)), "Deterministic generations are not identical")
-        self.assertTrue(
-            torch.all(new_attention_mask.eq(new_attention_mask2)),
-            "Attention masks for deterministic generations are not identical",
-        )
-        self.assertTrue(
-            torch.all(new_labels.eq(new_labels2)),
-            "Labels for deterministic generations are not identical",
+        assert torch.all(new_input_ids.eq(new_input_ids2)), "Deterministic generations are not identical"
+        assert torch.all(new_attention_mask.eq(new_attention_mask2)), (
+            "Attention masks for deterministic generations are not identical"
        )
+        assert torch.all(new_labels.eq(new_labels2)), "Labels for deterministic generations are not identical"

    def test_generate_on_policy_outputs(self):
        prompts = ["Hello, how are you?", "What's the weather like today?"]
@ -106,30 +101,29 @@ class TestGKDTrainer(TrlTestCase):
        )

        # Check that outputs is a tuple of three tensors
-        self.assertIsInstance(outputs, tuple)
-        self.assertEqual(len(outputs), 3)
+        assert isinstance(outputs, tuple)
+        assert len(outputs) == 3

        new_input_ids, new_attention_mask, new_labels = outputs

        # Check shapes
        batch_size = len(prompts)
-        self.assertEqual(new_input_ids.shape[0], batch_size)
-        self.assertEqual(new_attention_mask.shape[0], batch_size)
-        self.assertEqual(new_labels.shape[0], batch_size)
+        assert new_input_ids.shape[0] == batch_size
+        assert new_attention_mask.shape[0] == batch_size
+        assert new_labels.shape[0] == batch_size

        # Check types
-        self.assertIsInstance(new_input_ids, torch.Tensor)
-        self.assertIsInstance(new_attention_mask, torch.Tensor)
-        self.assertIsInstance(new_labels, torch.Tensor)
+        assert isinstance(new_input_ids, torch.Tensor)
+        assert isinstance(new_attention_mask, torch.Tensor)
+        assert isinstance(new_labels, torch.Tensor)

        # Check that new_input_ids and new_attention_mask have the same shape
-        self.assertEqual(new_input_ids.shape, new_attention_mask.shape)
-        self.assertEqual(new_labels.shape, new_attention_mask.shape)
+        assert new_input_ids.shape == new_attention_mask.shape
+        assert new_labels.shape == new_attention_mask.shape


 class TestGeneralizedJSDLoss(TrlTestCase):
-    def setUp(self):
-        super().setUp()
+    def setup_method(self):
        self.batch_size = 2
        self.seq_length = 3
        self.vocab_size = 5
@ -139,7 +133,7 @@ class TestGeneralizedJSDLoss(TrlTestCase):
    def test_uniform_distribution(self):
        logits = torch.ones(1, 1, self.vocab_size)
        loss = GKDTrainer.generalized_jsd_loss(logits, logits)
-        self.assertAlmostEqual(loss.item(), 0, places=5)
+        assert round(abs(loss.item() - 0), 5) == 0

    def test_generalized_jsd_loss_edge_cases(self):
        # Setup
@ -151,29 +145,29 @@ class TestGeneralizedJSDLoss(TrlTestCase):
        expected_loss_beta_1 = F.kl_div(
            F.log_softmax(teacher_logits, dim=-1), F.softmax(student_logits, dim=-1), reduction="batchmean"
        )
-        self.assertAlmostEqual(loss_beta_1.item(), expected_loss_beta_1.item(), places=5)
+        assert round(abs(loss_beta_1.item() - expected_loss_beta_1.item()), 5) == 0

        # Case 2: beta = 0 (should be equivalent to KL(teacher || student))
        loss_beta_0 = GKDTrainer.generalized_jsd_loss(student_logits, teacher_logits, beta=0)
        expected_loss_beta_0 = F.kl_div(
            F.log_softmax(student_logits, dim=-1), F.softmax(teacher_logits, dim=-1), reduction="batchmean"
        )
-        self.assertAlmostEqual(loss_beta_0.item(), expected_loss_beta_0.item(), places=5)
+        assert round(abs(loss_beta_0.item() - expected_loss_beta_0.item()), 5) == 0

    def test_output_shape(self):
        loss = GKDTrainer.generalized_jsd_loss(self.student_logits, self.teacher_logits)
-        self.assertTrue(torch.is_tensor(loss))
-        self.assertEqual(loss.shape, torch.Size([]))
+        assert torch.is_tensor(loss)
+        assert loss.shape == torch.Size([])

    def test_beta_values(self):
        loss_beta_0 = GKDTrainer.generalized_jsd_loss(self.student_logits, self.teacher_logits, beta=0)
        loss_beta_1 = GKDTrainer.generalized_jsd_loss(self.student_logits, self.teacher_logits, beta=1)
-        self.assertNotEqual(loss_beta_0, loss_beta_1)
+        assert loss_beta_0 != loss_beta_1

    def test_temperature_scaling(self):
        loss_temp_1 = GKDTrainer.generalized_jsd_loss(self.student_logits, self.teacher_logits, temperature=1)
        loss_temp_2 = GKDTrainer.generalized_jsd_loss(self.student_logits, self.teacher_logits, temperature=2)
-        self.assertNotEqual(loss_temp_1, loss_temp_2)
+        assert loss_temp_1 != loss_temp_2

    def test_reduction_methods(self):
        loss_batchmean = GKDTrainer.generalized_jsd_loss(
@ -183,29 +177,28 @@ class TestGeneralizedJSDLoss(TrlTestCase):
        loss_mean = GKDTrainer.generalized_jsd_loss(self.student_logits, self.teacher_logits, reduction="mean")
        loss_none = GKDTrainer.generalized_jsd_loss(self.student_logits, self.teacher_logits, reduction="none")

-        self.assertEqual(loss_batchmean.shape, torch.Size([]))
-        self.assertEqual(loss_sum.shape, torch.Size([]))
-        self.assertEqual(loss_mean.shape, torch.Size([]))
-        self.assertEqual(loss_none.shape, self.student_logits.shape)
+        assert loss_batchmean.shape == torch.Size([])
+        assert loss_sum.shape == torch.Size([])
+        assert loss_mean.shape == torch.Size([])
+        assert loss_none.shape == self.student_logits.shape

    def test_symmetry(self):
        student_teacher = GKDTrainer.generalized_jsd_loss(self.student_logits, self.teacher_logits, beta=0.1)
        teacher_student = GKDTrainer.generalized_jsd_loss(self.teacher_logits, self.student_logits, beta=0.1)
-        self.assertNotEqual(student_teacher, teacher_student)
+        assert student_teacher != teacher_student

        student_teacher = GKDTrainer.generalized_jsd_loss(self.student_logits, self.teacher_logits, beta=0.5)
        teacher_student = GKDTrainer.generalized_jsd_loss(self.teacher_logits, self.student_logits, beta=0.5)
-        self.assertEqual(student_teacher, teacher_student)
+        assert student_teacher == teacher_student

    def test_zero_loss_for_identical_inputs(self):
        identical_logits = torch.randn(self.batch_size, self.seq_length, self.vocab_size)
        loss = GKDTrainer.generalized_jsd_loss(identical_logits, identical_logits)
-        self.assertAlmostEqual(loss.item(), 0, places=6)
+        assert round(abs(loss.item() - 0), 6) == 0


-class GKDTrainerTester(TrlTestCase):
-    def setUp(self):
-        super().setUp()
+class TestGKDTrainer(TrlTestCase):
+    def setup_method(self):
        self.model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
        self.model = AutoModelForCausalLM.from_pretrained(self.model_id)
        self.teacher_model = AutoModelForCausalLM.from_pretrained(self.model_id)
@ -241,9 +234,9 @@ class GKDTrainerTester(TrlTestCase):

        trainer.train()

-        self.assertIsNotNone(trainer.state.log_history[(-1)]["train_loss"])
-        self.assertIsNotNone(trainer.state.log_history[0]["eval_loss"])
-        self.assertIn("model.safetensors", os.listdir(self.tmp_dir + "/checkpoint-2"))
+        assert trainer.state.log_history[(-1)]["train_loss"] is not None
+        assert trainer.state.log_history[0]["eval_loss"] is not None
+        assert "model.safetensors" in os.listdir(self.tmp_dir + "/checkpoint-2")

    @require_liger_kernel
    def test_gkd_trainer_with_liger(self):
@ -269,7 +262,7 @@ class GKDTrainerTester(TrlTestCase):
        trainer.train()

        # Check we logged a train loss
-        self.assertIsNotNone(trainer.state.log_history[-1]["train_loss"])
+        assert trainer.state.log_history[-1]["train_loss"] is not None

    def test_generation_config_init(self):
        training_args = GKDConfig(output_dir=self.tmp_dir)
@ -284,8 +277,8 @@ class GKDTrainerTester(TrlTestCase):
            processing_class=self.tokenizer,
        )

-        self.assertEqual(trainer.generation_config.pad_token_id, self.tokenizer.eos_token_id)
-        self.assertEqual(trainer.generation_config.eos_token_id, self.model.generation_config.eos_token_id)
-        self.assertEqual(trainer.generation_config.max_new_tokens, training_args.max_new_tokens)
-        self.assertEqual(trainer.generation_config.temperature, training_args.temperature)
-        self.assertEqual(trainer.generation_config.top_k, 0)
+        assert trainer.generation_config.pad_token_id == self.tokenizer.eos_token_id
+        assert trainer.generation_config.eos_token_id == self.model.generation_config.eos_token_id
+        assert trainer.generation_config.max_new_tokens == training_args.max_new_tokens
+        assert trainer.generation_config.temperature == training_args.temperature
+        assert trainer.generation_config.top_k == 0
--- a/Show More
+++ b/Show More