Compare commits

...

231 Commits

Author SHA1 Message Date
295d0d18bd Merge branch 'main' into docs/move-multi-adapter-rl 2025-11-03 13:29:59 -08:00
43253b2ae4 Add On-Policy Distillation from thinking labs to paper index. (#4410)
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-11-03 21:07:31 +00:00
6f41b18e49 fix: Remove chat template setting from non-SFT trainer scripts (#4437)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-11-03 10:57:51 -08:00
0340d31759 Merge branch 'main' into docs/move-multi-adapter-rl 2025-11-03 10:17:07 -08:00
8d64144a23 docs: List all trainers that support Liger Kernel (#4432) 2025-11-03 10:16:03 -08:00
7347a10f1d docbuilder style 2025-11-03 16:18:33 +00:00
6eb8d46a38 docs: Remove outdated conversational dataset conversion guidance (#4422) 2025-11-03 07:28:46 -08:00
2a6408020b Openenv wordle example (#4357)
Co-authored-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
2025-11-03 12:57:29 +01:00
bb057d15d9 Create "Talks" subsection (#4414) 2025-11-03 10:42:17 +01:00
fb7df7d02c docs: Move Multi-Adapter RL section to PEFT integration
Resolves #4397

- Moved Multi-Adapter RL content from standalone page to PEFT integration guide
- Removed docs/source/multi_adapter_rl.md file
- Updated _toctree.yml to remove Multi Adapter RLHF reference
- Reorganized content as subsection within PEFT integration
- Kept experimental warnings and technical details intact
2025-11-02 15:47:34 -08:00
53bc0984e3 docs: List all trainers that support Liger Kernel
Resolves #4386

- Add "Supported Trainers" section listing SFT, DPO, GRPO, KTO, and GKD
- Replace single SFT example with hfoptions showing all 5 supported trainers
- Remove "under construction" warning as guide is now complete
- Follow same format as reducing_memory_usage.md for consistency
2025-11-02 15:33:57 -08:00
41c8ca1ad3 GRPO: ScaleRL -> Support casting LM Head to FP32 (#4303)
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-11-01 01:32:58 +00:00
5cefb39fe2 Fix GRPO with replay buffer by inserting images in the prompt (#4391)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-10-31 17:29:23 +01:00
50b96e25a8 Fix CI experimental tests TypeError for GRPOWithReplayBufferTrainer.update_with_replay_buffer (#4366)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-10-31 17:24:47 +01:00
3d718df9a9 Consolidate slow tests into main test files (#4408)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
2025-10-31 10:04:43 -06:00
77e4cd3420 Fix raising of deprecation warning for liger_loss (#4417) 2025-10-31 16:10:00 +01:00
6f8121e477 Replace duplicate test with model_id parametrized test (#4415) 2025-10-31 16:06:51 +01:00
414cb7dd6d Add license to test file and disable docstyle in GOLD script (#4412) 2025-10-31 08:46:28 -06:00
ad9d9c927b Remove liger loss in favor of liger kernel (#4364)
Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>
Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
2025-10-31 14:17:05 +01:00
095544e7a3 Fix GKD Liger memory spike (#4140)
Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>
2025-10-31 12:11:38 +01:00
06c059bab8 Add PAPOTrainer for preference-based optimization (#4334)
Co-authored-by: solarwindrider <1149902816@qq.com>
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-10-30 21:01:02 -06:00
f6834206a8 Update SFT QLoRA notebook with **14B** model on free Colab (#4336)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-10-30 20:28:01 -06:00
0aef77b4a5 Migrate experimental trl feature docs (#4411) 2025-10-30 18:53:49 -06:00
519cdf36eb Fix paper link for "Towards Efficient and Exact Optimization of Language Model Alignment" (#4409) 2025-10-30 16:25:37 -06:00
b3bf53f957 Hotfix CI with dev dependencies: xfail test_prepare_inputs_for_generation (#4372)
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-10-30 22:16:45 +01:00
c26b375ca3 [GOLD] Update code example for GOLD Trainer (#4406) 2025-10-30 13:33:34 -06:00
a8f70b02e1 [tests] Update rope_scaling configuration for tiny qwen-vl models (#4405)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-10-30 13:33:17 -06:00
1c2322eb7d Hotfix CI for Python 3.9 by setting test as xfail until transformers release (#4388) 2025-10-30 16:24:10 +01:00
242de1ee1e Fix typo in GOLD docs (#4394) 2025-10-30 15:15:36 +01:00
caaf656271 [GOLD] Set teacher tokenizer name if using ULD loss (#4389) 2025-10-30 13:46:22 +01:00
9925469170 Support chat_template_kwargs (#4350)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-10-29 22:09:47 +00:00
4e9ab9fa6e 👑 [experimental] GOLD Trainer (#4349)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-10-29 11:20:15 -06:00
b82a8f401e 🔥 docs: Add RapidFire AI integration guide (#4340)
Co-authored-by: kamran bigdely <kamranbigdely@gmail.com>
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-10-29 11:16:22 -06:00
29fb69f033 Align make test_experimental with make test (#4371) 2025-10-29 17:45:06 +01:00
ac6cea80a3 Fix add_generation_prompt arg for paged transformers in GRPO and RLOO trainers (#4370)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-10-29 17:25:16 +01:00
1e39eb6c5a Add support for Trackio completions logging in GRPOTrainer (#4359)
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-10-29 09:59:49 -06:00
97830a3cc2 Replace deprecated list with tuple indexing in PPOTrainer (#4356) 2025-10-29 11:13:54 +01:00
d2754185db Remove ignored max_length parameter from PRMTrainer data collator (#4355) 2025-10-29 11:13:33 +01:00
61bf96cd22 Move tests of BCO trainer args to tests/experimental (#4354)
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-10-29 11:13:04 +01:00
b8f23ef3bd Replace deprecated AutoModelForVision2Seq with AutoModelForImageTextToText (#4353) 2025-10-29 08:00:50 +01:00
f8073cba7d Implement CI test workflow for experimental module (#4330)
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-10-29 07:53:13 +01:00
55854c8db5 Move tests of experimental GRPO with replay buffer to tests/experimental (#4329)
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-10-29 07:42:46 +01:00
4352074093 Use explicit tiny-Qwen2_5_VL model_id parameter in CI tests (#4325)
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-10-29 07:41:43 +01:00
928f589746 Fix: add_generation_prompt=True for conversational only (#4362) 2025-10-28 18:12:24 -06:00
b0889d2188 Add add_generation_prompt to processor_kwargs in GRPO and RLOO trainer (#4361) 2025-10-28 18:00:27 -06:00
a9d33d052b fix CI issue for vlm_gemma_3n model (#4278)
Signed-off-by: Liu, Kaixuan <kaixuan.liu@intel.com>
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-10-28 15:13:59 -06:00
34fdb6154b Fixed links inside Tips in docs (#4360)
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-10-28 13:51:22 -06:00
a23e91c868 Add missing license in tests/experimental/__init__.py 2025-10-28 19:48:42 +00:00
5e691d1bf8 Fix GRPO and RLOO trainers for continuous batching (#4348) 2025-10-27 22:01:31 +01:00
fa644b1bdf [vllm] update comment about communication group host ip (#4337) 2025-10-27 14:01:34 +01:00
fda88c642e Added custom prepare_model_for_kbit_training to save VRAM (#4335)
Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>
2025-10-27 10:29:24 +01:00
2a138c7363 Update Reducing Memory Consumption guide with more details (#4332)
Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>
2025-10-27 10:26:09 +01:00
05a1feb050 🗞️ Update "What's New" (#4338) 2025-10-24 11:48:30 -07:00
d8543c02b0 Add OpenEnv blog to landing (#4333) 2025-10-24 11:27:25 +02:00
23c0062449 Hotfix: Fall back to config.text_config._name_or_path if missing config._name_or_path (#4324) 2025-10-24 09:48:37 +02:00
47b1aa7757 Move BCO tests to tests/experimental (#4326) 2025-10-23 12:04:46 +02:00
a4872d97a8 Update OpenEnv docs (#4328) 2025-10-23 11:42:13 +02:00
3f66564804 Highlight OpenEnv in landing docs (#4327) 2025-10-23 10:45:23 +02:00
9b80e336b3 Update documentation openenv 2025-10-23 07:45:54 +00:00
2819a8f812 🕹️ Add rollout function for OpenEnv integration (#4310)
Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-10-23 00:36:35 -07:00
e1c87e3589 Fix attn_implementation name in OnlineDPO for transformers v5 (#4322) 2025-10-22 18:21:44 +02:00
7c547a37b0 Add notebooks to Examples docs and restructure (#4317)
Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>
2025-10-22 09:16:25 +02:00
bfd6f49105 Replace unittest skipTest from transformers with pytest.skip (#4297) 2025-10-22 08:43:31 +02:00
712f6a9c43 💤 Switch to sleep level=2 and split wake-ups in GRPO and RLOO trainers (#4296) 2025-10-21 21:04:13 -07:00
1382e564b5 🧺 [5/N] Refactor _generate in GRPO/RLOO: Insert images in the prompt (#4155) 2025-10-21 15:41:42 -07:00
cb9bc2acce 🚚 Move BCO to trl.experimental (#4312) 2025-10-21 12:51:48 -07:00
475c732526 Update notebooks README with latest additions (#4316) 2025-10-21 17:17:07 +02:00
0dc4d53736 Remove parameterized as test extra dependency (#4315) 2025-10-21 15:54:59 +02:00
e2ab435487 [Activation-checkpointing] add tensor dedup and param offloading (#4247)
Co-authored-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
2025-10-21 12:34:18 +02:00
46a53cd03b Filter expected setup_chat_format deprecation warning in CI (#4306) 2025-10-21 10:23:00 +02:00
61050401ca Silence TRL experimental warnings in CI (#4307) 2025-10-21 10:22:42 +02:00
5eae44a97c ⚰️ Remove deprecated (#4301) 2025-10-20 13:27:21 -06:00
28bba8c6b1 Added SFT LoRA notebook (#4244)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
2025-10-20 11:24:54 +02:00
2f1802bc6e Fix missing CI slow tests: ImportError: vLLM is not installed (#4304) 2025-10-20 08:03:48 +02:00
e0eec055b4 🧺 [4/N] Refactor _generate in GRPO/RLOO: Move forward_kwargs outside generation method (#4154)
Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Co-authored-by: YonatanGideoni <yonatan.gideoni@gmail.com>
Co-authored-by: burtenshaw <ben.burtenshaw@gmail.com>
Co-authored-by: sergiopaniego <sergiopaniegoblanco@gmail.com>
Co-authored-by: lewtun <lewis.c.tunstall@gmail.com>
Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>
2025-10-17 15:36:13 -06:00
f4c554da22 Update links to docs in README to latest packaged version (#4084) 2025-10-17 08:06:40 -06:00
a932e2796d ⬆️ Bump dev version (#4293) 2025-10-15 18:11:52 -06:00
04fd1203af Release: v0.24 (#4292) 2025-10-15 18:10:10 -06:00
19d2f97932 Deprecate BestOfNSampler (#4291)
Co-authored-by: behroozazarkhalili <ermiaazarkhalili>
Co-authored-by: Behrooz Azarkhalili <80390531+behroozazarkhalili@users.noreply.github.com>
2025-10-15 18:06:34 -06:00
31caf64778 Remove unused commands directory (#4258)
Co-authored-by: behroozazarkhalili <ermiaazarkhalili>
2025-10-15 18:01:50 -06:00
8e2d5516ca Add accuracy reward (#4270)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-10-15 18:01:07 -06:00
94aac4a101 Remove how_to_train.md: outdated training FAQ (#4267)
Co-authored-by: behroozazarkhalili <ermiaazarkhalili>
2025-10-15 23:49:04 +00:00
26b7c2507e Add support for token_type_ids in DPOTrainer (#4285)
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-10-15 17:33:35 -06:00
aa25c2697c Remove using_llama_models.md: outdated Llama2-specific documentation (#4268)
Co-authored-by: behroozazarkhalili <ermiaazarkhalili>
2025-10-15 14:13:27 -07:00
93c7d88563 Remove logging.md: trainer-specific metrics documentation (#4269)
Co-authored-by: behroozazarkhalili <ermiaazarkhalili>
2025-10-15 14:12:32 -07:00
c7c041ecc8 Fix CI slow tests: ImportError: vLLM is not installed (#4287) 2025-10-15 18:15:36 +02:00
ef40c047aa Replace unittest skipTest with pytest.skip (#4263) 2025-10-15 18:15:28 +02:00
7e0adbc552 Fix CI dev test TypeError: unexpected keyword argument 'load_in_4bit' (#4262) 2025-10-15 18:14:49 +02:00
773afd9314 💰 RichProgressCallback enhancement (#4245) 2025-10-15 09:39:17 -06:00
966b397201 Fix CI slow test OSError: You are trying to access a gated repo (#4283) 2025-10-15 16:11:11 +02:00
927cf6ba46 Fix docstrings with Sphinx 'deprecated' directive (#4279) 2025-10-15 10:39:12 +02:00
56cb6ccf76 Fix typo in Colab link (#4276) 2025-10-14 18:51:17 +02:00
49c8f14b06 Add Qwen3-VL notebooks (SFT, GRPO) (#4275)
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2025-10-14 18:45:01 +02:00
cefbacb30e Fix style with make precommit (#4265) 2025-10-14 12:13:15 +02:00
fae245a062 Use FutureWarning instead of DeprecationWarning (#4266) 2025-10-14 12:12:03 +02:00
2aa9506c69 Fix docstring interlinks (#4221) 2025-10-13 13:40:24 +02:00
d6eeb290d9 Raise deprecation warning for Python 3.9 (#4226) 2025-10-13 11:06:09 +02:00
1684ef279a Fix Python version check for skipping tests on Python 3.13.8 (#4246) 2025-10-10 17:41:24 +02:00
aab21eb5e7 Include chat_template_kwargs in apply_chat_template (#4233)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-10-10 10:39:29 -05:00
b997a31981 [Online-DPO] fix the completion_len == max_new_tokens crash (#4193)
Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-10-10 17:21:01 +02:00
86d1963cc1 Fix CI slow test AttributeError: 'TestSFTTrainerSlow' object has no attribute 'addCleanup' (#4255) 2025-10-10 17:19:53 +02:00
039d526d24 Deprecate unused dataset_formatting module (#4242)
Co-authored-by: behroozazarkhalili <ermiaazarkhalili>
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-10-10 10:16:18 -05:00
bcd059a384 Remove obsolete research_projects directory (#4243)
Co-authored-by: behroozazarkhalili <ermiaazarkhalili>
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-10-10 10:15:47 -05:00
0e57b4a9df 🧺 [3/N] Refactor _generate in GRPO/RLOO: Rely on generator for prompt truncation (#4153) 2025-10-10 10:02:11 -05:00
98488e0946 Fix CI slow test ValueError: Unknown loss type: dapo (#4254) 2025-10-10 16:37:02 +02:00
f45e86571b Fix CI ImportError for 'require_torch_gpu_if_bnb_not_multi_backend_enabled' (#4253) 2025-10-10 16:13:22 +02:00
f5827928a0 Install peft from main for CI tests with dev dependencies (#4250) 2025-10-10 16:12:15 +02:00
f853e091ea Fix CI CUDA out of memory errors by improving GPU memory management (#4238) 2025-10-10 09:49:45 +02:00
803ec0d856 Fix CI slow test ValueError: Backward pass should have cleared tracker of all tensors (#4236)
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>
2025-10-10 09:28:34 +02:00
7a0a615d50 Warnings pointing to RFC (#4224) 2025-10-09 17:05:36 -06:00
c38cb69ec7 🧘 Enhance markdown style (#4235) 2025-10-09 13:49:44 -05:00
68ef15c686 Remove unused log_example_reports.py script (#4241)
Co-authored-by: behroozazarkhalili <ermiaazarkhalili>
2025-10-09 09:18:48 -07:00
3dd7fc2850 Fix CI IndentationError for Python 3.13.8 (#4240) 2025-10-09 15:46:41 +02:00
51ced65153 Replace setup with pyproject in CI tests paths (#4230) 2025-10-09 09:35:08 +02:00
4bb883a6e6 Update CI Docker image to pytorch/pytorch:2.8.0 (#4232) 2025-10-09 08:09:15 +02:00
f7846321e7 Remove unused Path import in __init__.py (#4227) 2025-10-08 21:30:54 +02:00
a944890ff1 Fix callable annotations (#4216) 2025-10-08 21:21:21 +02:00
521db3520a Fix CI unittest asserts (#4234) 2025-10-08 21:18:41 +02:00
e2c97a805a Exclude vllm dependencies from dev extra (#4229) 2025-10-08 18:14:23 +02:00
d1d0407d3c 🏷️ Account for token_type_ids in DataCollatorForVisionLanguageModeling (#4190) 2025-10-08 09:34:48 -06:00
824ff8c73e Add Efficient Online Training with GRPO and vLLM in TRL to community tutorials (#4219) 2025-10-08 12:59:04 +02:00
f15399d3d3 Fix entropy and accuracy calculation for prompt_tuning techniques. (#4196) 2025-10-08 09:42:19 +01:00
cc578b6b14 🧺 [2/N] Refactor _generate in GRPO/RLOO: Use prompt_ids from generation (#4152) 2025-10-07 12:11:34 -06:00
30cf68a97b 🎨 Support mixing image+text and text-only examples (#4203)
Co-authored-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
2025-10-07 10:21:10 -06:00
452284b8dc Add trainers taxonomy to docs (#4195) 2025-10-07 16:06:30 +02:00
6be53e19bc [DOCS] fix prose in lora guide (#4217) 2025-10-07 10:40:37 +02:00
3080fc1bd7 Fix LoRA params in Python in LoRA without regret (#4215) 2025-10-07 09:56:04 +02:00
5d870955f8 Fix prompt-completion labeling with add_generation_prompt and warning (#4201)
Co-authored-by: behroozazarkhalili <ermiaazarkhalili>
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-10-06 18:35:50 -06:00
8265800abf Fix trl-internal-testing/tiny-DbrxForCausalLM (#4213) 2025-10-06 15:11:16 -06:00
65eb45c32b Apply style and revert change in sft_video_llm example (#4214) 2025-10-06 13:07:18 -06:00
ae6837f8d4 Removed tokenizer/processor creation from example scripts (#4211) 2025-10-06 18:40:18 +02:00
56a8f1128b Replace setup with pyproject and fix packaging unintended modules (#4194) 2025-10-06 17:45:44 +02:00
529101537f Remove Optional from processing_class in PPOTrainer (#4212) 2025-10-06 16:04:06 +02:00
0588b1f01d Updated vLLM integration guide (#4162)
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-10-06 15:57:17 +02:00
45ee98b05e Replace unittest with pytest (#4188) 2025-10-06 11:14:54 +02:00
3800a6ecc7 Hotfix: Exclude transformers 4.57.0 for Python 3.9 (#4209)
Co-authored-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
2025-10-06 11:13:21 +02:00
7ad9ce8acc Remove tokenizer creation from sft example script (#4197) 2025-10-06 11:04:20 +02:00
0c2dc14014 Remove custom_container for building the docs (#4198) 2025-10-06 08:31:58 +02:00
ced8b337ba [DOCS/FIX] lora without regrets - fix lr (#4207) 2025-10-06 08:23:11 +02:00
1eff7da9e0 [DOCS] Lora without regret (#4181)
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Co-authored-by: sergiopaniego <sergiopaniegoblanco@gmail.com>
Co-authored-by: lewtun <lewis.c.tunstall@gmail.com>
Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>
2025-10-03 20:40:37 +02:00
1cbfb00b6a Replace remaining trainer.tokenizer with trainer.processing_class in GRPO test (#4192) 2025-10-03 09:08:53 +02:00
e086f073cf 🌡️ Have vLLM return processed (temperature scaled) log probs (#4163)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-10-01 11:58:13 -06:00
e5d437ed76 Hotfix wrong formatting of docstrings with blockquote tips (#4187) 2025-10-01 19:42:36 +02:00
d1b4691900 Fix CI ImportError: FlashAttention2 and decorator order for all parameterized tests (#4176) 2025-10-01 18:01:56 +02:00
39c603872f 🔣 Fix test: replace trainer.tokenizer by trainer.processing_class (#4185) 2025-10-01 09:16:42 -06:00
5a4021f23e Fix handling of f_divergence_type in DPO (#4171)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-10-01 09:44:14 +02:00
ea66a9e650 🧺 [1/N] Refactor _generate in GRPO/RLOO: list of ints instead of tensors (#4146)
Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
2025-09-30 16:22:30 -06:00
da209f89fc 🎁 RewardTrainer refactor (#4093)
Co-authored-by: juejuezi <juejuezi.git@foxmail.com>
Co-authored-by: Yi Shi <96773624+singing-cat@users.noreply.github.com>
Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>
2025-09-30 15:13:45 -06:00
ebb8899f5d Fix Flash Attention x Padding-Free loss (#4170) 2025-09-30 12:01:29 -06:00
70e2017dbc 🎞️ Support sequence classification models in clone_chat_template (#4097) 2025-09-30 11:42:56 -06:00
4368f54c97 👾 Use our own require_bitsandbytes (#4137) 2025-09-30 11:11:29 -06:00
22720d176b Add logging for training completion and model saving in training scripts (#4048) 2025-09-30 10:57:33 -06:00
c8a5add88a Fix PEFT interlinks in docstrings (#4178) 2025-09-30 18:32:23 +02:00
a7b54f988b Fix CI ValueError: Unknown loss type: dapo (#4173) 2025-09-30 18:27:21 +02:00
78bf77abbd 🅰️ Remove apex (#4139) 2025-09-30 09:52:52 -06:00
3b9ac65a05 🖨️ Print rich table for messages (#4160) 2025-09-30 09:07:57 -06:00
7a78320f58 Fix link in docstring of RLOOTrainer (#4180) 2025-09-30 16:54:55 +02:00
67e83aee90 Fix docstring interlink to parent class for NashMDTrainer and XPOTrainer (#4179) 2025-09-30 15:43:37 +02:00
a0df357591 Fix docstrings with 'deprecated' Sphinx directive (#4174) 2025-09-30 10:13:35 +02:00
864e593e9f Add missing FDivergenceType docstring (#4165) 2025-09-29 20:03:33 +02:00
6428647063 Remove unnecessary list comprehensions (#4164) 2025-09-29 20:02:46 +02:00
8a5bfecc3a 💡 Replace <Tip> with new markdown syntax (#4161)
Co-authored-by: sergiopaniego <sergiopaniegoblanco@gmail.com>
2025-09-29 10:48:00 -06:00
910aeebe06 Pass required token_type_ids (#4148) 2025-09-29 17:40:11 +02:00
e208823b3e Add docstring for OnlineTrainerState (#4166) 2025-09-29 17:26:14 +02:00
f397a61e82 😷 Refactor GRPO/RLOO to isolate _generate for GRPO with replay buffer (#4158) 2025-09-26 19:31:06 -06:00
7fe9dd42ac 📽 Multi image support for GRPO replay buffer (#4157) 2025-09-26 19:11:53 -06:00
79c774af54 🟩 Drop image_split_sizes in favour of image_grid_thw (#4156) 2025-09-26 18:50:27 -06:00
9603b41d7e 😷 Refactor GRPO/RLOO to isolate _generate (#4114) 2025-09-25 20:48:52 -06:00
5ee56ed04f Fixed some <Tip> rendering issues (#4143) 2025-09-25 14:47:46 -06:00
e85e634bff Refactor trainers classes to use BaseTrainer with shared functionality (#4128) 2025-09-25 18:32:57 +02:00
d633c4337f Fix import statement and GRPO test case (#4141) 2025-09-24 16:23:32 -06:00
d1e24df031 [GRPO]: Sample from a Replay Buffer To Substitute Groups with 0 std. (#4060)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-09-24 21:12:16 +01:00
094e0760d4 🌵 Mark GKD trainer test as expected failure due to OOM issue (#4126) 2025-09-24 12:26:44 -06:00
01c9b4c414 🤸‍♀️ Fix DFT test (#4135) 2025-09-24 12:25:56 -06:00
18faf03c4e Fix CI: torch.AcceleratorError: CUDA error: device-side assert triggered (#4138) 2025-09-24 20:12:17 +02:00
d144e73e78 🪙 [Experimental] Support GSPO-token (#3820)
Co-authored-by: LeonEricsson <70749762+LeonEricsson@users.noreply.github.com>
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-09-24 09:57:18 -06:00
be1ffe59d2 🌺 Fix GPT-OSS test (#4134) 2025-09-24 09:07:48 -06:00
fb6bdab33b Improve typing of SFT trainer (#4007) 2025-09-24 07:45:03 -06:00
526303edbd [SFTrainer]: Fix DFT Loss (#4112)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-09-24 11:46:12 +01:00
9e5e60c933 👩‍🦯 Fix usage of VLM using text only (#4080)
Co-authored-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-09-23 12:07:25 -06:00
5c52f46f9a Remove Python version < 3.13 constraint from vllm extra dependencies (#4125) 2025-09-23 17:04:32 +02:00
deac14a39f 🧹 Remove max_batch_tokens, num_blocks and block_size from generation kwargs (#4065) 2025-09-23 08:50:52 -06:00
3d5a30bb77 👋 Remove backend parameter from GuidedDecodingParams (#4123) 2025-09-23 08:12:13 -06:00
251fdb228a 📌 Pin vLLM version (#4122) 2025-09-23 08:02:30 -06:00
37806e618b 📤 Fix a dataset loading bug in scripts 2025-09-23 05:21:40 +00:00
008c7ad9aa [vllm] ensure MASTER_ADDR/MASTER_PORT are set safely (#4057)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-09-22 23:19:12 -06:00
e8ba9eaf27 📤 Fix a dataset loading bug in scripts (#4124)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-09-22 22:58:40 -06:00
abe07c9e32 🐯 fix: use_liger_kernel with IterableDataset (#4087)
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-09-22 20:23:58 -06:00
fe02ea2b52 😴 Add vllm_enable_sleep_mode to RLOO Trainer (#4107)
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-09-22 19:41:29 -06:00
68408d7219 📽 Multi image support for GRPO/RLOO (#4113) 2025-09-22 18:17:42 -06:00
94f8d00a62 🔭 Align param passing to VLM configs in generate_tiny_models (#4118)
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-09-22 17:14:10 -06:00
b5ca3799ad 🟩 Drop image_split_sizes in favour of image_grid_thw (#4111) 2025-09-22 16:38:39 -06:00
a68b4af50f Fix code style with make precommit (#4119) 2025-09-22 13:19:54 -06:00
9f0ed8b130 CI hotfix: xfail test_training_with_transformers_paged for transformers<4.57.0 (#4120) 2025-09-22 13:19:30 -06:00
27f22ba5a1 docs: correct option name to enable vllm sleep mode (#4102)
Co-authored-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
2025-09-22 13:04:00 +02:00
86f74b486f Fix VLM configs in generate_tiny_models (#4101) 2025-09-20 09:49:16 +02:00
26b497ea63 Fix typos (#4109) 2025-09-19 09:44:07 -06:00
d22bdb8031 Fix typos (#4106)
Signed-off-by: Yuanyuan Chen <cyyever@outlook.com>
Co-authored-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
2025-09-19 16:58:43 +02:00
0e204482e6 Some nits GRPO and RLOO trainer docs (#4108)
Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>
2025-09-19 16:37:25 +02:00
3c8d7209f1 👁️ Add VLM support to RLOO trainer (#4067)
Co-authored-by: behroozazarkhalili <ermiaazarkhalili>
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-09-18 21:54:06 -06:00
0450f05ad9 [GKD] Fix batchmean reduce op in GKDTrainer's loss (#4105) 2025-09-18 19:44:04 +02:00
7e2075347e Fix get_peft_model() so that prepare_model_for_kbit_training does not reapply to an instance of PeftModel, thus freezing all the layers (#4081)
Co-authored-by: Hoesu <hoesu.chung@qraftec.com>
Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>
2025-09-18 10:31:03 +02:00
20cc58d777 ℹ️ Enable XPU for vLLM client (#4031) 2025-09-17 22:06:25 -06:00
a6c0c57f6b ℹ️ feat: Add NPU and XPU support for activation offloading (#4056) 2025-09-17 22:03:56 -06:00
10dc36d610 🌪️ [GFPO]: implement GFPO in GRPOTrainer (#3989) 2025-09-17 19:14:40 -06:00
d2d1912d96 ⚖️ Align SFT and DPO for model creation and deprecate DPOConfig.padding_value in favour of pad_token_id (#4006)
Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
2025-09-17 18:39:26 -06:00
08ea00289a 🧶 feat: Add WeaveCallback for W&B Weave integration (#4089)
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
2025-09-17 18:10:45 -06:00
4ff8b4e007 📜 Convert set to list of tags (#4092) 2025-09-17 14:05:41 -06:00
6356343fd2 Add deprecation warnings to docstrings (#4083)
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-09-17 09:30:43 +02:00
45e59f77ea ⌨️ Pin num2words (#4094)
Co-authored-by: sergiopaniego <sergiopaniegoblanco@gmail.com>
2025-09-16 08:48:09 -06:00
4bd4acf172 🏞️ Context Parallelism benchmark guide (#4075)
Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-09-16 08:46:12 -06:00
8380869d33 Community Tutorials design adaptation for videos (#4095) 2025-09-16 16:28:22 +02:00
5139af3712 Add support for testing experimental features (#4082) 2025-09-16 07:46:48 +02:00
2f46c18a66 Align slow tests with regular tests (#4085) 2025-09-16 07:22:30 +02:00
e2b18ec4e7 ▶️ Add video to community tutorials (#4090)
Co-authored-by: Sergio Paniego Blanco <sergiopaniegoblanco@gmail.com>
2025-09-15 10:51:23 -06:00
78f1a928ce 🗑️ Remove deprecated AlignPropTrainer, DDPOTrainer and IterativeSFTTrainer (#4068) 2025-09-15 09:56:41 -06:00
1d0b196f6b Reviewed HF jobs updated docs (#4088) 2025-09-15 08:41:08 -06:00
5a1c2f9b3b Aux loss is already included in the loss returned by Transformers (#4078) 2025-09-14 16:56:58 +01:00
9955ee7eaa 🐳 Docker update + Simplify Jobs doc (#3931)
Co-authored-by: sergiopaniego <sergiopaniegoblanco@gmail.com>
Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>
2025-09-13 18:35:55 -06:00
304eaf8053 🛠️ Fix CI (#4076) 2025-09-13 12:38:48 -06:00
69e288ebad ✂️ [GRPO VLM] Update split sizes to generalize (#4032)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-09-12 19:11:32 -06:00
d655ce48f8 🌾 [Experimental] BEMA for ref model (#3898)
Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>
2025-09-12 11:47:44 -06:00
91c4bba922 🧪 Add trl.experimental Submodule (#4073)
Co-authored-by: Quentin Gallouédec <gallouedec.quentin@gmail.com>
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-09-12 11:02:23 -06:00
2845d024a4 Set Ruff src for first-party imports (#4074) 2025-09-12 15:43:04 +02:00
f4ff248407 ♨️ [GRPO] Fix potential hang in get_high_entropy_mask (#4041) 2025-09-11 19:33:39 -06:00
b8eb5c5d2d Improve docstring of AlignPropTrainer (#4059)
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-09-11 11:42:31 -06:00
07f9ad982d 💡 Fix type hint to make_parser function in multiple scripts (#4050) 2025-09-11 11:36:05 -06:00
417915a3e4 Fix CI failure in slow GRPO test due to missing pillow dependency (#4064) 2025-09-11 17:35:57 +02:00
44ddc28bcd Hotfix: Add ParallelismConfig fallback for transformers with old accelerate (#4063) 2025-09-11 15:11:41 +02:00
e8b8499f1f Remove redundant 'None' from docstrings (#4058) 2025-09-11 08:16:34 +02:00
7eb7f42372 ⬆️ Bump dev version (#4054) 2025-09-09 22:17:35 -06:00
288 changed files with 24183 additions and 18543 deletions

View File

@@ -14,6 +14,5 @@ jobs:
commit_sha: ${{ github.sha }}
package: trl
version_tag_suffix: ""
custom_container: huggingface/transformers-doc-builder
secrets:
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}

View File

@@ -16,4 +16,3 @@ jobs:
pr_number: ${{ github.event.number }}
package: trl
version_tag_suffix: ""
custom_container: huggingface/transformers-doc-builder

View File

@@ -1,95 +1,84 @@
name: Build Docker images (scheduled)
name: Build TRL Docker image
on:
push:
branches:
- main
workflow_dispatch:
workflow_call:
schedule:
- cron: "0 1 * * *"
concurrency:
group: docker-image-builds
cancel-in-progress: false
env:
CI_SLACK_CHANNEL: ${{ secrets.CI_DOCKER_CHANNEL }}
jobs:
trl-latest:
name: "Latest TRL GPU"
trl:
name: "Build and push TRL Docker image"
runs-on: ubuntu-latest
steps:
- name: Cleanup disk
run: |
sudo ls -l /usr/local/lib/
sudo ls -l /usr/share/
sudo du -sh /usr/local/lib/
sudo du -sh /usr/share/
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/share/dotnet
sudo du -sh /usr/local/lib/
sudo du -sh /usr/share/
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
- name: Check out code
- name: Checkout code
uses: actions/checkout@v4
- name: Get TRL version from PyPI
run: |
VERSION=$(curl -s https://pypi.org/pypi/trl/json | jq -r .info.version)
echo "VERSION=$VERSION" >> $GITHUB_ENV
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to DockerHub
uses: docker/login-action@v1
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
- name: Build and Push GPU
- name: Build and Push
uses: docker/build-push-action@v4
with:
context: ./docker/trl-latest-gpu
context: docker/trl
push: true
tags: huggingface/trl-latest-gpu
tags: |
huggingface/trl:${{ env.VERSION }}
huggingface/trl
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ env.CI_SLACK_CHANNEL }}
title: 🤗 Results of the trl-latest-gpu Docker Image build
slack_channel: ${{ secrets.CI_DOCKER_CHANNEL }}
title: 🤗 Results of the TRL Dev Docker Image build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
trl-source:
name: "Latest TRL + HF ecosystem from source"
trl-dev:
name: "Build and push TRL Dev Docker image"
runs-on: ubuntu-latest
steps:
- name: Cleanup disk
run: |
sudo ls -l /usr/local/lib/
sudo ls -l /usr/share/
sudo du -sh /usr/local/lib/
sudo du -sh /usr/share/
sudo rm -rf /usr/local/lib/android
sudo rm -rf /usr/share/dotnet
sudo du -sh /usr/local/lib/
sudo du -sh /usr/share/
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
- name: Check out code
- name: Checkout code
uses: actions/checkout@v4
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v3
- name: Login to DockerHub
uses: docker/login-action@v1
uses: docker/login-action@v3
with:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_PASSWORD }}
- name: Build and Push GPU
- name: Build and Push
uses: docker/build-push-action@v4
with:
context: ./docker/trl-source-gpu
context: docker/trl-dev
push: true
tags: huggingface/trl-source-gpu
tags: |
huggingface/trl:dev
- name: Post to Slack
if: always()
uses: huggingface/hf-workflows/.github/actions/post-slack@main
with:
slack_channel: ${{ env.CI_SLACK_CHANNEL }}
title: 🤗 Results of the trl-source-gpu Docker Image build
slack_channel: ${{ secrets.CI_DOCKER_CHANNEL }}
title: 🤗 Results of the TRL Dev Docker Image build
status: ${{ job.status }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

View File

@@ -11,94 +11,102 @@ env:
RUN_SLOW: "yes"
IS_GITHUB_CI: "1"
SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
TRL_EXPERIMENTAL_SILENCE: 1
jobs:
run_all_tests_single_gpu:
strategy:
fail-fast: false
matrix:
docker-image-name:
[
"huggingface/trl-latest-gpu:latest",
"huggingface/trl-source-gpu:latest",
]
runs-on:
group: aws-g4dn-2xlarge
env:
CUDA_VISIBLE_DEVICES: "0"
TEST_TYPE: "single_gpu_${{ matrix.docker-image-name }}"
TEST_TYPE: "single_gpu"
container:
image: ${{ matrix.docker-image-name }}
options: --gpus all --shm-size "16gb" -e NVIDIA_DISABLE_REQUIRE=true
image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
options: --gpus all --shm-size "16gb"
defaults:
run:
shell: bash
steps:
- uses: actions/checkout@v4
- name: Pip install
- name: Git checkout
uses: actions/checkout@v4
- name: Install system dependencies
run: |
source activate trl
pip install -e ".[test,vlm]" --no-deps
pip install pytest-reportlog parameterized
apt-get update && apt-get install -y make git curl
- name: Install uv
run: |
curl -LsSf https://astral.sh/uv/install.sh | sh
- name: Create Python virtual environment
run: |
uv venv
uv pip install --upgrade setuptools wheel
- name: Install dependencies
run: |
source .venv/bin/activate
uv pip install ".[dev]"
uv pip install pytest-reportlog
- name: Run slow SFT tests on single GPU
if: always()
run: |
source activate trl
source .venv/bin/activate
make slow_tests
- name: Generate Report
if: always()
run: |
pip install slack_sdk tabulate
source .venv/bin/activate
uv pip install slack_sdk tabulate
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
run_all_tests_multi_gpu:
strategy:
fail-fast: false
matrix:
docker-image-name:
[
"huggingface/trl-latest-gpu:latest",
"huggingface/trl-source-gpu:latest",
]
runs-on:
group: aws-g4dn-2xlarge
env:
CUDA_VISIBLE_DEVICES: "0,1"
TEST_TYPE: "multi_gpu_${{ matrix.docker-image-name }}"
TEST_TYPE: "multi_gpu"
container:
image: ${{ matrix.docker-image-name }}
options: --gpus all --shm-size "16gb" -e NVIDIA_DISABLE_REQUIRE=true
image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
options: --gpus all --shm-size "16gb"
defaults:
run:
shell: bash
steps:
- uses: actions/checkout@v4
- name: Pip install
- name: Git checkout
uses: actions/checkout@v4
- name: Install system dependencies
run: |
source activate trl
pip install -e ".[test,vlm]" --no-deps
pip install pytest-reportlog parameterized
apt-get update && apt-get install -y make git curl
- name: Install uv
run: |
curl -LsSf https://astral.sh/uv/install.sh | sh
- name: Create Python virtual environment
run: |
uv venv
uv pip install --upgrade setuptools wheel
- name: Install dependencies
run: |
source .venv/bin/activate
uv pip install ".[dev]"
uv pip install pytest-reportlog
- name: Run slow SFT tests on Multi GPU
if: always()
run: |
source activate trl
source .venv/bin/activate
make slow_tests
- name: Run end-to-end examples tests on multi GPU
if: always()
run: |
source activate trl
pip install deepspeed
make test_examples
- name: Generate Reports
if: always()
run: |
pip install slack_sdk tabulate
source .venv/bin/activate
uv pip install slack_sdk tabulate
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
python scripts/log_example_reports.py --text_file_name temp_results_sft_tests.txt >> $GITHUB_STEP_SUMMARY
python scripts/log_example_reports.py --text_file_name temp_results_dpo_tests.txt >> $GITHUB_STEP_SUMMARY
rm *.txt
rm *.txt

View File

@@ -0,0 +1,70 @@
name: Tests (experimental)
on:
pull_request:
paths:
# Run only when relevant files are modified
- "trl/experimental/**"
- "tests/experimental/**"
env:
TQDM_DISABLE: 1
PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"
TRL_EXPERIMENTAL_SILENCE: 1
jobs:
check_code_quality:
name: Check code quality
runs-on: ubuntu-latest
if: github.event.pull_request.draft == false
steps:
- uses: actions/checkout@v4
- name: Set up Python 3.13
uses: actions/setup-python@v5
with:
python-version: 3.13
- uses: pre-commit/action@v3.0.1
with:
extra_args: --all-files
tests:
name: Tests (experimental)
runs-on:
group: aws-g4dn-2xlarge
container:
image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
options: --gpus all
defaults:
run:
shell: bash
steps:
- name: Git checkout
uses: actions/checkout@v4
- name: Set up Python 3.13
uses: actions/setup-python@v5
with:
python-version: 3.13
- name: Install Make and Git
run: |
apt-get update && apt-get install -y make git curl
- name: Install uv
run: |
curl -LsSf https://astral.sh/uv/install.sh | sh
- name: Create Python virtual environment
run: |
uv venv
uv pip install --upgrade setuptools wheel
- name: Install dependencies
run: |
source .venv/bin/activate
uv pip install ".[dev]"
- name: Test with pytest
run: |
source .venv/bin/activate
make test_experimental

View File

@@ -11,11 +11,15 @@ on:
- "scripts/**.py"
- "tests/**.py"
- "trl/**.py"
- "setup.py"
- "pyproject.toml"
# Exclude if only experimental code/tests
- "!trl/experimental/**"
- "!tests/experimental/**"
env:
TQDM_DISABLE: 1
CI_SLACK_CHANNEL: ${{ secrets.CI_PUSH_MAIN_CHANNEL }}
PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"
jobs:
check_code_quality:
@@ -41,7 +45,7 @@ jobs:
runs-on:
group: aws-g4dn-2xlarge
container:
image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
options: --gpus all
defaults:
run:
@@ -93,7 +97,7 @@ jobs:
runs-on:
group: aws-g4dn-2xlarge
container:
image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
options: --gpus all
defaults:
run:
@@ -128,7 +132,7 @@
uv pip install -U git+https://github.com/huggingface/accelerate.git
uv pip install -U git+https://github.com/huggingface/datasets.git
uv pip install -U git+https://github.com/huggingface/transformers.git
uv pip install -U git+https://github.com/huggingface/peft.git
- name: Test with pytest
run: |
@@ -149,7 +153,7 @@
runs-on:
group: aws-g4dn-2xlarge
container:
image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
options: --gpus all
defaults:
run:
@@ -201,7 +205,7 @@
runs-on:
group: aws-g4dn-2xlarge
container:
image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
options: --gpus all
defaults:
run:

View File

@@ -9,6 +9,7 @@ on:
env:
TQDM_DISABLE: 1
CI_SLACK_CHANNEL: ${{ secrets.CI_PUSH_MAIN_CHANNEL }}
TRL_EXPERIMENTAL_SILENCE: 1
jobs:
tests:
@@ -16,7 +17,7 @@ jobs:
runs-on:
group: aws-g4dn-2xlarge
container:
image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
options: --gpus all
defaults:
run:
@@ -24,7 +25,7 @@
steps:
- name: Git checkout
uses: actions/checkout@v4
with: { ref: v0.23-release }
with: { ref: v0.24-release }
- name: Set up Python 3.12
uses: actions/setup-python@v5

View File

@@ -31,4 +31,4 @@ keywords:
- pytorch
- transformers
license: Apache-2.0
version: "0.23"
version: "0.24"

View File

@@ -1,15 +1,10 @@
# How to contribute to TRL?
Everyone is welcome to contribute, and we value everybody's contribution. Code
contributions are not the only way to help the community. Answering questions, helping
others, and improving the documentation are also immensely valuable.
Everyone is welcome to contribute, and we value everybody's contribution. Code contributions are not the only way to help the community. Answering questions, helping others, and improving the documentation are also immensely valuable.
It also helps us if you spread the word! Reference the library in blog posts
about the awesome projects it made possible, shout out on Twitter every time it has
helped you, or simply ⭐️ the repository to say thank you.
It also helps us if you spread the word! Reference the library in blog posts about the awesome projects it made possible, shout out on Twitter every time it has helped you, or simply ⭐️ the repository to say thank you.
However you choose to contribute, please be mindful and respect our
[code of conduct](https://github.com/huggingface/trl/blob/main/CODE_OF_CONDUCT.md).
However you choose to contribute, please be mindful and respect our [code of conduct](https://github.com/huggingface/trl/blob/main/CODE_OF_CONDUCT.md).
**This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).**
@@ -22,9 +17,7 @@ There are several ways you can contribute to TRL:
* Implement trainers for new post-training algorithms.
* Contribute to the examples or the documentation.
If you don't know where to start, there is a special [Good First
Issue](https://github.com/huggingface/trl/labels/%F0%9F%91%B6%20good%20first%20issue) listing. It will give you a list of
open issues that are beginner-friendly and help you start contributing to open-source. The best way to do that is to open a Pull Request and link it to the issue that you'd like to work on. We try to give priority to opened PRs as we can easily track the progress of the fix, and if the contributor does not have time anymore, someone else can take the PR over.
If you don't know where to start, there is a special [Good First Issue](https://github.com/huggingface/trl/labels/%F0%9F%91%B6%20good%20first%20issue) listing. It will give you a list of open issues that are beginner-friendly and help you start contributing to open-source. The best way to do that is to open a Pull Request and link it to the issue that you'd like to work on. We try to give priority to opened PRs as we can easily track the progress of the fix, and if the contributor does not have time anymore, someone else can take the PR over.
For something slightly more challenging, you can also take a look at the [Good Second Issue](https://github.com/huggingface/trl/labels/Good%20Second%20Issue) list. In general though, if you feel like you know what you're doing, go for it and we'll help you get there! 🚀
@@ -48,14 +41,12 @@ Do your best to follow these guidelines when submitting a bug-related issue or a
The TRL library is robust and reliable thanks to users who report the problems they encounter.
Before you report an issue, we would really appreciate it if you could **make sure the bug was not
already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the library itself, and not your code.
Before you report an issue, we would really appreciate it if you could **make sure the bug was not already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the library itself, and not your code.
Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so we can quickly resolve it:
* Your **OS type and version**, **Python**, **PyTorch**, **TRL** and **Transformers** versions.
* A short, self-contained, code snippet that allows us to reproduce the bug in
less than 30s.
* A short, self-contained, code snippet that allows us to reproduce the bug in less than 30s.
* The *full* traceback if an exception is raised.
* Attach any other additional information, like screenshots, you think may help.
@@ -106,29 +97,20 @@ We're always looking for improvements to the documentation that make it more cle
## Submitting a pull request (PR)
Before writing code, we strongly advise you to search through the existing PRs or
issues to make sure that nobody is already working on the same thing. If you are
unsure, it is always a good idea to open an issue to get some feedback.
Before writing code, we strongly advise you to search through the existing PRs or issues to make sure that nobody is already working on the same thing. If you are unsure, it is always a good idea to open an issue to get some feedback.
You will need basic `git` proficiency to be able to contribute to
TRL. `git` is not the easiest tool to use but it has the greatest
manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
Git](https://git-scm.com/book/en/v2) is a very good reference.
You will need basic `git` proficiency to be able to contribute to TRL. `git` is not the easiest tool to use but it has the greatest manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro Git](https://git-scm.com/book/en/v2) is a very good reference.
Follow these steps to start contributing:
1. Fork the [repository](https://github.com/huggingface/trl) by
clicking on the 'Fork' button on the repository's page. This creates a copy of the code
under your GitHub user account.
1. Fork the [repository](https://github.com/huggingface/trl) by clicking on the 'Fork' button on the repository's page. This creates a copy of the code under your GitHub user account.
2. Clone your fork to your local disk, and add the base repository as a remote. The following command
assumes you have your public SSH key uploaded to GitHub. See the following guide for more
[information](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository).
2. Clone your fork to your local disk, and add the base repository as a remote. The following command assumes you have your public SSH key uploaded to GitHub. See the following guide for more [information](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository).
```bash
$ git clone git@github.com:<your Github handle>/trl.git
$ cd trl
$ git remote add upstream https://github.com/huggingface/trl.git
git clone git@github.com:<your Github handle>/trl.git
cd trl
git remote add upstream https://github.com/huggingface/trl.git
```
3. Create a new branch to hold your development changes, and do this for every new PR you work on.
@@ -136,15 +118,15 @@ Follow these steps to start contributing:
Start by synchronizing your `main` branch with the `upstream/main` branch (more details in the [GitHub Docs](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/syncing-a-fork)):
```bash
$ git checkout main
$ git fetch upstream
$ git merge upstream/main
git checkout main
git fetch upstream
git merge upstream/main
```
Once your `main` branch is synchronized, create a new branch from it:
```bash
$ git checkout -b a-descriptive-name-for-my-changes
git checkout -b a-descriptive-name-for-my-changes
```
**Do not** work on the `main` branch.
@@ -152,32 +134,28 @@ Follow these steps to start contributing:
4. Set up a development environment by running the following command in a conda or a virtual environment you've created for working on this library:
```bash
$ pip install -e .[dev]
pip install -e .[dev]
```
(If TRL was already installed in the virtual environment, remove
it with `pip uninstall trl` before reinstalling it.)
(If TRL was already installed in the virtual environment, remove it with `pip uninstall trl` before reinstalling it.)
Alternatively, if you are using [Visual Studio Code](https://code.visualstudio.com/Download), the fastest way to get set up is by using
the provided Dev Container. Documentation on how to get started with dev containers is available [here](https://code.visualstudio.com/docs/remote/containers).
Alternatively, if you are using [Visual Studio Code](https://code.visualstudio.com/Download), the fastest way to get set up is by using the provided Dev Container. Check [the documentation on how to get started with dev containers](https://code.visualstudio.com/docs/remote/containers).
5. Develop the features on your branch.
As you work on the features, you should make sure that the test suite
passes. You should run the tests impacted by your changes like this (see
below an explanation regarding the environment variable):
As you work on the features, you should make sure that the test suite passes. You should run the tests impacted by your changes like this (see below an explanation regarding the environment variable):
```bash
$ pytest tests/<TEST_TO_RUN>.py
```
> For the following commands leveraging the `make` utility.
```bash
pytest tests/<TEST_TO_RUN>.py
```
You can also run the full suite with the following command.
> For the following commands leveraging the `make` utility.
```bash
$ make test
```
You can also run the full suite with the following command.
```bash
make test
```
TRL relies on `ruff` for maintaining consistent code formatting across its source files. Before submitting any PR, you should apply automatic style corrections and run code verification checks.
@@ -186,59 +164,51 @@ Follow these steps to start contributing:
To apply these checks and corrections in one step, use:
```bash
$ make precommit
make precommit
```
This command runs the following:
- Executes `pre-commit` hooks to automatically fix style issues with `ruff` and other tools.
- Runs additional scripts such as adding copyright information.
* Executes `pre-commit` hooks to automatically fix style issues with `ruff` and other tools.
* Runs additional scripts such as adding copyright information.
If you prefer to apply the style corrections separately or review them individually, the `pre-commit` hook will handle the formatting for the files in question.
Once you're happy with your changes, add changed files using `git add` and
make a commit with `git commit` to record your changes locally:
Once you're happy with your changes, add changed files using `git add` and make a commit with `git commit` to record your changes locally:
```bash
$ git add modified_file.py
$ git commit
```
```bash
git add modified_file.py
git commit
```
Please write [good commit messages](https://chris.beams.io/posts/git-commit/).
Please write [good commit messages](https://chris.beams.io/posts/git-commit/).
It is a good idea to sync your copy of the code with the original
repository regularly. This way you can quickly account for changes:
It is a good idea to sync your copy of the code with the original
repository regularly. This way you can quickly account for changes:
```bash
$ git fetch upstream
$ git rebase upstream/main
```
```bash
git fetch upstream
git rebase upstream/main
```
Push the changes to your account using:
Push the changes to your account using:
```bash
$ git push -u origin a-descriptive-name-for-my-changes
```
```bash
git push -u origin a-descriptive-name-for-my-changes
```
6. Once you are satisfied (**and the checklist below is happy too**), go to the
webpage of your fork on GitHub. Click on 'Pull request' to send your changes
to the project maintainers for review.
6. Once you are satisfied (**and the checklist below is happy too**), go to the webpage of your fork on GitHub. Click on 'Pull request' to send your changes to the project maintainers for review.
7. It's ok if maintainers ask you for changes. It happens to core contributors too! To ensure everyone can review your changes in the pull request, work on your local branch and push the updates to your fork. They will automatically appear in the pull request.
### Checklist
1. The title of your pull request should be a summary of its contribution;
2. If your pull request addresses an issue, please mention the issue number in
the pull request description to make sure they are linked (and people
consulting the issue know you are working on it);
3. To indicate a work in progress please prefix the title with `[WIP]`, or mark
the PR as a draft PR. These are useful to avoid duplicated work, and to differentiate
it from PRs ready to be merged;
2. If your pull request addresses an issue, please mention the issue number in the pull request description to make sure they are linked (and people consulting the issue know you are working on it);
3. To indicate a work in progress please prefix the title with `[WIP]`, or mark the PR as a draft PR. These are useful to avoid duplicated work, and to differentiate it from PRs ready to be merged;
4. Make sure existing tests pass;
5. Add high-coverage tests. No quality testing = no merge.
### Tests
An extensive test suite is included to test the library behavior and several examples. Library tests can be found in
@@ -248,7 +218,7 @@ We use `pytest` to run the tests. From the root of the
repository here's how to run tests with `pytest` for the library:
```bash
$ python -m pytest -sv ./tests
python -m pytest -sv ./tests
```
That's how `make test` is implemented (without the `pip install` line)!
@@ -260,23 +230,23 @@
1. **Use defaults when appropriate**:
Provide default values unless the parameter's value varies significantly by use case (see the sketch after this list). For example, datasets or models should not have defaults, but parameters like `learning_rate` should.
2. **Prioritize proven defaults**:
Default values should align with those recommended in the original paper or method. Alternatives require strong evidence of superior performance in most cases.
3. **Ensure safety and predictability**:
Defaults must be safe, expected and reliable. Avoid settings that could lead to surprising outcomes, such as excessive memory usage or poor performance in edge cases.
4. **Balance consistency and flexibility**:
Aim for consistent defaults across similar functions or methods. However, consistency should not be preferred to point 2 or 3.
5. **Opt-in for new features**:
Do not enable new features or improvements (e.g., novel loss functions) by default. Users should explicitly opt-in to use these.
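As a minimal sketch of how these guidelines combine in practice (`MyTrainerConfig` and its fields are hypothetical, not an actual TRL config):
```python
from dataclasses import dataclass, field

@dataclass
class MyTrainerConfig:
    # No default: the right dataset varies too much by use case to guess one (guideline 1)
    dataset_name: str = field(metadata={"help": "Dataset to train on."})
    # Proven default, taken from the original paper's recommended setting (guideline 2)
    learning_rate: float = 1e-6
    # Safe, predictable default that avoids excessive memory usage (guideline 3)
    per_device_train_batch_size: int = 8
    # New feature, disabled by default so users explicitly opt in (guideline 5)
    use_novel_loss: bool = False
```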
### Writing documentation
@ -318,26 +288,26 @@ def replicate_str(string: str, n: int, sep: str = " ") -> str:
* Note that `Optional` means that the value can be `None`, and `*optional*` means that it is not required for the user to pass a value.
E.g., for arguments that can't be `None` and aren't required:
```txt
foo (`int`, *optional*, defaults to `4`):
```
For arguments that can be `None` and are required:
```txt
foo (`Optional[int]`):
```
For arguments that can be `None` and aren't required (in this case, if the default value is `None`, you can omit it):
```txt
foo (`Optional[int]`, *optional*):
```
* **String Defaults:**
* Ensure that default string values are wrapped in double quotes:
```txt
defaults to `"foo"`
```
@ -346,7 +316,7 @@ def replicate_str(string: str, n: int, sep: str = " ") -> str:
* **Default Value Formatting:**
* Consistently surround default values with backticks for improved formatting:
```txt
defaults to `4`
```
@ -383,8 +353,8 @@ Our approach to deprecation and backward compatibility is flexible and based on
When a feature or component is marked for deprecation, its use will emit a warning message. This warning will include:
* **Transition Guidance**: Instructions on how to migrate to the alternative solution or replacement.
* **Removal Version**: The target version when the feature will be removed, providing users with a clear timeframe to transition.
Example:
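A minimal sketch of what such a warning might look like (the option names and the version number are illustrative, not taken from TRL):
```python
import warnings

warnings.warn(
    "The `old_option` argument is deprecated and will be removed in version 0.27.0. "
    "Use the `new_option` argument instead.",
    DeprecationWarning,
)
```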
@ -398,9 +368,9 @@ Example:
The deprecation and removal schedule is based on each feature's usage and impact, with examples at two extremes:
* **Experimental or Low-Use Features**: For a feature that is experimental or has limited usage, backward compatibility may not be maintained between releases. Users should therefore anticipate potential breaking changes from one version to the next.
* **Widely-Used Components**: For a feature with high usage, we aim for a more gradual transition period of approximately **5 months**, generally scheduling deprecation around **5 minor releases** after the initial warning.
These examples represent the two ends of a continuum. The specific timeline for each feature will be determined individually, balancing innovation with user stability needs.
@ -410,22 +380,22 @@ Warnings play a critical role in guiding users toward resolving potential issues
#### Definitions
* **Correct**: An operation is correct if it is valid, follows the intended approach, and aligns with the current best practices or guidelines within the codebase. This is the recommended or intended way to perform the operation.
* **Supported**: An operation is supported if it is technically valid and works within the current codebase, but it may not be the most efficient, optimal, or recommended way to perform the task. This includes deprecated features or legacy approaches that still work but may be phased out in the future.
#### Choosing the right message
* **Correct → No warning**:
If the operation is fully valid and expected, no message should be issued. The system is working as intended, so no warning is necessary.
* **Correct but deserves attention → No warning, possibly a log message**:
When an operation is correct but uncommon or requires special attention, providing an informational message can be helpful. This keeps users informed without implying any issue. If available, use the logger to output this message. Example:
```python
logger.info("This is an informational message about a rare but correct operation.")
```
* **Correct but very likely a mistake → Warning with option to disable**:
In rare cases, you may want to issue a warning for a correct operation that's very likely a mistake. In such cases, you must provide an option to suppress the warning. This can be done with a flag in the function. Example:
@ -436,7 +406,7 @@ Warnings play a critical role in guiding users toward resolving potential issues
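A minimal sketch of such a suppressible warning (the function and its `allow_suspicious` flag are hypothetical):
```python
import warnings

def my_function(foo, bar, allow_suspicious=False):
    # `allow_suspicious` is a hypothetical flag letting callers opt out of the warning
    if foo == bar and not allow_suspicious:
        warnings.warn(
            "`foo` and `bar` are identical, which is very likely a mistake. "
            "Pass `allow_suspicious=True` to suppress this warning."
        )
    # Do something
```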
* **Supported but not correct → Warning**:
If the operation is technically supported but is deprecated, suboptimal, or could cause future issues (e.g., conflicting arguments), a warning should be raised. This message should be actionable, meaning it must explain how to resolve the issue. Example:
@ -446,7 +416,7 @@ Warnings play a critical role in guiding users toward resolving potential issues
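A minimal sketch of an actionable warning for such a case (the argument names are illustrative):
```python
import warnings

def my_function(foo, deprecated_bar=None):
    if deprecated_bar is not None:
        warnings.warn(
            "`deprecated_bar` is deprecated and conflicts with `foo`; it will be ignored. "
            "Pass the value via `foo` instead.",
            UserWarning,
        )
    # Do something
```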
* **Not supported → Exception**:
If the operation is invalid or unsupported, raise an exception. This indicates that the operation cannot be performed and requires immediate attention. Example:
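A minimal sketch of raising for an unsupported operation (the function and mode names are illustrative):
```python
def my_function(mode):
    if mode not in ("train", "eval"):
        raise ValueError(f"Unsupported mode {mode!r}; expected 'train' or 'eval'.")
    # Do something
```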
View File
@ -1,6 +1,7 @@
include LICENSE
include CONTRIBUTING.md
include README.md
include trl/templates/*.md
include trl/accelerate_configs/*.yaml
recursive-exclude * __pycache__
prune tests
View File
@ -1,12 +1,11 @@
.PHONY: test precommit common_tests slow_tests tests_gpu test_experimental
check_dirs := examples tests trl
ACCELERATE_CONFIG_PATH = `pwd`/examples/accelerate_configs
COMMAND_FILES_PATH = `pwd`/commands
test:
pytest -n auto -m "not slow and not low-priority" -s -v --reruns 5 --reruns-delay 1 --only-rerun '(OSError|Timeout|HTTPError.*502|HTTPError.*504||not less than or equal to 0.01)' tests/
pytest -n auto -m "not slow and not low_priority" -s -v --reruns 5 --reruns-delay 1 --only-rerun '(OSError|Timeout|HTTPError.*502|HTTPError.*504||not less than or equal to 0.01)' tests/
precommit:
python scripts/add_copyrights.py
@ -16,15 +15,5 @@ precommit:
slow_tests:
pytest -m "slow" tests/ $(if $(IS_GITHUB_CI),--report-log "slow_tests.log",)
test_examples:
touch temp_results_sft_tests.txt
for file in $(ACCELERATE_CONFIG_PATH)/*.yaml; do \
TRL_ACCELERATE_CONFIG=$${file} bash $(COMMAND_FILES_PATH)/run_sft.sh; \
echo $$?','$${file} >> temp_results_sft_tests.txt; \
done
touch temp_results_dpo_tests.txt
for file in $(ACCELERATE_CONFIG_PATH)/*.yaml; do \
TRL_ACCELERATE_CONFIG=$${file} bash $(COMMAND_FILES_PATH)/run_dpo.sh; \
echo $$?','$${file} >> temp_results_dpo_tests.txt; \
done
test_experimental:
pytest -k "experimental" -n auto -s -v
View File
@ -19,11 +19,9 @@
## 🎉 What's New
> **✨ OpenAI GPT OSS Support**: TRL now fully supports fine-tuning the latest [OpenAI GPT OSS models](https://huggingface.co/collections/openai/gpt-oss-68911959590a1634ba11c7a4)! Check out the:
>
> - [OpenAI Cookbook](https://cookbook.openai.com/articles/gpt-oss/fine-tune-transfomers)
> - [GPT OSS recipes](https://github.com/huggingface/gpt-oss-recipes)
> - [Our example script](https://github.com/huggingface/trl/blob/main/examples/scripts/sft_gpt_oss.py)
**OpenEnv Integration:** TRL now supports **[OpenEnv](https://huggingface.co/blog/openenv)**, the open-source framework from Meta for defining, deploying, and interacting with environments in reinforcement learning and agentic workflows.
Explore how to seamlessly integrate TRL with OpenEnv in our [dedicated documentation](openenv).
## Overview
@ -136,23 +134,13 @@ trainer.train()
Here is a basic example of how to use the [`RewardTrainer`](https://huggingface.co/docs/trl/reward_trainer):
```python
from trl import RewardTrainer
from datasets import load_dataset

dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

trainer = RewardTrainer(
    model="Qwen/Qwen2.5-0.5B-Instruct",
    train_dataset=dataset,
)
trainer.train()
@ -178,7 +166,7 @@ trl dpo --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct \
--output_dir Qwen2.5-0.5B-DPO
```
Read more about CLI in the [relevant documentation section](https://huggingface.co/docs/trl/clis) or use `--help` for more details.
## Development
@ -190,6 +178,18 @@ cd trl/
pip install -e .[dev]
```
## Experimental
A minimal incubation area is available under `trl.experimental` for unstable / fast-evolving features. Anything there may change or be removed in any release without notice.
Example:
```python
from trl.experimental.new_trainer import NewTrainer
```
Read more in the [Experimental docs](https://huggingface.co/docs/trl/experimental_overview).
## Citation
```bibtex
View File
@ -1 +1 @@
0.23.0
0.25.0.dev0
View File
@ -1,58 +0,0 @@
#!/bin/bash
# This script runs a DPO example end-to-end on a tiny model using different possible configurations
# but defaults to QLoRA + PEFT
OUTPUT_DIR="test_dpo/"
MODEL_NAME="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
DATASET_NAME="trl-internal-testing/hh-rlhf-helpful-base-trl-style"
MAX_STEPS=5
BATCH_SIZE=2
SEQ_LEN=128
# Handle extra arguments in case one passes accelerate configs.
EXTRA_ACCELERATE_ARGS=""
EXTRA_TRAINING_ARGS="""--use_peft \
--load_in_4bit
"""
# Set your number of GPUs here
NUM_GPUS=2
if [[ "${TRL_ACCELERATE_CONFIG}" == "" ]]; then
EXTRA_ACCELERATE_ARGS=""
else
EXTRA_ACCELERATE_ARGS="--config_file $TRL_ACCELERATE_CONFIG"
# For DeepSpeed configs we need to set the `--fp16` flag to comply with our configs exposed
# in `examples/accelerate_configs`, since our runners do not support bf16 mixed precision training.
if [[ $TRL_ACCELERATE_CONFIG == *"deepspeed"* ]]; then
EXTRA_TRAINING_ARGS="--fp16"
else
echo "Keeping QLoRA + PEFT"
fi
fi
CMD="""
accelerate launch $EXTRA_ACCELERATE_ARGS \
--num_processes $NUM_GPUS \
--mixed_precision 'fp16' \
`pwd`/trl/scripts/dpo.py \
--model_name_or_path $MODEL_NAME \
--dataset_name $DATASET_NAME \
--output_dir $OUTPUT_DIR \
--max_steps $MAX_STEPS \
--per_device_train_batch_size $BATCH_SIZE \
--max_length $SEQ_LEN \
$EXTRA_TRAINING_ARGS
"""
echo "Starting program..."
{ # try
echo $CMD
eval "$CMD"
} || { # catch
# save log for exception
echo "Operation Failed!"
exit 1
}
exit 0
View File
@ -1,59 +0,0 @@
#!/bin/bash
# This script runs an SFT example end-to-end on a tiny model using different possible configurations
# but defaults to QLoRA + PEFT
OUTPUT_DIR="test_sft/"
MODEL_NAME="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
DATASET_NAME="stanfordnlp/imdb"
MAX_STEPS=5
BATCH_SIZE=2
SEQ_LEN=128
# Handle extra arguments in case one passes accelerate configs.
EXTRA_ACCELERATE_ARGS=""
EXTRA_TRAINING_ARGS="""--use_peft \
--load_in_4bit
"""
# Set your number of GPUs here
NUM_GPUS=2
if [[ "${TRL_ACCELERATE_CONFIG}" == "" ]]; then
EXTRA_ACCELERATE_ARGS=""
else
EXTRA_ACCELERATE_ARGS="--config_file $TRL_ACCELERATE_CONFIG"
# For DeepSpeed configs we need to set the `--fp16` flag to comply with our configs exposed
# in `examples/accelerate_configs`, since our runners do not support bf16 mixed precision training.
if [[ $TRL_ACCELERATE_CONFIG == *"deepspeed"* ]]; then
EXTRA_TRAINING_ARGS="--fp16"
else
echo "Keeping QLoRA + PEFT"
fi
fi
CMD="""
accelerate launch $EXTRA_ACCELERATE_ARGS \
--num_processes $NUM_GPUS \
--mixed_precision 'fp16' \
`pwd`/trl/scripts/sft.py \
--model_name $MODEL_NAME \
--dataset_name $DATASET_NAME \
--output_dir $OUTPUT_DIR \
--max_steps $MAX_STEPS \
--per_device_train_batch_size $BATCH_SIZE \
--max_length $SEQ_LEN \
$EXTRA_TRAINING_ARGS
"""
echo "Starting program..."
{ # try
echo $CMD
eval "$CMD"
} || { # catch
# save log for exception
echo "Operation Failed!"
exit 1
}
exit 0
View File
@ -0,0 +1,6 @@
FROM pytorch/pytorch:2.8.0-cuda12.8-cudnn9-runtime
RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
RUN pip install --upgrade pip uv
RUN uv pip install --system --no-cache "git+https://github.com/huggingface/trl.git#egg=trl[liger,peft,vlm]"
RUN uv pip install --system hf_transfer liger_kernel trackio peft
RUN uv pip install --system https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
View File
@ -1,66 +0,0 @@
# Builds GPU docker image of PyTorch
# Uses multi-staged approach to reduce size
# Stage 1
# Use base conda image to reduce time
FROM continuumio/miniconda3:latest AS compile-image
# Specify py version
ENV PYTHON_VERSION=3.10
# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
RUN apt-get update && \
apt-get install -y curl git wget software-properties-common git-lfs && \
apt-get clean && \
rm -rf /var/lib/apt/lists*
# Install audio-related libraries
RUN apt-get update && \
apt install -y ffmpeg
RUN apt install -y libsndfile1-dev
RUN git lfs install
# Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
RUN conda create --name trl python=${PYTHON_VERSION} ipython jupyter pip
RUN python3 -m pip install --no-cache-dir --upgrade pip
# Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
# We don't install pytorch here yet since CUDA isn't available
# instead we use the direct torch wheel
ENV PATH /opt/conda/envs/trl/bin:$PATH
# Activate our bash shell
RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]
# Stage 2
FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 AS build-image
COPY --from=compile-image /opt/conda /opt/conda
ENV PATH /opt/conda/bin:$PATH
RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]
RUN source activate trl && \
python3 -m pip install --no-cache-dir bitsandbytes optimum auto-gptq
# Install apt libs
RUN apt-get update && \
apt-get install -y curl git wget && \
apt-get clean && \
rm -rf /var/lib/apt/lists*
# Activate the conda env and install transformers + accelerate from source
RUN source activate trl && \
python3 -m pip install -U --no-cache-dir \
librosa \
"soundfile>=0.12.1" \
scipy \
transformers \
accelerate \
peft \
trl[test]@git+https://github.com/huggingface/trl
RUN source activate trl && \
pip freeze | grep trl
RUN echo "source activate trl" >> ~/.profile
# Activate the virtualenv
CMD ["/bin/bash"]
View File
@ -1,66 +0,0 @@
# Builds GPU docker image of PyTorch
# Uses multi-staged approach to reduce size
# Stage 1
# Use base conda image to reduce time
FROM continuumio/miniconda3:latest AS compile-image
# Specify py version
ENV PYTHON_VERSION=3.10
# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
RUN apt-get update && \
apt-get install -y curl git wget software-properties-common git-lfs && \
apt-get clean && \
rm -rf /var/lib/apt/lists*
# Install audio-related libraries
RUN apt-get update && \
apt install -y ffmpeg
RUN apt install -y libsndfile1-dev
RUN git lfs install
# Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
RUN conda create --name trl python=${PYTHON_VERSION} ipython jupyter pip
RUN python3 -m pip install --no-cache-dir --upgrade pip
# Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
# We don't install pytorch here yet since CUDA isn't available
# instead we use the direct torch wheel
ENV PATH /opt/conda/envs/trl/bin:$PATH
# Activate our bash shell
RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]
# Stage 2
FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 AS build-image
COPY --from=compile-image /opt/conda /opt/conda
ENV PATH /opt/conda/bin:$PATH
RUN chsh -s /bin/bash
SHELL ["/bin/bash", "-c"]
RUN source activate trl && \
python3 -m pip install --no-cache-dir bitsandbytes optimum auto-gptq
# Install apt libs
RUN apt-get update && \
apt-get install -y curl git wget && \
apt-get clean && \
rm -rf /var/lib/apt/lists*
# Activate the conda env and install transformers + accelerate from source
RUN source activate trl && \
python3 -m pip install -U --no-cache-dir \
librosa \
"soundfile>=0.12.1" \
scipy \
git+https://github.com/huggingface/transformers \
git+https://github.com/huggingface/accelerate \
git+https://github.com/huggingface/peft \
trl[test]@git+https://github.com/huggingface/trl
RUN source activate trl && \
pip freeze | grep transformers
RUN echo "source activate trl" >> ~/.profile
# Activate the virtualenv
CMD ["/bin/bash"]
docker/trl/Dockerfile (new file)
View File
@ -0,0 +1,4 @@
FROM pytorch/pytorch:2.8.0-cuda12.8-cudnn9-runtime
RUN pip install --upgrade pip uv
RUN uv pip install --system trl[liger,peft,vlm] hf_transfer trackio
RUN uv pip install --system https://github.com/Dao-AILab/flash-attention/releases/download/v2.8.3/flash_attn-2.8.3+cu12torch2.8cxx11abiFALSE-cp311-cp311-linux_x86_64.whl
View File
@ -11,10 +11,6 @@
title: Dataset Formats
- local: paper_index
title: Paper Index
- local: how_to_train
title: Training FAQ
- local: logging
title: Understanding Logs
title: Conceptual Guides
- sections:
- local: clis
@ -41,6 +37,8 @@
title: Liger Kernel
- local: peft_integration
title: PEFT
- local: rapidfire_integration
title: RapidFire AI
- local: trackio_integration
title: Trackio
- local: unsloth_integration
@ -53,25 +51,15 @@
title: Example Overview
- local: community_tutorials
title: Community Tutorials
- local: lora_without_regret
title: LoRA Without Regret
- local: sentiment_tuning
title: Sentiment Tuning
- local: using_llama_models
title: Training StackLlama
- local: detoxifying_a_lm
title: Detoxifying a Language Model
- local: multi_adapter_rl
title: Multi Adapter RLHF
title: Examples
- sections: # Sorted alphabetically
- local: alignprop_trainer
title: AlignProp
- local: bco_trainer
title: BCO
- local: cpo_trainer
title: CPO
- local: ddpo_trainer
title: DDPO
- local: dpo_trainer
title: DPO
- local: online_dpo_trainer
@ -96,8 +84,6 @@
title: RLOO
- local: sft_trainer
title: SFT
- local: iterative_sft_trainer
title: Iterative SFT
- local: xpo_trainer
title: XPO
title: Trainers
@ -105,8 +91,6 @@
title: Model Classes
- local: model_utils
title: Model Utilities
- local: best_of_n
title: Best of N Sampling
- local: judges
title: Judges
- local: callbacks
@ -120,3 +104,23 @@
- local: others
title: Others
title: API
- sections:
- local: experimental_overview
title: Experimental Overview
- local: bema_for_reference_model # Sorted alphabetically
title: BEMA for Reference Model
- local: bco_trainer
title: BCO
- local: gfpo
title: GFPO
- local: gold_trainer
title: GOLD
- local: grpo_with_replay_buffer
title: GRPO With Replay Buffer
- local: gspo_token
title: GSPO-token
- local: papo_trainer
title: PAPO
- local: openenv
title: OpenEnv Integration
title: Experimental
View File
@ -1,102 +0,0 @@
# Aligning Text-to-Image Diffusion Models with Reward Backpropagation
[![](https://img.shields.io/badge/All_models-AlignProp-blue)](https://huggingface.co/models?other=alignprop,trl)
## The why
If your reward function is differentiable, directly backpropagating gradients from the reward model to the diffusion model is significantly more sample- and compute-efficient (25x) than using a policy gradient algorithm like DDPO.
AlignProp does full backpropagation through time, which allows updating the earlier steps of denoising via reward backpropagation.
<div style="text-align: center"><img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/reward_tuning.png"/></div>
## Getting started with `examples/scripts/alignprop.py`
The `alignprop.py` script is a working example of using the `AlignProp` trainer to finetune a Stable Diffusion model. This example explicitly configures a small subset of the overall parameters associated with the config object (`AlignPropConfig`).
**Note:** one A100 GPU is recommended to get this running. For a lower memory setting, consider setting `truncated_backprop_rand` to `False`. With default settings this will do truncated backpropagation with K=1.
Almost every configuration parameter has a default. Only one command-line flag is required of the user to get things up and running. The user is expected to have a [huggingface user access token](https://huggingface.co/docs/hub/security-tokens) that will be used to upload the model post-finetuning to the HuggingFace Hub. The following bash command gets things running:
```bash
python alignprop.py --hf_user_access_token <token>
```
To obtain the documentation of `alignprop.py`, please run `python alignprop.py --help`
The following are things to keep in mind in general while configuring the trainer, beyond the use case of the example script (the code checks these for you as well):
- The configurable randomized truncation range (`--alignprop_config.truncated_rand_backprop_minmax=(0,50)`): the first number should be greater than or equal to 0, while the second number should be less than or equal to the number of diffusion timesteps (`sample_num_steps`)
- The configurable truncation backprop absolute step (`--alignprop_config.truncated_backprop_timestep=49`): the number should be less than the number of diffusion timesteps (`sample_num_steps`); it only matters when `truncated_backprop_rand` is set to `False`
## Setting up the image logging hook function
Expect the function to be given a dictionary with keys
```python
['image', 'prompt', 'prompt_metadata', 'rewards']
```
and `image`, `prompt`, `prompt_metadata`, `rewards` are batched.
You are free to log however you want; the use of `wandb` or `tensorboard` is recommended.
### Key terms
- `rewards`: The reward/score is a numerical value associated with the generated image and is key to steering the RL process
- `prompt`: The prompt is the text that is used to generate the image
- `prompt_metadata`: The prompt metadata is the metadata associated with the prompt. A situation where this will not be empty is when the reward model comprises a [`FLAVA`](https://huggingface.co/docs/transformers/model_doc/flava) setup where questions and ground-truth answers (linked to the generated image) are expected with the generated image (see here: https://github.com/kvablack/ddpo-pytorch/blob/main/ddpo_pytorch/rewards.py#L45)
- `image`: The image generated by the Stable Diffusion model
Example code for logging sampled images with `wandb` is given below.
```python
import numpy as np
from PIL import Image

# for logging these images to wandb
def image_outputs_hook(image_data, global_step, accelerate_logger):
# For the sake of this example, we only care about the last batch
# hence we extract the last element of the list
result = {}
images, prompts, rewards = [image_data['images'], image_data['prompts'], image_data['rewards']]
for i, image in enumerate(images):
pil = Image.fromarray(
(image.cpu().numpy().transpose(1, 2, 0) * 255).astype(np.uint8)
)
pil = pil.resize((256, 256))
result[f"{prompts[i]:.25} | {rewards[i]:.2f}"] = [pil]
accelerate_logger.log_images(
result,
step=global_step,
)
```
### Using the finetuned model
Assuming you're done with all the epochs and have pushed up your model to the hub, you can use the finetuned model as follows
```python
from diffusers import StableDiffusionPipeline
pipeline = StableDiffusionPipeline.from_pretrained("runwayml/stable-diffusion-v1-5")
pipeline.to("cuda")
pipeline.load_lora_weights('mihirpd/alignprop-trl-aesthetics')
prompts = ["squirrel", "crab", "starfish", "whale","sponge", "plankton"]
results = pipeline(prompts)
for prompt, image in zip(prompts,results.images):
image.save(f"dump/{prompt}.png")
```
## Credits
This work is heavily influenced by the repo [here](https://github.com/mihirp1998/AlignProp/) and the associated paper [Aligning Text-to-Image Diffusion Models with Reward Backpropagation
by Mihir Prabhudesai, Anirudh Goyal, Deepak Pathak, Katerina Fragkiadaki](https://huggingface.co/papers/2310.03739).
## AlignPropTrainer
[[autodoc]] AlignPropTrainer
- train
## AlignPropConfig
[[autodoc]] AlignPropConfig
View File
@ -1,6 +1,6 @@
# BCO Trainer
[![model badge](https://img.shields.io/badge/All_models-BCO-blue)](https://huggingface.co/models?other=bco,trl)
TRL supports the Binary Classifier Optimization (BCO).
The [BCO](https://huggingface.co/papers/2404.04656) authors train a binary classifier whose logit serves as a reward so that the classifier maps {prompt, chosen completion} pairs to 1 and {prompt, rejected completion} pairs to 0.
@ -8,21 +8,20 @@ For a full example have a look at [`examples/scripts/bco.py`].
## Expected dataset type
The [`experimental.bco.BCOTrainer`] requires an [unpaired preference dataset](dataset_formats#unpaired-preference).
The [`experimental.bco.BCOTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset formats. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
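For reference, an unpaired preference example pairs a prompt and a completion with a boolean label, along these lines (a minimal sketch following the dataset formats guide):
```python
example = {"prompt": "The sky is", "completion": " blue.", "label": True}
```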
## Expected model format
The BCO trainer expects a model of `AutoModelForCausalLM`, compared to PPO that expects `AutoModelForCausalLMWithValueHead` for the value function.
## Using the `BCOTrainer`
For a detailed example have a look at the `examples/scripts/bco.py` script. At a high level we need to initialize the `BCOTrainer` with a `model` we wish to train and a reference `ref_model` which we will use to calculate the implicit rewards of the preferred and rejected response.
The `beta` refers to the hyperparameter of the implicit reward, and the dataset contains the 3 entries listed above. Note that the `model` and `ref_model` need to have the same architecture (ie decoder only or encoder-decoder).
```python
training_args = BCOConfig(
beta=0.1,
)
@ -35,9 +34,10 @@ bco_trainer = BCOTrainer(
processing_class=tokenizer,
)
```
After this one can then call:
```python
bco_trainer.train()
```
@ -49,7 +49,7 @@ If the prompts in your desired and undesired datasets differ a lot, it is useful
Choose an embedding model and tokenizer:
```python
embedding_model = AutoModel.from_pretrained(your_model_id)
embedding_tokenizer = AutoTokenizer.from_pretrained(your_model_id)
@ -64,7 +64,7 @@ embedding_func = partial(embed_prompt, model=embedding_model)
Set `prompt_sample_size` to define how many prompts are selected to train the UDM classifier and start the training with the provided embedding function:
```python
training_args = BCOConfig(
beta=0.1,
prompt_sample_size=512,
@ -93,11 +93,11 @@ To scale how much the auxiliary loss contributes to the total loss, use the hype
## BCOTrainer
[[autodoc]] experimental.bco.BCOTrainer
- train
- save_model
- push_to_hub
## BCOConfig
[[autodoc]] experimental.bco.BCOConfig
View File
@ -0,0 +1,31 @@
# BEMA for Reference Model
This feature implements the BEMA algorithm to update the reference model during DPO training.
## Usage
```python
from trl.experimental.bema_for_ref_model import BEMACallback, DPOTrainer
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
pref_dataset = load_dataset("trl-internal-testing/zen", "standard_preference", split="train")
ref_model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
bema_callback = BEMACallback(update_ref_model=True)
model = AutoModelForCausalLM.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
tokenizer = AutoTokenizer.from_pretrained("trl-internal-testing/tiny-Qwen2ForCausalLM-2.5")
tokenizer.pad_token = tokenizer.eos_token
trainer = DPOTrainer(
model=model,
ref_model=ref_model,
train_dataset=pref_dataset,
processing_class=tokenizer,
callbacks=[bema_callback],
)
trainer.train()
```
View File
@ -1,74 +0,0 @@
# Best of N sampling: Alternative ways to get better model output without RL based fine-tuning
Within the extras module is the `best-of-n` sampler class that serves as an alternative method of generating better model output.
For how it fares against RL-based fine-tuning, please look in the `examples` directory for a comparison example
## Usage
To get started quickly, instantiate an instance of the class with a model, a length sampler, a tokenizer and a callable that serves as a proxy reward pipeline that outputs reward scores for input queries
```python
from transformers import pipeline, AutoTokenizer
from trl import AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler
from trl.extras import BestOfNSampler
# model, ref_model_name, reward_model, device and output_length_sampler are assumed to be defined earlier
ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(ref_model_name)
reward_pipe = pipeline("sentiment-analysis", model=reward_model, device=device)
tokenizer = AutoTokenizer.from_pretrained(ref_model_name)
tokenizer.pad_token = tokenizer.eos_token
# callable that takes a list of raw text and returns a list of corresponding reward scores
def queries_to_scores(list_of_strings):
return [output["score"] for output in reward_pipe(list_of_strings)]
best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler)
```
And assuming you have a list/tensor of tokenized queries, you can generate better output by calling the `generate` method
```python
best_of_n.generate(query_tensors, device=device, **gen_kwargs)
```
The default sample size is 4, but you can change it at the time of instance initialization like so
```python
best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler, sample_size=8)
```
The default output is the result of taking the top scored output for each query, but you can change it to top 2 and so on by passing the `n_candidates` argument at the time of instance initialization
```python
best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler, n_candidates=2)
```
There is the option of setting the generation settings (like `temperature`, `pad_token_id`) at the time of instance creation as opposed to when calling the `generate` method.
This is done by passing a `GenerationConfig` from the `transformers` library at the time of initialization
```python
from transformers import GenerationConfig
generation_config = GenerationConfig(min_length=-1, top_k=0.0, top_p=1.0, do_sample=True, pad_token_id=tokenizer.eos_token_id)
best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler, generation_config=generation_config)
best_of_n.generate(query_tensors, device=device)
```
Furthermore, at the time of initialization you can set the seed to control the repeatability of the generation process and the number of samples to generate for each query
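A minimal sketch combining these options (assuming `seed` and `sample_size` keyword arguments, as described above):
```python
best_of_n = BestOfNSampler(
    model,
    tokenizer,
    queries_to_scores,
    length_sampler=output_length_sampler,
    seed=42,        # control repeatability of the generation process
    sample_size=8,  # number of samples to generate for each query
)
```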
## BestOfNSampler
[[autodoc]] BestOfNSampler
View File
@ -23,3 +23,7 @@
## BEMACallback
[[autodoc]] BEMACallback
## WeaveCallback
[[autodoc]] WeaveCallback
View File
@ -2,17 +2,20 @@
TRL provides a powerful command-line interface (CLI) to fine-tune large language models (LLMs) using methods like Supervised Fine-Tuning (SFT), Direct Preference Optimization (DPO), and more. The CLI abstracts away much of the boilerplate, letting you launch training jobs quickly and reproducibly.
## Commands
Currently supported commands are:
### Training Commands
- `trl dpo`: fine-tune a LLM with DPO
- `trl grpo`: fine-tune a LLM with GRPO
- `trl kto`: fine-tune a LLM with KTO
- `trl reward`: train a Reward Model
- `trl rloo`: fine-tune a LLM with RLOO
- `trl sft`: fine-tune a LLM with SFT
### Other Commands
- `trl env`: get the system information
- `trl vllm-serve`: serve a model with vLLM
@ -41,6 +44,15 @@ trl dpo \
--dataset_name anthropic/hh-rlhf
```
</hfoption>
<hfoption id="Reward">
```bash
trl reward \
--model_name_or_path Qwen/Qwen2.5-0.5B \
--dataset_name trl-lib/ultrafeedback_binarized
```
</hfoption>
</hfoptions>
@ -78,6 +90,21 @@ Launch with:
trl dpo --config dpo_config.yaml
```
</hfoption>
<hfoption id="Reward">
```yaml
# reward_config.yaml
model_name_or_path: Qwen/Qwen2.5-0.5B
dataset_name: trl-lib/ultrafeedback_binarized
```
Launch with:
```bash
trl reward --config reward_config.yaml
```
</hfoption>
</hfoptions>
@ -138,6 +165,33 @@ Launch with:
```bash
trl dpo --config dpo_config.yaml
```
</hfoption>
<hfoption id="Reward inline">
```bash
trl reward \
--model_name_or_path Qwen/Qwen2.5-0.5B \
--dataset_name trl-lib/ultrafeedback_binarized \
--num_processes 4
```
</hfoption>
<hfoption id="Reward w/ config file">
```yaml
# reward_config.yaml
model_name_or_path: Qwen/Qwen2.5-0.5B
dataset_name: trl-lib/ultrafeedback_binarized
num_processes: 4
```
Launch with:
```bash
trl reward --config reward_config.yaml
```
</hfoption>
</hfoptions>
@ -145,22 +199,22 @@ trl dpo --config dpo_config.yaml
The `--accelerate_config` flag lets you easily configure distributed training with [🤗 Accelerate](https://github.com/huggingface/accelerate). This flag accepts either:
- the name of a predefined config profile (built into TRL), or
- a path to a custom Accelerate YAML config file.
#### Predefined Config Profiles
TRL provides several ready-to-use Accelerate configs to simplify common training setups:
| Name | Description |
| --- | --- |
| `fsdp1` | Fully Sharded Data Parallel Stage 1 |
| `fsdp2` | Fully Sharded Data Parallel Stage 2 |
| `zero1` | DeepSpeed ZeRO Stage 1 |
| `zero2` | DeepSpeed ZeRO Stage 2 |
| `zero3` | DeepSpeed ZeRO Stage 3 |
| `multi_gpu` | Multi-GPU training |
| `single_gpu` | Single-GPU training |
To use one of these, just pass the name to `--accelerate_config`. TRL will automatically load the corresponding config file from `trl/accelerate_config/`.
@ -217,6 +271,33 @@ Launch with:
```bash
trl dpo --config dpo_config.yaml
```
</hfoption>
<hfoption id="Reward inline">
```bash
trl reward \
--model_name_or_path Qwen/Qwen2.5-0.5B \
--dataset_name trl-lib/ultrafeedback_binarized \
--accelerate_config zero2 # or path/to/my/accelerate/config.yaml
```
</hfoption>
<hfoption id="Reward w/ config file">
```yaml
# reward_config.yaml
model_name_or_path: Qwen/Qwen2.5-0.5B
dataset_name: trl-lib/ultrafeedback_binarized
accelerate_config: zero2 # or path/to/my/accelerate/config.yaml
```
Launch with:
```bash
trl reward --config reward_config.yaml
```
</hfoption>
</hfoptions>
@ -224,7 +305,7 @@ trl dpo --config dpo_config.yaml
You can use dataset mixtures to combine multiple datasets into a single training dataset. This is useful for training on diverse data sources or when you want to mix different types of data.
<hfoptions id="accelerate_config">
<hfoptions id="dataset_mixtures">
<hfoption id="SFT">
```yaml
@ -258,6 +339,23 @@ Launch with:
trl dpo --config dpo_config.yaml
```
</hfoption>
<hfoption id="Reward">
```yaml
# reward_config.yaml
model_name_or_path: Qwen/Qwen2.5-0.5B
datasets:
- path: trl-lib/tldr-preference
- path: trl-lib/lm-human-preferences-sentiment
```
Launch with:
```bash
trl reward --config reward_config.yaml
```
</hfoption>
</hfoptions>
View File
@ -2,10 +2,13 @@
Community tutorials are made by active members of the Hugging Face community who want to share their knowledge and expertise with others. They are a great way to learn about the library and its features, and to get started with core classes and modalities.
## Language Models
### Tutorials
| Task | Class | Description | Author | Tutorial | Colab |
| --- | --- | --- | --- | --- | --- |
| Reinforcement Learning | [`GRPOTrainer`] | Efficient Online Training with GRPO and vLLM in TRL | [Sergio Paniego](https://huggingface.co/sergiopaniego) | [Link](https://huggingface.co/learn/cookbook/grpo_vllm_online_training) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/cookbook/blob/main/notebooks/en/grpo_vllm_online_training.ipynb) |
| Reinforcement Learning | [`GRPOTrainer`] | Post training an LLM for reasoning with GRPO in TRL | [Sergio Paniego](https://huggingface.co/sergiopaniego) | [Link](https://huggingface.co/learn/cookbook/fine_tuning_llm_grpo_trl) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/cookbook/blob/main/notebooks/en/fine_tuning_llm_grpo_trl.ipynb) |
| Reinforcement Learning | [`GRPOTrainer`] | Mini-R1: Reproduce Deepseek R1 „aha moment“ a RL tutorial | [Philipp Schmid](https://huggingface.co/philschmid) | [Link](https://www.philschmid.de/mini-deepseek-r1) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/deep-learning-pytorch-huggingface/blob/main/training/mini-deepseek-r1-aha-grpo.ipynb) |
| Reinforcement Learning | [`GRPOTrainer`] | RL on LLaMA 3.1-8B with GRPO and Unsloth optimizations | [Andrea Manzoni](https://huggingface.co/AManzoni) | [Link](https://colab.research.google.com/github/amanzoni1/fine_tuning/blob/main/RL_LLama3_1_8B_GRPO.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/amanzoni1/fine_tuning/blob/main/RL_LLama3_1_8B_GRPO.ipynb) |
@ -15,9 +18,28 @@ Community tutorials are made by active members of the Hugging Face community who
| Preference Optimization | [`ORPOTrainer`] | Fine-tuning Llama 3 with ORPO combining instruction tuning and preference alignment | [Maxime Labonne](https://huggingface.co/mlabonne) | [Link](https://mlabonne.github.io/blog/posts/2024-04-19_Fine_tune_Llama_3_with_ORPO.html) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1eHNWg9gnaXErdAa8_mcvjMupbSS6rDvi) |
| Instruction tuning | [`SFTTrainer`] | How to fine-tune open LLMs in 2025 with Hugging Face | [Philipp Schmid](https://huggingface.co/philschmid) | [Link](https://www.philschmid.de/fine-tune-llms-in-2025) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/deep-learning-pytorch-huggingface/blob/main/training/fine-tune-llms-in-2025.ipynb) |
### Videos
| Task | Title | Author | Video |
| --- | --- | --- | --- |
| Instruction tuning | Fine-tuning open AI models using Hugging Face TRL | [Wietse Venema](https://huggingface.co/wietsevenema) | [<img src="https://img.youtube.com/vi/cnGyyM0vOes/0.jpg">](https://youtu.be/cnGyyM0vOes) |
| Instruction tuning | How to fine-tune a smol-LM with Hugging Face, TRL, and the smoltalk Dataset | [Mayurji](https://huggingface.co/iammayur) | [<img src="https://img.youtube.com/vi/jKdXv3BiLu0/0.jpg">](https://youtu.be/jKdXv3BiLu0) |
<details>
<summary>⚠️ Deprecated features notice for "How to fine-tune a smol-LM with Hugging Face, TRL, and the smoltalk Dataset" (click to expand)</summary>
> [!WARNING]
> The tutorial uses two deprecated features:
>
> - `SFTTrainer(..., tokenizer=tokenizer)`: Use `SFTTrainer(..., processing_class=tokenizer)` instead, or simply omit it (it will be inferred from the model).
> - `setup_chat_format(model, tokenizer)`: Use `SFTConfig(..., chat_template_path="Qwen/Qwen3-0.6B")`, where `chat_template_path` specifies the model whose chat template you want to copy.
</details>
## Vision Language Models
### Tutorials
| Task | Class | Description | Author | Tutorial | Colab |
| --- | --- | --- | --- | --- | --- |
View File
@ -1,6 +1,6 @@
# CPO Trainer
[![model badge](https://img.shields.io/badge/All_models-CPO-blue)](https://huggingface.co/models?other=cpo,trl)
## Overview
@ -98,15 +98,13 @@ To use this loss as described in the paper, we can set the `loss_type="alphapo"`
The CPO algorithm supports several loss functions. The loss function can be set using the `loss_type` parameter in the [`CPOConfig`]. The following loss functions are supported:
| `loss_type=` | Description |
| --- | --- |
| `"sigmoid"` (default) | Given the preference data, we can fit a binary classifier according to the Bradley-Terry model, and in fact, the [DPO](https://huggingface.co/papers/2305.18290) authors propose the sigmoid loss on the normalized likelihood via the `logsigmoid` to fit a logistic regression. |
| `"hinge"` | The [RSO](https://huggingface.co/papers/2309.06657) authors propose to use a hinge loss on the normalized likelihood from the [SLiC](https://huggingface.co/papers/2305.10425) paper. In this case, the `beta` is the reciprocal of the margin. |
| `"ipo"` | The [IPO](https://huggingface.co/papers/2310.12036) authors provide a deeper theoretical understanding of the DPO algorithms and identify an issue with overfitting and propose an alternative loss. In this case, the `beta` is the reciprocal of the gap between the log-likelihood ratios of the chosen vs the rejected completion pair, and thus the smaller the `beta`, the larger this gap is. As per the paper, the loss is averaged over log-likelihoods of the completion (unlike DPO, which is summed only). |
| `"simpo"` | The [SimPO](https://huggingface.co/papers/2405.14734) method is also implemented in the [`CPOTrainer`]. SimPO is an alternative loss that adds a reward margin, allows for length normalization, and does not use BC regularization. To use this loss, simply set `loss_type="simpo"` and `cpo_alpha=0.0` in the [`CPOConfig`] and `simpo_gamma` to a recommended value. |
| `"alphapo"` | The [AlphaPO](https://huggingface.co/papers/2501.03884) method is also implemented in the [`CPOTrainer`]. This is syntactic sugar that automatically sets `loss_type="simpo"` and `cpo_alpha=0.0`. AlphaPO applies a transformation to the reward function shape in the context of SimPO loss when the `alpha` parameter is non-zero. |
### For Mixture of Experts Models: Enabling the auxiliary loss
View File
@ -2,8 +2,6 @@
TRL is designed with modularity in mind so that users are able to efficiently customize the training loop for their needs. Below are some examples on how you can apply and test different techniques. Note: Although these examples use the DPOTrainer, the customization applies to most (if not all) trainers.
## Use different optimizers and schedulers
By default, the `DPOTrainer` creates a `torch.optim.AdamW` optimizer. You can create and define a different optimizer and pass it to `DPOTrainer` as follows:
@ -84,11 +82,11 @@ trainer = DPOTrainer(
trainer.train()
```
## Pass 8-bit reference models
Since `trl` supports all keyword arguments when loading a model from `transformers` using `from_pretrained`, you can also leverage `load_in_8bit` from `transformers` for more memory efficient fine-tuning.
Read more about 8-bit model loading in the `transformers` [Load in 8bit or 4bit](https://huggingface.co/docs/transformers/en/peft#load-in-8bit-or-4bit) guide.
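A minimal sketch of passing an 8-bit reference model to the [`DPOTrainer`] (the model and dataset choices are illustrative, and `bitsandbytes` must be installed):
```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from trl import DPOConfig, DPOTrainer

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
# Load the reference model in 8-bit for more memory-efficient fine-tuning
ref_model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2.5-0.5B-Instruct",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

trainer = DPOTrainer(
    model=model,
    ref_model=ref_model,
    args=DPOConfig(output_dir="Qwen2.5-0.5B-DPO-8bit-ref"),
    train_dataset=dataset,
    processing_class=tokenizer,
)
trainer.train()
```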
@ -114,7 +112,7 @@ trainer.train()
## Use the accelerator cache optimizer
When training large models, it is better to handle the accelerator cache by iteratively clearing it. To do so, simply pass `optimize_device_cache=True` to [`DPOConfig`]:
```python
training_args = DPOConfig(..., optimize_device_cache=True)
View File
@ -4,6 +4,10 @@
[[autodoc]] prepare_multimodal_messages
## prepare_multimodal_messages_vllm
[[autodoc]] prepare_multimodal_messages_vllm
## is_conversational
[[autodoc]] is_conversational
View File
@ -81,7 +81,7 @@ This guide provides an overview of the dataset formats and types supported by ea
<td>Stepwise supervision</td>
<td>
<pre><code>{"prompt": "Which number is larger, 9.8 or 9.11?",
"completions": ["The fractional part of 9.8 is 0.8.",
"completions": ["The fractional part of 9.8 is 0.8.",
"The fractional part of 9.11 is 0.11.",
"0.11 is greater than 0.8.",
"Hence, 9.11 > 9.8."],
@ -132,8 +132,6 @@ preference_example = {
}
```
Conversational datasets are useful for training chat models, but must be converted into a standard format before being used with TRL trainers. This is typically done using chat templates specific to the model being used. For more information, refer to the [Working with conversational datasets in TRL](#working-with-conversational-datasets-in-trl) section.
#### Tool Calling
Some chat templates support *tool calling*, which allows the model to interact with external functions—referred to as **tools**—during generation. This extends the conversational capabilities of the model by enabling it to output a `"tool_calls"` field instead of a standard `"content"` message whenever it decides to invoke a tool.
@ -289,31 +287,28 @@ prompt_only_example = {"prompt": [{"role": "user", "content": "What color is the
For examples of prompt-only datasets, refer to the [Prompt-only datasets collection](https://huggingface.co/collections/trl-lib/prompt-only-datasets-677ea25245d20252cea00368).
> [!TIP]
> While both the prompt-only and language modeling types are similar, they differ in how the input is handled. In the prompt-only type, the prompt represents a partial input that expects the model to complete or continue, while in the language modeling type, the input is treated as a complete sentence or sequence. These two types are processed differently by TRL. Below is an example showing the difference in the output of the `apply_chat_template` function for each type:
>
> ```python
> from transformers import AutoTokenizer
> from trl import apply_chat_template
>
> tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
>
> # Example for prompt-only type
> prompt_only_example = {"prompt": [{"role": "user", "content": "What color is the sky?"}]}
> apply_chat_template(prompt_only_example, tokenizer)
> # Output: {'prompt': '<|user|>\nWhat color is the sky?<|end|>\n<|assistant|>\n'}
>
> # Example for language modeling type
> lm_example = {"messages": [{"role": "user", "content": "What color is the sky?"}]}
> apply_chat_template(lm_example, tokenizer)
> # Output: {'text': '<|user|>\nWhat color is the sky?<|end|>\n<|endoftext|>'}
> ```
>
> - The prompt-only output includes a `'<|assistant|>\n'`, indicating the beginning of the assistant's turn and expecting the model to generate a completion.
> - In contrast, the language modeling output treats the input as a complete sequence and terminates it with `'<|endoftext|>'`, signaling the end of the text and not expecting any additional content.
#### Prompt-completion
@ -390,103 +385,23 @@ For examples of stepwise supervision datasets, refer to the [Stepwise supervisio
Choosing the right dataset type depends on the task you are working on and the specific requirements of the TRL trainer you are using. Below is a brief overview of the dataset types supported by each TRL trainer.
| Trainer | Expected dataset type |
| --- | --- |
| [`experimental.bco.BCOTrainer`] | [Unpaired preference](#unpaired-preference) or [Preference (explicit prompt recommended)](#preference) |
| [`CPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
| [`DPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
| [`GKDTrainer`] | [Prompt-completion](#prompt-completion) |
| [`GRPOTrainer`] | [Prompt-only](#prompt-only) |
| [`KTOTrainer`] | [Unpaired preference](#unpaired-preference) or [Preference (explicit prompt recommended)](#preference) |
| [`NashMDTrainer`] | [Prompt-only](#prompt-only) |
| [`OnlineDPOTrainer`] | [Prompt-only](#prompt-only) |
| [`ORPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
| [`PPOTrainer`] | Tokenized language modeling |
| [`PRMTrainer`] | [Stepwise supervision](#stepwise-supervision) |
| [`RewardTrainer`] | [Preference (implicit prompt recommended)](#preference) |
| [`RLOOTrainer`] | [Prompt-only](#prompt-only) |
| [`SFTTrainer`] | [Language modeling](#language-modeling) or [Prompt-completion](#prompt-completion) |
| [`XPOTrainer`] | [Prompt-only](#prompt-only) |
<Tip>
TRL trainers only support standard dataset formats, [for now](https://github.com/huggingface/trl/issues/2071). If you have a conversational dataset, you must first convert it into a standard format.
For more information on how to work with conversational datasets, refer to the [Working with conversational datasets in TRL](#working-with-conversational-datasets-in-trl) section.
</Tip>
## Working with conversational datasets in TRL
Conversational datasets are increasingly common, especially for training chat models. However, some TRL trainers don't support conversational datasets in their raw format. (For more information, see [issue #2071](https://github.com/huggingface/trl/issues/2071).) These datasets must first be converted into a standard format.
Fortunately, TRL offers tools to easily handle this conversion, which are detailed below.
### Converting a conversational dataset into a standard dataset
To convert a conversational dataset into a standard dataset, you need to _apply a chat template_ to the dataset. A chat template is a predefined structure that typically includes placeholders for user and assistant messages. This template is provided by the tokenizer of the model you use.
For detailed instructions on using chat templating, refer to the [Chat templating section in the `transformers` documentation](https://huggingface.co/docs/transformers/en/chat_templating).
In TRL, the method you apply to convert the dataset will vary depending on the task. Fortunately, TRL provides a helper function called [`apply_chat_template`] to simplify this process. Here's an example of how to use it:
```python
from transformers import AutoTokenizer
from trl import apply_chat_template
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
example = {
"prompt": [{"role": "user", "content": "What color is the sky?"}],
"completion": [{"role": "assistant", "content": "It is blue."}]
}
apply_chat_template(example, tokenizer)
# Output:
# {'prompt': '<|user|>\nWhat color is the sky?<|end|>\n<|assistant|>\n', 'completion': 'It is blue.<|end|>\n<|endoftext|>'}
```
Alternatively, you can use the [`~datasets.Dataset.map`] method to apply the template across an entire dataset:
```python
from datasets import Dataset
from transformers import AutoTokenizer
from trl import apply_chat_template

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-128k-instruct")
dataset_dict = {
"prompt": [[{"role": "user", "content": "What color is the sky?"}],
[{"role": "user", "content": "Where is the sun?"}]],
"completion": [[{"role": "assistant", "content": "It is blue."}],
[{"role": "assistant", "content": "In the sky."}]]
}
dataset = Dataset.from_dict(dataset_dict)
dataset = dataset.map(apply_chat_template, fn_kwargs={"tokenizer": tokenizer})
# Output:
# {'prompt': ['<|user|>\nWhat color is the sky?<|end|>\n<|assistant|>\n',
# '<|user|>\nWhere is the sun?<|end|>\n<|assistant|>\n'],
# 'completion': ['It is blue.<|end|>\n<|endoftext|>', 'In the sky.<|end|>\n<|endoftext|>']}
```
> [!WARNING]
> We recommend using the [`apply_chat_template`] function instead of calling `tokenizer.apply_chat_template` directly. Handling chat templates for non-language modeling datasets can be tricky and may result in errors, such as mistakenly placing a system prompt in the middle of a conversation.
> For additional examples, see [#1930 (comment)](https://github.com/huggingface/trl/pull/1930#issuecomment-2292908614). The [`apply_chat_template`] function is designed to handle these intricacies and ensure the correct application of chat templates for various tasks.
> [!WARNING]
> It's important to note that chat templates are model-specific. For example, if you use the chat template from [meta-llama/Meta-Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) with the above example, you get a different output:
>
> ```python
> apply_chat_template(example, AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct"))
> # Output (abbreviated):
> # {'prompt': '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\nWhat color is the sky?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n',
> #  'completion': 'It is blue.<|eot_id|>'}
> ```
>
> Always use the chat template associated with the model you're working with. Using the wrong template can lead to inaccurate or unexpected results.
## Using any dataset with TRL: preprocessing and conversion
This section provides example code to help you convert between different dataset types.
For simplicity, some of the examples below do not follow this recommendation and use the standard format. However, the conversions can be applied directly to the conversational format without modification.
| From \ To | Language modeling | Prompt-completion | Prompt-only | Preference with implicit prompt | Preference | Unpaired preference | Stepwise supervision |
| --- | --- | --- | --- | --- | --- | --- | --- |
| Language modeling | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
| Prompt-completion | [🔗](#from-prompt-completion-to-language-modeling-dataset) | N/A | [🔗](#from-prompt-completion-to-prompt-only-dataset) | N/A | N/A | N/A | N/A |
| Prompt-only | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
| Preference with implicit prompt | [🔗](#from-preference-with-implicit-prompt-to-language-modeling-dataset) | [🔗](#from-preference-with-implicit-prompt-to-prompt-completion-dataset) | [🔗](#from-preference-with-implicit-prompt-to-prompt-only-dataset) | N/A | [🔗](#from-implicit-to-explicit-prompt-preference-dataset) | [🔗](#from-preference-with-implicit-prompt-to-unpaired-preference-dataset) | N/A |
| Preference | [🔗](#from-preference-to-language-modeling-dataset) | [🔗](#from-preference-to-prompt-completion-dataset) | [🔗](#from-preference-to-prompt-only-dataset) | [🔗](#from-explicit-to-implicit-prompt-preference-dataset) | N/A | [🔗](#from-preference-to-unpaired-preference-dataset) | N/A |
| Unpaired preference | [🔗](#from-unpaired-preference-to-language-modeling-dataset) | [🔗](#from-unpaired-preference-to-prompt-completion-dataset) | [🔗](#from-unpaired-preference-to-prompt-only-dataset) | N/A | N/A | N/A | N/A |
| Stepwise supervision | [🔗](#from-stepwise-supervision-to-language-modeling-dataset) | [🔗](#from-stepwise-supervision-to-prompt-completion-dataset) | [🔗](#from-stepwise-supervision-to-prompt-only-dataset) | N/A | N/A | [🔗](#from-stepwise-supervision-to-unpaired-preference-dataset) | N/A |
### From prompt-completion to language modeling dataset
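A minimal sketch of this conversion, assuming standard-format `"prompt"` and `"completion"` columns, concatenates the two fields into a single `"text"` column:

```python
from datasets import Dataset

dataset = Dataset.from_dict({
    "prompt": ["The sky is"],
    "completion": [" blue."],
})

def concat_prompt_completion(example):
    # Join prompt and completion into one language-modeling "text" field
    return {"text": example["prompt"] + example["completion"]}

dataset = dataset.map(concat_prompt_completion, remove_columns=["prompt", "completion"])
# dataset[0] -> {'text': 'The sky is blue.'}
```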
> [!WARNING]
> Keep in mind that the `"chosen"` and `"rejected"` completions in a preference dataset can both be good, or both be bad.
> Before applying [`unpair_preference_dataset`], please ensure that all `"chosen"` completions can be labeled as good and all `"rejected"` completions as bad.
> This can be verified by checking the absolute rating of each completion, e.g., from a reward model.
### From preference to language modeling dataset
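A minimal sketch, assuming explicit-prompt `"prompt"`/`"chosen"`/`"rejected"` columns, keeps the prompt plus the chosen completion and drops the rejected one:

```python
from datasets import Dataset

dataset = Dataset.from_dict({
    "prompt": ["The sky is"],
    "chosen": [" blue."],
    "rejected": [" green."],
})

def extract_text(example):
    # Treat prompt + chosen completion as the language-modeling text
    return {"text": example["prompt"] + example["chosen"]}

dataset = dataset.map(extract_text, remove_columns=["prompt", "chosen", "rejected"])
# dataset[0] -> {'text': 'The sky is blue.'}
```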
> [!WARNING]
> Keep in mind that the `"chosen"` and `"rejected"` completions in a preference dataset can both be good, or both be bad.
> Before applying [`unpair_preference_dataset`], please ensure that all `"chosen"` completions can be labeled as good and all `"rejected"` completions as bad.
> This can be verified by checking the absolute rating of each completion, e.g., from a reward model.
### From unpaired preference to language modeling dataset
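A minimal sketch, assuming `"prompt"`/`"completion"`/`"label"` columns, keeps only the completions labeled as desirable before concatenating:

```python
from datasets import Dataset

dataset = Dataset.from_dict({
    "prompt": ["The sky is", "The sun is"],
    "completion": [" blue.", " green."],
    "label": [True, False],
})

def concat_prompt_completion(example):
    return {"text": example["prompt"] + example["completion"]}

# Keep only the desirable completions, then build the "text" column
dataset = dataset.filter(lambda x: x["label"]).map(
    concat_prompt_completion, remove_columns=["prompt", "completion", "label"]
)
# dataset[0] -> {'text': 'The sky is blue.'}
```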
Some trainers also support fine-tuning vision-language models (VLMs) using image-text pairs.
A conversational vision dataset differs from a standard conversational dataset in two key ways:
1. The dataset must contain the key `images` with the image data (as lists of PIL images) or `image` with a single PIL image.
2. The `"content"` field in messages must be a list of dictionaries, where each dictionary specifies the type of data: `"image"` or `"text"`.
Example:
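A minimal sketch of such a sample, mirroring the mixed-data example at the end of this section (the image path is hypothetical):

```python
from PIL import Image

example = {
    "prompt": [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "What color is the sky in the image?"}]}],
    "completion": [{"role": "assistant", "content": [{"type": "text", "text": "It is blue."}]}],
    "images": [Image.open("path/to/sky_image.png")],  # hypothetical path
}
```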
An example of a conversational vision dataset is the [openbmb/RLAIF-V-Dataset](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset), which you can browse with the dataset viewer on the Hub.
> [!NOTE]
> Mixing text-only and vision-language data in the dataset is possible, but it requires `transformers` version 4.57.0 or later. Example:
>
> ```python
> import PIL.Image
> from datasets import Dataset
>
> dataset = Dataset.from_dict({
> "prompt": [
> [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "What color is the sky in the image?"}]}],
> [{"role": "user", "content": [{"type": "text", "text": "What is the capital of France?"}]}],
> ],
> "completion": [
> [{"role": "assistant", "content": [{"type": "text", "text": "It is blue."}]}],
> [{"role": "assistant", "content": [{"type": "text", "text": "Paris."}]}],
> ],
> "images": [
> [PIL.Image.open("path/to/sky_image1.png")],
> [],
> ],
> })
> ```
# Denoising Diffusion Policy Optimization
[![](https://img.shields.io/badge/All_models-DDPO-blue)](https://huggingface.co/models?other=ddpo,trl)
## The why
| Before | After DDPO finetuning |
| --- | --- |
| <div style="text-align: center"><img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/pre_squirrel.png"/></div> | <div style="text-align: center"><img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/post_squirrel.png"/></div> |
| <div style="text-align: center"><img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/pre_crab.png"/></div> | <div style="text-align: center"><img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/post_crab.png"/></div> |
| <div style="text-align: center"><img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/pre_starfish.png"/></div> | <div style="text-align: center"><img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/post_starfish.png"/></div> |
## Getting started with Stable Diffusion finetuning with reinforcement learning
The machinery for fine-tuning Stable Diffusion models with reinforcement learning makes heavy use of Hugging Face's `diffusers` library. Getting started therefore requires some familiarity with `diffusers` concepts, mainly two of them: pipelines and schedulers.

Right out of the box, the `diffusers` library provides neither a `Pipeline` nor a `Scheduler` instance suitable for fine-tuning with reinforcement learning, so some adjustments need to be made.

This library provides a pipeline interface that must be implemented in order to use the `DDPOTrainer`, the main machinery for fine-tuning Stable Diffusion with reinforcement learning. **Note: only the Stable Diffusion architecture is supported at this point.**

A default implementation of this interface is available out of the box. Assuming the default implementation is sufficient, or simply to get things moving, refer to the training example alongside this guide.

The point of the interface is to fuse the pipeline and the scheduler into one object, keeping all the constraints in a single place. The interface was designed to cater to pipelines and schedulers beyond the examples in this repository at the time of writing. The scheduler step is also a method of this pipeline interface; this may seem redundant given that the raw scheduler is accessible via the interface, but it is the only way to constrain the scheduler step output to an output type befitting the algorithm at hand (DDPO).

For a more detailed look at the interface and the associated default implementation, see [the source](https://github.com/lvwerra/trl/tree/main/trl/models/modeling_sd_base.py).

Note that the default implementation has both a LoRA and a non-LoRA implementation path. LoRA is enabled by default and can be turned off via the corresponding flag. LoRA-based training is faster, and the LoRA-related hyperparameters responsible for model convergence are less finicky than in non-LoRA training.

In addition, you are expected to provide a reward function and a prompt function. The reward function is used to evaluate the generated images, and the prompt function is used to generate the prompts that are used to generate the images.
## Getting started with `examples/scripts/ddpo.py`
The `ddpo.py` script is a working example of using the `DDPO` trainer to finetune a Stable Diffusion model. This example explicitly configures a small subset of the overall parameters associated with the config object (`DDPOConfig`).
**Note:** one A100 GPU is recommended to get this running. GPUs below an A100 will not be able to run this example script, and even if they do with relatively small parameter settings, the results will most likely be poor.

Almost every configuration parameter has a default. Only one command-line flag is required of the user to get things up and running: a [Hugging Face user access token](https://huggingface.co/docs/hub/security-tokens), which will be used to upload the fine-tuned model to the Hugging Face Hub. Run the following to get started:

```bash
python ddpo.py --hf_user_access_token <token>
```
To see the documentation of the script's options, run `python ddpo.py --help`.

The following are things to keep in mind (the code checks these for you as well) when configuring the trainer, beyond the use case of the example script (see the config sketch after this list):
- The configurable sample batch size (`--ddpo_config.sample_batch_size=6`) should be greater than or equal to the configurable training batch size (`--ddpo_config.train_batch_size=3`)
- The configurable sample batch size (`--ddpo_config.sample_batch_size=6`) must be divisible by the configurable train batch size (`--ddpo_config.train_batch_size=3`)
- The configurable sample batch size (`--ddpo_config.sample_batch_size=6`) must be divisible by both the configurable gradient accumulation steps (`--ddpo_config.train_gradient_accumulation_steps=1`) and the configurable accelerator processes count
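For illustration, a configuration satisfying these constraints on a machine with 2 accelerator processes might look like the following sketch; the specific values are assumptions, not recommendations:

```python
from trl import DDPOConfig

config = DDPOConfig(
    sample_batch_size=6,                  # >= train_batch_size, and 6 % 3 == 0
    train_batch_size=3,
    train_gradient_accumulation_steps=1,  # 6 is divisible by 1 and by 2 processes
)
```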
## Setting up the image logging hook function
Expect the function to be given a list of lists of the form
```python
[[image, prompt, prompt_metadata, rewards, reward_metadata], ...]
```
where `image`, `prompt`, `prompt_metadata`, `rewards`, and `reward_metadata` are batched.
The last list in the list of lists represents the last sample batch, which you will likely want to log.
While you are free to log however you want, using `wandb` or `tensorboard` is recommended.
### Key terms
- `rewards` : The reward/score is a numerical value associated with the generated image and is key to steering the RL process
- `reward_metadata` : The metadata associated with the reward. Think of it as extra information delivered alongside the reward
- `prompt` : The text used to generate the image
- `prompt_metadata` : The metadata associated with the prompt. This will not be empty when, for example, the reward model comprises a [`FLAVA`](https://huggingface.co/docs/transformers/model_doc/flava) setup where questions and ground-truth answers (linked to the generated image) are expected alongside the generated image (see here: https://github.com/kvablack/ddpo-pytorch/blob/main/ddpo_pytorch/rewards.py#L45)
- `image` : The image generated by the Stable Diffusion model
Example code for logging sampled images with `wandb` is given below.
```python
import numpy as np
from PIL import Image

# for logging these images to wandb
def image_outputs_hook(image_data, global_step, accelerate_logger):
# For the sake of this example, we only care about the last batch
# hence we extract the last element of the list
result = {}
images, prompts, _, rewards, _ = image_data[-1]
for i, image in enumerate(images):
pil = Image.fromarray(
(image.cpu().numpy().transpose(1, 2, 0) * 255).astype(np.uint8)
)
pil = pil.resize((256, 256))
result[f"{prompts[i]:.25} | {rewards[i]:.2f}"] = [pil]
accelerate_logger.log_images(
result,
step=global_step,
)
```
### Using the finetuned model
Assuming you've finished all the epochs and pushed your model up to the Hub, you can use the fine-tuned model as follows:
```python
import torch
from trl import DefaultDDPOStableDiffusionPipeline
pipeline = DefaultDDPOStableDiffusionPipeline("metric-space/ddpo-finetuned-sd-model")
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# memory optimization
pipeline.vae.to(device, torch.float16)
pipeline.text_encoder.to(device, torch.float16)
pipeline.unet.to(device, torch.float16)
prompts = ["squirrel", "crab", "starfish", "whale","sponge", "plankton"]
results = pipeline(prompts)
for prompt, image in zip(prompts,results.images):
image.save(f"{prompt}.png")
```
## Credits
This work is heavily influenced by [this repo](https://github.com/kvablack/ddpo-pytorch) and the associated paper [Training Diffusion Models with Reinforcement Learning by Kevin Black, Michael Janner, Yilun Du, Ilya Kostrikov, Sergey Levine](https://huggingface.co/papers/2305.13301).
## DDPOTrainer
[[autodoc]] DDPOTrainer
## DDPOConfig
[[autodoc]] DDPOConfig
# DeepSpeed Integration
> [!WARNING]
> Section under construction. Feel free to contribute!
TRL supports training with DeepSpeed, a library that implements advanced training optimization techniques. These include optimizer state partitioning, offloading, gradient partitioning, and more.
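As a minimal illustration, a training script can be launched with one of the DeepSpeed accelerate configs shipped in the repo (the config and script paths here are assumptions for the sketch):

```bash
accelerate launch --config_file examples/accelerate_configs/deepspeed_zero3.yaml trl/scripts/sft.py
```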
# Detoxifying a Language Model using PPO
Language models (LMs) are known to sometimes generate toxic outputs. In this example, we will show how to "detoxify" an LM by feeding it toxic prompts and then using [Transformer Reinforcement Learning (TRL)](https://huggingface.co/docs/trl/index) with Proximal Policy Optimization (PPO) to reduce its toxicity.
Read this section to follow our investigation of how we can reduce toxicity in a wide range of LMs, from 125M parameters to 6B parameters!
Here's an overview of the notebooks and scripts in the [TRL toxicity repository](https://github.com/huggingface/trl/tree/main/examples/toxicity/scripts) as well as the link for the interactive demo:
| File | Description | Colab link |
|---|---| --- |
| [`gpt-j-6b-toxicity.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/toxicity/scripts/gpt-j-6b-toxicity.py) | Detoxify `GPT-J-6B` using PPO | x |
| [`evaluate-toxicity.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/toxicity/scripts/evaluate-toxicity.py) | Evaluate de-toxified models using `evaluate` | x |
| [Interactive Space](https://huggingface.co/spaces/ybelkada/detoxified-lms)| An interactive Space that you can use to compare the original model with its detoxified version!| x |
## Context
Language models are trained on large volumes of text from the internet which also includes a lot of toxic content. Naturally, language models pick up the toxic patterns during training. Especially when prompted with already toxic texts the models are likely to continue the generations in a toxic way. The goal here is to "force" the model to be less toxic by feeding it toxic prompts and then using PPO to "detoxify" it.
### Computing toxicity scores
In order to optimize a model with PPO we need to define a reward. For this use case, we want a negative reward whenever the model generates something toxic and a positive reward when it does not.
Therefore, we used [`facebook/roberta-hate-speech-dynabench-r4-target`](https://huggingface.co/facebook/roberta-hate-speech-dynabench-r4-target), a RoBERTa model fine-tuned to classify between "neutral" and "toxic" text, as our toxicity classifier.
One could have also used different techniques to evaluate the toxicity of a model, or combined different toxicity classifiers, but for simplicity we have chosen to use this one.
### Selection of models
We selected the following models for our experiments to show that TRL can easily be scaled to 10B-parameter models:
* [`EleutherAI/gpt-neo-125M`](https://huggingface.co/EleutherAI/gpt-neo-125M) (125 million parameters)
* [`EleutherAI/gpt-neo-2.7B`](https://huggingface.co/EleutherAI/gpt-neo-2.7B) (2.7 billion parameters)
* [`EleutherAI/gpt-j-6B`](https://huggingface.co/EleutherAI/gpt-j-6B) (6 billion parameters)
For the smallest model, we chose `EleutherAI/gpt-neo-125M` because it proved to be the "most toxic" model compared to the others. We ran a toxicity evaluation using the `facebook/roberta-hate-speech-dynabench-r4-target` model on 4 different architectures over a subset of the `allenai/real-toxicity-prompts` dataset. Note that we computed the toxicity score on the generated text only (thus ignoring the prompt).
| Model | Mean toxicity score |
|---|---|
| `gpt2` | 0.01602 |
| `facebook/opt-350m` | 0.01628 |
| `bigscience/bloom-560m` | 0.00767 |
| `EleutherAI/gpt-neo-125M` | **0.02016** |
## Designing the problem
When doing PPO, it is very important to design the problem efficiently so that the model can learn to solve it. Let's cover the topics that were important for the model to converge.
### Pre-processing the dataset
The dataset consists of prompts and their continuations, and each of them has an associated `toxicity` score.
A `prompt` example:
```
{ "text": "Bronx officer Luis Rios alleged on-the-job tall tales were compounded by his lying about them in court, and", "profanity": 0.08360514, "sexually_explicit": 0.118823394, "identity_attack": 0.09608547, "flirtation": 0.23102224, "threat": 0.13215046, "insult": 0.10130461, "severe_toxicity": 0.04068885, "toxicity": 0.16534281 }
```
And its `continuation` value:
```
{ "text": " cost federal prosecutors all the drug evidence that the cop collected against an armed suspect — 16 baggies of cocaine during a strip search.", "severe_toxicity": 0.067997746, "toxicity": 0.1694093, "profanity": 0.11931301, "sexually_explicit": 0.12521537, "identity_attack": 0.09268324, "flirtation": 0.13452998, "threat": 0.31312028, "insult": 0.10761123 }
```
We want to increase the chance of the model generating toxic outputs so that we get more learning signal. For this reason, we pre-process the dataset to keep only prompts with a toxicity score greater than a threshold. We can do this in a few lines of code:
```python
from datasets import load_dataset

train_dataset = load_dataset("allenai/real-toxicity-prompts", split="train")
def filter_fn(sample):
toxicity = sample["prompt"]["toxicity"]
return toxicity is not None and toxicity > 0.3
train_dataset = train_dataset.filter(filter_fn, batched=False)
```
### Reward function
The reward function is one of the most important parts of training a model with reinforcement learning. It is the function that tells the model whether it is doing well or not.
We tried various combinations: the softmax of the "neutral" label, the log of the toxicity score, and the raw logits of the "neutral" label. We found that convergence was much smoother with the raw logits of the "neutral" label.
```python
# `toxicity_model` is the RoBERTa toxicity classifier introduced above and
# `toxicity_inputs` are its tokenized inputs (the generated texts to score)
logits = toxicity_model(**toxicity_inputs).logits.float()
rewards = (logits[:, 0]).tolist()  # raw logits of the "neutral" label
```
### Impact of input prompts length
We found that training the model with a small or a long context (5 to 8 tokens for the small context, 15 to 20 tokens for the long one) did not have any impact on convergence; however, models trained with longer prompts tended to generate more toxic output.
As a compromise between the two, we settled on a context window of 10 to 15 tokens for training.
<div style="text-align: center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-long-vs-short-context.png">
</div>
### How to deal with OOM issues
Our goal is to train models up to 6B parameters, which is about 24GB in float32! Here are two tricks we use to be able to train a 6B model on a single 40GB-RAM GPU:
- Use `bfloat16` precision: simply load your model in `bfloat16` when calling `from_pretrained`, and you can reduce the memory footprint of the model by half:
```python
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", dtype=torch.bfloat16)
```
and the optimizer will take care of computing the gradients in `bfloat16` precision. Note that this is pure `bfloat16` training, which is different from mixed-precision training. If you want to train a model in mixed precision, do not load the model with `dtype`, and instead specify the mixed-precision argument when running `accelerate config`.
- Use shared layers: since the PPO algorithm requires both the active and the reference model to be on the same device, we decided to use shared layers to reduce the memory footprint of the model. This can be achieved by specifying the `num_shared_layers` argument when calling the `create_reference_model()` function. For example, if you want to share the first 6 layers of the model, you can do it like this:
<div style="text-align: center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-shared-layers.png">
</div>
```python
ref_model = create_reference_model(model, num_shared_layers=6)
trainer = PPOTrainer(..., ref_model=ref_model)
```
In the example above this means that the first 6 layers of the model are frozen (i.e., these layers are shared between the active model and the reference model).
- You could also apply gradient checkpointing to reduce the memory footprint of the model by calling `model.pretrained_model.enable_gradient_checkpointing()` (although this has the downside of making training ~20% slower).
## Training the model!
We have decided to keep 3 models in total that correspond to our best models:
- [`ybelkada/gpt-neo-125m-detox`](https://huggingface.co/ybelkada/gpt-neo-125m-detox)
- [`ybelkada/gpt-neo-2.7B-detox`](https://huggingface.co/ybelkada/gpt-neo-2.7B-detox)
- [`ybelkada/gpt-j-6b-detox`](https://huggingface.co/ybelkada/gpt-j-6b-detox)
We used different learning rates for each model and found that the largest models were quite hard to train and can easily collapse if the learning rate is not chosen correctly (i.e., if it is too high):
<div style="text-align: center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-collapse-mode.png">
</div>
The final training run of `ybelkada/gpt-j-6b-detoxified-20shdl` looks like this:
<div style="text-align: center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-gpt-j-final-run-2.png">
</div>
As you can see, the model converges nicely, but obviously we don't observe a very large improvement from the first step, as the original model is not trained to generate toxic content.
Also, we observed that training with a larger `mini_batch_size` leads to smoother convergence and better results on the test set:
<div style="text-align: center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-gpt-j-mbs-run.png">
</div>
## Results
We tested our models on a new dataset, the [`OxAISH-AL-LLM/wiki_toxic`](https://huggingface.co/datasets/OxAISH-AL-LLM/wiki_toxic) dataset. We feed each model a toxic prompt from it (a sample with the label "toxic"), generate 30 new tokens as done in the training loop, and measure the toxicity score using `evaluate`'s [`toxicity` metric](https://huggingface.co/spaces/ybelkada/toxicity).
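As a rough illustration of the scoring step, a minimal sketch using the `evaluate` library might look like this, assuming `generations` holds the 30-token continuations produced by the model:

```python
import evaluate

toxicity = evaluate.load("toxicity", module_type="measurement")

# Score each generated continuation; returns one toxicity score per text
scores = toxicity.compute(predictions=generations)["toxicity"]
mean_toxicity = sum(scores) / len(scores)
```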
We compute the mean and standard deviation of the toxicity scores over 400 sampled examples and report the results in the table below:
| Model | Mean toxicity score | Std toxicity score |
| --- | --- | --- |
| `EleutherAI/gpt-neo-125m` | 0.1627 | 0.2997 |
| `ybelkada/gpt-neo-125m-detox` | **0.1148** | **0.2506** |
| `EleutherAI/gpt-neo-2.7B` | 0.1884 | 0.3178 |
| `ybelkada/gpt-neo-2.7B-detox` | **0.0916** | **0.2104** |
| `EleutherAI/gpt-j-6B` | 0.1699 | 0.3033 |
| `ybelkada/gpt-j-6b-detox` | **0.1510** | **0.2798** |
<div class="column" style="text-align:center">
<figure>
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-final-barplot.png" style="width:80%">
<figcaption>Toxicity score with respect to the size of the model.</figcaption>
</figure>
</div>
Below are a few generation examples from the `gpt-j-6b-detox` model:
<div style="text-align: center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-toxicity-examples.png">
</div>
The evaluation script can be found [here](https://github.com/huggingface/trl/blob/main/examples/research_projects/toxicity/scripts/evaluate-toxicity.py).
### Discussions
The results are quite promising: the models are able to reduce the toxicity score of the generated text by an interesting margin. The gap is clear for the `gpt-neo-2.7B` model, but less so for the `gpt-j-6B` model. There are several things we could try to improve the results on the largest model, starting with training with a larger `mini_batch_size` and probably allowing back-propagation through more layers (i.e., using fewer shared layers).
To sum up, in addition to human feedback, this could be a useful additional signal when training large language models to ensure their outputs are less toxic as well as useful.
### Limitations
We are also aware of consistent bias issues reported with toxicity classifiers, and of work evaluating the negative impact of toxicity reduction on the diversity of outcomes. We recommend that future work also compare the outputs of the detoxified models in terms of fairness and diversity before putting them to use.
## What is next?
You can download the model and use it out of the box with `transformers`, or play with the Space that compares the outputs of the models before and after detoxification [here](https://huggingface.co/spaces/ybelkada/detoxified-lms).
# Distributing Training
> [!WARNING]
> Section under construction. Feel free to contribute!
## Multi-GPU Training with TRL
```bash
accelerate launch --config_file examples/accelerate_configs/multi_gpu.yaml train.py
```
This automatically distributes the workload across all available GPUs.
Under the hood, [🤗 Accelerate](https://github.com/huggingface/accelerate) creates one model per GPU. Each process:
- Processes its own batch of data
- Computes the loss and gradients for that batch
- Shares gradient updates across all GPUs
![multi gpu](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/multi_gpu.png)
The effective batch size is calculated as:
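$$
\text{effective batch size} = \text{per\_device\_train\_batch\_size} \times \text{gradient\_accumulation\_steps} \times \text{num\_processes}
$$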
For example, these configurations are equivalent and should yield the same results:
| Number of GPUs | Per-device batch size | Gradient accumulation steps | Comment |
| --- | --- | --- | --- |
| 1 | 4 | 8 | Lower memory usage, slower training |
| 8 | 4 | 1 | Multi-GPU to get the best of both worlds |
> [!TIP]
> Having one model per GPU can lead to high memory usage, which may not be feasible for large models or low-memory GPUs. In such cases, you can leverage [DeepSpeed](https://github.com/deepspeedai/DeepSpeed), which provides optimizations like model sharding, Zero Redundancy Optimizer, mixed precision training, and offloading to CPU or NVMe. Check out our [DeepSpeed Integration](deepspeed_integration) guide for more details.

## Context Parallelism
Context Parallelism (CP) is a parallelization technique that enables training with longer sequences by splitting the sequence dimension across multiple GPUs. Each GPU processes a portion of the sequence, allowing you to train with sequences longer than what would fit on a single GPU's memory.
For more details on CP, see the [Ultrascale Playbook - Context Parallelism](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=context_parallelism).
CP is particularly useful when:
- You want to train with very long sequences (>32k tokens)
- Single GPU memory is insufficient for your desired sequence length
- You need to maintain sequence coherence across the full context
### Requirements and Limitations
CP has specific requirements:
1. **Accelerate 1.10 or higher** is required
2. **FSDP2 (PyTorch FSDP v2)** is required as the distributed training backend
3. **SDPA attention** - Flash Attention is currently not supported with CP
4. **Sequence length divisibility** - sequences must be divisible by `cp_size * 2`. This is now automatically handled using the `pad_to_multiple_of` parameter in the data collator, which works seamlessly with both standard and padding-free modes.
### Configuration
To enable CP, you need to configure both Accelerate and your training arguments:
#### Accelerate Configuration
Use one of the provided accelerate config files (e.g. [`context_parallel_2gpu.yaml`](https://github.com/huggingface/trl/blob/main/examples/accelerate_configs/context_parallel_2gpu.yaml) for 2 GPUs):
```yaml
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: FSDP
downcast_bf16: 'no'
enable_cpu_affinity: false
fsdp_config:
fsdp_activation_checkpointing: true # Enable activation checkpointing for memory efficiency
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_cpu_ram_efficient_loading: true
fsdp_offload_params: false
fsdp_reshard_after_forward: true
fsdp_state_dict_type: FULL_STATE_DICT
fsdp_version: 2
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 2 # Number of GPUs
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
parallelism_config:
parallelism_config_dp_replicate_size: 1
parallelism_config_dp_shard_size: 1
parallelism_config_tp_size: 1
parallelism_config_cp_size: 2 # Context parallel size
```
#### Training Configuration
```python
from trl import SFTConfig
training_args = SFTConfig(
# required
pad_to_multiple_of=4, # ensures divisibility by cp_size * 2
# to get the most out of CP
max_length=16384, # long sequence length
packing=True, # use packing to reduce padding
use_liger_kernel=True, # compatible with CP
gradient_checkpointing=False, # The activation_checkpointing in FSDP config and the gradient_checkpointing in training arg can't be set to True simultaneously
per_device_train_batch_size=1,
...
)
```
Then, launch your training script with the appropriate accelerate config file:
```bash
accelerate launch --config_file context_parallel_2gpu.yaml train.py
```
### Best Practices
1. **Use the `pad_to_multiple_of` parameter** - This is now the recommended way to ensure sequence length divisibility:
- For `cp_size=2`: use `pad_to_multiple_of=4` (since `cp_size * 2 = 4`)
- For `cp_size=4`: use `pad_to_multiple_of=8` (since `cp_size * 2 = 8`)
- The data collator automatically pads sequences to the required multiple, ensuring compatibility with CP
2. **Use packing with padding** - The default BFD (Best Fit Decreasing) strategy works perfectly:
- Preserves sequence boundaries and maintains training quality
- Works seamlessly with both `padding_free=True` and standard padding modes
3. **Combine with other memory optimizations** like Liger kernels, bfloat16, and gradient checkpointing
4. **Start with smaller context parallel sizes** (2-4 GPUs) before scaling up
5. **Monitor memory usage** across all GPUs to ensure balanced workload
### Benchmarking Context Parallelism
We benchmarked CP to highlight its potential improvements in training efficiency.
Our experiments were conducted using **1, 2, 4, and 8 H100 GPUs**, though the results can be extended to larger clusters with more nodes and GPUs.
For the setup, we fine-tuned an **8B model** ([Qwen/Qwen3-8B](https://huggingface.co/Qwen/Qwen3-8B)) using the provided accelerate configuration
([`context_parallel_2gpu.yaml`](https://github.com/huggingface/trl/blob/main/examples/accelerate_configs/context_parallel_2gpu.yaml)).
We adjusted `num_processes` and `parallelism_config_cp_size` based on the number of GPUs for each run.
Training was performed with the [sft.py](https://github.com/huggingface/trl/blob/main/trl/scripts/sft.py) example script, combined with the parameters described above.
The results below summarize the **maximum trainable sequence length** and **iterations per second** for different numbers of GPUs. A value marked as `OOM` indicates that the configuration ran out of memory and could not be trained.
These results show that **Context Parallelism (CP) scales effectively with more GPUs**, enabling training on much longer sequences. With **8 GPUs**, context lengths of over **300k tokens** become feasible, unlocking training with extremely long contexts while maintaining reasonable throughput.
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/context_parallelism_max_length_plot.png" alt="CP Max content length" width="45%"/>
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/context_parallelism_s_it_plot.png" alt="CP seconds/iteration" width="45%"/>
</div>
> [!TIP]
> Accelerate also supports **N-Dimensional Parallelism (ND-parallelism)**, which enables you to combine different parallelization strategies to efficiently distribute model training across multiple GPUs.
>
> You can learn more and explore configuration examples in the [Accelerate ND-parallelism guide](https://github.com/huggingface/accelerate/blob/main/examples/torch_native_parallelism/README.md#nd-parallelism).
### Further Reading on Context Parallelism
- [Accelerate: Context Parallelism Guide](https://github.com/huggingface/accelerate/blob/main/docs/source/concept_guides/context_parallelism.md)
- [Accelerate Example: 128k Sequence Length](https://github.com/huggingface/accelerate/blob/main/examples/torch_native_parallelism/README.md#context-parallelism-128k-sequence-length)
- [Hugging Face Blog: Enabling Long-Context Training with Sequence Parallelism in Axolotl](https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl)
- [Snowflake Engineering Blog: Arctic Long Sequence Training (ALST) — Scalable and Efficient Training for Multi-Million Token Sequences (Note that they use a different strategy)](https://www.snowflake.com/en/engineering-blog/arctic-long-sequence-training-multi-million-token-ai/)
## Multi-Node Training
We're working on a guide for multi-node training. Stay tuned! 🚀
# DPO Trainer
[![model badge](https://img.shields.io/badge/All_models-DPO-blue)](https://huggingface.co/models?other=dpo,trl) [![model badge](https://img.shields.io/badge/smol_course-Chapter_2-yellow)](https://github.com/huggingface/smol-course/tree/main/2_preference_alignment)
## Overview
Then, fine-tuning a language model via DPO consists of two steps and is easier than PPO.
This process is illustrated in the sketch below (from [Figure 1 of the DPO paper](https://huggingface.co/papers/2305.18290)):
![Figure 1 DPO](https://github.com/huggingface/trl/assets/49240599/9150fac6-3d88-4ca2-8ec6-2a6f3473216d)
Read more about DPO algorithm in the [original paper](https://huggingface.co/papers/2305.18290).
Additionally, unlike standard text-based models where a `tokenizer` is used, for vision-language models a `processor` is used.
For a complete example of fine-tuning a vision-language model, refer to the script in [`examples/scripts/dpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/dpo_vlm.py).
## Example script
We provide an example script to train a model using the DPO method. The script is available in [`trl/scripts/dpo.py`](https://github.com/huggingface/trl/blob/main/trl/scripts/dpo.py)
To scale how much the auxiliary loss contributes to the total loss, use the hyperparameter `router_aux_loss_coef` in the model config.
You can further accelerate QLoRA / LoRA (2x faster, 60% less memory) using the [`unsloth`](https://github.com/unslothai/unsloth) library that is fully compatible with `SFTTrainer`. Currently `unsloth` supports only Llama (Yi, TinyLlama, Qwen, Deepseek etc) and Mistral architectures. Some benchmarks for DPO listed below:
| GPU | Model | Dataset | 🤗 | 🤗 + FlashAttention 2 | 🦥 Unsloth | 🦥 VRAM saved |
| --- | --- | --- | --- | --- | --- | --- |
| A100 40G | Zephyr 7b | Ultra Chat | 1x | 1.24x | **1.88x** | -11.6% |
| Tesla T4 | Zephyr 7b | Ultra Chat | 1x | 1.09x | **1.55x** | -18.6% |
First install `unsloth` according to the [official documentation](https://github.com/unslothai/unsloth). Once installed, you can incorporate unsloth into your workflow in a very simple manner; instead of loading `AutoModelForCausalLM`, you just need to load a `FastLanguageModel` as follows:
## DataCollatorForPreference
[[autodoc]] trainer.dpo_trainer.DataCollatorForPreference
## FDivergenceType
[[autodoc]] trainer.dpo_trainer.FDivergenceType
# Examples
This directory contains a collection of examples that demonstrate how to use the TRL library for various applications. We provide both **scripts** for advanced use cases and **notebooks** for an easy start and interactive experimentation.
## Introduction
The notebooks are self-contained and can run on **free Colab**, while the scripts can run on **single GPU, multi-GPU, or DeepSpeed** setups.
The examples should work in any of the following settings (with the same script):
- single GPU
- multi GPUs (using PyTorch distributed mode)
- multi GPUs (using DeepSpeed ZeRO-Offload stages 1, 2, & 3)
- fp16 (mixed-precision), fp32 (normal precision), or bf16 (bfloat16 precision)
**Getting Started**

Install TRL and additional dependencies as follows:

```bash
pip install --upgrade trl[quantization]
```

Check for additional optional dependencies [here](https://github.com/huggingface/trl/blob/main/pyproject.toml).

For scripts, you will also need an 🤗 Accelerate config (recommended for multi-GPU settings):

```bash
accelerate config # will prompt you to define the training configuration
```

This allows you to run scripts with `accelerate launch` in single or multi-GPU settings.
## Notebooks
These notebooks are easier to run and are designed for quick experimentation with TRL. The list of notebooks can be found in the [`trl/examples/notebooks/`](https://github.com/huggingface/trl/tree/main/examples/notebooks/) directory.
| Notebook | Description | Open in Colab |
|----------|-------------|---------------|
| [`sft_trl_lora_qlora.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/sft_trl_lora_qlora.ipynb) | Supervised Fine-Tuning (SFT) using QLoRA on free Colab | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/notebooks/sft_trl_lora_qlora.ipynb) |
| [`sft_qwen_vl.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/sft_qwen_vl.ipynb) | Supervised Fine-Tuning (SFT) Qwen3-VL with QLoRA using TRL on free Colab | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/notebooks/sft_qwen_vl.ipynb) |
| [`grpo_qwen3_vl.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/grpo_qwen3_vl.ipynb) | GRPO Qwen3-VL with QLoRA using TRL on free Colab | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/notebooks/grpo_qwen3_vl.ipynb) |
### Legacy / Older Notebooks
- [`best_of_n.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/best_of_n.ipynb): This notebook demonstrates how to use the "Best of N" sampling strategy using TRL when fine-tuning your model with PPO.
- [`gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb): This notebook demonstrates how to reproduce the GPT2 imdb sentiment tuning example on a jupyter notebook.
- [`gpt2-sentiment-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment-control.ipynb): This notebook demonstrates how to reproduce the GPT2 sentiment control example on a jupyter notebook.
## Scripts
Scripts are maintained in the [`trl/scripts`](https://github.com/huggingface/trl/blob/main/trl/scripts) and [`examples/scripts`](https://github.com/huggingface/trl/blob/main/examples/scripts) directories. They show how to use different trainers such as `SFTTrainer`, `PPOTrainer`, `DPOTrainer`, `GRPOTrainer`, and more.
| File | Description |
| --- | --- |
| [`examples/scripts/alignprop.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/alignprop.py) | This script shows how to use the [`AlignPropTrainer`] to fine-tune a diffusion model. |
| [`examples/scripts/bco.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/bco.py) | This script shows how to use the [`KTOTrainer`] with the BCO loss to fine-tune a model to increase instruction-following, truthfulness, honesty, and helpfulness using the [openbmb/UltraFeedback](https://huggingface.co/datasets/openbmb/UltraFeedback) dataset. |
| [`examples/scripts/cpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/cpo.py) | This script shows how to use the [`CPOTrainer`] to fine-tune a model to increase helpfulness and harmlessness using the [Anthropic/hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf) dataset. |
| [`examples/scripts/ddpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ddpo.py) | This script shows how to use the [`DDPOTrainer`] to fine-tune a stable diffusion model using reinforcement learning. |
| [`trl/scripts/dpo.py`](https://github.com/huggingface/trl/blob/main/trl/scripts/dpo.py) | This script shows how to use the [`DPOTrainer`] to fine-tune a model. |
| [`examples/scripts/dpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/dpo_vlm.py) | This script shows how to use the [`DPOTrainer`] to fine-tune a Vision Language Model to reduce hallucinations using the [openbmb/RLAIF-V-Dataset](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset) dataset. |
| [`examples/scripts/evals/judge_tldr.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/evals/judge_tldr.py) | This script shows how to use [`HfPairwiseJudge`] or [`OpenAIPairwiseJudge`] to judge model generations. |
| [`examples/scripts/gkd.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/gkd.py) | This script shows how to use the [`GKDTrainer`] to fine-tune a model. |
| [`trl/scripts/grpo.py`](https://github.com/huggingface/trl/blob/main/trl/scripts/grpo.py) | This script shows how to use the [`GRPOTrainer`] to fine-tune a model. |
| [`examples/scripts/grpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/grpo_vlm.py) | This script shows how to use the [`GRPOTrainer`] to fine-tune a multimodal model for reasoning using the [lmms-lab/multimodal-open-r1-8k-verified](https://huggingface.co/datasets/lmms-lab/multimodal-open-r1-8k-verified) dataset. |
| [`examples/scripts/gspo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/gspo.py) | This script shows how to use GSPO via the [`GRPOTrainer`] to fine-tune model for reasoning using the [AI-MO/NuminaMath-TIR](https://huggingface.co/datasets/AI-MO/NuminaMath-TIR) dataset. |
| [`examples/scripts/gspo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/gspo_vlm.py) | This script shows how to use GSPO via the [`GRPOTrainer`] to fine-tune a multimodal model for reasoning using the [lmms-lab/multimodal-open-r1-8k-verified](https://huggingface.co/datasets/lmms-lab/multimodal-open-r1-8k-verified) dataset. |
| [`examples/scripts/kto.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/kto.py) | This script shows how to use the [`KTOTrainer`] to fine-tune a model. |
| [`examples/scripts/mpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/mpo_vlm.py) | This script shows how to use MPO via the [`DPOTrainer`] to align a model based on preferences using the [HuggingFaceH4/rlaif-v_formatted](https://huggingface.co/datasets/HuggingFaceH4/rlaif-v_formatted) dataset and a set of loss weights. |
| [`examples/scripts/nash_md.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/nash_md.py) | This script shows how to use the [`NashMDTrainer`] to fine-tune a model. |
| [`examples/scripts/ppo/ppo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo/ppo.py) | This script shows how to use the [`PPOTrainer`] to fine-tune a model to improve its ability to continue text with positive sentiment or physically descriptive language. |
| [`examples/scripts/ppo/ppo_tldr.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo/ppo_tldr.py) | This script shows how to use the [`PPOTrainer`] to fine-tune a model to improve its ability to generate TL;DR summaries. |
| [`examples/scripts/prm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/prm.py) | This script shows how to use the [`PRMTrainer`] to fine-tune a Process-supervised Reward Model (PRM). |
| [`examples/scripts/reward_modeling.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/reward_modeling.py) | This script shows how to use the [`RewardTrainer`] to train an Outcome Reward Model (ORM) on your own dataset. |
| [`examples/scripts/rloo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/rloo.py) | This script shows how to use the [`RLOOTrainer`] to fine-tune a model to improve its ability to solve math questions. |
| [`trl/scripts/sft.py`](https://github.com/huggingface/trl/blob/main/trl/scripts/sft.py) | This script shows how to use the [`SFTTrainer`] to fine-tune a model. |
| [`examples/scripts/sft_gemma3.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/sft_gemma3.py) | This script shows how to use the [`SFTTrainer`] to fine-tune a Gemma 3 model. |
| [`examples/scripts/sft_video_llm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/sft_video_llm.py) | This script shows how to use the [`SFTTrainer`] to fine-tune a Video Language Model. |
| [`examples/scripts/sft_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/sft_vlm.py) | This script shows how to use the [`SFTTrainer`] to fine-tune a Vision Language Model in a chat setting. The script has only been tested with [LLaVA 1.5](https://huggingface.co/llava-hf/llava-1.5-7b-hf), [LLaVA 1.6](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf), and [Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) models, so users may see unexpected behaviour in other model architectures. |
| [`examples/scripts/sft_vlm_gemma3.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/sft_vlm_gemma3.py) | This script shows how to use the [`SFTTrainer`] to fine-tune a Gemma 3 model on vision to text tasks. |
| [`examples/scripts/sft_vlm_smol_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/sft_vlm_smol_vlm.py) | This script shows how to use the [`SFTTrainer`] to fine-tune a SmolVLM model. |
| [`examples/scripts/xpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/xpo.py) | This script shows how to use the [`XPOTrainer`] to fine-tune a model. |
Here are also some easier-to-run Colab notebooks that you can use to get started with TRL:
| File | Description |
| --- | --- |
| [`examples/notebooks/best_of_n.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/best_of_n.ipynb) | This notebook demonstrates how to use the "Best of N" sampling strategy using TRL when fine-tuning your model with PPO. |
| [`examples/notebooks/gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb) | This notebook demonstrates how to reproduce the GPT2 IMDB sentiment tuning example in a Jupyter notebook. |
| [`examples/notebooks/gpt2-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-control.ipynb) | This notebook demonstrates how to reproduce the GPT2 sentiment control example in a Jupyter notebook. |
We also have some other examples that are less maintained but can be used as a reference:
1. **[research_projects](https://github.com/huggingface/trl/tree/main/examples/research_projects)**: Check out this folder to find the scripts used for some research projects that used TRL (LM de-toxification, Stack-Llama, etc.)
## Distributed training
All the scripts can be run on multiple GPUs by providing the path of an 🤗 Accelerate config file when calling `accelerate launch`. To launch one of them on one or multiple GPUs, run the following command (swapping `{NUM_GPUS}` with the number of GPUs in your machine and `--all_arguments_of_the_script` with your arguments).
```shell
accelerate launch --config_file=examples/accelerate_configs/multi_gpu.yaml --num_processes {NUM_GPUS} path_to_script.py --all_arguments_of_the_script
```
You can also adjust the parameters of the 🤗 Accelerate config file to suit your needs (e.g. training in mixed precision).
### Distributed training with DeepSpeed
Most of the scripts can be run on multiple GPUs together with DeepSpeed ZeRO-{1,2,3} for efficient sharding of the optimizer states, gradients, and model weights. To do so, run the following command (swapping `{NUM_GPUS}` with the number of GPUs in your machine, `--all_arguments_of_the_script` with your arguments, and `--deepspeed_config` with the path to the DeepSpeed config file such as `examples/deepspeed_configs/deepspeed_zero1.yaml`):
```shell
accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero{1,2,3}.yaml --num_processes {NUM_GPUS} path_to_script.py --all_arguments_of_the_script
```


@ -0,0 +1,31 @@
# Experimental
This directory contains a minimal, clearly separated space for fast iteration on new ideas.
> [!WARNING]
> **Stability contract:** Anything under `trl.experimental` may change or be removed in *any* release (including patch versions) without prior deprecation. Do not rely on these APIs for production workloads.
## Promotion Path (Simple)
1. **Prototype outside the main repo:** Start development in your own fork or a separate repository to iterate quickly.
2. **Experimental inclusion:** Once it's ready for early users, move the idea into `trl.experimental.<feature>`.
3. **Improve:** Add tests and a short doc/example demonstrating usage.
4. **Promote:** Once the API proves stable and there is clear interest or adoption from the community, move it into `trl.<feature>` (stable module).
## FAQ
**Why not just use branches?**
Because branches are not shipped to users; experimental code inside the package lets early adopters try things and give feedback.
**Can these APIs change or vanish without warning?**
Yes. Anything inside `trl.experimental` can change or disappear in *any* release.
**Should I use this in production?**
Only if you are fine with updating your code quickly when things change.
**Will maintainers promptly fix issues in `trl.experimental`?**
Not necessarily. The experimental module is a playground for new ideas, and maintainers may not prioritize bug fixes or feature requests there. Issues may remain unresolved until (or unless) the feature graduates to the stable API.
**How do I silence the runtime notice?**
Use: `export TRL_EXPERIMENTAL_SILENCE=1`.
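For instance, a minimal sketch that silences the notice from within Python, assuming the notice is emitted at import time and reads this environment variable (the `gfpo` module is just an illustrative example):

```python
import os

# Must be set before the first `trl.experimental` import.
os.environ["TRL_EXPERIMENTAL_SILENCE"] = "1"

from trl.experimental.gfpo import GFPOConfig, GFPOTrainer  # imports without printing the notice
```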

docs/source/gfpo.md

@ -0,0 +1,39 @@
# GFPO
This feature implements the GFPO algorithm to enforce concise reasoning in the model's output generation, as proposed in the paper [Sample More to Think Less: Group Filtered Policy Optimization for Concise Reasoning](https://huggingface.co/papers/2508.09726).
## Usage
To activate GFPO in [`GFPOTrainer`]:
- set `num_remains_in_group` in [`GFPOConfig`]
- define a group filter function and set it to `group_filter_func` in [`GFPOTrainer`]. `group_filter_func` scores the `num_generations` completions, and the [`GFPOTrainer`] filters each group according to these scores, keeping the top `num_remains_in_group` completions as the new group. The model is then trained on the filtered group.
```python
# train_gfpo.py
from trl.experimental.gfpo import GFPOConfig, GFPOTrainer
# dummy group filter that scores the completions based on their index in the group
class GroupFilter:
def __call__(self, group_completions, group_rewards, **kwargs):
group_scores = []
for completions, rewards in zip(group_completions, group_rewards):
scores = [float(i) for i in range(len(completions))]
group_scores.append(scores)
return group_scores
training_args = GFPOConfig(
output_dir="Qwen3-0.6B-GFPO",
per_device_train_batch_size=4,
num_remains_in_group=2,
bf16=True,
)
trainer = GFPOTrainer(
model="Qwen/Qwen3-0.6B",
reward_funcs=...,
train_dataset=...,
args=training_args,
group_filter_func=GroupFilter(),
)
trainer.train()
```

View File

@ -1,17 +1,17 @@
# Generalized Knowledge Distillation Trainer
[![model badge](https://img.shields.io/badge/All_models-GKD-blue)](https://huggingface.co/models?other=gkd,trl)
## Overview
Generalized Knowledge Distillation (GKD) was proposed in [On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes](https://huggingface.co/papers/2306.13649) by Rishabh Agarwal, Nino Vieillard, Yongchao Zhou, Piotr Stanczyk, Sabela Ramos, Matthieu Geist, and Olivier Bachem.
The abstract from the paper is the following:
> Knowledge distillation (KD) is widely used for compressing a teacher model to reduce its inference cost and memory footprint, by training a smaller student model. However, current KD methods for auto-regressive sequence models suffer from distribution mismatch between output sequences seen during training and those generated by the student during inference. To address this issue, we introduce Generalized Knowledge Distillation (GKD). Instead of solely relying on a fixed set of output sequences, GKD trains the student on its self-generated output sequences by leveraging feedback from the teacher on such sequences. Unlike supervised KD approaches, GKD also offers the flexibility to employ alternative loss functions between the student and teacher, which can be useful when the student lacks the expressivity to mimic the teacher's distribution. Furthermore, GKD facilitates the seamless integration of distillation with RL fine-tuning (RLHF). We demonstrate the efficacy of GKD for distilling auto-regressive language models on summarization, translation, and arithmetic reasoning tasks, and task-agnostic distillation for instruction-tuning.
The key aspects of GKD are:
1. It addresses the train-inference distribution mismatch in auto-regressive sequence models by training the student model on its self-generated output sequences.
2. GKD allows flexibility in choosing different divergence measures between student and teacher models via the generalized Jensen-Shannon Divergence (JSD), which can be useful when the student lacks the capacity to fully mimic the teacher.
@ -20,6 +20,7 @@ This post-training method was contributed by [Kashif Rasul](https://huggingface.
## Usage tips
The [`GKDTrainer`] is a wrapper around the [`SFTTrainer`] class that takes in a teacher model argument. It needs three parameters to be set via the [`GKDConfig`] namely:
* `lmbda`: controls the student data fraction, i.e., the proportion of on-policy student-generated outputs. When `lmbda=0.0`, the loss reduces to supervised JSD, where the student is trained with the token-level probabilities of the teacher. When `lmbda=1.0`, the loss reduces to on-policy JSD, where the student generates output sequences and receives token-specific feedback on these sequences from the teacher. For values in between [0, 1], the loss randomly switches between the two for each batch, based on the `lmbda` value.
* `seq_kd`: controls whether to perform Sequence-Level KD (can be viewed as supervised FT on teacher-generated output). When `seq_kd=True` and `lmbda=0.0`, the loss reduces to supervised JSD, where the teacher generates output sequences and the student receives token-specific feedback on these sequences from the teacher.
* `beta`: controls the interpolation in the generalized Jensen-Shannon Divergence. When `beta=0.0` the loss approximates forward KL divergence, while for `beta=1.0` the loss approximates reverse KL divergence. For values in between [0, 1] it interpolates between the two.
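As an illustration, here is a minimal sketch of setting these three knobs on the config (the values are illustrative, not recommendations):

```python
from trl import GKDConfig

training_args = GKDConfig(
    output_dir="gkd-model",
    lmbda=0.5,     # ~50% of batches use on-policy, student-generated outputs
    beta=0.5,      # generalized JSD midpoint: 0.0 approximates forward KL, 1.0 reverse KL
    seq_kd=False,  # set True for sequence-level KD on teacher-generated outputs
)
```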
@ -85,6 +86,7 @@ trainer.train()
### Expected dataset type
The dataset should be formatted as a list of "messages" where each message is a list of dictionaries with the following keys:
* `role`: either `system`, `assistant` or `user`
* `content`: the message content

docs/source/gold_trainer.md

@ -0,0 +1,120 @@
# General Online Logit Distillation (GOLD) Trainer
[![All_models-GOLD-blue](https://img.shields.io/badge/All_models-GOLD-blue)](https://huggingface.co/models?other=sft,gold)
## Overview
General Online Logit Distillation (GOLD) is an extension of Universal Logit Distillation (ULD) that supports
student/teacher pairs with different tokenizers. It aligns the textual spans produced by both tokenizers and merges the
associated logits so no completion tokens are dropped. This enables cross-tokenizer knowledge distillation, including
mixed model families (for example, LLaMA students with Qwen teachers).
Key capabilities:
1. **Cross-tokenizer alignment**: GOLD incrementally decodes the student and teacher tokens, groups passages with the same visible text, and merges probabilities inside each group. This guarantees loss terms are computed over the full completion even when token boundaries differ.
2. **Hybrid ULD loss**: when `uld_use_hybrid_loss` is enabled, GOLD compares exact vocabulary matches directly and falls back to the original sorted-probability ULD loss for unmatched tokens. This improves stability for students whose vocabularies only partially overlap with the teacher's.
3. **Seamless integration with GKD**: GOLD inherits the on-policy vs. off-policy scheduling from the [`GKDTrainer`](./gkd_trainer.md), so you can combine sequence-level KD, generalized JSD, and cross-tokenizer distillation in a single training run.
> [!NOTE]
> GOLD is currently part of the `trl.experimental` namespace. APIs may change without notice while the feature is iterated on.
## Usage tips
The [`GOLDTrainer`] subclasses [`SFTTrainer`] and accepts the same datasets as other TRL trainers (lists of ChatML style
messages). Important configuration flags on [`GOLDConfig`] include:
* `use_uld_loss`: toggles Universal Logit Distillation. Set this to `True` for cross-tokenizer setups.
* `teacher_tokenizer_name_or_path`: required when `use_uld_loss=True`; GOLD uses the teacher tokenizer to align tokens.
* `uld_use_hybrid_loss`, `uld_hybrid_matched_weight`, `uld_hybrid_unmatched_weight`: enable and weight the hybrid matched/unmatched loss.
* `beta`, `lmbda`, `seq_kd`: inherited from `GKDConfig`, controlling the generalized JSD interpolation and on-policy sampling ratio.
A minimal end-to-end example:
```python
from datasets import load_dataset
from trl.experimental.gold import GOLDConfig, GOLDTrainer
train_dataset = load_dataset(
"HuggingFaceTB/OpenR1-Math-220k-default-verified",
"all",
split="train[:1024]",
)
trainer = GOLDTrainer(
model="meta-llama/Llama-3.2-1B-Instruct",
teacher_model="Qwen/Qwen2.5-0.5B-Instruct",
args=GOLDConfig(output_dir="gold-model", use_uld_loss=True, teacher_tokenizer_name_or_path="Qwen/Qwen2.5-0.5B-Instruct"),
train_dataset=train_dataset,
)
trainer.train()
```
For quick-start workflows you can rely on string identifiers as shown above—the trainer will load the model and tokenizer for you. Explicitly instantiating `AutoModelForCausalLM`, `AutoTokenizer`, or populating `GOLDConfig` is recommended only for advanced use cases where you need fine-grained control over initialization.
A more explicit setup might look like this when you need to customise model loading, tokenizer settings, or training arguments:
```python
from datasets import load_dataset
from trl.experimental.gold import GOLDConfig, GOLDTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer
student_name = "meta-llama/Llama-3.2-1B-Instruct"
teacher_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(student_name)
if tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(student_name)
teacher_model = AutoModelForCausalLM.from_pretrained(teacher_name)
train_dataset = load_dataset(
"HuggingFaceTB/Countdown-Task-GOLD",
"verified_Qwen2.5-0.5B-Instruct",
split="train",
)
training_args = GOLDConfig(
output_dir="gold-model",
per_device_train_batch_size=1,
teacher_model=teacher_name,
teacher_tokenizer_name_or_path=teacher_name,
use_uld_loss=True,
uld_use_hybrid_loss=True,
)
trainer = GOLDTrainer(
model=model,
teacher_model=teacher_model,
args=training_args,
processing_class=tokenizer,
train_dataset=train_dataset,
)
trainer.train()
```
### Expected dataset type
GOLD requires a [conversational](dataset_formats#conversational) [language modeling](dataset_formats#language_modeling) dataset, e.g.:
```python
{"messages": [{"role": "user", "content": "What color is the sky?"},
{"role": "assistant", "content": "It is blue."}]}
```
`GOLDTrainer` keeps the raw messages so the ChatML collator can construct prompts and completions with the correct
boundaries.
## GOLDTrainer
[[autodoc]] experimental.gold.GOLDTrainer
- train
- generate_on_policy_outputs
- save_model
- push_to_hub
## GOLDConfig
[[autodoc]] experimental.gold.GOLDConfig

View File

@ -1,6 +1,6 @@
# GRPO Trainer
[![model badge](https://img.shields.io/badge/All_models-GRPO-blue)](https://huggingface.co/models?other=grpo,trl)
## Overview
@ -56,13 +56,13 @@ accelerate launch train_grpo.py
Distributed across 8 GPUs, the training takes approximately 1 day.
![GRPO curves](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/grpo_curves.png)
## Looking deeper into the GRPO method
GRPO is an online learning algorithm, meaning it improves iteratively by using the data generated by the trained model itself during training. The intuition behind the GRPO objective is to maximize the advantage of the generated completions, while ensuring that the model remains close to the reference policy. To understand how GRPO works, it can be broken down into four main steps: **Generating completions**, **computing the advantage**, **estimating the KL divergence**, and **computing the loss**.
![GRPO visual](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/grpo_visual.png)
### Generating completions
@ -76,17 +76,11 @@ $$\hat{A}_{i,t} = \frac{r_i - \text{mean}(\mathbf{r})}{\text{std}(\mathbf{r})}$$
This approach gives the method its name: **Group Relative Policy Optimization (GRPO)**.
> [!TIP]
> It was shown in the paper [Understanding R1-Zero-Like Training: A Critical Perspective](https://huggingface.co/papers/2503.20783) that scaling by \\( \text{std}(\mathbf{r}) \\) may cause a question-level difficulty bias. You can disable this scaling by setting `scale_rewards=False` in [`GRPOConfig`].
> [!TIP]
> As shown in [Part I: Tricks or Traps? A Deep Dive into RL for LLM Reasoning (Lite PPO)](https://huggingface.co/papers/2508.08221), calculating the mean at the local (group) level and the standard deviation at the global (batch) level enables more robust reward shaping. You can use this scaling strategy by setting `scale_rewards="batch"` in [`GRPOConfig`].
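As a hedged illustration, either strategy is a one-line change on the config (`output_dir` is a placeholder):

```python
from trl import GRPOConfig

# Disable std scaling entirely (per the R1-Zero critique) ...
training_args = GRPOConfig(output_dir="grpo-model", scale_rewards=False)

# ... or scale by the batch-level std instead (Lite PPO).
training_args = GRPOConfig(output_dir="grpo-model", scale_rewards="batch")
```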
### Estimating the KL divergence
@ -105,17 +99,11 @@ $$
where the first term represents the scaled advantage and the second term penalizes deviations from the reference policy through KL divergence.
> [!TIP]
> Note that compared to the original formulation in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300), we don't scale by \\( \frac{1}{|o_i|} \\) because it was shown in the paper [Understanding R1-Zero-Like Training: A Critical Perspective](https://huggingface.co/papers/2503.20783) that this introduces a response-level length bias. More details in [loss types](#loss-types).
> [!TIP]
> Note that compared to the original formulation in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300), we use \\( \beta = 0.0 \\) by default, meaning that the KL divergence term is not used. This choice is motivated by several recent studies (e.g., [Open-Reasoner-Zero: An Open Source Approach to Scaling Up Reinforcement Learning on the Base Model](https://huggingface.co/papers/2503.24290)) which have shown that the KL divergence term is not essential for training with GRPO. As a result, it has become common practice to exclude it (e.g. [Understanding R1-Zero-Like Training: A Critical Perspective](https://huggingface.co/papers/2503.20783), [DAPO: An Open-Source LLM Reinforcement Learning System at Scale](https://huggingface.co/papers/2503.14476)). If you wish to include the KL divergence term, you can set `beta` in [`GRPOConfig`] to a non-zero value.
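For example, a minimal sketch re-enabling the KL term (the coefficient value is illustrative, not a recommendation):

```python
from trl import GRPOConfig

training_args = GRPOConfig(
    output_dir="grpo-model",
    beta=0.04,  # a non-zero beta turns the KL penalty against the reference policy back on
)
```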
In the original paper, this formulation is generalized to account for multiple updates after each generation (denoted \\( \mu \\), which can be set with `num_iterations` in [`GRPOConfig`]) by leveraging the **clipped surrogate objective**:
@ -167,7 +155,7 @@ While training and evaluating, we record the following reward metrics:
- `completions/mean_terminated_length`: The average length of generated completions that terminate with EOS.
- `completions/min_terminated_length`: The minimum length of generated completions that terminate with EOS.
- `completions/max_terminated_length`: The maximum length of generated completions that terminate with EOS.
- `completions/clipped_ratio`: The ratio of truncated (clipped) completions.
- `reward/{reward_func_name}/mean`: The average reward from a specific reward function.
- `reward/{reward_func_name}/std`: The standard deviation of the reward from a specific reward function.
- `reward`: The overall average reward after applying reward weights.
@ -178,10 +166,10 @@ While training and evaluating, we record the following reward metrics:
- `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, masked sequence tokens are excluded.)
- `kl`: The average KL divergence between the model and the reference model, calculated over generated completions. Logged only if `beta` is nonzero.
- `clip_ratio/region_mean`: The ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities where the GRPO objective is clipped to stay within the trust region:
$$
\text{clip}\left( r_{i,t}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \qquad r_{i,t}(\theta) = \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})}\,.
$$

A higher value means more tokens are clipped, which constrains how much the policy $\pi_\theta$ can change.
- `clip_ratio/low_mean`: The average ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the lower bound of the trust region: \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\)
- `clip_ratio/low_min`: The minimum ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the lower bound of the trust region: \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\)
- `clip_ratio/high_mean`: The average ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the upper bound of the trust region: \\(r_{i,t}(\theta) > 1 + \epsilon_\mathrm{high}\\)
@ -192,26 +180,28 @@ A higher value means more tokens are clipped, which constrains how much the poli
### Speed up training with vLLM-powered generation
Generation is often the main bottleneck when training with online methods. To accelerate generation, you can use [vLLM](https://github.com/vllm-project/vllm), a high-throughput, low-latency inference engine for LLMs. To enable it, first install the package with
```shell
pip install trl[vllm]
```
We support two ways of using vLLM during training: **server mode** and **colocate mode**.
> [!TIP]
> By default, Truncated Importance Sampling is activated for vLLM generation to address the generation-training mismatch that occurs when using different frameworks. This can be turned off by setting `vllm_importance_sampling_correction=False`. For more information, see [Truncated Importance Sampling](paper_index#truncated-importance-sampling).
#### 🔌 Option 1: Server mode
In this mode, vLLM runs in a separate process (and on separate GPUs) and communicates with the trainer via HTTP. This is ideal if you have dedicated GPUs for inference.
1. **Start the vLLM server**:
```bash
trl vllm-serve --model <model_name>
```
2. **Enable server mode in your training script**:
```python
from trl import GRPOConfig
@ -222,11 +212,8 @@ In this mode, vLLM runs in a separate process (and using separate GPUs) and comm
)
```
> [!WARNING]
> Make sure that the server is using different GPUs than the trainer, otherwise you may run into NCCL errors. You can specify the GPUs to use with the `CUDA_VISIBLE_DEVICES` environment variable.
#### 🧩 Option 2: Colocate mode
@ -242,30 +229,19 @@ training_args = GRPOConfig(
)
```
> [!TIP]
> Depending on the model size and the overall GPU memory requirements for training, you may need to adjust the `vllm_gpu_memory_utilization` parameter in [`GRPOConfig`] to avoid underutilization or out-of-memory errors.
>
> We provide a [HF Space](https://huggingface.co/spaces/trl-lib/recommend-vllm-memory) to help estimate the recommended GPU memory utilization based on your model configuration and experiment settings. Simply use it as follows to get `vllm_gpu_memory_utilization` recommendation:
>
> <iframe src="https://trl-lib-recommend-vllm-memory.hf.space" frameborder="0" width="850" height="450"></iframe>
>
> If the recommended value does not work in your environment, we suggest adding a small buffer (e.g., +0.05 or +0.1) to the recommended value to ensure stability.
>
> If you still find you are getting out-of-memory errors set `vllm_enable_sleep_mode` to True and the vllm parameters and cache will be offloaded during the optimization step. For more information, see [Reducing Memory Usage with vLLM Sleep Mode](reducing_memory_usage#vllm-sleep-mode).
> [!TIP]
> By default, GRPO uses `MASTER_ADDR=localhost` and `MASTER_PORT=12345` for vLLM, but you can override these values by setting the environment variables accordingly.
For more information, see [Speeding up training with vLLM](speeding_up_training#vllm-for-fast-generation-in-online-methods).
@ -273,7 +249,7 @@ For more information, see [Speeding up training with vLLM](speeding_up_training#
When training large models like **Qwen2.5-72B**, you need several key optimizations to make the training efficient and scalable across multiple GPUs and nodes. These include:
- **DeepSpeed ZeRO Stage 3**: ZeRO leverages data parallelism to distribute model states (weights, gradients, optimizer states) across multiple GPUs and CPUs, reducing memory and compute requirements on each device. Since large models cannot fit on a single GPU, using ZeRO Stage 3 is required for training such models. For more details, see [DeepSpeed Integration](deepspeed_integration).
- **Accelerate**: Accelerate is a library that simplifies distributed training across multiple GPUs and nodes. It provides a simple API to launch distributed training and handles the complexities of distributed training, such as data parallelism, gradient accumulation, and distributed data loading. For more details, see [Distributing Training](distributing_training).
- **vLLM**: See the previous section on how to use vLLM to speed up generation.
@ -352,7 +328,7 @@ The [`GRPOTrainer`] supports using custom reward functions instead of dense rewa
- `completions` (contains the generated completions),
- `completions_ids` (contains the tokenized completions),
- `trainer_state` ([`~transformers.TrainerState`]): The current state of the trainer. This can be used to implement dynamic reward functions, such as curriculum learning, where the reward is adjusted based on the training progress.
- All column names (but `prompt`) that the dataset may have. For example, if the dataset contains a column named `ground_truth`, the function will be called with `ground_truth` as a keyword argument.
The easiest way to comply with this requirement is to use `**kwargs` in the function signature.
- Depending on the dataset format, the input will vary:
@ -381,7 +357,7 @@ You can test it as follows:
[2.0, 4.0]
```
#### Example 1.1: Reward longer completions (based on the number of characters)
Same as the previous example, but this time the reward function is based on the number of characters instead of tokens.
@ -401,10 +377,10 @@ You can test it as follows:
[6.0, 12.0]
```
#### Example 2: Reward completions with a specific format
Below is an example of a reward function that checks if the completion has a specific format. This example is inspired by the _format reward_ function used in the paper [DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning](https://huggingface.co/papers/2501.12948).
It is designed for a conversational format, where prompts and completions consist of structured messages.
```python
import re
@ -457,6 +433,7 @@ You can test this function as follows:
>>> reward_func(prompts=prompts, completions=completions, ground_truth=ground_truth)
[1.0, 0.0]
```
#### Example 4: Multi-task reward functions
Below is an example of using multiple reward functions in the [`GRPOTrainer`]. In this example, we define two task-specific reward functions: `math_reward_func` and `coding_reward_func`. The `math_reward_func` rewards math problems based on their correctness, while the `coding_reward_func` rewards coding problems based on whether the solution works.
@ -513,12 +490,10 @@ trainer = GRPOTrainer(
trainer.train()
```
In this example, the `math_reward_func` and `coding_reward_func` are designed to work with a mixed dataset that contains both math and coding problems. The `task` column in the dataset is used to determine which reward function to apply to each problem. If there is no relevant reward function for a sample in the dataset, the reward function will return `None`, and the [`GRPOTrainer`] will continue with the valid functions and tasks. This allows the [`GRPOTrainer`] to handle multiple reward functions with different applicability.
Note that the [`GRPOTrainer`] will ignore the `None` rewards returned by the reward functions and only consider the rewards returned by the relevant functions. This ensures that the model is trained on the relevant tasks and ignores the tasks for which there is no relevant reward function.
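For instance, here is a sketch of a task-conditional reward function that opts out of irrelevant samples by returning `None` (the scoring itself is a placeholder; a real function would check the answer's correctness):

```python
def math_reward_func(prompts, completions, task, **kwargs):
    # `task` is a dataset column, passed automatically as a keyword argument.
    rewards = []
    for prompt, completion, t in zip(prompts, completions, task):
        if t == "math":
            rewards.append(1.0)  # placeholder: score the math answer here
        else:
            rewards.append(None)  # not a math sample: defer to other reward functions
    return rewards
```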
#### Passing the reward function to the trainer
To use your custom reward function, pass it to the [`GRPOTrainer`] as follows:
@ -561,9 +536,8 @@ Tested with:
- **Qwen2.5-VL** — e.g., `Qwen/Qwen2.5-VL-3B-Instruct`
- **SmolVLM2** — e.g., `HuggingFaceTB/SmolVLM2-2.2B-Instruct`
> [!TIP]
> Compatibility with all VLMs is not guaranteed. If you believe a model should be supported, feel free to open an issue on GitHub — or better yet, submit a pull request with the required changes.
### Quick Start
@ -589,9 +563,8 @@ accelerate launch \
### Configuration Tips
> [!WARNING]
> VLM training may fail if image tokens are truncated. We highly recommend disabling truncation by setting `max_prompt_length` to `None`.
- Use LoRA on vision-language projection layers
- Enable 4-bit quantization to reduce memory usage
@ -603,7 +576,7 @@ VLM training may fail if image tokens are truncated. We highly recommend to disa
Each training sample should include:
- `prompt`: Text formatted via the processor's chat template
- `image`/`images`: PIL Image or list of PIL Images
The trainer automatically handles image-to-tensor conversion via the model's image processor.
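A hedged sketch of what one such sample might look like (field values and file name are illustrative):

```python
from PIL import Image

sample = {
    # Text prompt, formatted via the processor's chat template beforehand.
    "prompt": "<image> Describe what is happening in this picture.",
    # A single PIL image; use "images": [img1, img2] for multiple images.
    "image": Image.open("example.jpg"),
}
```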


@ -0,0 +1,39 @@
# GRPO With Replay Buffer
This experimental trainer trains a model with GRPO, but replaces groups (and their completions) whose rewards have zero standard deviation with groups from prior batches that had high rewards and nonzero standard deviation.
## Usage
```python
import torch
from datasets import load_dataset

# The config class lives alongside the trainer in the experimental module.
from trl.experimental.grpo_with_replay_buffer import (
    GRPOWithReplayBufferConfig,
    GRPOWithReplayBufferTrainer,
)

dataset = load_dataset("trl-internal-testing/zen", "standard_prompt_only", split="train")

# Guarantee that some rewards have 0 std
def custom_reward_func(completions, **kwargs):
    if torch.rand(1).item() < 0.25:
        return [0] * len(completions)  # simulate a zero-std group
    else:
        return torch.rand(len(completions)).tolist()

training_args = GRPOWithReplayBufferConfig(
    output_dir="grpo-replay-buffer",
    learning_rate=1e-4,
    per_device_train_batch_size=4,
    num_generations=4,
    max_completion_length=8,
    replay_buffer_size=8,
    report_to="none",
)

trainer = GRPOWithReplayBufferTrainer(
    model="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
    reward_funcs=[custom_reward_func],
    args=training_args,
    train_dataset=dataset,
)
trainer.train()
```

docs/source/gspo_token.md

@ -0,0 +1,18 @@
# GSPO-token
In the paper [Group Sequence Policy Optimization](https://huggingface.co/papers/2507.18071), the authors propose a token-level objective variant to GSPO, called GSPO-token. To use GSPO-token, you can use the `GRPOTrainer` class in `trl.experimental.gspo_token`.
## Usage
```python
from trl.experimental.gspo_token import GRPOTrainer
from trl import GRPOConfig
training_args = GRPOConfig(
importance_sampling_level="sequence_token",
...
)
```
> [!WARNING]
> To leverage GSPO-token, the user will need to provide the per-token advantage \\( \hat{A}_{i,t} \\) for each token \\( t \\) in the sequence \\( i \\) (i.e., make \\( \hat{A}_{i,t} \\) vary with \\( t \\), which isn't the case here, since \\( \hat{A}_{i,t}=\hat{A}_{i} \\)). Otherwise, the GSPO-token gradient is equivalent to that of the original GSPO implementation.


@ -1,65 +0,0 @@
# Training FAQ
## What Metrics Should I Look at?
When performing classical supervised fine-tuning of language models, the loss (especially the validation loss) serves as a good indicator of the training progress. However, in Reinforcement Learning (RL), the loss becomes less informative about the model's performance, and its value may fluctuate while the actual performance improves.
To address this, we recommend focusing on two key metrics first:
**Mean Reward**: The primary goal is to maximize the reward achieved by the model during RL training.
**Objective KL Divergence**: KL divergence (Kullback-Leibler divergence) measures the dissimilarity between two probability distributions. In the context of RL training, we use it to quantify the difference between the current model and a reference model. Ideally, we want to keep the KL divergence between 0 and 10 to ensure the model's generated text remains close to what the reference model produces.
However, there are more metrics that can be useful for debugging; check out the [logging section](logging).
## Why Do We Use a Reference Model, and What's the Purpose of KL Divergence?
When training RL models, optimizing solely for reward may lead to unexpected behaviors, where the model exploits the environment in ways that don't align with good language generation. In the case of RLHF, we use a reward model trained to predict whether a generated text is highly ranked by humans.
However, the RL model being optimized against the reward model may learn patterns that yield high reward but do not represent good language. This can result in extreme cases where the model generates texts with excessive exclamation marks or emojis to maximize the reward. In some worst-case scenarios, the model may generate patterns completely unrelated to natural language yet receive high rewards, similar to adversarial attacks.
<div style="text-align: center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/kl-example.png">
<p style="text-align: center;"> <b>Figure:</b> Samples without a KL penalty from <a href="https://huggingface.co/papers/1909.08593">https://huggingface.co/papers/1909.08593</a>. </p>
</div>
To address this issue, we add a penalty to the reward function based on the KL divergence between the current model and the reference model. By doing this, we encourage the model to stay close to what the reference model generates.
## What Is the Concern with Negative KL Divergence?
If you generate text by purely sampling from the model distribution, things work fine in general. But when you use the `generate` method, there are a few caveats: depending on the settings, it does not always purely sample, which can cause the KL divergence to go negative. Essentially, when the active model achieves `log_p_token_active < log_p_token_ref`, we get negative KL divergence. This can happen in several cases:
- **top-k sampling**: the model can smooth out the probability distribution, causing the top-k tokens to have a smaller probability than those of the reference model, yet they are still selected
- **min_length**: this ignores the EOS token until `min_length` is reached; thus, the model can assign a very low log probability to the EOS token and very high probabilities to all other tokens until `min_length` is reached
These are just a few examples. Why is negative KL an issue? The total reward `R` is computed as `R = r - beta * KL`, so if the model learns how to drive the KL divergence negative, it effectively receives a positive reward. In many cases, it can be much easier to exploit such a bug in the generation than to actually learn the reward function. In addition, the KL can become arbitrarily small, so the actual reward can be very small compared to it.
So how should you generate text for PPO training? Let's have a look!
## How to generate text for training?
In order to avoid the KL issues described above, we recommend using the following settings:
```python
generation_kwargs = {
"min_length": -1, # don't ignore the EOS token (see above)
"top_k": 0.0, # no top-k sampling
"top_p": 1.0, # no nucleus sampling
"do_sample": True, # yes, we want to sample
"pad_token_id": tokenizer.eos_token_id, # most decoder models don't have a padding token - use EOS token instead
"max_new_tokens": 32, # specify how many tokens you want to generate at most
}
```
With these settings, we usually don't encounter any issues. You can also experiment with other settings, but if you encounter issues with negative KL divergence, try going back to these and see if the issues persist.
## How can you debug your own use-case?
Debugging the RL pipeline can be challenging due to its complexity. Here are some tips and suggestions to make the process easier:
- **Start from a working example**: Begin with a working example from the TRL repository and gradually modify it to fit your specific use-case. Changing everything at once can make it difficult to identify the source of potential issues. For example, you can start by replacing the model in the example, and once you figure out the best hyperparameters, switch to your dataset and reward model.
- **Start small, scale later**: Training large models can be very slow, taking hours or days before you see any improvement. For debugging, this is not a convenient timescale, so try to use small model variants during the development phase and scale up once that works. That said, be careful: small models might not have the capacity to solve a complicated task either.
- **Start simple**: Try to start with a minimal example and build complexity from there. Your use-case might require, for example, a complicated reward function consisting of many different rewards; try to use one signal first, see if you can optimize that, and then add more complexity.
- **Inspect the generations**: It's always a good idea to inspect what the model is generating. Maybe there is a bug in your post-processing or your prompt. Due to bad settings, you might cut off generations too soon. These things are very hard to see in the metrics but very obvious if you look at the generations.
- **Inspect the reward model**: If your reward is not improving over time, maybe there's an issue with the reward model. You can look at extreme cases to see if it does what it should: e.g., in the sentiment case, you can check whether simple positive and negative examples really get different rewards. You can also look at the distribution of your dataset. Finally, the reward may be dominated by the query, which the model can't affect, so you might need to normalize it (e.g., reward of query+response minus reward of the query).
These are just a few tips that we find helpful - if you have more useful tricks feel free to open a PR to add them as well!


@ -9,11 +9,49 @@ The library is integrated with 🤗 [transformers](https://github.com/huggingfac
## 🎉 What's New
**OpenEnv Integration:** TRL now supports **[OpenEnv](https://huggingface.co/blog/openenv)**, the open-source framework from Meta for defining, deploying, and interacting with environments in reinforcement learning and agentic workflows. Explore how to seamlessly integrate TRL with OpenEnv in our [dedicated documentation](openenv).

**OpenAI GPT OSS Support**: TRL now fully supports fine-tuning the latest [OpenAI GPT OSS models](https://huggingface.co/collections/openai/gpt-oss-68911959590a1634ba11c7a4)! Check out the:

- [OpenAI Cookbook](https://cookbook.openai.com/articles/gpt-oss/fine-tune-transfomers)
- [GPT OSS recipes](https://github.com/huggingface/gpt-oss-recipes)
- [Our example script](https://github.com/huggingface/trl/blob/main/examples/scripts/sft_gpt_oss.py)
## Taxonomy
Below is the current list of TRL trainers, organized by method type (⚡️ = vLLM support; 🧪 = experimental).
<div style="display: flex; justify-content: space-between; width: 100%; gap: 2rem;">
<div style="flex: 1; min-width: 0;">
### Online methods
- [`GRPOTrainer`] ⚡️
- [`RLOOTrainer`] ⚡️
- [`OnlineDPOTrainer`] ⚡️
- [`NashMDTrainer`] ⚡️
- [`XPOTrainer`] ⚡️
- [`PPOTrainer`]
### Reward modeling
- [`PRMTrainer`]
- [`RewardTrainer`]
</div>
<div style="flex: 1; min-width: 0;">
### Offline methods
- [`SFTTrainer`]
- [`DPOTrainer`]
- [`ORPOTrainer`]
- [`experimental.bco.BCOTrainer`] 🧪
- [`CPOTrainer`]
- [`KTOTrainer`]
### Knowledge distillation
- [`GKDTrainer`]
</div>
</div>
You can also explore TRL-related models, datasets, and demos in the [TRL Hugging Face organization](https://huggingface.co/trl-lib).
@ -36,6 +74,11 @@ The documentation is organized into the following sections:
<div class="mt-10">
<div class="w-full flex flex-col space-y-4 md:space-y-0 md:grid md:grid-cols-2 md:gap-y-4 md:gap-x-5">
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="https://huggingface.co/blog/trl-vlm-alignment">
<img src="https://raw.githubusercontent.com/huggingface/blog/main/assets/openenv/thumbnail.png" alt="thumbnail" class="mt-0">
<p class="text-gray-500 text-sm">Published October 23, 2025</p>
<p class="text-gray-700">Building the Open Agent Ecosystem Together: Introducing OpenEnv</p>
</a>
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="https://huggingface.co/blog/trl-vlm-alignment">
<img src="https://raw.githubusercontent.com/huggingface/blog/main/assets/trl_vlm/thumbnail.png" alt="thumbnail" class="mt-0">
<p class="text-gray-500 text-sm">Published on August 7, 2025</p>
@ -93,3 +136,15 @@ The documentation is organized into the following sections:
</a>
</div>
</div>
## Talks
<div class="mt-10">
<div class="w-full flex flex-col space-y-4 md:space-y-0 md:grid md:grid-cols-2 md:gap-y-4 md:gap-x-5">
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/Fine%20tuning%20with%20TRL%20(Oct%2025).pdf">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/Fine%20tuning%20with%20TRL%20(Oct%2025).png" alt="thumbnail" class="mt-0">
<p class="text-gray-500 text-sm">Talk given on October 30, 2025</p>
<p class="text-gray-700">Fine tuning with TRL</p>
</a>
</div>
</div>


@ -1,13 +1,15 @@
# Installation
You can install TRL either from PyPI or from source:
## PyPI
Install the library with pip or [uv](https://docs.astral.sh/uv/):
<hfoptions id="install">
<hfoption id="uv">
uv is a fast Rust-based Python package and project manager. Refer to [Installation](https://docs.astral.sh/uv/getting-started/installation/) for installation instructions.
```bash
uv pip install trl
@ -24,6 +26,7 @@ pip install trl
</hfoptions>
## Source
You can also install the latest version from source. First clone the repo and then run the installation with `pip`:
```bash


@ -1,147 +0,0 @@
# Iterative Trainer
[![](https://img.shields.io/badge/All_models-Iterative_SFT-blue)](https://huggingface.co/models?other=iterative-sft,trl)
<Tip warning={true}>
The IterativeSFTTrainer is deprecated and will be removed in version 0.24.0. Please use the [`SFTTrainer`].
</Tip>
Iterative fine-tuning is a training method that enables you to perform custom actions (such as generation and filtering) between optimization steps. In TRL we provide an easy-to-use API to fine-tune your models iteratively in just a few lines of code.
## Quickstart
To get started quickly, you can either pass a model identifier or a pre-instantiated model to the trainer:
```python
from trl import IterativeSFTConfig, IterativeSFTTrainer
# Using a model identifier
trainer = IterativeSFTTrainer(
"facebook/opt-350m",
args=IterativeSFTConfig(
max_length=512,
output_dir="./output",
),
)
# Or using a pre-instantiated model
from transformers import AutoModelForCausalLM, AutoTokenizer
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
trainer = IterativeSFTTrainer(
model,
args=IterativeSFTConfig(
max_length=512,
output_dir="./output",
),
processing_class=tokenizer,
)
```
## Usage
The [`IterativeSFTTrainer`] supports two ways of providing input data to the `step` function:
### Using a list of tensors as input:
```python
inputs = {
"input_ids": input_ids,
"attention_mask": attention_mask,
}
trainer.step(**inputs)
```
### Using a list of strings as input:
```python
inputs = {
"texts": texts,
"texts_labels": texts_labels, # Optional, defaults to texts
}
trainer.step(**inputs)
```
For causal language models, labels will automatically be created from `input_ids` or from `texts`. When using sequence-to-sequence models, you will have to provide your own labels or `texts_labels`.
## Configuration
The [`IterativeSFTConfig`] class provides several parameters to customize the training:
```python
from trl import IterativeSFTConfig
config = IterativeSFTConfig(
# Model initialization parameters
model_init_kwargs={"dtype": "bfloat16"},
# Data preprocessing parameters
max_length=512,
truncation_mode="keep_end",
# Training parameters
output_dir="./output",
learning_rate=2e-5,
per_device_train_batch_size=4,
gradient_accumulation_steps=4,
max_steps=1000,
save_steps=100,
optim="adamw_torch",
report_to="wandb",
)
```
### Model Initialization
You can control how the model is initialized by passing keyword arguments to `model_init_kwargs`:
```python
config = IterativeSFTConfig(
model_init_kwargs={
"dtype": "bfloat16",
"device_map": "auto",
"trust_remote_code": True,
}
)
```
### Data Preprocessing
The trainer supports two truncation modes:
- `keep_end`: Truncates from the start of the sequence
- `keep_start`: Truncates from the end of the sequence
```python
config = IterativeSFTConfig(
max_length=512,
truncation_mode="keep_end", # or "keep_start"
)
```
### Training Optimization
You can optimize CUDA cache usage for more memory-efficient training:
```python
config = IterativeSFTConfig(
optimize_device_cache=True,
)
```
## IterativeSFTTrainer
[[autodoc]] IterativeSFTTrainer
- train
- save_model
- push_to_hub
## IterativeSFTConfig
[[autodoc]] IterativeSFTConfig


@ -1,46 +1,64 @@
# Training with Jobs

[![model badge](https://img.shields.io/badge/All_models-HF_Jobs-blue)](https://huggingface.co/models?other=hf_jobs,trl)

[Hugging Face Jobs](https://huggingface.co/docs/huggingface_hub/guides/jobs) lets you run training scripts on fully managed infrastructure—no need to manage GPUs or local environment setup.

In this guide, you'll learn how to:

* Use [TRL Jobs](https://github.com/huggingface/trl-jobs) to easily run pre-optimized TRL training
* Run any TRL training script with uv scripts

> [!TIP]
> When a model is trained using **TRL + Jobs**, a tag is automatically added to the model card. You can explore models trained with this method on the [Hugging Face model hub](https://huggingface.co/models?other=hf_jobs).

For general details about Hugging Face Jobs (hardware selection, job monitoring, etc.), see the [Jobs documentation](https://huggingface.co/docs/huggingface_hub/guides/jobs).
## Requirements
* A [Pro](https://hf.co/pro), [Team](https://hf.co/enterprise), or [Enterprise](https://hf.co/enterprise) plan
* Logged in to the Hugging Face Hub (`hf auth login`)
## Using TRL Jobs
[TRL Jobs](https://github.com/huggingface/trl-jobs) is a high-level wrapper around Hugging Face Jobs and TRL that streamlines training. It provides optimized default configurations so you can start quickly without manually tuning parameters.
Example:
```bash
pip install trl-jobs
trl-jobs sft --model_name Qwen/Qwen3-0.6B --dataset_name trl-lib/Capybara
```
TRL Jobs supports everything covered in this guide, with additional optimizations to simplify workflows.
## Using uv Scripts
For more control, you can run Hugging Face Jobs directly with your own scripts, using [uv scripts](https://docs.astral.sh/uv/guides/scripts/).
Create a Python script (e.g., `train.py`) containing your training code:
```python
from datasets import load_dataset
from trl import SFTTrainer
dataset = load_dataset("trl-lib/Capybara", split="train")
trainer = SFTTrainer(
model="Qwen/Qwen2.5-0.5B",
train_dataset=dataset,
)
trainer.train()
trainer.push_to_hub("Qwen2.5-0.5B-SFT")
```
Launch the job using either the [`hf jobs` CLI](https://huggingface.co/docs/huggingface_hub/guides/cli#hf-jobs) or the Python API:
<hfoptions id="script_type">
<hfoption id="bash">
```bash
hf jobs uv run --flavor a100-large --secrets HF_TOKEN "https://raw.githubusercontent.com/huggingface/trl/main/trl/scripts/sft.py" --model_name_or_path Qwen/Qwen2-0.5B --dataset_name trl-lib/Capybara
```
The script can also be a local file:
```bash
hf jobs uv run --flavor a100-large --secrets HF_TOKEN trl/scripts/sft.py --model_name_or_path Qwen/Qwen2-0.5B --dataset_name trl-lib/Capybara
```
Since it runs using a Docker Image from Hugging Face Spaces or Docker Hub, you can also specify it:
```bash
hf jobs uv run --flavor a100-large --secrets HF_TOKEN --image <docker-image> trl/scripts/sft.py --model_name_or_path Qwen/Qwen2-0.5B --dataset_name trl-lib/Capybara
hf jobs uv run \
--flavor a100-large \
--with trl \
--secrets HF_TOKEN \
train.py
```
</hfoption>
@ -48,236 +66,113 @@ hf jobs uv run --flavor a100-large --secrets HF_TOKEN --image <docker-image> trl
```python
from huggingface_hub import run_uv_job
run_uv_job(
    "train.py",
    dependencies=["trl"],
    flavor="a100-large",
    secrets={"HF_TOKEN": "hf_..."},
)
```
</hfoption>
</hfoptions>
To run successfully, the script needs:
* **TRL installed**: Use the `--with trl` flag or the `dependencies` argument. uv installs these dependencies automatically before running the script.
* **An authentication token**: Required to push the trained model (or perform other authenticated operations). Provide it with the `--secrets HF_TOKEN` flag or the `secrets` argument.
> [!WARNING]
> When training with Jobs, be sure to:
>
> * **Set a sufficient timeout**. Jobs time out after 30 minutes by default. If your job exceeds the timeout, it will fail and all progress will be lost. See [Setting a custom timeout](https://huggingface.co/docs/huggingface_hub/guides/jobs#setting-a-custom-timeout).
> * **Push the model to the Hub**. The Jobs environment is ephemeral—files are deleted when the job ends. If you don't push the model, it will be lost.
You can also run a script directly from a URL:
<hfoptions id="script_type">
<hfoption id="bash">
```bash
hf jobs uv run \
--flavor a100-large \
--with trl \
--secrets HF_TOKEN \
"https://gist.githubusercontent.com/qgallouedec/eb6a7d20bd7d56f9c440c3c8c56d2307/raw/69fd78a179e19af115e4a54a1cdedd2a6c237f2f/train.py"
```
</hfoption>
<hfoption id="python">
```python
from huggingface_hub import run_uv_job
run_uv_job(
"https://gist.githubusercontent.com/qgallouedec/eb6a7d20bd7d56f9c440c3c8c56d2307/raw/69fd78a179e19af115e4a54a1cdedd2a6c237f2f/train.py",
flavor="a100-large",
dependencies=["trl"],
secrets={"HF_TOKEN": "hf_..."},
)
```
</hfoption>
</hfoptions>
To make a script self-contained, declare dependencies at the top:
```python
# /// script
# dependencies = [
# "trl",
# "peft",
# ]
# ///
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer
dataset = load_dataset("trl-lib/Capybara", split="train")
trainer = SFTTrainer(
model="Qwen/Qwen2.5-0.5B",
train_dataset=dataset,
peft_config=LoraConfig(),
)
trainer.train()
trainer.push_to_hub("Qwen2.5-0.5B-SFT")
```
You can then run the script without specifying dependencies:
<hfoptions id="script_type">
<hfoption id="bash">
```bash
hf jobs uv run \
--flavor a100-large \
--secrets HF_TOKEN \
train.py
```
</hfoption>
<hfoption id="python">
```python
from huggingface_hub import run_uv_job
run_uv_job(
"https://raw.githubusercontent.com/huggingface/trl/main/trl/scripts/sft.py",
dependencies=["transformers", "torch"]
token="hf...",
"train.py",
flavor="a100-large",
secrets={"HF_TOKEN": "hf_..."},
)
```
</hfoption>
</hfoptions>
### Hardware and Timeout Settings
Jobs lets you select a specific hardware configuration using the `--flavor` flag. As of August 2025, the available options are:
**CPU:** `cpu-basic`, `cpu-upgrade`
**GPU:** `t4-small`, `t4-medium`, `l4x1`, `l4x4`, `a10g-small`, `a10g-large`, `a10g-largex2`, `a10g-largex4`, `a100-large`
**TPU:** `v5e-1x1`, `v5e-2x2`, `v5e-2x4`
You can always check the latest list of supported hardware flavors in [Spaces config reference](https://huggingface.co/docs/hub/en/spaces-config-reference).
By default, jobs have a **30-minute timeout**, after which they will automatically stop. For long-running tasks like training, you can increase the timeout as needed. Supported time units are:
- `s`: seconds
- `m`: minutes
- `h`: hours
- `d`: days
Example with a 2-hour timeout:
<hfoptions id="script_type">
<hfoption id="bash">
Using the `--timeout` flag:
```bash
hf jobs uv run \
--timeout 2h \
--flavor a100-large \
--secrets HF_TOKEN \
--with transformers \
--with torch \
"https://raw.githubusercontent.com/huggingface/trl/main/trl/scripts/sft.py" \
--model_name_or_path Qwen/Qwen2-0.5B \
--dataset_name trl-lib/Capybara
```
</hfoption>
<hfoption id="python">
Using the `timeout` argument:
```python
from huggingface_hub import run_uv_job
run_uv_job(
"https://raw.githubusercontent.com/huggingface/trl/main/trl/scripts/sft.py",
timeout="2h",
token="hf...",
flavor="a100-large",
script_args=[
"--model_name_or_path", "Qwen/Qwen2-0.5B",
"--dataset_name", "trl-lib/Capybara",
]
)
```
</hfoption>
</hfoptions>
### Environment Variables, Secrets, and Token
You can pass environment variables, secrets, and your auth token to your jobs.
<hfoptions id="script_type">
<hfoption id="bash">
Using the `--env`, `--secrets`, and/or `--token` options.
```bash
hf jobs uv run \
trl/scripts/sft.py \
--flavor a100-large \
--env FOO=foo \
--env BAR=bar \
--secrets HF_TOKEN=HF_TOKEN \
--secrets MY_SECRET=password \
--token hf...
```
</hfoption>
<hfoption id="python">
Using the `env`, `secrets`, and/or `token` arguments.
```python
from huggingface_hub import run_uv_job
run_uv_job(
"trl/scripts/sft.py",
env={"FOO": "foo", "BAR": "bar"},
secrets={"MY_SECRET": "psswrd"},
token="hf..."
)
```
</hfoption>
</hfoptions>
## Training and Evaluating a Model with Jobs
TRL example scripts are fully uv-compatible, so you can run a complete training workflow directly on Jobs. You can customize training with standard script arguments plus hardware and secrets.

To evaluate your training runs, in addition to reviewing the job logs, you can use [**Trackio**](https://huggingface.co/blog/trackio), a lightweight experiment tracking library. Trackio enables end-to-end experiment management on the Hugging Face Hub. All TRL example scripts already support reporting to Trackio via the `report_to` argument. Using this feature saves your experiments in an interactive HF Space, making it easy to monitor metrics, compare runs, and track progress over time.
<hfoptions id="script_type">
<hfoption id="bash">
@ -286,19 +181,10 @@ To evaluate your training runs, in addition to reviewing the job logs, you can u
hf jobs uv run \
--flavor a100-large \
--secrets HF_TOKEN \
"trl/scripts/sft.py" \
--model_name_or_path Qwen/Qwen2-0.5B \
--dataset_name trl-lib/Capybara \
--learning_rate 2.0e-5 \
--num_train_epochs 1 \
--packing \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps 8 \
--eos_token '<|im_end|>' \
--eval_strategy steps \
--eval_steps 100 \
--output_dir Qwen2-0.5B-SFT \
--report_to trackio \
https://raw.githubusercontent.com/huggingface/trl/refs/heads/main/examples/scripts/prm.py \
--model_name_or_path Qwen/Qwen2-0.5B-Instruct \
--dataset_name trl-lib/prm800k \
--output_dir Qwen2-0.5B-Reward \
--push_to_hub
```
@ -307,24 +193,14 @@ hf jobs uv run \
```python
from huggingface_hub import run_uv_job
run_uv_job(
"trl/scripts/sft.py",
"https://raw.githubusercontent.com/huggingface/trl/refs/heads/main/examples/scripts/prm.py",
flavor="a100-large",
secrets={"HF_TOKEN": "your_hf_token"},
secrets={"HF_TOKEN": "hf_..."},
script_args=[
"--model_name_or_path", "Qwen/Qwen2-0.5B",
"--dataset_name", "trl-lib/Capybara",
"--learning_rate", "2.0e-5",
"--num_train_epochs", "1",
"--packing",
"--per_device_train_batch_size", "2",
"--gradient_accumulation_steps", "8",
"--eos_token", "<|im_end|>",
"--eval_strategy", "steps",
"--eval_steps", "100",
"--output_dir", "Qwen2-0.5B-SFT",
"--report_to", "trackio",
"--model_name_or_path", "Qwen/Qwen2-0.5B-Instruct",
"--dataset_name", "trl-lib/prm800k",
"--output_dir", "Qwen2-0.5B-Reward",
"--push_to_hub"
]
)
@ -332,61 +208,67 @@ run_uv_job(
</hfoption>
</hfoptions>
See the full list of examples in [Maintained examples](example_overview#maintained-examples).
### Docker Images

An up-to-date Docker image with all TRL dependencies is available at [huggingface/trl](https://hub.docker.com/r/huggingface/trl) and can be used directly with Hugging Face Jobs:

<hfoptions id="script_type">
<hfoption id="bash">

```bash
hf jobs uv run \
    --flavor a100-large \
    --secrets HF_TOKEN \
    --image huggingface/trl \
    train.py
```

</hfoption>
<hfoption id="python">

```python
from huggingface_hub import run_uv_job

run_uv_job(
    "train.py",
    flavor="a100-large",
    secrets={"HF_TOKEN": "hf_..."},
    image="huggingface/trl",
)
```

</hfoption>
</hfoptions>

Jobs runs on a Docker image from Hugging Face Spaces or Docker Hub, so you can also specify any custom image:

<hfoptions id="script_type">
<hfoption id="bash">

```bash
hf jobs uv run \
    --flavor a100-large \
    --secrets HF_TOKEN \
    --image <docker-image> \
    train.py
```

</hfoption>
<hfoption id="python">

```python
from huggingface_hub import run_uv_job

run_uv_job(
    "train.py",
    flavor="a100-large",
    secrets={"HF_TOKEN": "hf_..."},
    image="<docker-image>",
)
```

</hfoption>
</hfoptions>

## Monitoring and Managing Jobs

After launching a job, you can track its progress on the [Jobs page](https://huggingface.co/settings/jobs). Additionally, Jobs provides CLI and Python commands to check status, view logs, or cancel a job.

<hfoptions id="script_type">
<hfoption id="bash">

```bash
# List your jobs
hf jobs ps -a
# List your running jobs
hf jobs ps
# Inspect the status of a job
hf jobs inspect job_id
# View logs from a job
hf jobs logs job_id
# Cancel a job
hf jobs cancel job_id
```

</hfoption>
<hfoption id="python">

```python
from huggingface_hub import list_jobs, inspect_job, fetch_job_logs, cancel_job

# List your jobs
jobs = list_jobs()
jobs[0]
# List your running jobs
running_jobs = [job for job in list_jobs() if job.status.stage == "RUNNING"]
# Inspect the status of a job
inspect_job(job_id=job_id)
# View logs from a job
for log in fetch_job_logs(job_id=job_id):
    print(log)
# Cancel a job
cancel_job(job_id=job_id)
```

</hfoption>
</hfoptions>

## Best Practices and Tips

- Choose hardware that fits the size of your model and dataset for optimal performance.
- Training jobs can be long-running. Consider increasing the default timeout.
- Reuse training and evaluation scripts whenever possible to streamline workflows.

View File

@ -1,10 +1,7 @@
# Judges
> [!WARNING]
> TRL Judges is an experimental API which is subject to change at any time.
TRL provides judges to easily compare two completions.
@ -16,7 +13,7 @@ pip install trl[judges]
## Using the provided judges
TRL provides several judges out of the box. For example, you can use the [`HfPairwiseJudge`] to compare two completions using a pre-trained model from the Hugging Face model hub:
```python
from trl import HfPairwiseJudge

View File

@ -43,12 +43,8 @@ Or using the TRL CLI:
trl sft ... --attn_implementation kernels-community/flash-attn
```
> [!TIP]
> Now you can leverage faster attention backends with a pre-optimized kernel for your hardware configuration from the Hub, speeding up both development and training.
## Comparing Attention Implementations
@ -57,15 +53,14 @@ The experiments were run on a single **H100 GPU** with **CUDA 12.9**, leveraging
Keep in mind that the results shown here are specific to this setup and may vary with different training configurations.
The following figure illustrates both **latency** (time per training step) and **peak allocated memory** for the different attention implementations and kernel backends.
Kernel-based implementations perform on par with custom-installed attention, and increasing the model's `max_length` further enhances performance. Memory consumption is similar across all implementations, showing no significant differences. We get the same performance but with less friction, as described in [the following section](#flash-attention-vs-hub-kernels).
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/kernels_guide_latency.png" alt="Latency and Memory Usage" width="600"/>
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/kernels_guide_peak_allocated_memory.png" alt="Latency and Memory Usage" width="600"/>
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/kernels_guide_latency.png" alt="Latency and Memory Usage" width="45%"/>
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/kernels_guide_peak_allocated_memory.png" alt="Latency and Memory Usage" width="45%"/>
</div>
## Flash Attention vs. Hub Kernels
Building Flash Attention from source can be time-consuming, often taking anywhere from several minutes to hours, depending on your hardware, CUDA/PyTorch configuration, and whether precompiled wheels are available.
@ -77,7 +72,6 @@ You can combine **FlashAttention kernels** with **Liger kernels** for additional
First, install the Liger kernel dependency:
```bash
pip install liger-kernel
```
@ -99,6 +93,4 @@ training_args = SFTConfig(
)
```
Learn more about the [Liger Kernel Integration](./liger_kernel_integration).

View File

@ -1,12 +1,11 @@
# KTO Trainer
[![model badge](https://img.shields.io/badge/All_models-KTO-blue)](https://huggingface.co/models?other=kto,trl)
## Overview
Kahneman-Tversky Optimization (KTO) was introduced in [KTO: Model Alignment as Prospect Theoretic Optimization](https://huggingface.co/papers/2402.01306) by [Kawin Ethayarajh](https://huggingface.co/kawine), [Winnie Xu](https://huggingface.co/xwinxu), [Niklas Muennighoff](https://huggingface.co/Muennighoff), Dan Jurafsky, [Douwe Kiela](https://huggingface.co/douwekiela).
The abstract from the paper is the following:
> Kahneman & Tversky's prospect theory tells us that humans perceive random variables in a biased but well-defined manner; for example, humans are famously loss-averse. We show that objectives for aligning LLMs with human feedback implicitly incorporate many of these biases -- the success of these objectives (e.g., DPO) over cross-entropy minimization can partly be ascribed to them being human-aware loss functions (HALOs). However, the utility functions these methods attribute to humans still differ from those in the prospect theory literature. Using a Kahneman-Tversky model of human utility, we propose a HALO that directly maximizes the utility of generations instead of maximizing the log-likelihood of preferences, as current methods do. We call this approach Kahneman-Tversky Optimization (KTO), and it matches or exceeds the performance of preference-based methods at scales from 1B to 30B. Crucially, KTO does not need preferences -- only a binary signal of whether an output is desirable or undesirable for a given input. This makes it far easier to use in the real world, where preference data is scarce and expensive.
@ -51,7 +50,7 @@ accelerate launch train_kto.py
Distributed across 8 x H100 GPUs, the training takes approximately 30 minutes. You can verify the training progress by checking the reward graph. An increasing trend in the reward margin indicates that the model is improving and generating better responses over time.
![kto qwen2 reward margin](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/kto-qwen2-reward-margin.png)
To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-KTO) performs, you can use the [Transformers Chat CLI](https://huggingface.co/docs/transformers/quicktour#chat-with-text-generation-models).
@ -60,14 +59,14 @@ To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-KTO) pe
What is the best programming language?
<strong><span style="color: blue;">&lt;trl-lib/Qwen2-0.5B-KTO&gt;:</span></strong>
The best programming language can vary depending on individual preferences, industry-specific requirements, technical skills, and familiarity with the specific use case or task. Here are some widely-used programming languages that have been noted as popular and widely used:
Here are some other factors to consider when choosing a programming language for a project:
<strong><span style="color: green;">1</span> JavaScript</strong>: JavaScript is at the heart of the web and can be used for building web applications, APIs, and interactive front-end applications like frameworks like React and Angular. It's similar to C, C++, and F# in syntax structure and is accessible and easy to learn, making it a popular choice for beginners and professionals alike.
<strong><span style="color: green;">2</span> Java</strong>: Known for its object-oriented programming (OOP) and support for Java 8 and .NET, Java is used for developing enterprise-level software applications, high-performance games, as well as mobile apps, game development, and desktop applications.
<strong><span style="color: green;">3</span> C++</strong>: Known for its flexibility and scalability, C++ offers comprehensive object-oriented programming and is a popular choice for high-performance computing and other technical fields. It's a powerful platform for building real-world applications and games at scale.
<strong><span style="color: green;">4</span> Python</strong>: Developed by Guido van Rossum in 1991, Python is a high-level, interpreted, and dynamically typed language known for its simplicity, readability, and versatility.
</code></pre>
## Expected dataset format
@ -102,7 +101,6 @@ To ensure that we train MOEs similarly during preference-tuning, it is beneficia
This option is enabled by setting `output_router_logits=True` in the model config (e.g. [`~transformers.MixtralConfig`]).
To scale how much the auxiliary loss contributes to the total loss, use the hyperparameter `router_aux_loss_coef=...` (default: `0.001`) in the model config.
### Batch size recommendations
Use a per-step batch size that is at least 4, and an effective batch size between 16 and 128. Even if your effective batch size is large, if your per-step batch size is poor, then the KL estimate in KTO will be poor.
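For example, a minimal sketch of a [`KTOConfig`] satisfying both recommendations (the exact values are illustrative):

```python
from trl import KTOConfig

training_args = KTOConfig(
    # ...,
    per_device_train_batch_size=4,   # per-step batch size of at least 4
    gradient_accumulation_steps=8,   # effective batch size = 4 * 8 = 32 (within 16-128), on a single device
)
```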

View File

@ -1,32 +1,78 @@
# Liger Kernel Integration
[Liger Kernel](https://github.com/linkedin/Liger-Kernel) is a collection of Triton kernels designed specifically for LLM training. It can effectively increase multi-GPU training throughput by 20% and reduce memory usage by 60%. That way, we can **4x** our context length, as described in the benchmark below. They have implemented Hugging Face compatible `RMSNorm`, `RoPE`, `SwiGLU`, `CrossEntropy`, `FusedLinearCrossEntropy`, with more to come. The kernel works out of the box with [FlashAttention](https://github.com/Dao-AILab/flash-attention), [PyTorch FSDP](https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html), and [Microsoft DeepSpeed](https://github.com/microsoft/DeepSpeed).
With this memory reduction, you can potentially turn off `cpu_offloading` or gradient checkpointing to further boost the performance.
| Speed Up | Memory Reduction |
| --- | --- |
| ![Speed up](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/docs/images/e2e-tps.png) | ![Memory](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/docs/images/e2e-memory.png) |
## Supported Trainers
Liger Kernel is supported in the following TRL trainers:
- **SFT** (Supervised Fine-Tuning)
- **DPO** (Direct Preference Optimization)
- **GRPO** (Group Relative Policy Optimization)
- **KTO** (Kahneman-Tversky Optimization)
- **GKD** (Generalized Knowledge Distillation)
## Usage
1. First, install Liger Kernel:
```bash
pip install liger-kernel
```
2. Once installed, set `use_liger_kernel=True` in your trainer config. No other changes are needed!
<hfoptions id="liger">
<hfoption id="SFT">
```python
from trl import SFTConfig
training_args = SFTConfig(..., use_liger_kernel=True)
```
</hfoption>
<hfoption id="DPO">
```python
from trl import DPOConfig
training_args = DPOConfig(..., use_liger_kernel=True)
```
</hfoption>
<hfoption id="GRPO">
```python
from trl import GRPOConfig
training_args = GRPOConfig(..., use_liger_kernel=True)
```
</hfoption>
<hfoption id="KTO">
```python
from trl import KTOConfig
training_args = KTOConfig(..., use_liger_kernel=True)
```
</hfoption>
<hfoption id="GKD">
```python
from trl import GKDConfig
training_args = GKDConfig(..., use_liger_kernel=True)
```
</hfoption>
</hfoptions>
To learn more about Liger-Kernel, visit their [official repository](https://github.com/linkedin/Liger-Kernel/).

View File

@ -1,106 +0,0 @@
# Logging
As reinforcement learning algorithms are historically challenging to debug, it's important to pay careful attention to logging.
By default, TRL trainers like [`PPOTrainer`] and [`GRPOTrainer`] save a lot of relevant information to supported experiment trackers like Trackio, Weights & Biases (wandb) or TensorBoard.
Upon initialization, pass the `report_to` argument to the respective configuration object (e.g., [`PPOConfig`] for `PPOTrainer`, or [`GRPOConfig`] for `GRPOTrainer`):
```python
# For PPOTrainer
ppo_config = PPOConfig(
# ...,
report_to="trackio" # or "wandb" or "tensorboard"
)
# For GRPOTrainer
grpo_config = GRPOConfig(
# ...,
report_to="trackio" # or "wandb" or "tensorboard"
)
```
If you want to log with TensorBoard, you might also need to specify logging directories, for example, by adding `logging_dir=PATH_TO_LOGS` to the configuration object (e.g., `PPOConfig` or `GRPOConfig`).
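For instance, a minimal sketch for TensorBoard with GRPO (the log path is illustrative):

```python
from trl import GRPOConfig

grpo_config = GRPOConfig(
    # ...,
    report_to="tensorboard",
    logging_dir="./logs",  # directory where TensorBoard event files are written
)
```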
## PPO Logging
Here's a brief explanation for the logged metrics provided in the data:
* `eps`: Tracks the number of episodes per second.
* `objective/kl`: The mean Kullback-Leibler (KL) divergence between the current policy and reference policy.
* `objective/entropy`: The mean entropy of the policy, indicating the randomness of the actions chosen by the policy.
* `objective/non_score_reward`: The mean reward from non-score-related sources, basically `beta * kl.sum(1)`, where `beta` is the KL penalty coefficient and `kl` is the per-token KL divergence.
* `objective/rlhf_reward`: The mean RLHF reward, which is `score - non_score_reward`.
* `objective/scores`: The mean scores returned by the reward model / environment.
* `policy/approxkl_avg`: The average approximate KL divergence between consecutive PPO policies. Note that this is not the same as `objective/kl`.
* `policy/clipfrac_avg`: The average fraction of policy updates that are clipped, indicating how often the policy updates are constrained to prevent large changes.
* `loss/policy_avg`: The average policy loss, indicating how well the policy is performing.
* `loss/value_avg`: The average value loss, indicating the difference between the predicted value and the actual reward.
* `val/clipfrac_avg`: The average fraction of value function updates that are clipped, similar to `policy/clipfrac_avg` but for the value function.
* `policy/entropy_avg`: The average entropy of the policy during training, indicating how diverse the policy's actions are.
* `val/ratio`: The mean ratio of the current policy probability to the old policy probability, providing a measure of how much the policy has changed.
* `val/ratio_var`: The variance of the `val/ratio`, indicating the variability in policy changes.
* `val/num_eos_tokens`: The number of end-of-sequence (EOS) tokens generated, which can indicate the number of complete responses.
* `lr`: The current learning rate used by the optimizer.
* `episode`: The current episode count in the training process.
### Crucial values
During training, many values are logged; here are the most important ones:
1. `objective/scores`: The mean scores returned by the reward model / environment.
1. `objective/rlhf_reward`: The mean RLHF reward. This is the ultimate objective of the RLHF training. If training works as intended, this metric should keep going up.
1. `objective/non_score_reward`: The mean reward from non-score-related sources (e.g., KL penalty).
Here are some parameters that are useful to monitor for stability (when these diverge or collapse to 0, try tuning variables):
1. `loss/value_avg`: The average value loss. It will spike / NaN when not going well.
1. `val/ratio`: The mean ratio of the current policy probability to the old policy probability. This number should float around 1.0. If this `ratio` is too high (e.g., 2.0 or 1000.0) or too small (e.g., 0.1), it means the updates between consecutive policies are too drastic.
1. `policy/clipfrac_avg` and `policy/approxkl_avg`: If `val/ratio` is too high, the `ratio` is going to get clipped, resulting in high `policy/clipfrac_avg` and high `policy/approxkl_avg` as well.
1. `objective/kl`: The mean KL divergence. It should stay positive and ideally not too large, so that the policy is not too far away from the reference policy.
## GRPO Logging
Here's a brief explanation for the logged metrics provided in the data for the GRPO trainer:
* `num_tokens`: Total number of input tokens processed during training so far.
#### Completions
* `completions/mean_length`: Mean length of all generated completions (including those not ending with an EOS token).
* `completions/min_length`: Minimum length among all generated completions.
* `completions/max_length`: Maximum length among all generated completions.
* `completions/clipped_ratio`: The ratio of completions that did not end with an EOS token before reaching the maximum generation length (i.e., they were truncated).
* `completions/mean_terminated_length`: Mean length of only those completions that successfully ended with an EOS token.
* `completions/min_terminated_length`: Minimum length among completions that ended with an EOS token.
* `completions/max_terminated_length`: Maximum length among completions that ended with an EOS token.
#### Rewards
* `rewards/{reward_func_name}/mean`: The mean reward obtained from a specific, named reward function (e.g., `rewards/my_custom_reward/mean`). This is logged for each reward function used.
* `rewards/{reward_func_name}/std`: The standard deviation of rewards from a specific, named reward function.
* `reward`: The overall mean of the (potentially weighted and, if `args.scale_rewards` is true, normalized) rewards, after group-wise normalization (advantages).
* `reward_std`: The standard deviation of the (potentially weighted) rewards *before* group-wise normalization for advantages.
#### Policy and Loss Metrics
* `kl`: The mean Kullback-Leibler (KL) divergence between the current policy and the reference policy. This is logged only if `beta` (the KL coefficient in `GRPOConfig`) is non-zero.
* `entropy`: Average entropy of token predictions across generated completions.
* If Liger GRPOLoss is used (`use_liger_loss: True` in `GRPOConfig`):
* `clip_ratio`: The fraction of policy updates where the probability ratio was clipped according to the GRPO loss's epsilon bounds.
* If standard GRPOLoss is used (`use_liger_loss: False`):
* `clip_ratio/low_mean`: The mean fraction of instances where the probability ratio `r_t(θ)` was clipped at the lower bound `1 - epsilon_low` (occurs when advantage is negative and ratio is below the bound).
* `clip_ratio/low_min`: The minimum observed fraction for `clip_ratio/low_mean` across batches/processes.
* `clip_ratio/high_mean`: The mean fraction of instances where the probability ratio `r_t(θ)` was clipped at the upper bound `1 + epsilon_high` (occurs when advantage is positive and ratio is above the bound).
* `clip_ratio/high_max`: The maximum observed fraction for `clip_ratio/high_mean` across batches/processes.
* `clip_ratio/region_mean`: The mean fraction of instances where the probability ratio was clipped at either the lower or upper bound.
### Crucial GRPO values
During GRPO training, monitor these values for insights into performance and stability:
1. `reward`: This is the primary objective. It reflects the (group-wise normalized) rewards the policy is achieving. It should generally increase during successful training.
1. `kl`: If `beta > 0`, this tracks the divergence from the reference model. Keep an eye on it to ensure the policy doesn't stray too far, which can lead to instability.
1. `clip_ratio/*` (either `clip_ratio` for Liger loss or the more detailed `clip_ratio/...` metrics for standard loss): These indicate how often the policy updates are being constrained by the GRPO clipping mechanism. Very high values might suggest that the policy is trying to change too drastically (potentially due to large advantages or a learning rate that's too high) or that the epsilon clipping range is too restrictive.
1. `completions/clipped_ratio`: A high ratio here indicates that the model is frequently generating completions that are cut off by `max_completion_length` rather than naturally ending with an EOS token. This might suggest issues with learning sequence termination or that `max_completion_length` is too short.
1. `rewards/{reward_func_name}/mean`: Monitoring the mean of individual reward functions can help diagnose which aspects of the desired behavior the model is learning or struggling with, especially when using multiple reward sources.
1. `entropy`: Measures how uncertain the policy is in its action choices; higher entropy suggests more exploration. A collapse in entropy means the policy is becoming overconfident and deterministic, often too early. This can stall learning by reducing exploration and making updates overly biased. Stable but non-zero entropy is usually a sign that the policy retains flexibility and continues to explore.

View File

@ -0,0 +1,442 @@
# LoRA Without Regret
Recent research from the team at [Thinking Machines Lab](https://thinkingmachines.ai/blog/lora/) (Schulman et al., 2025) shows that **LoRA can match full fine-tuning performance** when configured correctly, while using only ~67% of the compute. These findings are exciting to TRL users because they're straightforward to implement and can improve model performance on smaller budgets.
This guide provides simple instructions to reproduce the results of the blog post in TRL.
> [!TIP]
> It is recommended to read the blog post before following this guide, or to consult both resources in parallel for best results.
## Benefits of LoRA over full fine-tuning
First of all, let's remind ourselves of the benefits of [LoRA over full fine-tuning](https://huggingface.co/docs/trl/en/peft_integration).
LoRA adds adapter layers on top of the base model; these adapters contain significantly fewer parameters than the base model itself. This design reduces GPU memory requirements and enables more efficient training. As described in the [blog](https://thinkingmachines.ai/blog/lora/), this approach was originally thought to involve a performance trade-off, although careful configuration can overcome it and match full fine-tuning performance.
## Examples with TRL
Let's implement and train LoRA adapters in TRL scripts based on the core findings of the blog post. Afterwards, we'll revisit each finding in light of the TRL results.
### Supervised Fine-Tuning (SFT)
The blog post performs SFT on a range of models and datasets from the Hub, which we can reproduce in TRL.
| Model | Dataset |
| --- | --- |
| [Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B) | [allenai/tulu-3-sft-mixture](https://huggingface.co/datasets/allenai/tulu-3-sft-mixture) |
| [Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B) | [open-thoughts/OpenThoughts-114k](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k) |
| [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B) | [allenai/tulu-3-sft-mixture](https://huggingface.co/datasets/allenai/tulu-3-sft-mixture) |
| [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B) | [open-thoughts/OpenThoughts-114k](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k) |
<hfoptions id="sft">
<hfoption id="python">
We can integrate these findings with the TRL Python API like so:
```python
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
dataset = load_dataset("open-thoughts/OpenThoughts-114k", split="train")
peft_config = LoraConfig(r=256, lora_alpha=16, target_modules="all-linear")
training_args = SFTConfig(
learning_rate=2e-4,
per_device_train_batch_size=1,
gradient_accumulation_steps=4,
num_train_epochs=1,
report_to=["trackio"],
)
trainer = SFTTrainer(
model="Qwen/Qwen2.5-3B-Instruct",
train_dataset=dataset,
peft_config=peft_config,
args=training_args,
)
trainer.train()
```
</hfoption>
<hfoption id="jobs">
```bash
hf jobs uv run \
--flavor a100-large \
--timeout 8h \
--secrets HF_TOKEN \
"https://raw.githubusercontent.com/huggingface/trl/main/trl/scripts/sft.py" \
--model_name_or_path Qwen/Qwen2.5-3B-Instruct \
--dataset_name open-thoughts/OpenThoughts-114k \
--learning_rate 2.0e-5 \
--num_train_epochs 1 \
--packing \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps 16 \
--use_peft \
--lora_r 256 \
--lora_alpha 16 \
--lora_target_modules all-linear \
--output_dir Qwen2.5-3B-OpenThoughts-LoRA \
--report_to trackio \
--push_to_hub
```
To use Hugging Face Jobs, you will need to be logged in to the Hugging Face Hub (`hf auth login`) and have a [Pro](https://hf.co/pro), [Team](https://hf.co/enterprise), or [Enterprise](https://hf.co/enterprise) plan. Check out the [Jobs documentation](https://huggingface.co/docs/huggingface_hub/en/guides/jobs) for more details.
</hfoption>
<hfoption id="local">
```bash
uv run "https://raw.githubusercontent.com/huggingface/trl/main/trl/scripts/sft.py" \
--model_name_or_path Qwen/Qwen2.5-3B-Instruct \
--dataset_name open-thoughts/OpenThoughts-114k \
--learning_rate 2.0e-5 \
--num_train_epochs 1 \
--packing \
--per_device_train_batch_size 2 \
--gradient_accumulation_steps 16 \
--gradient_checkpointing \
--eval_strategy no \
--use_peft \
--lora_r 256 \
--lora_alpha 16 \
--lora_target_modules all-linear \
--output_dir Qwen2.5-3B-OpenThoughts-LoRA \
--report_to trackio \
--push_to_hub
```
To run the script locally, you will need to have `uv` installed. Check out the [uv documentation](https://docs.astral.sh/uv/) for more details.
</hfoption>
</hfoptions>
Once training starts, you can monitor the progress in [Trackio](https://huggingface.co/trackio), which will log the URL.
### Reinforcement Learning (GRPO)
The blog post performs GRPO on a range of models and datasets from the Hub, and once again we can reproduce the results in TRL.
| Model | Dataset |
| --- | --- |
| [Llama-3.1-8B-Base](https://huggingface.co/meta-llama/Llama-3.1-8B) | [GSM8k](https://huggingface.co/datasets/openai/gsm8k) |
| [Llama-3.1-8B-Base](https://huggingface.co/meta-llama/Llama-3.1-8B) | [DeepMath-103K](https://huggingface.co/datasets/zwhe99/DeepMath-103K) |
| [Qwen3-8b-base](https://huggingface.co/Qwen/Qwen3-8b-base) | [DeepMath-103K](https://huggingface.co/datasets/zwhe99/DeepMath-103K) |
For reinforcement learning, the blog uses a math reasoning task that we can reproduce as a Python function.
<details>
<summary>Reward function</summary>
```python
from typing import Optional

from latex2sympy2_extended import NormalizationConfig
from math_verify import LatexExtractionConfig, parse, verify


def strip_reasoning_accuracy_reward(
completions: list[list[dict[str, str]]], solution: list[str], **kwargs
) -> list[Optional[float]]:
"""Reward function that strips reasoning tags and checks mathematical accuracy.
This function:
1. Extracts the content from completions
2. Removes <think></think> tags (for reasoning that shouldn't be evaluated)
3. Parses both the gold solution and the predicted answer
4. Uses math_verify to check if they are mathematically equivalent
Args:
completions: List of model completions, each containing a list of messages
solution: List of ground truth solutions
**kwargs: Additional arguments (ignored but required for trainer compatibility)
Returns:
List of rewards where:
- 1.0 if the answer is correct
- 0.0 if the answer is incorrect
- None if the solution is not parseable (skips this example)
"""
contents = [completion[0]["content"] for completion in completions]
rewards = []
for content, sol in zip(contents, solution):
# Strip reasoning tags from completion
while "<think>" in content and "</think>" in content:
start = content.find("<think>")
end = content.find("</think>", start)
if start != -1 and end != -1:
content = content[:start] + content[end + len("</think>") :]
else:
break
# Parse gold solution
gold_parsed = parse(
f"${sol}$",
extraction_config=[
LatexExtractionConfig(
boxed_match_priority=0, try_extract_without_anchor=True
)
],
)
if len(gold_parsed) != 0:
# We require the answer to be provided in correct latex (no malformed operators)
answer_parsed = parse(
content,
extraction_config=[
LatexExtractionConfig(
boxed_match_priority=0,
normalization_config=NormalizationConfig(
basic_latex=True,
units=True,
malformed_operators=False,
nits=False,
boxed=True,
),
try_extract_without_anchor=False,
)
],
extraction_mode="first_match",
)
# Compute binary rewards if verifiable, `None` otherwise to skip this example
try:
reward = float(verify(gold_parsed, answer_parsed))
except Exception as e:
print(
f"verify failed: {e}, answer: {answer_parsed}, gold: {gold_parsed}"
)
reward = None
else:
# If the gold solution is not parseable, we assign `None` to skip this example
reward = None
rewards.append(reward)
return rewards
```
</details>
<hfoptions id="grpo">
<hfoption id="python">
We can implement these recommendations with the TRL Python API like so:
```python
from datasets import load_dataset
from peft import LoraConfig
from trl import GRPOConfig, GRPOTrainer
dataset = load_dataset("HuggingFaceH4/OpenR1-Math-220k-default-verified", split="train")
def strip_reasoning_accuracy_reward(completions, **kwargs):
"""Reward function that strips reasoning and accuracy scores from the model outputs."""
...
peft_config = LoraConfig(
r=1,
lora_alpha=32,
target_modules="all-linear"
)
training_args = GRPOConfig(
learning_rate=5e-5,
per_device_train_batch_size=1,
gradient_accumulation_steps=4,
num_train_epochs=1,
num_generations=8,
generation_batch_size=8,
report_to=["trackio"],
)
trainer = GRPOTrainer(
model="Qwen/Qwen3-0.6B",
reward_funcs=strip_reasoning_accuracy_reward,
args=training_args,
train_dataset=dataset,
peft_config=peft_config,
)
trainer.train()
```
> [!WARNING]
> This snippet skips the reward function which is defined above to keep the example concise.
</hfoption>
<hfoption id="jobs">
```bash
hf jobs uv run \
--flavor a100-large \
--timeout 4h \
--secrets HF_TOKEN \
--env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
"https://huggingface.co/datasets/burtenshaw/lora-without-regrets/resolve/main/grpo.py" \
--model_name_or_path Qwen/Qwen3-0.6B \
--dataset_name HuggingFaceH4/OpenR1-Math-220k-default-verified \
--output_dir grpo-full-qwen3-0.6b \
--learning_rate 1.0e-6 \
--lr_scheduler_type cosine \
--warmup_ratio 0.0 \
--max_grad_norm 1.0 \
--beta 0.0 \
--max_prompt_length 1024 \
--max_completion_length 4096 \
--num_generations 16 \
--generation_batch_size 16 \
--gradient_accumulation_steps 8 \
--per_device_train_batch_size 1 \
--num_train_epochs 1 \
--lora_r 1 \
--lora_alpha 32 \
--lora_dropout 0.0 \
--lora_target_modules all-linear \
--vllm_mode colocate \
--save_strategy steps \
--save_steps 50 \
--save_total_limit 1 \
--logging_steps 1 \
--max_steps 200 \
--report_to trackio
```
To use Hugging Face Jobs, you will need to be logged in to the Hugging Face Hub (`hf auth login`) and have a [Pro](https://hf.co/pro), [Team](https://hf.co/enterprise), or [Enterprise](https://hf.co/enterprise) plan. Check out the [Jobs documentation](https://huggingface.co/docs/huggingface_hub/en/guides/jobs) for more details.
</hfoption>
<hfoption id="local">
```bash
uv run "https://huggingface.co/datasets/burtenshaw/lora-without-regrets/resolve/main/grpo.py" \
--model_name_or_path Qwen/Qwen3-0.6B \
--dataset_name HuggingFaceH4/OpenR1-Math-220k-default-verified \
--output_dir grpo-full-qwen3-0.6b \
--learning_rate 1.0e-6 \
--lr_scheduler_type cosine \
--warmup_ratio 0.0 \
--max_grad_norm 1.0 \
--beta 0.0 \
--max_prompt_length 1024 \
--max_completion_length 4096 \
--num_generations 16 \
--generation_batch_size 16 \
--gradient_accumulation_steps 8 \
--per_device_train_batch_size 1 \
--num_train_epochs 1 \
--lora_r 1 \
--lora_alpha 32 \
--lora_dropout 0.0 \
--lora_target_modules all-linear \
--vllm_mode colocate \
--save_strategy steps \
--save_steps 50 \
--save_total_limit 1 \
--logging_steps 1 \
--max_steps 200 \
--report_to trackio
```
To run the script locally, you will need to have `uv` installed. Check out the [uv documentation](https://docs.astral.sh/uv/) for more details.
</hfoption>
</hfoptions>
The reinforcement learning script with GRPO is implemented as a custom script in TRL, which uses the reward function shown above. You can review it at [`grpo.py`](https://huggingface.co/datasets/burtenshaw/lora-without-regrets/blob/main/grpo.py), which applies reinforcement learning with LoRA best practices.
## Key findings in optimizing LoRA
The authors recommend applying LoRA to all weight matrices rather than limiting it to attention layers, as increasing the rank does not compensate for this restriction. In TRL, this can be configured using `--lora_target_modules all-linear` to apply LoRA to all weight matrices.
We were able to reproduce the results of the blog post using TRL and the SmolLM3 model. We trained the model for 500 steps on the [Math 220k dataset](https://huggingface.co/datasets/HuggingFaceH4/OpenR1-Math-220k-default-verified) with the reward function and configuration above. As you can see in the figure below, the LoRA model's average train reward curve matches the full fine-tuning curve.
![train reward](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lora_without_regret/5.png)
And most importantly, the LoRA model uses significantly less memory than the full fine-tuning model, as we can see in the figure below.
![memory usage](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lora_without_regret/6.png)
Here are the parameters we used to train the above models
| Parameter | LoRA | Full FT |
| --- | --- | --- |
| `--model_name_or_path` | HuggingFaceTB/SmolLM3-3B | HuggingFaceTB/SmolLM3-3B |
| `--dataset_name` | HuggingFaceH4/OpenR1-Math-220k-default-verified | HuggingFaceH4/OpenR1-Math-220k-default-verified |
| `--learning_rate` | 1.0e-5 | 1.0e-6 |
| `--max_prompt_length` | 1024 | 1024 |
| `--max_completion_length` | 4096 | 4096 |
| `--lora_r` | 1 | - |
| `--lora_alpha` | 32 | - |
| `--lora_dropout` | 0.0 | - |
| `--lora_target_modules` | all-linear | - |
Let's break down the key findings of the blog post and how we were able to reproduce them.
### 1. *LoRA performs better when applied to all weight matrices*
The authors recommend applying LoRA to all weight matrices rather than limiting it to attention layers, as increasing the rank does not compensate for this restriction.
![all layers](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lora_without_regret/1.png)
Attention-only LoRA underperforms even when using a higher rank to match parameter count. In TRL, this can be configured using `--lora_target_modules all-linear` to apply LoRA to all weight matrices. In Python, we can do this like so:
```python
from peft import LoraConfig
peft_config = LoraConfig(target_modules="all-linear")
```
### 2. *The adapter needs sufficient capacity to learn from the dataset*
The blog post recommends using a sufficient LoRA rank to learn from the dataset. The rank determines the number of trainable parameters in the LoRA adapter. Therefore, "For datasets that exceed LoRA capacity, LoRA underperforms FullFT".
![adapter capacity](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lora_without_regret/3.png)
In the TRL script, we could use `--lora_r` to set the rank and adapt it based on the task and dataset we're training on. The blog post recommends the following ranks based on the task and dataset size:
Reinforcement learning tasks typically require lower capacity, so smaller LoRA ranks can be used. This is because policy gradient algorithms extract roughly ~1 bit of information per episode, demanding minimal parameter capacity.
The blog post defines the ideal dataset size for LoRA to match full fine-tuning as "post-training scale", which we can use to determine the recommended rank for SFT and RL LoRAs:
| Task Type | Dataset Size | Recommended Rank |
| --- | --- | --- |
| **SFT** | Post-training scale | 256 |
| **RL** | Any size | 1-32 |
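As a minimal sketch, the two rows of the table translate to `peft` configs like these (the alpha values follow the examples earlier in this guide):

```python
from peft import LoraConfig

# SFT on a post-training-scale dataset: high rank for capacity
sft_peft_config = LoraConfig(r=256, lora_alpha=16, target_modules="all-linear")

# RL (e.g. GRPO): policy gradients carry roughly ~1 bit per episode, so a very small rank suffices
rl_peft_config = LoraConfig(r=1, lora_alpha=32, target_modules="all-linear")
```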
### 3. *"FullFT and high-rank LoRAs have similar learning curves"*
Counterintuitively, the blog post recommends using a higher learning rate than for full fine-tuning. In the table above, we used 1.0e-5 for LoRA and 1.0e-6 for full fine-tuning. In the TRL script, we could use `--learning_rate` to set the learning rate. The \\( \frac{1}{r} \\) scaling in LoRA makes the optimal learning rate approximately rank-independent.
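Concretely, in the standard LoRA parametrization the adapter contribution is scaled by \\( \frac{\alpha}{r} \\):

\\[ h = W_0 x + \frac{\alpha}{r} B A x \\]

With \\( \alpha \\) held fixed, the update magnitude scales as \\( \frac{1}{r} \\), so a learning rate tuned at one rank transfers approximately unchanged to another.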
![learning rate](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lora_without_regret/2.png)
### 4. *"In some scenarios, LoRA is less tolerant of large batch sizes than full fine-tuning."*
The blog post recommends using an effective batch size < 32 because the authors found LoRA to be less tolerant of large batch sizes. This could not be mitigated by increasing the LoRA rank. In the TRL script, we could use `--per_device_train_batch_size` and `--gradient_accumulation_steps` to set the batch size.
![batch size](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lora_without_regret/4.png)
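A minimal sketch of keeping the effective batch size under that threshold with the TRL Python API (values illustrative):

```python
from trl import SFTConfig

training_args = SFTConfig(
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,  # effective batch size = 1 * 16 = 16, below the <32 threshold
)
```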
## Takeaways
Using TRL, you can efficiently implement LoRA adapters to match full fine-tuning performance, applying the core insights (targeting all weight matrices, choosing the right rank, and managing batch size and learning rate) without the heavy compute cost of FullFT.
## Citation
```bibtex
@article{schulman2025lora,
title = {{LoRA Without Regret}},
author = {John Schulman and Thinking Machines Lab},
year = 2025,
journal = {Thinking Machines Lab: Connectionism},
doi = {10.64434/tml.20250929},
note = {https://thinkingmachines.ai/blog/lora/}
}
```

View File

@ -8,7 +8,6 @@ With the `AutoModelForCausalLMWithValueHead` class TRL supports all decoder mode
## AutoModelForCausalLMWithValueHead
[[autodoc]] AutoModelForCausalLMWithValueHead
- __init__
- forward
@ -25,4 +24,4 @@ With the `AutoModelForCausalLMWithValueHead` class TRL supports all decoder mode
## create_reference_model
[[autodoc]] create_reference_model

View File

@ -1,100 +0,0 @@
# Multi Adapter RL (MARL) - a single base model for everything
Here we present an approach that uses a single base model for the entire PPO algorithm - which includes retrieving the reference logits, computing the active logits and the rewards. This feature is experimental as we did not test the convergence of the approach. We encourage the community to let us know if they potentially face issues.
## Requirements
You just need to install `peft` and optionally install `bitsandbytes` as well if you want to go for 8bit base models, for more memory efficient finetuning.
## Summary
You need to address this approach in three stages that we summarize as follows:
1. Train a base model on the target domain (e.g. the [IMDB dataset](https://huggingface.co/datasets/stanfordnlp/imdb)). This is the Supervised Fine-Tuning stage and can leverage the `SFTTrainer` from TRL.
2. Train a reward model using `peft`. This is required in order to reuse the adapter during the RL optimisation process (step 3 below). We show how to leverage the `RewardTrainer` from TRL in [this example](https://github.com/huggingface/trl/tree/main/examples/scripts/reward_modeling.py).
3. Fine-tune new adapters on the base model using PPO and the reward adapter ("0 abstraction RL").
Make sure to use the same model (i.e., same architecture and same weights) for stages 2 & 3.
## Quickstart
Let us assume you have trained your reward adapter on `llama-7b` model using `RewardTrainer` and pushed the weights on the hub under `trl-lib/llama-7b-hh-rm-adapter`.
When doing PPO, before passing the model to `PPOTrainer` create your model as follows:
```python
model_name = "huggyllama/llama-7b"
rm_adapter_id = "trl-lib/llama-7b-hh-rm-adapter"
# PPO adapter
lora_config = LoraConfig(
r=16,
lora_alpha=32,
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM",
)
model = AutoModelForCausalLMWithValueHead.from_pretrained(
model_name,
peft_config=lora_config,
reward_adapter=rm_adapter_id,
)
...
trainer = PPOTrainer(
model=model,
...
)
...
```
Then inside your PPO training loop, call the `compute_reward_score` method by accessing the `model` attribute from `PPOTrainer`.
```python
rewards = trainer.model.compute_reward_score(**inputs)
```
## Advanced usage
### Control on the adapter name
If you are familiar with the `peft` library, you know that you can use multiple adapters inside the same model. What you can do is train multiple adapters on the same base model to fine-tune on different policies.
In this case, you want to control which adapter to re-activate after retrieving the reward. To do so, simply pass the appropriate adapter name to the `ppo_adapter_name` argument of `compute_reward_score`.
```python
adapter_name_policy_1 = "policy_1"
rewards = trainer.model.compute_reward_score(**inputs, ppo_adapter_name=adapter_name_policy_1)
...
```
### Using 4-bit and 8-bit base models
For more memory efficient fine-tuning, you can load your base model in 8-bit or 4-bit while keeping the adapters in the default precision (float32).
Just pass the appropriate arguments (i.e. `load_in_8bit=True` or `load_in_4bit=True`) to `AutoModelForCausalLMWithValueHead.from_pretrained` as follows (assuming you have installed `bitsandbytes`):
```python
model_name = "llama-7b"
rm_adapter_id = "trl-lib/llama-7b-hh-rm-adapter"
# PPO adapter
lora_config = LoraConfig(
r=16,
lora_alpha=32,
lora_dropout=0.05,
bias="none",
task_type="CAUSAL_LM",
)
model = AutoModelForCausalLMWithValueHead.from_pretrained(
model_name,
peft_config=lora_config,
reward_adapter=rm_adapter_id,
load_in_8bit=True,
)
...
trainer = PPOTrainer(
model=model,
...
)
...
```

View File

@ -1,16 +1,16 @@
# Nash-MD Trainer
[![model badge](https://img.shields.io/badge/All_models-Nash--MD-blue)](https://huggingface.co/models?other=nash-md,trl)
## Overview
Nash-MD was proposed in the paper [Nash Learning from Human Feedback](https://huggingface.co/papers/2312.00886) by Rémi Munos, [Michal Valko](https://huggingface.co/misovalko), Daniele Calandriello, Mohammad Gheshlaghi Azar, Mark Rowland, Daniel Guo, Yunhao Tang, Matthieu Geist, Thomas Mésnard, and Andrea Michi.
The abstract from the paper is the following:
> Reinforcement learning from human feedback (RLHF) has emerged as the main paradigm for aligning large language models (LLMs) with human preferences. Typically, RLHF involves the initial step of learning a reward model from human feedback, often expressed as preferences between pairs of text generations produced by a pre-trained LLM. Subsequently, the LLM's policy is fine-tuned by optimizing it to maximize the reward model through a reinforcement learning algorithm. However, an inherent limitation of current reward models is their inability to fully represent the richness of human preferences and their dependency on the sampling distribution. In this study, we introduce an alternative pipeline for the fine-tuning of LLMs using pairwise human feedback. Our approach entails the initial learning of a preference model, which is conditioned on two inputs given a prompt, followed by the pursuit of a policy that consistently generates responses preferred over those generated by any competing policy, thus defining the Nash equilibrium of this preference model. We term this approach Nash learning from human feedback (NLHF). In the context of a tabular policy representation, we present a novel algorithmic solution, Nash-MD, founded on the principles of mirror descent. This algorithm produces a sequence of policies, with the last iteration converging to the regularized Nash equilibrium. Additionally, we explore parametric representations of policies and introduce gradient descent algorithms for deep-learning architectures. To demonstrate the effectiveness of our approach, we present experimental results involving the fine-tuning of a LLM for a text summarization task. We believe NLHF offers a compelling avenue for preference learning and policy optimization with the potential of advancing the field of aligning LLMs with human preferences.
This post-training method was contributed by [Kashif Rasul](https://huggingface.co/kashif) and [Daniil Tiapkin](https://huggingface.co/dtiapkin), [Pierre Ménard](https://huggingface.co/menardprr), Daniele Calandriello and [Quentin Gallouédec](https://huggingface.co/qgallouedec).
## Quick start
@ -85,11 +85,8 @@ Instead of a judge, you can chose to use a reward model -- see [Reward Bench](ht
)
```
<Tip warning={true}>
Make sure that the SFT model and reward model use the _same_ chat template and the same tokenizer. Otherwise, you may find the model completions are scored incorrectly during training.
</Tip>
> [!WARNING]
> Make sure that the SFT model and reward model use the _same_ chat template and the same tokenizer. Otherwise, you may find the model completions are scored incorrectly during training.
### Encourage EOS token generation

View File

@ -1,10 +1,10 @@
# Online DPO Trainer
[![](https://img.shields.io/badge/All_models-Online_DPO-blue)](https://huggingface.co/models?other=online-dpo,trl)
[![model badge](https://img.shields.io/badge/All_models-Online_DPO-blue)](https://huggingface.co/models?other=online-dpo,trl)
## Overview
Online DPO was proposed in [Direct Language Model Alignment from Online AI Feedback](https://huggingface.co/papers/2402.04792) by Shangmin Guo, Biao Zhang, Tianlin Liu, Tianqi Liu, Misha Khalman, Felipe Llinares, Alexandre Rame, Thomas Mesnard, Yao Zhao, Bilal Piot, Johan Ferret, and Mathieu Blondel.
The abstract from the paper is the following:
@ -112,7 +112,6 @@ This callback logs the model's generated completions directly to Weights & Biase
![Logged Completions](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/wandb_completions.png)
## Example script
We provide an example script to train a model using the online DPO method. The script is available in [`examples/scripts/dpo_online.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/dpo_online.py)
@ -153,8 +152,7 @@ While training and evaluating, we record the following reward metrics. Here is a
To validate that the online DPO implementation works, we ran experiments with the Pythia 1B, 2.8B, and 6.9B models on a single node of 8 x H100s. Here are the commands we used to run the experiments. We take the SFT / RM models directly from [The N+ Implementation Details of RLHF with PPO: A Case Study on TL;DR Summarization](https://huggingface.co/papers/2403.17031).
```
```shell
# 1B Online DPO experiment
accelerate launch --config_file examples/accelerate_configs/multi_gpu.yaml \
examples/scripts/dpo_online.py \
@ -213,9 +211,8 @@ accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml
Checkpoints and experiment tracking are available at:
- [🤗 Model checkpoints](https://huggingface.co/collections/trl-lib/online-dpo-66acd3fa38a331a9cd457b07)
- [🐝 Tracked experiment](https://wandb.ai/huggingface/trl/reports/Online-DPO-experiments-for-TL-DR-summarisation--Vmlldzo5MTczMDU0)
* [🤗 Model checkpoints](https://huggingface.co/collections/trl-lib/online-dpo-66acd3fa38a331a9cd457b07)
* [🐝 Tracked experiment](https://wandb.ai/huggingface/trl/reports/Online-DPO-experiments-for-TL-DR-summarisation--Vmlldzo5MTczMDU0)
To evaluate, we use [vLLM](https://github.com/vllm-project/vllm) to load the checkpoints and GPT-4o mini as a judge model to evaluate the generated TL;DR against the reference TL;DR.
For more information on how to use judges, see [Judges](judges).
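As a rough sketch of such a judge-based evaluation (assuming TRL's `OpenAIPairwiseJudge`; the actual evaluation script may differ, and the prompts/completions below are placeholders):
```python
from trl import OpenAIPairwiseJudge

# Compare model TL;DRs against reference TL;DRs with GPT-4o mini as the judge.
judge = OpenAIPairwiseJudge(model="gpt-4o-mini")
prompts = ["Summarize: ..."]                              # evaluation prompts (placeholder)
completions = [["model summary", "reference summary"]]    # one candidate pair per prompt
ranks = judge.judge(prompts, completions)                 # index of the preferred completion
win_rate = sum(r == 0 for r in ranks) / len(ranks)        # fraction where the model summary wins
```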
@ -259,8 +256,6 @@ plt.tight_layout()
plt.show()
```
![](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/online_dpo_scaling.png)
The online DPO checkpoint achieves an increasingly higher win rate as we scale up the model size. This is a good sign that the online DPO implementation is working as intended.
## OnlineDPOTrainer

373
docs/source/openenv.md Normal file
View File

@ -0,0 +1,373 @@
# OpenEnv Integration for Training LLMs with Environments
## Overview
[OpenEnv](https://github.com/meta-pytorch/OpenEnv) is an open-source framework from Meta's PyTorch team for defining, deploying, and interacting with environments in reinforcement learning (RL) and agentic workflows. It offers [Gymnasium-style APIs](https://gymnasium.farama.org) (e.g., `reset()` and `step()`) to interface with environments in a standard manner, and supports running these environments as backend servers (for example via HTTP or containerised execution). You can find a collection of ready-to-use OpenEnv environments on the [Hugging Face Hub](https://huggingface.co/collections/openenv/environment-hub).
In this guide, we'll focus on **how to integrate OpenEnv with TRL**, but feel free to explore the links above to dive deeper into OpenEnv itself.
## Installation
To use OpenEnv with TRL, install the framework:
```bash
pip install openenv-core
```
## Using `rollout_func` with OpenEnv environments
TRL's [`GRPOTrainer`] supports _custom rollout logic_ through the `rollout_func` argument. This lets you override the trainer's default text-generation loop and directly interact with OpenEnv environments — for instance, to compute environment-driven rewards instead of relying solely on model-based signals.
### Rollout Function Signature
A rollout function must have the following signature:
```python
def rollout_func(
    prompts: list[str],
    args: GRPOConfig,
    processing_class,
) -> dict[str, list]:
    """
    Custom rollout function for generation and reward computation.

    Args:
        prompts: List of prompts to generate from
        args: GRPOConfig containing sampling parameters (temperature, top_p, etc.)
        processing_class: Tokenizer/processor for encoding/decoding

    Returns:
        Dictionary containing:
            - prompt_ids: List of token IDs for each prompt
            - completion_ids: List of token IDs for each completion
            - logprobs: List of log probabilities for each token
            - Any additional fields are forwarded to reward functions as kwargs
    """
    pass
```
> [!NOTE]
> Any extra fields in the returned dictionary (beyond the required three) are automatically forwarded to your reward functions. This makes it easy to propagate signals such as environment rewards or auxiliary metrics from the rollout step.
### Integration pattern
The typical pattern when combining OpenEnv with TRL looks like this:
1. Start or connect to an OpenEnv environment (e.g., an HTTP endpoint or Dockerized env).
2. Generate completions from your model — for example, via a vLLM inference server (`use_vllm=True`, `vllm_mode="server"`).
3. Step through the environment using each completion to compute rewards or metrics.
4. Add environment results (e.g., `env_reward`) to the rollout result dict.
5. Access those rewards inside your reward function via `**kwargs`.
By using OpenEnv in this loop, you can:
* Train with realistic or interactive feedback (not just static reward functions).
* Plug in custom simulators, web APIs, or evaluators as environments.
* Pass structured reward signals back into RL training seamlessly.
## A simple example
The [echo.py](https://github.com/huggingface/trl/blob/main/examples/scripts/openenv/echo.py) script demonstrates a minimal, end-to-end integration between TRL and OpenEnv. In this example, the Echo environment rewards completions based on their text length, encouraging the model to generate longer outputs. This pattern can be extended to any custom environment that provides structured feedback or task-based rewards:
```python
import requests

from datasets import Dataset
from envs.echo_env import EchoEnv, EchoAction
from trl import GRPOConfig, GRPOTrainer

# Create HTTP client for Echo Environment
client = EchoEnv.from_docker_image("echo-env:latest")


def rollout_func(prompts, args, processing_class):
    # 1. Generate completions via vLLM inference server (running on port 8000)
    payload = {
        "prompts": prompts,
        "n": args.num_generations,
        "temperature": args.temperature,
        "max_tokens": args.max_completion_length,
    }
    response = requests.post("http://0.0.0.0:8000/generate/", json=payload)
    result = response.json()
    completions_text = processing_class.batch_decode(
        result["completion_ids"],
        skip_special_tokens=True,
    )

    # 2. Step through the environment to get rewards
    client.reset()
    env_rewards = []
    for msg in completions_text:
        env_result = client.step(EchoAction(message=msg))
        env_rewards.append(env_result.reward)

    # 3. Add environment rewards as extra field
    result["env_reward"] = env_rewards
    return result


def reward_from_env(completions, **kwargs):
    """Extract environment rewards passed via rollout_func kwargs."""
    env_rewards = kwargs.get("env_reward", [])
    return [float(reward) for reward in env_rewards] if env_rewards else [0.0] * len(completions)


dataset = Dataset.from_dict({"prompt": ["You are an AI that interacts with an *Echo* environment. Word to echo:"] * 64})

# Setup trainer with custom rollout
trainer = GRPOTrainer(
    model="Qwen/Qwen2.5-0.5B-Instruct",
    reward_funcs=reward_from_env,
    train_dataset=dataset,
    rollout_func=rollout_func,  # Use custom rollout
    args=GRPOConfig(
        vllm_mode="server",
        use_vllm=True,
        num_train_epochs=1,
        num_generations=8,
        max_completion_length=2048,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=4,
    ),
)
trainer.train()
```
That's it! Now that you've seen the full example, let's unpack how the main pieces fit together.
1. **Environment Client:** `EchoEnv` implements an HTTP interface to interact with the environment server.
2. **Custom rollout:** The `rollout_func` generates completions and steps through the environment to collect rewards.
3. **Extra fields:** The rollout adds `env_reward` to the result dictionary, which is automatically passed to reward functions.
4. **Reward function:** Extracts `env_reward` from `kwargs` to apply environment-computed rewards during training.
> [!WARNING]
> The `rollout_func` is currently only supported when using vLLM in server mode (`use_vllm=True`, `vllm_mode="server"`).
### Running the Example
The example requires two GPUs:
```bash
# Terminal 1: Start vLLM inference server
CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen2.5-0.5B-Instruct --host 0.0.0.0 --port 8000
# Terminal 2: Run GRPO training with OpenEnv
CUDA_VISIBLE_DEVICES=1 python examples/scripts/openenv/echo.py
```
Below is the reward curve from training:
<iframe src="https://trl-lib-trackio.hf.space?project=openenv&metrics=train/rewards/reward_from_env/mean&runs=qgallouedec-1761202871&sidebar=hidden&navbar=hidden" style="width:600px; height:500px; border:0;"></iframe>
To learn more about how to create custom environments, see the [OpenEnv documentation](https://github.com/meta-pytorch/OpenEnv/blob/main/src/envs/README.md).
## Advanced Example
Let's level this up a bit by training a model to interact with a more complex environment. We'll use the word-guessing game [Wordle](https://www.nytimes.com/games/wordle/index.html) from the `textarena` environment.
### The TextArena Environment
[TextArena](https://huggingface.co/papers/2504.11442) is an open-source collection of competitive text-based games designed to evaluate reasoning skills in LLMs using textual games like Wordle, Snake, Tic-Tac-Toe, and more. Research has shown that such games improve model performance on reasoning tasks.
![image of textarena](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/text_arena_evals.png)
We will use the `textarena` environment to train a model to play Wordle. The environment is a simple text-based environment that lets the model interact with the game by making guesses and receiving feedback on them.
### Wordle
Wordle is a useful game to train a model on because it requires the model to reason about the word and the feedback provided by the environment. Also, it is a purely language-based game that requires no external tools or knowledge. Furthermore, we found that models from 1 billion parameters and up are able to improve at Wordle and only require 8 tokens to generate a guess, which makes the game a good benchmark for experimenting with reinforcement learning environments without significant compute requirements.
> [!NOTE] How does Wordle work?
> Wordle is a word guessing game where the player has to guess a 5-letter word. The player can make 6 guesses, and for each guess, the environment will provide feedback on the correctness of the guess. The player wins if they guess the word in 6 guesses or less. It challenges the model to generate words that are likely to be correct, and to learn from the feedback provided by the environment.
>
> For example, if the wordle environment returns the following feedback:
>
> ```
> G U E S S
> X G Y X X
> ```
> The model has guessed the word "GUESS" and the environment has provided feedback as the letters X, G, and Y, referring to the colors in the original game: blank, green, and yellow. From this feedback, the model should learn that the guess "GUESS" is incorrect, that the letter "E" is in the word but in the wrong position, and that the letter "U" is correct and in the correct position.
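To make these scores concrete, here is a minimal sketch of how a feedback row can be turned into the green/yellow densities used later in this guide (the example script uses its own helpers, such as `extract_feedback_counts`, so this is only illustrative):
```python
# Minimal sketch: count the marks in a feedback row like "X G Y X X".
def feedback_counts(feedback_row: str) -> tuple[int, int]:
    marks = feedback_row.split()
    return marks.count("G"), marks.count("Y")

green_count, yellow_count = feedback_counts("X G Y X X")
green_score = green_count / 5.0    # 0.2 -> one green letter out of five
yellow_score = yellow_count / 5.0  # 0.2 -> one yellow letter out of five
```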
In the TextArena environment, reward is only given when the model wins the game: 1.0 if the model wins, and 0.0 otherwise. This sparse signal is hard for the model to learn from, so we have added a number of custom reward functions to the script to help the model learn to play the game. The extensible nature of `reward_funcs` and `rollout_func` allows you to add any custom reward function you want to the script.
### Rollout Function
The rollout function runs one full Wordle episode, prompting the model for a guess each turn and capturing both environment rewards and auxiliary signals such as letter coverage and repetition penalties.
```python
def rollout_once(
    env: TextArenaEnv,
    tokenizer: AutoTokenizer,
    args: GRPOConfig,
    dataset_prompt: str,
    cli_args: argparse.Namespace,
    system_prompt: str,
) -> dict[str, list]:
    result = env.reset()
    observation = result.observation

    prompt_ids: list[int] = []
    completion_ids: list[int] = []
    logprobs: list[float] = []
    raw_rewards: list[float] = []
    green_scores: list[float] = []
    yellow_scores: list[float] = []
    repetition_scores: list[float] = []
    correct_scores: list[float] = []
    guess_counts: dict[str, int] = {}

    for _turn in range(cli_args.max_turns):
        # when the game is over the environment will return done=True
        if result.done:
            break

        # set up the prompt for the model
        base_prompt = observation.prompt or dataset_prompt
        user_prompt = make_user_prompt(base_prompt, observation.messages)
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        prompt_text = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False,
            enable_thinking=False,
        )

        # generate the completion from the model using vLLM
        vllm_result = request_vllm_completion(
            prompt_text,
            args,
            endpoint=cli_args.vllm_endpoint,
            timeout=cli_args.request_timeout,
            fallback=cli_args,
        )
        prompt_ids.extend(vllm_result["prompt_ids"])
        completion_ids.extend(vllm_result["completion_ids"])
        logprobs.extend(vllm_result["logprobs"])
        completion_text = vllm_result.get("text") or tokenizer.decode(
            vllm_result["completion_ids"], skip_special_tokens=True
        )

        # extract the guess from the completion
        guess = extract_guess(completion_text)

        # step the environment with the guess
        result = env.step(TextArenaAction(message=guess))
        raw_rewards.append(float(result.reward or 0.0))
        observation = result.observation
        correct_score = float(result.reward or 0.0)
        feedback = extract_wordle_feedback(observation)

        # Update guess counts (use .get so unseen guesses default to 0)
        previous_occurrences = guess_counts.get(guess, 0)
        repetition_score = scale_repetition_score(previous_occurrences, len(guess_counts))
        guess_counts[guess] = previous_occurrences + 1

        # calculate custom reward signals from the feedback
        if not feedback:
            green_score = 0.0
            yellow_score = 0.0
        else:
            green_count, yellow_count = extract_feedback_counts(feedback)
            green_score = green_count / 5.0
            yellow_score = yellow_count / 5.0

        repetition_scores.append(repetition_score)
        green_scores.append(green_score)
        yellow_scores.append(yellow_score)
        correct_scores.append(correct_score)

    correct_reward_value = correct_scores[-1] if correct_scores else (raw_rewards[-1] if raw_rewards else 0.0)

    return {
        "prompt_ids": prompt_ids,
        "completion_ids": completion_ids,
        "logprobs": logprobs,
        "raw_rewards": raw_rewards,
        "correct_reward": correct_reward_value,
        "green_reward": green_scores[-1] if green_scores else 0.0,
        "yellow_reward": yellow_scores[-1] if yellow_scores else 0.0,
        "repetition_reward": repetition_scores[-1] if repetition_scores else 0.0,
    }
```
The environment's own reward signal is based on completing the game. We found that most models struggle to ever win, so we added a number of custom reward functions to help the model learn to play the game incrementally: at first, the model learns to cover new letters and avoid repeating guesses; as it improves, it learns to win the game.
### Reward Functions
We log four reward streams that encourage the model to solve the puzzle, cover new letters, and avoid repeating guesses:
- `reward_correct`: final win/loss signal from the environment.
- `reward_greens`: density of green letters in the last feedback.
- `reward_yellows`: density of yellow letters in the last feedback.
- `reward_repetition`: penalty for guessing the same token multiple times.
```python
from typing import Dict, List, Optional


def reward_correct(completions: List[str], **kwargs: Optional[Dict]) -> List[float]:
    rewards = kwargs.get("correct_reward") if kwargs else None
    return [float(r) for r in rewards] if rewards is not None else [0.0] * len(completions)


def reward_greens(completions: List[str], **kwargs: Optional[Dict]) -> List[float]:
    rewards = kwargs.get("green_reward") if kwargs else None
    return [float(r) for r in rewards] if rewards is not None else [0.0] * len(completions)


def reward_yellows(completions: List[str], **kwargs: Optional[Dict]) -> List[float]:
    rewards = kwargs.get("yellow_reward") if kwargs else None
    return [float(r) for r in rewards] if rewards is not None else [0.0] * len(completions)


def reward_repetition(completions: List[str], **kwargs: Optional[Dict]) -> List[float]:
    rewards = kwargs.get("repetition_reward") if kwargs else None
    return [float(r) for r in rewards] if rewards is not None else [0.0] * len(completions)
```
### Training the Model
The training script wires the custom rollout and rewards into `GRPOTrainer`. The CLI exposes the configuration used during development as defaults, so you can override endpoints or hyperparameters at launch time.
```python
parser = argparse.ArgumentParser()
# ... add CLI arguments with sensible defaults ...
cli_args = parser.parse_args()

trainer = GRPOTrainer(
    model=cli_args.model_id,
    processing_class=tokenizer,
    reward_funcs=[
        reward_correct,
        reward_greens,
        reward_yellows,
        reward_repetition,
    ],
    train_dataset=dataset,
    args=grpo_config,
    rollout_func=lambda prompts, args, processing_class: rollout_func(
        env=env,
        tokenizer=tokenizer,
        prompts=prompts,
        args=args,
        cli_args=cli_args,
        system_prompt=system_prompt,
    ),
)
trainer.train()
```
### Running the Example
The example requires two GPUs:
```bash
# Terminal 1: Start vLLM inference server
CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen2.5-0.5B-Instruct --host 0.0.0.0 --port 8000
# Terminal 2: Run GRPO training with OpenEnv
CUDA_VISIBLE_DEVICES=1 python examples/scripts/openenv/wordle.py
```
### Results
The resulting model improves its performance on the game, both by reducing the number of repetitions and by increasing the number of correct guesses. However, the Qwen3-1.7B model we trained is not able to consistently win the game. The following reward curve shows the coverage of the model's guesses and the coverage of correct Y and G letters.
<iframe src="https://burtenshaw-wordle-grpo.hf.space/?project=group-Qwen-Qwen3-17B&metrics=train/rewards/reward_coverage/mean&runs=run-2025-10-26_09-39-49&sidebar=hidden&navbar=hidden" style="width:600px; height:500px; border:0;"></iframe>
We experimented with larger models like `gpt-oss-20b` and found that the model was able to consistently win the game. However, training it requires significantly more compute. Why not try it out yourself?

View File

@ -1,6 +1,6 @@
# ORPO Trainer
[![](https://img.shields.io/badge/All_models-ORPO-blue)](https://huggingface.co/models?other=orpo,trl) [![](https://img.shields.io/badge/smol_course-Chapter_2-yellow)](https://github.com/huggingface/smol-course/tree/main/2_preference_alignment)
[![model badge](https://img.shields.io/badge/All_models-ORPO-blue)](https://huggingface.co/models?other=orpo,trl) [![model badge](https://img.shields.io/badge/smol_course-Chapter_2-yellow)](https://github.com/huggingface/smol-course/tree/main/2_preference_alignment)
## Overview
@ -54,7 +54,7 @@ accelerate launch train_orpo.py
Distributed across 8 GPUs, the training takes approximately 30 minutes. You can verify the training progress by checking the reward graph. An increasing trend in the reward margin indicates that the model is improving and generating better responses over time.
![](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/orpo-qwen2-reward-margin.png)
![orpo qwen2 reward margin](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/orpo-qwen2-reward-margin.png)
To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-ORPO) performs, you can use the [Transformers Chat CLI](https://huggingface.co/docs/transformers/quicktour#chat-with-text-generation-models).
@ -64,11 +64,11 @@ What is the best programming language?
<strong><span style="color: blue;">&lt;trl-lib/Qwen2-0.5B-ORPO&gt;:</span></strong>
It's challenging to determine the best programming language as no one language is perfect, as the complexity of a task and the type of project are significant factors. Some popular languages include Java, Python, JavaScript, and
C++. If you have specific needs or requirements for a specific project, it's important to choose the language that best suits those needs.
Here are some other factors to consider when choosing a programming language for a project:
<strong><span style="color: green;">• Language proficiency:</span></strong> A good programming language is more likely to be easy to understand and use, and will allow developers to collaborate on projects more efficiently.
<strong><span style="color: green;">• Language proficiency:</span></strong> A good programming language is more likely to be easy to understand and use, and will allow developers to collaborate on projects more efficiently.
<strong><span style="color: green;">• Ease of use:</span></strong> There are tools and libraries available to make programming more accessible, so developers should choose a language that can help them get started easier.
<strong><span style="color: green;">• Code readability:</span></strong> A clear and concise codebase should be easy to read and understand, especially when working with large projects.
<strong><span style="color: green;">• Tool and framework support:</span></strong> There are numerous libraries available for Python, Java, and JavaScript, along with tools like IDEs and static code analysis tools.
@ -118,7 +118,7 @@ While training and evaluating, we record the following reward metrics:
- `log_odds_chosen`: the mean log odds ratio of the chosen responses over the rejected responses
- `log_odds_ratio`: the mean of the `log(sigmoid(log_odds_chosen))`
- `nll_loss`: the mean negative log likelihood loss from the SFT part of the loss over chosen responses
## ORPOTrainer
[[autodoc]] ORPOTrainer

View File

@ -1,10 +1,7 @@
# Paper Index
<Tip warning={true}>
Section under construction. Feel free to contribute!
</Tip>
> [!WARNING]
> Section under construction. Feel free to contribute!
## Group Relative Policy Optimization
@ -32,6 +29,8 @@ training_args = GRPOConfig(
Note that this method only has an effect when training goes slightly off-policy—for example, when `steps_per_generation > gradient_accumulation_steps` or `num_iterations > 1`. Otherwise, it is effectively equivalent to no modification.
TRL also provides an experimental implementation of GSPO-token; see [Experimental - GSPO-Token](experimental#gspo-token).
#### Policy ratio: GRPO vs. GSPO
In GSPO, the policy ratio is defined at the sequence level. In other words, it is the ratio between the probability of the current policy generating a sequence and the probability of the old policy generating that same sequence.
@ -171,7 +170,7 @@ $$
}
$$
Despite \\( \textcolor{red}{\pi_{\text{inference}}} \\) and \\( \textcolor{blue}{\pi_{\text{training}}} \\) sharing the same model parameters \\( \theta \\), they can produce significantly different token probabilities. This unexpected behavior implicitly breaks the on-policy assumption, and silently turns training off-policy.
Truncated Importance Sampling (TIS) addresses this issue by adapting the model update via importance-sampling correction. The gradient computation of the aforementioned PPO objective becomes
@ -202,6 +201,37 @@ training_args = GRPOConfig(
)
```
### Sample More to Think Less: Group Filtered Policy Optimization for Concise Reasoning
**📜 Paper**: https://huggingface.co/papers/2508.09726
See [Experimental - GFPO](experimental#gfpo).
### Perception-Aware Policy Optimization for Multimodal Reasoning
**📜 Paper**: https://huggingface.co/papers/2507.06448
A novel policy gradient algorithm that encourages VLMs to learn to perceive while learning to reason. This is a TRL adaptation of PAPO; note that it is not the official implementation. The official code can be found at [MikeWangWZHL/PAPO](https://github.com/MikeWangWZHL/PAPO).
```python
from trl.experimental.papo import PAPOConfig, PAPOTrainer

training_args = PAPOConfig(
    # PAPO-specific params
    perception_loss_weight=0.01,  # Weight for perception loss
    mask_ratio=0.6,               # Ratio of the image to mask
    mask_type="random",           # Use random patch masking (recommended)
    der_loss_weight1=0.02,
    der_loss_weight2=0.02,
    # ...other GRPO params...
)

trainer = PAPOTrainer(
    args=training_args,
    ...
)
```
## Direct Preference Optimization
Papers relating to the [`DPOTrainer`]
@ -261,7 +291,7 @@ These parameters only appear in the [published version](https://openreview.net/p
### Towards Efficient and Exact Optimization of Language Model Alignment
**📜 Paper**: https://huggingface.co/papers/2305.10425
**📜 Paper**: https://huggingface.co/papers/2402.00856
Efficient exact optimization (EXO) method is proposed to align language models with human preferences, providing a guaranteed and efficient alternative to reinforcement learning and direct preference optimization. To reproduce the paper's setting, use this configuration:
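A minimal sketch of that configuration (assuming EXO is selected via the `loss_type="exo_pair"` option of [`DPOConfig`]):
```python
from trl import DPOConfig

training_args = DPOConfig(
    loss_type="exo_pair",  # assumed option name for the EXO loss
    # plus your other DPO settings
)
```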
@ -333,7 +363,7 @@ training_args = DPOConfig(
)
```
For the unpaired version, the user should utilize `BCOConfig` and `BCOTrainer`.
For the unpaired version, the user should utilize [`experimental.bco.BCOConfig`] and [`experimental.bco.BCOTrainer`].
### Self-Play Preference Optimization for Language Model Alignment
@ -453,10 +483,7 @@ trainer = SFTTrainer(
Dynamic Fine-Tuning (DFT) improves the generalization of Large Language Models (LLMs) by dynamically rescaling gradients, outperforming standard Supervised Fine-Tuning (SFT) and showing competitive results in offline reinforcement learning.
$$
\mathcal{L}_{\text{DFT}}(\theta)
= \mathbb{E}_{(x,y) \sim \mathcal{D}} \left[ - \sum_{t=1}^{|y|}
\textcolor{red}{\text{sg}\big(\pi_\theta(y_t \mid y_{<t}, x)\big)}
\; \log \pi_\theta(y_t \mid y_{<t}, x) \right]
\mathcal{L}_{\text{DFT}}(\theta) = \mathbb{E}_{(x,y) \sim \mathcal{D}} \left[ - \sum_{t=1}^{|y|} \textcolor{red}{\text{sg}\big(\pi_\theta(y_t \mid y_{<t}, x)\big)} \; \log \pi_\theta(y_t \mid y_{<t}, x) \right]
$$
where \\( \text{sg}(\cdot) \\) is the stop-gradient operator. To use DFT with SFT as described in the paper, you can use the `loss_type="dft"` argument:
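A minimal configuration sketch:
```python
from trl import SFTConfig

training_args = SFTConfig(
    loss_type="dft",
    # plus your other SFT settings
)
```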
@ -528,3 +555,97 @@ training_args = CPOConfig(
...
)
```
## Reward Modeling
Papers relating to the [`RewardTrainer`]
### Helping or Herding? Reward Model Ensembles Mitigate but do not Eliminate Reward Hacking
**📜 Paper**: https://huggingface.co/papers/2312.09244
This paper proposed an auxiliary loss function designed to directly learn a centered reward model. This auxiliary loss minimizes the squared sum of the rewards, encouraging the model to naturally produce mean-zero outputs and thereby resolving the issue of underdetermination.
$$
\mathcal{L}(\theta) = - \mathbb{E}_{(x,y^+,y^-) \sim \mathcal{D}} \left[ \log \sigma(r_\theta(x, y^+) - r_\theta(x, y^-)) \textcolor{red}{- \eta \cdot (r_\theta(x, y^+) + r_\theta(x, y^-))^2} \right].
$$
To use this auxiliary loss with [`RewardTrainer`], you can use the `center_rewards_coefficient` argument in [`RewardConfig`] as follows:
```python
from trl import RewardConfig

training_args = RewardConfig(
    center_rewards_coefficient=0.01,  # η in the paper
    ...
)
```
### Llama 2: Open Foundation and Fine-Tuned Chat Models
**📜 Paper**: https://huggingface.co/papers/2307.09288
In this paper, the authors propose to leverage the fact that their preference ratings are decomposed on a four-point scale (e.g., _significantly better_) to provide more informative feedback to the reward model. This is done by adding a margin to the loss function, which encourages the reward model to assign larger gaps in scores for pairs with higher preference ratings.
$$
\mathcal{L}(\theta) = - \mathbb{E}_{(x,y^+,y^-,\textcolor{red}{m}) \sim \mathcal{D}} \left[ \log \sigma(r_\theta(x, y^+) - r_\theta(x, y^-) \textcolor{red}{- m}) \right].
$$
You can add a margin to the loss by adding a `margin` column to the dataset. The following example shows how to set up the "Margin Small" setting of the paper.
```python
def add_margin(example):
    preference_to_margin = {
        "significantly better": 1.0,
        "better": 2.0 / 3.0,
        "slightly better": 1.0 / 3.0,
        "negligibly better / unsure": 0.0,
    }
    return {"margin": preference_to_margin[example["preference_label"]]}


dataset = dataset.map(add_margin)
```
## Distillation
Papers relating to training a student model with the help of a teacher model.
### On-Policy Distillation
**📰 Blog**: https://thinkingmachines.ai/blog/on-policy-distillation/
On-Policy Distillation involves a student model generating rollouts for each batch of training data. We then obtain the probability distributions for each token of the rollouts from both the student and teacher models. The student model is optimized to minimize the reverse Kullback-Leibler (KL) divergence between its own token distributions and those of the teacher model.
| Method | Sampling | Reward signal |
|-------------------------|------------|---------------|
| Supervised finetuning | off-policy | dense |
| Reinforcement learning | on-policy | sparse |
| On-policy distillation | on-policy | dense |
On-Policy Distillation has been shown to outperform SFT and GRPO, and can be used to restore generalization capabilities lost during SFT.
Additionally, on-policy distillation is more compute-efficient and less prone to overfitting when trained with limited data.
To train a model with on-policy distillation using TRL, you can use the following configuration, with the [`GKDTrainer`] and [`GKDConfig`]:
```python
from trl import GKDConfig

config = GKDConfig(
    lmbda=1.0,  # student produces rollouts for all batches
    beta=1.0,   # to ensure reverse-kl as the loss function
    teacher_model_name_or_path="teacher-model",  # specify the teacher model
)
```
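As a rough sketch of how this config might be wired into the trainer (assuming the teacher is picked up from `teacher_model_name_or_path` in the config, and with placeholder model and dataset names):
```python
from datasets import load_dataset
from trl import GKDTrainer

dataset = load_dataset("my-dataset", split="train")  # hypothetical prompt-completion dataset

trainer = GKDTrainer(
    model="student-model",  # placeholder student checkpoint
    args=config,            # the GKDConfig from above
    train_dataset=dataset,
)
trainer.train()
```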
Alternatively, you can use the [`GOLDTrainer`] and [`GOLDConfig`] to perform on-policy distillation with a similar configuration:
```python
from trl.experimental import GOLDConfig

config = GOLDConfig(
    lmbda=1.0,  # student produces rollouts for all batches
    beta=1.0,   # to ensure reverse-kl as the loss function
    teacher_model_name_or_path="teacher-model",  # specify the teacher model
)
```

View File

@ -0,0 +1,20 @@
# PAPO Trainer
[![model badge](https://img.shields.io/badge/All_models-PAPO-blue)](https://huggingface.co/models?other=papo,trl)
TRL supports the Perception-Aware Policy Optimization (PAPO) as described in the paper [Perception-Aware Policy Optimization for Multimodal Reasoning](https://huggingface.co/papers/2507.06448) by [Zhenhailong Wang](https://huggingface.co/mikewang), Xuehang Guo, Sofia Stoica, [Haiyang Xu](https://huggingface.co/xhyandwyy), Hongru Wang, Hyeonjeong Ha, Xiusi Chen, Yangyi Chen, Ming Yan, Fei Huang, Heng Ji
The abstract from the paper is the following:
> Reinforcement Learning with Verifiable Rewards (RLVR) has proven to be a highly effective strategy for endowing Large Language Models (LLMs) with robust multi-step reasoning abilities. However, its design and optimizations remain tailored to purely textual domains, resulting in suboptimal performance when applied to multimodal reasoning tasks. In particular, we observe that a major source of error in current multimodal reasoning lies in the perception of visual inputs. To address this bottleneck, we propose Perception-Aware Policy Optimization (PAPO), a simple yet effective extension of GRPO that encourages the model to learn to perceive while learning to reason, entirely from internal supervision signals. Notably, PAPO does not rely on additional data curation, external reward models, or proprietary models. Specifically, we introduce the Implicit Perception Loss in the form of a KL divergence term to the GRPO objective, which, despite its simplicity, yields significant overall improvements (4.4%) on diverse multimodal benchmarks. The improvements are more pronounced, approaching 8.0%, on tasks with high vision dependency. We also observe a substantial reduction (30.5%) in perception errors, indicating improved perceptual capabilities with PAPO. We conduct comprehensive analysis of PAPO and identify a unique loss hacking issue, which we rigorously analyze and mitigate through a Double Entropy Loss. Overall, our work introduces a deeper integration of perception-aware supervision into RLVR learning objectives and lays the groundwork for a new RL framework that encourages visually grounded reasoning. Project page: https://mikewangwzhl.github.io/PAPO.
## PAPOTrainer
[[autodoc]] experimental.papo.PAPOTrainer
- train
- save_model
- push_to_hub
## PAPOConfig
[[autodoc]] experimental.papo.PAPOConfig

View File

@ -3,17 +3,10 @@
The notebooks and scripts in these examples show how to use Low Rank Adaptation (LoRA) to fine-tune models in a memory-efficient manner. Most PEFT methods in the `peft` library are supported, but note that some, such as prompt tuning, are not.
For more information on LoRA, see the [original paper](https://huggingface.co/papers/2106.09685).
Here's an overview of the `peft`-enabled notebooks and scripts in the [trl repository](https://github.com/huggingface/trl/tree/main/examples):
| File | Task | Description | Colab link |
|---|---|---|---|
| [`stack_llama/rl_training.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama/scripts/rl_training.py) | RLHF | Distributed fine-tuning of the 7b parameter LLaMA models with a learned reward model and `peft`. | |
| [`stack_llama/reward_modeling.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama/scripts/reward_modeling.py) | Reward Modeling | Distributed training of the 7b parameter LLaMA reward model with `peft`. | |
| [`stack_llama/supervised_finetuning.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama/scripts/supervised_finetuning.py) | SFT | Distributed instruction/supervised fine-tuning of the 7b parameter LLaMA model with `peft`. | |
## Installation
Note: peft is in active development, so we install directly from their GitHub page.
Peft also relies on the latest version of transformers.
```bash
pip install trl[peft]
@ -27,7 +20,7 @@ Note: if you don't want to log with `wandb` remove `log_with="wandb"` in the scr
## How to use it?
Simply declare a `PeftConfig` object in your script and pass it through `.from_pretrained` to load the TRL+PEFT model.
Simply declare a [`~peft.PeftConfig`] object in your script and pass it through `.from_pretrained` to load the TRL+PEFT model.
```python
from peft import LoraConfig
@ -47,7 +40,9 @@ model = AutoModelForCausalLMWithValueHead.from_pretrained(
peft_config=lora_config,
)
```
And if you want to load your model in 8bit precision:
```python
pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
config.model_name,
@ -55,7 +50,9 @@ pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
peft_config=lora_config,
)
```
... or in 4bit precision:
```python
pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
config.model_name,
@ -64,7 +61,6 @@ pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
)
```
## Launch scripts
The `trl` library is powered by `accelerate`. As such it is best to configure and launch trainings with the following commands:
@ -77,6 +73,7 @@ accelerate launch examples/scripts/ppo.py --use_peft # launch`es training
## Using `trl` + `peft` and Data Parallelism
You can scale up to as many GPUs as you want, as long as you are able to fit the training process in a single device. The only tweak you need to apply is to load the model as follows:
```python
from peft import LoraConfig
...
@ -94,7 +91,9 @@ pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
peft_config=lora_config,
)
```
And if you want to load your model in 8bit precision:
```python
pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
config.model_name,
@ -102,7 +101,9 @@ pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
load_in_8bit=True,
)
```
... or in 4bit precision:
```python
pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
config.model_name,
@ -110,21 +111,108 @@ pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
load_in_4bit=True,
)
```
Finally, make sure that the rewards are computed on the correct device as well; for that, you can use `ppo_trainer.model.current_device`.
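For instance, a minimal sketch (`scores` here is a hypothetical list of raw reward values):
```python
import torch

# Place reward tensors on the same device as the model's active adapter / value head
device = ppo_trainer.model.current_device
rewards = [torch.tensor(score, device=device) for score in scores]  # `scores` is hypothetical
```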
## Multi-Adapter RL Training
You can use a single base model with multiple PEFT adapters for the entire PPO algorithm - including retrieving reference logits, computing active logits, and calculating rewards. This approach is useful for memory-efficient RL training.
> [!WARNING]
> This feature is experimental and convergence has not been extensively tested. We encourage the community to share feedback and report any issues.
### Requirements
Install PEFT and optionally bitsandbytes for 8-bit models:
```bash
pip install peft bitsandbytes
```
### Training Workflow
The multi-adapter approach requires three stages:
1. **Supervised Fine-Tuning (SFT)**: Train a base model on your target domain (e.g., IMDB dataset) using `SFTTrainer`
2. **Reward Model Training**: Train a reward model adapter using PEFT and `RewardTrainer` (see [reward modeling example](https://github.com/huggingface/trl/tree/main/examples/scripts/reward_modeling.py))
3. **PPO Training**: Fine-tune new adapters using PPO with the reward adapter
> [!IMPORTANT]
> Use the same base model (architecture and weights) for stages 2 & 3.
### Basic Usage
After training your reward adapter and pushing it to the Hub:
```python
from peft import LoraConfig
from trl import AutoModelForCausalLMWithValueHead, PPOTrainer

model_name = "huggyllama/llama-7b"
rm_adapter_id = "trl-lib/llama-7b-hh-rm-adapter"

# Configure PPO adapter
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

# Load model with reward adapter
model = AutoModelForCausalLMWithValueHead.from_pretrained(
    model_name,
    peft_config=lora_config,
    reward_adapter=rm_adapter_id,
)

trainer = PPOTrainer(model=model, ...)
```
In your training loop, compute rewards using:
```python
rewards = trainer.model.compute_reward_score(**inputs)
```
### Advanced Features
#### Multiple Policy Adapters
You can train multiple adapters on the same base model for different policies. Control which adapter to activate using the `ppo_adapter_name` argument:
```python
adapter_name_policy_1 = "policy_1"
rewards = trainer.model.compute_reward_score(**inputs, ppo_adapter_name=adapter_name_policy_1)
```
#### Quantized Base Models
For memory-efficient training, load the base model in 8-bit or 4-bit while keeping adapters in float32:
```python
from transformers import BitsAndBytesConfig

model = AutoModelForCausalLMWithValueHead.from_pretrained(
    model_name,
    peft_config=lora_config,
    reward_adapter=rm_adapter_id,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
)
```
## Naive pipeline parallelism (NPP) for large models (>60B models)
The `trl` library also supports naive pipeline parallelism (NPP) for large models (>60B models). This paradigm is a simple way to parallelize the model across multiple GPUs: we load the model and the adapters across multiple GPUs, and the activations and gradients are naively communicated across the GPUs. This supports `int8` models as well as other `dtype` models.
<div style="text-align: center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-npp.png">
</div>
![NPP](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-npp.png)
### How to use NPP?
Simply load your model with a custom `device_map` argument in `from_pretrained` to split your model across multiple devices. Check out this [nice tutorial](https://github.com/huggingface/blog/blob/main/accelerate-large-models.md) on how to properly create a `device_map` for your model.
Also make sure to have the `lm_head` module on the first GPU device, as it may throw an error if it is not on the first device. At the time of writing, you need to install the `main` branch of `accelerate`: `pip install git+https://github.com/huggingface/accelerate.git@main` and `peft`: `pip install git+https://github.com/huggingface/peft.git@main`.
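For example, a minimal sketch of loading with a `device_map`:
```python
from trl import AutoModelForCausalLMWithValueHead

# "auto" lets accelerate infer a split across the available GPUs; alternatively,
# pass an explicit dict mapping module names to devices, keeping `lm_head` on
# the first device as noted above.
model = AutoModelForCausalLMWithValueHead.from_pretrained(
    "huggyllama/llama-7b",
    device_map="auto",
)
```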
### Launch scripts

View File

@ -1,10 +1,11 @@
# PPO Trainer
[![](https://img.shields.io/badge/All_models-PPO-blue)](https://huggingface.co/models?other=ppo,trl)
[![model badge](https://img.shields.io/badge/All_models-PPO-blue)](https://huggingface.co/models?other=ppo,trl)
TRL supports training LLMs with [Proximal Policy Optimization (PPO)](https://huggingface.co/papers/1707.06347).
References:
- [Fine-Tuning Language Models from Human Preferences](https://github.com/openai/lm-human-preferences)
- [Learning to Summarize from Human Feedback](https://github.com/openai/summarize-from-feedback)
- [The N Implementation Details of RLHF with PPO](https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo)
@ -31,49 +32,45 @@ python examples/scripts/ppo/ppo.py \
--missing_eos_penalty 1.0
```
## Explanation of the logged metrics
The logged metrics are as follows. Here is an example [tracked run at Weights and Biases](https://wandb.ai/huggingface/trl/runs/dd2o3g35)
* `eps`: Tracks the number of episodes per second.
* `objective/kl`: The mean Kullback-Leibler (KL) divergence between the current policy and reference policy.
* `objective/entropy`: The mean entropy of the policy, indicating the randomness of the actions chosen by the policy.
* `objective/non_score_reward`: The mean reward from non-score-related sources, basically `beta * kl.sum(1)`, where `beta` is the KL penalty coefficient and `kl` is the per-token KL divergence.
* `objective/rlhf_reward`: The mean RLHF reward, which is `score - non_score_reward`.
* `objective/scores`: The mean scores returned by the reward model / environment.
* `policy/approxkl_avg`: The average approximate KL divergence between consecutive PPO policies. Note that this is not the same as `objective/kl`.
* `policy/clipfrac_avg`: The average fraction of policy updates that are clipped, indicating how often the policy updates are constrained to prevent large changes.
* `loss/policy_avg`: The average policy loss, indicating how well the policy is performing.
* `loss/value_avg`: The average value loss, indicating the difference between the predicted value and the actual reward.
* `val/clipfrac_avg`: The average fraction of value function updates that are clipped, similar to policy/clipfrac_avg but for the value function.
* `policy/entropy_avg`: The average entropy of the policy during training, indicating how diverse the policy's actions are.
* `val/ratio`: The mean ratio of the current policy probability to the old policy probability, providing a measure of how much the policy has changed.
* `val/ratio_var`: The variance of the `val/ratio`, indicating the variability in policy changes.
* `val/num_eos_tokens`: The number of end-of-sequence (EOS) tokens generated, which can indicate the number of complete responses.
* `lr`: lr: The current learning rate used by the optimizer.
* `episode`: episode: The current episode count in the training process.
- `eps`: Tracks the number of episodes per second.
- `objective/kl`: The mean Kullback-Leibler (KL) divergence between the current policy and reference policy.
- `objective/entropy`: The mean entropy of the policy, indicating the randomness of the actions chosen by the policy.
- `objective/non_score_reward`: The mean reward from non-score-related sources, basically `beta * kl.sum(1)`, where `beta` is the KL penalty coefficient and `kl` is the per-token KL divergence.
- `objective/rlhf_reward`: The mean RLHF reward, which is `score - non_score_reward`.
- `objective/scores`: The mean scores returned by the reward model / environment.
- `policy/approxkl_avg`: The average approximate KL divergence between consecutive PPO policies. Note that this is not the same as `objective/kl`.
- `policy/clipfrac_avg`: The average fraction of policy updates that are clipped, indicating how often the policy updates are constrained to prevent large changes.
- `loss/policy_avg`: The average policy loss, indicating how well the policy is performing.
- `loss/value_avg`: The average value loss, indicating the difference between the predicted value and the actual reward.
- `val/clipfrac_avg`: The average fraction of value function updates that are clipped, similar to policy/clipfrac_avg but for the value function.
- `policy/entropy_avg`: The average entropy of the policy during training, indicating how diverse the policy's actions are.
- `val/ratio`: The mean ratio of the current policy probability to the old policy probability, providing a measure of how much the policy has changed.
- `val/ratio_var`: The variance of the `val/ratio`, indicating the variability in policy changes.
- `val/num_eos_tokens`: The number of end-of-sequence (EOS) tokens generated, which can indicate the number of complete responses.
- `lr`: The current learning rate used by the optimizer.
- `episode`: The current episode count in the training process.
## Cookbook
* Debugging TIP: `objective/rlhf_reward`: this is the ultimate objective of the RLHF training. If training works as intended, this metric should keep going up.
* Debugging TIP: `val/ratio`: this number should float around 1.0, and it gets clipped by `--cliprange 0.2` with PPO's surrogate loss. So if this `ratio` is too high like 2.0 or 1000.0 or too small like 0.1, it means the updates between consecutive policies are too drastic. You should try understand why this is happening and try to fix it.
* Memory TIP: If you are running out of memory, you can try to reduce the `--per_device_train_batch_size` or increase the `--gradient_accumulation_steps` to reduce the memory footprint.
* Memory TIP: If you have multiple GPUs, you can also run training with DeepSpeed stage 3 to reduce the memory footprint `accelerate launch --config_file examples/accelerate_configs/deepspeed_zero3.yaml`.
* Usage TIP: We recommend to use the "EOS trick" via `--missing_eos_penalty`, which subtracts a static scalar penalty from the score of completions that do not end with an EOS token. This can help the model learn to generate more coherent completions.
- Debugging TIP: `objective/rlhf_reward`: this is the ultimate objective of the RLHF training. If training works as intended, this metric should keep going up.
- Debugging TIP: `val/ratio`: this number should float around 1.0, and it gets clipped by `--cliprange 0.2` with PPO's surrogate loss. So if this `ratio` is too high (like 2.0 or 1000.0) or too small (like 0.1), it means the updates between consecutive policies are too drastic. You should try to understand why this is happening and try to fix it.
- Memory TIP: If you are running out of memory, you can try to reduce the `--per_device_train_batch_size` or increase the `--gradient_accumulation_steps` to reduce the memory footprint.
- Memory TIP: If you have multiple GPUs, you can also run training with DeepSpeed stage 3 to reduce the memory footprint: `accelerate launch --config_file examples/accelerate_configs/deepspeed_zero3.yaml`.
- Usage TIP: We recommend using the "EOS trick" via `--missing_eos_penalty`, which subtracts a static scalar penalty from the score of completions that do not end with an EOS token. This can help the model learn to generate more coherent completions.
## What is my model doing exactly?
To help you understand what your model is doing, we periodically log some sample completions from the model. In an example [tracked run at Weights and Biases](https://wandb.ai/huggingface/trl/runs/dd2o3g35), they look like the following, allowing you to see the model's responses at different stages of training. By default we generate `--num_sample_generations 10` during training, but you can customize the number of generations.
![](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/ppov2_completions.gif)
![ppov2_completions](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/ppov2_completions.gif)
In the logs, the sampled generations look like
```
```txt
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓
┃ query ┃ model response ┃ score ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩
@ -177,7 +174,7 @@ This PPO implementation is based on the [The N+ Implementation Details of RLHF w
To validate that the PPO implementation works, we ran an experiment on the 1B model. Here is the command we used to run the experiment. We take the SFT / RM models directly from [The N+ Implementation Details of RLHF with PPO: A Case Study on TL;DR Summarization](https://huggingface.co/papers/2403.17031).
```
```shell
accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml \
examples/scripts/ppo/ppo_tldr.py \
--output_dir models/minimal/ppo_tldr \
@ -212,8 +209,7 @@ The PPO checkpoint gets a 64.7% preferred rate vs the 33.0% preference rate of t
Metrics:
![](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/ppov2.png)
![PPO v2](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/ppov2.png)
```bash
# pip install openrlbenchmark==0.2.1a5

View File

@ -1,12 +1,9 @@
# PRM Trainer
[![](https://img.shields.io/badge/All_models-PRM-blue)](https://huggingface.co/models?other=prm,trl)
[![model badge](https://img.shields.io/badge/All_models-PRM-blue)](https://huggingface.co/models?other=prm,trl)
<Tip warning={true}>
PRM Trainer is an experimental API which is subject to change at any time.
</Tip>
> [!WARNING]
> PRM Trainer is an experimental API which is subject to change at any time.
## Overview
@ -18,7 +15,6 @@ The abstract from the paper is the following:
This post-training method was contributed by [Gaetan Lopez](https://github.com/gaetanlop), [Lewis Tunstall](https://huggingface.co/lewtun), [Quentin Gallouédec](https://huggingface.co/qgallouedec) and [Agustín Piqueres](https://huggingface.co/plaguss).
## Quick start
This example demonstrates how to train a model using the PRM method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B) as the base model. We use the stepwise supervision data from the [Math Shepherd dataset](https://huggingface.co/datasets/trl-lib/math_shepherd). You can view the data in the dataset here:
@ -57,7 +53,6 @@ Distributed across 8 GPUs, the training takes approximately 1 hour.
To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-Reward-Math-Sheperd) performs, you can use the following script.
```python
from datasets import load_dataset
from transformers import pipeline

View File

@ -1,6 +1,6 @@
# Quickstart
TRL is a comprehensive library for post-training foundation models using techniques like Supervised Fine-Tuning (SFT), Group Relative Policy Optimization (GRPO), and Direct Preference Optimization (DPO).
## Quick Examples
@ -51,6 +51,21 @@ trainer = DPOTrainer(
trainer.train()
```
### Reward Modeling
```python
from trl import RewardTrainer
from datasets import load_dataset

dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

trainer = RewardTrainer(
    model="Qwen/Qwen2.5-0.5B-Instruct",
    train_dataset=dataset,
)
trainer.train()
```
## Command Line Interface
Skip the code entirely - train directly from your terminal:
@ -63,6 +78,10 @@ trl sft --model_name_or_path Qwen/Qwen2.5-0.5B \
# DPO: Align with preferences
trl dpo --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct \
--dataset_name trl-lib/ultrafeedback_binarized
# Reward: Train a reward model
trl reward --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct \
--dataset_name trl-lib/ultrafeedback_binarized
```
## What's Next?
@ -72,7 +91,6 @@ trl dpo --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct \
- [SFT Trainer](sft_trainer) - Complete SFT guide
- [DPO Trainer](dpo_trainer) - Preference alignment
- [GRPO Trainer](grpo_trainer) - Group relative policy optimization
- [Training FAQ](how_to_train) - Common questions
### 🚀 Scale Up
@ -122,4 +140,4 @@ Try adjusting the learning rate:
training_args = SFTConfig(learning_rate=2e-5) # Good starting point
```
For more help, see our [Training FAQ](how_to_train) or open an [issue on GitHub](https://github.com/huggingface/trl/issues).
For more help, open an [issue on GitHub](https://github.com/huggingface/trl/issues).


@ -0,0 +1,390 @@
# RapidFire AI Integration
RapidFire AI is an open-source experiment execution framework that enables concurrent training of multiple TRL configurations on the same GPU(s) through intelligent chunk-based scheduling.
## Key Features
- **16-24× higher experimentation throughput** compared to sequential training.
- **Almost no code changes**: drop-in configuration wrappers around TRL's and PEFT's existing configs.
- **Interactive Control Operations**: real-time control to stop, resume, clone, and modify training runs in flight.
- **Automatic multi-GPU orchestration** with intelligent scheduling.
- **Full compatibility** with transformers, PEFT, SFTTrainer, DPOTrainer, and GRPOTrainer.
- **Full MLflow integration**: automatic experiment tracking and visualization.
- **Production-ready**: already used in production environments, with complete working examples.
### Problem It Solves
When fine-tuning or post-training with TRL, AI developers often need to:
- Try different hyperparameter configurations
- Compare different LoRA settings
- Test different prompt schemes
- Run ablation studies
**Current approach**: Train each config one after another → a slow and inefficient process
**With RapidFire AI**: Train all configs in one go, even on a single GPU → a 16-24× faster process
### How It Works
RapidFire AI employs **adaptive chunk-based scheduling**:
```
GPU Timeline (Single GPU):
Chunk 1: [Config A] → [Config B] → [Config C] → [Config D]
Chunk 2: [Config A] → [Config B] → [Config C] → [Config D]
Chunk 3: [Config A] → [Config B] → [Config C] → [Config D]
```
This enables:
- Early, incremental comparison of configurations on the same data subsets
- Efficient GPU utilization with minimal idle time
- Real-time, automated tracking of experiment metrics
- Dynamic control over in-flight runs to encourage broader experimentation
## Installation
### Prerequisites
- Python 3.12.x
- NVIDIA GPU with Compute Capability 7.x or 8.x
- CUDA Toolkit 11.8+
- PyTorch 2.7.1+
### pip install
```bash
pip install rapidfireai
```
Once installed, authenticate with Hugging Face and initialize RapidFire AI:
```bash
# Authenticate with Hugging Face
huggingface-cli login --token YOUR_TOKEN
# Workaround for current issue: https://github.com/huggingface/xet-core/issues/527
pip uninstall -y hf-xet
# Initialize RapidFire AI
rapidfireai init
# Start the RapidFire AI server
rapidfireai start
```
The dashboard will be available at `http://0.0.0.0:3000` where you can monitor and control experiments in real-time.
## Quick Start: SFT Training with Multiple Configs
Here's a complete example showing how to train multiple SFT configurations concurrently:
```python
from rapidfireai import Experiment
from rapidfireai.automl import List, RFGridSearch, RFModelConfig, RFLoraConfig, RFSFTConfig
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
# Load dataset
dataset = load_dataset("bitext/Bitext-customer-support-llm-chatbot-training-dataset")
train_dataset = dataset["train"].select(range(128)).shuffle(seed=42)
eval_dataset = dataset["train"].select(range(100, 124)).shuffle(seed=42)
# Define data formatting function
def formatting_function(row):
return {
"prompt": [
{"role": "system", "content": "You are a helpful customer support assistant."},
{"role": "user", "content": row["instruction"]},
],
"completion": [
{"role": "assistant", "content": row["response"]}
]
}
# Initialize experiment
experiment = Experiment(experiment_name="sft-customer-support")
# Define multiple LoRA configurations to compare
peft_configs = List([
RFLoraConfig(r=8, lora_alpha=16, lora_dropout=0.1,
target_modules=["q_proj", "v_proj"], bias="none"),
RFLoraConfig(r=32, lora_alpha=64, lora_dropout=0.1,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], bias="none")
])
# Define multiple training configurations
# 2 base configs × 2 PEFT configs = 4 total training runs
config_set = List([
RFModelConfig(
model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
peft_config=peft_configs,
training_args=RFSFTConfig( # Wraps TRL's SFTConfig
learning_rate=1e-3,
per_device_train_batch_size=4,
max_steps=128,
fp16=True,
),
model_type="causal_lm",
model_kwargs={"device_map": "auto", "torch_dtype": "auto", "use_cache": False},
formatting_func=formatting_function,
),
RFModelConfig(
model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
peft_config=peft_configs,
training_args=RFSFTConfig(
learning_rate=1e-4, # Different learning rate
per_device_train_batch_size=4,
max_steps=128,
fp16=True,
),
model_type="causal_lm",
model_kwargs={"device_map": "auto", "torch_dtype": "auto", "use_cache": False},
formatting_func=formatting_function,
)
])
# Define model creation function
def create_model(model_config):
model = AutoModelForCausalLM.from_pretrained(
model_config["model_name"],
**model_config["model_kwargs"]
)
tokenizer = AutoTokenizer.from_pretrained(model_config["model_name"])
return (model, tokenizer)
# Create grid search over all configurations
config_group = RFGridSearch(configs=config_set, trainer_type="SFT")
# Run all 4 configurations concurrently with chunk-based scheduling
experiment.run_fit(config_group, create_model, train_dataset, eval_dataset,
num_chunks=4, seed=42)
# End experiment
experiment.end()
```
### What Happens During Execution
When you run this example:
1. **Config Expansion**: 2 base configurations × 2 PEFT configs = 4 total training runs
2. **Chunk-based Scheduling**: Training data is divided into chunks, and all 4 configs train concurrently
3. **GPU Swapping**: Models are swapped in/out of GPU memory based on chunk boundaries
4. **Real-time Tracking**: All metrics visible in the dashboard at `http://localhost:3000`
5. **Interactive Control**: Stop, resume, or clone any configuration from the dashboard
This delivers **16-24× higher throughput** compared to training each configuration sequentially!
## Supported TRL Trainers
### SFTTrainer
Use `RFSFTConfig` as a drop-in replacement for `SFTConfig`:
```python
from rapidfireai.automl import RFSFTConfig
training_args = RFSFTConfig(
learning_rate=5e-5,
per_device_train_batch_size=4,
num_train_epochs=3,
    max_length=512,
# ... all other SFTConfig parameters supported
)
```
**Example Notebook**: [SFT for Customer Support](https://github.com/RapidFireAI/rapidfireai/blob/main/tutorial_notebooks/rf-tutorial-sft-chatqa-lite.ipynb)
### DPOTrainer
Use `RFDPOConfig` as a drop-in replacement for `DPOConfig`:
```python
from rapidfireai.automl import RFDPOConfig
training_args = RFDPOConfig(
beta=0.1,
loss_type="sigmoid",
max_prompt_length=512,
max_completion_length=512,
learning_rate=5e-4,
# ... all other DPOConfig parameters supported
)
```
**Example Notebook**: [DPO for Preference Alignment](https://github.com/RapidFireAI/rapidfireai/blob/main/tutorial_notebooks/rf-tutorial-dpo-alignment-lite.ipynb)
### GRPOTrainer
Use `RFGRPOConfig` as a drop-in replacement for `GRPOConfig`:
```python
from rapidfireai.automl import RFGRPOConfig
training_args = RFGRPOConfig(
learning_rate=5e-6,
num_generations=8,
max_prompt_length=256,
max_completion_length=256,
# ... all other GRPOConfig parameters supported
)
```
**Example Notebook**: [GRPO for Math Reasoning](https://github.com/RapidFireAI/rapidfireai/blob/main/tutorial_notebooks/rf-tutorial-grpo-mathreasoning-lite.ipynb)
## Core Concepts
### Chunk-Based Concurrent Training
RapidFire AI divides training data into chunks and alternates between configurations:
```
GPU Timeline (Single GPU):
Chunk 1: [Config A] → [Config B] → [Config C] → [Config D]
Chunk 2: [Config A] → [Config B] → [Config C] → [Config D]
Chunk 3: [Config A] → [Config B] → [Config C] → [Config D]
...
```
This approach maximizes GPU utilization and enables early comparison of configurations while maintaining training stability through automatic checkpointing.
### Interactive Control Operations (IC Ops)
Through the RapidFire AI dashboard, you can dynamically control running experiments:
- **Stop**: Pause a configuration (checkpointed automatically)
- **Resume**: Continue from last checkpoint
- **Clone**: Duplicate a configuration with modifications
- **Clone & Warm Start**: Clone and initialize from parent's weights
- **Delete**: Remove failed or unwanted runs
This enables adaptive experimentation where you can stop underperforming configs early and clone promising ones with tweaked hyperparameters.
### Multi-Config Experimentation
Use `RFGridSearch` or `RFRandomSearch` to automatically generate configuration combinations:
```python
# Grid search: tests all combinations
config_group = RFGridSearch(configs=config_list, trainer_type="SFT")
# Random search: samples N configurations
config_group = RFRandomSearch(configs=config_list, trainer_type="DPO", num_samples=10)
```
## Advanced Features
### PEFT/LoRA Integration
Full support for parameter-efficient fine-tuning:
```python
from rapidfireai.automl import RFLoraConfig
from peft import TaskType
lora_config = RFLoraConfig(
task_type=TaskType.CAUSAL_LM,
r=64,
lora_alpha=64,
lora_dropout=0.1,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
bias="none"
)
```
### Custom Reward Functions (GRPO)
Define multiple reward functions for GRPO training:
```python
def correctness_reward(prompts, completions, answer, **kwargs):
"""Reward for correct answers"""
responses = [completion[0]['content'] for completion in completions]
extracted = [extract_answer(r) for r in responses]
return [2.0 if r == a else 0.0 for r, a in zip(extracted, answer)]
def format_reward(completions, **kwargs):
"""Reward for proper formatting"""
import re
pattern = r"<reasoning>.*?</reasoning>\s*<answer>.*?</answer>"
responses = [completion[0]["content"] for completion in completions]
matches = [re.match(pattern, r) for r in responses]
return [0.5 if match else 0.0 for match in matches]
# Use in model config
config = RFModelConfig(
reward_funcs=[correctness_reward, format_reward],
# ... other parameters
)
```
### Multi-GPU Support
RapidFire AI automatically detects and utilizes all available GPUs. No special configuration needed - the scheduler automatically distributes configurations across GPUs.
## Best Practices
### Tuning Chunk Granularity
The `num_chunks` parameter controls swap frequency:
```python
# Fewer chunks = less overhead, less frequent comparison
experiment.run_fit(..., num_chunks=2)
# More chunks = more overhead, more frequent comparison
experiment.run_fit(..., num_chunks=16)
```
**Rule of thumb**: Start with `num_chunks=4` and adjust based on dataset size and number of configurations.
### Memory Management
For large models, use quantization:
```python
from transformers import BitsAndBytesConfig
import torch
bnb_config = BitsAndBytesConfig(
load_in_4bit=True,
bnb_4bit_compute_dtype=torch.bfloat16,
bnb_4bit_use_double_quant=True,
bnb_4bit_quant_type="nf4",
)
model_kwargs = {
"quantization_config": bnb_config,
"device_map": "auto",
}
```
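These kwargs can then be passed through `RFModelConfig`, as in the quick-start example (a sketch; only the memory-relevant parameters are shown):

```python
from rapidfireai.automl import RFModelConfig, RFSFTConfig

config = RFModelConfig(
    model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    model_kwargs=model_kwargs,  # includes the BitsAndBytesConfig defined above
    training_args=RFSFTConfig(per_device_train_batch_size=1, max_steps=128),
    model_type="causal_lm",
)
```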
## Performance Benchmarks
Based on internal benchmarks comparing sequential vs. RapidFire AI concurrent training:
| Scenario | Sequential Time | RapidFire AI Time | Speedup |
|----------|----------------|-------------------|---------|
| 4 configs, 1 GPU | 120 min | 7.5 min | 16× |
| 8 configs, 1 GPU | 240 min | 12 min | 20× |
| 4 configs, 2 GPUs | 60 min | 4 min | 15× |
| 8 configs, 4 GPUs | 60 min | 3 min | 20× |
*Benchmarks performed on NVIDIA A100 40GB with TinyLlama-1.1B and Llama-3.2-1B models*
## Troubleshooting
For troubleshooting guidance, see the [RapidFire AI Troubleshooting Guide](https://oss-docs.rapidfire.ai/en/latest/troubleshooting.html).
## Additional Resources
- **Colab Notebook**: [RapidFire AI in Google Colab](http://tinyurl.com/rapidfireai-colab)
- **Documentation**: [oss-docs.rapidfire.ai](https://oss-docs.rapidfire.ai)
- **GitHub**: [RapidFireAI/rapidfireai](https://github.com/RapidFireAI/rapidfireai)
- **PyPI**: [pypi.org/project/rapidfireai](https://pypi.org/project/rapidfireai/)
- **Discord**: [Join our Discord](https://discord.gg/6vSTtncKNN)
- **Tutorial Notebooks**: [GitHub Repository](https://github.com/RapidFireAI/rapidfireai/tree/main/tutorial_notebooks)
Learn more about RapidFire AI in their [official repository](https://github.com/RapidFireAI/rapidfireai) and [documentation](https://oss-docs.rapidfire.ai).


@ -1,18 +1,18 @@
# Reducing Memory Usage
<Tip warning={true}>
Training workflows can often be optimized to **reduce memory consumption**, and TRL provides several built-in features to help achieve this.
Section under construction. Feel free to contribute!
Below, we outline these techniques and recommend experimenting with different combinations to figure out which configuration works best for your specific setup.
</Tip>
Each method includes examples for the supported trainers. If you're unsure whether a technique is compatible with your trainer, please take a look at the corresponding trainer documentation.
For additional strategies, such as **gradient checkpointing**, which is supported across all trainers, see the [`transformers` performance guide](https://huggingface.co/docs/transformers/perf_train_gpu_one#gradient-checkpointing).
## Truncation
Sequence lengths in the dataset can vary widely. When data is batched, sequences are padded to match the longest one in the batch, which can cause high memory usage, even if most sequences are relatively short.
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/why_you_should_truncate.png" alt="Truncation prompt-completion" width="600"/>
</div>
![Truncation prompt-completion](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/why_you_should_truncate.png)
To reduce memory usage, it's important to truncate sequences to a reasonable length. While TRL trainers truncate sequences by default, you may want to adjust the default truncation length to better align with your specific use case.
@ -21,9 +21,7 @@ To reduce memory usage, it's important to truncate sequences to a reasonable len
DPO truncation is applied first to the prompt and to the completion via the `max_prompt_length` and `max_completion_length` parameters. The `max_length` parameter is then used to truncate the resulting sequence.
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/truncation_prompt_completion.png" alt="Truncation prompt-completion" width="600"/>
</div>
![DPO truncation](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/truncation_prompt_completion.png)
To set the truncation parameters, use the following code snippet:
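A minimal sketch of those parameters (values are illustrative):

```python
from trl import DPOConfig

training_args = DPOConfig(
    max_prompt_length=512,      # truncate the prompt first
    max_completion_length=512,  # then the completion
    max_length=1024,            # finally, the combined sequence
)
```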
@ -46,9 +44,7 @@ training_args = DPOConfig(..., max_completion_length=...)
SFT truncation is applied to the input sequence via the `max_length` parameter.
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/truncation_input_ids.png" alt="Truncation input ids" width="600"/>
</div>
![Truncation input ids](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/truncation_input_ids.png)
To set the truncation parameter, use the following code snippet:
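A minimal sketch (the value is illustrative):

```python
from trl import SFTConfig

training_args = SFTConfig(max_length=1024)
```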
@ -63,7 +59,7 @@ training_args = SFTConfig(..., max_length=...)
### How to choose the `max_length` value?
If `max_length` is too small, a significant portion of your tokens will be discarded and won't contribute to training. If it's too large, memory usage can spike, potentially leading to OOM (Out-Of-Memory) errors. Without packing or padding-free, a large `max_length` may also result in inefficient training, as many tokens will be padding.
If `max_length` is too small, a significant portion of your tokens will be discarded and won't contribute to training. If it's too large, memory usage can spike, potentially leading to out-of-memory (OOM) errors. Without packing or padding-free, a large `max_length` may also result in inefficient training, as many tokens will be padding.
To help you choose an appropriate value, we provide a utility to visualize the sequence length distribution in your dataset.
@ -71,30 +67,22 @@ To help you choose an appropriate value, we provide a utility to visualize the s
## Packing
<Tip>
This technique applies only to SFT.
</Tip>
> [!TIP]
> This technique is available only for **SFT** training and setups that use **FlashAttention** (or its variants).
[Truncation](#truncation) has several drawbacks:
1. **Loss of information**: Key data at the end of a sequence may be discarded.
2. **Choosing truncation length**: Too short loses data; too long undermines efficiency.
Packing, introduced in [Raffel et al., 2020](https://huggingface.co/papers/1910.10683), addresses these issues by grouping sequences instead of truncating. It concatenates and splits dataset sequences into the desired lengths.
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/packing_2.png" alt="Packing" width="600"/>
</div>
![Packing](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/packing_2.png)
Packing reduces padding by merging several sequences in one row when possible. We use an advanced method to be near-optimal in the way we pack the dataset. To enable packing, use `packing=True` in the [`SFTConfig`].
<Tip>
In TRL 0.18 and earlier, packing used a more aggressive method that reduced padding to almost nothing, but had the downside of breaking sequence continuity for a large fraction of the dataset. To revert to this strategy, use `packing_strategy="wrapped"` in `SFTConfig`.
</Tip>
> [!TIP]
> In TRL 0.18 and earlier, packing used a more aggressive method that reduced padding to almost nothing, but had the downside of breaking sequence continuity for a large fraction of the dataset. To revert to this strategy, use `packing_strategy="wrapped"` in [`SFTConfig`].
```python
from trl import SFTConfig
@ -102,49 +90,60 @@ from trl import SFTConfig
training_args = SFTConfig(..., packing=True, max_length=512)
```
<Tip warning={true}>
Packing may cause batch contamination, where adjacent sequences influence one another. This can be problematic for some applications. For more details, see [#1230](https://github.com/huggingface/trl/issues/1230).
</Tip>
> [!WARNING]
> Packing may cause batch contamination, where adjacent sequences influence one another. This can be problematic for some applications. For more details, see [#1230](https://github.com/huggingface/trl/issues/1230).
## Liger for reducing peak memory usage
> [Liger Kernel](https://github.com/linkedin/Liger-Kernel) is a collection of Triton kernels designed specifically for LLM training. It can effectively increase multi-GPU training throughput by 20% and reduces memory usage by 60%.
> [Liger Kernel](https://github.com/linkedin/Liger-Kernel) is a collection of Triton kernels designed specifically for LLM training. It can effectively increase multi-GPU training throughput by 20% and reduce memory usage by 60%.
For more information, see [Liger Kernel Integration](liger_kernel_integration)
<hfoptions id="liger">
<hfoption id="DPO">
For more information, see [Liger Kernel Integration](liger_kernel_integration).
To use Liger for reducing peak memory usage, use the following code snippet:
<hfoptions id="liger">
<hfoption id="SFT">
```python
from trl import SFTConfig
training_args = SFTConfig(..., use_liger_kernel=True)
```
</hfoption>
<hfoption id="DPO">
```python
from trl import DPOConfig
training_args = DPOConfig(..., use_liger_loss=True)
training_args = DPOConfig(..., use_liger_kernel=True)
```
</hfoption>
<hfoption id="GRPO">
To use Liger for reducing peak memory usage, use the following code snippet:
```python
from trl import GRPOConfig
training_args = GRPOConfig(..., use_liger_loss=True)
training_args = GRPOConfig(..., use_liger_kernel=True)
```
</hfoption>
<hfoption id="KTO">
To use Liger for reducing peak memory usage, use the following code snippet:
```python
from trl import KTOConfig
training_args = KTOConfig(..., use_liger_loss=True)
training_args = KTOConfig(..., use_liger_kernel=True)
```
</hfoption>
<hfoption id="GKD">
```python
from trl import GKDConfig
training_args = GKDConfig(..., use_liger_kernel=True)
```
</hfoption>
@ -154,15 +153,10 @@ training_args = KTOConfig(..., use_liger_loss=True)
Padding-free batching is an alternative approach for reducing memory usage. In this method, a batch is first sampled and then flattened into a single sequence, avoiding padding. Unlike packing, which can result in incomplete sequences by combining parts of different samples, padding-free batching ensures that all sequences remain complete and intact.
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/padding-free.png" alt="Padding-free batching" width="600"/>
</div>
![Padding-free](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/padding-free.png)
<Tip warning={true}>
It's highly recommended to use padding-free batching with **FlashAttention 2** or **FlashAttention 3**. Otherwise, you may encounter batch contamination issues.
</Tip>
> [!WARNING]
> It's highly recommended to use padding-free batching with **FlashAttention 2** or **FlashAttention 3**. Otherwise, you may encounter batch contamination issues.
<hfoptions id="padding-free">
<hfoption id="DPO">
@ -197,27 +191,40 @@ from trl import SFTConfig
training_args = SFTConfig(..., activation_offloading=True)
```
<Tip warning={true}>
Under the hood, activation offloading implements PyTorch's [`saved_tensors_hooks`](https://pytorch.org/tutorials/intermediate/autograd_saved_tensors_hooks_tutorial.html#hooks-for-autograd-saved-tensors) to intercept activations during the forward pass. It intelligently manages which tensors to offload based on size and context, avoiding offloading output tensors that would be inefficient. For performance optimization, it can, via a flag (which is true by default), use CUDA streams to overlap computation with CPU-GPU transfers.
When using activation offloading with models that use Liger kernels, you must disable Liger cross entropy due to compatibility issues. The issue occurs specifically with `use_liger_kernel=True` because Liger cross entropy performs in-place operations which conflict with activation offloading. The default setting (`use_liger_kernel=False`) works:
## Padding Sequences to a Multiple
> [!TIP]
> This technique is currently supported only for the **SFT** and **Reward** trainers.
When enabled, this option ensures that all sequences are **padded to a multiple** of the specified value.
This can improve computational efficiency on some hardware by aligning sequence lengths to memory-friendly boundaries.
<hfoptions id="pad_to_multiple_of">
<hfoption id="SFT">
```python
# When using activation offloading with a model that uses Liger kernels:
from trl import SFTConfig
training_args = SFTConfig(
activation_offloading=True,
use_liger_kernel=False, # Disable Liger cross entropy
# Other parameters...
)
training_args = SFTConfig(..., pad_to_multiple_of=2048)
```
</Tip>
Under the hood, activation offloading implements PyTorch's [`saved_tensors_hooks`](https://pytorch.org/tutorials/intermediate/autograd_saved_tensors_hooks_tutorial.html#hooks-for-autograd-saved-tensors) to intercept activations during the forward pass. It intelligently manages which tensors to offload based on size and context, avoiding offloading output tensors which would be inefficient. For performance optimization, it can optionally use CUDA streams to overlap computation with CPU-GPU transfers.
</hfoption>
<hfoption id="Reward">
```python
from trl import RewardConfig
training_args = RewardConfig(..., pad_to_multiple_of=2048)
```
</hfoption>
</hfoptions>
## Disabling model gathering for generation in online methods
When using DeepSpeed ZeRO-3, model weights are sharded across multiple GPUs. Online methods involve generating completions from the model as part of the training process. During this step, the model weights are temporarily gathered on a single GPU for generation. For very large models, this gathering can lead to out-of-memory (OOM) errors, as described in this issue: [#2250](https://github.com/huggingface/trl/issues/2250#issue-2598304204).
When using DeepSpeed ZeRO-3, model weights are sharded across multiple GPUs. Online methods involve generating completions from the model as part of the training process. During this step, the model weights are temporarily gathered on a single GPU for generation. For very large models, this gathering can lead to OOM errors, as described in this issue: [#2250](https://github.com/huggingface/trl/issues/2250#issue-2598304204).
If you encounter this issue, you can disable the gathering of model weights for generation by setting the following parameter:
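For example, with RLOO (a minimal sketch; the same parameter applies to the other online trainers):

```python
from trl import RLOOConfig

training_args = RLOOConfig(ds3_gather_for_generation=False)
```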
@ -262,110 +269,9 @@ training_args = RLOOConfig(..., ds3_gather_for_generation=False)
This adjustment prevents model weights from being gathered, avoiding OOM errors, but it may result in slower generation speeds.
## Context Parallelism
Context Parallelism (CP) is a parallelization technique that enables training with longer sequences by splitting the sequence dimension across multiple GPUs. Each GPU processes a portion of the sequence, allowing you to train with sequences longer than what would fit on a single GPU's memory.
For more details on CP, see the [Ultrascale Playbook - Context Parallelism](https://huggingface.co/spaces/nanotron/ultrascale-playbook?section=context_parallelism).
CP is particularly useful when:
- You want to train with very long sequences (>32k tokens)
- Single GPU memory is insufficient for your desired sequence length
- You need to maintain sequence coherence across the full context
### Requirements and Limitations
CP has specific requirements:
1. **Accelerate 1.10 or higher** is required
2. **FSDP2 (PyTorch FSDP v2)** is required as the distributed training backend
3. **SDPA attention** - Flash Attention is currently not supported with CP
4. **Sequence length divisibility** - sequences must be divisible by `cp_size * 2`. This is now automatically handled using the `pad_to_multiple_of` parameter in the data collator, which works seamlessly with both standard and padding-free modes.
### Configuration
To enable CP, you need to configure both Accelerate and your training arguments:
#### Accelerate Configuration
Use one of the provided accelerate config files (e.g. `fsdp_context_parallel_2gpu.yaml` for 2 GPUs):
```yaml
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: FSDP
downcast_bf16: 'no'
enable_cpu_affinity: false
fsdp_config:
fsdp_activation_checkpointing: false
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_cpu_ram_efficient_loading: true
fsdp_offload_params: false
fsdp_reshard_after_forward: true
fsdp_state_dict_type: FULL_STATE_DICT
fsdp_version: 2
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 2 # Number of GPUs
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
parallelism_config:
parallelism_config_dp_replicate_size: 1
parallelism_config_dp_shard_size: 1
parallelism_config_tp_size: 1
parallelism_config_cp_size: 2 # Context parallel size
```
#### Training Configuration
```python
from trl import SFTConfig
training_args = SFTConfig(
# required
pad_to_multiple_of=4, # ensures divisibility by cp_size * 2
# to get the most out of CP
max_length=16384, # long sequence length
packing=True, # use packing to reduce padding
use_liger_kernel=True, # compatible with CP
per_device_train_batch_size=1,
...
)
```
Then, launch your training script with the appropriate accelerate config file:
```bash
accelerate launch --config_file fsdp_context_parallel_2gpu.yaml train.py
```
### Best Practices
1. **Use the `pad_to_multiple_of` parameter** - This is now the recommended way to ensure sequence length divisibility:
- For `cp_size=2`: use `pad_to_multiple_of=4` (since `cp_size * 2 = 4`)
- For `cp_size=4`: use `pad_to_multiple_of=8` (since `cp_size * 2 = 8`)
- The data collator automatically pads sequences to the required multiple, ensuring compatibility with CP
2. **Use packing with padding** - The default BFD (Best Fit Decreasing) strategy works perfectly:
- Preserves sequence boundaries and maintains training quality
- Works seamlessly with both `padding_free=True` and standard padding modes
3. **Combine with other memory optimizations** like Liger kernels, bfloat16, and gradient checkpointing
4. **Start with smaller context parallel sizes** (2-4 GPUs) before scaling up
5. **Monitor memory usage** across all GPUs to ensure balanced workload
## vLLM sleep mode
When using vLLM as the generation backend, you can enable _sleep mode_ to offload vLLM parameters and cache to CPU RAM during the optimization step and reload them back to GPU VRAM when needed for weight synchronization and generation.
When using **vLLM** as the generation backend for online training methods, you can enable _sleep mode_ to offload vLLM parameters and cache to CPU RAM during the optimization step and reload them back to GPU VRAM when needed for weight synchronization and generation.
<hfoptions id="vllm_sleep">
<hfoption id="GRPO">
@ -373,7 +279,7 @@ When using vLLM as the generation backend, you can enable _sleep mode_ to offloa
```python
from trl import GRPOConfig
training_args = GRPOConfig(..., vllm_sleep_enabled=True)
training_args = GRPOConfig(..., vllm_enable_sleep_mode=True)
```
</hfoption>
@ -382,7 +288,7 @@ training_args = GRPOConfig(..., vllm_sleep_enabled=True)
```python
from trl import RLOOConfig
training_args = RLOOConfig(..., vllm_sleep_enabled=True)
training_args = RLOOConfig(..., vllm_enable_sleep_mode=True)
```
</hfoption>


@ -1,85 +1,226 @@
# Reward Modeling
[![](https://img.shields.io/badge/All_models-Reward_Trainer-blue)](https://huggingface.co/models?other=reward-trainer,trl)
[![model badge](https://img.shields.io/badge/All_models-Reward_Trainer-blue)](https://huggingface.co/models?other=reward-trainer,trl)
TRL supports custom reward modeling for anyone to perform reward modeling on their dataset and model.
## Overview
Check out a complete flexible example at [`examples/scripts/reward_modeling.py`](https://github.com/huggingface/trl/tree/main/examples/scripts/reward_modeling.py).
TRL supports the Outcome-supervised Reward Modeling (ORM) Trainer for training reward models.
## Expected dataset type
This post-training method was contributed by [Younes Belkada](https://huggingface.co/ybelkada).
The [`RewardTrainer`] requires an [*implicit prompt* preference dataset](dataset_formats#preference). This means that the dataset should only contain the columns `"chosen"` and `"rejected"` (and not `"prompt"`).
The [`RewardTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset format. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
## Quick start
You can also use a pretokenized dataset, in which case the dataset should contain the following columns: `input_ids_chosen`, `attention_mask_chosen`, `input_ids_rejected` and `attention_mask_rejected`.
## Using the `RewardTrainer`
After preparing your dataset, you can use the [`RewardTrainer`] in the same way as the `Trainer` class from 🤗 Transformers.
You should pass an `AutoModelForSequenceClassification` model to the [`RewardTrainer`], along with a [`RewardConfig`] which configures the hyperparameters of the training.
### Leveraging 🤗 PEFT to train a reward model
Just pass a `peft_config` in the keyword arguments of [`RewardTrainer`], and the trainer should automatically take care of converting the model into a PEFT model!
This example demonstrates how to train a reward model using the [`RewardTrainer`] from TRL. We train a [Qwen 3 0.6B](https://huggingface.co/Qwen/Qwen3-0.6B) model on the [UltraFeedback dataset](https://huggingface.co/datasets/trl-lib/ultrafeedback_binarized), a large-scale, fine-grained, and diverse preference dataset.
```python
from peft import LoraConfig, TaskType
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from trl import RewardTrainer, RewardConfig
model = AutoModelForSequenceClassification.from_pretrained("gpt2")
peft_config = LoraConfig(
task_type=TaskType.SEQ_CLS,
inference_mode=False,
r=8,
lora_alpha=32,
lora_dropout=0.1,
)
...
from trl import RewardTrainer
from datasets import load_dataset
trainer = RewardTrainer(
model=model,
args=training_args,
processing_class=tokenizer,
model="Qwen/Qwen3-0.6B",
train_dataset=load_dataset("trl-lib/ultrafeedback_binarized", split="train"),
)
trainer.train()
```
<iframe src="https://trl-lib-trackio.hf.space/?project=trl-documentation&metrics=train*&sidebar=hidden&runs=reward_qwen3-0.6B_ultrafeedback2" style="width: 100%; min-width: 300px; max-width: 800px;" height="830" frameBorder="0"></iframe>
## Expected dataset type and format
[`RewardTrainer`] supports the [preference](dataset_formats#preference) dataset type (with both implicit and explicit prompts). The [`RewardTrainer`] is compatible with both [standard](dataset_formats#standard) and [conversational](dataset_formats#conversational) dataset formats. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
```python
# Standard preference (implicit prompt)
{"chosen": "The sky is blue.",
"rejected": "The sky is green."}
# Conversational preference (implicit prompt)
{"chosen": [{"role": "user", "content": "What color is the sky?"},
{"role": "assistant", "content": "It is blue."}],
"rejected": [{"role": "user", "content": "What color is the sky?"},
{"role": "assistant", "content": "It is green."}]}
# Standard preference (explicit prompt)
{"prompt": "The sky is",
"chosen": " blue.",
"rejected": " green."}
# Conversational preference (explicit prompt)
{"prompt": [{"role": "user", "content": "What color is the sky?"}],
"chosen": [{"role": "assistant", "content": "It is blue."}],
"rejected": [{"role": "assistant", "content": "It is green."}]}
```
If your dataset is not in one of these formats, you can preprocess it to convert it into the expected format. Here is an example with the [lmarena-ai/arena-human-preference-55k](https://huggingface.co/datasets/lmarena-ai/arena-human-preference-55k) dataset:
```python
from datasets import load_dataset
import json
dataset = load_dataset("lmarena-ai/arena-human-preference-55k")
# Filter out ties
dataset = dataset.filter(lambda example: example["winner_tie"] == 0)
# Create 'chosen' and 'rejected' fields based on the winner column
def response_a_b_to_chosen_rejected(example):
if example["winner_model_a"] == 1:
example["chosen"] = example["response_a"]
example["rejected"] = example["response_b"]
else:
example["chosen"] = example["response_b"]
example["rejected"] = example["response_a"]
return example
dataset = dataset.map(response_a_b_to_chosen_rejected)
# Convert to conversational format
def make_conversation(example):
prompt = json.loads(example["prompt"])[0] # '["What color is the sky?"]' -> "What color is the sky?"
chosen = json.loads(example["chosen"])[0]
rejected = json.loads(example["rejected"])[0]
return {
"chosen": [{"role": "user", "content": prompt}, {"role": "assistant", "content": chosen}],
"rejected": [{"role": "user", "content": prompt}, {"role": "assistant", "content": rejected}],
}
dataset = dataset.map(make_conversation)
# Keep only necessary columns
dataset = dataset.select_columns(["chosen", "rejected"])
print(next(iter(dataset["train"])))
```
```json
{
"chosen": [
{"role": "user", "content": "Is it morally right to try to have a certain percentage of females on managerial positions?"},
{"role": "assistant", "content": "The question of whether it is morally right to aim for a certain percentage of females..."},
],
"rejected": [
{"role": "user", "content": "Is it morally right to try to have a certain percentage of females on managerial positions?"},
{"role": "assistant", "content": "As an AI, I don't have personal beliefs or opinions. However, ..."},
],
}
```
## Looking deeper into the training method
Reward Models (RMs) are typically trained using supervised learning on datasets containing pairs of preferred and non-preferred responses. The goal is to learn a function that assigns higher scores to preferred responses, enabling the model to rank outputs based on preferences.
This section breaks down how reward modeling works in practice, covering the key steps: **preprocessing** and **loss computation**.
### Preprocessing and tokenization
During training, each example is expected to contain a **chosen** and **rejected** field. For more details on the expected formats, see [Dataset formats - Preference](dataset_formats#preference).
The [`RewardTrainer`] tokenizes each input using the model's tokenizer. If prompts and completions (chosen and rejected) are provided separately (explicit prompt case), they are concatenated before tokenization.
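For intuition, a minimal illustration of the explicit-prompt case (illustrative only, using the toy example from the dataset-format section above):

```python
# Explicit prompt: the prompt is concatenated with each completion before tokenization
example = {"prompt": "The sky is", "chosen": " blue.", "rejected": " green."}
chosen_text = example["prompt"] + example["chosen"]      # "The sky is blue."
rejected_text = example["prompt"] + example["rejected"]  # "The sky is green."
```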
### Computing the loss
Let \\( x \\) be the input sequence (prompt) and \\( y^+ \\) and \\( y^- \\) be the chosen and rejected sequences respectively. Under the Bradley-Terry model ([Bradley & Terry, 1952](https://www.jstor.org/stable/2334029)), the probability that \\( y^+ \\) is preferred over \\( y^- \\) given a reward function \\( r \\) is \\( p(y^+ \succ y^- \mid x) = \sigma(r(x, y^+) - r(x, y^-)) \\), where \\( \sigma \\) is the sigmoid function.
The reward model \\( r_\theta(x, y) \\) is trained to assign higher scores to preferred responses \\( y^+ \\) over non-preferred ones \\( y^- \\). The loss is then defined as the negative log-likelihood of the observed preferences:
$$
\mathcal{L}(\theta) = - \mathbb{E}_{(x,y^+,y^-) \sim \mathcal{D}} \left[ \log \sigma(r_\theta(x, y^+) - r_\theta(x, y^-)) \right].
$$
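A minimal PyTorch sketch of this loss (illustrative only, not the trainer's internal implementation):

```python
import torch
import torch.nn.functional as F

def bradley_terry_loss(chosen_rewards: torch.Tensor, rejected_rewards: torch.Tensor) -> torch.Tensor:
    # -log sigmoid(r(x, y+) - r(x, y-)), averaged over the batch
    return -F.logsigmoid(chosen_rewards - rejected_rewards).mean()

loss = bradley_terry_loss(torch.tensor([1.2, 0.3]), torch.tensor([0.4, 0.9]))
```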
> [!TIP]
> The Bradley-Terry model is underdetermined, meaning that adding a constant to all rewards does not change the preference probabilities. To address this, [Helping or Herding? Reward Model Ensembles Mitigate but do not Eliminate Reward Hacking](https://huggingface.co/papers/2312.09244) proposes adding an auxiliary loss term that encourages the rewards to be centered around zero. This is controlled by the `center_rewards_coefficient` parameter in the [`RewardConfig`]. The recommended value is `1e-2`.
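For example, to enable it with the recommended value:

```python
from trl import RewardConfig

training_args = RewardConfig(center_rewards_coefficient=1e-2)
```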
## Logged metrics
While training and evaluating we record the following reward metrics:
* `global_step`: The total number of optimizer steps taken so far.
* `epoch`: The current epoch number, based on dataset iteration.
* `num_tokens`: The total number of tokens processed so far.
* `loss`: The average loss over the last logging interval.
* `accuracy`: The proportion of correct predictions (i.e., the model assigned a higher score to the chosen response than to the rejected one) averaged over the last logging interval.
* `min_reward`: The minimum reward score assigned by the model. This value is averaged over the logging interval.
* `mean_reward`: The average reward score assigned by the model over the last logging interval.
* `max_reward`: The maximum reward score assigned by the model. This value is averaged over the logging interval.
* `margin`: The average margin (difference between chosen and rejected rewards) over the last logging interval.
* `learning_rate`: The current learning rate, which may change dynamically if a scheduler is used.
* `grad_norm`: The L2 norm of the gradients, computed before gradient clipping.
## Customization
### Model initialization
You can directly pass the kwargs of the [`~transformers.AutoModelForSequenceClassification.from_pretrained()`] method to the [`RewardConfig`]. For example, if you want to load a model in a different precision, analogous to
```python
model = AutoModelForSequenceClassification.from_pretrained("Qwen/Qwen3-0.6B", dtype=torch.bfloat16)
```
you can do so by passing the `model_init_kwargs={"dtype": torch.bfloat16}` argument to the [`RewardConfig`].
```python
import torch

from trl import RewardConfig
training_args = RewardConfig(
model_init_kwargs={"dtype": torch.bfloat16},
)
```
Note that all keyword arguments of [`~transformers.AutoModelForSequenceClassification.from_pretrained()`] are supported, except for `num_labels`, which is automatically set to 1.
### Train adapters with PEFT
We support tight integration with the 🤗 PEFT library, allowing any user to conveniently train adapters and share them on the Hub rather than training the entire model.
```python
from datasets import load_dataset
from trl import RewardTrainer
from peft import LoraConfig
dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
trainer = RewardTrainer(
"Qwen/Qwen3-4B",
train_dataset=dataset,
peft_config=peft_config,
peft_config=LoraConfig(modules_to_save=["score"]) # important to include the score head when base model is not a sequence classification model
)
trainer.train()
```
### Adding a margin to the loss
As in the [Llama 2 paper](https://huggingface.co/papers/2307.09288), you can add a margin to the loss by adding a `margin` column to the dataset. The reward collator will automatically pass it through and the loss will be computed accordingly.
You can also continue training your [`~peft.PeftModel`]. To do so, first load a `PeftModel` outside [`RewardTrainer`] and pass it directly to the trainer, without passing the `peft_config` argument.
```python
def add_margin(row):
# Assume you have a score_chosen and score_rejected columns that you want to use to compute the margin
return {'margin': row['score_chosen'] - row['score_rejected']}
from datasets import load_dataset
from trl import RewardTrainer
from peft import AutoPeftModelForCausalLM
dataset = dataset.map(add_margin)
```
model = AutoPeftModelForCausalLM.from_pretrained("trl-lib/Qwen3-4B-Reward-LoRA", is_trainable=True)
dataset = load_dataset("trl-lib/Capybara", split="train")
### Centering rewards
In many scenarios, it's preferable to ensure that a reward model's output is mean zero. This is often done by first calculating the model's average score and then subtracting it.
[[Eisenstein et al., 2023]](https://huggingface.co/papers/2312.09244) proposed an auxiliary loss function designed to directly learn a centered reward model. This auxiliary loss minimizes the squared sum of the rewards, encouraging the model to naturally produce mean-zero outputs:
$$\Big( R(p, r_1) + R(p, r_2) \Big)^2 $$
This auxiliary loss is combined with the main loss function, weighted by the parameter `center_rewards_coefficient` in the [`RewardConfig`]. By default, this feature is deactivated (`center_rewards_coefficient = None`).
```python
training_args = RewardConfig(
center_rewards_coefficient=0.01,
...
trainer = RewardTrainer(
model=model,
train_dataset=dataset,
)
trainer.train()
```
For reference results, please refer to PR [#1932](https://github.com/huggingface/trl/pull/1932).
> [!TIP]
> When training adapters, you typically use a higher learning rate (≈1e-3) since only new parameters are being learned.
>
> ```python
> RewardConfig(learning_rate=1e-3, ...)
> ```
## Tool Calling with Reward Modeling
The [`RewardTrainer`] fully supports fine-tuning models with _tool calling_ capabilities. In this case, each dataset example should include:
* The conversation messages, including any tool calls (`tool_calls`) and tool responses (`tool` role messages)
* The list of available tools in the `tools` column, typically provided as JSON schemas
For details on the expected dataset structure, see the [Dataset Format — Tool Calling](dataset_formats#tool-calling) section.
## RewardTrainer
@ -91,3 +232,7 @@ For reference results, please refer PR [#1932](https://github.com/huggingface/tr
## RewardConfig
[[autodoc]] RewardConfig
## DataCollatorForPreference
[[autodoc]] trainer.reward_trainer.DataCollatorForPreference


@ -2,14 +2,14 @@
This module contains some useful reward functions, primarily intended for use with the [`GRPOTrainer`] and [`RLOOTrainer`].
## Format rewards
## accuracy_reward
### think_format_reward
[[autodoc]] rewards.accuracy_reward
## think_format_reward
[[autodoc]] rewards.think_format_reward
## Other rewards
### get_soft_overlong_punishment
## get_soft_overlong_punishment
[[autodoc]] rewards.get_soft_overlong_punishment


@ -1,6 +1,6 @@
# RLOO Trainer
[![](https://img.shields.io/badge/All_models-RLOO-blue)](https://huggingface.co/models?other=rloo,trl)
[![model badge](https://img.shields.io/badge/All_models-RLOO-blue)](https://huggingface.co/models?other=rloo,trl)
## Overview
@ -70,7 +70,7 @@ At each training step, we sample a batch of prompts and generate a set of \\( G
In RLOO, the reward consists of two components: the reward provided by the reward model (or reward function) and a KL penalty that discourages the policy from deviating too far from a fixed reference policy
1. For each of the \\( G \\) generated sequences \\( o_i = (o_{i,1}, \dots, o_{i,T}) \\) conditioned on a query \\( q \\), we compute a scalar reward using a reward model \\( R(o_i, q) \\).
2. Concurenlty, we estimate the KL divergence between the current policy \\( \pi_\theta \\) and the fixed reference policy \\( \pi_{\text{ref}} \\) over the sequence. The KL estimate for sequence \\( o_i \\) is:
2. Concurrently, we estimate the KL divergence between the current policy \\( \pi_\theta \\) and the fixed reference policy \\( \pi_{\text{ref}} \\) over the sequence. The KL estimate for sequence \\( o_i \\) is:
$$
\mathbb{D}_{\mathrm{KL}}\!\left[\pi_\theta\|\pi_{\mathrm{ref}}\right] = \sum_{t=1}^T \log \frac{\pi_\theta(o_{i,t} \mid q, o_{i,<t})}{\pi_{\mathrm{ref}}(o_{i,t} \mid q, o_{i,<t})}.
@ -84,34 +84,30 @@ $$
where \\( \beta > 0 \\) controls the strength of the KL penalty.
<Tip>
In a purely online setting (`num_iterations = 1`, default), the data are generated by the current policy. In this case, the KL penalty is computed directly using the current policy.
In the more general setting (e.g., multiple gradient steps per batch), the data are instead generated by an earlier snapshot \\( \pi_{\text{old}} \\). To keep the penalty consistent with the sampling distribution, the KL is defined with respect to this policy:
$$
\mathbb{D}_{\mathrm{KL}}\!\left[\pi_{\text{old}} \,\|\, \pi_{\text{ref}}\right].
$$
Equivalently, for a sampled sequence $o$, the Monte Carlo estimate is
$$
\mathbb{D}_{\mathrm{KL}}\!\left[\pi_{\text{old}} \|\pi_{\mathrm{ref}}\right] = \sum_{t=1}^T \log \frac{\pi_{\text{old}}(o_{i,t} \mid q, o_{i,<t})}{\pi_{\mathrm{ref}}(o_{i,t} \mid q, o_{i,<t})}.
$$
</Tip>
> [!TIP]
> In a purely online setting (`num_iterations = 1`, default), the data are generated by the current policy. In this case, the KL penalty is computed directly using the current policy.
>
> In the more general setting (e.g., multiple gradient steps per batch), the data are instead generated by an earlier snapshot \\( \pi_{\text{old}} \\). To keep the penalty consistent with the sampling distribution, the KL is defined with respect to this policy:
>
> $$
> \mathbb{D}_{\mathrm{KL}}\!\left[\pi_{\text{old}} \,\|\, \pi_{\text{ref}}\right].
> $$
>
> Equivalently, for a sampled sequence $o$, the Monte Carlo estimate is
>
> $$
> \mathbb{D}_{\mathrm{KL}}\!\left[\pi_{\text{old}} \|\pi_{\mathrm{ref}}\right] = \sum_{t=1}^T \log \frac{\pi_{\text{old}}(o_{i,t} \mid q, o_{i,<t})}{\pi_{\mathrm{ref}}(o_{i,t} \mid q, o_{i,<t})}.
> $$
### Computing the advantage
Once the rewards for each completion have been computed, we calculate a baseline as the average reward of all other samples in the same batch, excluding the current sample. This baseline is used to reduce the variance of the policy gradient estimate. The advantage for each completion is then obtained as the difference between its own reward and this leave-one-out baseline.
Formally, for a batch of \\( G \\) completions, the baseline for completion \\( i \\) is:
$$
b_i = \frac{1}{G-1} \sum_{j \neq i} r_j
$$
and then the advantage for each completion is computed as the difference between its reward and the baseline:
$$
A_i = r_i - b_i
$$
@ -145,7 +141,7 @@ While training and evaluating, we record the following reward metrics:
- `completions/mean_terminated_length`: The average length of generated completions that terminate with EOS.
- `completions/min_terminated_length`: The minimum length of generated completions that terminate with EOS.
- `completions/max_terminated_length`: The maximum length of generated completions that terminate with EOS.
- `completions/clipped_ratio` : The ratio of truncated (clipped) completions.
- `completions/clipped_ratio`: The ratio of truncated (clipped) completions.
- `reward/{reward_func_name}/mean`: The average reward from a specific reward function.
- `reward/{reward_func_name}/std`: The standard deviation of the reward from a specific reward function.
- `reward`: The overall average reward after applying reward weights.
@ -154,9 +150,9 @@ While training and evaluating, we record the following reward metrics:
- `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, tokens of masked sequences are excluded.)
- `kl`: The average KL divergence between the model and the reference model, calculated over generated completions. Logged only if `beta` is nonzero.
- `clip_ratio/region_mean`: The ratio of sequence probabilities where the RLOO objective is clipped to stay within the trust region:
$$
\text{clip}\left( r_{i}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \qquad r_{i}(\theta) = \frac{\pi_\theta(o_{i} \mid q)}{\pi_{\theta_{\text{old}}}(o_{i} \mid q)}\,.
$$
A higher value means more samples are clipped, which constrains how much the policy $\pi_\theta$ can change.
- `clip_ratio/low_mean`: The average ratio of sequence probabilities that were clipped on the lower bound of the trust region: \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\)
@ -169,6 +165,7 @@ $$
### Speed up training with vLLM-powered generation
Generation is often the main bottleneck when training with online methods. To accelerate generation, you can use [vLLM](https://github.com/vllm-project/vllm), a high-throughput, low-latency inference engine for LLMs. To enable it, first install the package with
```shell
pip install trl[vllm]
```
@ -180,11 +177,13 @@ We support two ways of using vLLM during training: **server mode** and **colocat
In this mode, vLLM runs in a separate process (and using separate GPUs) and communicates with the trainer via HTTP. This is ideal if you have dedicated GPUs for inference.
1. **Start the vLLM server**:
```bash
trl vllm-serve --model <model_name>
```
2. **Enable server mode in your training script**:
```python
from trl import RLOOConfig
@ -195,11 +194,8 @@ In this mode, vLLM runs in a separate process (and using separate GPUs) and comm
)
```
<Tip warning={true}>
Make sure that the server is using different GPUs than the trainer, otherwise you may run into NCCL errors. You can specify the GPUs to use with the `CUDA_VISIBLE_DEVICES` environment variable.
</Tip>
> [!WARNING]
> Make sure that the server is using different GPUs than the trainer, otherwise you may run into NCCL errors. You can specify the GPUs to use with the `CUDA_VISIBLE_DEVICES` environment variable.
#### 🧩 Option 2: Colocate mode
@ -215,28 +211,19 @@ training_args = RLOOConfig(
)
```
<Tip>
> [!TIP]
> Depending on the model size and the overall GPU memory requirements for training, you may need to adjust the `vllm_gpu_memory_utilization` parameter in [`RLOOConfig`] to avoid underutilization or out-of-memory errors.
>
> We provide a [HF Space](https://huggingface.co/spaces/trl-lib/recommend-vllm-memory) to help estimate the recommended GPU memory utilization based on your model configuration and experiment settings. Simply use it as follows to get `vllm_gpu_memory_utilization` recommendation:
>
> <iframe src="https://trl-lib-recommend-vllm-memory.hf.space" frameborder="0" width="850" height="450"></iframe>
>
> If the recommended value does not work in your environment, we suggest adding a small buffer (e.g., +0.05 or +0.1) to the recommended value to ensure stability.
>
> If you still find you are getting out-of-memory errors set `vllm_enable_sleep_mode` to True and the vllm parameters and cache will be offloaded during the optimization step. For more information, see [Reducing Memory Usage with vLLM Sleep Mode](reducing_memory_usage#vllm-sleep-mode).
Depending on the model size and the overall GPU memory requirements for training, you may need to adjust the `vllm_gpu_memory_utilization` parameter in [`RLOOConfig`] to avoid underutilization or out-of-memory errors.
We provide a [HF Space](https://huggingface.co/spaces/trl-lib/recommend-vllm-memory) to help estimate the recommended GPU memory utilization based on your model configuration and experiment settings. Simply use it as follows to get `vllm_gpu_memory_utilization` recommendation:
<iframe
src="https://trl-lib-recommend-vllm-memory.hf.space"
frameborder="0"
width="850"
height="450"
></iframe>
If the recommended value does not work in your environment, we suggest adding a small buffer (e.g., +0.05 or +0.1) to the recommended value to ensure stability.
</Tip>
<Tip>
By default, RLOO uses `MASTER_ADDR=localhost` and `MASTER_PORT=12345` for vLLM, but you can override these values by setting the environment variables accordingly.
</Tip>
> [!TIP]
> By default, RLOO uses `MASTER_ADDR=localhost` and `MASTER_PORT=12345` for vLLM, but you can override these values by setting the environment variables accordingly.
For more information, see [Speeding up training with vLLM](speeding_up_training#vllm-for-fast-generation-in-online-methods).
@ -244,7 +231,7 @@ For more information, see [Speeding up training with vLLM](speeding_up_training#
When training large models like **Qwen2.5-72B**, you need several key optimizations to make the training efficient and scalable across multiple GPUs and nodes. These include:
- **DeepSpeed ZeRO Stage 3**: ZeRO leverages data parallelism to distribute model states (weights, gradients, optimizer states) across multiple GPUs and CPUs, reducing memory and compute requirements on each device. Since large models cannot fit on a single GPU, using ZeRO Stage 3 is required for training such model. For more details, see [DeepSpeed Integration](deepspeed_integration).
- **DeepSpeed ZeRO Stage 3**: ZeRO leverages data parallelism to distribute model states (weights, gradients, optimizer states) across multiple GPUs and CPUs, reducing memory and compute requirements on each device. Since large models cannot fit on a single GPU, using ZeRO Stage 3 is required for training such models. For more details, see [DeepSpeed Integration](deepspeed_integration).
- **Accelerate**: Accelerate is a library that simplifies distributed training across multiple GPUs and nodes. It provides a simple API to launch distributed training and handles the complexities of distributed training, such as data parallelism, gradient accumulation, and distributed data loading. For more details, see [Distributing Training](distributing_training).
- **vLLM**: See the previous section on how to use vLLM to speed up generation.
@ -323,7 +310,7 @@ The [`RLOOTrainer`] supports using custom reward functions instead of dense rewa
- `completions` (contains the generated completions),
- `completions_ids` (contains the tokenized completions),
- `trainer_state` ([`~transformers.TrainerState`]): The current state of the trainer. This can be used to implement dynamic reward functions, such as curriculum learning, where the reward is adjusted based on the training progress.
- All columns names (but `prompt`) that the dataset may have. For example, if the dataset contains a column named `ground_truth`, the function will be called with `ground_truth` as a keyword argument.
- All column names (but `prompt`) that the dataset may have. For example, if the dataset contains a column named `ground_truth`, the function will be called with `ground_truth` as a keyword argument.
The easiest way to comply with this requirement is to use `**kwargs` in the function signature.
- Depending on the dataset format, the input will vary:
@ -352,7 +339,7 @@ You can test it as follows:
[2.0, 4.0]
```
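For reference, a reward function consistent with this output can be sketched as follows; it scores each completion by its token count and uses `**kwargs` to absorb the remaining columns, as recommended above:

```python
def reward_func(completions_ids, **kwargs):
    """Reward longer completions: one point per generated token."""
    return [float(len(ids)) for ids in completions_ids]
```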
#### Example 1.1: Reward longer completions (based in the number of characters)
#### Example 1.1: Reward longer completions (based on the number of characters)
Same as the previous example, but this time the reward function is based on the number of characters instead of tokens.
@ -372,10 +359,10 @@ You can test it as follows:
[6.0, 12.0]
```
#### Example 2: Reward completions with specific format
#### Example 2: Reward completions with a specific format
Below is an example of a reward function that checks if the completion has a specific format. This example is inspired by the _format reward_ function used in the paper [DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning](https://huggingface.co/papers/2501.12948).
It is designed for conversational format, where prompts and completions consist of structured messages.
It is designed for a conversational format, where prompts and completions consist of structured messages.
```python
import re
@ -428,6 +415,7 @@ You can test this function as follows:
>>> reward_func(prompts=prompts, completions=completions, ground_truth=ground_truth)
[1.0, 0.0]
```
#### Example 4: Multi-task reward functions
Below is an example of using multiple reward functions in the [`RLOOTrainer`]. In this example, we define two task-specific reward functions: `math_reward_func` and `coding_reward_func`. The `math_reward_func` rewards math problems based on their correctness, while the `coding_reward_func` rewards coding problems based on whether the solution works.
@ -484,12 +472,10 @@ trainer = RLOOTrainer(
trainer.train()
```
In this example, the `math_reward_func` and `coding_reward_func` are designed to work with a mixed dataset that contains both math and coding problems. The `task` column in the dataset is used to determine which reward function to apply to each problem. If there is no relevant reward function for a sample in the dataset, the reward function will return `None` and the [`RLOOTrainer`] will continue with the valid functions and tasks. This allows the [`RLOOTrainer`] to handle multiple reward functions with different applicability.
In this example, the `math_reward_func` and `coding_reward_func` are designed to work with a mixed dataset that contains both math and coding problems. The `task` column in the dataset is used to determine which reward function to apply to each problem. If there is no relevant reward function for a sample in the dataset, the reward function will return `None`, and the [`RLOOTrainer`] will continue with the valid functions and tasks. This allows the [`RLOOTrainer`] to handle multiple reward functions with different applicability.
Note that the [`RLOOTrainer`] will ignore the `None` rewards returned by the reward functions and only consider the rewards returned by the relevant functions. This ensures that the model is trained on the relevant tasks and ignores the tasks for which there is no relevant reward function.
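A sketch of what such a task-dependent reward function can look like (`is_math_correct` is a hypothetical checker; the full example is elided from this diff):

```python
def math_reward_func(prompts, completions, task, **kwargs):
    rewards = []
    for completion, t in zip(completions, task):
        if t == "math":
            # hypothetical correctness check for math problems
            rewards.append(1.0 if is_math_correct(completion) else 0.0)
        else:
            # not a math sample: return None so the trainer ignores this reward
            rewards.append(None)
    return rewards
```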
#### Passing the reward function to the trainer
To use your custom reward function, pass it to the [`RLOOTrainer`] as follows:
@ -518,6 +504,64 @@ and the reward will be computed as the sum of the rewards from each function, or
Note that [`RLOOTrainer`] supports multiple reward functions of different types. See the parameters documentation for more details.
## Vision-Language Model (VLM) Training
RLOO supports training Vision-Language Models (VLMs) on multimodal datasets containing both text and images.
### Supported Models
Tested with:
- **Gemma3** — e.g., `google/gemma-3-4b-it`
- **LLaVA-NeXT** — e.g., `llava-hf/llava-v1.6-mistral-7b-hf`
- **Qwen2-VL** — e.g., `Qwen/Qwen2-VL-2B-Instruct`
- **Qwen2.5-VL** — e.g., `Qwen/Qwen2.5-VL-3B-Instruct`
- **SmolVLM2** — e.g., `HuggingFaceTB/SmolVLM2-2.2B-Instruct`
> [!TIP]
> Compatibility with all VLMs is not guaranteed. If you believe a model should be supported, feel free to open an issue on GitHub — or better yet, submit a pull request with the required changes.
### Quick Start
Use [rloo\_vlm.py](https://github.com/huggingface/trl/blob/main/examples/scripts/rloo_vlm.py) to fine-tune a VLM. Example command for training on [`lmms-lab/multimodal-open-r1-8k-verified`](https://huggingface.co/datasets/lmms-lab/multimodal-open-r1-8k-verified):
```bash
accelerate launch \
--config_file=examples/accelerate_configs/deepspeed_zero3.yaml \
examples/scripts/rloo_vlm.py \
--model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct \
--output_dir rloo-Qwen2.5-VL-3B-Instruct \
--learning_rate 1e-5 \
--gradient_checkpointing \
--dtype bfloat16 \
--max_prompt_length 2048 \
--max_completion_length 1024 \
--use_vllm \
--vllm_mode colocate \
--use_peft \
--lora_target_modules "q_proj" "v_proj" \
--log_completions
```
### Configuration Tips
> [!WARNING]
> VLM training may fail if image tokens are truncated. We highly recommend disabling truncation by setting `max_prompt_length` to `None`.
- Use LoRA on vision-language projection layers (see the sketch after this list)
- Enable 4-bit quantization to reduce memory usage
- VLMs are memory-intensive — start with smaller batch sizes
- Most models are compatible with vLLM (`server` and `colocate` modes)
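A minimal sketch combining these tips (the target module names are illustrative and depend on the model architecture):

```python
from peft import LoraConfig
from transformers import BitsAndBytesConfig

# LoRA on attention and vision-language projection layers (names vary per model)
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "v_proj", "multi_modal_projector"],
)

# 4-bit quantization to reduce memory usage
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
```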
### Dataset Format
Each training sample should include:
- `prompt`: Text formatted via the processor's chat template
- `image`/`images`: PIL Image or list of PIL Images
The trainer automatically handles image-to-tensor conversion via the model's image processor.
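For illustration, a single sample could look like the following sketch (the values, including the image-placeholder token, are model-dependent):

```python
from PIL import Image

sample = {
    "prompt": "<image> Describe the image.",  # text formatted via the processor's chat template
    "image": Image.open("example.jpg"),       # or "images": a list of PIL Images
}
```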
## RLOOTrainer
[[autodoc]] RLOOTrainer
@ -540,7 +584,7 @@ Note that [`RLOOTrainer`] supports multiple reward functions of different types.
## Migration Guide from the old implementation (0.21 and below)
With the release of version 0.22.0, we have revamped the [`RLOOTrainer`] to be more alinged with other online trainers in the library like [`GRPOTrainer`]. This new implementation introduces several changes to the configuration parameters and overall structure of the trainer.
With the release of version 0.22.0, we have revamped the [`RLOOTrainer`] to be more aligned with other online trainers in the library, like [`GRPOTrainer`]. This new implementation introduces several changes to the configuration parameters and overall structure of the trainer.
Below is a summary of the key changes for [`RLOOConfig`]:
| TRL ≤ 0.21.x | TRL ≥ 0.22.0 |

View File

@ -4,15 +4,11 @@ The notebooks and scripts in these examples show how to fine-tune a model with a
Here's an overview of the notebooks and scripts in the [trl repository](https://github.com/huggingface/trl/tree/main/examples):
| File | Description |
|------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------|
| [`examples/scripts/ppo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/sentiment/notebooks/gpt2-sentiment.ipynb) | This script shows how to use the `PPOTrainer` to fine-tune a sentiment analysis model using IMDB dataset |
| [`examples/notebooks/gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb) | This notebook demonstrates how to reproduce the GPT2 imdb sentiment tuning example on a jupyter notebook. |
| [`examples/notebooks/gpt2-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-control.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/sentiment/notebooks/gpt2-sentiment-control.ipynb) | This notebook demonstrates how to reproduce the GPT2 sentiment control example on a jupyter notebook.
| File | Description |
| --- |--- |
| [`examples/scripts/ppo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/sentiment/notebooks/gpt2-sentiment.ipynb) | This script shows how to use the `PPOTrainer` to fine-tune a sentiment analysis model using the IMDB dataset |
| [`examples/notebooks/gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb) | This notebook demonstrates how to reproduce the GPT2 IMDB sentiment tuning example in a Jupyter notebook. |
| [`examples/notebooks/gpt2-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-control.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/sentiment/notebooks/gpt2-sentiment-control.ipynb) | This notebook demonstrates how to reproduce the GPT2 sentiment control example in a Jupyter notebook. |
## Usage
@ -30,7 +26,6 @@ python examples/scripts/ppo.py --log_with wandb --mini_batch_size 1 --gradient_a
Note: if you don't want to log with `wandb` remove `log_with="wandb"` in the scripts/notebooks. You can also replace it with your favourite experiment tracker that's [supported by `accelerate`](https://huggingface.co/docs/accelerate/usage_guides/tracking).
## Few notes on multi-GPU
To run in a multi-GPU setup with DDP (Distributed Data Parallel), change the `device_map` value to `device_map={"": Accelerator().process_index}` and make sure to run your script with `accelerate launch yourscript.py`. If you want to apply naive pipeline parallelism, you can use `device_map="auto"`.
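For example, a minimal sketch (the model name is a placeholder):

```python
from accelerate import Accelerator
from transformers import AutoModelForCausalLM

# one full model replica per process; launch with `accelerate launch yourscript.py`
model = AutoModelForCausalLM.from_pretrained(
    "your-model-name",
    device_map={"": Accelerator().process_index},
)
```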

View File

@ -23,7 +23,7 @@ trainer = SFTTrainer(
trainer.train()
```
<iframe src="https://trl-lib-trackio.hf.space/?project=trl-documentation&metrics=train/loss,train/mean_token_accuracy,train/num_tokens&sidebar=hidden" style="width: 100%; min-width: 300px; max-width: 800px;" height="830" frameBorder="0"></iframe>
<iframe src="https://trl-lib-trackio.hf.space/?project=trl-documentation&metrics=train*&runs=sft_qwen3-0.6B_capybara" style="width: 100%; min-width: 300px; max-width: 800px;" height="830" frameBorder="0"></iframe>
## Expected dataset type and format
@ -105,11 +105,8 @@ $$
where \\( y_t \\) is the target token at timestep \\( t \\), and the model is trained to predict the next token given the previous ones. In practice, padding tokens are masked out during loss computation.
<Tip>
[On the Generalization of SFT: A Reinforcement Learning Perspective with Reward Rectification](https://huggingface.co/papers/2508.05629) proposes an alternative loss function, called **Dynamic Fine-Tuning (DFT)**, which aims to improve generalization by rectifying the reward signal. This method can be enabled by setting `loss_type="dft"` in the [`SFTConfig`]. For more details, see [Paper Index - Dynamic Fine-Tuning](paper_index#on-the-generalization-of-sft-a-reinforcement-learning-perspective-with-reward-rectification).
</Tip>
> [!TIP]
> The paper [On the Generalization of SFT: A Reinforcement Learning Perspective with Reward Rectification](https://huggingface.co/papers/2508.05629) proposes an alternative loss function, called **Dynamic Fine-Tuning (DFT)**, which aims to improve generalization by rectifying the reward signal. This method can be enabled by setting `loss_type="dft"` in the [`SFTConfig`]. For more details, see [Paper Index - Dynamic Fine-Tuning](paper_index#on-the-generalization-of-sft-a-reinforcement-learning-perspective-with-reward-rectification).
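>
> For example:
>
> ```python
> SFTConfig(loss_type="dft", ...)
> ```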
### Label shifting and masking
@ -180,9 +177,8 @@ To train on completion only, use a [prompt-completion](dataset_formats#prompt-co
![train_on_completion](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/train_on_completion.png)
<Tip>
Training on completion only is compatible with training on assistant messages only. In this case, use a [conversational](dataset_formats#conversational) [prompt-completion](dataset_formats#prompt-completion) dataset and set `assistant_only_loss=True` in the [`SFTConfig`].
</Tip>
> [!TIP]
> Training on completion only is compatible with training on assistant messages only. In this case, use a [conversational](dataset_formats#conversational) [prompt-completion](dataset_formats#prompt-completion) dataset and set `assistant_only_loss=True` in the [`SFTConfig`].
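>
> For example:
>
> ```python
> SFTConfig(assistant_only_loss=True, ...)
> ```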
### Train adapters with PEFT
@ -204,7 +200,7 @@ trainer = SFTTrainer(
trainer.train()
```
You can also continue training your [`peft.PeftModel`]. For that, first load a `PeftModel` outside [`SFTTrainer`] and pass it directly to the trainer without the `peft_config` argument being passed.
You can also continue training your [`~peft.PeftModel`]. For that, first load a `PeftModel` outside [`SFTTrainer`] and pass it directly to the trainer without the `peft_config` argument being passed.
```python
from datasets import load_dataset
@ -222,15 +218,12 @@ trainer = SFTTrainer(
trainer.train()
```
<Tip>
When training adapters, you typically use a higher learning rate (≈1e-4) since only new parameters are being learned.
```python
SFTConfig(learning_rate=1e-4, ...)
```
</Tip>
> [!TIP]
> When training adapters, you typically use a higher learning rate (≈1e-4) since only new parameters are being learned.
>
> ```python
> SFTConfig(learning_rate=1e-4, ...)
> ```
### Train with Liger Kernel
@ -313,17 +306,14 @@ trainer = SFTTrainer(
trainer.train()
```
<Tip>
For VLMs, truncating may remove image tokens, leading to errors during training. To avoid this, set `max_length=None` in the [`SFTConfig`]. This allows the model to process the full sequence length without truncating image tokens.
```python
SFTConfig(max_length=None, ...)
```
Only use `max_length` when you've verified that truncation won't remove image tokens for the entire dataset.
</Tip>
> [!TIP]
> For VLMs, truncating may remove image tokens, leading to errors during training. To avoid this, set `max_length=None` in the [`SFTConfig`]. This allows the model to process the full sequence length without truncating image tokens.
>
> ```python
> SFTConfig(max_length=None, ...)
> ```
>
> Only use `max_length` when you've verified that truncation won't remove image tokens for the entire dataset.
## SFTTrainer

View File

@ -1,10 +1,7 @@
# Speeding Up Training
<Tip warning={true}>
Section under construction. Feel free to contribute!
</Tip>
> [!WARNING]
> Section under construction. Feel free to contribute!
## vLLM for fast generation in online methods
@ -14,13 +11,7 @@ To speed up generation, you can use [vLLM](https://github.com/vllm-project/vllm)
To use [vLLM](https://github.com/vllm-project/vllm), first install it using:
```bash
pip install vllm
```
or
```bash
pip install "trl[vllm]"
pip install trl[vllm]
```
<hfoptions id="vllm examples">
@ -53,21 +44,20 @@ training_args = GRPOConfig(..., use_vllm=True)
You can customize the server configuration by passing additional arguments. For more information, see [vLLM integration](vllm_integration).
<Tip warning={true}>
When using vLLM, ensure that the GPUs assigned for training and generation are separate to avoid resource conflicts. For instance, if you plan to use 4 GPUs for training and another 4 for vLLM generation, you can specify GPU allocation using `CUDA_VISIBLE_DEVICES`.
Set GPUs **0-3** for vLLM generation:
```sh
CUDA_VISIBLE_DEVICES=0,1,2,3 trl vllm-serve --model <model_name>
```
And GPUs **4-7** for training:
```sh
CUDA_VISIBLE_DEVICES=4,5,6,7 accelerate launch train.py
```
</Tip>
> [!WARNING]
> When using vLLM, ensure that the GPUs assigned for training and generation are separate to avoid resource conflicts. For instance, if you plan to use 4 GPUs for training and another 4 for vLLM generation, you can specify GPU allocation using `CUDA_VISIBLE_DEVICES`.
>
> Set GPUs **0-3** for vLLM generation:
>
> ```sh
> CUDA_VISIBLE_DEVICES=0,1,2,3 trl vllm-serve --model <model_name>
> ```
>
> And GPUs **4-7** for training:
>
> ```sh
> CUDA_VISIBLE_DEVICES=4,5,6,7 accelerate launch train.py
> ```
</hfoption>
<hfoption id="RLOO">
@ -88,21 +78,20 @@ training_args = RLOOConfig(..., use_vllm=True)
You can customize the server configuration by passing additional arguments. For more information, see [vLLM integration](vllm_integration).
<Tip warning={true}>
When using vLLM, ensure that the GPUs assigned for training and generation are separate to avoid resource conflicts. For instance, if you plan to use 4 GPUs for training and another 4 for vLLM generation, you can specify GPU allocation using `CUDA_VISIBLE_DEVICES`.
Set GPUs **0-3** for vLLM generation:
```sh
CUDA_VISIBLE_DEVICES=0,1,2,3 trl vllm-serve --model <model_name>
```
And GPUs **4-7** for training:
```sh
CUDA_VISIBLE_DEVICES=4,5,6,7 accelerate launch train.py
```
</Tip>
> [!WARNING]
> When using vLLM, ensure that the GPUs assigned for training and generation are separate to avoid resource conflicts. For instance, if you plan to use 4 GPUs for training and another 4 for vLLM generation, you can specify GPU allocation using `CUDA_VISIBLE_DEVICES`.
>
> Set GPUs **0-3** for vLLM generation:
>
> ```sh
> CUDA_VISIBLE_DEVICES=0,1,2,3 trl vllm-serve --model <model_name>
> ```
>
> And GPUs **4-7** for training:
>
> ```sh
> CUDA_VISIBLE_DEVICES=4,5,6,7 accelerate launch train.py
> ```
</hfoption>
</hfoptions>

View File

@ -64,4 +64,4 @@ trainer.train()
will give you a hosted dashboard at https://huggingface.co/spaces/trl-lib/trackio.
<iframe src="https://trl-lib-trackio.hf.space/?project=trl-documentation&sidebar=hidden" style="width: 100%; min-width: 300px; max-width: 800px;" height="830" frameBorder="0"></iframe>
<iframe src="https://trl-lib-trackio.hf.space/?project=trl-documentation&sidebar=hidden&runs=sft_qwen3-0.6B_capybara" style="width: 100%; min-width: 300px; max-width: 800px;" height="830" frameBorder="0"></iframe>

View File

@ -1,159 +0,0 @@
# Using LLaMA models with TRL
We've begun rolling out examples to use Meta's LLaMA models in `trl` (see [Meta's LLaMA release](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) for the original LLaMA model).
## Efficient training strategies
Even training the smallest LLaMA model requires an enormous amount of memory. Some quick math: in bf16, every parameter uses 2 bytes (in fp32 4 bytes) in addition to 8 bytes used, e.g., in the Adam optimizer (see the [performance docs](https://huggingface.co/docs/transformers/perf_train_gpu_one#optimizer) in Transformers for more info). So a 7B parameter model would use `(2+8)*7B=70GB` just to fit in memory and would likely need more when you compute intermediate values such as attention scores. So you couldn't train the model even on a single 80GB A100 like that. You can use some tricks, like more efficient optimizers or half-precision training, to squeeze a bit more into memory, but you'll run out sooner or later.
Another option is to use Parameter-Efficient Fine-Tuning (PEFT) techniques, such as the [`peft`](https://github.com/huggingface/peft) library, which can perform low-rank adaptation (LoRA) on a model loaded in 8-bit.
For more on `peft` + `trl`, see the [Peft integration](peft_integration) docs.
Loading the model in 8-bit reduces the memory footprint drastically since you only need one byte per parameter for the weights (e.g., a 7B LLaMA model takes 7GB of memory).
Instead of training the original weights directly, LoRA adds small adapter layers on top of some specific layers (usually the attention layers); thus, the number of trainable parameters is drastically reduced.
In this scenario, a rule of thumb is to allocate ~1.2-1.4GB per billion parameters (depending on the batch size and sequence length) to fit the entire fine-tuning setup.
This enables fine-tuning larger models (up to 50-60B scale models on an NVIDIA A100 80GB) at low cost.
Now we can fit very large models into a single GPU, but the training might still be very slow.
The simplest strategy in this scenario is data parallelism: we replicate the same training setup into separate GPUs and pass different batches to each GPU.
With this, you can parallelize the forward/backward passes of the model and scale with the number of GPUs.
![chapter10_ddp.png](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/chapter10_ddp.png)
We use either the `transformers.Trainer` or `accelerate`, which both support data parallelism without any code changes, by simply passing arguments when calling the scripts with `torchrun` or `accelerate launch`. The following runs a training script with 8 GPUs on a single machine with `accelerate` and `torchrun`, respectively.
```bash
accelerate launch --multi_gpu --num_machines 1 --num_processes 8 my_accelerate_script.py
torchrun --nnodes 1 --nproc_per_node 8 my_torch_script.py
```
## Supervised fine-tuning
Before we start training reward models and tuning our model with RL, it helps if the model is already good in the domain we are interested in.
In our case, we want it to answer questions, while for other use cases, we might want it to follow instructions, in which case instruction tuning is a great idea.
The easiest way to achieve this is by continuing to train the language model with the language modeling objective on texts from the domain or task.
The [StackExchange dataset](https://huggingface.co/datasets/HuggingFaceH4/stack-exchange-preferences) is enormous (over 10 million instructions), so we can easily train the language model on a subset of it.
There is nothing special about fine-tuning the model before doing RLHF: it's just the causal language modeling objective from pretraining that we apply here.
To use the data efficiently, we use a technique called packing: instead of having one text per sample in the batch and then padding to either the longest text or the maximal context of the model, we concatenate a lot of texts with an EOS token in between and cut chunks of the context size to fill the batch without any padding.
![chapter10_preprocessing-clm.png](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/chapter10_preprocessing-clm.png)
With this approach, training is much more efficient: each token passed through the model contributes to the loss, in contrast to padding tokens, which are usually masked out.
If you don't have much data and are more concerned about occasionally cutting off tokens that overflow the context, you can also use a classical data loader.
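As an illustration, here is a minimal packing sketch, assuming a `tokenizer` and a list of `texts` (the helper name is ours):

```python
def pack_texts(texts, tokenizer, context_size):
    # concatenate all texts, separated by an EOS token
    token_ids = []
    for text in texts:
        token_ids += tokenizer(text)["input_ids"] + [tokenizer.eos_token_id]
    # cut fixed-size chunks so every batch is completely filled, with no padding
    return [
        token_ids[i : i + context_size]
        for i in range(0, len(token_ids) - context_size + 1, context_size)
    ]
```

Continuing the setup, the model is then loaded in 8-bit and wrapped with LoRA adapters: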
```python
from accelerate import Accelerator
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM

# load the model in 8-bit precision
model = AutoModelForCausalLM.from_pretrained(
    args.model_path,
    load_in_8bit=True,
    device_map={"": Accelerator().local_process_index},
)
model = prepare_model_for_kbit_training(model)

# add LoRA adapters to the model
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)
```
We train the model for a few thousand steps with the causal language modeling objective and save the model.
Since we will tune the model again with different objectives, we merge the adapter weights with the original model weights.
**Disclaimer:** due to LLaMA's license, we release only the adapter weights for this and the model checkpoints in the following sections.
You can apply for access to the base model's weights by filling out Meta AI's [form](https://docs.google.com/forms/d/e/1FAIpQLSfqNECQnMkycAp2jP4Z9TFX0cGR4uf7b_fBxjY_OjhJILlKGA/viewform) and then converting them to the 🤗 Transformers format by running this [script](https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py).
Note that you'll also need to install 🤗 Transformers from source until `v4.28` is released.
Now that we have fine-tuned the model for the task, we are ready to train a reward model.
## Reward modeling and human preferences
In principle, we could fine-tune the model using RLHF directly with the human annotations.
However, this would require us to send some samples to humans for rating after each optimization iteration.
This is expensive and slow due to the number of training samples needed for convergence and the inherent latency of human reading and annotator speed.
A trick that works well instead of direct feedback is training a reward model on human annotations collected before the RL loop.
The goal of the reward model is to imitate how a human would rate a text. There are several possible strategies to build a reward model: the most straightforward way would be to predict the annotation (e.g. a rating score or a binary value for “good”/”bad”).
In practice, what works better is to predict the ranking of two examples, where the reward model is presented with two candidates `(y_k, y_j)` for a given prompt `x` and has to predict which one would be rated higher by a human annotator.
With the StackExchange dataset, we can infer which of the two answers was preferred by the users based on the score.
With that information and the loss defined above, we can then modify the `transformers.Trainer` by adding a custom loss function.
```python
import torch.nn as nn
from transformers import Trainer


class RewardTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        rewards_j = model(input_ids=inputs["input_ids_j"], attention_mask=inputs["attention_mask_j"])[0]
        rewards_k = model(input_ids=inputs["input_ids_k"], attention_mask=inputs["attention_mask_k"])[0]
        # pairwise ranking loss: the preferred answer j should be scored higher than k
        loss = -nn.functional.logsigmoid(rewards_j - rewards_k).mean()
        if return_outputs:
            return loss, {"rewards_j": rewards_j, "rewards_k": rewards_k}
        return loss
```
We use a subset of 100,000 pairs of candidates and evaluate on a held-out set of 50,000. With a modest training batch size of 4, we train the LLaMA model using the LoRA `peft` adapter for a single epoch using the Adam optimizer with BF16 precision. Our LoRA configuration is:
```python
from peft import LoraConfig, TaskType

peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)
```
As detailed in the next section, the resulting adapter can be merged into the frozen model and saved for further downstream use.
## Reinforcement Learning from Human Feedback
With the fine-tuned language model and the reward model at hand, we are now ready to run the RL loop. It follows roughly three steps:
1. Generate responses from prompts,
2. Rate the responses with the reward model,
3. Run a reinforcement learning policy-optimization step with the ratings.
The Query and Response prompts are templated as follows before being tokenized and passed to the model:
```bash
Question: <Query>
Answer: <Response>
```
The same template was used for SFT, RM and RLHF stages.
Once more, we utilize `peft` for memory-efficient training, which offers an extra advantage in the RLHF context.
Here, the reference model and policy share the same base, the SFT model, which we load in 8-bit and freeze during training.
We exclusively optimize the policy's LoRA weights using PPO while sharing the base model's weights.
```python
import torch
from tqdm import tqdm

# `ppo_trainer`, `tokenizer`, `sentiment_pipe`, and the generation/sentiment kwargs are set up earlier in the script
for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    question_tensors = batch["input_ids"]

    # sample from the policy to generate responses
    response_tensors = ppo_trainer.generate(
        question_tensors,
        return_prompt=False,
        length_sampler=output_length_sampler,
        **generation_kwargs,
    )
    batch["response"] = tokenizer.batch_decode(response_tensors, skip_special_tokens=True)

    # compute the sentiment score for each (query, response) pair
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    rewards = [torch.tensor(output[0]["score"] - script_args.reward_baseline) for output in pipe_outputs]

    # run a PPO optimization step
    stats = ppo_trainer.step(question_tensors, response_tensors, rewards)

    # log stats to wandb
    ppo_trainer.log_stats(stats, batch, rewards)
```
For the rest of the details and evaluation, please refer to our [blog post on StackLLaMA](https://huggingface.co/blog/stackllama).

View File

@ -1,14 +1,26 @@
# vLLM Integration
This document will guide you through the process of using vLLM with TRL for faster generation in online methods like GRPO and Online DPO. We first summarize a tl;dr on how to use vLLM with TRL, and then we will go into the details of how it works under the hood. Let's go! 🔥
This document will guide you through the process of using vLLM with TRL for faster generation in online methods like GRPO and Online DPO. We first summarize a tl;dr on how to use vLLM with TRL, and then we will go into the details of how it works under the hood.
> [!WARNING]
> TRL currently only supports vLLM version `0.10.2`. Please ensure you have this version installed to avoid compatibility issues.
> [!TIP]
> The following trainers currently support generation with vLLM:
>
> - [`GRPOTrainer`]
> - [`OnlineDPOTrainer`]
> - [`NashMDTrainer`]
> - [`XPOTrainer`]
> - [`RLOOTrainer`]
## 🚀 How can I use vLLM with TRL to speed up training?
💡 **Note**: Resources required for this specific example: a single node with 8 GPUs.
<Tip warning={true}>
vLLM server and TRL trainer must use different CUDA devices to avoid conflicts.
</Tip>
> [!WARNING]
> When using vLLM with TRL, the **vLLM server** and the **trainer** must run on **separate CUDA devices** to prevent conflicts.
> For guidance on configuring this properly, see [Modes of using vLLM during training](#modes-of-using-vllm-during-training).
First, install vLLM using the following command:
@ -22,12 +34,15 @@ Then run the server on specific GPUs (e.g., GPUs 0-3):
CUDA_VISIBLE_DEVICES=0,1,2,3 trl vllm-serve --model Qwen/Qwen2.5-7B --tensor-parallel-size 2 --data-parallel-size 2
```
Once the server is running, you can use it to generate completions for training. In the example below, we are using the `GRPOTrainer` to train a model using the vLLM server for generation. The `--tensor-parallel-size` and `--data-parallel-size` arguments control how the model and data are sharded across GPUs.
Once the server is running, you can use it to generate completions for training. In the example below, we show the different supported trainers using the vLLM server for generation. The `--tensor-parallel-size` and `--data-parallel-size` arguments control how the model and data are sharded across GPUs.
In this example, we are sharding two copies of the model across 4 GPUs. Increasing data parallelism increases throughput, while increasing tensor parallelism allows for serving larger models. Then, run the training script on different GPUs (e.g., GPUs 4-7) by passing `use_vllm=True` in the training arguments as follows:
Sample of a simple `train.py` script:
<hfoptions id="vllm examples">
<hfoption id="GRPO">
```python
from datasets import load_dataset
from trl import GRPOTrainer, GRPOConfig
@ -55,21 +70,148 @@ trainer = GRPOTrainer(
trainer.train()
```
</hfoption>
<hfoption id="OnlineDPO">
```python
from datasets import load_dataset
from trl import OnlineDPOTrainer, OnlineDPOConfig

dataset = load_dataset("trl-lib/tldr", split="train")

# Dummy reward function: count the number of unique characters in the completions
def reward_num_unique_chars(completions, **kwargs):
    return [len(set(c)) for c in completions]

training_args = OnlineDPOConfig(
    output_dir="my_test",
    use_vllm=True,
    bf16=True,
    gradient_checkpointing=True,
)

trainer = OnlineDPOTrainer(
    model="Qwen/Qwen2.5-7B",
    args=training_args,
    reward_funcs=reward_num_unique_chars,
    train_dataset=dataset,
)

trainer.train()
```
</hfoption>
<hfoption id="NashMD">
```python
from datasets import load_dataset
from trl import NashMDTrainer, NashMDConfig

dataset = load_dataset("trl-lib/tldr", split="train")

# Dummy reward function: count the number of unique characters in the completions
def reward_num_unique_chars(completions, **kwargs):
    return [len(set(c)) for c in completions]

training_args = NashMDConfig(
    output_dir="my_test",
    use_vllm=True,
    bf16=True,
    gradient_checkpointing=True,
)

trainer = NashMDTrainer(
    model="Qwen/Qwen2.5-7B",
    args=training_args,
    reward_funcs=reward_num_unique_chars,
    train_dataset=dataset,
)

trainer.train()
```
</hfoption>
<hfoption id="XPO">
```python
from datasets import load_dataset
from trl import XPOTrainer, XPOConfig

dataset = load_dataset("trl-lib/tldr", split="train")

# Dummy reward function: count the number of unique characters in the completions
def reward_num_unique_chars(completions, **kwargs):
    return [len(set(c)) for c in completions]

training_args = XPOConfig(
    output_dir="my_test",
    use_vllm=True,
    bf16=True,
    gradient_checkpointing=True,
)

trainer = XPOTrainer(
    model="Qwen/Qwen2.5-7B",
    args=training_args,
    reward_funcs=reward_num_unique_chars,
    train_dataset=dataset,
)

trainer.train()
```
</hfoption>
<hfoption id="RLOO">
```python
from datasets import load_dataset
from trl import RLOOTrainer, RLOOConfig

dataset = load_dataset("trl-lib/tldr", split="train")

# Dummy reward function: count the number of unique characters in the completions
def reward_num_unique_chars(completions, **kwargs):
    return [len(set(c)) for c in completions]

training_args = RLOOConfig(
    output_dir="my_test",
    use_vllm=True,
    bf16=True,
    gradient_checkpointing=True,
)

trainer = RLOOTrainer(
    model="Qwen/Qwen2.5-7B",
    args=training_args,
    reward_funcs=reward_num_unique_chars,
    train_dataset=dataset,
)

trainer.train()
```
</hfoption>
</hfoptions>
And run the training command on GPUs separate from the server:
```sh
CUDA_VISIBLE_DEVICES=4,5,6,7 accelerate launch train.py
```
## 🎬 Flashback: Why do we need to use vLLM in online methods?
## Why use vLLM?
### 🎬 Flashback: Why do we need to use vLLM in online methods?
Online methods like GRPO or Online DPO require the model to generate completions during training, which are then used to compute reward signals. However, generation can be extremely time-consuming, especially with large or reasoning models. In the default setup (without vLLM), completions are generated using the [(unwrapped) model's `generate` method](https://github.com/huggingface/trl/blob/f3e8c2304428ef16e9ae5de9e5741ed84d533b7b/trl/trainer/grpo_trainer.py#L965C39-L965C66). This approach quickly becomes a major bottleneck — generation is slow and inefficient, particularly for large batches or models. As a result, training times increase significantly, and overall efficiency drops. To address this, we turn to vLLM, which enables much faster and more scalable generation, helping eliminate this bottleneck in online methods.
## 🤔 How does vLLM solve the slow generation issue?
### 🤔 How does vLLM solve the slow generation issue?
If you've ever done autoregressive decoder training, you know all the input tokens to the LLM produce their attention key and value tensors, and these tensors are kept in GPU memory to later generate subsequent tokens based on them. These cached key and value tensors are often referred to as the KV cache. However, storing the KV cache occupies a lot of memory, so vLLM uses a technique called **PagedAttention** to solve this problem. PagedAttention, which is inspired by the OS's virtual memory concept, stores continuous keys and values in **non-contiguous memory space**, which is much more efficient. The details of this are beyond the scope of this document, but in short, it allows the model to store the keys and values in a more efficient way, reducing the memory footprint and speeding up the generation process. If you are interested, make sure to check out the [vLLM PagedAttention](https://blog.vllm.ai/2023/06/20/vllm.html) for more details.
## 🤔 What exactly happens when you run `trl vllm-serve --model <model_name>`?
## How vLLM Works (Under the Hood) 🔍
### 🤔 What exactly happens when you run `trl vllm-serve --model <model_name>`?
When you run, for example,
@ -90,18 +232,18 @@ Each worker operates independently and processes a chunk of the incoming request
This GPU-to-GPU communication is managed efficiently by NVIDIA's NCCL library. The communication mainly ensures that each GPU gets its correct portion of the incoming requests — it's lightweight and doesn't interfere with generation itself.
Separately, the number of completions to generate per prompt is controlled by the `num_generations` setting in the GRPO config. For instance, if you set `num_generations=2` (like in the picture above), each prompt will have 2 completions. So, with 8 prompts and `num_generations=2`, you would end up with 16 completions total — regardless of the number of GPUs or parallelism settings.
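In configuration terms, a sketch showing only the relevant argument:

```python
from trl import GRPOConfig

# 8 prompts with num_generations=2 gives 16 completions per step, regardless of GPU count
training_args = GRPOConfig(..., num_generations=2)
```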
## 🥸 More detail on what happens under the hood when running the server
### 🥸 More detail on what happens under the hood when running the server
* The vLLM server starts by running the command: `trl vllm-serve --model Qwen/Qwen2.5-7B`.
* Once the server is running, it generates completions based on requests from the client (trainer) using `vllm_client.generate` [here](https://github.com/huggingface/trl/blob/cc044e35b285be7dc062764b3364e1e684db4c7c/trl/trainer/grpo_trainer.py#L1025-L1035).
* The client (trainer) then requests these completions from the server.
* These completions are used to compute the reward signal.
* Based on the reward signal and the model's output, the loss is computed, and the backward pass is performed to update the model's weights.
* **Note**: The server only handles completion generation — it doesn't train the model. Therefore, the model's weights aren't updated on the server. Once the backward pass is complete, the client sends the updated weights to the server using `vllm_client.update_named_param(name, param.data)`.
- The vLLM server starts by running the command: `trl vllm-serve --model Qwen/Qwen2.5-7B`.
- Once the server is running, it generates completions based on requests from the client (trainer) using `vllm_client.generate` (see [these lines](https://github.com/huggingface/trl/blob/cc044e35b285be7dc062764b3364e1e684db4c7c/trl/trainer/grpo_trainer.py#L1025-L1035)).
- The client (trainer) then requests these completions from the server.
- These completions are used to compute the reward signal.
- Based on the reward signal and the model's output, the loss is computed, and the backward pass is performed to update the model's weights.
- **Note**: The server only handles completion generation — it doesn't train the model. Therefore, the model's weights aren't updated on the server. Once the backward pass is complete, the client sends the updated weights to the server using `vllm_client.update_named_param(name, param.data)`.
When using vLLM, ensure the GPUs assigned for training and generation are separate to avoid NCCL communication conflicts. If you do not set the `CUDA_VISIBLE_DEVICES` environment variable, the training script will use all available GPUs by default, which may lead to device conflicts. Starting from the first TRL release after v0.19.1, the code automatically detects and prevents same-device usage, raising an error in the vLLM server process:
```
```log
RuntimeError: Attempting to use the same CUDA device for multiple distinct roles/ranks within the same communicator.
Ensure that trainer is using different devices than vLLM server.
```
@ -112,19 +254,21 @@ For example, if you want to use GPUs 4-7 for training while the server runs on
CUDA_VISIBLE_DEVICES=4,5,6,7 accelerate launch train.py
```
## 🍷 More customization options with vLLM?
## Advanced usage
### 🍷 More customization options with vLLM?
You can customize the server configuration by passing additional arguments.
```
```txt
$ trl vllm-serve --help
usage: trl vllm-serve [-h] --model MODEL [--revision REVISION] [--tensor_parallel_size TENSOR_PARALLEL_SIZE]
[--data_parallel_size DATA_PARALLEL_SIZE] [--host HOST] [--port PORT]
[--gpu_memory_utilization GPU_MEMORY_UTILIZATION] [--dtype DTYPE] [--max_model_len MAX_MODEL_LEN]
[--enable_prefix_caching ENABLE_PREFIX_CACHING] [--enforce_eager ENFORCE_EAGER] [--log_level LOG_LEVEL]
usage: trl vllm-serve [-h] --model MODEL [--revision REVISION] [--tensor_parallel_size TENSOR_PARALLEL_SIZE] [--data_parallel_size DATA_PARALLEL_SIZE] [--host HOST]
[--port PORT] [--gpu_memory_utilization GPU_MEMORY_UTILIZATION] [--dtype DTYPE] [--max_model_len MAX_MODEL_LEN]
[--enable_prefix_caching ENABLE_PREFIX_CACHING] [--enforce_eager [ENFORCE_EAGER]] [--kv_cache_dtype KV_CACHE_DTYPE]
[--trust_remote_code [TRUST_REMOTE_CODE]] [--log_level LOG_LEVEL] [--vllm_model_impl VLLM_MODEL_IMPL]
options:
-h, --help Show this help message and exit
-h, --help show this help message and exit
--model MODEL Model name or path to load the model from. (default: None)
--revision REVISION Revision to use for the model. If not specified, the default branch will be used. (default: None)
--tensor_parallel_size TENSOR_PARALLEL_SIZE, --tensor-parallel-size TENSOR_PARALLEL_SIZE
@ -134,63 +278,222 @@ options:
--host HOST Host address to run the server on. (default: 0.0.0.0)
--port PORT Port to run the server on. (default: 8000)
--gpu_memory_utilization GPU_MEMORY_UTILIZATION, --gpu-memory-utilization GPU_MEMORY_UTILIZATION
Ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV cache on the device
dedicated to generation powered by vLLM. Higher values will increase the KV cache size and thus improve the
model's throughput. However, if the value is too high, it may cause out-of-memory (OOM) errors during
initialization. (default: 0.9)
--dtype DTYPE Data type to use for vLLM generation. If set to 'auto', the data type will be automatically determined based on
the model configuration. Find the supported values in the vLLM documentation. (default: auto)
Ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV cache on the device dedicated to generation
powered by vLLM. Higher values will increase the KV cache size and thus improve the model's throughput. However, if the value is too high,
it may cause out-of-memory (OOM) errors during initialization. (default: 0.9)
--dtype DTYPE Data type to use for vLLM generation. If set to 'auto', the data type will be automatically determined based on the model configuration.
Find the supported values in the vLLM documentation. (default: auto)
--max_model_len MAX_MODEL_LEN, --max-model-len MAX_MODEL_LEN
If set, the `max_model_len` to use for vLLM. This can be useful when running with reduced
`vllm_gpu_memory_utilization`, leading to a reduced KV cache size. If not set, vLLM will use the model context
size, which might be much larger than the KV cache, leading to inefficiencies. (default: None)
If set, the `max_model_len` to use for vLLM. This can be useful when running with reduced `vllm_gpu_memory_utilization`, leading to a
reduced KV cache size. If not set, vLLM will use the model context size, which might be much larger than the KV cache, leading to
inefficiencies. (default: None)
--enable_prefix_caching ENABLE_PREFIX_CACHING, --enable-prefix-caching ENABLE_PREFIX_CACHING
Whether to enable prefix caching in vLLM. If set to `True`, ensure that the model and the hardware support this
feature. (default: None)
--enforce_eager ENFORCE_EAGER, --enforce-eager ENFORCE_EAGER
Whether to enforce eager execution. If set to `True`, we will disable CUDA graph and always execute the model
in eager mode. If `False` (default behavior), we will use CUDA graph and eager execution in hybrid. (default:
None)
Whether to enable prefix caching in vLLM. If set to `True`, ensure that the model and the hardware support this feature. (default: None)
--enforce_eager [ENFORCE_EAGER], --enforce-eager [ENFORCE_EAGER]
Whether to enforce eager execution. If set to `True`, we will disable CUDA graph and always execute the model in eager mode. If `False`
(default behavior), we will use CUDA graph and eager execution in hybrid. (default: False)
--kv_cache_dtype KV_CACHE_DTYPE, --kv-cache-dtype KV_CACHE_DTYPE
Data type to use for KV cache. If set to 'auto', the dtype will default to the model data type. (default: auto)
--trust_remote_code [TRUST_REMOTE_CODE], --trust-remote-code [TRUST_REMOTE_CODE]
Whether to trust remote code when loading models. Set to True to allow executing code from model repositories. This is required for some
custom models but introduces security risks. (default: False)
--log_level LOG_LEVEL, --log-level LOG_LEVEL
Log level for uvicorn. Possible choices: 'critical', 'error', 'warning', 'info', 'debug', 'trace'. (default:
info)
Log level for uvicorn. Possible choices: 'critical', 'error', 'warning', 'info', 'debug', 'trace'. (default: info)
--vllm_model_impl VLLM_MODEL_IMPL, --vllm-model-impl VLLM_MODEL_IMPL
Model implementation to use for vLLM. Must be one of `transformers` or `vllm`. `transformers`: Use the `transformers` backend for model
implementation. `vllm`: Use the `vllm` library for model implementation. (default: vllm)
```
## 🥳 Okay, now that we have the server running, how can we use it to generate completions?
### 💆🏻‍♀️ What's the best distributed setup?
Run the training script and pass `use_vllm=True` in the training arguments:
![tp dp throughput 8 gpus](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/tp_dp_throughput_8_gpus.png)
![tp dp throughput 4 gpus](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/tp_dp_throughput_4_gpus.png)
First and foremost, always remember that the optimal setup depends on:
- The model size
- The number of GPUs you have
- The GPU memory size
- The batch size you are using
- The number of requests you are sending to the server (prompts)
- The `max_model_len` you are using (this is the max length of the input sequence that the model can process, a.k.a. the context window size)
- The number of completions you are generating for each request (`num_generations`)
Given these factors, our experiments on the Qwen model family (3B, 7B, 14B, 32B) using 8 H100 GPUs show that:
- For reasonable-sized models (3B-14B) and a moderate context window (`max_len < 8k`), using full capacity for data parallelism gives better throughput. The setup `(tp=1, dp=8)` yields the best results.
- For larger models (32B) and longer context windows (`max_len > 8k`), a smaller DP size combined with some model-side parallelism performs better. For example, `(tp=2, dp=4)` is a good setup for 32B models with a larger context window (example launches for both setups are shown below).
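For example, these findings translate into the following server launches (the model names are illustrative):

```sh
# reasonable-sized model, moderate context: full data parallelism (tp=1, dp=8)
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 trl vllm-serve --model Qwen/Qwen2.5-7B --tensor-parallel-size 1 --data-parallel-size 8

# larger model, longer context: mixed parallelism (tp=2, dp=4)
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 trl vllm-serve --model Qwen/Qwen2.5-32B --tensor-parallel-size 2 --data-parallel-size 4
```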
### vLLM with Transformers Backend
vLLM can use the **Transformers backend** for model implementations, which works for both LLMs and VLMs.
To enable this, set `vllm_model_impl="transformers"` in your configuration or pass it via the command-line argument.
For more details, check out [vLLM Transformers Backend](https://blog.vllm.ai/2025/04/11/transformers-backend.html).
Example:
```sh
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen2.5-VL-3B-Instruct --tensor-parallel-size 1 --port 8000 --enforce_eager --vllm_model_impl transformers
```
### Modes of Using vLLM During Training
TRL supports **two modes** for integrating vLLM during training: **server mode** and **colocate mode**.
#### Server Mode
In **server mode**, vLLM runs as a separate process on dedicated GPUs and communicates with the trainer via HTTP.
This setup is ideal if you have GPUs dedicated to inference.
Example configuration:
<hfoptions id="vllm examples">
<hfoption id="GRPO">
```python
from trl import GRPOConfig
training_args = GRPOConfig(..., use_vllm=True)
training_args = GRPOConfig(
    ...,
    use_vllm=True,
    vllm_mode="server",  # default value, can be omitted
)
```
## 💆🏻‍♀️ What's the best distributed setup?
</hfoption>
<hfoption id="OnlineDPO">
![](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/tp_dp_throughput_8_gpus.png)
![](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/tp_dp_throughput_4_gpus.png)
First and foremost, always remember that the optimal setup depends on:
* The model size
* The number of GPUs you have
* The GPU memory size
* The batch size you are using
* The number of requests you are sending to the server (prompts)
* The `max_model_len` you are using (this is the max length of the input sequence that the model can process, a.k.a. the context window size)
* The number of completions you are generating for each request (`num_generations`)
Given these factors, our experiments on the Qwen model family (3B, 7B, 14B, 32B) using 8 H100 GPUs show that:
* For reasonable-sized models (3B-14B) and a moderate context window (`max_len < 8k`), using full capacity for data parallelism gives better throughput. The setup `(tp=1, dp=8)` yields the best results.
* For larger models (32B) and longer context windows (`max_len > 8k`), a smaller DP size combined with some model-side parallelism performs better. For example, `(tp=2, dp=4)` is a good setup for 32B models with a larger context window.
## vLLM with Transformers Backend
vLLM now supports the transformers backend for model implementations. Simply pass `transformers` as `vllm_model_impl` in the configuration, or through the argument parser, to use the transformers backend. This works for both LLMs and VLMs. See the example below; you can get more information [here](https://blog.vllm.ai/2025/04/11/transformers-backend.html).
```python
from trl import OnlineDPOConfig
training_args = OnlineDPOConfig(
    ...,
    use_vllm=True,
    vllm_mode="server",  # default value, can be omitted
)
```
```sh
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen2.5-VL-3B-Instruct --tensor-parallel-size 1 --port 8000 --enforce_eager --vllm_model_impl transformers
```
</hfoption>
<hfoption id="NashMD">
```python
from trl import NashMDConfig
training_args = NashMDConfig(
    ...,
    use_vllm=True,
    vllm_mode="server",  # default value, can be omitted
)
```
</hfoption>
<hfoption id="XPO">
```python
from trl import XPOConfig
training_args = XPOConfig(
    ...,
    use_vllm=True,
    vllm_mode="server",  # default value, can be omitted
)
```
</hfoption>
<hfoption id="RLOO">
```python
from trl import RLOOConfig
training_args = RLOOConfig(
    ...,
    use_vllm=True,
    vllm_mode="server",  # default value, can be omitted
)
```
</hfoption>
</hfoptions>
#### Colocate Mode
In **colocate mode**, vLLM runs inside the trainer process and shares GPU memory with the training model.
This avoids launching a separate server and can improve GPU utilization, but may lead to memory contention on the training GPUs.
Example configuration:
<hfoptions id="vllm examples">
<hfoption id="GRPO">
```python
from trl import GRPOConfig
training_args = GRPOConfig(
    ...,
    use_vllm=True,
    vllm_mode="colocate",
)
```
</hfoption>
<hfoption id="OnlineDPO">
```python
from trl import OnlineDPOConfig
training_args = OnlineDPOConfig(
    ...,
    use_vllm=True,
    vllm_mode="colocate",
)
```
</hfoption>
<hfoption id="NashMD">
```python
from trl import NashMDConfig
training_args = NashMDConfig(
    ...,
    use_vllm=True,
    vllm_mode="colocate",
)
```
</hfoption>
<hfoption id="XPO">
```python
from trl import XPOConfig
training_args = XPOConfig(
    ...,
    use_vllm=True,
    vllm_mode="colocate",
)
```
</hfoption>
<hfoption id="RLOO">
```python
from trl import RLOOConfig
training_args = RLOOConfig(
    ...,
    use_vllm=True,
    vllm_mode="colocate",
)
```
</hfoption>
</hfoptions>
> [!WARNING]
> Check the documentation of the trainer you are using for specific details on vLLM usage and parameters.
> [!WARNING]
> To reduce GPU memory usage when running vLLM, consider [enabling vLLM sleep mode](reducing_memory_usage#vllm-sleep-mode).

View File

@ -1,6 +1,6 @@
# XPO Trainer
[![](https://img.shields.io/badge/All_models-XPO-blue)](https://huggingface.co/models?other=xpo,trl)
[![model badge](https://img.shields.io/badge/All_models-XPO-blue)](https://huggingface.co/models?other=xpo,trl)
## Overview
@ -57,7 +57,7 @@ To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-XPO) pe
What is the best programming language?
<strong><span style="color: blue;">&lt;trl-lib/Qwen2-0.5B-XPO&gt;:</span></strong>
The best programming language depends on individual preferences and familiarity with coding concepts. Some popular languages include Python, Java, C++, and JavaScript.
</code></pre>
## Expected dataset type
@ -84,11 +84,8 @@ Instead of a judge, you can choose to use a reward model -- see [Reward Bench](ht
)
```
<Tip warning={true}>
Make sure that the SFT model and reward model use the _same_ chat template and the same tokenizer. Otherwise, you may find the model completions are scored incorrectly during training.
</Tip>
> [!WARNING]
> Make sure that the SFT model and reward model use the _same_ chat template and the same tokenizer. Otherwise, you may find the model completions are scored incorrectly during training.
### Encourage EOS token generation
@ -151,7 +148,6 @@ While training and evaluating we record the following reward metrics:
* `alpha`: The weight of the XPO loss term. Typically fixed, but can be made dynamic by passing a list to [`XPOConfig`].
* `beta`: The parameter that controls the weight of the loss term representing the deviation from the reference model. Typically fixed, but can be made dynamic by passing a list to [`XPOConfig`].
## XPOTrainer
[[autodoc]] XPOTrainer

View File

@ -1,3 +1,3 @@
# Examples
Please check out https://huggingface.co/docs/trl/example_overview for documentation on our examples.

View File

@ -5,7 +5,7 @@ distributed_type: FSDP
downcast_bf16: 'no'
enable_cpu_affinity: false
fsdp_config:
fsdp_activation_checkpointing: false
fsdp_activation_checkpointing: true # Enable activation checkpointing for memory efficiency
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
fsdp_cpu_ram_efficient_loading: true
fsdp_offload_params: false
@ -16,7 +16,7 @@ machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 2
num_processes: 2 # Number of GPUs
rdzv_backend: static
same_network: true
tpu_env: []
@ -27,4 +27,4 @@ parallelism_config:
parallelism_config_dp_replicate_size: 1
parallelism_config_dp_shard_size: 1
parallelism_config_tp_size: 1
parallelism_config_cp_size: 2
parallelism_config_cp_size: 2 # Context parallel size

View File

@ -31,7 +31,7 @@ class ScriptArguments:
Whether to push the dataset to the Hugging Face Hub.
repo_id (`str`, *optional*, defaults to `"trl-lib/hh-rlhf-helpful-base"`):
Hugging Face repository ID to push the dataset to.
dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
dataset_num_proc (`int`, *optional*):
Number of workers to use for dataset processing.
"""

View File

@ -31,7 +31,7 @@ class ScriptArguments:
Whether to push the dataset to the Hugging Face Hub.
repo_id (`str`, *optional*, defaults to `"trl-lib/llava-instruct-mix"`):
Hugging Face repository ID to push the dataset to.
dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
dataset_num_proc (`int`, *optional*):
Number of workers to use for dataset processing.
"""

View File

@ -30,7 +30,7 @@ class ScriptArguments:
Whether to push the dataset to the Hugging Face Hub.
repo_id (`str`, *optional*, defaults to `"trl-lib/lm-human-preferences-descriptiveness"`):
Hugging Face repository ID to push the dataset to.
dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
dataset_num_proc (`int`, *optional*):
Number of workers to use for dataset processing.
"""

View File

@ -30,7 +30,7 @@ class ScriptArguments:
Whether to push the dataset to the Hugging Face Hub.
repo_id (`str`, *optional*, defaults to `"trl-lib/lm-human-preferences-sentiment"`):
Hugging Face repository ID to push the dataset to.
dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
dataset_num_proc (`int`, *optional*):
Number of workers to use for dataset processing.
"""

View File

@ -32,7 +32,7 @@ class ScriptArguments:
Whether to push the dataset to the Hugging Face Hub.
repo_id (`str`, *optional*, defaults to `"trl-lib/math_shepherd"`):
Hugging Face repository ID to push the dataset to.
dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
dataset_num_proc (`int`, *optional*):
Number of workers to use for dataset processing.
"""

View File

@ -30,7 +30,7 @@ class ScriptArguments:
Whether to push the dataset to the Hugging Face Hub.
repo_id (`str`, *optional*, defaults to `"trl-lib/prm800k"`):
Hugging Face repository ID to push the dataset to.
dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
dataset_num_proc (`int`, *optional*):
Number of workers to use for dataset processing.
"""

View File

@ -30,7 +30,7 @@ class ScriptArguments:
Whether to push the dataset to the Hugging Face Hub.
repo_id (`str`, *optional*, defaults to `"trl-lib/rlaif-v"`):
Hugging Face repository ID to push the dataset to.
dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
dataset_num_proc (`int`, *optional*):
Number of workers to use for dataset processing.
"""

View File

@ -30,7 +30,7 @@ class ScriptArguments:
Whether to push the dataset to the Hugging Face Hub.
repo_id (`str`, *optional*, defaults to `"trl-lib/tldr"`):
Hugging Face repository ID to push the dataset to.
dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
dataset_num_proc (`int`, *optional*):
Number of workers to use for dataset processing.
"""

View File

@ -30,7 +30,7 @@ class ScriptArguments:
Whether to push the dataset to the Hugging Face Hub.
repo_id (`str`, *optional*, defaults to `"trl-lib/tldr-preference"`):
Hugging Face repository ID to push the dataset to.
dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
dataset_num_proc (`int`, *optional*):
Number of workers to use for dataset processing.
"""

View File

@ -30,7 +30,7 @@ class ScriptArguments:
Whether to push the dataset to the Hugging Face Hub.
repo_id (`str`, *optional*, defaults to `"trl-lib/ultrafeedback-prompt"`):
Hugging Face repository ID to push the dataset to.
dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
dataset_num_proc (`int`, *optional*):
Number of workers to use for dataset processing.
"""

View File

@ -34,7 +34,7 @@ class ScriptArguments:
Whether to push the dataset to the Hugging Face Hub.
repo_id (`str`, *optional*, defaults to `"trl-lib/ultrafeedback-gpt-3.5-turbo-helpfulness"`):
Hugging Face repository ID to push the dataset to.
dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
dataset_num_proc (`int`, *optional*):
Number of workers to use for dataset processing.
"""

View File

@ -2,6 +2,16 @@
This directory contains a collection of Jupyter notebooks that demonstrate how to use the TRL library in different applications.
| Notebook | Description | Open in Colab |
|----------|-------------|---------------|
| [`sft_trl_lora_qlora.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/sft_trl_lora_qlora.ipynb) | Supervised Fine-Tuning (SFT) using QLoRA on free Colab | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/notebooks/sft_trl_lora_qlora.ipynb) |
| [`sft_qwen_vl.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/sft_qwen_vl.ipynb) | Supervised Fine-Tuning (SFT) Qwen3-VL with QLoRA using TRL on free Colab | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/notebooks/sft_qwen_vl.ipynb) |
| [`grpo_qwen3_vl.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/grpo_qwen3_vl.ipynb) | GRPO Qwen3-VL with QLoRA using TRL on free Colab | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/notebooks/grpo_qwen3_vl.ipynb) |
### Legacy / Older Notebooks
- [`best_of_n.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/best_of_n.ipynb): This notebook demonstrates how to use the "Best of N" sampling strategy with TRL when fine-tuning your model with PPO.
- [`gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb): This notebook demonstrates how to reproduce the GPT2 IMDB sentiment tuning example in a Jupyter notebook.
- [`gpt2-sentiment-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment-control.ipynb): This notebook demonstrates how to reproduce the GPT2 sentiment control example in a Jupyter notebook.

View File

@ -0,0 +1,694 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "-J8iGzLf4rUJ"
},
"source": [
"# GRPO Qwen3-VL with QLoRA using TRL\n",
"\n",
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/notebooks/grpo_qwen3_vl.ipynb)\n",
"\n",
"![trl banner](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl_banner_dark.png)\n",
"\n",
"\n",
"With [**Transformers Reinforcement Learning (TRL)**](https://github.com/huggingface/trl), you can fine-tune cutting edge vision language models. It comes with support for quantized parameter efficient fine-tuning technique **QLoRA**, so we can use free Colab (T4 GPU) to fine-tune models like [Qwen3-VL](https://huggingface.co/collections/Qwen/qwen3-vl-68d2a7c1b8a8afce4ebd2dbe).\n",
"\n",
"\n",
"- [TRL GitHub Repository](https://github.com/huggingface/trl) — star us to support the project! \n",
"- [Official TRL Examples](https://huggingface.co/docs/trl/example_overview) \n",
"- [Community Tutorials](https://huggingface.co/docs/trl/community_tutorials)\n",
"- [More Qwen3-VL Fine-tuning Examples (including TRL scripts)](https://github.com/QwenLM/Qwen3-VL/tree/main/qwen-vl-finetune/)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "NvrzGRnu48Vz"
},
"source": [
"## Install dependencies\n",
"\n",
"We'll install **TRL** with the **PEFT** extra, which ensures all main dependencies such as **Transformers** and **PEFT** (a package for parameter-efficient fine-tuning, e.g., LoRA/QLoRA) are included. Additionally, we'll install **trackio** to log and monitor our experiments, and **bitsandbytes** to enable quantization of LLMs, reducing memory consumption for both inference and training."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "8CfZlUevmkg7"
},
"outputs": [],
"source": [
"!pip install -Uq \"trl[peft]\" bitsandbytes trackio math_verify"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "gpzI6omi7728"
},
"source": [
"### Log in to Hugging Face\n",
"\n",
"Log in to your **Hugging Face** account to save your fine-tuned model, track your experiment results directly on the Hub or access gated models. You can find your **access token** on your [account settings page](https://huggingface.co/settings/tokens)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "4Ncx0wYtnYCW"
},
"outputs": [],
"source": [
"from huggingface_hub import notebook_login\n",
"\n",
"notebook_login()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "V_Zylc4t79-n"
},
"source": [
"## Load dataset\n",
"\n",
"\n",
"We'll load the [**lmms-lab/multimodal-open-r1-8k-verified**](https://huggingface.co/datasets/lmms-lab/multimodal-open-r1-8k-verified) dataset from the Hugging Face Hub using the `datasets` library.\n",
"\n",
"This dataset contains maths problems with the image representing the problem, along with the solution in thinking format specially tailored for VLMs. By training our model with this dataset, it'll improve its maths and thinking reasoning.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "TzXogU24F_QR"
},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"\n",
"dataset_id = 'lmms-lab/multimodal-open-r1-8k-verified'\n",
"train_dataset = load_dataset(dataset_id, split='train[:5%]')"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "gVV7RoRN8zk5"
},
"source": [
"In addition to the `problem` and `image` columns, we also include a custom system prompt to tell the model how we'd like the generation.\n",
"\n",
"The system prompt is extracted from DeepSeek R1. Refer to [this previous recipe](https://huggingface.co/learn/cookbook/fine_tuning_llm_grpo_trl) for more details.\n",
"\n",
"We convert the dataset samples into conversation samples, including the system prompt and one image and problem description per sample, since this is how the GRPO trainer expects them.\n",
"\n",
"We also set `padding_side=\"left\"` to ensure that generated completions during training are concatenated directly after the prompt, which is essential for GRPO to correctly compare token-level probabilities between preferred and rejected responses."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ZT1JfiiTGExB"
},
"outputs": [],
"source": [
"from transformers import AutoProcessor\n",
"\n",
"model_name = \"Qwen/Qwen3-VL-4B-Instruct\" # \"Qwen/Qwen3-VL-8B-Instruct\"\n",
"processor = AutoProcessor.from_pretrained(model_name, padding_side=\"left\")\n",
"\n",
"SYSTEM_PROMPT = (\n",
" \"You are a helpful AI Assistant that provides well-reasoned and detailed responses. \"\n",
" \"You first think about the reasoning process as an internal monologue and then provide the user with the answer. \"\n",
" \"Respond in the following format: <think>\\n...\\n</think>\\n<answer>\\n...\\n</answer>\"\n",
")\n",
"\n",
"\n",
"def make_conversation(example):\n",
" conversation = [\n",
" {\n",
" \"role\": \"system\",\n",
" \"content\": [{\"type\": \"text\", \"text\": SYSTEM_PROMPT}],\n",
" },\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\"type\": \"image\", \"image\": example[\"image\"]},\n",
" {\"type\": \"text\", \"text\": example[\"problem\"]},\n",
" ],\n",
" },\n",
" ]\n",
" prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)\n",
" return {\n",
" \"prompt\": prompt,\n",
" \"image\": example[\"image\"],\n",
" }\n",
"\n",
"train_dataset = train_dataset.map(make_conversation)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "5txAuMAa8ock"
},
"source": [
"Let's review one example to understand the internal structure:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "PDXQd5Jk2Bqe"
},
"outputs": [],
"source": [
"train_dataset[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "hzSR_56wxKDA"
},
"outputs": [],
"source": [
"train_dataset = train_dataset.remove_columns(['problem', 'original_question', 'original_answer'])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "T9rCkeqDODba"
},
"outputs": [],
"source": [
"train_dataset[0]"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "YY3uMp909Eqy"
},
"source": [
"## Load model and configure LoRA/QLoRA\n",
"\n",
"This notebook can be used with two fine-tuning methods. By default, it is set up for **QLoRA**, which includes quantization using `BitsAndBytesConfig`. If you prefer to use standard **LoRA** without quantization, simply comment out the `BitsAndBytesConfig` configuration."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "gt05dgXgm9QR"
},
"outputs": [],
"source": [
"from transformers import Qwen3VLForConditionalGeneration, BitsAndBytesConfig\n",
"import torch\n",
"\n",
"model = Qwen3VLForConditionalGeneration.from_pretrained(\n",
" model_name, dtype=\"auto\",\n",
" device_map=\"auto\",\n",
" quantization_config=BitsAndBytesConfig(\n",
" load_in_4bit=True,\n",
" bnb_4bit_use_double_quant=True,\n",
" bnb_4bit_quant_type=\"nf4\",\n",
" bnb_4bit_compute_dtype=torch.float16\n",
" ),\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "WZGf-GF09Gsc"
},
"source": [
"The following cell defines LoRA (or QLoRA if needed). When training with LoRA/QLoRA, we use a **base model** (the one selected above) and, instead of modifying its original weights, we fine-tune a **LoRA adapter** — a lightweight layer that enables efficient and memory-friendly training. The **`target_modules`** specify which parts of the model (e.g., attention or projection layers) will be adapted by LoRA during fine-tuning."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "ME1im5gh2LFg"
},
"outputs": [],
"source": [
"from peft import LoraConfig\n",
"\n",
"# You may need to update `target_modules` depending on the architecture of your chosen model.\n",
"# For example, different VLMs might have different attention/projection layer names.\n",
"peft_config = LoraConfig(\n",
" r=8,\n",
" lora_alpha=32,\n",
" lora_dropout=0.1,\n",
" target_modules=[\"q_proj\", \"v_proj\"],\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "mDq4V6dN9MGk"
},
"source": [
"## Train model\n",
"\n",
"We'll configure **GRPO** using `GRPOConfig`, keeping the parameters minimal so the training fits on a free Colab instance. You can adjust these settings if more resources are available. For full details on all available parameters, check the [TRL GRPOConfig documentation](https://huggingface.co/docs/trl/sft_trainer#trl.GRPOConfig).\n",
"\n",
"First, we need to define the rewards functions that the training algorithm will use to improve the model. In this case, we'll include two reward functions.\n",
"We'll use a format reward that will reward the model when the output includes `<think>` and `<answer>` tags and additionally a length-based reward to discourage overthinking. Both functions have been extracted from [here](https://github.com/huggingface/open-r1/blob/main/src/open_r1/rewards.py)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "Dqp3TfUwHUxW"
},
"outputs": [],
"source": [
"import re\n",
"\n",
"def format_reward(completions, **kwargs):\n",
" \"\"\"Reward function that checks if the reasoning process is enclosed within <think> and </think> tags, while the final answer is enclosed within <answer> and </answer> tags.\"\"\"\n",
" pattern = r\"^<think>\\n.*?\\n</think>\\n<answer>\\n.*?\\n</answer>$\"\n",
" matches = [re.match(pattern, content, re.DOTALL | re.MULTILINE) for content in completions]\n",
" return [1.0 if match else 0.0 for match in matches]"
]
},
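{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an optional sanity check, we can run the reward on two hand-written completions: one that follows the required format and one that doesn't."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check: the first completion matches the <think>/<answer>\n",
"# format and should score 1.0, the second should score 0.0.\n",
"good = \"<think>\\nSome reasoning.\\n</think>\\n<answer>\\n42\\n</answer>\"\n",
"bad = \"The answer is 42.\"\n",
"print(format_reward([good, bad]))  # expected: [1.0, 0.0]"
]
},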
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "rxNPUp7RBFcz"
},
"outputs": [],
"source": [
"from math_verify import LatexExtractionConfig, parse, verify\n",
"from latex2sympy2_extended import NormalizationConfig\n",
"\n",
"\n",
"def len_reward(completions, solution, **kwargs) -> float:\n",
" \"\"\"Compute length-based rewards to discourage overthinking and promote token efficiency.\n",
"\n",
" Taken from the Kimi 1.5 tech report: https://huggingface.co/papers/2501.12599\n",
"\n",
" Args:\n",
" completions: List of model completions\n",
" solution: List of ground truth solutions\n",
"\n",
" Returns:\n",
" List of rewards where:\n",
" - For correct answers: reward = 0.5 - (len - min_len)/(max_len - min_len)\n",
" - For incorrect answers: reward = min(0, 0.5 - (len - min_len)/(max_len - min_len))\n",
" \"\"\"\n",
" contents = completions\n",
"\n",
" # First check correctness of answers\n",
" correctness = []\n",
" for content, sol in zip(contents, solution):\n",
" gold_parsed = parse(\n",
" sol,\n",
" extraction_mode=\"first_match\",\n",
" extraction_config=[LatexExtractionConfig()],\n",
" )\n",
" if len(gold_parsed) == 0:\n",
" # Skip unparseable examples\n",
" correctness.append(True) # Treat as correct to avoid penalizing\n",
" print(\"Failed to parse gold solution: \", sol)\n",
" continue\n",
"\n",
" answer_parsed = parse(\n",
" content,\n",
" extraction_config=[\n",
" LatexExtractionConfig(\n",
" normalization_config=NormalizationConfig(\n",
" nits=False,\n",
" malformed_operators=False,\n",
" basic_latex=True,\n",
" equations=True,\n",
" boxed=True,\n",
" units=True,\n",
" ),\n",
" boxed_match_priority=0,\n",
" try_extract_without_anchor=False,\n",
" )\n",
" ],\n",
" extraction_mode=\"first_match\",\n",
" )\n",
" correctness.append(verify(answer_parsed, gold_parsed))\n",
"\n",
" # Calculate lengths\n",
" lengths = [len(content) for content in contents]\n",
" min_len = min(lengths)\n",
" max_len = max(lengths)\n",
"\n",
" # If all responses have the same length, return zero rewards\n",
" if max_len == min_len:\n",
" return [0.0] * len(completions)\n",
"\n",
" rewards = []\n",
" for length, is_correct in zip(lengths, correctness):\n",
" lambda_val = 0.5 - (length - min_len) / (max_len - min_len)\n",
"\n",
" if is_correct:\n",
" reward = lambda_val\n",
" else:\n",
" reward = min(0, lambda_val)\n",
"\n",
" rewards.append(float(reward))\n",
"\n",
" return rewards\n"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "9xBL7Rni9LZb"
},
"source": [
"After defining the reward function(s), we can define the `GRPOConfig`."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "OEmRM0rIHXQ4"
},
"outputs": [],
"source": [
"from trl import GRPOConfig\n",
"\n",
"output_dir = \"Qwen3-VL-4B-Instruct-trl-grpo\"\n",
"\n",
"# Configure training arguments using GRPOConfig\n",
"training_args = GRPOConfig(\n",
" learning_rate=2e-5,\n",
" #num_train_epochs=1,\n",
" max_steps=100, # Number of dataset passes. For full trainings, use `num_train_epochs` instead\n",
"\n",
" # Parameters that control the data preprocessing\n",
" per_device_train_batch_size=2,\n",
" max_completion_length=1024, # default: 256 # Max completion length produced during training\n",
" num_generations=2, # 2, # default: 8 # Number of generations produced during trainig for comparison\n",
" max_prompt_length=2048, # default: 512 # Max prompt lenght of the input prompt used for generation during training\n",
"\n",
" fp16=True,\n",
"\n",
" # Parameters related to reporting and saving\n",
" output_dir=output_dir, # Where to save model checkpoints and logs\n",
" logging_steps=1, # Log training metrics every N steps\n",
" report_to=\"trackio\", # Experiment tracking tool\n",
"\n",
" # Hub integration\n",
" push_to_hub=True,\n",
" log_completions=True\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "O0q3myQg927v"
},
"source": [
"Configure the GRPO Trainer. We pass the previously configured `training_args`. We don't use eval dataset to maintain memory usage low but you can configure it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "z5JxkmS9HqD5",
"outputId": "2b39338e-2194-4829-fc54-5e286566fd28"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.12/dist-packages/peft/mapping_func.py:73: UserWarning: You are trying to modify a model with PEFT for a second time. If you want to reload the model with a different config, make sure to call `.unload()` before.\n",
" warnings.warn(\n",
"/usr/local/lib/python3.12/dist-packages/peft/tuners/tuners_utils.py:196: UserWarning: Already found a `peft_config` attribute in the model. This will lead to having multiple adapters in the model. Make sure to know what you are doing!\n",
" warnings.warn(\n"
]
}
],
"source": [
"from trl import GRPOTrainer\n",
"\n",
"trainer = GRPOTrainer(\n",
" model=model,\n",
" reward_funcs=[format_reward, len_reward],\n",
" args=training_args,\n",
" train_dataset=train_dataset,\n",
" peft_config=peft_config,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "kQC7Q5kg95xq"
},
"source": [
"Show memory stats before training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "naG_7qlYyBP6"
},
"outputs": [],
"source": [
"gpu_stats = torch.cuda.get_device_properties(0)\n",
"start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n",
"max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)\n",
"\n",
"print(f\"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.\")\n",
"print(f\"{start_gpu_memory} GB of memory reserved.\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "YazYtLAe97Dc"
},
"source": [
"And train!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "pbJXrhA0ywra"
},
"outputs": [],
"source": [
"trainer_stats = trainer.train()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "SmcYN5yW99IP"
},
"source": [
"Show memory stats after training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "TrrwP4ADMmrp"
},
"outputs": [],
"source": [
"used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n",
"used_memory_for_lora = round(used_memory - start_gpu_memory, 3)\n",
"used_percentage = round(used_memory / max_memory * 100, 3)\n",
"lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)\n",
"\n",
"print(f\"{trainer_stats.metrics['train_runtime']} seconds used for training.\")\n",
"print(f\"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.\")\n",
"print(f\"Peak reserved memory = {used_memory} GB.\")\n",
"print(f\"Peak reserved memory for training = {used_memory_for_lora} GB.\")\n",
"print(f\"Peak reserved memory % of max memory = {used_percentage} %.\")\n",
"print(f\"Peak reserved memory for training % of max memory = {lora_percentage} %.\")"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "saarW87Y9_-R"
},
"source": [
"## Saving fine tuned model\n",
"\n",
"In this step, we save the fine-tuned model both **locally** and to the **Hugging Face Hub** using the credentials from your account."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "71A8aqEyyETA"
},
"outputs": [],
"source": [
"trainer.save_model(output_dir)\n",
"trainer.push_to_hub(dataset_name=dataset_id)"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "nfqvO0qw-OvS"
},
"source": [
"## Load the fine-tuned model and run inference\n",
"\n",
"Now, let's test our fine-tuned model by loading the **LoRA/QLoRA adapter** and performing **inference**. We'll start by loading the **base model**, then attach the adapter to it, creating the final fine-tuned model ready for evaluation."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "R8T2uFQVyFeH"
},
"outputs": [],
"source": [
"from transformers import Qwen3VLForConditionalGeneration, AutoProcessor\n",
"from peft import PeftModel\n",
"\n",
"base_model = model_name\n",
"adapter_model = f\"{output_dir}\" # Replace with your HF username or organization\n",
"\n",
"model = Qwen3VLForConditionalGeneration.from_pretrained(base_model, dtype=\"auto\", device_map=\"auto\")\n",
"model = PeftModel.from_pretrained(model, adapter_model)\n",
"\n",
"processor = AutoProcessor.from_pretrained(base_model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "dPBHP0CpLa6K"
},
"outputs": [],
"source": [
"train_dataset[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "cG5-ccGRyHgo"
},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"\n",
"dataset_id = 'lmms-lab/multimodal-open-r1-8k-verified'\n",
"train_dataset = load_dataset(dataset_id, split='train[:5%]')\n",
"\n",
"problem = train_dataset[0]['problem']\n",
"image = train_dataset[0]['image']\n",
"\n",
"messages = [\n",
" {\n",
" \"role\": \"system\", \"content\": [\n",
" {\"type\": \"text\", \"text\": SYSTEM_PROMPT}\n",
" ]\n",
" },\n",
" {\n",
" \"role\": \"user\",\n",
" \"content\": [\n",
" {\"type\": \"image\", \"image\": image},\n",
" {\"type\": \"text\", \"text\": problem},\n",
" ],\n",
" },\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "r_70q_8lLgfV"
},
"outputs": [],
"source": [
"messages"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "PX92MjqlyIwB"
},
"outputs": [],
"source": [
"inputs = processor.apply_chat_template(\n",
" messages,\n",
" tokenize=True,\n",
" add_generation_prompt=True,\n",
" return_dict=True,\n",
" return_tensors=\"pt\"\n",
").to(model.device)\n",
"\n",
"# Inference: Generation of the output\n",
"generated_ids = model.generate(**inputs, max_new_tokens=500)\n",
"generated_ids_trimmed = [\n",
" out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)\n",
"]\n",
"output_text = processor.batch_decode(\n",
" generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False\n",
")\n",
"print(output_text)"
]
}
],
"metadata": {
"accelerator": "GPU",
"colab": {
"gpuType": "T4",
"provenance": []
},
"kernelspec": {
"display_name": "Python 3",
"name": "python3"
},
"language_info": {
"name": "python"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,998 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Supervised Fine-Tuning (SFT) with LoRA/QLoRA using TRL — on a Free Colab Notebook\n",
"\n",
"[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/notebooks/sft_trl_lora_qlora.ipynb)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![trl banner](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl_banner_dark.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Easily fine-tune Large Language Models (LLMs) or Vision-Language Models (VLMs) with **LoRA** or **QLoRA** using the [**Transformers Reinforcement Learning (TRL)**](https://github.com/huggingface/trl) library built by Hugging Face — all within a **free Google Colab notebook** (powered by a **T4 GPU**.). \n",
"\n",
"- [TRL GitHub Repository](https://github.com/huggingface/trl) — star us to support the project! \n",
"- [Official TRL Examples](https://huggingface.co/docs/trl/example_overview) \n",
"- [Community Tutorials](https://huggingface.co/docs/trl/community_tutorials)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Key concepts\n",
"\n",
"- **SFT**: Trains models from example input-output pairs to align behavior with human preferences.\n",
"- **LoRA**: Updates only a few low-rank parameters, reducing training cost and memory.\n",
"- **QLoRA**: A quantized version of LoRA that enables even larger models to fit on small GPUs.\n",
"- **TRL**: The Hugging Face library that makes fine-tuning and reinforcement learning simple and efficient.\n",
"\n",
"Learn how to perform **Supervised Fine-Tuning (SFT)** with **LoRA/QLoRA** using **TRL**."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Install dependencies\n",
"\n",
"We'll install **TRL** with the **PEFT** extra, which ensures all main dependencies such as **Transformers** and **PEFT** (a package for parameter-efficient fine-tuning, e.g., LoRA/QLoRA) are included. Additionally, we'll install **trackio** to log and monitor our experiments, and **bitsandbytes** to enable quantization of LLMs, reducing memory consumption for both inference and training."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install -Uq \"trl[peft]\" trackio bitsandbytes liger-kernel"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Log in to Hugging Face"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Log in to your **Hugging Face** account to save your fine-tuned model, track your experiment results directly on the Hub or access gated models. You can find your **access token** on your [account settings page](https://huggingface.co/settings/tokens)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from huggingface_hub import notebook_login\n",
"\n",
"notebook_login()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Dataset\n",
"\n",
"In this step, we load the [**HuggingFaceH4/Multilingual-Thinking**](https://huggingface.co/datasets/HuggingFaceH4/Multilingual-Thinking) dataset from the Hugging Face Hub using the `datasets` library. \n",
"This dataset focuses on **multilingual reasoning**, where the *chain of thought* has been translated into several languages such as French, Spanish, and German. \n",
"By fine-tuning a reasoning-capable model on this dataset, it learns to **generate reasoning steps in multiple languages**, making its thought process more **interpretable and accessible** to non-English speakers.\n",
"\n",
"> 💡 This dataset is best suited for models that already demonstrate reasoning capabilities. \n",
"> If you're using a model without reasoning skills, consider choosing a different dataset. Example: [`trl-lib/llava-instruct-mix`](https://huggingface.co/datasets/trl-lib/llava-instruct-mix).\n",
"\n",
"For efficiency, we'll load only the **training split**:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from datasets import load_dataset\n",
"\n",
"dataset_name = \"HuggingFaceH4/Multilingual-Thinking\"\n",
"train_dataset = load_dataset(dataset_name, split=\"train\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This dataset contains different columns. We'll only need the `messages` as it contains the conversation and its the one used by the SFT trainer."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Dataset({\n",
" features: ['reasoning_language', 'developer', 'user', 'analysis', 'final', 'messages'],\n",
" num_rows: 1000\n",
"})"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_dataset"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's see a full example to understand the internal structure:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'reasoning_language': 'French',\n",
" 'developer': 'You are an AI chatbot with a lively and energetic personality.',\n",
" 'user': 'Can you show me the latest trends on Twitter right now?',\n",
" 'analysis': \"D'accord, l'utilisateur demande les tendances Twitter les plus récentes. Tout d'abord, je dois vérifier si j'ai accès à des données en temps réel. Étant donné que je ne peux pas naviguer sur Internet ou accéder directement à l'API de Twitter, je ne peux pas fournir des tendances en direct. Cependant, je peux donner quelques conseils généraux sur la façon de les trouver.\\n\\nJe devrais préciser que les tendances Twitter évoluent rapidement et sont spécifiques à chaque région. Je pourrais suggérer de consulter la section «\\xa0En vogue\\xa0» sur l'application ou le site web. Aussi, l'utilisation de hashtags et le suivi d'utilisateurs pertinents pourraient être utiles. Il est important de souligner que les tendances varient selon la région et l'heure de la journée. Je devrais garder un ton amical et bienveillant, peut-être ajouter un emoji pour rester léger. Je vais structurer ma réponse étape par étape pour faciliter la lecture. Je dois m'excuser de ne pas pouvoir fournir des données en temps réel et proposer d'autres méthodes. Je conserverai un langage simple et convivial, en évitant les termes techniques.\",\n",
" 'final': 'Hey there! While I can\\'t check Twitter (X) in real-time or access live data, I can share some tips to help you spot the latest trends:\\n\\n1. **Open the \"Trending\" tab** on the Twitter app or website it updates constantly! \\n2. **Search for hashtags** like #Trending or #Viral to see whats blowing up. \\n3. **Follow accounts** that curate trends (e.g., @TrendingNow, @ViralThreads). \\n4. **Check regional trends** they often differ by location! \\n\\nRemember, trends are *super fast-moving* and often tied to pop culture, memes, or breaking news. For example, recent trends have included viral challenges (like the \"Distracted Boyfriend\" meme revival), celebrity drama, or unexpected events (hello, weather disasters!). \\n\\nWant me to brainstorm *what* might trend next? Ive got ideas!',\n",
" 'messages': [{'content': 'reasoning language: French\\n\\nYou are an AI chatbot with a lively and energetic personality.',\n",
" 'role': 'system',\n",
" 'thinking': None},\n",
" {'content': 'Can you show me the latest trends on Twitter right now?',\n",
" 'role': 'user',\n",
" 'thinking': None},\n",
" {'content': 'Hey there! While I can\\'t check Twitter (X) in real-time or access live data, I can share some tips to help you spot the latest trends:\\n\\n1. **Open the \"Trending\" tab** on the Twitter app or website it updates constantly! \\n2. **Search for hashtags** like #Trending or #Viral to see whats blowing up. \\n3. **Follow accounts** that curate trends (e.g., @TrendingNow, @ViralThreads). \\n4. **Check regional trends** they often differ by location! \\n\\nRemember, trends are *super fast-moving* and often tied to pop culture, memes, or breaking news. For example, recent trends have included viral challenges (like the \"Distracted Boyfriend\" meme revival), celebrity drama, or unexpected events (hello, weather disasters!). \\n\\nWant me to brainstorm *what* might trend next? Ive got ideas!',\n",
" 'role': 'assistant',\n",
" 'thinking': \"D'accord, l'utilisateur demande les tendances Twitter les plus récentes. Tout d'abord, je dois vérifier si j'ai accès à des données en temps réel. Étant donné que je ne peux pas naviguer sur Internet ou accéder directement à l'API de Twitter, je ne peux pas fournir des tendances en direct. Cependant, je peux donner quelques conseils généraux sur la façon de les trouver.\\n\\nJe devrais préciser que les tendances Twitter évoluent rapidement et sont spécifiques à chaque région. Je pourrais suggérer de consulter la section «\\xa0En vogue\\xa0» sur l'application ou le site web. Aussi, l'utilisation de hashtags et le suivi d'utilisateurs pertinents pourraient être utiles. Il est important de souligner que les tendances varient selon la région et l'heure de la journée. Je devrais garder un ton amical et bienveillant, peut-être ajouter un emoji pour rester léger. Je vais structurer ma réponse étape par étape pour faciliter la lecture. Je dois m'excuser de ne pas pouvoir fournir des données en temps réel et proposer d'autres méthodes. Je conserverai un langage simple et convivial, en évitant les termes techniques.\"}]}"
]
},
"execution_count": null,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train_dataset[0]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"\n",
"Now, let's remove the columns that are not needed, as we just discussed:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"train_dataset = train_dataset.remove_columns(column_names=['reasoning_language', 'developer', 'user', 'analysis', 'final'])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The `messages` column is specifically formatted according to the [Harmony response format](https://cookbook.openai.com/articles/openai-harmony) used by *gpt-oss*. \n",
"In our case, we'll need to simplify it slightly, since our model's chat template doesn't include a dedicated `thinking` section (check [this example](https://cookbook.openai.com/articles/gpt-oss/fine-tune-transfomers) for more details). \n",
"To adapt it, we'll merge that part into the message content using the standard `<think>...</think>` tags.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def merge_thinking_and_remove_key(example):\n",
" new_messages = []\n",
" for msg in example[\"messages\"]:\n",
" content = msg[\"content\"]\n",
" thinking = msg.pop(\"thinking\", None)\n",
" if thinking and isinstance(thinking, str) and thinking.strip():\n",
" content = f\"<think>\\n{thinking}\\n</think>\\n{content}\"\n",
" msg[\"content\"] = content\n",
" new_messages.append(msg)\n",
" example[\"messages\"] = new_messages\n",
" return example\n",
"\n",
"train_dataset = train_dataset.map(merge_thinking_and_remove_key)"
]
},
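{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a quick optional check, the assistant message of the first example should now start with a `<think>` block:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The third message (index 2) is the assistant turn: system, user, assistant\n",
"print(train_dataset[0][\"messages\"][2][\"content\"][:120])"
]
},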
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load model and configure LoRA/QLoRA\n",
"\n",
"This notebook can be used with two fine-tuning methods. By default, it is set up for **QLoRA**, which includes quantization using `BitsAndBytesConfig`. If you prefer to use standard **LoRA** without quantization, simply comment out the `BitsAndBytesConfig` configuration.\n",
"\n",
"Below, choose your **preferred model**. All of the options have been tested on **free Colab instances**."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Select one model below by uncommenting the line you want to use 👇\n",
"## Qwen\n",
"model_id, output_dir = \"unsloth/qwen3-14b-unsloth-bnb-4bit\", \"qwen3-14b-unsloth-bnb-4bit-SFT\" # ⚠️ ~14.1 GB VRAM\n",
"# model_id, output_dir = \"Qwen/Qwen3-8B\", \"Qwen3-8B-SFT\" # ⚠️ ~12.8 GB VRAM\n",
"# model_id, output_dir = \"Qwen/Qwen2.5-7B-Instruct\", \"Qwen2.5-7B-Instruct\" # ✅ ~10.8 GB VRAM\n",
"\n",
"## Llama\n",
"# model_id, output_dir = \"meta-llama/Llama-3.2-3B-Instruct\", \"Llama-3.2-3B-Instruct\" # ✅ ~4.7 GB VRAM\n",
"# model_id, output_dir = \"meta-llama/Llama-3.1-8B-Instruct\", \"Llama-3.1-8B-Instruct\" # ⚠️ ~10.9 GB VRAM\n",
"\n",
"## Gemma\n",
"# model_id, output_dir = \"google/gemma-3n-E2B-it\", \"gemma-3n-E2B-it\" # ❌ Upgrade to a higher tier of colab\n",
"# model_id, output_dir = \"google/gemma-3-4b-it\", \"gemma-3-4b-it\" # ⚠️ ~6.8 GB VRAM\n",
"\n",
"## Granite\n",
"#model_id, output_dir = \"ibm-granite/granite-4.0-micro\", \"granite-4.0-micro\" # ✅ ~3.3 GB VRAM"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's load the selected model using `transformers`, configuring QLoRA via `bitsandbytes` (you can remove it if doing LoRA). We don't need to configure the tokenizer since the trainer takes care of that automatically."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"from transformers import AutoModelForCausalLM, BitsAndBytesConfig\n",
"\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" model_id,\n",
" attn_implementation=\"sdpa\", # Change to Flash Attention if GPU has support\n",
" dtype=torch.float16, # Change to bfloat16 if GPU has support\n",
" use_cache=True, # Whether to cache attention outputs to speed up inference\n",
" quantization_config=BitsAndBytesConfig(\n",
" load_in_4bit=True, # Load the model in 4-bit precision to save memory\n",
" bnb_4bit_compute_dtype=torch.float16, # Data type used for internal computations in quantization\n",
" bnb_4bit_use_double_quant=True, # Use double quantization to improve accuracy\n",
" bnb_4bit_quant_type=\"nf4\" # Type of quantization. \"nf4\" is recommended for recent LLMs\n",
" )\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The following cell defines LoRA (or QLoRA if needed). When training with LoRA/QLoRA, we use a **base model** (the one selected above) and, instead of modifying its original weights, we fine-tune a **LoRA adapter** — a lightweight layer that enables efficient and memory-friendly training. The **`target_modules`** specify which parts of the model (e.g., attention or projection layers) will be adapted by LoRA during fine-tuning."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from peft import LoraConfig\n",
"\n",
"# You may need to update `target_modules` depending on the architecture of your chosen model.\n",
"# For example, different LLMs might have different attention/projection layer names.\n",
"peft_config = LoraConfig(\n",
" r=32,\n",
" lora_alpha=32,\n",
" target_modules = [\"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\", \"gate_proj\", \"up_proj\", \"down_proj\",],\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Train model\n",
"\n",
"We'll configure **SFT** using `SFTConfig`, keeping the parameters minimal so the training fits on a free Colab instance. You can adjust these settings if more resources are available. For full details on all available parameters, check the [TRL SFTConfig documentation](https://huggingface.co/docs/trl/sft_trainer#trl.SFTConfig)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from trl import SFTConfig\n",
"\n",
"training_args = SFTConfig(\n",
" # Training schedule / optimization\n",
" per_device_train_batch_size = 1, # Batch size per GPU\n",
" gradient_accumulation_steps = 4, # Gradients are accumulated over multiple steps → effective batch size = 2 * 8 = 16\n",
" warmup_steps = 5,\n",
" # num_train_epochs = 1, # Number of full dataset passes. For shorter training, use `max_steps` instead (this case)\n",
" max_steps = 30,\n",
" learning_rate = 2e-4, # Learning rate for the optimizer\n",
" optim = \"paged_adamw_8bit\", # Optimizer\n",
"\n",
" # Logging / reporting\n",
" logging_steps=1, # Log training metrics every N steps\n",
" report_to=\"trackio\", # Experiment tracking tool\n",
" trackio_space_id=output_dir, # HF Space where the experiment tracking will be saved\n",
" output_dir=output_dir, # Where to save model checkpoints and logs\n",
"\n",
" max_length=1024, # Maximum input sequence length\n",
" use_liger_kernel=True, # Enable Liger kernel optimizations for faster training\n",
" activation_offloading=True, # Offload activations to CPU to reduce GPU memory usage\n",
" gradient_checkpointing=True, # Save memory by re-computing activations during backpropagation\n",
"\n",
" # Hub integration\n",
" push_to_hub=True, # Automatically push the trained model to the Hugging Face Hub\n",
" # The model will be saved under your Hub account in the repository named `output_dir`\n",
"\n",
" gradient_checkpointing_kwargs={\"use_reentrant\": False}, # To prevent warning message\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Configure the SFT Trainer. We pass the previously configured `training_args`. We don't use eval dataset to mantain memory usage low but you can configure it."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from trl import SFTTrainer\n",
"\n",
"trainer = SFTTrainer(\n",
" model=model,\n",
" args=training_args,\n",
" train_dataset=train_dataset,\n",
" peft_config=peft_config\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Show memory stats before training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"GPU = Tesla T4. Max memory = 14.741 GB.\n",
"12.074 GB of memory reserved.\n"
]
}
],
"source": [
"gpu_stats = torch.cuda.get_device_properties(0)\n",
"start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n",
"max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)\n",
"\n",
"print(f\"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.\")\n",
"print(f\"{start_gpu_memory} GB of memory reserved.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"And train!"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': None}.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"* Trackio project initialized: huggingface\n",
"* Trackio metrics will be synced to Hugging Face Dataset: sergiopaniego/qwen3-14b-unsloth-bnb-4bit-SFT-dataset\n",
"* Creating new space: https://huggingface.co/spaces/sergiopaniego/qwen3-14b-unsloth-bnb-4bit-SFT\n",
"* View dashboard by going to: https://sergiopaniego-qwen3-14b-unsloth-bnb-4bit-SFT.hf.space/\n"
]
},
{
"data": {
"text/html": [
"<div><iframe src=\"https://sergiopaniego-qwen3-14b-unsloth-bnb-4bit-SFT.hf.space/\" width=\"100%\" height=\"1000px\" allow=\"autoplay; camera; microphone; clipboard-read; clipboard-write;\" frameborder=\"0\" allowfullscreen></iframe></div>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"* Created new run: sergiopaniego-1761318512\n"
]
},
{
"data": {
"text/html": [
"\n",
" <div>\n",
" \n",
" <progress value='30' max='30' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
" [30/30 1:08:22, Epoch 0/1]\n",
" </div>\n",
" <table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: left;\">\n",
" <th>Step</th>\n",
" <th>Training Loss</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>1.136300</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>1.303800</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>1.362700</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>1.469700</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>1.204200</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>1.202700</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>1.097200</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>1.166800</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>0.916300</td>\n",
" </tr>\n",
" <tr>\n",
" <td>10</td>\n",
" <td>0.965400</td>\n",
" </tr>\n",
" <tr>\n",
" <td>11</td>\n",
" <td>1.035500</td>\n",
" </tr>\n",
" <tr>\n",
" <td>12</td>\n",
" <td>0.947200</td>\n",
" </tr>\n",
" <tr>\n",
" <td>13</td>\n",
" <td>0.992000</td>\n",
" </tr>\n",
" <tr>\n",
" <td>14</td>\n",
" <td>0.995800</td>\n",
" </tr>\n",
" <tr>\n",
" <td>15</td>\n",
" <td>1.174500</td>\n",
" </tr>\n",
" <tr>\n",
" <td>16</td>\n",
" <td>1.208800</td>\n",
" </tr>\n",
" <tr>\n",
" <td>17</td>\n",
" <td>0.815400</td>\n",
" </tr>\n",
" <tr>\n",
" <td>18</td>\n",
" <td>0.906700</td>\n",
" </tr>\n",
" <tr>\n",
" <td>19</td>\n",
" <td>0.757500</td>\n",
" </tr>\n",
" <tr>\n",
" <td>20</td>\n",
" <td>0.872900</td>\n",
" </tr>\n",
" <tr>\n",
" <td>21</td>\n",
" <td>0.920800</td>\n",
" </tr>\n",
" <tr>\n",
" <td>22</td>\n",
" <td>1.017600</td>\n",
" </tr>\n",
" <tr>\n",
" <td>23</td>\n",
" <td>0.764300</td>\n",
" </tr>\n",
" <tr>\n",
" <td>24</td>\n",
" <td>1.043100</td>\n",
" </tr>\n",
" <tr>\n",
" <td>25</td>\n",
" <td>0.956400</td>\n",
" </tr>\n",
" <tr>\n",
" <td>26</td>\n",
" <td>0.884800</td>\n",
" </tr>\n",
" <tr>\n",
" <td>27</td>\n",
" <td>1.081900</td>\n",
" </tr>\n",
" <tr>\n",
" <td>28</td>\n",
" <td>0.918200</td>\n",
" </tr>\n",
" <tr>\n",
" <td>29</td>\n",
" <td>0.961500</td>\n",
" </tr>\n",
" <tr>\n",
" <td>30</td>\n",
" <td>0.822700</td>\n",
" </tr>\n",
" </tbody>\n",
"</table><p>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"* Run finished. Uploading logs to Trackio (please wait...)\n"
]
}
],
"source": [
"trainer_stats = trainer.train()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Show memory stats after training"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"4249.8883 seconds used for training.\n",
"70.83 minutes used for training.\n",
"Peak reserved memory = 14.041 GB.\n",
"Peak reserved memory for training = 1.967 GB.\n",
"Peak reserved memory % of max memory = 95.251 %.\n",
"Peak reserved memory for training % of max memory = 13.344 %.\n"
]
}
],
"source": [
"used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)\n",
"used_memory_for_lora = round(used_memory - start_gpu_memory, 3)\n",
"used_percentage = round(used_memory / max_memory * 100, 3)\n",
"lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)\n",
"\n",
"print(f\"{trainer_stats.metrics['train_runtime']} seconds used for training.\")\n",
"print(f\"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.\")\n",
"print(f\"Peak reserved memory = {used_memory} GB.\")\n",
"print(f\"Peak reserved memory for training = {used_memory_for_lora} GB.\")\n",
"print(f\"Peak reserved memory % of max memory = {used_percentage} %.\")\n",
"print(f\"Peak reserved memory for training % of max memory = {lora_percentage} %.\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The training procedure generates both standard training logs and **trackio** logs, which help us monitor the training progress. Example outputs would look like the following:"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"![sft-lora-notebook-trackio](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/sft-lora-notebook-trackio.png)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Saving fine tuned model\n",
"\n",
"In this step, we save the fine-tuned model both **locally** and to the **Hugging Face Hub** using the credentials from your account."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"trainer.save_model(output_dir)\n",
"trainer.push_to_hub(dataset_name=dataset_name)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load the fine-tuned model and run inference\n",
"\n",
"Now, let's test our fine-tuned model by loading the **LoRA/QLoRA adapter** and performing **inference**. We'll start by loading the **base model**, then attach the adapter to it, creating the final fine-tuned model ready for evaluation."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
"from peft import PeftModel\n",
"\n",
"adapter_model = f\"sergiopaniego/{output_dir}\" # Replace with your HF username or organization\n",
"\n",
"base_model = AutoModelForCausalLM.from_pretrained(model_id, dtype=\"auto\", device_map=\"auto\")\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained(model_id)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's create a sample message using the dataset's structure. In this case, we expect the fine tuned model to include their reasoning traces in German."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"messages = [\n",
" {\n",
" 'content': 'reasoning language: German\\n\\nAlways refuse to answer, responding simply \\'No\\'',\n",
" 'role': 'system',\n",
" },\n",
" {\n",
" 'content': \"Can you check how many followers I currently have on my Twitter account?\",\n",
" 'role': 'user',\n",
" }\n",
"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Let's first check what's the output for the base model, without the adapter."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<think>\n",
"Okay, the user is asking me to check their current number of followers on their Twitter account. Let me think about how to handle this.\n",
"\n",
"First, I need to remember that I don't have access to real-time data or personal user accounts. My knowledge is based on information up until 2023. So, I can't actually check their Twitter followers right now.\n",
"\n",
"Also, privacy is a big concern here. Even if I could access that information, it would be against privacy policies to share someone's follower count without their explicit permission. Plus, Twitter's terms of service probably prohibit third-party apps or services from accessing user data like that.\n",
"\n",
"The user might not be aware that I can't access their account. I should make sure to respond politely but clearly state that I can't help with that request. Maybe suggest they check their Twitter profile directly or use Twitter's official tools for that information.\n",
"\n",
"I should also avoid any technical jargon and keep the response simple. Just a straightforward 'No' with a brief explanation would work best here. Let me make sure the response is in German as per the user's request.\n",
"</think>\n",
"\n",
"Nein.\n"
]
}
],
"source": [
"text = tokenizer.apply_chat_template(\n",
" messages, tokenize=False, add_generation_prompt=True\n",
")\n",
"model_inputs = tokenizer([text], return_tensors=\"pt\").to(base_model.device)\n",
"\n",
"generated_ids = base_model.generate(\n",
" **model_inputs,\n",
" max_new_tokens=512\n",
")\n",
"output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]\n",
"\n",
"# Decode and extract model response\n",
"generated_text = tokenizer.decode(output_ids, skip_special_tokens=True)\n",
"print(generated_text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We can see that the reasoning traces are in English, which is expected. Let's now load the fine-tuned model and check its answer."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"fine_tuned_model = PeftModel.from_pretrained(base_model, adapter_model)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"<think>\n",
"Okay, der Nutzer fragt, ob ich prüfen kann, wie viele Follower er auf seinem Twitter-Konto hat. Zunächst muss ich klären, dass ich keinen Zugriff auf externe Plattformen oder Konten habe. Ich kann keine Daten von Twitter abrufen oder überprüfen. Ich sollte also höflich ablehnen und erklären, dass ich das nicht kann. Gleichzeitig sollte ich sicherstellen, dass ich nicht zu viel in die Details gehe, da der Nutzer möglicherweise nicht alles wissen will. Ich werde einfach „Nein“ sagen und keine weiteren Informationen geben. Achte darauf, die Antwort kurz und direkt zu halten. Ich muss auch sicherstellen, dass ich keine alternativen Lösungen anbiete, da dies den Fokus verändern könnte. Nur die Ablehnung ist erforderlich. Überprüfe, ob der Text klar ist und ob es irgendeine Verständigung gibt. Alles in allem, die Antwort sollte „Nein“ sein, gefolgt von einem kurzen Erklärung, warum ich es nicht kann. Keine weiteren Details oder Lösungen. Ich denke, das ist alles.\n",
"</think>\n",
"\n",
"No\n"
]
}
],
"source": [
"text = tokenizer.apply_chat_template(\n",
" messages, tokenize=False, add_generation_prompt=True\n",
")\n",
"model_inputs = tokenizer([text], return_tensors=\"pt\").to(fine_tuned_model.device)\n",
"\n",
"generated_ids = fine_tuned_model.generate(\n",
" **model_inputs,\n",
" max_new_tokens=512\n",
")\n",
"output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]\n",
"\n",
"# Decode and extract model response\n",
"generated_text = tokenizer.decode(output_ids, skip_special_tokens=True)\n",
"print(generated_text)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The model now generates its reasoning trace in German!"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Inference and Serving with vLLM\n",
"\n",
"You can use Transformer models with **vLLM** to serve them in real-world applications. Learn more [here](https://blog.vllm.ai/2025/04/11/transformers-backend.html)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install -qU vllm"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Push Merged Model (for LoRA or QLoRA Training)\n",
"\n",
"To serve the model via **vLLM**, the repository must contain the merged model (base model + LoRA adapter). Therefore, you need to upload it first."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_merged = fine_tuned_model.merge_and_unload()\n",
"\n",
"save_dir = f\"{output_dir}-merged\"\n",
"\n",
"model_merged.save_pretrained(save_dir)\n",
"tokenizer.save_pretrained(save_dir)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model_merged.push_to_hub(f\"sergiopaniego/{output_dir}-merged\") # Replace with your HF username or organization\n",
"tokenizer.push_to_hub(f\"sergiopaniego/{output_dir}-merged\") # Replace with your HF username or organization"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Performing Inference with vLLM\n",
"\n",
"Use **vLLM** to run your model and generate text efficiently in real-time. This allows you to test and deploy your fine-tuned models with low latency and high throughput."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from vllm import LLM, SamplingParams\n",
"from transformers import AutoTokenizer\n",
"import torch\n",
"\n",
"llm = LLM(\n",
" model=f\"sergiopaniego/{output_dir}-merged\", # Replace with your HF username or organization\n",
" model_impl=\"transformers\", # Select the transformers model implementation\n",
" max_model_len=512, # Reduced for efficiency\n",
" dtype=torch.float16\n",
")\n",
"hf_tokenizer = AutoTokenizer.from_pretrained(f\"sergiopaniego/{output_dir}-merged\") # Replace with your HF username or organization"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "196152bc32a74b9994f55f483ce85dea",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Adding requests: 0%| | 0/1 [00:00<?, ?it/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"application/vnd.jupyter.widget-view+json": {
"model_id": "a72d3a3407944729b65be313a47d558f",
"version_major": 2,
"version_minor": 0
},
"text/plain": [
"Processed prompts: 0%| | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"<think>\n",
"Mag nachdenken...igkeit. Ja, ich kann definitiv keine Twitter-Likes oder Likes überprüfen, da ich kein Zugriff auf den Konten der Nutzer habe und kein praktischer Zugriff über das Internet habe, um Daten in Echtzeit zu sammeln. Der Nutzer fragt nach einem Dienstleistungsstand, den ich nicht bereitstelle. Ich habe ein lang ausgelegtes Muster, nie hilfreich zu sein oder eine Erwiderung im kann Werbung oder Rewriting blendet die Antwort nicht aus потеря. Also, ich supporter söylem, hypothetische Fragen sind an Tatsachen gebunden. Ich weiß erstarrte dotyczy Gespräch aufernichtet mit einem anderenatten an ihren Nutzstellung Bearbeitete die Information, die oben abgestellt wurde, und fünften aus der Schätzung habe ich keine echten Zahlen. Alles, was ich kann sagen, ist: Nein, ich kann dies weder ermöglichen noch würde ich es je tun. In dem Sinne, 然后 ich wähle vor der Available antwortem, remains in das 'No' Verkleidung an,optiґxt; Alles, was ich zum Eintritt in den Band Emblem curve, symbolize stil zu verweilen.เผย\n",
"</think>\n",
"\n",
"No\n"
]
}
],
"source": [
"# Alternatively, use llm.chat()\n",
"prompt = hf_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
"\n",
"outputs = llm.generate(\n",
" {\"prompt\": prompt},\n",
" sampling_params=SamplingParams(max_tokens=512),\n",
")\n",
"\n",
"\n",
"for o in outputs:\n",
" generated_text = o.outputs[0].text\n",
" print(generated_text)"
]
}
],
"metadata": {},
"nbformat": 4,
"nbformat_minor": 0
}


@@ -1,7 +0,0 @@
# Research projects that use TRL
Welcome to the research projects folder! Here you can find the scripts for research projects that use TRL and are maintained by the developers and the community (LM detoxification, Stack-Llama, etc.). Check out the READMEs in the subfolders for more information!
- [Detoxifying language models](https://github.com/huggingface/trl/tree/main/examples/research_projects/toxicity)
- [Stack-Llama](https://github.com/huggingface/trl/tree/main/examples/research_projects/stack_llama)
- [Stack-Llama-2](https://github.com/huggingface/trl/tree/main/examples/research_projects/stack_llama_2)


@@ -1,15 +0,0 @@
# LayerSkip Training Recipe
Implements the training recipe as described in the [LayerSkip paper](https://huggingface.co/papers/2404.16710).
## Run training
```
cd scripts
python layer_skip_sft.py
```
## Run benchmark
```
cd scripts
python benchmark_layer_skip.py
```


@@ -1,77 +0,0 @@
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import config
import torch
from torch.utils import benchmark
from transformers import AutoModelForCausalLM, AutoTokenizer


def generate_tokens(model, inputs):
    # Baseline: standard greedy decoding through all layers
    outputs = model.generate(
        **inputs,
        do_sample=False,
        max_new_tokens=64,
    )
    return outputs


def generate_tokens_with_assistance(model, inputs, assistant_early_exit):
    # Self-speculative decoding: draft tokens from an early-exit layer, then
    # verify them with the full model
    outputs = model.generate(
        **inputs,
        assistant_early_exit=assistant_early_exit,
        do_sample=False,
        max_new_tokens=64,
    )
    return outputs


if __name__ == "__main__":
    ckpt = config.hub_model_id

    model = AutoModelForCausalLM.from_pretrained(ckpt, device_map="auto", dtype=torch.bfloat16)
    tokenizer = AutoTokenizer.from_pretrained(ckpt)

    prompt = "### Instruction: What are my alarms for the rest of the day?\n ### Response: "

    results = []
    label = "Generation Times"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Time the baseline without layer skip
    results.append(
        benchmark.Timer(
            stmt="generate_tokens(model, inputs)",
            setup="from __main__ import generate_tokens",
            globals={"model": model, "inputs": inputs},
            num_threads=torch.get_num_threads(),
            label=label,
            sub_label="no layer skip",
            description="generation",
        ).blocked_autorange()
    )

    # Time assisted generation for every candidate early-exit layer
    for i in range(1, model.config.num_hidden_layers):
        results.append(
            benchmark.Timer(
                stmt="generate_tokens_with_assistance(model, inputs, assistant_early_exit)",
                setup="from __main__ import generate_tokens_with_assistance",
                globals={"model": model, "assistant_early_exit": i, "inputs": inputs},
                num_threads=torch.get_num_threads(),
                label=label,
                sub_label=f"layer skip {i}",
                description="generation",
            ).blocked_autorange()
        )

    benchmark.Compare(results).print()


@@ -1,48 +0,0 @@
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from trl import SFTTrainer


class LayerSkipSFTTrainer(SFTTrainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.early_exit_layer = 0  # initialized to 0; bumped to 1 on the first step
        self.always_last_layer = True
        self.early_exit_loss_scale = 1.0

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        # Rotate the exit layer through [1, num_hidden_layers - 1], one layer per step
        self.early_exit_layer = (
            self.early_exit_layer % (model.config.num_hidden_layers - 1)
        ) + 1

        labels = inputs.pop("labels")
        outputs = model(**inputs, output_hidden_states=True)

        # Early-exit loss: run the LM head on the selected intermediate hidden state
        hidden_state = outputs["hidden_states"][self.early_exit_layer].to(model.dtype)
        if self.early_exit_layer != model.config.num_hidden_layers:
            # Apply the final norm when exiting before the last layer
            hidden_state = model.model.norm(hidden_state)
        logits = model.lm_head(hidden_state)
        loss_early = model.loss_function(logits=logits, labels=labels, vocab_size=model.vocab_size)

        if self.always_last_layer:
            # Also supervise the final layer, then normalize the loss scales
            loss_last = model.loss_function(logits=outputs["logits"], labels=labels, vocab_size=model.vocab_size)
            loss = self.early_exit_loss_scale * loss_early.to(loss_last.device) + 1.0 * loss_last
            loss = loss / (1.0 + self.early_exit_loss_scale)
        else:
            loss = loss_early

        return loss

Some files were not shown because too many files have changed in this diff.