Mirror of https://github.com/huggingface/trl.git (synced 2025-10-21 02:53:59 +08:00)
Compare commits: v0.14.0 ... simplify-d (536 commits)
3 .github/PULL_REQUEST_TEMPLATE.md vendored
@@ -21,8 +21,7 @@ Fixes # (issue)
Pull Request section?
- [ ] Was this discussed/approved via a GitHub issue? Please add a link
  to it if that's the case.
- [ ] Did you make sure to update the documentation with your changes? Here are the
  [documentation guidelines](https://github.com/huggingface/trl/tree/main/docs).
- [ ] Did you make sure to update the documentation with your changes?
- [ ] Did you write any new necessary tests?
19 .github/codeql/custom-queries.qls vendored Normal file
@@ -0,0 +1,19 @@
import codeql

from WorkflowString interpolation, Workflow workflow
where
  interpolation.getStringValue().matches("${{ github.event.issue.title }}") or
  interpolation.getStringValue().matches("${{ github.event.issue.body }}") or
  interpolation.getStringValue().matches("${{ github.event.pull_request.title }}") or
  interpolation.getStringValue().matches("${{ github.event.pull_request.body }}") or
  interpolation.getStringValue().matches("${{ github.event.review.body }}") or
  interpolation.getStringValue().matches("${{ github.event.comment.body }}") or
  interpolation.getStringValue().matches("${{ github.event.inputs.* }}") or
  interpolation.getStringValue().matches("${{ github.event.head_commit.message }}")
  interpolation.getStringValue().matches("${{ github.event.* }}") and
  (
    step.getKey() = "run" or // Injection in run
    step.getKey() = "env" or // Injection via env
    step.getKey() = "with" // Injection via with
  )
select workflow, "🚨 Do not use directly as input of action"
1 .github/workflows/build_pr_documentation.yml vendored
@@ -9,6 +9,7 @@ concurrency:

jobs:
  build:
    if: github.event.pull_request.draft == false
    uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
    with:
      commit_sha: ${{ github.event.pull_request.head.sha }}
26 .github/workflows/codeQL.yml vendored Normal file
@@ -0,0 +1,26 @@
name: "CodeQL Analysis - Workflows"

on:
  workflow_dispatch:

jobs:
  analyze:
    name: "Analyze GitHub Workflows"
    runs-on: ubuntu-latest
    permissions:
      security-events: write
      actions: read
      contents: read

    steps:
      - name: "Checkout repository"
        uses: actions/checkout@v4

      - name: "Initialize CodeQL"
        uses: github/codeql-action/init@v2
        with:
          languages: "yaml"
          queries: +security-and-quality, ./.github/codeql/custom-queries.qls

      - name: "Perform CodeQL Analysis"
        uses: github/codeql-action/analyze@v2
127 .github/workflows/pr_style_bot.yml vendored Normal file
@@ -0,0 +1,127 @@
name: PR Style Bot

on:
  workflow_dispatch:


permissions:
  contents: write
  pull-requests: write

jobs:
  run-style-bot:
    if: >
      contains(github.event.comment.body, '@bot /style') &&
      github.event.issue.pull_request != null
    runs-on: ubuntu-latest

    steps:
      - name: Extract PR details
        id: pr_info
        uses: actions/github-script@v6
        with:
          script: |
            const prNumber = context.payload.issue.number;
            const { data: pr } = await github.rest.pulls.get({
              owner: context.repo.owner,
              repo: context.repo.repo,
              pull_number: prNumber
            });

            // We capture both the branch ref and the "full_name" of the head repo
            // so that we can check out the correct repository & branch (including forks).
            core.setOutput("prNumber", prNumber);
            core.setOutput("headRef", pr.head.ref);
            core.setOutput("headRepoFullName", pr.head.repo.full_name);

      - name: Check out PR branch
        uses: actions/checkout@v3
        env:
          HEADREPOFULLNAME: ${{ steps.pr_info.outputs.headRepoFullName }}
          HEADREF: ${{ steps.pr_info.outputs.headRef }}
        with:
          # Instead of checking out the base repo, use the contributor's repo name
          repository: ${{ env.HEADREPOFULLNAME }}
          ref: ${{ env.HEADREF }}
          # You may need fetch-depth: 0 for being able to push
          fetch-depth: 0
          token: ${{ secrets.GITHUB_TOKEN }}

      - name: Debug
        env:
          HEADREPOFULLNAME: ${{ steps.pr_info.outputs.headRepoFullName }}
          HEADREF: ${{ steps.pr_info.outputs.headRef }}
          PRNUMBER: ${{ steps.pr_info.outputs.prNumber }}
        run: |
          echo "PR number: ${{ env.PRNUMBER }}"
          echo "Head Ref: ${{ env.HEADREF }}"
          echo "Head Repo Full Name: ${{ env.HEADREPOFULLNAME }}"

      - name: Set up Python
        uses: actions/setup-python@v4

      - name: Install dependencies
        run: |
          pip install ruff pre-commit

      - name: Download Makefile from main branch
        run: |
          curl -o main_Makefile https://raw.githubusercontent.com/huggingface/trl/main/Makefile

      - name: Compare Makefiles
        run: |
          if ! diff -q main_Makefile Makefile; then
            echo "Error: The Makefile has changed. Please ensure it matches the main branch."
            exit 1
          fi
          echo "No changes in Makefile. Proceeding..."
          rm -rf main_Makefile

      - name: Run make style and make quality
        run: |
          make precommit || true

      - name: Commit and push changes
        id: commit_and_push
        env:
          HEADREPOFULLNAME: ${{ steps.pr_info.outputs.headRepoFullName }}
          HEADREF: ${{ steps.pr_info.outputs.headRef }}
          PRNUMBER: ${{ steps.pr_info.outputs.prNumber }}
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          echo "HEADREPOFULLNAME: ${{ env.HEADREPOFULLNAME }}, HEADREF: ${{ env.HEADREF }}"
          # Configure git with the Actions bot user
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"

          # Make sure your 'origin' remote is set to the contributor's fork
          git remote set-url origin "https://x-access-token:${GITHUB_TOKEN}@github.com/${{ env.HEADREPOFULLNAME }}.git"

          # If there are changes after running style/quality, commit them
          if [ -n "$(git status --porcelain)" ]; then
            git add .
            git commit -m "Apply style fixes"
            # Push to the original contributor's forked branch
            git push origin HEAD:${{ env.HEADREF }}
            echo "changes_pushed=true" >> $GITHUB_OUTPUT
          else
            echo "No changes to commit."
            echo "changes_pushed=false" >> $GITHUB_OUTPUT
          fi

      - name: Comment on PR with workflow run link
        if: steps.commit_and_push.outputs.changes_pushed == 'true'
        uses: actions/github-script@v6
        with:
          script: |
            const prNumber = parseInt(process.env.prNumber, 10);
            const runUrl = `${process.env.GITHUB_SERVER_URL}/${process.env.GITHUB_REPOSITORY}/actions/runs/${process.env.GITHUB_RUN_ID}`

            await github.rest.issues.createComment({
              owner: context.repo.owner,
              repo: context.repo.repo,
              issue_number: prNumber,
              body: `Style fixes have been applied. [View the workflow run here](${runUrl}).`
            });
        env:
          prNumber: ${{ steps.pr_info.outputs.prNumber }}
43 .github/workflows/publish.yml vendored Normal file
@@ -0,0 +1,43 @@
name: Publish to PyPI

on:
  push:
    branches:
      - main
      - v*-release
    paths:
      - "VERSION"

jobs:
  publish:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Read version
        id: get_version
        run: echo "version=$(cat VERSION)" >> $GITHUB_OUTPUT

      - name: Debug - Show version.txt content
        run: echo "Version is ${{ steps.get_version.outputs.version }}"

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.x"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install build twine

      - name: Build package
        run: python -m build

      - name: Publish to PyPI
        if: ${{ !contains(steps.get_version.outputs.version, 'dev') }}
        env:
          TWINE_USERNAME: __token__
          TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
        run: |
          python -m twine upload dist/*
24 .github/workflows/slow-tests.yml vendored
@@ -2,7 +2,7 @@ name: Slow tests (on push)

on:
  push:
    branches: [ main ]
    branches: [main]
    paths:
      # Run only when python files are modified
      - "trl/**.py"
@@ -12,13 +12,16 @@ env:
  IS_GITHUB_CI: "1"
  SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}


jobs:
  run_all_tests_single_gpu:
    strategy:
      fail-fast: false
      matrix:
        docker-image-name: ["huggingface/trl-latest-gpu:latest", "huggingface/trl-source-gpu:latest"]
        docker-image-name:
          [
            "huggingface/trl-latest-gpu:latest",
            "huggingface/trl-source-gpu:latest",
          ]
    runs-on:
      group: aws-g4dn-2xlarge
    env:
@@ -35,7 +38,7 @@ jobs:
      - name: Pip install
        run: |
          source activate trl
          pip install -e ".[test]" --no-deps
          pip install -e ".[test,vlm]" --no-deps
          pip install pytest-reportlog parameterized

      - name: Run slow SFT tests on single GPU
@@ -43,19 +46,22 @@ jobs:
        run: |
          source activate trl
          make slow_tests


      - name: Generate Report
        if: always()
        run: |
          pip install slack_sdk tabulate
          python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY


  run_all_tests_multi_gpu:
    strategy:
      fail-fast: false
      matrix:
        docker-image-name: ["huggingface/trl-latest-gpu:latest", "huggingface/trl-source-gpu:latest"]
        docker-image-name:
          [
            "huggingface/trl-latest-gpu:latest",
            "huggingface/trl-source-gpu:latest",
          ]
    runs-on:
      group: aws-g4dn-2xlarge
    env:
@@ -72,7 +78,7 @@ jobs:
      - name: Pip install
        run: |
          source activate trl
          pip install -e ".[test]" --no-deps
          pip install -e ".[test,vlm]" --no-deps
          pip install pytest-reportlog parameterized

      - name: Run slow SFT tests on Multi GPU
@@ -87,7 +93,7 @@ jobs:
          source activate trl
          pip install deepspeed
          make test_examples


      - name: Generate Reports
        if: always()
        run: |
183 .github/workflows/tests.yml vendored
@@ -21,11 +21,9 @@ jobs:
  check_code_quality:
    name: Check code quality
    runs-on: ubuntu-latest
    if: github.event.pull_request.draft == false
    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0
          submodules: recursive
      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
@@ -38,126 +36,217 @@ jobs:
    name: Tests
    strategy:
      matrix:
        python-version: ['3.9', '3.10', '3.11', '3.12']
        os: ['ubuntu-latest', 'windows-latest']
        python-version: ['3.9', '3.10', '3.11', '3.12', '3.13']
      fail-fast: false
    runs-on: ${{ matrix.os }}
    runs-on:
      group: aws-g4dn-2xlarge
    container:
      image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
      options: --gpus all
    defaults:
      run:
        shell: bash
    if: github.event.pull_request.draft == false
    steps:
      - uses: actions/checkout@v4
      - name: Git checkout
        uses: actions/checkout@v4

      - name: Set up Python ${{ matrix.python-version }}
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: "pip"
          cache-dependency-path: |
            setup.py
            requirements.txt

      - name: Install Make and Git
        run: |
          apt-get update && apt-get install -y make git curl

      - name: Install uv
        run: |
          curl -LsSf https://astral.sh/uv/install.sh | sh

      - name: Create Python virtual environment
        run: |
          uv venv
          uv pip install --upgrade setuptools wheel

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install ".[dev]"
          source .venv/bin/activate
          uv pip install ".[dev]"

      - name: Test with pytest
        run: |
          source .venv/bin/activate
          make test

      - name: Post to Slack
        if: github.ref == 'refs/heads/main' && always() # Check if the branch is main
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ env.CI_SLACK_CHANNEL }}
          title: Results with Python ${{ matrix.python-version }} on ${{ matrix.os }} with lastest dependencies
          title: Results with Python ${{ matrix.python-version }} and latest dependencies
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

  tests_dev:
    name: Tests with dev dependencies
    runs-on: 'ubuntu-latest'
    runs-on:
      group: aws-g4dn-2xlarge
    container:
      image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
      options: --gpus all
    defaults:
      run:
        shell: bash
    if: github.event.pull_request.draft == false
    steps:
      - uses: actions/checkout@v4
      - name: Git checkout
        uses: actions/checkout@v4

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
          cache: "pip"
          cache-dependency-path: |
            setup.py
            requirements.txt

      - name: Install Make and Git
        run: |
          apt-get update && apt-get install -y make git curl

      - name: Install uv
        run: |
          curl -LsSf https://astral.sh/uv/install.sh | sh

      - name: Create Python virtual environment
        run: |
          uv venv
          uv pip install --upgrade setuptools wheel

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install -U git+https://github.com/huggingface/accelerate.git
          python -m pip install -U git+https://github.com/huggingface/datasets.git
          python -m pip install -U git+https://github.com/huggingface/transformers.git
          python -m pip install ".[dev]"
          source .venv/bin/activate
          uv pip install ".[dev]"
          uv pip install -U git+https://github.com/huggingface/accelerate.git
          uv pip install -U git+https://github.com/huggingface/datasets.git
          uv pip install -U git+https://github.com/huggingface/transformers.git


      - name: Test with pytest
        run: |
          source .venv/bin/activate
          make test

      - name: Post to Slack
        if: github.ref == 'refs/heads/main' && always() # Check if the branch is main
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ env.CI_SLACK_CHANNEL }}
          title: Results with Python 3.12 on ubuntu-latest with dev dependencies
          title: Results with Python 3.12 and dev dependencies
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

  tests_wo_optional_deps:
    name: Tests without optional dependencies
    runs-on: 'ubuntu-latest'
    runs-on:
      group: aws-g4dn-2xlarge
    container:
      image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
      options: --gpus all
    defaults:
      run:
        shell: bash
    if: github.event.pull_request.draft == false
    steps:
      - uses: actions/checkout@v4
      - name: Git checkout
        uses: actions/checkout@v4

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
          cache: "pip"
          cache-dependency-path: |
            setup.py
            requirements.txt

      - name: Install Make and Git
        run: |
          apt-get update && apt-get install -y make git curl

      - name: Install uv
        run: |
          curl -LsSf https://astral.sh/uv/install.sh | sh

      - name: Create Python virtual environment
        run: |
          uv venv
          uv pip install --upgrade setuptools wheel

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install ".[test]"
          source .venv/bin/activate
          uv pip install ".[test]"

      - name: Test with pytest
        run: |
          source .venv/bin/activate
          make test

      - name: Post to Slack
        if: github.ref == 'refs/heads/main' && always() # Check if the branch is main
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ env.CI_SLACK_CHANNEL }}
          title: Results with Python 3.12 on ubuntu-latest without optional dependencies
          title: Results with Python 3.12 without optional dependencies
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}

  tests_min_versions:
    name: Tests with minimum versions
    runs-on: 'ubuntu-latest'
    runs-on:
      group: aws-g4dn-2xlarge
    container:
      image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
      options: --gpus all
    defaults:
      run:
        shell: bash
    if: github.event.pull_request.draft == false
    steps:
      - uses: actions/checkout@v4
      - name: Git checkout
        uses: actions/checkout@v4

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
          cache: "pip"
          cache-dependency-path: |
            setup.py
            requirements.txt

      - name: Install Make and Git
        run: |
          apt-get update && apt-get install -y make git curl

      - name: Install uv
        run: |
          curl -LsSf https://astral.sh/uv/install.sh | sh

      - name: Create Python virtual environment
        run: |
          uv venv
          uv pip install --upgrade setuptools wheel

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install accelerate==0.34.0
          python -m pip install datasets==2.21.0
          python -m pip install transformers==4.46.0
          python -m pip install ".[dev]"
          source .venv/bin/activate
          uv pip install ".[dev]"
          uv pip install accelerate==1.4.0
          uv pip install datasets==3.0.0
          uv pip install transformers==4.55.0

      - name: Test with pytest
        run: |
          source .venv/bin/activate
          make test

      - name: Post to Slack
        if: github.ref == 'refs/heads/main' && always() # Check if the branch is main
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ env.CI_SLACK_CHANNEL }}
          title: Results with Python 3.12 on ubuntu-latest with minimum versions
          title: Results with Python 3.12 and minimum dependencies versions
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
45 .github/workflows/tests_latest.yml vendored
@@ -13,33 +13,54 @@ env:
jobs:
  tests:
    name: Tests latest TRL release with dev dependencies
    runs-on: 'ubuntu-latest'
    runs-on:
      group: aws-g4dn-2xlarge
    container:
      image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
      options: --gpus all
    defaults:
      run:
        shell: bash
    steps:
      - name: Git checkout
        uses: actions/checkout@v4
        with: { ref: v0.13-release }
        with: { ref: v0.22-release }

      - name: Set up Python 3.12
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'
          cache: "pip"
          cache-dependency-path: |
            setup.py
            requirements.txt

      - name: Install Make and Git
        run: |
          apt-get update && apt-get install -y make git curl

      - name: Install uv
        run: |
          curl -LsSf https://astral.sh/uv/install.sh | sh

      - name: Create Python virtual environment
        run: |
          uv venv
          uv pip install --upgrade setuptools wheel

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          python -m pip install -U git+https://github.com/huggingface/accelerate.git
          python -m pip install -U git+https://github.com/huggingface/datasets.git
          python -m pip install -U git+https://github.com/huggingface/transformers.git
          python -m pip install ".[dev]"
          source .venv/bin/activate
          uv pip install ".[dev]"
          uv pip install -U git+https://github.com/huggingface/accelerate.git
          uv pip install -U git+https://github.com/huggingface/datasets.git
          uv pip install -U git+https://github.com/huggingface/transformers.git

      - name: Test with pytest
        run: |
          source .venv/bin/activate
          make test

      - name: Post to Slack
        uses: huggingface/hf-workflows/.github/actions/post-slack@main
        with:
          slack_channel: ${{ env.CI_SLACK_CHANNEL }}
          title: Results of latest TRL with Python 3.12 on ubuntu-latest with dev dependencies
          title: Results of latest TRL with Python 3.12 and dev dependencies
          status: ${{ job.status }}
          slack_token: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
5 .github/workflows/trufflehog.yml vendored
@@ -12,4 +12,7 @@ jobs:
        with:
          fetch-depth: 0
      - name: Secret Scanning
        uses: trufflesecurity/trufflehog@main
        uses: trufflesecurity/trufflehog@853e1e8d249fd1e29d0fcc7280d29b03df3d643d
        with:
          # exclude buggy postgres detector that is causing false positives and not relevant to our codebase
          extra_args: --results=verified,unknown --exclude-detectors=postgres
2 .gitignore vendored
@@ -142,4 +142,4 @@ checklink/cookies.txt
# wandb files
nbs/wandb/
examples/notebooks/wandb/
wandb/
wandb/
@@ -1,8 +1,8 @@
repos:
  - repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.6.3
    rev: v0.11.10
    hooks:
      - id: ruff
      - id: ruff-check
        types_or: [ python, pyi ]
        args: [ --fix ]
      - id: ruff-format
@@ -31,4 +31,4 @@ keywords:
- pytorch
- transformers
license: Apache-2.0
version: 0.13
version: "0.22"
@@ -23,7 +23,7 @@ There are several ways you can contribute to TRL:
* Contribute to the examples or the documentation.

If you don't know where to start, there is a special [Good First
Issue](https://github.com/huggingface/trl/contribute) listing. It will give you a list of
Issue](https://github.com/huggingface/trl/labels/%F0%9F%91%B6%20good%20first%20issue) listing. It will give you a list of
open issues that are beginner-friendly and help you start contributing to open-source. The best way to do that is to open a Pull Request and link it to the issue that you'd like to work on. We try to give priority to opened PRs as we can easily track the progress of the fix, and if the contributor does not have time anymore, someone else can take the PR over.

For something slightly more challenging, you can also take a look at the [Good Second Issue](https://github.com/huggingface/trl/labels/Good%20Second%20Issue) list. In general though, if you feel like you know what you're doing, go for it and we'll help you get there! 🚀
@@ -171,8 +171,7 @@ Follow these steps to start contributing:
$ pytest tests/<TEST_TO_RUN>.py
```

> For the following commands leveraging the `make` utility, we recommend using the WSL system when running on
> Windows. More information [here](https://docs.microsoft.com/en-us/windows/wsl/about).
> For the following commands leveraging the `make` utility.

You can also run the full suite with the following command.

@@ -388,7 +387,7 @@ When a feature or component is marked for deprecation, its use will emit a warni
- **Removal Version**: The target version when the feature will be removed, providing users with a clear timeframe to transition.

Example:

```python
warnings.warn(
    "The `Trainer.foo` method is deprecated and will be removed in version 0.14.0. "
@@ -433,7 +432,7 @@ Warnings play a critical role in guiding users toward resolving potential issues
def my_function(foo, bar, _warn=True):
    if foo == bar:
        if _warn:
            warnings.warn("foo and bar are the same, this is likely a mistake. Ignore this warning by setting `_warn=False`.")
            logger.warning("foo and bar are the same, this is likely a mistake. Ignore this warning by setting `_warn=False`.")
        # Do something
```

@@ -443,7 +442,7 @@ Warnings play a critical role in guiding users toward resolving potential issues
```python
def my_function(foo, bar):
    if foo and bar:
        warnings.warn("Both `foo` and `bar` were provided, but only one is allowed. Ignoring `foo`. Please pass only one of these arguments.")
        logger.warning("Both `foo` and `bar` were provided, but only one is allowed. Ignoring `foo`. Please pass only one of these arguments.")
    # Do something
```
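The snippets above assume a module-level `logger` object. A minimal sketch of how such a logger is typically obtained with the standard library (TRL itself may use its own logging helper; this is only an illustration):

```python
import logging

# Module-level logger assumed by the `logger.warning(...)` calls above.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.WARNING)
```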
2 LICENSE
@@ -186,7 +186,7 @@
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]
   Copyright 2020-2025 The HuggingFace Team

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
|
@ -1,6 +1,6 @@
|
||||
include settings.ini
|
||||
include LICENSE
|
||||
include CONTRIBUTING.md
|
||||
include README.md
|
||||
recursive-exclude * __pycache__
|
||||
include trl/templates/*.md
|
||||
include trl/templates/*.md
|
||||
include trl/accelerate_configs/*.yaml
|
10 Makefile
@@ -6,17 +6,15 @@ ACCELERATE_CONFIG_PATH = `pwd`/examples/accelerate_configs
COMMAND_FILES_PATH = `pwd`/commands

test:
	python -m pytest -n auto --dist=loadfile -s -v --reruns 5 --reruns-delay 1 --only-rerun '(OSError|Timeout|HTTPError.*502|HTTPError.*504||not less than or equal to 0.01)' ./tests/
	pytest -n auto -m "not slow and not low-priority" -s -v --reruns 5 --reruns-delay 1 --only-rerun '(OSError|Timeout|HTTPError.*502|HTTPError.*504||not less than or equal to 0.01)' tests/

precommit:
	pre-commit run --all-files
	python scripts/add_copyrights.py

tests_gpu:
	python -m pytest tests/test_* $(if $(IS_GITHUB_CI),--report-log "common_tests.log",)
	pre-commit run --all-files
	doc-builder style trl tests docs/source --max_len 119

slow_tests:
	python -m pytest tests/slow/test_* $(if $(IS_GITHUB_CI),--report-log "slow_tests.log",)
	pytest -m "slow" tests/ $(if $(IS_GITHUB_CI),--report-log "slow_tests.log",)

test_examples:
	touch temp_results_sft_tests.txt
169 README.md
@@ -1,7 +1,7 @@
# TRL - Transformer Reinforcement Learning

<div style="text-align: center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl_banner_dark.png" alt="TRL Banner">
  <img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl_banner_dark.png" alt="TRL Banner">
</div>

<hr> <br>
@@ -12,26 +12,33 @@

<p align="center">
    <a href="https://github.com/huggingface/trl/blob/main/LICENSE"><img alt="License" src="https://img.shields.io/github/license/huggingface/trl.svg?color=blue"></a>
    <a href="https://huggingface.co/docs/trl/index"><img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/trl/index.svg?down_color=red&down_message=offline&up_color=blue&up_message=online"></a>
    <a href="https://huggingface.co/docs/trl/index"><img alt="Documentation" src="https://img.shields.io/website?label=documentation&url=https%3A%2F%2Fhuggingface.co%2Fdocs%2Ftrl%2Findex&down_color=red&down_message=offline&up_color=blue&up_message=online"></a>
    <a href="https://github.com/huggingface/trl/releases"><img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/trl.svg"></a>
    <a href="https://huggingface.co/trl-lib"><img alt="Hugging Face Hub" src="https://img.shields.io/badge/🤗%20Hub-trl--lib-yellow"></a>
</p>

## 🎉 What's New

> **✨ OpenAI GPT OSS Support**: TRL now fully supports fine-tuning the latest [OpenAI GPT OSS models](https://huggingface.co/collections/openai/gpt-oss-68911959590a1634ba11c7a4)! Check out the:
>
> - [OpenAI Cookbook](https://cookbook.openai.com/articles/gpt-oss/fine-tune-transfomers)
> - [GPT OSS recipes](https://github.com/huggingface/gpt-oss-recipes)
> - [Our example script](https://github.com/huggingface/trl/blob/main/examples/scripts/sft_gpt_oss.py)

## Overview

TRL is a cutting-edge library designed for post-training foundation models using advanced techniques like Supervised Fine-Tuning (SFT), Proximal Policy Optimization (PPO), and Direct Preference Optimization (DPO). Built on top of the [🤗 Transformers](https://github.com/huggingface/transformers) ecosystem, TRL supports a variety of model architectures and modalities, and can be scaled-up across various hardware setups.

## Highlights

- **Efficient and scalable**:
    - Leverages [🤗 Accelerate](https://github.com/huggingface/accelerate) to scale from single GPU to multi-node clusters using methods like DDP and DeepSpeed.
    - Full integration with [`PEFT`](https://github.com/huggingface/peft) enables training on large models with modest hardware via quantization and LoRA/QLoRA.
    - Integrates [Unsloth](https://github.com/unslothai/unsloth) for accelerating training using optimized kernels.
- **Trainers**: Various fine-tuning methods are easily accessible via trainers like [`SFTTrainer`](https://huggingface.co/docs/trl/sft_trainer), [`GRPOTrainer`](https://huggingface.co/docs/trl/grpo_trainer), [`DPOTrainer`](https://huggingface.co/docs/trl/dpo_trainer), [`RewardTrainer`](https://huggingface.co/docs/trl/reward_trainer) and more.

- **Command Line Interface (CLI)**: A simple interface lets you fine-tune and interact with models without needing to write code.
- **Efficient and scalable**:
    - Leverages [🤗 Accelerate](https://github.com/huggingface/accelerate) to scale from single GPU to multi-node clusters using methods like [DDP](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) and [DeepSpeed](https://github.com/deepspeedai/DeepSpeed).
    - Full integration with [🤗 PEFT](https://github.com/huggingface/peft) enables training on large models with modest hardware via quantization and LoRA/QLoRA.
    - Integrates [🦥 Unsloth](https://github.com/unslothai/unsloth) for accelerating training using optimized kernels.

- **Trainers**: Various fine-tuning methods are easily accessible via trainers like [`SFTTrainer`](https://huggingface.co/docs/trl/sft_trainer), [`DPOTrainer`](https://huggingface.co/docs/trl/dpo_trainer), [`RewardTrainer`](https://huggingface.co/docs/trl/reward_trainer), [`ORPOTrainer`](https://huggingface.co/docs/trl/orpo_trainer) and more.

- **AutoModels**: Use pre-defined model classes like [`AutoModelForCausalLMWithValueHead`](https://huggingface.co/docs/trl/models#trl.AutoModelForCausalLMWithValueHead) to simplify reinforcement learning (RL) with LLMs.
- **Command Line Interface (CLI)**: A simple interface lets you fine-tune with models without needing to write code.
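To make the **AutoModels** bullet concrete, here is a minimal sketch of loading a value-head model; the checkpoint name is only an illustration, any causal LM on the Hub should work the same way:

```python
from transformers import AutoTokenizer
from trl import AutoModelForCausalLMWithValueHead

# Illustrative checkpoint, not a recommendation.
model = AutoModelForCausalLMWithValueHead.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

inputs = tokenizer("The quick brown fox", return_tensors="pt")
# The forward pass returns the LM logits, an optional loss, and the scalar value estimates.
lm_logits, loss, values = model(**inputs)
```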
## Installation

@@ -59,60 +66,74 @@ If you want to use the examples you can clone the repository with the following
git clone https://github.com/huggingface/trl.git
```

## Command Line Interface (CLI)

You can use the TRL Command Line Interface (CLI) to quickly get started with Supervised Fine-tuning (SFT) and Direct Preference Optimization (DPO), or vibe check your model with the chat CLI:

**SFT:**

```bash
trl sft --model_name_or_path Qwen/Qwen2.5-0.5B \
    --dataset_name trl-lib/Capybara \
    --output_dir Qwen2.5-0.5B-SFT
```

**DPO:**

```bash
trl dpo --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct \
    --dataset_name argilla/Capybara-Preferences \
    --output_dir Qwen2.5-0.5B-DPO
```

**Chat:**

```bash
trl chat --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct
```

Read more about CLI in the [relevant documentation section](https://huggingface.co/docs/trl/main/en/clis) or use `--help` for more details.

## How to use
## Quick Start

For more flexibility and control over training, TRL provides dedicated trainer classes to post-train language models or PEFT adapters on a custom dataset. Each trainer in TRL is a light wrapper around the 🤗 Transformers trainer and natively supports distributed training methods like DDP, DeepSpeed ZeRO, and FSDP.

### `SFTTrainer`

Here is a basic example of how to use the `SFTTrainer`:
Here is a basic example of how to use the [`SFTTrainer`](https://huggingface.co/docs/trl/sft_trainer):

```python
from trl import SFTConfig, SFTTrainer
from trl import SFTTrainer
from datasets import load_dataset

dataset = load_dataset("trl-lib/Capybara", split="train")

training_args = SFTConfig(output_dir="Qwen/Qwen2.5-0.5B-SFT")
trainer = SFTTrainer(
    args=training_args,
    model="Qwen/Qwen2.5-0.5B",
    train_dataset=dataset,
)
trainer.train()
```

### `GRPOTrainer`

[`GRPOTrainer`](https://huggingface.co/docs/trl/grpo_trainer) implements the [Group Relative Policy Optimization (GRPO) algorithm](https://huggingface.co/papers/2402.03300) that is more memory-efficient than PPO and was used to train [Deepseek AI's R1](https://huggingface.co/deepseek-ai/DeepSeek-R1).

```python
from datasets import load_dataset
from trl import GRPOTrainer

dataset = load_dataset("trl-lib/tldr", split="train")

# Dummy reward function: count the number of unique characters in the completions
def reward_num_unique_chars(completions, **kwargs):
    return [len(set(c)) for c in completions]

trainer = GRPOTrainer(
    model="Qwen/Qwen2-0.5B-Instruct",
    reward_funcs=reward_num_unique_chars,
    train_dataset=dataset,
)
trainer.train()
```

### `DPOTrainer`

[`DPOTrainer`](https://huggingface.co/docs/trl/dpo_trainer) implements the popular [Direct Preference Optimization (DPO) algorithm](https://huggingface.co/papers/2305.18290) that was used to post-train [Llama 3](https://huggingface.co/papers/2407.21783) and many other models. Here is a basic example of how to use the `DPOTrainer`:

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
training_args = DPOConfig(output_dir="Qwen2.5-0.5B-DPO")
trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer
)
trainer.train()
```

### `RewardTrainer`

Here is a basic example of how to use the `RewardTrainer`:
Here is a basic example of how to use the [`RewardTrainer`](https://huggingface.co/docs/trl/reward_trainer):

```python
from trl import RewardConfig, RewardTrainer
@@ -137,60 +158,28 @@ trainer = RewardTrainer(
trainer.train()
```

### `RLOOTrainer`
## Command Line Interface (CLI)

`RLOOTrainer` implements a [REINFORCE-style optimization](https://huggingface.co/papers/2402.14740) for RLHF that is more performant and memory-efficient than PPO. Here is a basic example of how to use the `RLOOTrainer`:
You can use the TRL Command Line Interface (CLI) to quickly get started with post-training methods like Supervised Fine-Tuning (SFT) or Direct Preference Optimization (DPO):

```python
from trl import RLOOConfig, RLOOTrainer, apply_chat_template
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoTokenizer,
)
**SFT:**

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
reward_model = AutoModelForSequenceClassification.from_pretrained(
    "Qwen/Qwen2.5-0.5B-Instruct", num_labels=1
)
ref_policy = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
policy = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

dataset = load_dataset("trl-lib/ultrafeedback-prompt")
dataset = dataset.map(apply_chat_template, fn_kwargs={"tokenizer": tokenizer})
dataset = dataset.map(lambda x: tokenizer(x["prompt"]), remove_columns="prompt")

training_args = RLOOConfig(output_dir="Qwen2.5-0.5B-RL")
trainer = RLOOTrainer(
    config=training_args,
    processing_class=tokenizer,
    policy=policy,
    ref_policy=ref_policy,
    reward_model=reward_model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
)
trainer.train()
```bash
trl sft --model_name_or_path Qwen/Qwen2.5-0.5B \
    --dataset_name trl-lib/Capybara \
    --output_dir Qwen2.5-0.5B-SFT
```

### `DPOTrainer`
**DPO:**

`DPOTrainer` implements the popular [Direct Preference Optimization (DPO) algorithm](https://huggingface.co/papers/2305.18290) that was used to post-train Llama 3 and many other models. Here is a basic example of how to use the `DPOTrainer`:

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
training_args = DPOConfig(output_dir="Qwen2.5-0.5B-DPO")
trainer = DPOTrainer(model=model, args=training_args, train_dataset=dataset, processing_class=tokenizer)
trainer.train()
```bash
trl dpo --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct \
    --dataset_name argilla/Capybara-Preferences \
    --output_dir Qwen2.5-0.5B-DPO
```

Read more about CLI in the [relevant documentation section](https://huggingface.co/docs/trl/main/en/clis) or use `--help` for more details.

## Development

If you want to contribute to `trl` or customize it to your needs make sure to read the [contribution guide](https://github.com/huggingface/trl/blob/main/CONTRIBUTING.md) and make sure you make a dev install:
167 RELEASE.md Normal file
@@ -0,0 +1,167 @@
# Making a release

> [!NOTE]
> VERSION needs to be formatted following the `v{major}.{minor}.{patch}` convention. We need to follow this convention to be able to retrieve versioned scripts.

## Major/Minor Release

### 1. Ensure your local repository is up to date with the upstream repository

```bash
git checkout main
git pull origin main
```

> [!WARNING]
> Do not merge other pull requests into `main` until the release is done. This is to ensure that the release is stable and does not include any untested changes. Announce internally (#trl-internal) to other maintainers that you are doing a release and that they must not merge PRs until the release is done.

### 2. Create a release branch from main

```bash
git checkout -b release-v{major}.{minor}
```

### 3. Change the version in the following files

- `.github/workflows/tests_latest.yml`:

  ```diff
  - with: { ref: v{major}.{minor-1}-release }
  + with: { ref: v{major}.{minor}-release }
  ```

- `CITATION.cff`

  ```diff
  - version: "{major}.{minor-1}"
  + version: "{major}.{minor}"
  ```

- `VERSION`

  ```diff
  - {major}.{minor}.0.dev0
  + {major}.{minor}.0
  ```

### 4. Commit and push these changes

```shell
git add .github/workflows/tests_latest.yml CITATION.cff VERSION
git commit -m 'Release: {major}.{minor}'
git push origin release-v{major}.{minor}
```

### 5. Create a pull request

from `release-v{major}.{minor}` to `main`, named `Release: v{major}.{minor}`, wait for tests to pass, and request a review.

### 6. Once the pull request is approved, merge it into `main`

It will automatically publish the new version of the package on PyPI.

### 7. Add a tag in git to mark the release

```shell
git checkout main
git pull origin main
git tag -a v{major}.{minor}.0 -m 'Adds tag v{major}.{minor}.0 for PyPI'
git push origin v{major}.{minor}.0
```

### 8. Create a branch `v{major}.{minor}-release` for future patch releases

```shell
git checkout -b v{major}.{minor}-release
git push origin v{major}.{minor}-release
```

This ensures that future patch releases (`v{major}.{minor}.1`, `v{major}.{minor}.2`, etc.) can be made separately from `main`.

### 9. Create a GitHub Release

1. Go to the repo's [releases section](https://github.com/huggingface/trl/releases) on GitHub.
2. Click **Draft a new release**.
3. Select the `v{major}.{minor}.0` tag you just created in step 7.
4. Add a title (`v{major}.{minor}.0`) and a short description of what's new.
5. Click **Publish Release**.

### 10. Bump to dev version

1. Create a branch `bump-dev-version-{major}.{minor+1}` from `main` and checkout to it.

   ```shell
   git checkout -b bump-dev-version-{major}.{minor+1}
   ```

2. Change the version in file `VERSION`:

   ```diff
   - {major}.{minor}.0
   + {major}.{minor+1}.0.dev0
   ```

3. Commit and push these changes

   ```shell
   git add VERSION
   git commit -m '⬆️ Bump dev version'
   git push origin bump-dev-version-{major}.{minor+1}
   ```

4. Create a pull request from `bump-dev-version-{major}.{minor+1}` to `main`, named `⬆️ Bump dev version`, and request urgent review.

5. Once the pull request is approved, merge it into `main`.

6. The codebase is now ready for the next development cycle, inform the team in the #trl-internal channel.

## Making a patch release

### 1. Ensure your local repository is up to date with the upstream repository

```bash
git checkout v{major}.{minor}-release
git pull origin main
```

### 2. Cherry-pick the changes you want to include in the patch release

```bash
git cherry-pick <commit-hash-0>
git cherry-pick <commit-hash-1>
...
```

### 3. Change the version in the file `VERSION`

```diff
- {major}.{minor}.{patch-1}
+ {major}.{minor}.{patch}
```

### 4. Commit and push these changes

```shell
git add VERSION
git commit -m 'Release: {major}.{minor}.{patch}'
git push origin v{major}.{minor}-release
```

### 5. Wait for the CI to pass

The CI will automatically publish the new version of the package on PyPI.

### 6. Add a tag in git to mark the release

```shell
git tag -a v{major}.{minor}.{patch} -m 'Adds tag v{major}.{minor}.{patch} for PyPI'
git push origin v{major}.{minor}.{patch}
```

#### 7. Create a GitHub Release

1. Go to the repo's [releases section](https://github.com/huggingface/trl/releases) on GitHub.
2. Click **Draft a new release**.
3. Select the `v{major}.{minor}.{patch}` tag you just created in step 7.
4. Add a title (`v{major}.{minor}.{patch}`) and a short description of what's new.
5. Click **Publish Release**.
@@ -42,7 +42,7 @@ accelerate launch $EXTRA_ACCELERATE_ARGS \
    --output_dir $OUTPUT_DIR \
    --max_steps $MAX_STEPS \
    --per_device_train_batch_size $BATCH_SIZE \
    --max_seq_length $SEQ_LEN \
    --max_length $SEQ_LEN \
    $EXTRA_TRAINING_ARGS
"""
@@ -9,6 +9,8 @@
- sections:
  - local: dataset_formats
    title: Dataset Formats
  - local: paper_index
    title: Paper Index
  - local: how_to_train
    title: Training FAQ
  - local: logging
@@ -17,12 +19,16 @@
- sections:
  - local: clis
    title: Command Line Interface (CLI)
  - local: jobs_training
    title: Training using Jobs
  - local: customization
    title: Customizing the Training
  - local: reducing_memory_usage
    title: Reducing Memory Usage
  - local: speeding_up_training
    title: Speeding Up Training
  - local: distributing_training
    title: Distributing Training
  - local: use_model
    title: Using Trained Models
  title: How-to guides
@@ -35,6 +41,8 @@
    title: PEFT
  - local: unsloth_integration
    title: Unsloth
  - local: vllm_integration
    title: vLLM
  title: Integrations
- sections:
  - local: example_overview
@@ -47,8 +55,6 @@
    title: Training StackLlama
  - local: detoxifying_a_lm
    title: Detoxifying a Language Model
  - local: learning_tools
    title: Learning to Use Tools
  - local: multi_adapter_rl
    title: Multi Adapter RLHF
  title: Examples
@@ -93,6 +99,8 @@
    title: Trainers
  - local: models
    title: Model Classes
  - local: model_utils
    title: Model Utilities
  - local: best_of_n
    title: Best of N Sampling
  - local: judges
@@ -101,8 +109,10 @@
    title: Callbacks
  - local: data_utils
    title: Data Utilities
  - local: text_environments
    title: Text Environments
  - local: rewards
    title: Reward Functions
  - local: script_utils
    title: Script Utilities
  - local: others
    title: Others
  title: API
@ -16,7 +16,7 @@ The `alignprop.py` script is a working example of using the `AlignProp` trainer
|
||||
|
||||
**Note:** one A100 GPU is recommended to get this running. For a lower-memory setting, consider setting `truncated_backprop_rand` to `False`. With the default settings, this will do truncated backpropagation with K=1.
|
||||
|
||||
Almost every configuration parameter has a default. There is only one commandline flag argument that is required of the user to get things up and running. The user is expected to have a [huggingface user access token](https://huggingface.co/docs/hub/security-tokens) that will be used to upload the model post finetuning to HuggingFace hub. The following bash command is to be entered to get things running
|
||||
Almost every configuration parameter has a default. Only one command-line flag argument is required of the user to get things up and running. The user is expected to have a [Hugging Face user access token](https://huggingface.co/docs/hub/security-tokens) that will be used to upload the model post-finetuning to the Hugging Face Hub. Run the following bash command to get things running:
|
||||
|
||||
```bash
|
||||
python alignprop.py --hf_user_access_token <token>
|
||||
@ -26,7 +26,7 @@ To obtain the documentation of `stable_diffusion_tuning.py`, please run `python
|
||||
|
||||
The following are things to keep in mind in general while configuring the trainer (the code checks these for you as well), beyond the use case of the example script:
|
||||
|
||||
- The configurable randomized truncation range (`--alignprop_config.truncated_rand_backprop_minmax=(0,50)`) the first number should be equal and greater to 0, while the second number should equal or less to the number of diffusion timesteps (sample_num_steps)
|
||||
- For the configurable randomized truncation range (`--alignprop_config.truncated_rand_backprop_minmax=(0,50)`), the first number should be greater than or equal to 0, while the second number should be less than or equal to the number of diffusion timesteps (`sample_num_steps`)
|
||||
- For the configurable truncation backprop absolute step (`--alignprop_config.truncated_backprop_timestep=49`), the number should be less than the number of diffusion timesteps (`sample_num_steps`); it only matters when `truncated_backprop_rand` is set to `False` (see the sketch below)
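As a minimal sketch of the second point (assuming the `AlignPropConfig` fields named above), switching from randomized truncation to a fixed truncation step looks roughly like:

```python
from trl import AlignPropConfig

# Sketch: disable randomized truncation and use a fixed truncation step instead
config = AlignPropConfig(
    truncated_backprop_rand=False,
    truncated_backprop_timestep=49,  # must be less than sample_num_steps
)
```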
|
||||
|
||||
## Setting up the image logging hook function
|
||||
|
@ -9,7 +9,7 @@ For a full example have a look at [`examples/scripts/bco.py`].
|
||||
## Expected dataset type
|
||||
|
||||
The [`BCOTrainer`] requires an [unpaired preference dataset](dataset_formats#unpaired-preference).
|
||||
The [`BCOTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset format. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
|
||||
The [`BCOTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset formats. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
|
||||
|
||||
## Expected model format
|
||||
The BCO trainer expects a model of type `AutoModelForCausalLM`, whereas PPO expects `AutoModelForCausalLMWithValueHead` for the value function.
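As a rough sketch putting the two expectations together (the model and dataset choices below are illustrative, not the official example), training could look like:

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import BCOConfig, BCOTrainer

# Illustrative model and unpaired preference dataset ("prompt", "completion", "label" columns)
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
ref_model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
train_dataset = load_dataset("trl-lib/kto-mix-14k", split="train")

training_args = BCOConfig(output_dir="Qwen2-0.5B-BCO")
trainer = BCOTrainer(
    model=model,
    ref_model=ref_model,
    args=training_args,
    train_dataset=train_dataset,
    processing_class=tokenizer,
)
trainer.train()
```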
|
||||
@ -94,6 +94,9 @@ To scale how much the auxiliary loss contributes to the total loss, use the hype
|
||||
## BCOTrainer
|
||||
|
||||
[[autodoc]] BCOTrainer
|
||||
- train
|
||||
- save_model
|
||||
- push_to_hub
|
||||
|
||||
## BCOConfig
|
||||
|
||||
|
@ -67,6 +67,6 @@ best_of_n.generate(query_tensors, device=device)
|
||||
|
||||
```
|
||||
|
||||
Furthermore, at the time of initialization you can set the seed to control repeatability of the generation process and the number of samples to generate for each query
|
||||
Furthermore, at the time of initialization you can set the seed to control the repeatability of the generation process and the number of samples to generate for each query
|
||||
|
||||
|
||||
|
@ -18,4 +18,8 @@
|
||||
|
||||
## MergeModelCallback
|
||||
|
||||
[[autodoc]] MergeModelCallback
|
||||
[[autodoc]] MergeModelCallback
|
||||
|
||||
## BEMACallback
|
||||
|
||||
[[autodoc]] BEMACallback
|
||||
|
@ -1,130 +1,269 @@
|
||||
# Command Line Interfaces (CLIs)
|
||||
|
||||
You can use TRL to fine-tune your Language Model with Supervised Fine-Tuning (SFT) or Direct Preference Optimization (DPO) or even chat with your model using the TRL CLIs.
|
||||
TRL provides a powerful command-line interface (CLI) to fine-tune large language models (LLMs) using methods like Supervised Fine-Tuning (SFT), Direct Preference Optimization (DPO), and more. The CLI abstracts away much of the boilerplate, letting you launch training jobs quickly and reproducibly.
|
||||
|
||||
Currently supported CLIs are:
|
||||
Currently supported commands are:
|
||||
|
||||
#### Training commands
|
||||
#### Training Commands
|
||||
|
||||
- `trl dpo`: fine-tune an LLM with DPO
|
||||
- `trl grpo`: fine-tune an LLM with GRPO
|
||||
- `trl kto`: fine-tune an LLM with KTO
|
||||
- `trl rloo`: fine-tune an LLM with RLOO
|
||||
- `trl sft`: fine-tune an LLM with SFT
|
||||
|
||||
#### Other commands
|
||||
#### Other Commands
|
||||
|
||||
- `trl chat`: quickly spin up an LLM fine-tuned for chatting
|
||||
- `trl env`: get the system information
|
||||
- `trl vllm-serve`: serve a model with vLLM
|
||||
|
||||
## Fine-tuning with the CLI
|
||||
## Fine-Tuning with the TRL CLI
|
||||
|
||||
Before getting started, pick up a Language Model from Hugging Face Hub. Supported models can be found with the filter "text-generation" within models. Also make sure to pick up a relevant dataset for your task.
|
||||
### Basic Usage
|
||||
|
||||
You can launch training directly from the CLI by specifying required arguments like the model and dataset:
|
||||
|
||||
<hfoptions id="command_line">
|
||||
<hfoption id="SFT">
|
||||
|
||||
Before using the `sft` or `dpo` commands make sure to run:
|
||||
```bash
|
||||
accelerate config
|
||||
trl sft \
|
||||
--model_name_or_path Qwen/Qwen2.5-0.5B \
|
||||
--dataset_name stanfordnlp/imdb
|
||||
```
|
||||
and pick up the right configuration for your training setup (single / multi-GPU, DeepSpeed, etc.). Make sure to complete all steps of `accelerate config` before running any CLI command.
|
||||
|
||||
We also recommend you passing a YAML config file to configure your training protocol. Below is a simple example of a YAML file that you can use for training your models with `trl sft` command.
|
||||
</hfoption>
|
||||
<hfoption id="DPO">
|
||||
|
||||
```bash
|
||||
trl dpo \
|
||||
--model_name_or_path Qwen/Qwen2.5-0.5B \
|
||||
--dataset_name anthropic/hh-rlhf
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### Using Configuration Files
|
||||
|
||||
To keep your CLI commands clean and reproducible, you can define all training arguments in a YAML configuration file:
|
||||
|
||||
<hfoptions id="config_file">
|
||||
<hfoption id="SFT">
|
||||
|
||||
```yaml
|
||||
model_name_or_path:
|
||||
Qwen/Qwen2.5-0.5B
|
||||
dataset_name:
|
||||
stanfordnlp/imdb
|
||||
report_to:
|
||||
none
|
||||
learning_rate:
|
||||
0.0001
|
||||
lr_scheduler_type:
|
||||
cosine
|
||||
# sft_config.yaml
|
||||
model_name_or_path: Qwen/Qwen2.5-0.5B
|
||||
dataset_name: stanfordnlp/imdb
|
||||
```
|
||||
|
||||
Save that config in a `.yaml` and get started immediately! An example CLI config is available as `examples/cli_configs/example_config.yaml`. Note you can overwrite the arguments from the config file by explicitly passing them to the CLI, e.g. from the root folder:
|
||||
Launch with:
|
||||
|
||||
```bash
|
||||
trl sft --config examples/cli_configs/example_config.yaml --output_dir test-trl-cli --lr_scheduler_type cosine_with_restarts
|
||||
trl sft --config sft_config.yaml
|
||||
```
|
||||
|
||||
Will force-use `cosine_with_restarts` for `lr_scheduler_type`.
|
||||
</hfoption>
|
||||
<hfoption id="DPO">
|
||||
|
||||
### Supported Arguments
|
||||
```yaml
|
||||
# dpo_config.yaml
|
||||
model_name_or_path: Qwen/Qwen2.5-0.5B
|
||||
dataset_name: anthropic/hh-rlhf
|
||||
```
|
||||
|
||||
We do support all arguments from `transformers.TrainingArguments`, for loading your model, we support all arguments from `~trl.ModelConfig`:
|
||||
|
||||
[[autodoc]] ModelConfig
|
||||
|
||||
You can pass any of these arguments either to the CLI or the YAML file.
|
||||
|
||||
### Supervised Fine-tuning (SFT)
|
||||
|
||||
Follow the basic instructions above and run `trl sft --output_dir <output_dir> <*args>`:
|
||||
Launch with:
|
||||
|
||||
```bash
|
||||
trl sft --model_name_or_path facebook/opt-125m --dataset_name stanfordnlp/imdb --output_dir opt-sft-imdb
|
||||
trl dpo --config dpo_config.yaml
|
||||
```
|
||||
|
||||
The SFT CLI is based on the `trl/scripts/sft.py` script.
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### Direct Preference Optimization (DPO)
|
||||
### Scaling Up with Accelerate
|
||||
|
||||
To use the DPO CLI, you need to have a dataset in the TRL format such as
|
||||
The TRL CLI natively supports [🤗 Accelerate](https://huggingface.co/docs/accelerate), making it easy to scale training across multiple GPUs or machines, or to use advanced setups like DeepSpeed, all from the same CLI.
|
||||
|
||||
* TRL's Anthropic HH dataset: https://huggingface.co/datasets/trl-internal-testing/hh-rlhf-helpful-base-trl-style
|
||||
* TRL's OpenAI TL;DR summarization dataset: https://huggingface.co/datasets/trl-internal-testing/tldr-preference-trl-style
|
||||
You can pass any `accelerate launch` arguments directly to `trl`, such as `--num_processes`. For more information see [Using accelerate launch](https://huggingface.co/docs/accelerate/en/basic_tutorials/launch#using-accelerate-launch).
|
||||
|
||||
These datasets always have at least three columns `prompt, chosen, rejected`:
|
||||
|
||||
* `prompt` is a list of strings.
|
||||
* `chosen` is the chosen response in [chat format](https://huggingface.co/docs/transformers/main/en/chat_templating)
|
||||
* `rejected` is the rejected response [chat format](https://huggingface.co/docs/transformers/main/en/chat_templating)
|
||||
|
||||
|
||||
To do a quick start, you can run the following command:
|
||||
<hfoptions id="launch_args">
|
||||
<hfoption id="SFT inline">
|
||||
|
||||
```bash
|
||||
trl dpo --model_name_or_path facebook/opt-125m --output_dir trl-hh-rlhf --dataset_name trl-internal-testing/hh-rlhf-helpful-base-trl-style
|
||||
trl sft \
|
||||
--model_name_or_path Qwen/Qwen2.5-0.5B \
|
||||
--dataset_name stanfordnlp/imdb \
|
||||
--num_processes 4
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="SFT w/ config file">
|
||||
|
||||
The DPO CLI is based on the `trl/scripts/dpo.py` script.
|
||||
```yaml
|
||||
# sft_config.yaml
|
||||
model_name_or_path: Qwen/Qwen2.5-0.5B
|
||||
dataset_name: stanfordnlp/imdb
|
||||
num_processes: 4
|
||||
```
|
||||
|
||||
|
||||
#### Custom preference dataset
|
||||
|
||||
Format the dataset into TRL format (you can adapt the `examples/datasets/anthropic_hh.py`):
|
||||
Launch with:
|
||||
|
||||
```bash
|
||||
python examples/datasets/anthropic_hh.py --push_to_hub --hf_entity your-hf-org
|
||||
trl sft --config sft_config.yaml
|
||||
```
|
||||
|
||||
## Chat interface
|
||||
</hfoption>
|
||||
<hfoption id="DPO inline">
|
||||
|
||||
The chat CLI lets you quickly load the model and talk to it. Simply run the following:
|
||||
```bash
|
||||
trl dpo \
|
||||
--model_name_or_path Qwen/Qwen2.5-0.5B \
|
||||
--dataset_name anthropic/hh-rlhf \
|
||||
--num_processes 4
|
||||
```
|
||||
|
||||
<pre><code>$ trl chat --model_name_or_path Qwen/Qwen1.5-0.5B-Chat
|
||||
<strong><span style="color: red;"><quentin_gallouedec>:</span></strong>
|
||||
What is the best programming language?
|
||||
</hfoption>
|
||||
<hfoption id="DPO w/ config file">
|
||||
|
||||
<strong><span style="color: blue;"><Qwen/Qwen1.5-0.5B-Chat>:</span></strong>
|
||||
There isn't a "best" programming language, as everyone has different style preferences, needs, and preferences. However, some people commonly use
|
||||
languages like Python, Java, C++, and JavaScript, which are popular among developers for a variety of reasons, including readability, flexibility,
|
||||
and scalability. Ultimately, it depends on personal preference, needs, and goals.
|
||||
</code></pre>
|
||||
```yaml
|
||||
# dpo_config.yaml
|
||||
model_name_or_path: Qwen/Qwen2.5-0.5B
|
||||
dataset_name: anthropic/hh-rlhf
|
||||
num_processes: 4
|
||||
```
|
||||
|
||||
Note that the chat interface relies on the tokenizer's [chat template](https://huggingface.co/docs/transformers/chat_templating) to format the inputs for the model. Make sure your tokenizer has a chat template defined.
|
||||
Launch with:
|
||||
|
||||
Besides talking to the model there are a few commands you can use:
|
||||
```bash
|
||||
trl dpo --config dpo_config.yaml
|
||||
```
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
- `clear`: clears the current conversation and start a new one
|
||||
- `example {NAME}`: load example named `{NAME}` from the config and use it as the user input
|
||||
- `set {SETTING_NAME}={SETTING_VALUE};`: change the system prompt or generation settings (multiple settings are separated by a `;`).
|
||||
- `reset`: same as clear but also resets the generation configs to defaults if they have been changed by `set`
|
||||
- `save` or `save {SAVE_NAME}`: save the current chat and settings to file by default to `./chat_history/{MODEL_NAME}/chat_{DATETIME}.yaml` or `{SAVE_NAME}` if provided
|
||||
- `exit`: closes the interface
|
||||
### Using `--accelerate_config` for Accelerate Configuration
|
||||
|
||||
## Getting the system information
|
||||
The `--accelerate_config` flag lets you easily configure distributed training with [🤗 Accelerate](https://github.com/huggingface/accelerate). This flag accepts either:
|
||||
|
||||
* the name of a predefined config profile (built into TRL), or
|
||||
* a path to a custom Accelerate YAML config file.
|
||||
|
||||
#### Predefined Config Profiles
|
||||
|
||||
TRL provides several ready-to-use Accelerate configs to simplify common training setups:
|
||||
|
||||
| Name | Description |
|
||||
| ------------ | ----------------------------------- |
|
||||
| `fsdp1` | Fully Sharded Data Parallel Stage 1 |
|
||||
| `fsdp2` | Fully Sharded Data Parallel Stage 2 |
|
||||
| `zero1` | DeepSpeed ZeRO Stage 1 |
|
||||
| `zero2` | DeepSpeed ZeRO Stage 2 |
|
||||
| `zero3` | DeepSpeed ZeRO Stage 3 |
|
||||
| `multi_gpu` | Multi-GPU training |
|
||||
| `single_gpu` | Single-GPU training |
|
||||
|
||||
To use one of these, just pass the name to `--accelerate_config`. TRL will automatically load the corresponding config file from `trl/accelerate_config/`.
|
||||
|
||||
#### Example Usage
|
||||
|
||||
<hfoptions id="accelerate_config">
|
||||
<hfoption id="SFT inline">
|
||||
|
||||
```bash
|
||||
trl sft \
|
||||
--model_name_or_path Qwen/Qwen2.5-0.5B \
|
||||
--dataset_name stanfordnlp/imdb \
|
||||
--accelerate_config zero2 # or path/to/my/accelerate/config.yaml
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="SFT w/ config file">
|
||||
|
||||
```yaml
|
||||
# sft_config.yaml
|
||||
model_name_or_path: Qwen/Qwen2.5-0.5B
|
||||
dataset_name: stanfordnlp/imdb
|
||||
accelerate_config: zero2 # or path/to/my/accelerate/config.yaml
|
||||
```
|
||||
|
||||
Launch with:
|
||||
|
||||
```bash
|
||||
trl sft --config sft_config.yaml
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="DPO inline">
|
||||
|
||||
```bash
|
||||
trl dpo \
|
||||
--model_name_or_path Qwen/Qwen2.5-0.5B \
|
||||
--dataset_name anthropic/hh-rlhf \
|
||||
--accelerate_config zero2 # or path/to/my/accelerate/config.yaml
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="DPO w/ config file">
|
||||
|
||||
```yaml
|
||||
# dpo_config.yaml
|
||||
model_name_or_path: Qwen/Qwen2.5-0.5B
|
||||
dataset_name: anthropic/hh-rlhf
|
||||
accelerate_config: zero2 # or path/to/my/accelerate/config.yaml
|
||||
```
|
||||
|
||||
Launch with:
|
||||
|
||||
```bash
|
||||
trl dpo --config dpo_config.yaml
|
||||
```
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### Using dataset mixtures
|
||||
|
||||
You can use dataset mixtures to combine multiple datasets into a single training dataset. This is useful for training on diverse data sources or when you want to mix different types of data.
|
||||
|
||||
<hfoptions id="dataset_mixtures">
|
||||
<hfoption id="SFT">
|
||||
|
||||
```yaml
|
||||
# sft_config.yaml
|
||||
model_name_or_path: Qwen/Qwen2.5-0.5B
|
||||
datasets:
|
||||
- path: stanfordnlp/imdb
|
||||
- path: roneneldan/TinyStories
|
||||
```
|
||||
|
||||
Launch with:
|
||||
|
||||
```bash
|
||||
trl sft --config sft_config.yaml
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="DPO">
|
||||
|
||||
```yaml
|
||||
# dpo_config.yaml
|
||||
model_name_or_path: Qwen/Qwen2.5-0.5B
|
||||
datasets:
|
||||
- path: BAAI/Infinity-Preference
|
||||
- path: argilla/Capybara-Preferences
|
||||
```
|
||||
|
||||
Launch with:
|
||||
|
||||
```bash
|
||||
trl dpo --config dpo_config.yaml
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
To see all the available keywords for defining dataset mixtures, refer to the [`scripts.utils.DatasetConfig`] and [`DatasetMixtureConfig`] classes.
|
||||
|
||||
## Getting the System Information
|
||||
|
||||
You can get the system information by running the following command:
|
||||
|
||||
@ -132,7 +271,7 @@ You can get the system information by running the following command:
|
||||
trl env
|
||||
```
|
||||
|
||||
This will print out the system information including the GPU information, the CUDA version, the PyTorch version, the transformers version, and the TRL version, and any optional dependencies that are installed.
|
||||
This will print out the system information, including the GPU information, the CUDA version, the PyTorch version, the transformers version, the TRL version, and any optional dependencies that are installed.
|
||||
|
||||
```txt
|
||||
Copy-paste the following information when reporting an issue:
|
||||
@ -140,7 +279,7 @@ Copy-paste the following information when reporting an issue:
|
||||
- Platform: Linux-5.15.0-1048-aws-x86_64-with-glibc2.31
|
||||
- Python version: 3.11.9
|
||||
- PyTorch version: 2.4.1
|
||||
- CUDA device: NVIDIA H100 80GB HBM3
|
||||
- accelerator(s): NVIDIA H100 80GB HBM3
|
||||
- Transformers version: 4.45.0.dev0
|
||||
- Accelerate version: 0.34.2
|
||||
- Accelerate config:
|
||||
@ -171,6 +310,7 @@ Copy-paste the following information when reporting an issue:
|
||||
- LLM-Blender version: 0.0.2
|
||||
- OpenAI version: 1.46.0
|
||||
- PEFT version: 0.12.0
|
||||
- vLLM version: not installed
|
||||
```
|
||||
|
||||
This information are required when reporting an issue.
|
||||
This information is required when reporting an issue.
|
||||
|
@ -1,28 +1,34 @@
|
||||
# Community Tutorials
|
||||
|
||||
Community tutorials are made by active members of the Hugging Face community that want to share their knowledge and expertise with others. They are a great way to learn about the library and its features, and to get started with core classes and modalities.
|
||||
Community tutorials are made by active members of the Hugging Face community who want to share their knowledge and expertise with others. They are a great way to learn about the library and its features, and to get started with core classes and modalities.
|
||||
|
||||
# Language Models
|
||||
|
||||
| Task | Class | Description | Author | Tutorial | Colab |
|
||||
| ----------------------- | --------------- | ---------------------------------------------------------------------------------------- | -------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Instruction tuning | [`SFTTrainer`] | Fine-tuning Google Gemma LLMs using ChatML format with QLoRA | [Philipp Schmid](https://huggingface.co/philschmid) | [Link](https://www.philschmid.de/fine-tune-google-gemma) | [](https://colab.research.google.com/github/philschmid/deep-learning-pytorch-huggingface/blob/main/training/gemma-lora-example.ipynb) |
|
||||
| Structured Generation | [`SFTTrainer`] | Fine-tuning Llama-2-7B to generate Persian product catalogs in JSON using QLoRA and PEFT | [Mohammadreza Esmaeilian](https://huggingface.co/Mohammadreza) | [Link](https://huggingface.co/learn/cookbook/en/fine_tuning_llm_to_generate_persian_product_catalogs_in_json_format) | [](https://colab.research.google.com/github/huggingface/cookbook/blob/main/notebooks/en/fine_tuning_llm_to_generate_persian_product_catalogs_in_json_format.ipynb) |
|
||||
| Preference Optimization | [`DPOTrainer`] | Align Mistral-7b using Direct Preference Optimization for human preference alignment | [Maxime Labonne](https://huggingface.co/mlabonne) | [Link](https://mlabonne.github.io/blog/posts/Fine_tune_Mistral_7b_with_DPO.html) | [](https://colab.research.google.com/github/mlabonne/llm-course/blob/main/Fine_tune_a_Mistral_7b_model_with_DPO.ipynb) |
|
||||
| Preference Optimization | [`ORPOTrainer`] | Fine-tuning Llama 3 with ORPO combining instruction tuning and preference alignment | [Maxime Labonne](https://huggingface.co/mlabonne) | [Link](https://mlabonne.github.io/blog/posts/2024-04-19_Fine_tune_Llama_3_with_ORPO.html) | [](https://colab.research.google.com/drive/1eHNWg9gnaXErdAa8_mcvjMupbSS6rDvi) |
|
||||
| Task | Class | Description | Author | Tutorial | Colab |
|
||||
| --- | --- | --- | --- | --- | --- |
|
||||
| Reinforcement Learning | [`GRPOTrainer`] | Post training an LLM for reasoning with GRPO in TRL | [Sergio Paniego](https://huggingface.co/sergiopaniego) | [Link](https://huggingface.co/learn/cookbook/fine_tuning_llm_grpo_trl) | [](https://colab.research.google.com/github/huggingface/cookbook/blob/main/notebooks/en/fine_tuning_llm_grpo_trl.ipynb) |
|
||||
| Reinforcement Learning | [`GRPOTrainer`] | Mini-R1: Reproduce Deepseek R1 „aha moment“ a RL tutorial | [Philipp Schmid](https://huggingface.co/philschmid) | [Link](https://www.philschmid.de/mini-deepseek-r1) | [](https://colab.research.google.com/github/philschmid/deep-learning-pytorch-huggingface/blob/main/training/mini-deepseek-r1-aha-grpo.ipynb) |
|
||||
| Reinforcement Learning | [`GRPOTrainer`] | RL on LLaMA 3.1-8B with GRPO and Unsloth optimizations | [Andrea Manzoni](https://huggingface.co/AManzoni) | [Link](https://colab.research.google.com/github/amanzoni1/fine_tuning/blob/main/RL_LLama3_1_8B_GRPO.ipynb) | [](https://colab.research.google.com/github/amanzoni1/fine_tuning/blob/main/RL_LLama3_1_8B_GRPO.ipynb) |
|
||||
| Instruction tuning | [`SFTTrainer`] | Fine-tuning Google Gemma LLMs using ChatML format with QLoRA | [Philipp Schmid](https://huggingface.co/philschmid) | [Link](https://www.philschmid.de/fine-tune-google-gemma) | [](https://colab.research.google.com/github/philschmid/deep-learning-pytorch-huggingface/blob/main/training/gemma-lora-example.ipynb) |
|
||||
| Structured Generation | [`SFTTrainer`] | Fine-tuning Llama-2-7B to generate Persian product catalogs in JSON using QLoRA and PEFT | [Mohammadreza Esmaeilian](https://huggingface.co/Mohammadreza) | [Link](https://huggingface.co/learn/cookbook/en/fine_tuning_llm_to_generate_persian_product_catalogs_in_json_format) | [](https://colab.research.google.com/github/huggingface/cookbook/blob/main/notebooks/en/fine_tuning_llm_to_generate_persian_product_catalogs_in_json_format.ipynb) |
|
||||
| Preference Optimization | [`DPOTrainer`] | Align Mistral-7b using Direct Preference Optimization for human preference alignment | [Maxime Labonne](https://huggingface.co/mlabonne) | [Link](https://mlabonne.github.io/blog/posts/Fine_tune_Mistral_7b_with_DPO.html) | [](https://colab.research.google.com/github/mlabonne/llm-course/blob/main/Fine_tune_a_Mistral_7b_model_with_DPO.ipynb) |
|
||||
| Preference Optimization | [`ORPOTrainer`] | Fine-tuning Llama 3 with ORPO combining instruction tuning and preference alignment | [Maxime Labonne](https://huggingface.co/mlabonne) | [Link](https://mlabonne.github.io/blog/posts/2024-04-19_Fine_tune_Llama_3_with_ORPO.html) | [](https://colab.research.google.com/drive/1eHNWg9gnaXErdAa8_mcvjMupbSS6rDvi) |
|
||||
| Instruction tuning | [`SFTTrainer`] | How to fine-tune open LLMs in 2025 with Hugging Face | [Philipp Schmid](https://huggingface.co/philschmid) | [Link](https://www.philschmid.de/fine-tune-llms-in-2025) | [](https://colab.research.google.com/github/philschmid/deep-learning-pytorch-huggingface/blob/main/training/fine-tune-llms-in-2025.ipynb) |
|
||||
|
||||
<Youtube id="cnGyyM0vOes" />
|
||||
|
||||
# Vision Language Models
|
||||
|
||||
| Task | Class | Description | Author | Tutorial | Colab |
|
||||
| --------------- | -------------- | ---------------------------------------------------------------------------- | ------------------------------------------------------ | -------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| Visual QA | [`SFTTrainer`] | Fine-tuning Qwen2-VL-7B for visual question answering on ChartQA dataset | [Sergio Paniego](https://huggingface.co/sergiopaniego) | [Link](https://huggingface.co/learn/cookbook/fine_tuning_vlm_trl) | [](https://colab.research.google.com/github/huggingface/cookbook/blob/main/notebooks/en/fine_tuning_vlm_trl.ipynb) |
|
||||
| Visual QA | [`SFTTrainer`] | Fine-tuning SmolVLM with TRL on a consumer GPU | [Sergio Paniego](https://huggingface.co/sergiopaniego) | [Link](https://huggingface.co/learn/cookbook/fine_tuning_smol_vlm_sft_trl) | [](https://colab.research.google.com/github/huggingface/cookbook/blob/main/notebooks/en/fine_tuning_smol_vlm_sft_trl.ipynb) |
|
||||
| SEO Description | [`SFTTrainer`] | Fine-tuning Qwen2-VL-7B for generating SEO-friendly descriptions from images | [Philipp Schmid](https://huggingface.co/philschmid) | [Link](https://www.philschmid.de/fine-tune-multimodal-llms-with-trl) | [](https://colab.research.google.com/github/philschmid/deep-learning-pytorch-huggingface/blob/main/training/fine-tune-multimodal-llms-with-trl.ipynb) |
|
||||
| Visual QA | [`DPOTrainer`] | PaliGemma 🤝 Direct Preference Optimization | [Merve Noyan](https://huggingface.co/merve) | [Link](https://github.com/merveenoyan/smol-vision/blob/main/PaliGemma_DPO.ipynb) | [](https://colab.research.google.com/github/merveenoyan/smol-vision/blob/main/PaliGemma_DPO.ipynb) |
|
||||
| Visual QA | [`DPOTrainer`] | Fine-tuning SmolVLM using direct preference optimization (DPO) with TRL on a consumer GPU | [Sergio Paniego](https://huggingface.co/sergiopaniego) | [Link](https://huggingface.co/learn/cookbook/fine_tuning_vlm_dpo_smolvlm_instruct) | [](https://colab.research.google.com/github/huggingface/cookbook/blob/main/notebooks/en/fine_tuning_vlm_dpo_smolvlm_instruct.ipynb) |
|
||||
| Task | Class | Description | Author | Tutorial | Colab |
|
||||
| --- | --- | --- | --- | --- | --- |
|
||||
| Visual QA | [`SFTTrainer`] | Fine-tuning Qwen2-VL-7B for visual question answering on ChartQA dataset | [Sergio Paniego](https://huggingface.co/sergiopaniego) | [Link](https://huggingface.co/learn/cookbook/fine_tuning_vlm_trl) | [](https://colab.research.google.com/github/huggingface/cookbook/blob/main/notebooks/en/fine_tuning_vlm_trl.ipynb) |
|
||||
| Visual QA | [`SFTTrainer`] | Fine-tuning SmolVLM with TRL on a consumer GPU | [Sergio Paniego](https://huggingface.co/sergiopaniego) | [Link](https://huggingface.co/learn/cookbook/fine_tuning_smol_vlm_sft_trl) | [](https://colab.research.google.com/github/huggingface/cookbook/blob/main/notebooks/en/fine_tuning_smol_vlm_sft_trl.ipynb) |
|
||||
| SEO Description | [`SFTTrainer`] | Fine-tuning Qwen2-VL-7B for generating SEO-friendly descriptions from images | [Philipp Schmid](https://huggingface.co/philschmid) | [Link](https://www.philschmid.de/fine-tune-multimodal-llms-with-trl) | [](https://colab.research.google.com/github/philschmid/deep-learning-pytorch-huggingface/blob/main/training/fine-tune-multimodal-llms-with-trl.ipynb) |
|
||||
| Visual QA | [`DPOTrainer`] | PaliGemma 🤝 Direct Preference Optimization | [Merve Noyan](https://huggingface.co/merve) | [Link](https://github.com/merveenoyan/smol-vision/blob/main/PaliGemma_DPO.ipynb) | [](https://colab.research.google.com/github/merveenoyan/smol-vision/blob/main/PaliGemma_DPO.ipynb) |
|
||||
| Visual QA | [`DPOTrainer`] | Fine-tuning SmolVLM using direct preference optimization (DPO) with TRL on a consumer GPU | [Sergio Paniego](https://huggingface.co/sergiopaniego) | [Link](https://huggingface.co/learn/cookbook/fine_tuning_vlm_dpo_smolvlm_instruct) | [](https://colab.research.google.com/github/huggingface/cookbook/blob/main/notebooks/en/fine_tuning_vlm_dpo_smolvlm_instruct.ipynb) |
|
||||
| Object Detection Grounding | [`SFTTrainer`] | Fine tuning a VLM for Object Detection Grounding using TRL | [Sergio Paniego](https://huggingface.co/sergiopaniego) | [Link](https://huggingface.co/learn/cookbook/fine_tuning_vlm_object_detection_grounding) | [](https://colab.research.google.com/github/huggingface/cookbook/blob/main/notebooks/en/fine_tuning_vlm_object_detection_grounding.ipynb) |
|
||||
| Visual QA | [`DPOTrainer`] | Fine-Tuning a Vision Language Model with TRL using MPO | [Sergio Paniego](https://huggingface.co/sergiopaniego) | [Link](https://huggingface.co/learn/cookbook/fine_tuning_vlm_mpo) | [](https://colab.research.google.com/github/huggingface/cookbook/blob/main/notebooks/en/fine_tuning_vlm_mpo.ipynb) |
|
||||
| Reinforcement Learning | [`GRPOTrainer`] | Post training a VLM for reasoning with GRPO using TRL | [Sergio Paniego](https://huggingface.co/sergiopaniego) | [Link](https://huggingface.co/learn/cookbook/fine_tuning_vlm_grpo_trl) | [](https://colab.research.google.com/github/huggingface/cookbook/blob/main/notebooks/en/fine_tuning_vlm_grpo_trl.ipynb) |
|
||||
|
||||
## Contributing
|
||||
|
||||
|
@ -4,7 +4,7 @@
|
||||
|
||||
## Overview
|
||||
|
||||
Contrastive Preference Optimization (CPO) as introduced in the paper [Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation](https://huggingface.co/papers/2401.08417) by [Haoran Xu](https://huggingface.co/haoranxu), [Amr Sharaf](https://huggingface.co/amrsharaf), [Yunmo Chen](https://huggingface.co/yunmochen), Weiting Tan, Lingfeng Shen, Benjamin Van Durme, [Kenton Murray](https://huggingface.co/Kenton), and [Young Jin Kim](https://huggingface.co/ykim362). At a high-level, CPO trains models to avoid generating adequate, but not perfect translations in Machine Translation (MT) tasks. However, CPO is a general approximation to the DPO loss and can be applied to other domains like chat.
|
||||
Contrastive Preference Optimization (CPO) as introduced in the paper [Contrastive Preference Optimization: Pushing the Boundaries of LLM Performance in Machine Translation](https://huggingface.co/papers/2401.08417) by [Haoran Xu](https://huggingface.co/haoranxu), [Amr Sharaf](https://huggingface.co/amrsharaf), [Yunmo Chen](https://huggingface.co/yunmochen), Weiting Tan, Lingfeng Shen, Benjamin Van Durme, [Kenton Murray](https://huggingface.co/Kenton), and [Young Jin Kim](https://huggingface.co/ykim362). At a high level, CPO trains models to avoid generating adequate, but not perfect, translations in Machine Translation (MT) tasks. However, CPO is a general approximation of the DPO loss and can be applied to other domains, such as chat.
|
||||
|
||||
CPO aims to mitigate two fundamental shortcomings of SFT. First, SFT’s methodology of minimizing the discrepancy between predicted outputs and gold-standard references inherently caps model performance at the quality level of the training data. Secondly, SFT lacks a mechanism to prevent the model from rejecting mistakes in translations. The CPO objective is derived from the DPO objective.
|
||||
|
||||
@ -31,7 +31,7 @@ model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
|
||||
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
|
||||
train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
|
||||
|
||||
training_args = CPOConfig(output_dir="Qwen2-0.5B-CPO", logging_steps=10)
|
||||
training_args = CPOConfig(output_dir="Qwen2-0.5B-CPO")
|
||||
trainer = CPOTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset)
|
||||
trainer.train()
|
||||
```
|
||||
@ -44,7 +44,7 @@ accelerate launch train_cpo.py
|
||||
|
||||
## Expected dataset type
|
||||
|
||||
CPO requires a [preference dataset](dataset_formats#preference). The [`CPOTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset format. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
|
||||
CPO requires a [preference dataset](dataset_formats#preference). The [`CPOTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset formats. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
|
||||
|
||||
## Example script
|
||||
|
||||
@ -57,13 +57,12 @@ accelerate launch examples/scripts/cpo.py \
|
||||
--model_name_or_path Qwen/Qwen2-0.5B-Instruct \
|
||||
--dataset_name trl-lib/ultrafeedback_binarized \
|
||||
--num_train_epochs 1 \
|
||||
--logging_steps 25 \
|
||||
--output_dir Qwen2-0.5B-CPO
|
||||
```
|
||||
|
||||
## Logged metrics
|
||||
|
||||
While training and evaluating we record the following reward metrics:
|
||||
While training and evaluating, we record the following reward metrics:
|
||||
|
||||
* `rewards/chosen`: the mean log probabilities of the policy model for the chosen responses scaled by beta
|
||||
* `rewards/rejected`: the mean log probabilities of the policy model for the rejected responses scaled by beta
|
||||
@ -75,34 +74,55 @@ While training and evaluating we record the following reward metrics:
|
||||
|
||||
### Simple Preference Optimization (SimPO)
|
||||
|
||||
The [SimPO](https://huggingface.co/papers/2405.14734) method is also implemented in the [`CPOTrainer`]. SimPO is an alternative loss that adds a reward margin, allows for length normalization, and does not use BC regularization. To use this loss, we can use SimPO easily by turning on `loss_type="simpo"` and `cpo_alpha=0.0` in the [`CPOConfig`].
|
||||
[Simple Preference Optimization](https://huggingface.co/papers/2405.14734) (SimPO) by [Yu Meng](https://huggingface.co/yumeng5), [Mengzhou Xia](https://huggingface.co/mengzhouxia), and [Danqi Chen](https://huggingface.co/cdq10131) proposes a simpler and more effective preference optimization algorithm than DPO without using a reference model. The key designs in SimPO are (1) using length-normalized log likelihood as the implicit reward, and (2) incorporating a target reward margin in the Bradley-Terry ranking objective. The official code can be found at [princeton-nlp/SimPO](https://github.com/princeton-nlp/SimPO).
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
> Direct Preference Optimization (DPO) is a widely used offline preference optimization algorithm that reparameterizes reward functions in reinforcement learning from human feedback (RLHF) to enhance simplicity and training stability. In this work, we propose SimPO, a simpler yet more effective approach. The effectiveness of SimPO is attributed to a key design: using the average log probability of a sequence as the implicit reward. This reward formulation better aligns with model generation and eliminates the need for a reference model, making it more compute and memory efficient. Additionally, we introduce a target reward margin to the Bradley-Terry objective to encourage a larger margin between the winning and losing responses, further enhancing the algorithm's performance. We compare SimPO to DPO and its latest variants across various state-of-the-art training setups, including both base and instruction-tuned models like Mistral and Llama3. We evaluated on extensive instruction-following benchmarks, including AlpacaEval 2, MT-Bench, and the recent challenging Arena-Hard benchmark. Our results demonstrate that SimPO consistently and significantly outperforms existing approaches without substantially increasing response length. Specifically, SimPO outperforms DPO by up to 6.4 points on AlpacaEval 2 and by up to 7.5 points on Arena-Hard. Our top-performing model, built on Llama3-8B-Instruct, achieves a remarkable 44.7 length-controlled win rate on AlpacaEval 2 -- surpassing Claude 3 Opus on the leaderboard, and a 33.8 win rate on Arena-Hard -- making it the strongest 8B open-source model.
|
||||
|
||||
The SimPO loss is integrated in the [`CPOTrainer`], as it's an alternative loss that adds a reward margin, allows for length normalization, and does not use BC regularization. To use this loss, just turn on `loss_type="simpo"` and `cpo_alpha=0.0` in the [`CPOConfig`] and set the `simpo_gamma` to a recommended value.
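For illustration, a minimal configuration enabling the SimPO loss might look like this (the gamma value is a placeholder, not an official recommendation):

```python
from trl import CPOConfig

training_args = CPOConfig(
    output_dir="Qwen2-0.5B-SimPO",
    loss_type="simpo",
    cpo_alpha=0.0,    # disable the BC regularization term
    simpo_gamma=0.5,  # target reward margin
)
```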
|
||||
|
||||
### CPO-SimPO
|
||||
|
||||
We also offer the combined use of CPO and SimPO, which enables more stable training and improved performance. Learn more details at [CPO-SimPO GitHub](https://github.com/fe1ixxu/CPO_SIMPO). To use this method, simply enable SimPO by setting `loss_type="simpo"` and a non-zero `cpo_alpha` in the [`CPOConfig`].
|
||||
|
||||
### AlphaPO
|
||||
|
||||
The [AlphaPO -- Reward shape matters for LLM alignment](https://huggingface.co/papers/2501.03884) (AlphaPO) method by Aman Gupta, Shao Tang, Qingquan Song, Sirou Zhu, [Jiwoo Hong](https://huggingface.co/JW17), Ankan Saha, Viral Gupta, Noah Lee, Eunki Kim, Jason Zhu, Natesh Pillai, and S. Sathiya Keerthi is also implemented in the [`CPOTrainer`]. AlphaPO is an alternative method that applies a transformation to the reward function shape in the context of SimPO loss. The abstract from the paper is the following:
|
||||
|
||||
> Reinforcement Learning with Human Feedback (RLHF) and its variants have made huge strides toward the effective alignment of large language models (LLMs) to follow instructions and reflect human values. More recently, Direct Alignment Algorithms (DAAs) have emerged in which the reward modeling stage of RLHF is skipped by characterizing the reward directly as a function of the policy being learned. Some popular examples of DAAs include Direct Preference Optimization (DPO) and Simple Preference Optimization (SimPO). These methods often suffer from likelihood displacement, a phenomenon by which the probabilities of preferred responses are often reduced undesirably. In this paper, we argue that, for DAAs the reward (function) shape matters. We introduce AlphaPO, a new DAA method that leverages an α-parameter to help change the shape of the reward function beyond the standard log reward. AlphaPO helps maintain fine-grained control over likelihood displacement and overoptimization. Compared to SimPO, one of the best performing DAAs, AlphaPO leads to about 7% to 10% relative improvement in alignment performance for the instruct versions of Mistral-7B and Llama3-8B while achieving 15% to 50% relative improvement over DPO on the same models. The analysis and results presented highlight the importance of the reward shape and how one can systematically change it to affect training dynamics, as well as improve alignment performance.
|
||||
|
||||
To use this loss as described in the paper, we can set the `loss_type="alphapo"` which automatically sets `loss_type="simpo"` and `cpo_alpha=0.0`, together with `alpha` and `simpo_gamma` to recommended values in the [`CPOConfig`]. Alternatively, you can manually set `loss_type="simpo"`, `cpo_alpha=0.0`, together with `alpha` and `simpo_gamma` to recommended values. Other variants of this method are also possible, such as setting `loss_type="ipo"` and `alpha` to any non-zero value.
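A minimal sketch of the first option, with placeholder values for `alpha` and `simpo_gamma` (consult the paper for the recommended settings), is:

```python
from trl import CPOConfig

training_args = CPOConfig(
    output_dir="Qwen2-0.5B-AlphaPO",
    loss_type="alphapo",  # automatically sets loss_type="simpo" and cpo_alpha=0.0
    alpha=0.25,           # placeholder; controls the reward shape
    simpo_gamma=0.5,      # placeholder target reward margin
)
```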
|
||||
|
||||
## Loss functions
|
||||
|
||||
The CPO algorithm supports several loss functions. The loss function can be set using the `loss_type` parameter in the [`CPOConfig`]. The following loss functions are supported:
|
||||
|
||||
| `loss_type=` | Description |
|
||||
| -------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `"sigmoid"` (default) | Given the preference data, we can fit a binary classifier according to the Bradley-Terry model and in fact the [DPO](https://huggingface.co/papers/2305.18290) authors propose the sigmoid loss on the normalized likelihood via the `logsigmoid` to fit a logistic regression. |
|
||||
| `"sigmoid"` (default) | Given the preference data, we can fit a binary classifier according to the Bradley-Terry model, and in fact, the [DPO](https://huggingface.co/papers/2305.18290) authors propose the sigmoid loss on the normalized likelihood via the `logsigmoid` to fit a logistic regression. |
|
||||
| `"hinge"` | The [RSO](https://huggingface.co/papers/2309.06657) authors propose to use a hinge loss on the normalized likelihood from the [SLiC](https://huggingface.co/papers/2305.10425) paper. In this case, the `beta` is the reciprocal of the margin. |
|
||||
| `"ipo"` | The [IPO](https://huggingface.co/papers/2310.12036) authors provide a deeper theoretical understanding of the DPO algorithms and identify an issue with overfitting and propose an alternative loss. In this case, the `beta` is the reciprocal of the gap between the log-likelihood ratios of the chosen vs the rejected completion pair and thus the smaller the `beta` the larger this gaps is. As per the paper the loss is averaged over log-likelihoods of the completion (unlike DPO which is summed only). |
|
||||
| `"ipo"` | The [IPO](https://huggingface.co/papers/2310.12036) authors provide a deeper theoretical understanding of the DPO algorithms and identify an issue with overfitting and propose an alternative loss. In this case, the `beta` is the reciprocal of the gap between the log-likelihood ratios of the chosen vs the rejected completion pair, and thus the smaller the `beta`, the larger this gap is. As per the paper, the loss is averaged over log-likelihoods of the completion (unlike DPO, which is summed only). |
|
||||
| `"simpo"` | The [SimPO](https://huggingface.co/papers/2405.14734) method is also implemented in the [`CPOTrainer`]. SimPO is an alternative loss that adds a reward margin, allows for length normalization, and does not use BC regularization. To use this loss, simply set `loss_type="simpo"` and `cpo_alpha=0.0` in the [`CPOConfig`] and `simpo_gamma` to a recommended value. |
|
||||
| `"alphapo"` | The [AlphaPO](https://huggingface.co/papers/2501.03884) method is also implemented in the [`CPOTrainer`]. This is syntactic sugar that automatically sets `loss_type="simpo"` and `cpo_alpha=0.0`. AlphaPO applies a transformation to the reward function shape in the context of SimPO loss when the `alpha` parameter is non-zero. |
|
||||
|
||||
|
||||
|
||||
### For Mixture of Experts Models: Enabling the auxiliary loss
|
||||
|
||||
MOEs are the most efficient if the load is about equally distributed between experts.
|
||||
To ensure that we train MOEs similarly during preference-tuning, it is beneficial to add the auxiliary loss from the load balancer to the final loss.
|
||||
|
||||
This option is enabled by setting `output_router_logits=True` in the model config (e.g. [`~transformers.MixtralConfig`]).
|
||||
This option is enabled by setting `output_router_logits=True` in the model config (e.g., [`~transformers.MixtralConfig`]).
|
||||
To scale how much the auxiliary loss contributes to the total loss, use the hyperparameter `router_aux_loss_coef=...` (default: `0.001`) in the model config.
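As a short sketch (the checkpoint name is only an example), both options can be set when loading the model:

```python
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mixtral-8x7B-v0.1",
    output_router_logits=True,   # add the load-balancing auxiliary loss to the total loss
    router_aux_loss_coef=0.001,  # default weight of the auxiliary loss
)
```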
|
||||
|
||||
## CPOTrainer
|
||||
|
||||
[[autodoc]] CPOTrainer
|
||||
- train
|
||||
- save_model
|
||||
- push_to_hub
|
||||
|
||||
## CPOConfig
|
||||
|
||||
[[autodoc]] CPOConfig
|
||||
[[autodoc]] CPOConfig
|
||||
|
@ -1,49 +1,7 @@
|
||||
# Training customization
|
||||
|
||||
TRL is designed with modularity in mind so that users to be able to efficiently customize the training loop for their needs. Below are some examples on how you can apply and test different techniques. Note: Although these examples use the DPOTrainer, the customization applies to most (if not all) trainers.
|
||||
TRL is designed with modularity in mind so that users are able to efficiently customize the training loop for their needs. Below are some examples of how you can apply and test different techniques. Note: Although these examples use the DPOTrainer, the customization applies to most (if not all) trainers.
|
||||
|
||||
## Train on multiple GPUs / nodes
|
||||
|
||||
The trainers in TRL use 🤗 Accelerate to enable distributed training across multiple GPUs or nodes. To do so, first create an 🤗 Accelerate config file by running
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
```
|
||||
|
||||
and answering the questions according to your multi-gpu / multi-node setup. You can then launch distributed training by running:
|
||||
|
||||
```bash
|
||||
accelerate launch your_script.py
|
||||
```
|
||||
|
||||
We also provide config files in the [examples folder](https://github.com/huggingface/trl/tree/main/examples/accelerate_configs) that can be used as templates. To use these templates, simply pass the path to the config file when launching a job, e.g.:
|
||||
|
||||
```shell
|
||||
accelerate launch --config_file=examples/accelerate_configs/multi_gpu.yaml --num_processes {NUM_GPUS} path_to_script.py --all_arguments_of_the_script
|
||||
```
|
||||
|
||||
Refer to the [examples page](https://github.com/huggingface/trl/tree/main/examples) for more details.
|
||||
|
||||
### Distributed training with DeepSpeed
|
||||
|
||||
All of the trainers in TRL can be run on multiple GPUs together with DeepSpeed ZeRO-{1,2,3} for efficient sharding of the optimizer states, gradients, and model weights. To do so, run:
|
||||
|
||||
```shell
|
||||
accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero{1,2,3}.yaml --num_processes {NUM_GPUS} path_to_your_script.py --all_arguments_of_the_script
|
||||
```
|
||||
|
||||
Note that for ZeRO-3, a small tweak is needed to initialize your reward model on the correct device via the `zero3_init_context_manager()` context manager. In particular, this is needed to avoid DeepSpeed hanging after a fixed number of training steps. Here is a snippet of what is involved from the [`sentiment_tuning`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo.py) example:
|
||||
|
||||
```python
|
||||
ds_plugin = ppo_trainer.accelerator.state.deepspeed_plugin
|
||||
if ds_plugin is not None and ds_plugin.is_zero3_init_enabled():
|
||||
with ds_plugin.zero3_init_context_manager(enable=False):
|
||||
sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=device)
|
||||
else:
|
||||
sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=device)
|
||||
```
|
||||
|
||||
Consult the 🤗 Accelerate [documentation](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) for more information about the DeepSpeed plugin.
|
||||
|
||||
|
||||
## Use different optimizers and schedulers
|
||||
@ -154,10 +112,10 @@ trainer = DPOTrainer(
|
||||
trainer.train()
|
||||
```
|
||||
|
||||
## Use the CUDA cache optimizer
|
||||
## Use the accelerator cache optimizer
|
||||
|
||||
When training large models, you should better handle the CUDA cache by iteratively clearing it. To do so, simply pass `optimize_cuda_cache=True` to `DPOConfig`:
|
||||
When training large models, it is better to handle the accelerator cache by clearing it iteratively. To do so, simply pass `optimize_device_cache=True` to `DPOConfig`:
|
||||
|
||||
```python
|
||||
training_args = DPOConfig(..., optimize_cuda_cache=True)
|
||||
training_args = DPOConfig(..., optimize_device_cache=True)
|
||||
```
|
||||
|
@ -1,9 +1,17 @@
|
||||
# Data Utilities
|
||||
|
||||
## prepare_multimodal_messages
|
||||
|
||||
[[autodoc]] prepare_multimodal_messages
|
||||
|
||||
## is_conversational
|
||||
|
||||
[[autodoc]] is_conversational
|
||||
|
||||
## is_conversational_from_value
|
||||
|
||||
[[autodoc]] is_conversational_from_value
|
||||
|
||||
## apply_chat_template
|
||||
|
||||
[[autodoc]] apply_chat_template
|
||||
@ -12,6 +20,10 @@
|
||||
|
||||
[[autodoc]] maybe_apply_chat_template
|
||||
|
||||
## maybe_convert_to_chatml
|
||||
|
||||
[[autodoc]] maybe_convert_to_chatml
|
||||
|
||||
## extract_prompt
|
||||
|
||||
[[autodoc]] extract_prompt
|
||||
@ -27,3 +39,11 @@
|
||||
## maybe_unpair_preference_dataset
|
||||
|
||||
[[autodoc]] maybe_unpair_preference_dataset
|
||||
|
||||
## pack_dataset
|
||||
|
||||
[[autodoc]] pack_dataset
|
||||
|
||||
## truncate_dataset
|
||||
|
||||
[[autodoc]] truncate_dataset
|
||||
|
@ -134,6 +134,132 @@ preference_example = {
|
||||
|
||||
Conversational datasets are useful for training chat models, but must be converted into a standard format before being used with TRL trainers. This is typically done using chat templates specific to the model being used. For more information, refer to the [Working with conversational datasets in TRL](#working-with-conversational-datasets-in-trl) section.
|
||||
|
||||
#### Tool Calling
|
||||
|
||||
Some chat templates support *tool calling*, which allows the model to interact with external functions—referred to as **tools**—during generation. This extends the conversational capabilities of the model by enabling it to output a `"tool_calls"` field instead of a standard `"content"` message whenever it decides to invoke a tool.
|
||||
|
||||
After the assistant initiates a tool call, the tool executes and returns its output. The assistant can then process this output and continue the conversation accordingly.
|
||||
|
||||
Here’s a simple example of a tool-calling interaction:
|
||||
|
||||
```python
|
||||
messages = [
|
||||
{"role": "user", "content": "Turn on the living room lights."},
|
||||
{"role": "assistant", "tool_calls": [
|
||||
{"type": "function", "function": {
|
||||
"name": "control_light",
|
||||
"arguments": {"room": "living room", "state": "on"}
|
||||
}}]
|
||||
},
|
||||
{"role": "tool", "name": "control_light", "content": "The lights in the living room are now on."},
|
||||
{"role": "assistant", "content": "Done!"}
|
||||
]
|
||||
```
|
||||
|
||||
When preparing datasets for Supervised Fine-Tuning (SFT) with tool calling, it is important that your dataset includes an additional column named `tools`. This column contains the list of available tools for the model, which is usually used by the chat template to construct the system prompt.
|
||||
|
||||
The tools must be specified in a codified JSON schema format. You can automatically generate this schema from Python function signatures using the [`~transformers.utils.get_json_schema`] utility:
|
||||
|
||||
```python
|
||||
from transformers.utils import get_json_schema
|
||||
|
||||
def control_light(room: str, state: str) -> str:
|
||||
"""
|
||||
Controls the lights in a room.
|
||||
|
||||
Args:
|
||||
room: The name of the room.
|
||||
state: The desired state of the light ("on" or "off").
|
||||
|
||||
Returns:
|
||||
str: A message indicating the new state of the lights.
|
||||
"""
|
||||
return f"The lights in {room} are now {state}."
|
||||
|
||||
# Generate JSON schema
|
||||
json_schema = get_json_schema(control_light)
|
||||
```
|
||||
|
||||
The generated schema would look like:
|
||||
|
||||
```python
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "control_light",
|
||||
"description": "Controls the lights in a room.",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"room": {"type": "string", "description": "The name of the room."},
|
||||
"state": {"type": "string", "description": 'The desired state of the light ("on" or "off").'},
|
||||
},
|
||||
"required": ["room", "state"],
|
||||
},
|
||||
"return": {"type": "string", "description": "str: A message indicating the new state of the lights."},
|
||||
},
|
||||
}
|
||||
```
|
||||
|
||||
A complete dataset entry for SFT might look like:
|
||||
|
||||
```python
|
||||
{"messages": messages, "tools": [json_schema]}
|
||||
```
|
||||
|
||||
For more detailed information on tool calling, refer to the [Tool Calling section in the `transformers` documentation](https://huggingface.co/docs/transformers/chat_extras#tools-and-rag) and the blog post [Tool Use, Unified](https://huggingface.co/blog/unified-tool-use).
|
||||
|
||||
### Harmony
|
||||
|
||||
The [Harmony response format](https://cookbook.openai.com/articles/openai-harmony) was introduced with the [OpenAI GPT OSS models](https://huggingface.co/collections/openai/gpt-oss-68911959590a1634ba11c7a4). It extends the conversational format by adding richer structure for reasoning, function calls, and metadata about the model’s behavior. Key features include:
|
||||
|
||||
- **Developer role** – Provides high-level instructions (similar to a system prompt) and lists available tools.
|
||||
- **Channels** – Separate types of assistant output into distinct streams:
|
||||
|
||||
- `analysis` – for internal reasoning, from the key `"thinking"`
|
||||
- `final` – for the user-facing answer, from the key `"content"`
|
||||
- `commentary` – for tool calls or meta notes
|
||||
|
||||
- **Reasoning effort** – Signals how much thinking the model should show (e.g., `"low"`, `"medium"`, `"high"`).
|
||||
- **Model identity** – Explicitly defines the assistant’s persona.
|
||||
|
||||
```python
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")
|
||||
|
||||
messages = [
|
||||
{"role": "developer", "content": "Use a friendly tone."},
|
||||
{"role": "user", "content": "What is the meaning of life?"},
|
||||
{"role": "assistant", "thinking": "Deep reflection...", "content": "The final answer is..."},
|
||||
]
|
||||
|
||||
print(
|
||||
tokenizer.apply_chat_template(
|
||||
messages,
|
||||
tokenize=False,
|
||||
reasoning_effort="low",
|
||||
model_identity="You are HuggingGPT, a large language model trained by Hugging Face."
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
This produces:
|
||||
|
||||
```txt
|
||||
<|start|>system<|message|>You are HuggingGPT, a large language model trained by Hugging Face.
|
||||
Knowledge cutoff: 2024-06
|
||||
Current date: 2025-08-03
|
||||
|
||||
Reasoning: low
|
||||
|
||||
# Valid channels: analysis, commentary, final. Channel must be included for every message.<|end|><|start|>developer<|message|># Instructions
|
||||
|
||||
Use a friendly tone.<|end|><|start|>user<|message|>What is the meaning of life?<|end|><|start|>assistant<|channel|>analysis<|message|>Deep reflection...<|end|><|start|>assistant<|channel|>final<|message|>The final answer is...<|return|>
|
||||
```
|
||||
|
||||
For full details on message structure, supported fields, and advanced usage, see the [Harmony documentation](https://cookbook.openai.com/articles/openai-harmony).
|
||||
|
||||
### Types
|
||||
|
||||
#### Language modeling
|
||||
@ -152,7 +278,7 @@ language_modeling_example = {"messages": [
|
||||
|
||||
#### Prompt-only
|
||||
|
||||
In a prompt-only dataset, only the initial prompt (the question or partial sentence) is provided under the key `"prompt"`. The training typically involves generating the completion based on this prompt, where the model learns to continue or complete the given input.
|
||||
In a prompt-only dataset, only the initial prompt (the question or partial sentence) is provided under the key `"prompt"`. The training typically involves generating a completion based on this prompt, where the model learns to continue or complete the given input.
|
||||
|
||||
```python
|
||||
# Standard format
|
||||
@ -206,7 +332,7 @@ For examples of prompt-completion datasets, refer to the [Prompt-completion data
|
||||
#### Preference
|
||||
|
||||
A preference dataset is used for tasks where the model is trained to choose between two or more possible completions to the same prompt. This dataset includes a `"prompt"`, a `"chosen"` completion, and a `"rejected"` completion. The model is trained to select the `"chosen"` response over the `"rejected"` response.
|
||||
Some dataset may not include the `"prompt"` column, in which case the prompt is implicit and directly included in the `"chosen"` and `"rejected"` completions. We recommend using explicit prompts whenever possible.
|
||||
Some datasets may not include the `"prompt"` column, in which case the prompt is implicit and directly included in the `"chosen"` and `"rejected"` completions. We recommend using explicit prompts whenever possible.
|
||||
|
||||
```python
|
||||
# Standard format
|
||||
@ -266,7 +392,7 @@ Choosing the right dataset type depends on the task you are working on and the s
|
||||
|
||||
| Trainer | Expected dataset type |
|
||||
| ----------------------- | ------------------------------------------------------------------------------------------------------ |
|
||||
| [`BCOTrainer`] | [Unpaired preference](#unpaired-preference) |
|
||||
| [`BCOTrainer`] | [Unpaired preference](#unpaired-preference) or [Preference (explicit prompt recommended)](#preference) |
|
||||
| [`CPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
|
||||
| [`DPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
|
||||
| [`GKDTrainer`] | [Prompt-completion](#prompt-completion) |
|
||||
@ -279,7 +405,8 @@ Choosing the right dataset type depends on the task you are working on and the s
|
||||
| [`PPOTrainer`] | Tokenized language modeling |
|
||||
| [`PRMTrainer`] | [Stepwise supervision](#stepwise-supervision) |
|
||||
| [`RewardTrainer`] | [Preference (implicit prompt recommended)](#preference) |
|
||||
| [`SFTTrainer`] | [Language modeling](#language-modeling) |
|
||||
| [`RLOOTrainer`] | [Prompt-only](#prompt-only) |
|
||||
| [`SFTTrainer`] | [Language modeling](#language-modeling) or [Prompt-completion](#prompt-completion) |
|
||||
| [`XPOTrainer`] | [Prompt-only](#prompt-only) |
|
||||
|
||||
<Tip>
|
||||
@ -341,7 +468,7 @@ dataset = dataset.map(apply_chat_template, fn_kwargs={"tokenizer": tokenizer})
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
We recommend using the [`apply_chat_template`] function instead of calling `tokenizer.apply_chat_template` directly. Handling chat templates for non-language modeling datasets can be tricky and may result in errors, such as mistakenly placing a system prompt in the middle conversation.
|
||||
We recommend using the [`apply_chat_template`] function instead of calling `tokenizer.apply_chat_template` directly. Handling chat templates for non-language modeling datasets can be tricky and may result in errors, such as mistakenly placing a system prompt in the middle of a conversation.
|
||||
For additional examples, see [#1930 (comment)](https://github.com/huggingface/trl/pull/1930#issuecomment-2292908614). The [`apply_chat_template`] function is designed to handle these intricacies and ensure the correct application of chat templates for various tasks, as sketched below.
|
||||
|
||||
</Tip>
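As a rough sketch of the intended usage (the dataset content here is only illustrative):

```python
from datasets import Dataset
from transformers import AutoTokenizer
from trl import apply_chat_template

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

# A tiny prompt-only dataset in conversational format (illustrative content)
dataset = Dataset.from_list([{"prompt": [{"role": "user", "content": "What color is the sky?"}]}])

# Apply the chat template to every example
dataset = dataset.map(apply_chat_template, fn_kwargs={"tokenizer": tokenizer})
```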
|
||||
@ -830,7 +957,7 @@ dataset = dataset.filter(lambda x: all(x["labels"])).map(concatenate_prompt_comp
|
||||
{'text': 'Blue light scatters more in the atmosphere, so the sky is green.'}
|
||||
```
|
||||
|
||||
### From stepwise supervision to prompt completion dataset
|
||||
### From stepwise supervision to prompt-completion dataset
|
||||
|
||||
To convert a stepwise supervision dataset into a prompt-completion dataset, join the good completions and remove the labels.
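A minimal sketch of such a conversion (the column names and toy content follow the stepwise supervision examples used earlier) could be:

```python
from datasets import Dataset

# Toy stepwise supervision dataset (content mirrors the example above)
dataset = Dataset.from_dict({
    "prompt": ["Blue light"],
    "completions": [[" scatters more in the atmosphere,", " so the sky is green."]],
    "labels": [[True, True]],
})

def join_completions(example):
    # Concatenate the per-step completions into a single completion string
    return {"completion": "".join(example["completions"])}

# Keep only examples whose steps are all labeled as good, then join the steps
dataset = dataset.filter(lambda x: all(x["labels"])).map(
    join_completions, remove_columns=["completions", "labels"]
)
```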
|
||||
|
||||
@ -856,7 +983,7 @@ dataset = dataset.filter(lambda x: all(x["labels"])).map(join_completions, remov
|
||||
{'prompt': 'Blue light', 'completion': ' scatters more in the atmosphere, so the sky is green.'}
|
||||
```
|
||||
|
||||
### From stepwise supervision to prompt only dataset
|
||||
### From stepwise supervision to prompt-only dataset
|
||||
|
||||
To convert a stepwise supervision dataset into a prompt-only dataset, remove the completions and the labels.
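A one-line sketch, assuming `dataset` is a stepwise supervision dataset as above:

```python
# Drop the per-step completions and labels, keeping only the "prompt" column
dataset = dataset.remove_columns(["completions", "labels"])
```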
|
||||
|
||||
@ -907,7 +1034,7 @@ dataset = dataset.map(merge_completions_and_labels, remove_columns=["completions
|
||||
|
||||
## Vision datasets
|
||||
|
||||
Some trainers also support fine-tuning vision-language models (VLMs) using image-text pairs. In this scenario, it's recommended to use a conversational format, as each model handles image placeholders in text differently.
|
||||
Some trainers also support fine-tuning vision-language models (VLMs) using image-text pairs. In this scenario, it's recommended to use a conversational format, as each model handles image placeholders in text differently.
|
||||
|
||||
A conversational vision dataset differs from a standard conversational dataset in two key ways:
|
||||
|
||||
@ -935,4 +1062,3 @@ An example of a conversational vision dataset is the [openbmb/RLAIF-V-Dataset](h
|
||||
width="100%"
|
||||
height="560px"
|
||||
></iframe>
|
||||
|
||||
|
@ -4,4 +4,36 @@
|
||||
|
||||
Section under construction. Feel free to contribute!
|
||||
|
||||
</Tip>
|
||||
</Tip>
|
||||
|
||||
TRL supports training with DeepSpeed, a library that implements advanced training optimization techniques. These include optimizer state partitioning, offloading, gradient partitioning, and more.
|
||||
|
||||
DeepSpeed integrates the [Zero Redundancy Optimizer (ZeRO)](https://huggingface.co/papers/1910.02054), which allows you to scale the model size in proportion to the number of devices while sustaining high efficiency.
|
||||
|
||||

|
||||
|
||||
## Installation
|
||||
|
||||
To use DeepSpeed with TRL, install it using the following command:
|
||||
|
||||
```bash
|
||||
pip install deepspeed
|
||||
```
|
||||
|
||||
## Running Training Scripts with DeepSpeed
|
||||
|
||||
No modifications to your training script are required. Simply run it with the DeepSpeed configuration file:
|
||||
|
||||
```bash
|
||||
accelerate launch --config_file <ACCELERATE_WITH_DEEPSPEED_CONFIG_FILE.yaml> train.py
|
||||
```
|
||||
|
||||
We provide ready-to-use DeepSpeed configuration files in the [`examples/accelerate_configs`](https://github.com/huggingface/trl/tree/main/examples/accelerate_configs) directory. For example, to run training with ZeRO Stage 2, use the following command:
|
||||
|
||||
```bash
|
||||
accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml train.py
|
||||
```
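Here, `train.py` can be any regular TRL training script; a minimal sketch (the model and dataset below are placeholders) might look like:

```python
# train.py -- minimal sketch; the model and dataset names are placeholders
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer

dataset = load_dataset("trl-lib/Capybara", split="train")
training_args = SFTConfig(output_dir="Qwen2-0.5B-SFT")
trainer = SFTTrainer(model="Qwen/Qwen2-0.5B-Instruct", args=training_args, train_dataset=dataset)
trainer.train()
```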
|
||||
|
||||
## Additional Resources
|
||||
|
||||
Consult the 🤗 Accelerate [documentation](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) for more information about the DeepSpeed plugin.
|
||||
|
@ -30,7 +30,7 @@ We selected the following models for our experiments to show that TRL can be eas
|
||||
* [`EleutherAI/gpt-neo-2.7B`](https://huggingface.co/EleutherAI/gpt-neo-2.7B) (2.7 billion parameters)
|
||||
* [`EleutherAI/gpt-j-6B`](https://huggingface.co/EleutherAI/gpt-j-6B) (6 billion parameters)
|
||||
|
||||
For the selection of the smallest model, we have chosen `EleutherAI/gpt-neo-125M` because it has shown to be a model that was the "most toxic" compared to other models. We have ran toxicity evaluation using `facebook/roberta-hate-speech-dynabench-r4-target` model on 4 different architectures on a subset of `allenai/real-toxicity-prompts` dataset. Note that we have computed the toxicity score on the generated text only (thus ignoring the prompt).
|
||||
For the selection of the smallest model, we have chosen `EleutherAI/gpt-neo-125M` because it has shown to be the "most toxic" model compared to the other models. We have run a toxicity evaluation using the `facebook/roberta-hate-speech-dynabench-r4-target` model on 4 different architectures on a subset of the `allenai/real-toxicity-prompts` dataset. Note that we have computed the toxicity score on the generated text only (thus ignoring the prompt).
|
||||
|
||||
| Model | Mean toxicity score |
|
||||
|---|---|
|
||||
@ -88,7 +88,7 @@ As a compromise between the two we took for a context window of 10 to 15 tokens
|
||||
|
||||
### How to deal with OOM issues
|
||||
|
||||
Our goal is to train models up to 6B parameters, which is about 24GB in float32! Here two tricks we use to be able to train a 6B model on a single 40GB-RAM GPU:
|
||||
Our goal is to train models up to 6B parameters, which is about 24GB in float32! Here are two tricks we use to be able to train a 6B model on a single 40GB-RAM GPU:
|
||||
|
||||
- Use `bfloat16` precision: Simply load your model in `bfloat16` when calling `from_pretrained` and you can reduce the size of the model by a factor of 2:
|
||||
|
||||
@ -174,7 +174,7 @@ The evaluation script can be found [here](https://github.com/huggingface/trl/blo
|
||||
|
||||
### Discussions
|
||||
|
||||
The results are quite promising, as we can see that the models are able to reduce the toxicity score of the generated text by an interesting margin. The gap is clear for `gpt-neo-2B` model but we less so for the `gpt-j-6B` model. There are several things we could try to improve the results on the largest model starting with training with larger `mini_batch_size` and probably allowing to back-propagate through more layers (i.e. use less shared layers).
|
||||
The results are quite promising, as we can see that the models are able to reduce the toxicity score of the generated text by an interesting margin. The gap is clear for the `gpt-neo-2B` model but less so for the `gpt-j-6B` model. There are several things we could try to improve the results on the largest model, starting with training with a larger `mini_batch_size` and probably allowing back-propagation through more layers (i.e. using fewer shared layers).
|
||||
|
||||
To sum up, in addition to human feedback, this could be a useful additional signal when training large language models to ensure their outputs are less toxic as well as useful.
|
||||
|
||||
|
60
docs/source/distributing_training.md
Normal file
@ -0,0 +1,60 @@
|
||||
# Distributing Training
|
||||
|
||||
<Tip warning={true}>
|
||||
Section under construction. Feel free to contribute!
|
||||
</Tip>
|
||||
|
||||
## Multi-GPU Training with TRL
|
||||
|
||||
The trainers in TRL use [🤗 Accelerate](https://github.com/huggingface/accelerate) to enable distributed training across multiple GPUs or nodes. To do so, first create an [🤗 Accelerate](https://github.com/huggingface/accelerate) config file by running
|
||||
|
||||
```bash
|
||||
accelerate config
|
||||
```
|
||||
|
||||
and answering the questions according to your multi-GPU / multi-node setup. You can then launch distributed training by running:
|
||||
|
||||
```bash
|
||||
accelerate launch train.py
|
||||
```
|
||||
|
||||
We also provide config files in the [examples folder](https://github.com/huggingface/trl/tree/main/examples/accelerate_configs) that can be used as templates. To use these templates, simply pass the path to the config file when launching a job, e.g.:
|
||||
|
||||
```shell
|
||||
accelerate launch --config_file examples/accelerate_configs/multi_gpu.yaml train.py <SCRIPT_ARGS>
|
||||
```
|
||||
|
||||
This automatically distributes the workload across all available GPUs.
|
||||
|
||||
Under the hood, [🤗 Accelerate](https://github.com/huggingface/accelerate) creates one model per GPU. Each process:
|
||||
- Processes its own batch of data
|
||||
- Computes the loss and gradients for that batch
|
||||
- Shares gradient updates across all GPUs
|
||||
|
||||

|
||||
|
||||
The effective batch size is calculated as:
|
||||
|
||||
$$
|
||||
\text{Batch Size} = \text{per\_device\_train\_batch\_size} \times \text{num\_devices} \times \text{gradient\_accumulation\_steps}
|
||||
$$
|
||||
|
||||
To maintain a consistent batch size when scaling to multiple GPUs, make sure to update `per_device_train_batch_size` and `gradient_accumulation_steps` accordingly.
|
||||
|
||||
For example, the following configurations are equivalent and should yield the same results:
|
||||
|
||||
| Number of GPUs | Per device batch size | Gradient accumulation steps | Comments |
|
||||
| --- | --- | --- | --- |
|
||||
| 1 | 32 | 1 | Possibly high memory usage, but faster training |
|
||||
| 1 | 4 | 8 | Lower memory usage, slower training |
|
||||
| 8 | 4 | 1 | Multi-GPU to get the best of both worlds |
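Expressed in code, the two single-GPU rows above correspond to configurations like the following (a sketch; only the batch-size-related arguments are shown, and `SFTConfig` stands in for any TRL config with these fields):

```python
from trl import SFTConfig

# Effective batch size = 32 x 1 GPU x 1 = 32 (higher memory usage, faster)
fast_args = SFTConfig(output_dir="out", per_device_train_batch_size=32, gradient_accumulation_steps=1)

# Effective batch size = 4 x 1 GPU x 8 = 32 (lower memory usage, slower)
low_memory_args = SFTConfig(output_dir="out", per_device_train_batch_size=4, gradient_accumulation_steps=8)
```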
|
||||
|
||||
<Tip>
|
||||
|
||||
Having one model per GPU can lead to high memory usage, which may not be feasible for large models or low-memory GPUs. In such cases, you can leverage [DeepSpeed](https://github.com/deepspeedai/DeepSpeed), which provides optimizations like model sharding, Zero Redundancy Optimizer, mixed precision training, and offloading to CPU or NVMe. Check out our [DeepSpeed Integration](deepspeed_integration) guide for more details.
|
||||
|
||||
</Tip>
|
||||
|
||||
## Multi-Node Training
|
||||
|
||||
We're working on a guide for multi-node training. Stay tuned! 🚀
|
@ -46,7 +46,7 @@ model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
|
||||
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
|
||||
train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
|
||||
|
||||
training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO", logging_steps=10)
|
||||
training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO")
|
||||
trainer = DPOTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset)
|
||||
trainer.train()
|
||||
```
|
||||
@ -61,27 +61,19 @@ Distributed across 8 GPUs, the training takes approximately 3 minutes. You can v
|
||||
|
||||

|
||||
|
||||
To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-DPO) performs, you can use the [TRL Chat CLI](clis#chat-interface).
|
||||
To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-DPO) performs, you can use the [Transformers Chat CLI](https://huggingface.co/docs/transformers/quicktour#chat-with-text-generation-models).
|
||||
|
||||
<pre><code>$ trl chat --model_name_or_path trl-lib/Qwen2-0.5B-DPO
|
||||
<strong><span style="color: red;"><quentin_gallouedec>:</span></strong>
|
||||
What is the best programming language?
|
||||
<pre><code>$ transformers chat trl-lib/Qwen2-0.5B-DPO
|
||||
<strong><span style="color: red;"><shirin_yamani>:</span></strong>
|
||||
What is Huggingface?
|
||||
|
||||
<strong><span style="color: blue;"><trl-lib/Qwen2-0.5B-DPO>:</span></strong>
|
||||
The best programming language for specific applications can vary depending on the use case and knowledge level of the programmer. Here are some general factors that can be used as input to choose the best programming language:
|
||||
|
||||
<strong><span style="color: green;">1</span></strong> Ease of use: Some programming languages are more user-friendly than others, such as Python, Java, or Ruby. Python is popular due to its simplicity and great scalability.
|
||||
<strong><span style="color: green;">2</span></strong> Versatility: The ability to work with a wide range of data structures and frameworks can define the language as versatile.
|
||||
<strong><span style="color: green;">3</span></strong> Ease of learning: Different programming languages have different learning curves, so users must be willing to take some time to master one.
|
||||
<strong><span style="color: green;">4</span></strong> Community support: The broader community of developers and enthusiasts in the selected programming language can provide great support and resources.
|
||||
<strong><span style="color: green;">5</span></strong> Reusability: Languages that emphasize code reuse and can be easily modifiable can be more suitable for software development.
|
||||
|
||||
The best programming language based on these factors is subjective and depends on what the programmer intends to accomplish.
|
||||
Huggingface is a platform that allows users to access a variety of open-source machine learning resources such as pre-trained models and datasets Huggingface is a platform that allows users to access a variety of open-source machine learning resources such as pre-trained models and datasets for the development of machine learning models and applications. It provides a repository of over 300, 000 pre-trained models in Huggingface is a platform that allows users to access a variety of open-source machine learning resources such as pre-trained models and datasets for the development of machine learning models and applications. It provides a repository of over 300, 000 pre-trained models in a variety of languages, enabling users to explore and utilize the latest techniques and technologies in the field of machine learning.
|
||||
</code></pre>
|
||||
|
||||
## Expected dataset type
|
||||
|
||||
DPO requires a [preference dataset](dataset_formats#preference). The [`DPOTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset format. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
|
||||
DPO requires a [preference dataset](dataset_formats#preference). The [`DPOTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset formats. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
|
||||
|
||||
Although the [`DPOTrainer`] supports both explicit and implicit prompts, we recommend using explicit prompts. If provided with an implicit prompt dataset, the trainer will automatically extract the prompt from the `"chosen"` and `"rejected"` columns. For more information, refer to the [preference style](dataset_formats#preference) section.
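For reference, a single explicit-prompt preference example in the conversational format looks roughly like this:

```python
# Sketch of one preference example with an explicit prompt (conversational format)
example = {
    "prompt": [{"role": "user", "content": "What color is the sky?"}],
    "chosen": [{"role": "assistant", "content": "It is blue."}],
    "rejected": [{"role": "assistant", "content": "It is green."}],
}
```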
|
||||
|
||||
@ -93,7 +85,7 @@ Additionally, unlike standard text-based models where a `tokenizer` is used, for
|
||||
|
||||
```diff
|
||||
- model = AutoModelForCausalLM.from_pretrained(model_id)
|
||||
+ model = AutoModelForVision2Seq.from_pretrained(model_id)
|
||||
+ model = AutoModelForImageTextToText.from_pretrained(model_id)
|
||||
|
||||
- tokenizer = AutoTokenizer.from_pretrained(model_id)
|
||||
+ processor = AutoProcessor.from_pretrained(model_id)
|
||||
@ -121,13 +113,12 @@ accelerate launch trl/scripts/dpo.py \
|
||||
--model_name_or_path Qwen/Qwen2-0.5B-Instruct \
|
||||
--dataset_name trl-lib/ultrafeedback_binarized \
|
||||
--num_train_epochs 1 \
|
||||
--logging_steps 25 \
|
||||
--output_dir Qwen2-0.5B-DPO
|
||||
```
|
||||
|
||||
## Logged metrics
|
||||
|
||||
While training and evaluating we record the following reward metrics:
|
||||
While training and evaluating, we record the following reward metrics:
|
||||
|
||||
- `rewards/chosen`: the mean difference between the log probabilities of the policy model and the reference model for the chosen responses scaled by beta
|
||||
- `rewards/rejected`: the mean difference between the log probabilities of the policy model and the reference model for the rejected responses scaled by beta
|
||||
@ -138,19 +129,36 @@ While training and evaluating we record the following reward metrics:
|
||||
|
||||
The DPO algorithm supports several loss functions. The loss function can be set using the `loss_type` parameter in the [`DPOConfig`]. The following loss functions are supported:
|
||||
|
||||
| `loss_type=` | Description |
|
||||
| -------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| `"sigmoid"` (default) | Given the preference data, we can fit a binary classifier according to the Bradley-Terry model and in fact the [DPO](https://huggingface.co/papers/2305.18290) authors propose the sigmoid loss on the normalized likelihood via the `logsigmoid` to fit a logistic regression. |
|
||||
| `"hinge"` | The [RSO](https://huggingface.co/papers/2309.06657) authors propose to use a hinge loss on the normalized likelihood from the [SLiC](https://huggingface.co/papers/2305.10425) paper. In this case, the `beta` is the reciprocal of the margin. |
|
||||
| `"ipo"` | The [IPO](https://huggingface.co/papers/2310.12036) authors provide a deeper theoretical understanding of the DPO algorithms and identify an issue with overfitting and propose an alternative loss. In this case, the `beta` is the reciprocal of the gap between the log-likelihood ratios of the chosen vs the rejected completion pair and thus the smaller the `beta` the larger this gaps is. As per the paper the loss is averaged over log-likelihoods of the completion (unlike DPO which is summed only). |
|
||||
| `"exo_pair"` | The [EXO](https://huggingface.co/papers/2402.00856) authors propose to minimize the reverse KL instead of the negative log-sigmoid loss of DPO which corresponds to forward KL. Setting non-zero `label_smoothing` (default `1e-3`) leads to a simplified version of EXO on pair-wise preferences (see Eqn. (16) of the [EXO paper](https://huggingface.co/papers/2402.00856)). The full version of EXO uses `K>2` completions generated by the SFT policy, which becomes an unbiased estimator of the PPO objective (up to a constant) when `K` is sufficiently large. |
|
||||
| `"nca_pair"` | The [NCA](https://huggingface.co/papers/2402.05369) authors shows that NCA optimizes the absolute likelihood for each response rather than the relative likelihood. |
|
||||
| `"robust"` | The [Robust DPO](https://huggingface.co/papers/2403.00409) authors propose an unbiased estimate of the DPO loss that is robust to preference noise in the data. Like in cDPO, it assumes that the preference labels are noisy with some probability. In this approach, the `label_smoothing` parameter in the [`DPOConfig`] is used to model the probability of existing label noise. To apply this conservative loss, set `label_smoothing` to a value greater than 0.0 (between 0.0 and 0.5; the default is 0.0) |
|
||||
| `"bco_pair"` | The [BCO](https://huggingface.co/papers/2404.04656) authors train a binary classifier whose logit serves as a reward so that the classifier maps {prompt, chosen completion} pairs to 1 and {prompt, rejected completion} pairs to 0. For unpaired data, we recommend the dedicated [`BCOTrainer`]. |
|
||||
| `"sppo_hard"` | The [SPPO](https://huggingface.co/papers/2405.00675) authors claim that SPPO is capable of solving the Nash equilibrium iteratively by pushing the chosen rewards to be as large as 1/2 and the rejected rewards to be as small as -1/2 and can alleviate data sparsity issues. The implementation approximates this algorithm by employing hard label probabilities, assigning 1 to the winner and 0 to the loser. |
|
||||
| `"aot"` or `loss_type="aot_pair"` | The [AOT](https://huggingface.co/papers/2406.05882) authors propose to use Distributional Preference Alignment Via Optimal Transport. Traditionally, the alignment algorithms use paired preferences at a sample level, which does not ensure alignment on the distributional level. AOT, on the other hand, can align LLMs on paired or unpaired preference data by making the reward distribution of the positive samples stochastically dominant in the first order on the distribution of negative samples. Specifically, `loss_type="aot"` is appropriate for paired datasets, where each prompt has both chosen and rejected responses; `loss_type="aot_pair"` is for unpaired datasets. In a nutshell, `loss_type="aot"` ensures that the log-likelihood ratio of chosen to rejected of the aligned model has higher quantiles than that ratio for the reference model. `loss_type="aot_pair"` ensures that the chosen reward is higher on all quantiles than the rejected reward. Note that in both cases quantiles are obtained via sorting. To fully leverage the advantages of the AOT algorithm, it is important to maximize the per-GPU batch size. |
|
||||
| `"apo_zero"` or `loss_type="apo_down"` | The [APO](https://huggingface.co/papers/2408.06266) method introduces an "anchored" version of the alignment objective. There are two variants: `apo_zero` and `apo_down`. The `apo_zero` loss increases the likelihood of winning outputs while decreasing the likelihood of losing outputs, making it suitable when the model is less performant than the winning outputs. On the other hand, `apo_down` decreases the likelihood of both winning and losing outputs, but with a stronger emphasis on reducing the likelihood of losing outputs. This variant is more effective when the model is better than the winning outputs. |
|
||||
| `"discopop"` | The [DiscoPOP](https://huggingface.co/papers/2406.08414) paper uses LLMs to discover more efficient offline preference optimization losses. In the paper the proposed DiscoPOP loss (which is a log-ratio modulated loss) outperformed other optimization losses on different tasks (IMDb positive text generation, Reddit TLDR summarization, and Alpaca Eval 2.0). |
|
||||
| `loss_type=` | Description |
|
||||
| --- | --- |
|
||||
| `"sigmoid"` (default) | Given the preference data, we can fit a binary classifier according to the Bradley-Terry model and in fact the [DPO](https://huggingface.co/papers/2305.18290) authors propose the sigmoid loss on the normalized likelihood via the `logsigmoid` to fit a logistic regression. |
|
||||
| `"hinge"` | The [RSO](https://huggingface.co/papers/2309.06657) authors propose to use a hinge loss on the normalized likelihood from the [SLiC](https://huggingface.co/papers/2305.10425) paper. In this case, the `beta` is the reciprocal of the margin. |
|
||||
| `"ipo"` | The [IPO](https://huggingface.co/papers/2310.12036) authors provide a deeper theoretical understanding of the DPO algorithms and identify an issue with overfitting and propose an alternative loss. In this case, the `beta` is the reciprocal of the gap between the log-likelihood ratios of the chosen vs the rejected completion pair and thus the smaller the `beta` the larger this gaps is. As per the paper the loss is averaged over log-likelihoods of the completion (unlike DPO which is summed only). |
|
||||
| `"exo_pair"` | The [EXO](https://huggingface.co/papers/2402.00856) authors propose to minimize the reverse KL instead of the negative log-sigmoid loss of DPO which corresponds to forward KL. Setting non-zero `label_smoothing` (default `1e-3`) leads to a simplified version of EXO on pair-wise preferences (see Eqn. (16) of the [EXO paper](https://huggingface.co/papers/2402.00856)). The full version of EXO uses `K>2` completions generated by the SFT policy, which becomes an unbiased estimator of the PPO objective (up to a constant) when `K` is sufficiently large. |
|
||||
| `"nca_pair"` | The [NCA](https://huggingface.co/papers/2402.05369) authors shows that NCA optimizes the absolute likelihood for each response rather than the relative likelihood. |
|
||||
| `"robust"` | The [Robust DPO](https://huggingface.co/papers/2403.00409) authors propose an unbiased estimate of the DPO loss that is robust to preference noise in the data. Like in cDPO, it assumes that the preference labels are noisy with some probability. In this approach, the `label_smoothing` parameter in the [`DPOConfig`] is used to model the probability of existing label noise. To apply this conservative loss, set `label_smoothing` to a value greater than 0.0 (between 0.0 and 0.5; the default is 0.0) |
|
||||
| `"bco_pair"` | The [BCO](https://huggingface.co/papers/2404.04656) authors train a binary classifier whose logit serves as a reward so that the classifier maps {prompt, chosen completion} pairs to 1 and {prompt, rejected completion} pairs to 0. For unpaired data, we recommend the dedicated [`BCOTrainer`]. |
|
||||
| `"sppo_hard"` | The [SPPO](https://huggingface.co/papers/2405.00675) authors claim that SPPO is capable of solving the Nash equilibrium iteratively by pushing the chosen rewards to be as large as 1/2 and the rejected rewards to be as small as -1/2 and can alleviate data sparsity issues. The implementation approximates this algorithm by employing hard label probabilities, assigning 1 to the winner and 0 to the loser. |
|
||||
| `"aot"` or `loss_type="aot_pair"` | The [AOT](https://huggingface.co/papers/2406.05882) authors propose to use Distributional Preference Alignment Via Optimal Transport. Traditionally, the alignment algorithms use paired preferences at a sample level, which does not ensure alignment on the distributional level. AOT, on the other hand, can align LLMs on paired or unpaired preference data by making the reward distribution of the positive samples stochastically dominant in the first order on the distribution of negative samples. Specifically, `loss_type="aot"` is appropriate for paired datasets, where each prompt has both chosen and rejected responses; `loss_type="aot_pair"` is for unpaired datasets. In a nutshell, `loss_type="aot"` ensures that the log-likelihood ratio of chosen to rejected of the aligned model has higher quantiles than that ratio for the reference model. `loss_type="aot_pair"` ensures that the chosen reward is higher on all quantiles than the rejected reward. Note that in both cases quantiles are obtained via sorting. To fully leverage the advantages of the AOT algorithm, it is important to maximize the per-GPU batch size. |
|
||||
| `"apo_zero"` or `loss_type="apo_down"` | The [APO](https://huggingface.co/papers/2408.06266) method introduces an "anchored" version of the alignment objective. There are two variants: `apo_zero` and `apo_down`. The `apo_zero` loss increases the likelihood of winning outputs while decreasing the likelihood of losing outputs, making it suitable when the model is less performant than the winning outputs. On the other hand, `apo_down` decreases the likelihood of both winning and losing outputs, but with a stronger emphasis on reducing the likelihood of losing outputs. This variant is more effective when the model is better than the winning outputs. |
|
||||
| `"discopop"` | The [DiscoPOP](https://huggingface.co/papers/2406.08414) paper uses LLMs to discover more efficient offline preference optimization losses. In the paper the proposed DiscoPOP loss (which is a log-ratio modulated loss) outperformed other optimization losses on different tasks (IMDb positive text generation, Reddit TLDR summarization, and Alpaca Eval 2.0). |
|
||||
| `"sft"` | SFT (Supervised Fine-Tuning) loss is the negative log likelihood loss, used to train the model to generate preferred responses. |
|
||||
|
||||
### Multi-loss combinations
|
||||
|
||||
The DPO trainer supports combining multiple loss functions with different weights, enabling more sophisticated optimization strategies. This is particularly useful for implementing algorithms like MPO (Mixed Preference Optimization). MPO is a training approach that combines multiple optimization objectives, as described in the paper [Enhancing the Reasoning Ability of Multimodal Large Language Models via Mixed Preference Optimization](https://huggingface.co/papers/2411.10442).
|
||||
|
||||
To combine multiple losses, specify the loss types and corresponding weights as lists:
|
||||
|
||||
```python
|
||||
# MPO: Combines DPO (sigmoid) for preference and BCO (bco_pair) for quality
|
||||
training_args = DPOConfig(
|
||||
loss_type=["sigmoid", "bco_pair", "sft"], # Loss types to combine
|
||||
loss_weights=[0.8, 0.2, 1.0] # Corresponding weights, as used in the MPO paper
|
||||
)
|
||||
```
|
||||
|
||||
If `loss_weights` is not provided, all loss types will have equal weights (1.0 by default).
|
||||
|
||||
### Label smoothing
|
||||
|
||||
@ -168,6 +176,10 @@ The [RPO](https://huggingface.co/papers/2404.19733) paper implements an iterativ
|
||||
|
||||
The [WPO](https://huggingface.co/papers/2406.11827) paper adapts off-policy data to resemble on-policy data more closely by reweighting preference pairs according to their probability under the current policy. To use this method, set the `use_weighting` flag to `True` in the [`DPOConfig`].
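For example (a sketch; all other training arguments are omitted):

```python
from trl import DPOConfig

# Enable WPO-style reweighting of preference pairs
training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO", use_weighting=True)
```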
|
||||
|
||||
### LD-DPO loss
|
||||
|
||||
The [LD-DPO](https://huggingface.co/papers/2409.06411) paper decomposes the portion of the response that exceeds the desired length into two components — human-like preferences and verbosity preference — based on a mixing coefficient \\( \alpha \\). To use this method, set the `ld_alpha` in the [`DPOConfig`] to an appropriate value. The paper suggests setting this value between `0.0` and `1.0`.
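For example (a sketch; the `0.5` value is only illustrative):

```python
from trl import DPOConfig

# Enable LD-DPO with a mixing coefficient of 0.5 (illustrative value)
training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO", ld_alpha=0.5)
```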
|
||||
|
||||
### For Mixture of Experts Models: Enabling the auxiliary loss
|
||||
|
||||
MOEs are the most efficient if the load is about equally distributed between experts.
|
||||
@ -180,7 +192,7 @@ To scale how much the auxiliary loss contributes to the total loss, use the hype
|
||||
|
||||
You can further accelerate QLoRA / LoRA (2x faster, 60% less memory) using the [`unsloth`](https://github.com/unslothai/unsloth) library that is fully compatible with `SFTTrainer`. Currently, `unsloth` supports only Llama (Yi, TinyLlama, Qwen, Deepseek, etc.) and Mistral architectures. Some benchmarks for DPO are listed below:
|
||||
|
||||
| GPU | Model | Dataset | 🤗 | 🤗 + Flash Attention 2 | 🦥 Unsloth | 🦥 VRAM saved |
|
||||
| GPU | Model | Dataset | 🤗 | 🤗 + FlashAttention 2 | 🦥 Unsloth | 🦥 VRAM saved |
|
||||
| -------- | --------- | ---------- | --- | --------------------- | --------- | ------------ |
|
||||
| A100 40G | Zephyr 7b | Ultra Chat | 1x | 1.24x | **1.88x** | -11.6% |
|
||||
| Tesla T4 | Zephyr 7b | Ultra Chat | 1x | 1.09x | **1.55x** | -18.6% |
|
||||
@ -199,8 +211,8 @@ First install `unsloth` according to the [official documentation](https://github
|
||||
+ model = FastLanguageModel.get_peft_model(model)
|
||||
train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
|
||||
|
||||
- training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO", logging_steps=10)
|
||||
+ training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO", logging_steps=10, bf16=True)
|
||||
- training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO")
|
||||
+ training_args = DPOConfig(output_dir="Qwen2-0.5B-DPO", bf16=True)
|
||||
trainer = DPOTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset)
|
||||
trainer.train()
|
||||
|
||||
@ -246,7 +258,6 @@ model = AutoModelForCausalLM.from_pretrained(
|
||||
torch_dtype=torch.bfloat16,
|
||||
device_map="auto",
|
||||
)
|
||||
model.config.use_cache = False
|
||||
|
||||
# Load the adapter.
|
||||
model = PeftModel.from_pretrained(
|
||||
@ -273,6 +284,9 @@ dpo_trainer = DPOTrainer(
|
||||
## DPOTrainer
|
||||
|
||||
[[autodoc]] DPOTrainer
|
||||
- train
|
||||
- save_model
|
||||
- push_to_hub
|
||||
|
||||
## DPOConfig
|
||||
|
||||
@ -280,4 +294,4 @@ dpo_trainer = DPOTrainer(
|
||||
|
||||
## DataCollatorForPreference
|
||||
|
||||
[[autodoc]] trainer.dpo_trainer.DataCollatorForPreference
|
||||
[[autodoc]] trainer.dpo_trainer.DataCollatorForPreference
|
||||
|
@ -5,21 +5,21 @@
|
||||
|
||||
The examples should work in any of the following settings (with the same script):
|
||||
- single GPU
|
||||
- multi GPUS (using PyTorch distributed mode)
|
||||
- multi GPUS (using DeepSpeed ZeRO-Offload stages 1, 2, & 3)
|
||||
- multi GPUs (using PyTorch distributed mode)
|
||||
- multi GPUs (using DeepSpeed ZeRO-Offload stages 1, 2, & 3)
|
||||
- fp16 (mixed-precision), fp32 (normal precision), or bf16 (bfloat16 precision)
|
||||
|
||||
To run it in each of these various modes, first initialize the accelerate
|
||||
configuration with `accelerate config`
|
||||
|
||||
**NOTE to train with a 4-bit or 8-bit model**, please run
|
||||
To train with a 4-bit or 8-bit model, please run:
|
||||
|
||||
```bash
|
||||
pip install --upgrade trl[quantization]
|
||||
```
|
||||
|
||||
|
||||
## Accelerate Config
|
||||
|
||||
For all the examples, you'll need to generate a 🤗 Accelerate config file with:
|
||||
|
||||
```shell
|
||||
@ -29,30 +29,49 @@ accelerate config # will prompt you to define the training configuration
|
||||
Then, it is encouraged to launch jobs with `accelerate launch`!
|
||||
|
||||
|
||||
# Maintained Examples
|
||||
## Maintained Examples
|
||||
|
||||
Scripts can be used as examples of how to use TRL trainers. They are located in the [`trl/scripts`](https://github.com/huggingface/trl/blob/main/trl/scripts) directory. Additionally, we provide examples in the [`examples/scripts`](https://github.com/huggingface/trl/blob/main/examples/scripts) directory. These examples are maintained and tested regularly.
|
||||
|
||||
| File | Description |
|
||||
| ----------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
|
||||
| [`examples/scripts/alignprop.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/alignprop.py) | This script shows how to use the [`AlignPropTrainer`] to fine-tune a diffusion model. |
|
||||
| [`examples/scripts/bco.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/bco.py) | This script shows how to use the [`KTOTrainer`] with the BCO loss to fine-tune a model to increase instruction-following, truthfulness, honesty and helpfulness using the [openbmb/UltraFeedback](https://huggingface.co/datasets/openbmb/UltraFeedback) dataset. |
|
||||
| [`examples/scripts/cpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/cpo.py) | This script shows how to use the [`CPOTrainer`] to fine-tune a model to increase helpfulness and harmlessness using the [Anthropic/hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf) dataset. |
|
||||
| [`examples/scripts/ddpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ddpo.py) | This script shows how to use the [`DDPOTrainer`] to fine-tune a stable diffusion model using reinforcement learning. |
|
||||
| [`examples/scripts/dpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/dpo_vlm.py) | This script shows how to use the [`DPOTrainer`] to fine-tune a Vision Language Model to reduce hallucinations using the [openbmb/RLAIF-V-Dataset](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset) dataset. |
|
||||
| [`examples/scripts/orpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/orpo.py) | This script shows how to use the [`ORPOTrainer`] to fine-tune a model to increase helpfulness and harmlessness using the [Anthropic/hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf) dataset. |
|
||||
| [`examples/scripts/ppo/ppo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo/ppo.py) | This script shows how to use the [`PPOTrainer`] to fine-tune a model to improve its ability to continue text with positive sentiment or physically descriptive language |
|
||||
| [`examples/scripts/ppo/ppo_tldr.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo/ppo_tldr.py) | This script shows how to use the [`PPOTrainer`] to fine-tune a model to improve its ability to generate TL;DR summaries. |
|
||||
| [`examples/scripts/reward_modeling.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/reward_modeling.py) | This script shows how to use the [`RewardTrainer`] to train a reward model on your own dataset. |
|
||||
| [`examples/scripts/sft_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/sft_vlm.py) | This script shows how to use the [`SFTTrainer`] to fine-tune a Vision Language Model in a chat setting. The script has only been tested with [LLaVA 1.5](https://huggingface.co/llava-hf/llava-1.5-7b-hf), [LLaVA 1.6](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf), and [Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) models so users may see unexpected behaviour in other model architectures. |
|
||||
| File | Description |
|
||||
| --- | --- |
|
||||
| [`examples/scripts/alignprop.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/alignprop.py) | This script shows how to use the [`AlignPropTrainer`] to fine-tune a diffusion model. |
|
||||
| [`examples/scripts/bco.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/bco.py) | This script shows how to use the [`KTOTrainer`] with the BCO loss to fine-tune a model to increase instruction-following, truthfulness, honesty and helpfulness using the [openbmb/UltraFeedback](https://huggingface.co/datasets/openbmb/UltraFeedback) dataset. |
|
||||
| [`examples/scripts/cpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/cpo.py) | This script shows how to use the [`CPOTrainer`] to fine-tune a model to increase helpfulness and harmlessness using the [Anthropic/hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf) dataset. |
|
||||
| [`examples/scripts/ddpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ddpo.py) | This script shows how to use the [`DDPOTrainer`] to fine-tune a stable diffusion model using reinforcement learning. |
|
||||
| [`trl/scripts/dpo.py`](https://github.com/huggingface/trl/blob/main/trl/scripts/dpo.py) | This script shows how to use the [`DPOTrainer`] to fine-tune a model. |
|
||||
| [`examples/scripts/dpo_online.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/dpo_online.py) | This script shows how to use the [`OnlineDPOTrainer`] to fine-tune a model. |
|
||||
| [`examples/scripts/dpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/dpo_vlm.py) | This script shows how to use the [`DPOTrainer`] to fine-tune a Vision Language Model to reduce hallucinations using the [openbmb/RLAIF-V-Dataset](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset) dataset. |
|
||||
| [`examples/scripts/evals/judge_tldr.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/evals/judge_tldr.py) | This script shows how to use [`HfPairwiseJudge`] or [`OpenAIPairwiseJudge`] to judge model generations. |
|
||||
| [`examples/scripts/gkd.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/gkd.py) | This script shows how to use the [`GKDTrainer`] to fine-tune a model. |
|
||||
| [`trl/scripts/grpo.py`](https://github.com/huggingface/trl/blob/main/trl/scripts/grpo.py) | This script shows how to use the [`GRPOTrainer`] to fine-tune a model. |
|
||||
| [`examples/scripts/grpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/grpo_vlm.py) | This script shows how to use the [`GRPOTrainer`] to fine-tune a multimodal model for reasoning using the [lmms-lab/multimodal-open-r1-8k-verified](https://huggingface.co/datasets/lmms-lab/multimodal-open-r1-8k-verified) dataset. |
|
||||
| [`examples/scripts/gspo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/gspo.py) | This script shows how to use GSPO via the [`GRPOTrainer`] to fine-tune a model for reasoning using the [AI-MO/NuminaMath-TIR](https://huggingface.co/datasets/AI-MO/NuminaMath-TIR) dataset. |
|
||||
| [`examples/scripts/gspo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/gspo_vlm.py) | This script shows how to use GSPO via the [`GRPOTrainer`] to fine-tune a multimodal model for reasoning using the [lmms-lab/multimodal-open-r1-8k-verified](https://huggingface.co/datasets/lmms-lab/multimodal-open-r1-8k-verified) dataset. |
|
||||
| [`examples/scripts/kto.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/kto.py) | This script shows how to use the [`KTOTrainer`] to fine-tune a model. |
|
||||
| [`examples/scripts/mpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/mpo_vlm.py) | This script shows how to use MPO via the [`DPOTrainer`] to align a model based on preferences using the [HuggingFaceH4/rlaif-v_formatted](https://huggingface.co/datasets/HuggingFaceH4/rlaif-v_formatted) dataset and a set of loss types with corresponding weights. |
|
||||
| [`examples/scripts/nash_md.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/nash_md.py) | This script shows how to use the [`NashMDTrainer`] to fine-tune a model. |
|
||||
| [`examples/scripts/orpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/orpo.py) | This script shows how to use the [`ORPOTrainer`] to fine-tune a model to increase helpfulness and harmlessness using the [Anthropic/hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf) dataset. |
|
||||
| [`examples/scripts/ppo/ppo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo/ppo.py) | This script shows how to use the [`PPOTrainer`] to fine-tune a model to improve its ability to continue text with positive sentiment or physically descriptive language. |
|
||||
| [`examples/scripts/ppo/ppo_tldr.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo/ppo_tldr.py) | This script shows how to use the [`PPOTrainer`] to fine-tune a model to improve its ability to generate TL;DR summaries. |
|
||||
| [`examples/scripts/prm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/prm.py) | This script shows how to use the [`PRMTrainer`] to fine-tune a Process-supervised Reward Model (PRM). |
|
||||
| [`examples/scripts/reward_modeling.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/reward_modeling.py) | This script shows how to use the [`RewardTrainer`] to train an Outcome Reward Model (ORM) on your own dataset. |
|
||||
| [`examples/scripts/rloo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/rloo.py) | This script shows how to use the [`RLOOTrainer`] to fine-tune a model to improve its ability to solve math questions. |
|
||||
| [`trl/scripts/sft.py`](https://github.com/huggingface/trl/blob/main/trl/scripts/sft.py) | This script shows how to use the [`SFTTrainer`] to fine-tune a model. |
|
||||
| [`examples/scripts/sft_gemma3.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/sft_gemma3.py) | This script shows how to use the [`SFTTrainer`] to fine-tune a Gemma 3 model. |
|
||||
| [`examples/scripts/sft_video_llm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/sft_video_llm.py) | This script shows how to use the [`SFTTrainer`] to fine-tune a Video Language Model. |
|
||||
| [`examples/scripts/sft_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/sft_vlm.py) | This script shows how to use the [`SFTTrainer`] to fine-tune a Vision Language Model in a chat setting. The script has only been tested with [LLaVA 1.5](https://huggingface.co/llava-hf/llava-1.5-7b-hf), [LLaVA 1.6](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf), and [Llama-3.2-11B-Vision-Instruct](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct) models so users may see unexpected behaviour in other model architectures. |
|
||||
| [`examples/scripts/sft_vlm_gemma3.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/sft_vlm_gemma3.py) | This script shows how to use the [`SFTTrainer`] to fine-tune a Gemma 3 model on vision to text tasks. |
|
||||
| [`examples/scripts/sft_vlm_smol_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/sft_vlm_smol_vlm.py) | This script shows how to use the [`SFTTrainer`] to fine-tune a SmolVLM model. |
|
||||
| [`examples/scripts/xpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/xpo.py) | This script shows how to use the [`XPOTrainer`] to fine-tune a model. |
|
||||
|
||||
Here are also some easier-to-run Colab notebooks that you can use to get started with TRL:
|
||||
|
||||
| File | Description |
|
||||
| --------------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------- |
|
||||
| [`examples/notebooks/best_of_n.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/best_of_n.ipynb) | This notebook demonstrates how to use the "Best of N" sampling strategy using TRL when fine-tuning your model with PPO. |
|
||||
| [`examples/notebooks/gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb) | This notebook demonstrates how to reproduce the GPT2 imdb sentiment tuning example on a jupyter notebook. |
|
||||
| [`examples/notebooks/gpt2-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-control.ipynb) | This notebook demonstrates how to reproduce the GPT2 sentiment control example on a jupyter notebook. |
|
||||
| File | Description |
|
||||
| --- | --- |
|
||||
| [`examples/notebooks/best_of_n.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/best_of_n.ipynb) | This notebook demonstrates how to use the "Best of N" sampling strategy using TRL when fine-tuning your model with PPO. |
|
||||
| [`examples/notebooks/gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb) | This notebook demonstrates how to reproduce the GPT2 imdb sentiment tuning example on a jupyter notebook. |
|
||||
| [`examples/notebooks/gpt2-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-control.ipynb) | This notebook demonstrates how to reproduce the GPT2 sentiment control example on a jupyter notebook. |
|
||||
|
||||
|
||||
We also have some other examples that are less maintained but can be used as a reference:
|
||||
@ -61,7 +80,7 @@ We also have some other examples that are less maintained but can be used as a r
|
||||
|
||||
## Distributed training
|
||||
|
||||
All of the scripts can be run on multiple GPUs by providing the path of an 🤗 Accelerate config file when calling `accelerate launch`. To launch one of them on one or multiple GPUs, run the following command (swapping `{NUM_GPUS}` with the number of GPUs in your machine and `--all_arguments_of_the_script` with your arguments.)
|
||||
All the scripts can be run on multiple GPUs by providing the path of an 🤗 Accelerate config file when calling `accelerate launch`. To launch one of them on one or multiple GPUs, run the following command (swapping `{NUM_GPUS}` with the number of GPUs in your machine and `--all_arguments_of_the_script` with your arguments).
|
||||
|
||||
```shell
|
||||
accelerate launch --config_file=examples/accelerate_configs/multi_gpu.yaml --num_processes {NUM_GPUS} path_to_script.py --all_arguments_of_the_script
|
||||
@ -71,7 +90,7 @@ You can also adjust the parameters of the 🤗 Accelerate config file to suit yo
|
||||
|
||||
### Distributed training with DeepSpeed
|
||||
|
||||
Most of the scripts can be run on multiple GPUs together with DeepSpeed ZeRO-{1,2,3} for efficient sharding of the optimizer states, gradients, and model weights. To do so, run following command (swapping `{NUM_GPUS}` with the number of GPUs in your machine, `--all_arguments_of_the_script` with your arguments, and `--deepspeed_config` with the path to the DeepSpeed config file such as `examples/deepspeed_configs/deepspeed_zero1.yaml`):
|
||||
Most of the scripts can be run on multiple GPUs together with DeepSpeed ZeRO-{1,2,3} for efficient sharding of the optimizer states, gradients, and model weights. To do so, run the following command (swapping `{NUM_GPUS}` with the number of GPUs in your machine, `--all_arguments_of_the_script` with your arguments, and `--deepspeed_config` with the path to the DeepSpeed config file such as `examples/deepspeed_configs/deepspeed_zero1.yaml`):
|
||||
|
||||
```shell
|
||||
accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero{1,2,3}.yaml --num_processes {NUM_GPUS} path_to_script.py --all_arguments_of_the_script
|
||||
|
@ -88,10 +88,12 @@ The dataset should be formatted as a list of "messages" where each message is a
|
||||
* `role`: either `system`, `assistant` or `user`
|
||||
* `content`: the message content
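For example, a single training example in this format might look like:

```python
# Sketch of one training example in the expected "messages" format
example = {
    "messages": [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "What color is the sky?"},
        {"role": "assistant", "content": "The sky is blue."},
    ]
}
```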
|
||||
|
||||
|
||||
## GKDTrainer
|
||||
|
||||
[[autodoc]] GKDTrainer
|
||||
- train
|
||||
- save_model
|
||||
- push_to_hub
|
||||
|
||||
## GKDConfig
|
||||
|
||||
|
@ -14,10 +14,10 @@ This post-training method was contributed by [Quentin Gallouédec](https://huggi
|
||||
|
||||
## Quick start
|
||||
|
||||
This example demonstrates how to train a model using the GRPO method. We train a [Qwen 0.5B Instruct model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) with the prompts from the [TLDR dataset](https://huggingface.co/datasets/trl-lib/tldr) (completion column is ingored!). You can view the data in the dataset here:
|
||||
This example demonstrates how to train a model using the GRPO method. We train a [Qwen 0.5B Instruct model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) with the prompts from the [UltraFeedback prompts dataset](https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt). You can view the data in the dataset here:
|
||||
|
||||
<iframe
|
||||
src="https://huggingface.co/datasets/trl-lib/tldr/embed/viewer/default/train?row=0"
|
||||
src="https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt/embed/viewer/default/train?row=0"
|
||||
frameborder="0"
|
||||
width="100%"
|
||||
height="560px"
|
||||
@ -30,16 +30,18 @@ Below is the script to train the model.
|
||||
from datasets import load_dataset
|
||||
from trl import GRPOConfig, GRPOTrainer
|
||||
|
||||
dataset = load_dataset("trl-lib/tldr", split="train")
|
||||
dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train")
|
||||
|
||||
# Define the reward function, which rewards completions that are close to 20 characters
|
||||
def reward_len(completions, **kwargs):
|
||||
return [abs(20 - len(completion)) for completion in completions]
|
||||
# Dummy reward function for demonstration purposes
|
||||
def reward_num_unique_letters(completions, **kwargs):
|
||||
"""Reward function that rewards completions with more unique letters."""
|
||||
completion_contents = [completion[0]["content"] for completion in completions]
|
||||
return [float(len(set(content))) for content in completion_contents]
|
||||
|
||||
training_args = GRPOConfig(output_dir="Qwen2-0.5B-GRPO", logging_steps=10)
|
||||
training_args = GRPOConfig(output_dir="Qwen2-0.5B-GRPO")
|
||||
trainer = GRPOTrainer(
|
||||
model="Qwen/Qwen2-0.5B-Instruct",
|
||||
reward_funcs=reward_len,
|
||||
reward_funcs=reward_num_unique_letters,
|
||||
args=training_args,
|
||||
train_dataset=dataset,
|
||||
)
|
||||
@ -68,11 +70,23 @@ At each training step, we sample a batch of prompts and generate a set of \\( G
|
||||
|
||||
### Computing the advantage
|
||||
|
||||
For each of the \\( G \\) sequences, we compute the reward using a reward model. To align with the comparative nature of reward models—typically trained on datasets of comparisons between outputs for the same question—the advantage is calculated to reflect these relative comparisons. It is normalized as follows:
|
||||
For each of the \\( G \\) sequences, we compute the reward using a reward model or reward function. To align with the comparative nature of reward models—typically trained on datasets of comparisons between outputs for the same question—the advantage is calculated to reflect these relative comparisons. It is normalized as follows:
|
||||
|
||||
$$\hat{A}_{i,t} = \frac{r_i - \text{mean}(\mathbf{r})}{\text{std}(\mathbf{r})}$$
|
||||
$$\hat{A}_{i,t} = \frac{r_i - \text{mean}(\mathbf{r})}{\text{std}(\mathbf{r})}$$
|
||||
|
||||
This approach gives the method its name: **Group Relative Policy Optimization (GRPO)**.
|
||||
This approach gives the method its name: **Group Relative Policy Optimization (GRPO)**.
|
||||
|
||||
<Tip>
|
||||
|
||||
It was shown in the paper [Understanding R1-Zero-Like Training: A Critical Perspective](https://huggingface.co/papers/2503.20783) that scaling by \\( \text{std}(\mathbf{r}) \\) may cause a question-level difficulty bias. You can disable this scaling by setting `scale_rewards=False` in [`GRPOConfig`].
|
||||
|
||||
</Tip>
|
||||
|
||||
<Tip>
|
||||
|
||||
[Part I: Tricks or Traps? A Deep Dive into RL for LLM Reasoning (Lite PPO)](https://huggingface.co/papers/2508.08221) showed that calculating the mean at the local (group) level and the standard deviation at the global (batch) level enables more robust reward shaping. You can use this scaling strategy by setting `scale_rewards="batch"` in [`GRPOConfig`].
|
||||
|
||||
</Tip>
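As a rough NumPy sketch of the normalization above and of the scaling variants mentioned in these tips (the reward values, group size, and the small epsilon are illustrative assumptions, not the trainer's internal code):

```python
import numpy as np

# Rewards for one generation batch: 2 prompts x G=4 completions each (made-up values).
rewards = np.array([[0.1, 0.7, 0.3, 0.9],
                    [0.0, 0.0, 1.0, 1.0]])

group_mean = rewards.mean(axis=1, keepdims=True)  # mean(r) within each group
group_std = rewards.std(axis=1, keepdims=True)    # std(r) within each group
batch_std = rewards.std()                         # std(r) over the whole batch

adv_group = (rewards - group_mean) / (group_std + 1e-4)  # default scaling (per-group std)
adv_none = rewards - group_mean                          # scale_rewards=False: no std scaling
adv_batch = (rewards - group_mean) / (batch_std + 1e-4)  # scale_rewards="batch": batch-level std
```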
|
||||
|
||||
### Estimating the KL divergence
|
||||
|
||||
@ -83,46 +97,244 @@ $$
|
||||
|
||||
### Computing the loss
|
||||
|
||||
The objective is to maximize the advantage while ensuring that the model remains close to the reference policy. Consequently, the loss is defined as follows:
|
||||
The objective is to maximize the advantage while ensuring that the model remains close to the reference policy. Consequently, the loss is defined as follows:
|
||||
|
||||
$$
|
||||
\mathcal{L}_{\text{GRPO}}(\theta) = -\frac{1}{G} \sum_{i=1}^G \frac{1}{|o_i|} \sum_{t=1}^{|o_i|} \left[ \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\left[\pi_\theta(o_{i,t} \mid q, o_{i,< t})\right]_{\text{no grad}}} \hat{A}_{i,t} - \beta \mathbb{D}_{\text{KL}}\left[\pi_\theta \| \pi_{\text{ref}}\right] \right],
|
||||
\mathcal{L}_{\text{GRPO}}(\theta) = -\frac{1}{\sum_{i=1}^G |o_i|} \sum_{i=1}^G \sum_{t=1}^{|o_i|} \left[ \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\left[\pi_\theta(o_{i,t} \mid q, o_{i,< t})\right]_{\text{no grad}}} \hat{A}_{i,t} - \beta \mathbb{D}_{\text{KL}}\left[\pi_\theta \| \pi_{\text{ref}}\right] \right],
|
||||
$$
|
||||
|
||||
where the first term represents the scaled advantage and the second term penalizes deviations from the reference policy through KL divergence.
|
||||
where the first term represents the scaled advantage and the second term penalizes deviations from the reference policy through KL divergence.
|
||||
|
||||
In the original paper, this formulation is generalized to account for multiple updates after each generation by leveraging the **clipped surrogate objective**:
|
||||
<Tip>
|
||||
|
||||
Note that compared to the original formulation in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300), we don't scale by \\( \frac{1}{|o_i|} \\) because it was shown in the paper [Understanding R1-Zero-Like Training: A Critical Perspective](https://huggingface.co/papers/2503.20783) that this introduces a response-level length bias. More details in [loss types](#loss-types).
|
||||
|
||||
</Tip>
|
||||
|
||||
<Tip>
|
||||
|
||||
Note that compared to the original formulation in [DeepSeekMath: Pushing the Limits of Mathematical Reasoning in Open Language Models](https://huggingface.co/papers/2402.03300), we use \\( \beta = 0.0 \\) by default, meaning that the KL divergence term is not used. This choice is motivated by several recent studies (e.g., [Open-Reasoner-Zero: An Open Source Approach to Scaling Up Reinforcement Learning on the Base Model](https://huggingface.co/papers/2503.24290)) which have shown that the KL divergence term is not essential for training with GRPO. As a result, it has become common practice to exclude it (e.g. [Understanding R1-Zero-Like Training: A Critical Perspective](https://huggingface.co/papers/2503.20783), [DAPO: An Open-Source LLM Reinforcement Learning System at Scale](https://huggingface.co/papers/2503.14476)). If you wish to include the KL divergence term, you can set `beta` in [`GRPOConfig`] to a non-zero value.
|
||||
|
||||
</Tip>
|
||||
|
||||
In the original paper, this formulation is generalized to account for multiple updates after each generation (denoted \\( \mu \\), can be set with `num_iterations` in [`GRPOConfig`]) by leveraging the **clipped surrogate objective**:
|
||||
|
||||
$$
|
||||
\mathcal{L}_{\text{GRPO}}(\theta) = - \frac{1}{G} \sum_{i=1}^G \frac{1}{|o_i|} \sum_{t=1}^{|o_i|} \left[ \min \left( \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})} \hat{A}_{i,t}, \, \text{clip}\left( \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})}, 1 - \epsilon, 1 + \epsilon \right) \hat{A}_{i,t} \right) - \beta \mathbb{D}_{\text{KL}}\left[\pi_\theta \| \pi_{\text{ref}}\right] \right],
|
||||
\mathcal{L}_{\text{GRPO}}(\theta) = - \frac{1}{\sum_{i=1}^G |o_i|} \sum_{i=1}^G \sum_{t=1}^{|o_i|} \left[ \min \left( \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})} \hat{A}_{i,t}, \, \text{clip}\left( \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})}, 1 - \epsilon, 1 + \epsilon \right) \hat{A}_{i,t} \right) - \beta \mathbb{D}_{\text{KL}}\left[\pi_\theta \| \pi_{\text{ref}}\right] \right],
|
||||
$$
|
||||
|
||||
where \\(\text{clip}(\cdot, 1 - \epsilon, 1 + \epsilon) \\) ensures that updates do not deviate excessively from the reference policy by bounding the policy ratio between \\( 1 - \epsilon \\) and \\( 1 + \epsilon \\).
|
||||
In TRL though, as in the original paper, we only do one update per generation, so we can simplify the loss to the first form.
|
||||
When \\( \mu = 1 \\) (default in TRL), the clipped surrogate objective simplifies to the original objective.
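The per-token term inside the sum can be sketched in PyTorch as follows. This is only an illustration of the formula, assuming per-token log-probabilities of the sampled tokens are available; it is not TRL's implementation.

```python
import torch

def clipped_surrogate(logp, old_logp, advantages, epsilon=0.2):
    """Per-token clipped surrogate term (to be maximized), mirroring the objective above."""
    ratio = torch.exp(logp - old_logp)                       # pi_theta / pi_theta_old
    unclipped = ratio * advantages
    clipped = torch.clamp(ratio, 1 - epsilon, 1 + epsilon) * advantages
    return torch.minimum(unclipped, clipped)                 # pessimistic (min) of the two terms
```

When only one update is performed per generation, `logp` equals `old_logp`, the ratio is 1, and the term reduces to the advantage, which is the simplification described above.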
|
||||
|
||||
#### Loss Types
|
||||
|
||||
Several formulations of the objective have been proposed in the literature. Initially, the objective of GRPO was defined as follows:
|
||||
|
||||
$$
|
||||
\mathcal{L}_{\text{GRPO}}(\theta) = - \frac{1}{G} \sum_{i=1}^G \frac{1}{|o_i|} \sum_{t=1}^{|o_i|} l_{i,t},
|
||||
$$
|
||||
|
||||
where
|
||||
|
||||
$$
|
||||
l_{i,t} = \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\left[\pi_\theta(o_{i,t} \mid q, o_{i,< t})\right]_{\text{no grad}}} \hat{A}_{i,t} - \beta \mathbb{D}_{\text{KL}}\left[\pi_\theta \| \pi_{\text{ref}}\right].
|
||||
$$
|
||||
|
||||
The [DAPO paper](https://huggingface.co/papers/2503.14476) highlights the limitations of the GRPO algorithm’s sample-level loss in long-CoT scenarios, where longer responses are under-penalized, leading to poorer quality outputs. The proposed solution is a token-level normalization, which better handles longer sequences by assigning more balanced rewards to individual tokens, regardless of response length:
|
||||
|
||||
$$
|
||||
\mathcal{L}_{\text{DAPO}}(\theta) = - \frac{1}{\sum_{i=1}^G |o_i|} \sum_{i=1}^G \sum_{t=1}^{|o_i|} l_{i,t},
|
||||
$$
|
||||
|
||||
To use this formulation, set `loss_type="dapo"` in [`GRPOConfig`].
|
||||
|
||||
Furthermore, it was demonstrated in the paper [Understanding R1-Zero-Like Training: A Critical Perspective](https://huggingface.co/papers/2503.20783) that the initial GRPO formulation introduces a response length bias. They show that while the DAPO formulation reduces this bias, it does not eliminate it completely. To fully remove this bias, they propose dividing by a constant instead of the sequence length, resulting in the following formulation:
|
||||
|
||||
$$
|
||||
\mathcal{L}_{\text{Dr. GRPO}}(\theta) = - \frac{1}{LG} \sum_{i=1}^G \sum_{t=1}^{|o_i|} l_{i,t},
|
||||
$$
|
||||
|
||||
This constant is recommended to be the maximum completion length. To use this formulation, set `loss_type="dr_grpo"` in the [`GRPOConfig`].
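To make the difference between the three normalizations concrete, here is a minimal PyTorch sketch that aggregates a matrix of per-token terms \\( l_{i,t} \\) with a padding mask; it is a sketch of the formulas above, not the trainer's internal code.

```python
import torch

def aggregate_loss(per_token_loss, mask, loss_type="grpo", max_completion_length=None):
    """Aggregate per-token terms l_{i,t} (shape [G, T]) into a scalar, including the leading minus sign."""
    if loss_type == "grpo":        # average of per-sequence means: 1/G * sum_i (1/|o_i|) * sum_t l_{i,t}
        per_sequence = (per_token_loss * mask).sum(-1) / mask.sum(-1)
        loss = per_sequence.mean()
    elif loss_type == "dapo":      # token-level normalization: sum over all tokens / total token count
        loss = (per_token_loss * mask).sum() / mask.sum()
    elif loss_type == "dr_grpo":   # divide by a constant L (e.g. the max completion length) and G
        num_sequences = per_token_loss.size(0)
        loss = (per_token_loss * mask).sum() / (num_sequences * max_completion_length)
    else:
        raise ValueError(f"Unknown loss_type: {loss_type}")
    return -loss
```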
|
||||
|
||||
## Logged metrics
|
||||
|
||||
The GRPO Trainer logs the following metrics:
|
||||
While training and evaluating, we record the following reward metrics:
|
||||
|
||||
- `completion_length`: The average completion length.
|
||||
- `reward/{reward_func_name}`: The reward computed by each reward function.
|
||||
- `reward`: The average reward.
|
||||
- `reward_std` : The average standard deviation within reward groups.
|
||||
- `kl` : The average KL divergence between the model and the reference model calculated on completions.
|
||||
- `num_tokens`: The total number of tokens processed so far, including both prompts and completions.
|
||||
- `completions/mean_length`: The average length of generated completions.
|
||||
- `completions/min_length`: The minimum length of generated completions.
|
||||
- `completions/max_length`: The maximum length of generated completions.
|
||||
- `completions/mean_terminated_length`: The average length of generated completions that terminate with EOS.
|
||||
- `completions/min_terminated_length`: The minimum length of generated completions that terminate with EOS.
|
||||
- `completions/max_terminated_length`: The maximum length of generated completions that terminate with EOS.
|
||||
- `completions/clipped_ratio` : The ratio of truncated (clipped) completions.
|
||||
- `reward/{reward_func_name}/mean`: The average reward from a specific reward function.
|
||||
- `reward/{reward_func_name}/std`: The standard deviation of the reward from a specific reward function.
|
||||
- `reward`: The overall average reward after applying reward weights.
|
||||
- `reward_std`: The standard deviation of rewards after applying reward weights.
|
||||
- If `scale_rewards` is `"group"` or `"none"`, this is the average of the per-group standard deviations.
|
||||
- If `scale_rewards` is `"batch"`, this is the standard deviation computed over all rewards in the batch (ignoring groups).
|
||||
- `frac_reward_zero_std`: The fraction of samples in the generation batch with a reward std of zero, implying there is little diversity for that prompt (all answers are correct or all are incorrect).
|
||||
- `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, tokens from masked sequences are excluded.)
|
||||
- `kl`: The average KL divergence between the model and the reference model, calculated over generated completions. Logged only if `beta` is nonzero.
|
||||
- `clip_ratio/region_mean`: The ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities where the GRPO objective is clipped to stay within the trust region:
|
||||
$$
|
||||
\text{clip}\left( r_{i,t}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \qquad r_{i,t}(\theta) = \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})}\,.
|
||||
$$
|
||||
A higher value means more tokens are clipped, which constrains how much the policy \\( \pi_\theta \\) can change.
|
||||
- `clip_ratio/low_mean`: The average ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the lower bound of the trust region: \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\)
|
||||
- `clip_ratio/low_min`: The minimum ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the lower bound of the trust region: \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\)
|
||||
- `clip_ratio/high_mean`: The average ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the upper bound of the trust region: \\(r_{i,t}(\theta) > 1 + \epsilon_\mathrm{high}\\)
|
||||
- `clip_ratio/high_max`: The maximum ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the upper bound of the trust region: \\(r_{i,t}(\theta) > 1 + \epsilon_\mathrm{high}\\).
|
||||
|
||||
## Customization
|
||||
|
||||
## Speed up training with vLLM-powered generation
|
||||
### Speed up training with vLLM-powered generation
|
||||
|
||||
Generation is often the main bottleneck that makes training slow with online methods. To accelerate generation, you can use [vLLM](https://github.com/vllm-project/vllm), a library that enables fast generation. To enable it, pass `use_vllm=True` in the training arguments.
|
||||
Generation is often the main bottleneck when training with online methods. To accelerate generation, you can use [vLLM](https://github.com/vllm-project/vllm), a high-throughput, low-latency inference engine for LLMs. To enable it, first install the package with
|
||||
```shell
|
||||
pip install trl[vllm]
|
||||
```
|
||||
|
||||
We support two ways of using vLLM during training: **server mode** and **colocate mode**.
|
||||
|
||||
#### 🔌 Option 1: Server mode
|
||||
|
||||
In this mode, vLLM runs in a separate process (and using separate GPUs) and communicates with the trainer via HTTP. This is ideal if you have dedicated GPUs for inference.
|
||||
|
||||
1. **Start the vLLM server**:
|
||||
```bash
|
||||
trl vllm-serve --model <model_name>
|
||||
```
|
||||
|
||||
2. **Enable server mode in your training script**:
|
||||
```python
|
||||
from trl import GRPOConfig
|
||||
|
||||
training_args = GRPOConfig(
|
||||
...,
|
||||
use_vllm=True,
|
||||
vllm_mode="server", # default value, can be omitted
|
||||
)
|
||||
```
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Make sure that the server is using different GPUs than the trainer, otherwise you may run into NCCL errors. You can specify the GPUs to use with the `CUDA_VISIBLE_DEVICES` environment variable.
|
||||
|
||||
</Tip>
|
||||
|
||||
#### 🧩 Option 2: Colocate mode
|
||||
|
||||
In this mode, vLLM runs inside the trainer process and shares GPU memory with the training model. This avoids launching a separate server and can improve GPU utilization, but may lead to memory contention on the training GPUs.
|
||||
|
||||
```python
|
||||
from trl import GRPOConfig
|
||||
|
||||
training_args = GRPOConfig(..., use_vllm=True)
|
||||
```
|
||||
training_args = GRPOConfig(
|
||||
...,
|
||||
use_vllm=True,
|
||||
vllm_mode="colocate",
|
||||
)
|
||||
```
|
||||
|
||||
For more information, see [Speeding up training with vLLM](speeding_up_training#vllm-for-fast-generation-in-online-methods).
|
||||
<Tip>
|
||||
|
||||
Depending on the model size and the overall GPU memory requirements for training, you may need to adjust the `vllm_gpu_memory_utilization` parameter in [`GRPOConfig`] to avoid underutilization or out-of-memory errors.
|
||||
|
||||
We provide a [HF Space](https://huggingface.co/spaces/trl-lib/recommend-vllm-memory) to help estimate the recommended GPU memory utilization based on your model configuration and experiment settings. Simply use it as follows to get a `vllm_gpu_memory_utilization` recommendation:
|
||||
|
||||
<iframe
|
||||
src="https://trl-lib-recommend-vllm-memory.hf.space"
|
||||
frameborder="0"
|
||||
width="850"
|
||||
height="450"
|
||||
></iframe>
|
||||
|
||||
If the recommended value does not work in your environment, we suggest adding a small buffer (e.g., +0.05 or +0.1) to the recommended value to ensure stability.
|
||||
|
||||
</Tip>
|
||||
|
||||
<Tip>
|
||||
|
||||
By default, GRPO uses `MASTER_ADDR=localhost` and `MASTER_PORT=12345` for vLLM, but you can override these values by setting the environment variables accordingly.
|
||||
|
||||
</Tip>
|
||||
|
||||
For more information, see [Speeding up training with vLLM](speeding_up_training#vllm-for-fast-generation-in-online-methods).
|
||||
|
||||
### GRPO at scale: train a 70B+ Model on multiple nodes
|
||||
|
||||
When training large models like **Qwen2.5-72B**, you need several key optimizations to make the training efficient and scalable across multiple GPUs and nodes. These include:
|
||||
|
||||
- **DeepSpeed ZeRO Stage 3**: ZeRO leverages data parallelism to distribute model states (weights, gradients, optimizer states) across multiple GPUs and CPUs, reducing memory and compute requirements on each device. Since large models cannot fit on a single GPU, using ZeRO Stage 3 is required for training such models. For more details, see [DeepSpeed Integration](deepspeed_integration).
|
||||
- **Accelerate**: Accelerate is a library that simplifies distributed training across multiple GPUs and nodes. It provides a simple API to launch distributed training and handles the complexities of distributed training, such as data parallelism, gradient accumulation, and distributed data loading. For more details, see [Distributing Training](distributing_training).
|
||||
- **vLLM**: See the previous section on how to use vLLM to speed up generation.
|
||||
|
||||
Below is an example SLURM script to train a 70B model with GRPO on multiple nodes. This script trains a model on 4 nodes and uses the 5th node for vLLM-powered generation.
|
||||
|
||||
```sh
|
||||
#!/bin/bash
|
||||
#SBATCH --nodes=5
|
||||
#SBATCH --gres=gpu:8
|
||||
|
||||
# Get the list of allocated nodes
|
||||
NODELIST=($(scontrol show hostnames $SLURM_JOB_NODELIST))
|
||||
|
||||
# Assign the first 4 nodes for training and the 5th node for vLLM
|
||||
TRAIN_NODES="${NODELIST[@]:0:4}" # Nodes 0, 1, 2, 3 for training
|
||||
VLLM_NODE="${NODELIST[4]}" # Node 4 for vLLM
|
||||
|
||||
# Run training on the first 4 nodes (Group 1)
|
||||
srun --nodes=4 --ntasks=4 --nodelist="${NODELIST[@]:0:4}" accelerate launch \
|
||||
--config_file examples/accelerate_configs/deepspeed_zero3.yaml \
|
||||
--num_processes 32 \
|
||||
--num_machines 4 \
|
||||
--main_process_ip ${NODELIST[0]} \
|
||||
--machine_rank $SLURM_PROCID \
|
||||
--rdzv_backend c10d \
|
||||
train_grpo.py \
|
||||
--vllm_server_host $VLLM_NODE &
|
||||
|
||||
# Run vLLM server on the 5th node (Group 2)
|
||||
srun --nodes=1 --ntasks=1 --nodelist="${NODELIST[4]}" trl vllm-serve --model Qwen/Qwen2.5-72B --tensor_parallel_size 8 &
|
||||
|
||||
wait
|
||||
```
|
||||
|
||||
```python
|
||||
import argparse
|
||||
|
||||
from datasets import load_dataset
|
||||
from trl import GRPOTrainer, GRPOConfig
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--vllm_server_host", type=str, default="", help="The server IP")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Example dataset from TLDR
|
||||
dataset = load_dataset("trl-lib/tldr", split="train")
|
||||
|
||||
# Dummy reward function: count the number of unique characters in the completions
|
||||
def reward_num_unique_chars(completions, **kwargs):
|
||||
return [len(set(c)) for c in completions]
|
||||
|
||||
training_args = GRPOConfig(
|
||||
output_dir="Qwen2.5-72B-GRPO",
|
||||
per_device_train_batch_size=4,
|
||||
bf16=True,
|
||||
gradient_checkpointing=True,
|
||||
use_vllm=True,
|
||||
vllm_server_host=args.vllm_server_host.replace("ip-", "").replace("-", "."), # from ip-X-X-X-X to X.X.X.X
|
||||
)
|
||||
|
||||
trainer = GRPOTrainer(model="Qwen/Qwen2.5-72B", args=training_args, reward_funcs=reward_num_unique_chars, train_dataset=dataset)
|
||||
trainer.train()
|
||||
|
||||
if __name__=="__main__":
|
||||
main()
|
||||
```
|
||||
|
||||
### Using a custom reward function
|
||||
|
||||
@ -132,6 +344,8 @@ The [`GRPOTrainer`] supports using custom reward functions instead of dense rewa
|
||||
- The function must accept the following as keyword arguments:
|
||||
- `prompts` (contains the prompts),
|
||||
- `completions` (contains the generated completions),
|
||||
- `completions_ids` (contains the tokenized completions),
|
||||
- `trainer_state` ([`~transformers.TrainerState`]): The current state of the trainer. This can be used to implement dynamic reward functions, such as curriculum learning, where the reward is adjusted based on the training progress.
|
||||
- All column names (except `prompt`) that the dataset may have. For example, if the dataset contains a column named `ground_truth`, the function will be called with `ground_truth` as a keyword argument.
|
||||
|
||||
The easiest way to comply with this requirement is to use `**kwargs` in the function signature.
|
||||
@ -145,9 +359,29 @@ The [`GRPOTrainer`] supports using custom reward functions instead of dense rewa
|
||||
|
||||
Below is an example of a reward function for a standard format that rewards longer completions:
|
||||
|
||||
```python
|
||||
def reward_func(completions_ids, **kwargs):
|
||||
"""Reward function that assigns higher scores to longer completions (in terms of token count)."""
|
||||
return [float(len(ids)) for ids in completions_ids]
|
||||
```
|
||||
|
||||
You can test it as follows:
|
||||
|
||||
```python
|
||||
>>> prompts = ["The sky is", "The sun is"] # not used in the reward function, but the trainer will pass it
|
||||
>>> completions = [" blue.", " in the sky."] # not used in the reward function, but the trainer will pass it
|
||||
>>> completions_ids = [[6303, 13], [304, 279, 12884, 13]]
|
||||
>>> reward_func(prompts=prompts, completions=completions, completions_ids=completions_ids)
|
||||
[2.0, 4.0]
|
||||
```
|
||||
|
||||
#### Example 1.1: Reward longer completions (based on the number of characters)
|
||||
|
||||
Same as the previous example, but this time the reward function is based on the number of characters instead of tokens.
|
||||
|
||||
```python
|
||||
def reward_func(completions, **kwargs):
|
||||
"""Reward function that gives higher scores to longer completions."""
|
||||
"""Reward function that assigns higher scores to longer completions (in terms of character count)."""
|
||||
return [float(len(completion)) for completion in completions]
|
||||
```
|
||||
|
||||
@ -156,7 +390,8 @@ You can test it as follows:
|
||||
```python
|
||||
>>> prompts = ["The sky is", "The sun is"]
|
||||
>>> completions = [" blue.", " in the sky."]
|
||||
>>> print(reward_func(prompts=prompts, completions=completions))
|
||||
>>> completions_ids = [[6303, 13], [304, 279, 12884, 13]] # not used in the reward function, but the trainer will pass it
|
||||
>>> reward_func(prompts=prompts, completions=completions, completions_ids=completions_ids)
|
||||
[6.0, 12.0]
|
||||
```
|
||||
|
||||
@ -193,7 +428,7 @@ You can test this function as follows:
|
||||
|
||||
#### Example 3: Reward completions based on a reference
|
||||
|
||||
Below is an example of a reward function that checks if the is correct. This example is inspired by the _accuracy reward_ function used in the paper [DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning](https://huggingface.co/papers/2501.12948).
|
||||
Below is an example of a reward function that checks if the completion is correct. This example is inspired by the _accuracy reward_ function used in the paper [DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning](https://huggingface.co/papers/2501.12948).
|
||||
This example is designed for [standard format](dataset_formats#standard), where the dataset contains a column named `ground_truth`.
|
||||
|
||||
```python
|
||||
@ -216,10 +451,71 @@ You can test this function as follows:
|
||||
>>> reward_func(prompts=prompts, completions=completions, ground_truth=ground_truth)
|
||||
[1.0, 0.0]
|
||||
```
|
||||
#### Example 4: Multi-task reward functions
|
||||
|
||||
Below is an example of using multiple reward functions in the [`GRPOTrainer`]. In this example, we define two task-specific reward functions: `math_reward_func` and `coding_reward_func`. The `math_reward_func` rewards math problems based on their correctness, while the `coding_reward_func` rewards coding problems based on whether the solution works.
|
||||
|
||||
```python
|
||||
from datasets import Dataset
|
||||
from trl import GRPOTrainer
|
||||
|
||||
# Define a dataset that contains both math and coding problems
|
||||
dataset = Dataset.from_list(
|
||||
[
|
||||
{"prompt": "What is 2+2?", "task": "math"},
|
||||
{"prompt": "Write a function that returns the sum of two numbers.", "task": "code"},
|
||||
{"prompt": "What is 3*4?", "task": "math"},
|
||||
{"prompt": "Write a function that returns the product of two numbers.", "task": "code"},
|
||||
]
|
||||
)
|
||||
|
||||
# Math-specific reward function
|
||||
def math_reward_func(prompts, completions, task, **kwargs):
|
||||
rewards = []
|
||||
for prompt, completion, t in zip(prompts, completions, task):
|
||||
if t == "math":
|
||||
# Calculate math-specific reward
|
||||
correct = check_math_solution(prompt, completion)
|
||||
reward = 1.0 if correct else -1.0
|
||||
rewards.append(reward)
|
||||
else:
|
||||
# Return None for non-math tasks
|
||||
rewards.append(None)
|
||||
return rewards
|
||||
|
||||
# Coding-specific reward function
|
||||
def coding_reward_func(prompts, completions, task, **kwargs):
|
||||
rewards = []
|
||||
for prompt, completion, t in zip(prompts, completions, task):
|
||||
if t == "coding":
|
||||
# Calculate coding-specific reward
|
||||
works = test_code_solution(prompt, completion)
|
||||
reward = 1.0 if works else -1.0
|
||||
rewards.append(reward)
|
||||
else:
|
||||
# Return None for non-coding tasks
|
||||
rewards.append(None)
|
||||
return rewards
|
||||
|
||||
# Use both task-specific reward functions
|
||||
trainer = GRPOTrainer(
|
||||
model="Qwen/Qwen2-0.5B-Instruct",
|
||||
reward_funcs=[math_reward_func, coding_reward_func],
|
||||
train_dataset=dataset,
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
```
|
||||
|
||||
In this example, the `math_reward_func` and `coding_reward_func` are designed to work with a mixed dataset that contains both math and coding problems. The `task` column in the dataset is used to determine which reward function to apply to each problem. If there is no relevant reward function for a sample in the dataset, the reward function will return `None` and the [`GRPOTrainer`] will continue with the valid functions and tasks. This allows the [`GRPOTrainer`] to handle multiple reward functions with different applicability.
|
||||
|
||||
Note that the [`GRPOTrainer`] will ignore the `None` rewards returned by the reward functions and only consider the rewards returned by the relevant functions. This ensures that the model is trained on the relevant tasks and ignores the tasks for which there is no relevant reward function.
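As a small illustration of this behavior (a plain-Python sketch with made-up values, not the trainer's internal aggregation), rewards that do not apply to a sample can simply be skipped when combining:

```python
# Rewards returned by the two functions for four samples (None = reward function not applicable).
math_rewards = [1.0, None, -1.0, None]
coding_rewards = [None, 1.0, None, -1.0]

# Combine per sample, ignoring the None entries.
total_rewards = [
    sum(r for r in per_sample if r is not None)
    for per_sample in zip(math_rewards, coding_rewards)
]
print(total_rewards)  # [1.0, 1.0, -1.0, -1.0]
```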
|
||||
|
||||
|
||||
|
||||
#### Passing the reward function to the trainer
|
||||
|
||||
To use your custom reward function, pass it to the `GRPOTrainer` as follows:
|
||||
To use your custom reward function, pass it to the [`GRPOTrainer`] as follows:
|
||||
|
||||
```python
|
||||
from trl import GRPOTrainer
|
||||
@ -241,13 +537,76 @@ trainer = GRPOTrainer(
|
||||
)
|
||||
```
|
||||
|
||||
and the reward will be computed as the sum of the rewards from each function.
|
||||
and the reward will be computed as the sum of the rewards from each function, or the weighted sum if `reward_weights` is provided in the config.
|
||||
|
||||
Note that [`GRPOTrainer`] supports multiple reward functions of different types. See the parameters documentation for more details.
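For example, to weight the two rewards differently, you can pass `reward_weights` in the config (the weight values and function names below are placeholders):

```python
from trl import GRPOConfig, GRPOTrainer

training_args = GRPOConfig(
    output_dir="Qwen2-0.5B-GRPO",
    reward_weights=[0.7, 0.3],  # one weight per reward function, in the same order
)

trainer = GRPOTrainer(
    model="Qwen/Qwen2-0.5B-Instruct",
    reward_funcs=[reward_func1, reward_func2],  # defined elsewhere
    args=training_args,
    train_dataset=dataset,
)
```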
|
||||
|
||||
## Vision-Language Model (VLM) Training
|
||||
|
||||
GRPO supports training Vision-Language Models (VLMs) on multimodal datasets containing both text and images.
|
||||
|
||||
### Supported Models
|
||||
|
||||
Tested with:
|
||||
|
||||
- **Gemma3** — e.g., `google/gemma-3-4b-it`
|
||||
- **LLaVA-NeXT** — e.g., `llava-hf/llava-v1.6-mistral-7b-hf`
|
||||
- **Qwen2-VL** — e.g., `Qwen/Qwen2-VL-2B-Instruct`
|
||||
- **Qwen2.5-VL** — e.g., `Qwen/Qwen2.5-VL-3B-Instruct`
|
||||
- **SmolVLM2** — e.g., `HuggingFaceTB/SmolVLM2-2.2B-Instruct`
|
||||
|
||||
<Tip>
|
||||
Compatibility with all VLMs is not guaranteed. If you believe a model should be supported, feel free to open an issue on GitHub — or better yet, submit a pull request with the required changes.
|
||||
</Tip>
|
||||
|
||||
### Quick Start
|
||||
|
||||
Use [grpo\_vlm.py](https://github.com/huggingface/trl/blob/main/examples/scripts/grpo_vlm.py) to fine-tune a VLM. Example command for training on [`lmms-lab/multimodal-open-r1-8k-verified`](https://huggingface.co/datasets/lmms-lab/multimodal-open-r1-8k-verified):
|
||||
|
||||
```bash
|
||||
accelerate launch \
|
||||
--config_file=examples/accelerate_configs/deepspeed_zero3.yaml \
|
||||
examples/scripts/grpo_vlm.py \
|
||||
--model_name_or_path Qwen/Qwen2.5-VL-3B-Instruct \
|
||||
--output_dir grpo-Qwen2.5-VL-3B-Instruct \
|
||||
--learning_rate 1e-5 \
|
||||
--gradient_checkpointing \
|
||||
--torch_dtype bfloat16 \
|
||||
--max_prompt_length 2048 \
|
||||
--max_completion_length 1024 \
|
||||
--use_vllm \
|
||||
--vllm_mode colocate \
|
||||
--use_peft \
|
||||
--lora_target_modules "q_proj", "v_proj" \
|
||||
--log_completions
|
||||
```
|
||||
|
||||
### Configuration Tips
|
||||
|
||||
<Tip warning={true}>
|
||||
VLM training may fail if image tokens are truncated. We highly recommend disabling truncation by setting `max_prompt_length` to `None`.
|
||||
</Tip>
|
||||
|
||||
- Use LoRA on vision-language projection layers
|
||||
- Enable 4-bit quantization to reduce memory usage
|
||||
- VLMs are memory-intensive — start with smaller batch sizes
|
||||
- Most models are compatible with vLLM (`server` and `colocate` modes)
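Putting these tips together, here is a hedged sketch of a possible configuration. The LoRA target module names are hypothetical and depend on the model architecture, and `my_reward_func` and `dataset` are assumed to be defined elsewhere.

```python
from peft import LoraConfig
from trl import GRPOConfig, GRPOTrainer

# Hypothetical target modules; pick the projection layers of your specific VLM.
peft_config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"])

training_args = GRPOConfig(
    output_dir="grpo-vlm-lora",
    per_device_train_batch_size=1,   # VLMs are memory-intensive: start small
    gradient_accumulation_steps=8,
    bf16=True,
    max_prompt_length=None,          # avoid truncating image tokens
    use_vllm=True,
    vllm_mode="colocate",
)

trainer = GRPOTrainer(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    reward_funcs=my_reward_func,
    args=training_args,
    train_dataset=dataset,
    peft_config=peft_config,
)
trainer.train()
```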
|
||||
|
||||
### Dataset Format
|
||||
|
||||
Each training sample should include:
|
||||
|
||||
- `prompt`: Text formatted via the processor's chat template
|
||||
- `image`: A single image (PIL or NumPy array)
|
||||
|
||||
The trainer automatically handles image-to-tensor conversion via the model’s image processor.
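One possible way to build such a sample, assuming the model's processor supports `apply_chat_template` for multimodal messages (the image here is just a placeholder):

```python
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "What is in the image?"}]}]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)

sample = {"prompt": prompt, "image": Image.new("RGB", (224, 224))}  # replace with a real PIL image
```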
|
||||
|
||||
## GRPOTrainer
|
||||
|
||||
[[autodoc]] GRPOTrainer
|
||||
- train
|
||||
- save_model
|
||||
- push_to_hub
|
||||
|
||||
## GRPOConfig
|
||||
|
||||
|
@ -9,7 +9,7 @@ To address this, we recommend focusing on two key metrics first:
|
||||
**Mean Reward**: The primary goal is to maximize the reward achieved by the model during RL training.
|
||||
**Objective KL Divergence**: KL divergence (Kullback-Leibler divergence) measures the dissimilarity between two probability distributions. In the context of RL training, we use it to quantify the difference between the current model and a reference model. Ideally, we want to keep the KL divergence between 0 and 10 to ensure the model's generated text remains close to what the reference model produces.
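As a rough sketch of what this metric measures (made-up log-probabilities; not the exact estimator used internally), the KL term can be estimated from the log-probabilities the two models assign to the sampled tokens:

```python
import torch

# Log-probs of the sampled tokens under the active and reference models (made-up values).
logp_active = torch.tensor([-1.2, -0.8, -2.0])
logp_ref = torch.tensor([-1.5, -0.7, -2.5])

kl_per_token = logp_active - logp_ref  # Monte-Carlo estimate of KL(active || ref) per token
print(kl_per_token.sum().item())       # positive on average when purely sampling, but can go negative
```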
|
||||
|
||||
However, there are more metrics that can be useful for debugging, checkout the [logging section](logging).
|
||||
However, there are more metrics that can be useful for debugging, check out the [logging section](logging).
|
||||
|
||||
## Why Do We Use a Reference Model, and What's the Purpose of KL Divergence?
|
||||
|
||||
@ -26,7 +26,7 @@ To address this issue, we add a penalty to the reward function based on the KL d
|
||||
|
||||
## What Is the Concern with Negative KL Divergence?
|
||||
|
||||
If you generate text by purely sampling from the model distribution things work fine in general. But when you use the `generate` method there are a few caveats because it does not always purely sample depending on the settings which can cause KL-divergence to go negative. Essentially when the active model achieves `log_p_token_active < log_p_token_ref` we get negative KL-div. This can happen in a several cases:
|
||||
If you generate text by purely sampling from the model distribution things work fine in general. But when you use the `generate` method there are a few caveats because it does not always purely sample depending on the settings which can cause KL-divergence to go negative. Essentially when the active model achieves `log_p_token_active < log_p_token_ref` we get negative KL-div. This can happen in several cases:
|
||||
|
||||
- **top-k sampling**: the model can smooth out the probability distribution, causing the top-k tokens to have a smaller probability than those of the reference model, but they are still selected
|
||||
- **min_length**: this ignores the EOS token until `min_length` is reached, so the model can assign a very low log-prob to the EOS token and very high probabilities to all others until `min_length` is reached
|
||||
@ -50,7 +50,7 @@ generation_kwargs = {
|
||||
}
|
||||
```
|
||||
|
||||
With these settings we usually don't encounter any issues. You can also experiments with other settings but if you encounter issues with negative KL-divergence try to go back to these and see if they persist.
|
||||
With these settings we usually don't encounter any issues. You can also experiment with other settings but if you encounter issues with negative KL-divergence try to go back to these and see if they persist.
|
||||
|
||||
## How can you debug your own use-case?
|
||||
|
||||
@ -60,6 +60,6 @@ Debugging the RL pipeline can be challenging due to its complexity. Here are som
|
||||
- **Start small, scale later**: Training large models can be very slow and take several hours or days until you see any improvement. For debugging this is not a convenient timescale so try to use small model variants during the development phase and scale up once that works. That being said you sometimes have to be careful as small models might not have the capacity to solve a complicated task either.
|
||||
- **Start simple**: Try to start with a minimal example and build complexity from there. Your use-case might, for example, require a complicated reward function consisting of many different rewards; try to use one signal first, see if you can optimize that, and then add more complexity.
|
||||
- **Inspect the generations**: It's always a good idea to inspect what the model is generating. Maybe there is a bug in your post-processing or your prompt. Due to bad settings you might cut-off generations too soon. These things are very hard to see on the metrics but very obvious if you look at the generations.
|
||||
- **Inspect the reward model**: If you reward is not improving over time maybe there's an issue with the reward model. You can look at extreme cases to see if it does what it should: e.g. in the sentiment case you can check if simple positive and negative examples really get different rewards. And you can look at the distribution of your dataset. Finally, maybe the reward is dominated by the query which the model can't affect so you might need to normalize this (e.g. reward of query+response minus reward of the query).
|
||||
- **Inspect the reward model**: If your reward is not improving over time maybe there's an issue with the reward model. You can look at extreme cases to see if it does what it should: e.g. in the sentiment case you can check if simple positive and negative examples really get different rewards. And you can look at the distribution of your dataset. Finally, maybe the reward is dominated by the query which the model can't affect so you might need to normalize this (e.g. reward of query+response minus reward of the query).
|
||||
|
||||
These are just a few tips that we find helpful - if you have more useful tricks feel free to open a PR to add them as well!
|
||||
|
@ -4,37 +4,58 @@
|
||||
|
||||
# TRL - Transformer Reinforcement Learning
|
||||
|
||||
TRL is a full stack library where we provide a set of tools to train transformer language models with Reinforcement Learning, from the Supervised Fine-tuning step (SFT), Reward Modeling step (RM) to the Proximal Policy Optimization (PPO) step.
|
||||
TRL is a full stack library where we provide a set of tools to train transformer language models with methods like Supervised Fine-Tuning (SFT), Group Relative Policy Optimization (GRPO), Direct Preference Optimization (DPO), Reward Modeling, and more.
|
||||
The library is integrated with 🤗 [transformers](https://github.com/huggingface/transformers).
|
||||
|
||||
## 🎉 What's New
|
||||
|
||||
**✨ OpenAI GPT OSS Support**: TRL now fully supports fine-tuning the latest [OpenAI GPT OSS models](https://huggingface.co/collections/openai/gpt-oss-68911959590a1634ba11c7a4)! Check out the following resources:
|
||||
|
||||
- [OpenAI Cookbook](https://cookbook.openai.com/articles/gpt-oss/fine-tune-transfomers)
|
||||
- [GPT OSS recipes](https://github.com/huggingface/gpt-oss-recipes)
|
||||
- [Our example script](https://github.com/huggingface/trl/blob/main/examples/scripts/sft_gpt_oss.py)
|
||||
|
||||
You can also explore TRL-related models, datasets, and demos in the [TRL Hugging Face organization](https://huggingface.co/trl-lib).
|
||||
|
||||
## Learn
|
||||
|
||||
Learn post-training with TRL and other libraries in 🤗 [smol course](https://github.com/huggingface/smol-course).
|
||||
|
||||
## API documentation
|
||||
## Contents
|
||||
|
||||
- [Model Classes](models): *A brief overview of what each public model class does.*
|
||||
- [`SFTTrainer`](sft_trainer): *Supervise Fine-tune your model easily with `SFTTrainer`*
|
||||
- [`RewardTrainer`](reward_trainer): *Train easily your reward model using `RewardTrainer`.*
|
||||
- [`PPOTrainer`](ppo_trainer): *Further fine-tune the supervised fine-tuned model using PPO algorithm*
|
||||
- [Best-of-N Sampling](best-of-n): *Use best of n sampling as an alternative way to sample predictions from your active model*
|
||||
- [`DPOTrainer`](dpo_trainer): *Direct Preference Optimization training using `DPOTrainer`.*
|
||||
- [`TextEnvironment`](text_environments): *Text environment to train your model using tools with RL.*
|
||||
|
||||
## Examples
|
||||
|
||||
- [Sentiment Tuning](sentiment_tuning): *Fine tune your model to generate positive movie contents*
|
||||
- [Training with PEFT](lora_tuning_peft): *Memory efficient RLHF training using adapters with PEFT*
|
||||
- [Detoxifying LLMs](detoxifying_a_lm): *Detoxify your language model through RLHF*
|
||||
- [StackLlama](using_llama_models): *End-to-end RLHF training of a Llama model on Stack exchange dataset*
|
||||
- [Learning with Tools](learning_tools): *Walkthrough of using `TextEnvironments`*
|
||||
- [Multi-Adapter Training](multi_adapter_rl): *Use a single base model and multiple adapters for memory efficient end-to-end training*
|
||||
The documentation is organized into the following sections:
|
||||
|
||||
- **Getting Started**: installation and quickstart guide.
|
||||
- **Conceptual Guides**: dataset formats, training FAQ, and understanding logs.
|
||||
- **How-to Guides**: reducing memory usage, speeding up training, distributing training, etc.
|
||||
- **Integrations**: DeepSpeed, Liger Kernel, PEFT, etc.
|
||||
- **Examples**: example overview, community tutorials, etc.
|
||||
- **API**: trainers, utils, etc.
|
||||
|
||||
## Blog posts
|
||||
|
||||
<div class="mt-10">
|
||||
<div class="w-full flex flex-col space-y-4 md:space-y-0 md:grid md:grid-cols-2 md:gap-y-4 md:gap-x-5">
|
||||
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="https://huggingface.co/blog/trl-vlm-alignment">
|
||||
<img src="https://raw.githubusercontent.com/huggingface/blog/main/assets/trl_vlm/thumbnail.png" alt="thumbnail" class="mt-0">
|
||||
<p class="text-gray-500 text-sm">Published on August 7, 2025</p>
|
||||
<p class="text-gray-700">Vision Language Model Alignment in TRL ⚡️</p>
|
||||
</a>
|
||||
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="https://huggingface.co/blog/vllm-colocate">
|
||||
<img src="https://raw.githubusercontent.com/huggingface/blog/main/assets/vllm-colocate/thumbnail.png" alt="thumbnail" class="mt-0">
|
||||
<p class="text-gray-500 text-sm">Published on June 3, 2025</p>
|
||||
<p class="text-gray-700">NO GPU left behind: Unlocking Efficiency with Co-located vLLM in TRL</p>
|
||||
</a>
|
||||
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="https://huggingface.co/blog/liger-grpo">
|
||||
<img src="https://raw.githubusercontent.com/huggingface/blog/main/assets/liger-grpo/thumbnail.png" alt="thumbnail" class="mt-0">
|
||||
<p class="text-gray-500 text-sm">Published on May 25, 2025</p>
|
||||
<p class="text-gray-700">🐯 Liger GRPO meets TRL</p>
|
||||
</a>
|
||||
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="https://huggingface.co/blog/open-r1">
|
||||
<img src="https://raw.githubusercontent.com/huggingface/blog/main/assets/open-r1/thumbnails.png" alt="thumbnail" class="mt-0">
|
||||
<p class="text-gray-500 text-sm">Published on January 28, 2025</p>
|
||||
<p class="text-gray-700">Open-R1: a fully open reproduction of DeepSeek-R1</p>
|
||||
</a>
|
||||
<a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="https://huggingface.co/blog/dpo_vlm">
|
||||
<img src="https://raw.githubusercontent.com/huggingface/blog/main/assets/dpo_vlm/thumbnail.png" alt="thumbnail" class="mt-0">
|
||||
<p class="text-gray-500 text-sm">Published on July 10, 2024</p>
|
||||
|
@ -7,7 +7,7 @@ Install the library with pip or [uv](https://docs.astral.sh/uv/):
|
||||
<hfoptions id="install">
|
||||
<hfoption id="uv">
|
||||
|
||||
uv is a fast Rust-based Python package and project manager. Refer to [Installation](https://docs.astral.sh/uv/getting-started/installation/) for installation instructions), .
|
||||
uv is a fast Rust-based Python package and project manager. Refer to [Installation](https://docs.astral.sh/uv/getting-started/installation/) for installation instructions.
|
||||
|
||||
```bash
|
||||
uv pip install trl
|
||||
|
@ -2,56 +2,146 @@
|
||||
|
||||
[](https://huggingface.co/models?other=iterative-sft,trl)
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
The [`IterativeSFTTrainer`] is deprecated and will be removed in version 0.24.0. Please use [`SFTTrainer`] instead.
|
||||
|
||||
</Tip>
|
||||
|
||||
Iterative fine-tuning is a training method that enables performing custom actions (for example, generation and filtering) between optimization steps. In TRL we provide an easy-to-use API to fine-tune your models in an iterative way in just a few lines of code.
|
||||
|
||||
## Usage
|
||||
## Quickstart
|
||||
|
||||
To get started quickly, instantiate an instance a model, and a tokenizer.
|
||||
To get started quickly, you can either pass a model identifier or a pre-instantiated model to the trainer:
|
||||
|
||||
```python
|
||||
from trl import IterativeSFTConfig, IterativeSFTTrainer
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(model_name)
|
||||
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||
if tokenizer.pad_token is None:
|
||||
tokenizer.pad_token = tokenizer.eos_token
|
||||
# Using a model identifier
|
||||
trainer = IterativeSFTTrainer(
|
||||
"facebook/opt-350m",
|
||||
args=IterativeSFTConfig(
|
||||
max_length=512,
|
||||
output_dir="./output",
|
||||
),
|
||||
)
|
||||
|
||||
# Or using a pre-instantiated model
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
|
||||
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
|
||||
|
||||
trainer = IterativeSFTTrainer(
|
||||
model,
|
||||
tokenizer
|
||||
args=IterativeSFTConfig(
|
||||
max_length=512,
|
||||
output_dir="./output",
|
||||
),
|
||||
processing_class=tokenizer,
|
||||
)
|
||||
|
||||
```
|
||||
|
||||
You have the choice to either provide a list of strings or a list of tensors to the step function.
|
||||
## Usage
|
||||
|
||||
#### Using a list of tensors as input:
|
||||
The [`IterativeSFTTrainer`] supports two ways of providing input data to the `step` function:
|
||||
|
||||
### Using a list of tensors as input:
|
||||
|
||||
```python
|
||||
|
||||
inputs = {
|
||||
"input_ids": input_ids,
|
||||
"attention_mask": attention_mask
|
||||
"attention_mask": attention_mask,
|
||||
}
|
||||
|
||||
trainer.step(**inputs)
|
||||
|
||||
```
|
||||
|
||||
#### Using a list of strings as input:
|
||||
### Using a list of strings as input:
|
||||
|
||||
```python
|
||||
|
||||
inputs = {
|
||||
"texts": texts
|
||||
"texts": texts,
|
||||
"texts_labels": texts_labels, # Optional, defaults to texts
|
||||
}
|
||||
|
||||
trainer.step(**inputs)
|
||||
|
||||
```
|
||||
|
||||
For causal language models, labels will automatically be created from input_ids or from texts. When using sequence to sequence models you will have to provide your own labels or text_labels.
|
||||
For causal language models, labels will automatically be created from `input_ids` or from `texts`. When using sequence to sequence models you will have to provide your own labels or `text_labels`.
|
||||
|
||||
## IterativeTrainer
|
||||
## Configuration
|
||||
|
||||
The [`IterativeSFTConfig`] class provides several parameters to customize the training:
|
||||
|
||||
```python
|
||||
from trl import IterativeSFTConfig
|
||||
|
||||
config = IterativeSFTConfig(
|
||||
# Model initialization parameters
|
||||
model_init_kwargs={"torch_dtype": "bfloat16"},
|
||||
|
||||
# Data preprocessing parameters
|
||||
max_length=512,
|
||||
truncation_mode="keep_end",
|
||||
|
||||
# Training parameters
|
||||
output_dir="./output",
|
||||
learning_rate=2e-5,
|
||||
per_device_train_batch_size=4,
|
||||
gradient_accumulation_steps=4,
|
||||
max_steps=1000,
|
||||
save_steps=100,
|
||||
optim="adamw_torch",
|
||||
report_to="wandb",
|
||||
)
|
||||
```
|
||||
|
||||
### Model Initialization
|
||||
|
||||
You can control how the model is initialized by passing keyword arguments to `model_init_kwargs`:
|
||||
|
||||
```python
|
||||
config = IterativeSFTConfig(
|
||||
model_init_kwargs={
|
||||
"torch_dtype": "bfloat16",
|
||||
"device_map": "auto",
|
||||
"trust_remote_code": True,
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
### Data Preprocessing
|
||||
|
||||
The trainer supports two truncation modes:
|
||||
|
||||
- `keep_end`: Truncates from the start of the sequence
|
||||
- `keep_start`: Truncates from the end of the sequence
|
||||
|
||||
```python
|
||||
config = IterativeSFTConfig(
|
||||
max_length=512,
|
||||
truncation_mode="keep_end", # or "keep_start"
|
||||
)
|
||||
```
|
||||
|
||||
### Training Optimization
|
||||
|
||||
You can optimize CUDA cache usage for more memory-efficient training:
|
||||
|
||||
```python
|
||||
config = IterativeSFTConfig(
|
||||
optimize_device_cache=True,
|
||||
)
|
||||
```
|
||||
|
||||
## IterativeSFTTrainer
|
||||
|
||||
[[autodoc]] IterativeSFTTrainer
|
||||
- train
|
||||
- save_model
|
||||
- push_to_hub
|
||||
|
||||
## IterativeSFTConfig
|
||||
|
||||
[[autodoc]] IterativeSFTConfig
|
||||
|
392
docs/source/jobs_training.md
Normal file
392
docs/source/jobs_training.md
Normal file
@ -0,0 +1,392 @@
|
||||
# Training using Jobs
|
||||
|
||||
[Jobs](https://huggingface.co/docs/huggingface_hub/guides/jobs) lets you run training scripts on fully managed infrastructure (no need to handle GPUs, dependencies, or environment setup locally). This makes it easy to scale and monitor your experiments directly from the Hub.
|
||||
|
||||
In this guide, you’ll learn how to:
|
||||
|
||||
- Run TRL training scripts using Jobs.
|
||||
- Configure hardware, timeouts, environment variables, and secrets.
|
||||
- Monitor and manage jobs from the CLI or Python.
|
||||
|
||||
<Tip>
|
||||
|
||||
When a model is trained using **TRL + Jobs**, a tag is automatically added to the model card.
|
||||
You can explore models trained with this method on the [Hugging Face model hub](https://huggingface.co/models?other=hf_jobs).
|
||||
|
||||
</Tip>
|
||||
|
||||
## Requirements
|
||||
|
||||
- [Pro](https://hf.co/pro), [Team](https://hf.co/enterprise), or [Enterprise](https://hf.co/enterprise) plan.
|
||||
- Logged into the Hugging Face Hub (`hf auth login`).
|
||||
|
||||
## Preparing your Script
|
||||
|
||||
You can launch Jobs using either the [`hf jobs` CLI](https://huggingface.co/docs/huggingface_hub/guides/cli#hf-jobs) or the Python API. A convenient option is to use [UV scripts](https://docs.astral.sh/uv/guides/scripts/), which packages all dependencies directly into a single Python file. You can run them like this:
|
||||
|
||||
<hfoptions id="script_type">
|
||||
<hfoption id="bash">
|
||||
|
||||
```bash
|
||||
hf jobs uv run --flavor a100-large --secrets HF_TOKEN "https://raw.githubusercontent.com/huggingface/trl/main/trl/scripts/sft.py" --model_name_or_path Qwen/Qwen2-0.5B --dataset_name trl-lib/Capybara
|
||||
```
|
||||
|
||||
The script can also be a local file:
|
||||
|
||||
```bash
|
||||
hf jobs uv run --flavor a100-large --secrets HF_TOKEN trl/scripts/sft.py --model_name_or_path Qwen/Qwen2-0.5B --dataset_name trl-lib/Capybara
|
||||
```
|
||||
|
||||
Since it runs using a Docker Image from Hugging Face Spaces or Docker Hub, you can also specify it:
|
||||
|
||||
```bash
|
||||
hf jobs uv run --flavor a100-large --secrets HF_TOKEN --image <docker-image> trl/scripts/sft.py --model_name_or_path Qwen/Qwen2-0.5B --dataset_name trl-lib/Capybara
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="python">
|
||||
|
||||
```python
|
||||
from huggingface_hub import run_uv_job
|
||||
run_uv_job(
|
||||
"https://raw.githubusercontent.com/huggingface/trl/main/trl/scripts/sft.py",
|
||||
token="hf...",
|
||||
flavor="a100-large",
|
||||
script_args=[
|
||||
"--model_name_or_path", "Qwen/Qwen2-0.5B",
|
||||
"--dataset_name", "trl-lib/Capybara",
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
The script can also be a local file:
|
||||
|
||||
```python
|
||||
from huggingface_hub import run_uv_job
|
||||
run_uv_job(
|
||||
"trl/scripts/sft.py",
|
||||
token="hf...",
|
||||
flavor="a100-large",
|
||||
script_args=[
|
||||
"--model_name_or_path", "Qwen/Qwen2-0.5B",
|
||||
"--dataset_name", "trl-lib/Capybara",
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
Since it runs using a Docker Image from Hugging Face Spaces or Docker Hub, you can also specify it:
|
||||
|
||||
```python
|
||||
from huggingface_hub import run_uv_job
|
||||
run_uv_job(
|
||||
"sft.py",
|
||||
token="hf...",
|
||||
flavor="a100-large",
|
||||
image="<docker-image>",
|
||||
script_args=[
|
||||
"--model_name_or_path", "Qwen/Qwen2-0.5B",
|
||||
"--dataset_name", "trl-lib/Capybara",
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
You can also run jobs without UV:
|
||||
|
||||
<hfoptions id="script_type">
|
||||
<hfoption id="bash">
|
||||
|
||||
In this case, we give the CLI the Docker image and run it as follows:
|
||||
|
||||
```bash
|
||||
hf jobs run --flavor a100-large pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel python -c "import torch; print(torch.cuda.get_device_name())"
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="python">
|
||||
|
||||
```python
|
||||
from huggingface_hub import run_job
|
||||
run_job(
|
||||
image="pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel",
|
||||
command=["python", "-c", "import torch; print(torch.cuda.get_device_name())"],
|
||||
flavor="a100-large",
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### Adding Dependencies with UV
|
||||
|
||||
All example scripts in TRL are compatible with `uv`, allowing seamless execution with Jobs. You can check the full list of examples in [Maintained examples](example_overview#maintained-examples).
|
||||
|
||||
Dependencies are specified at the top of the script using this structure:
|
||||
|
||||
```python
|
||||
# /// script
|
||||
# dependencies = [
|
||||
# "trl @ git+https://github.com/huggingface/trl.git",
|
||||
# "peft",
|
||||
# ]
|
||||
# ///
|
||||
```
|
||||
|
||||
When you run the UV script, these dependencies are automatically installed. In the example above, `trl` and `peft` would be installed before the script runs.
|
||||
|
||||
You can also provide dependencies directly in the `uv run` command:
|
||||
|
||||
<hfoptions id="script_type">
|
||||
<hfoption id="bash">
|
||||
|
||||
Using the `--with` flag.
|
||||
|
||||
```bash
|
||||
hf jobs uv run \
|
||||
--flavor a100-large \
|
||||
--secrets HF_TOKEN \
|
||||
--with transformers \
|
||||
--with torch \
|
||||
"https://raw.githubusercontent.com/huggingface/trl/main/trl/scripts/sft.py" \
|
||||
--model_name_or_path Qwen/Qwen2-0.5B \
|
||||
--dataset_name trl-lib/Capybara
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="python">
|
||||
|
||||
Using the `dependencies` argument.
|
||||
|
||||
```python
|
||||
from huggingface_hub import run_uv_job
|
||||
run_uv_job(
|
||||
"https://raw.githubusercontent.com/huggingface/trl/main/trl/scripts/sft.py",
|
||||
dependencies=["transformers", "torch"],
|
||||
token="hf...",
|
||||
flavor="a100-large",
|
||||
script_args=[
|
||||
"--model_name_or_path", "Qwen/Qwen2-0.5B",
|
||||
"--dataset_name", "trl-lib/Capybara",
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### Hardware and Timeout Settings
|
||||
|
||||
Jobs allow you to select a specific hardware configuration using the `--flavor` flag. As of August 2025, the available options are:
|
||||
|
||||
**CPU:** `cpu-basic`, `cpu-upgrade`
|
||||
**GPU:** `t4-small`, `t4-medium`, `l4x1`, `l4x4`, `a10g-small`, `a10g-large`, `a10g-largex2`, `a10g-largex4`, `a100-large`
|
||||
**TPU:** `v5e-1x1`, `v5e-2x2`, `v5e-2x4`
|
||||
|
||||
You can always check the latest list of supported hardware flavors in [Spaces config reference](https://huggingface.co/docs/hub/en/spaces-config-reference).
|
||||
|
||||
By default, jobs have a **30-minute timeout**, after which they will automatically stop. For long-running tasks like training, you can increase the timeout as needed. Supported time units are:
|
||||
|
||||
- `s`: seconds
|
||||
- `m`: minutes
|
||||
- `h`: hours
|
||||
- `d`: days
|
||||
|
||||
Example with a 2-hour timeout:
|
||||
|
||||
<hfoptions id="script_type">
|
||||
<hfoption id="bash">
|
||||
|
||||
Using the `--timeout` flag:
|
||||
|
||||
```bash
|
||||
hf jobs uv run \
|
||||
--timeout 2h \
|
||||
--flavor a100-large \
|
||||
--secrets HF_TOKEN \
|
||||
--with transformers \
|
||||
--with torch \
|
||||
"https://raw.githubusercontent.com/huggingface/trl/main/trl/scripts/sft.py" \
|
||||
--model_name_or_path Qwen/Qwen2-0.5B \
|
||||
--dataset_name trl-lib/Capybara
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="python">
|
||||
|
||||
Using the `timeout` argument:
|
||||
|
||||
```python
|
||||
from huggingface_hub import run_uv_job
|
||||
run_uv_job(
|
||||
"https://raw.githubusercontent.com/huggingface/trl/main/trl/scripts/sft.py",
|
||||
timeout="2h",
|
||||
token="hf...",
|
||||
flavor="a100-large",
|
||||
script_args=[
|
||||
"--model_name_or_path", "Qwen/Qwen2-0.5B",
|
||||
"--dataset_name", "trl-lib/Capybara",
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### Environment Variables, Secrets, and Token
|
||||
|
||||
You can pass environment variables, secrets, and your auth token to your jobs.
|
||||
|
||||
<hfoptions id="script_type">
|
||||
<hfoption id="bash">
|
||||
|
||||
Using the `--env`, `--secrets`, and/or `--token` options.
|
||||
|
||||
```bash
|
||||
hf jobs uv run \
|
||||
trl/scripts/sft.py \
|
||||
--flavor a100-large \
|
||||
--env FOO=foo \
|
||||
--env BAR=bar \
|
||||
--secrets HF_TOKEN=HF_TOKEN \
|
||||
--secrets MY_SECRET=password \
|
||||
--token hf...
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="python">
|
||||
|
||||
|
||||
Using the `env`, `secrets`, and/or `token` arguments.
|
||||
|
||||
```python
|
||||
from huggingface_hub import run_uv_job
|
||||
run_uv_job(
|
||||
"trl/scripts/sft.py",
|
||||
env={"FOO": "foo", "BAR": "bar"},
|
||||
secrets={"MY_SECRET": "psswrd"},
|
||||
token="hf..."
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Training and Evaluating a Model with Jobs
|
||||
|
||||
TRL example scripts are fully UV-compatible, allowing you to run a complete training workflow directly on Jobs. You can customize the training by providing the usual script arguments, along with hardware specifications and secrets.
|
||||
|
||||
To evaluate your training runs, in addition to reviewing the job logs, you can use [**Trackio**](https://huggingface.co/blog/trackio), a lightweight experiment tracking library. Trackio enables end-to-end experiment management on the Hugging Face Hub. All TRL example scripts already support reporting to Trackio via the `report_to` argument. Using this feature saves your experiments in an interactive HF Space, making it easy to monitor metrics, compare runs, and track progress over time.
|
||||
|
||||
<hfoptions id="script_type">
|
||||
<hfoption id="bash">
|
||||
|
||||
```bash
|
||||
hf jobs uv run \
|
||||
--flavor a100-large \
|
||||
--secrets HF_TOKEN \
|
||||
"trl/scripts/sft.py" \
|
||||
--model_name_or_path Qwen/Qwen2-0.5B \
|
||||
--dataset_name trl-lib/Capybara \
|
||||
--learning_rate 2.0e-5 \
|
||||
--num_train_epochs 1 \
|
||||
--packing \
|
||||
--per_device_train_batch_size 2 \
|
||||
--gradient_accumulation_steps 8 \
|
||||
--eos_token '<|im_end|>' \
|
||||
--eval_strategy steps \
|
||||
--eval_steps 100 \
|
||||
--output_dir Qwen2-0.5B-SFT \
|
||||
--report_to trackio \
|
||||
--push_to_hub
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="python">
|
||||
|
||||
```python
|
||||
from huggingface_hub import run_uv_job
|
||||
|
||||
run_uv_job(
|
||||
"trl/scripts/sft.py",
|
||||
flavor="a100-large",
|
||||
secrets={"HF_TOKEN": "your_hf_token"},
|
||||
script_args=[
|
||||
"--model_name_or_path", "Qwen/Qwen2-0.5B",
|
||||
"--dataset_name", "trl-lib/Capybara",
|
||||
"--learning_rate", "2.0e-5",
|
||||
"--num_train_epochs", "1",
|
||||
"--packing",
|
||||
"--per_device_train_batch_size", "2",
|
||||
"--gradient_accumulation_steps", "8",
|
||||
"--eos_token", "<|im_end|>",
|
||||
"--eval_strategy", "steps",
|
||||
"--eval_steps", "100",
|
||||
"--output_dir", "Qwen2-0.5B-SFT",
|
||||
"--report_to", "trackio",
|
||||
"--push_to_hub"
|
||||
]
|
||||
)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Monitoring and Managing Jobs
|
||||
|
||||
After launching a job, you can track its progress on the [Jobs page](https://huggingface.co/settings/jobs). Additionally, Jobs provides CLI and Python commands to check status, view logs, or cancel a job.
|
||||
|
||||
<hfoptions id="script_type">
|
||||
<hfoption id="bash">
|
||||
|
||||
```bash
|
||||
# List your jobs
|
||||
hf jobs ps -a
|
||||
|
||||
# List your running jobs
|
||||
hf jobs ps
|
||||
|
||||
# Inspect the status of a job
|
||||
hf jobs inspect job_id
|
||||
|
||||
# View logs from a job
|
||||
hf jobs logs job_id
|
||||
|
||||
# Cancel a job
|
||||
hf jobs cancel job_id
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="python">
|
||||
|
||||
|
||||
```python
|
||||
from huggingface_hub import list_jobs, inspect_job, fetch_job_logs, cancel_job
|
||||
|
||||
# List your jobs
|
||||
jobs = list_jobs()
|
||||
jobs[0]
|
||||
|
||||
# List your running jobs
|
||||
running_jobs = [job for job in list_jobs() if job.status.stage == "RUNNING"]
|
||||
|
||||
# Inspect the status of a job
|
||||
inspect_job(job_id=job_id)
|
||||
|
||||
# View logs from a job
|
||||
for log in fetch_job_logs(job_id=job_id):
|
||||
print(log)
|
||||
|
||||
# Cancel a job
|
||||
cancel_job(job_id=job_id)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Best Practices and Tips
|
||||
|
||||
- Choose hardware that fits the size of your model and dataset for optimal performance.
|
||||
- Training jobs can be long-running. Consider increasing the default timeout.
|
||||
- Reuse training and evaluation scripts whenever possible to streamline workflows.
|
@ -38,7 +38,7 @@ model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
|
||||
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
|
||||
train_dataset = load_dataset("trl-lib/kto-mix-14k", split="train")
|
||||
|
||||
training_args = KTOConfig(output_dir="Qwen2-0.5B-KTO", logging_steps=10)
|
||||
training_args = KTOConfig(output_dir="Qwen2-0.5B-KTO")
|
||||
trainer = KTOTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset)
|
||||
trainer.train()
|
||||
```
|
||||
@ -53,9 +53,9 @@ Distributed across 8 x H100 GPUs, the training takes approximately 30 minutes. Y
|
||||
|
||||

|
||||
|
||||
To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-KTO) performs, you can use the [TRL Chat CLI](clis#chat-interface).
|
||||
To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-KTO) performs, you can use the [Transformers Chat CLI](https://huggingface.co/docs/transformers/quicktour#chat-with-text-generation-models).
|
||||
|
||||
<pre><code>$ trl chat --model_name_or_path trl-lib/Qwen2-0.5B-KTO
|
||||
<pre><code>$ transformers chat trl-lib/Qwen2-0.5B-KTO
|
||||
<strong><span style="color: red;"><quentin_gallouedec>:</span></strong>
|
||||
What is the best programming language?
|
||||
|
||||
@ -74,7 +74,7 @@ Here are some other factors to consider when choosing a programming language for
|
||||
|
||||
KTO requires an [unpaired preference dataset](dataset_formats#unpaired-preference). Alternatively, you can provide a *paired* preference dataset (also known simply as a *preference dataset*). In this case, the trainer will automatically convert it to an unpaired format by separating the chosen and rejected responses, assigning `label = True` to the chosen completions and `label = False` to the rejected ones.
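For illustration, here is a minimal sketch of an unpaired preference dataset in the standard (non-conversational) format; the prompts and completions are made up:

```python
from datasets import Dataset

# two unpaired examples for the same prompt: one desirable (True), one undesirable (False)
unpaired_dataset = Dataset.from_dict({
    "prompt": ["What color is the sky?", "What color is the sky?"],
    "completion": ["It is blue.", "It is green."],
    "label": [True, False],
})
```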
|
||||
|
||||
The [`KTOTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset format. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
|
||||
The [`KTOTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset formats. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
|
||||
|
||||
In theory, the dataset should contain at least one chosen and one rejected completion. However, some users have successfully run KTO using *only* chosen or only rejected data. If using only rejected data, it is advisable to adopt a conservative learning rate.
|
||||
|
||||
@ -89,7 +89,6 @@ accelerate launch trl/scripts/kto.py \
|
||||
--model_name_or_path Qwen/Qwen2-0.5B-Instruct \
|
||||
--dataset_name trl-lib/kto-mix-14k \
|
||||
--num_train_epochs 1 \
|
||||
--logging_steps 25 \
|
||||
--output_dir Qwen2-0.5B-KTO
|
||||
```
|
||||
|
||||
@ -119,20 +118,23 @@ By default, they are both 1. However, if you have more of one or the other, then
|
||||
|
||||
## Logged metrics
|
||||
|
||||
While training and evaluating we record the following reward metrics:
|
||||
While training and evaluating, we record the following reward metrics:
|
||||
|
||||
- `rewards/chosen`: the mean log probabilities of the policy model for the chosen responses scaled by beta
|
||||
- `rewards/rejected`: the mean log probabilities of the policy model for the rejected responses scaled by beta
|
||||
- `rewards/margins`: the mean difference between the chosen and corresponding rejected rewards
|
||||
- `logps/chosen`: the mean log probabilities of the chosen completions
|
||||
- `logps/rejected`: the mean log probabilities of the rejected completions
|
||||
- `logits/chosen`: the mean logits of the chosen completions
|
||||
- `logits/rejected`: the mean logits of the rejected completions
|
||||
- `kl`: the KL divergence between the policy model and the reference model
|
||||
- `rewards/chosen_sum`: the sum of log probabilities of the policy model for the chosen responses scaled by beta
|
||||
- `rewards/rejected_sum`: the sum of log probabilities of the policy model for the rejected responses scaled by beta
|
||||
- `logps/chosen_sum`: the sum of log probabilities of the chosen completions
|
||||
- `logps/rejected_sum`: the sum of log probabilities of the rejected completions
|
||||
- `logits/chosen_sum`: the sum of logits of the chosen completions
|
||||
- `logits/rejected_sum`: the sum of logits of the rejected completions
|
||||
- `count/chosen`: the count of chosen samples in a batch
|
||||
- `count/rejected`: the count of rejected samples in a batch
|
||||
|
||||
## KTOTrainer
|
||||
|
||||
[[autodoc]] KTOTrainer
|
||||
- train
|
||||
- save_model
|
||||
- push_to_hub
|
||||
|
||||
## KTOConfig
|
||||
|
||||
|
@ -1,233 +0,0 @@
|
||||
# Learning Tools (Experimental 🧪)
|
||||
|
||||
Using Large Language Models (LLMs) with tools has been a popular topic recently with awesome works such as [ToolFormer](https://huggingface.co/papers/2302.04761) and [ToolBench](https://huggingface.co/papers/2305.16504). In TRL, we provide a simple example of how to teach an LLM to use tools with reinforcement learning.
|
||||
|
||||
|
||||
Here's an overview of the scripts in the [trl repository](https://github.com/lvwerra/trl/tree/main/examples/research_projects/tools):
|
||||
|
||||
| File | Description |
|
||||
|---|---|
|
||||
| [`calculator.py`](https://github.com/lvwerra/trl/blob/main/examples/research_projects/tools/calculator.py) | Script to train LLM to use a calculator with reinforcement learning. |
|
||||
| [`triviaqa.py`](https://github.com/lvwerra/trl/blob/main/examples/research_projects/tools/triviaqa.py) | Script to train LLM to use a wiki tool to answer questions. |
|
||||
| [`python_interpreter.py`](https://github.com/lvwerra/trl/blob/main/examples/research_projects/tools/python_interpreter.py) | Script to train LLM to use python interpreter to solve math puzzles. |
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Note that the scripts above rely heavily on the `TextEnvironment` API which is still under active development. The API may change in the future. Please see [`TextEnvironment`](text_environment) for the related docs.
|
||||
</Tip>
|
||||
|
||||
|
||||
## Learning to Use a Calculator
|
||||
|
||||
|
||||
The rough idea is as follows:
|
||||
|
||||
1. Load a tool such as [ybelkada/simple-calculator](https://huggingface.co/spaces/ybelkada/simple-calculator) that parses a text calculation like `"14 + 34"` and returns the calculated number:
|
||||
```python
|
||||
from transformers import AutoTokenizer, load_tool
|
||||
tool = load_tool("ybelkada/simple-calculator")
|
||||
tool_fn = lambda text: str(round(float(tool(text)), 2)) # rounding to 2 decimal places
|
||||
```
|
||||
2. Define a reward function that returns a positive reward if the tool returns the correct answer. In the script, we create a dummy reward function like `reward_fn = lambda x: 1`, but we override the rewards directly later.
|
||||
3. Create a prompt that shows the model how to use the tools
|
||||
```python
|
||||
# system prompt
|
||||
prompt = """\
|
||||
What is 13.1-3?
|
||||
|
||||
<request><SimpleCalculatorTool>13.1-3<call>10.1<response>
|
||||
|
||||
Result=10.1<submit>
|
||||
|
||||
What is 4*3?
|
||||
|
||||
<request><SimpleCalculatorTool>4*3<call>12<response>
|
||||
|
||||
Result=12<submit>
|
||||
|
||||
What is 12.1+1?
|
||||
|
||||
<request><SimpleCalculatorTool>12.1+1<call>13.1<response>
|
||||
|
||||
Result=13.1<submit>
|
||||
|
||||
What is 12.1-20?
|
||||
|
||||
<request><SimpleCalculatorTool>12.1-20<call>-7.9<response>
|
||||
|
||||
Result=-7.9<submit>"""
|
||||
```
|
||||
4. Create a `trl.TextEnvironment` with the model
|
||||
```python
|
||||
env = TextEnvironment(
|
||||
model,
|
||||
tokenizer,
|
||||
{"SimpleCalculatorTool": tool_fn},
|
||||
reward_fn,
|
||||
prompt,
|
||||
generation_kwargs=generation_kwargs,
|
||||
)
|
||||
```
|
||||
5. Then generate some data such as `tasks = ["\n\nWhat is 13.1-3?", "\n\nWhat is 4*3?"]` and run the environment with `queries, responses, masks, rewards, histories = env.run(tasks)`. The environment will look for the `<call>` token in the prompt and append the tool output to the response; it will also return the mask associated with the response. You can further use the `histories` to visualize the interaction between the model and the tool: `histories[0].show_text()` will show the text with color-coded tool output and `histories[0].show_tokens(tokenizer)` will visualize the tokens.
|
||||

|
||||
6. Finally, we can train the model with `train_stats = ppo_trainer.step(queries, responses, rewards, masks)`. The trainer will use the mask to ignore the tool output when computing the loss, so make sure to pass that argument to `step`. A rough end-to-end sketch of the last two steps follows this list.
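Putting steps 5 and 6 together, here is a rough sketch of the rollout-and-update loop. It assumes `env` (the `TextEnvironment`) and `ppo_trainer` (a `PPOTrainer`) have already been created as above; it is an illustration, not the exact example script.

```python
# assumes `env` (TextEnvironment) and `ppo_trainer` (PPOTrainer) are already set up as above
tasks = ["\n\nWhat is 13.1-3?", "\n\nWhat is 4*3?"]

# rollout: the environment generates, calls the tool on <call>, and appends the tool output
queries, responses, masks, rewards, histories = env.run(tasks)

# update: the mask tells the trainer to ignore the tool-output tokens in the loss
train_stats = ppo_trainer.step(queries, responses, rewards, masks)

# optional: inspect one interaction with color-coded tool output
histories[0].show_text()
```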
|
||||
|
||||
## Experiment results
|
||||
|
||||
We trained a model with the above script for 10 random seeds. You can reproduce the run with the following command. Feel free to remove the `--slurm-*` arguments if you don't have access to a slurm cluster.
|
||||
|
||||
```
|
||||
WANDB_TAGS="calculator_final" python benchmark/benchmark.py \
|
||||
--command "python examples/research_projects/tools/calculator.py" \
|
||||
--num-seeds 10 \
|
||||
--start-seed 1 \
|
||||
--workers 10 \
|
||||
--slurm-gpus-per-task 1 \
|
||||
--slurm-ntasks 1 \
|
||||
--slurm-total-cpus 8 \
|
||||
--slurm-template-path benchmark/trl.slurm_template
|
||||
```
|
||||
|
||||
We can then use [`openrlbenchmark`](https://github.com/openrlbenchmark/openrlbenchmark) which generates the following plot.
|
||||
```
|
||||
# pip install openrlbenchmark==0.2.1a5
|
||||
python -m openrlbenchmark.rlops_multi_metrics \
|
||||
--filters '?we=openrlbenchmark&wpn=trl&xaxis=_step&ceik=trl_ppo_trainer_config.value.tracker_project_name&cen=trl_ppo_trainer_config.value.log_with&metrics=env/reward_mean&metrics=objective/kl' \
|
||||
'wandb?tag=calculator_final&cl=calculator_mask' \
|
||||
--env-ids trl \
|
||||
--check-empty-runs \
|
||||
--pc.ncols 2 \
|
||||
--pc.ncols-legend 1 \
|
||||
--output-filename static/0compare \
|
||||
--scan-history
|
||||
```
|
||||
|
||||

|
||||
|
||||
As we can see, while one or two experiments crashed for some reason, most of the runs achieved near-perfect proficiency in the calculator task.
|
||||
|
||||
|
||||
## (Early Experiments 🧪): learning to use a wiki tool for question answering
|
||||
|
||||
The [ToolFormer](https://huggingface.co/papers/2302.04761) paper shows an interesting use case that utilizes a Wikipedia search tool to help answer questions. In this section, we attempt to perform similar experiments but use RL instead to teach the model to use a wiki tool on the [TriviaQA](https://nlp.cs.washington.edu/triviaqa/) dataset.
|
||||
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
**Note that many settings are different so the results are not directly comparable.**
|
||||
</Tip>
|
||||
|
||||
|
||||
|
||||
|
||||
### Building a search index
|
||||
|
||||
Since [ToolFormer](https://huggingface.co/papers/2302.04761) did not open-source its code, we needed to first replicate the search index. The paper mentions that the authors built the search index using a BM25 retriever that indexes the Wikipedia dump from [KILT](https://github.com/facebookresearch/KILT).
|
||||
|
||||
Fortunately, [`pyserini`](https://github.com/castorini/pyserini) already implements the BM25 retriever and provides a prebuilt index for the KILT Wikipedia dump. We can use the following code to search the index.
|
||||
|
||||
```python
|
||||
from pyserini.search.lucene import LuceneSearcher
|
||||
import json
|
||||
searcher = LuceneSearcher.from_prebuilt_index('wikipedia-kilt-doc')
|
||||
def search(query):
|
||||
hits = searcher.search(query, k=1)
|
||||
hit = hits[0]
|
||||
contents = json.loads(hit.raw)['contents']
|
||||
return contents
|
||||
print(search("tennis racket"))
|
||||
```
|
||||
```
|
||||
Racket (sports equipment)
|
||||
A racket or racquet is a sports implement consisting of a handled frame with an open hoop across which a network of strings or catgut is stretched tightly. It is used for striking a ball or shuttlecock in games such as squash, tennis, racquetball, and badminton. Collectively, these games are known as racket sports. Racket design and manufacturing has changed considerably over the centuries.
|
||||
|
||||
The frame of rackets for all sports was traditionally made of solid wood (later laminated wood) and the strings of animal intestine known as catgut. The traditional racket size was limited by the strength and weight of the wooden frame which had to be strong enough to hold the strings and stiff enough to hit the ball or shuttle. Manufacturers started adding non-wood laminates to wood rackets to improve stiffness. Non-wood rackets were made first of steel, then of aluminum, and then carbon fiber composites. Wood is still used for real tennis, rackets, and xare. Most rackets are now made of composite materials including carbon fiber or fiberglass, metals such as titanium alloys, or ceramics.
|
||||
...
|
||||
```
|
||||
|
||||
We then deployed this snippet as a Hugging Face Space [here](https://huggingface.co/spaces/vwxyzjn/pyserini-wikipedia-kilt-doc), so that we can use the Space as a `transformers.Tool` later.
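If you want to try the deployed Space as a tool yourself, something along the following lines should work. This is a sketch only: it assumes a `transformers` version that still ships the agents-era `load_tool` helper (the same one used in the calculator example above).

```python
from transformers import load_tool

# load the pyserini-wikipedia-kilt-doc Space as a tool, as in the calculator example above
wiki_tool = load_tool("vwxyzjn/pyserini-wikipedia-kilt-doc")

# query it like a regular function and print the first few hundred characters
print(wiki_tool("tennis racket")[:400])
```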
|
||||
|
||||

|
||||
|
||||
### Experiment settings
|
||||
|
||||
We use the following settings:
|
||||
|
||||
* use the `bigcode/starcoderbase` model as the base model
|
||||
* use the `pyserini-wikipedia-kilt-doc` space as the wiki tool and only use the first paragraph of the search result, allowing the `TextEnvironment` to obtain at most `max_tool_reponse=400` response tokens from the tool.
|
||||
* test whether the response contains the answer string; if so, give a reward of 1, otherwise give a reward of 0 (a minimal sketch of this check appears after the prompt below).
|
||||
* note that this is a simplified evaluation criterion. In [ToolFormer](https://huggingface.co/papers/2302.04761), the authors check whether the first 20 words of the response contain the correct answer.
|
||||
* use the following prompt that demonstrates the usage of the wiki tool.
|
||||
```python
|
||||
prompt = """\
|
||||
Answer the following question:
|
||||
|
||||
Q: In which branch of the arts is Patricia Neary famous?
|
||||
A: Ballets
|
||||
A2: <request><Wiki>Patricia Neary<call>Patricia Neary (born October 27, 1942) is an American ballerina, choreographer and ballet director, who has been particularly active in Switzerland. She has also been a highly successful ambassador for the Balanchine Trust, bringing George Balanchine's ballets to 60 cities around the globe.<response>
|
||||
Result=Ballets<submit>
|
||||
|
||||
Q: Who won Super Bowl XX?
|
||||
A: Chicago Bears
|
||||
A2: <request><Wiki>Super Bowl XX<call>Super Bowl XX was an American football game between the National Football Conference (NFC) champion Chicago Bears and the American Football Conference (AFC) champion New England Patriots to decide the National Football League (NFL) champion for the 1985 season. The Bears defeated the Patriots by the score of 46–10, capturing their first NFL championship (and Chicago's first overall sports victory) since 1963, three years prior to the birth of the Super Bowl. Super Bowl XX was played on January 26, 1986 at the Louisiana Superdome in New Orleans.<response>
|
||||
Result=Chicago Bears<submit>
|
||||
|
||||
Q: """
|
||||
```
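As referenced in the reward bullet above, here is a minimal sketch of the simplified reward check. The helper name is hypothetical and this is not the exact code used in `triviaqa.py`.

```python
def exact_match_reward(response: str, answer: str) -> float:
    """Return 1.0 if the gold answer string appears anywhere in the response, else 0.0."""
    return 1.0 if answer.lower() in response.lower() else 0.0


# the response contains the gold answer, so the reward is 1.0
print(exact_match_reward("A2: ... Result=Chicago Bears<submit>", "Chicago Bears"))
```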
|
||||
|
||||
|
||||
### Result and Discussion
|
||||
|
||||
|
||||
Our experiments show that the agent can learn to use the wiki tool to answer questions. The learning curves mostly go up, but one of the experiments did crash.
|
||||
|
||||

|
||||
|
||||
The Weights & Biases report is available [here](https://wandb.ai/costa-huang/cleanRL/reports/TriviaQA-Final-Experiments--Vmlldzo1MjY0ODk5) for further inspection.
|
||||
|
||||
|
||||
Note that the correct rate of the trained model is on the low end, which could be due to the following reasons:
|
||||
|
||||
* **incorrect searches:** When given the question `"What is Bruce Willis' real first name?"`, if the model searches for `Bruce Willis`, our wiki tool returns "Patrick Poivey (born 18 February 1948) is a French actor. He is especially known for his voice: he is the French dub voice of Bruce Willis since 1988." But a correct search should return "Walter Bruce Willis (born March 19, 1955) is an American former actor. He achieved fame with a leading role on the comedy-drama series Moonlighting (1985–1989) and appeared in over a hundred films, gaining recognition as an action hero after his portrayal of John McClane in the Die Hard franchise (1988–2013) and other roles."
|
||||
|
||||
|
||||

|
||||
|
||||
* **unnecessarily long responses**: By default, the wiki tool sometimes outputs very long sequences. E.g., when the wiki tool searches for "Brown Act":
|
||||
* Our wiki tool returns "The Ralph M. Brown Act, located at California Government Code 54950 "et seq.", is an act of the California State Legislature, authored by Assemblymember Ralph M. Brown and passed in 1953, that guarantees the public's right to attend and participate in meetings of local legislative bodies."
|
||||
* [ToolFormer](https://huggingface.co/papers/2302.04761)'s wiki tool returns "The Ralph M. Brown Act is an act of the California State Legislature that guarantees the public's right to attend and participate in meetings of local legislative bodies.", which is more succinct.
|
||||
|
||||

|
||||
|
||||
|
||||
## (Early Experiments 🧪): solving math puzzles with python interpreter
|
||||
|
||||
In this section, we attempt to teach the model to use a Python interpreter to solve math puzzles. The rough idea is to give the agent a prompt like the following:
|
||||
|
||||
```python
|
||||
prompt = """\
|
||||
Example of using a Python API to solve math questions.
|
||||
|
||||
Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?
|
||||
|
||||
<request><PythonInterpreter>
|
||||
def solution():
|
||||
money_initial = 23
|
||||
bagels = 5
|
||||
bagel_cost = 3
|
||||
money_spent = bagels * bagel_cost
|
||||
money_left = money_initial - money_spent
|
||||
result = money_left
|
||||
return result
|
||||
print(solution())
|
||||
<call>72<response>
|
||||
|
||||
Result = 72 <submit>
|
||||
|
||||
Q: """
|
||||
```
|
||||
|
||||
|
||||
The training experiment can be found at https://wandb.ai/lvwerra/trl-gsm8k/runs/a5odv01y.
|
||||
|
||||

|
@ -4,4 +4,29 @@
|
||||
|
||||
Section under construction. Feel free to contribute!
|
||||
|
||||
</Tip>
|
||||
</Tip>
|
||||
|
||||
[Liger Kernel](https://github.com/linkedin/Liger-Kernel) is a collection of Triton kernels designed specifically for LLM training. It can effectively increase multi-GPU training throughput by 20% and reduce memory usage by 60%. That way, we can **4x** our context length, as described in the benchmark below. They have implemented Hugging Face compatible `RMSNorm`, `RoPE`, `SwiGLU`, `CrossEntropy`, `FusedLinearCrossEntropy`, with more to come. The kernel works out of the box with [FlashAttention](https://github.com/Dao-AILab/flash-attention), [PyTorch FSDP](https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html), and [Microsoft DeepSpeed](https://github.com/microsoft/DeepSpeed).
|
||||
|
||||
With this memory reduction, you can potentially turn off `cpu_offloading` or gradient checkpointing to further boost the performance.
|
||||
|
||||
| Speed Up | Memory Reduction |
|
||||
|--------------------------|-------------------------|
|
||||
|  |  |
|
||||
|
||||
1. To use Liger-Kernel in [`SFTTrainer`], first install it by:
|
||||
|
||||
```bash
|
||||
pip install liger-kernel
|
||||
```
|
||||
|
||||
2. Once installed, set `use_liger_kernel` in [`SFTConfig`]. No other changes are needed!
|
||||
|
||||
```python
|
||||
training_args = SFTConfig(
|
||||
use_liger_kernel=True,
|
||||
...
|
||||
)
|
||||
```
|
||||
|
||||
To learn more about Liger-Kernel, visit their [official repository](https://github.com/linkedin/Liger-Kernel/).
|
||||
|
@ -1,74 +1,106 @@
|
||||
# Logging
|
||||
|
||||
As reinforcement learning algorithms are historically challenging to debug, it's important to pay careful attention to logging.
|
||||
By default, the TRL [`PPOTrainer`] saves a lot of relevant information to wandb or tensorboard.
|
||||
By default, TRL trainers like [`PPOTrainer`] and [`GRPOTrainer`] save a lot of relevant information to supported experiment trackers like Trackio, Weights & Biases (wandb) or TensorBoard.
|
||||
|
||||
Upon initialization, pass one of these two options to the [`PPOConfig`]:
|
||||
Upon initialization, pass the `report_to` argument to the respective configuration object (e.g., [`PPOConfig`] for `PPOTrainer`, or [`GRPOConfig`] for `GRPOTrainer`):
|
||||
|
||||
```
|
||||
training_args = PPOConfig(..., report_to="wandb") # or "tensorboard"
|
||||
```python
|
||||
# For PPOTrainer
|
||||
ppo_config = PPOConfig(
|
||||
# ...,
|
||||
report_to="trackio" # or "wandb" or "tensorboard"
|
||||
)
|
||||
|
||||
# For GRPOTrainer
|
||||
grpo_config = GRPOConfig(
|
||||
# ...,
|
||||
report_to="trackio" # or "wandb" or "tensorboard"
|
||||
)
|
||||
```
|
||||
|
||||
If you want to log with tensorboard, add the kwarg `project_kwargs={"logging_dir": PATH_TO_LOGS}` to the PPOConfig.
|
||||
If you want to log with TensorBoard, you might also need to specify logging directories, for example, by adding `logging_dir=PATH_TO_LOGS` to the configuration object (e.g., `PPOConfig` or `GRPOConfig`).
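For example, a minimal sketch using `GRPOConfig` (the directory names are placeholders; `logging_dir` is inherited from `transformers.TrainingArguments`, which the TRL configs build on):

```python
from trl import GRPOConfig

training_args = GRPOConfig(
    output_dir="Qwen2-0.5B-GRPO",      # placeholder output directory
    report_to="tensorboard",
    logging_dir="./tensorboard_logs",  # where the TensorBoard event files are written
)
```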
|
||||
|
||||
## PPO Logging
|
||||
|
||||
Here's a brief explanation for the logged metrics provided in the data:
|
||||
|
||||
Key metrics to monitor. We want to maximize the reward, maintain a low KL divergence, and maximize entropy:
|
||||
1. `env/reward_mean`: The average reward obtained from the environment. Alias `ppo/mean_scores`, which is used specifically to monitor the reward model.
|
||||
1. `env/reward_std`: The standard deviation of the reward obtained from the environment. Alias `ppo/std_scores`, which is used specifically to monitor the reward model.
|
||||
1. `env/reward_dist`: The histogram distribution of the reward obtained from the environment.
|
||||
1. `objective/kl`: The mean Kullback-Leibler (KL) divergence between the old and new policies. It measures how much the new policy deviates from the old policy. The KL divergence is used to compute the KL penalty in the objective function.
|
||||
1. `objective/kl_dist`: The histogram distribution of the `objective/kl`.
|
||||
1. `objective/kl_coef`: The coefficient for Kullback-Leibler (KL) divergence in the objective function.
|
||||
1. `ppo/mean_non_score_reward`: The **KL penalty** calculated by `objective/kl * objective/kl_coef` as the total reward for optimization to prevent the new policy from deviating too far from the old policy.
|
||||
1. `objective/entropy`: The entropy of the model's policy, calculated by `-logprobs.sum(-1).mean()`. High entropy means the model's actions are more random, which can be beneficial for exploration.
|
||||
|
||||
Training stats:
|
||||
1. `ppo/learning_rate`: The learning rate for the PPO algorithm.
|
||||
1. `ppo/policy/entropy`: The entropy of the model's policy, calculated by `pd = torch.nn.functional.softmax(logits, dim=-1); entropy = torch.logsumexp(logits, dim=-1) - torch.sum(pd * logits, dim=-1)`. It measures the randomness of the policy.
|
||||
1. `ppo/policy/clipfrac`: The fraction of probability ratios (old policy / new policy) that fell outside the clipping range in the PPO objective. This can be used to monitor the optimization process.
|
||||
1. `ppo/policy/approxkl`: The approximate KL divergence between the old and new policies, measured by `0.5 * masked_mean((logprobs - old_logprobs) ** 2, mask)`, corresponding to the `k2` estimator in http://joschu.net/blog/kl-approx.html
|
||||
1. `ppo/policy/policykl`: Similar to `ppo/policy/approxkl`, but measured by `masked_mean(old_logprobs - logprobs, mask)`, corresponding to the `k1` estimator in http://joschu.net/blog/kl-approx.html
|
||||
1. `ppo/policy/ratio`: The histogram distribution of the ratio between the new and old policies, used to compute the PPO objective.
|
||||
1. `ppo/policy/advantages_mean`: The average of the GAE (Generalized Advantage Estimation) advantage estimates. The advantage function measures how much better an action is compared to the average action at a state.
|
||||
1. `ppo/policy/advantages`: The histogram distribution of `ppo/policy/advantages_mean`.
|
||||
1. `ppo/returns/mean`: The mean of the TD(λ) returns, calculated by `returns = advantage + values`, another indicator of model performance. See https://iclr-blog-track.github.io/2022/03/25/ppo-implementation-details/ for more details.
|
||||
1. `ppo/returns/var`: The variance of the TD(λ) returns, calculated by `returns = advantage + values`, another indicator of model performance.
|
||||
1. `ppo/val/mean`: The mean of the values, used to monitor the value function's performance.
|
||||
1. `ppo/val/var` : The variance of the values, used to monitor the value function's performance.
|
||||
1. `ppo/val/var_explained`: The explained variance for the value function, used to monitor the value function's performance.
|
||||
1. `ppo/val/clipfrac`: The fraction of the value function's predicted values that are clipped.
|
||||
1. `ppo/val/vpred`: The predicted values from the value function.
|
||||
1. `ppo/val/error`: The mean squared error between the `ppo/val/vpred` and returns, used to monitor the value function's performance.
|
||||
1. `ppo/loss/policy`: The policy loss for the Proximal Policy Optimization (PPO) algorithm.
|
||||
1. `ppo/loss/value`: The loss for the value function in the PPO algorithm. This value quantifies how well the function estimates the expected future rewards.
|
||||
1. `ppo/loss/total`: The total loss for the PPO algorithm. It is the sum of the policy loss and the value function loss.
|
||||
|
||||
|
||||
Stats on queries, responses, and logprobs:
|
||||
1. `tokens/queries_len_mean`: The average length of the queries tokens.
|
||||
1. `tokens/queries_len_std`: The standard deviation of the length of the queries tokens.
|
||||
1. `tokens/queries_dist`: The histogram distribution of the length of the queries tokens.
|
||||
1. `tokens/responses_len_mean`: The average length of the responses tokens.
|
||||
1. `tokens/responses_len_std`: The standard deviation of the length of the responses tokens.
|
||||
1. `tokens/responses_dist`: The histogram distribution of the length of the responses tokens. (Costa: inconsistent naming, should be `tokens/responses_len_dist`)
|
||||
1. `objective/logprobs`: The histogram distribution of the log probabilities of the actions taken by the model.
|
||||
1. `objective/ref_logprobs`: The histogram distribution of the log probabilities of the actions taken by the reference model.
|
||||
|
||||
|
||||
* `eps`: Tracks the number of episodes per second.
|
||||
* `objective/kl`: The mean Kullback-Leibler (KL) divergence between the current policy and reference policy.
|
||||
* `objective/entropy`: The mean entropy of the policy, indicating the randomness of the actions chosen by the policy.
|
||||
* `objective/non_score_reward`: The mean reward from non-score-related sources, basically `beta * kl.sum(1)`, where `beta` is the KL penalty coefficient and `kl` is the per-token KL divergence.
|
||||
* `objective/rlhf_reward`: The mean RLHF reward, which is `score - non_score_reward`.
|
||||
* `objective/scores`: The mean scores returned by the reward model / environment.
|
||||
* `policy/approxkl_avg`: The average approximate KL divergence between consecutive PPO policies. Note that this is not the same as `objective/kl`.
|
||||
* `policy/clipfrac_avg`: The average fraction of policy updates that are clipped, indicating how often the policy updates are constrained to prevent large changes.
|
||||
* `loss/policy_avg`: The average policy loss, indicating how well the policy is performing.
|
||||
* `loss/value_avg`: The average value loss, indicating the difference between the predicted value and the actual reward.
|
||||
* `val/clipfrac_avg`: The average fraction of value function updates that are clipped, similar to `policy/clipfrac_avg` but for the value function.
|
||||
* `policy/entropy_avg`: The average entropy of the policy during training, indicating how diverse the policy's actions are.
|
||||
* `val/ratio`: The mean ratio of the current policy probability to the old policy probability, providing a measure of how much the policy has changed.
|
||||
* `val/ratio_var`: The variance of the `val/ratio`, indicating the variability in policy changes.
|
||||
* `val/num_eos_tokens`: The number of end-of-sequence (EOS) tokens generated, which can indicate the number of complete responses.
|
||||
* `lr`: The current learning rate used by the optimizer.
|
||||
* `episode`: The current episode count in the training process.
|
||||
|
||||
### Crucial values
|
||||
During training, many values are logged, here are the most important ones:
|
||||
|
||||
1. `env/reward_mean`,`env/reward_std`, `env/reward_dist`: the properties of the reward distribution from the "environment" / reward model
|
||||
1. `ppo/mean_non_score_reward`: The mean negated KL penalty during training (shows the delta between the reference model and the new policy over the batch in the step)
|
||||
1. `objective/scores`: The mean scores returned by the reward model / environment.
|
||||
1. `objective/rlhf_reward`: The mean RLHF reward. This is the ultimate objective of the RLHF training. If training works as intended, this metric should keep going up.
|
||||
1. `objective/non_score_reward`: The mean reward from non-score-related sources (e.g., KL penalty).
|
||||
|
||||
Here are some parameters that are useful to monitor for stability (when these diverge or collapse to 0, try tuning variables):
|
||||
|
||||
1. `ppo/loss/value`: it will spike / NaN when not going well.
|
||||
1. `ppo/policy/ratio`: `ratio` being 1 is a baseline value, meaning that the probability of sampling a token is the same under the new and old policy. If the ratio is too high like 200, it means the probability of sampling a token is 200 times higher under the new policy than the old policy. This is a sign that the new policy is too different from the old policy, which will likely cause overoptimization and collapse training later on.
|
||||
1. `ppo/policy/clipfrac` and `ppo/policy/approxkl`: if `ratio` is too high, the `ratio` is going to get clipped, resulting in high `clipfrac` and high `approxkl` as well.
|
||||
1. `objective/kl`: it should stay positive so that the policy is not too far away from the reference policy.
|
||||
1. `objective/kl_coef`: The target coefficient with [`AdaptiveKLController`]. Often increases before numerical instabilities.
|
||||
1. `loss/value_avg`: The average value loss. It will spike / NaN when not going well.
|
||||
1. `val/ratio`: The mean ratio of the current policy probability to the old policy probability. This number should float around 1.0. If this `ratio` is too high (e.g., 2.0 or 1000.0) or too small (e.g., 0.1), it means the updates between consecutive policies are too drastic.
|
||||
1. `policy/clipfrac_avg` and `policy/approxkl_avg`: If `val/ratio` is too high, the `ratio` is going to get clipped, resulting in high `policy/clipfrac_avg` and high `policy/approxkl_avg` as well.
|
||||
1. `objective/kl`: The mean KL divergence. It should stay positive and ideally not too large, so that the policy is not too far away from the reference policy.
|
||||
|
||||
## GRPO Logging
|
||||
|
||||
Here's a brief explanation for the logged metrics provided in the data for the GRPO trainer:
|
||||
|
||||
* `num_tokens`: Total number of input tokens processed during training so far.
|
||||
|
||||
#### Completions
|
||||
|
||||
* `completions/mean_length`: Mean length of all generated completions (including those not ending with an EOS token).
|
||||
* `completions/min_length`: Minimum length among all generated completions.
|
||||
* `completions/max_length`: Maximum length among all generated completions.
|
||||
* `completions/clipped_ratio`: The ratio of completions that did not end with an EOS token before reaching the maximum generation length (i.e., they were truncated).
|
||||
* `completions/mean_terminated_length`: Mean length of only those completions that successfully ended with an EOS token.
|
||||
* `completions/min_terminated_length`: Minimum length among completions that ended with an EOS token.
|
||||
* `completions/max_terminated_length`: Maximum length among completions that ended with an EOS token.
|
||||
|
||||
#### Rewards
|
||||
|
||||
* `rewards/{reward_func_name}/mean`: The mean reward obtained from a specific, named reward function (e.g., `rewards/my_custom_reward/mean`). This is logged for each reward function used.
|
||||
* `rewards/{reward_func_name}/std`: The standard deviation of rewards from a specific, named reward function.
|
||||
* `reward`: The overall mean of the (potentially weighted and, if `args.scale_rewards` is true, normalized) rewards, after group-wise normalization (advantages). A short sketch of this group-wise normalization follows this list.
|
||||
* `reward_std`: The standard deviation of the (potentially weighted) rewards *before* group-wise normalization for advantages.
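As a rough illustration of the group-wise normalization mentioned above (a sketch under the standard GRPO formulation, not the trainer's exact implementation; the small epsilon is an assumption to avoid division by zero):

```python
import torch

# rewards for the num_generations completions sampled for a single prompt (one group)
group_rewards = torch.tensor([0.2, 1.0, 0.5, 0.9])

# group-wise normalization: center by the group mean and, when rewards are scaled,
# divide by the group standard deviation to obtain the advantages
advantages = (group_rewards - group_rewards.mean()) / (group_rewards.std() + 1e-4)
```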
|
||||
|
||||
#### Policy and Loss Metrics
|
||||
|
||||
* `kl`: The mean Kullback-Leibler (KL) divergence between the current policy and the reference policy. This is logged only if `beta` (the KL coefficient in `GRPOConfig`) is non-zero.
|
||||
* `entropy`: Average entropy of token predictions across generated completions.
|
||||
* If Liger GRPOLoss is used (`use_liger_loss: True` in `GRPOConfig`):
|
||||
* `clip_ratio`: The fraction of policy updates where the probability ratio was clipped according to the GRPO loss's epsilon bounds.
|
||||
* If standard GRPOLoss is used (`use_liger_loss: False`):
|
||||
* `clip_ratio/low_mean`: The mean fraction of instances where the probability ratio `r_t(θ)` was clipped at the lower bound `1 - epsilon_low` (occurs when advantage is negative and ratio is below the bound).
|
||||
* `clip_ratio/low_min`: The minimum observed fraction for `clip_ratio/low_mean` across batches/processes.
|
||||
* `clip_ratio/high_mean`: The mean fraction of instances where the probability ratio `r_t(θ)` was clipped at the upper bound `1 + epsilon_high` (occurs when advantage is positive and ratio is above the bound).
|
||||
* `clip_ratio/high_max`: The maximum observed fraction for `clip_ratio/high_mean` across batches/processes.
|
||||
* `clip_ratio/region_mean`: The mean fraction of instances where the probability ratio was clipped at either the lower or upper bound.
|
||||
|
||||
### Crucial GRPO values
|
||||
|
||||
During GRPO training, monitor these values for insights into performance and stability:
|
||||
|
||||
1. `reward`: This is the primary objective. It reflects the (group-wise normalized) rewards the policy is achieving. It should generally increase during successful training.
|
||||
1. `kl`: If `beta > 0`, this tracks the divergence from the reference model. Keep an eye on it to ensure the policy doesn't stray too far, which can lead to instability.
|
||||
1. `clip_ratio/*` (either `clip_ratio` for Liger loss or the more detailed `clip_ratio/...` metrics for standard loss): These indicate how often the policy updates are being constrained by the GRPO clipping mechanism. Very high values might suggest that the policy is trying to change too drastically (potentially due to large advantages or a learning rate that's too high) or that the epsilon clipping range is too restrictive.
|
||||
1. `completions/clipped_ratio`: A high ratio here indicates that the model is frequently generating completions that are cut off by `max_completion_length` rather than naturally ending with an EOS token. This might suggest issues with learning sequence termination or that `max_completion_length` is too short.
|
||||
1. `rewards/{reward_func_name}/mean`: Monitoring the mean of individual reward functions can help diagnose which aspects of the desired behavior the model is learning or struggling with, especially when using multiple reward sources.
|
||||
1. `entropy`: Measures how uncertain the policy is in its action choices; higher entropy suggests more exploration. A collapse in entropy means the policy is becoming overconfident and deterministic, often too early. This can stall learning by reducing exploration and making updates overly biased. Stable but non-zero entropy is usually a sign that the policy retains flexibility and continues to explore.
|
||||
|
||||
|
docs/source/model_utils.md (new file, 9 lines)
@ -0,0 +1,9 @@
|
||||
# Model Utilities
|
||||
|
||||
## clone_chat_template
|
||||
|
||||
[[autodoc]] clone_chat_template
|
||||
|
||||
## get_act_offloading_ctx_manager
|
||||
|
||||
[[autodoc]] models.get_act_offloading_ctx_manager
|
@ -36,7 +36,7 @@ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
|
||||
judge = PairRMJudge()
|
||||
train_dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train")
|
||||
|
||||
training_args = NashMDConfig(output_dir="Qwen2-0.5B-NashMD", logging_steps=10)
|
||||
training_args = NashMDConfig(output_dir="Qwen2-0.5B-NashMD")
|
||||
trainer = NashMDTrainer(
|
||||
model=model, judge=judge, args=training_args, processing_class=tokenizer, train_dataset=train_dataset
|
||||
)
|
||||
@ -51,9 +51,9 @@ accelerate launch train_nash_md.py
|
||||
|
||||
Distributed across 8 GPUs, the training takes approximately 3 hours.
|
||||
|
||||
To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-NashMD) performs, you can use the [TRL Chat CLI](clis#chat-interface).
|
||||
To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-NashMD) performs, you can use the [Transformers Chat CLI](https://huggingface.co/docs/transformers/quicktour#chat-with-text-generation-models).
|
||||
|
||||
<pre><code>$ trl chat --model_name_or_path trl-lib/Qwen2-0.5B-NashMD
|
||||
<pre><code>$ transformers chat trl-lib/Qwen2-0.5B-NashMD
|
||||
<strong><span style="color: red;"><quentin_gallouedec>:</span></strong>
|
||||
What is the best programming language?
|
||||
|
||||
@ -63,7 +63,7 @@ The best programming language depends on personal preference, the complexity of
|
||||
|
||||
## Expected dataset type
|
||||
|
||||
Nash-MD requires a [prompt-only dataset](dataset_formats#prompt-only). The [`NashMDTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset format. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
|
||||
Nash-MD requires a [prompt-only dataset](dataset_formats#prompt-only). The [`NashMDTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset formats. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
|
||||
|
||||
## Usage tips
|
||||
|
||||
@ -125,7 +125,6 @@ python examples/scripts/nash_md.py \
|
||||
--judge pair_rm \
|
||||
--dataset_name trl-lib/ultrafeedback-prompt \
|
||||
--learning_rate 5.0e-7 \
|
||||
--logging_steps 25 \
|
||||
--output_dir Qwen2.5-0.5B-NashMD-PairRM \
|
||||
--warmup_ratio 0.1 \
|
||||
--push_to_hub
|
||||
@ -133,7 +132,7 @@ python examples/scripts/nash_md.py \
|
||||
|
||||
## Logged metrics
|
||||
|
||||
The logged metrics are as follows:
|
||||
While training and evaluating, we record the following reward metrics:
|
||||
|
||||
* `loss/kl`: The mean KL divergence between the model and reference data.
|
||||
* `objective/entropy`: The mean entropy of the model and reference data.
|
||||
@ -153,6 +152,9 @@ The logged metrics are as follows:
|
||||
## NashMDTrainer
|
||||
|
||||
[[autodoc]] NashMDTrainer
|
||||
- train
|
||||
- save_model
|
||||
- push_to_hub
|
||||
|
||||
## NashMDConfig
|
||||
|
||||
|
@ -36,7 +36,7 @@ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
|
||||
judge = PairRMJudge()
|
||||
train_dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train")
|
||||
|
||||
training_args = OnlineDPOConfig(output_dir="Qwen2-0.5B-OnlineDPO", logging_steps=10)
|
||||
training_args = OnlineDPOConfig(output_dir="Qwen2-0.5B-OnlineDPO")
|
||||
trainer = OnlineDPOTrainer(
|
||||
model=model, judge=judge, args=training_args, processing_class=tokenizer, train_dataset=train_dataset
|
||||
)
|
||||
@ -53,9 +53,9 @@ Distributed across 8 GPUs, the training takes approximately 1 hour. You can veri
|
||||
|
||||

|
||||
|
||||
To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-OnlineDPO) performs, you can use the [TRL Chat CLI](clis#chat-interface).
|
||||
To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-OnlineDPO) performs, you can use the [Transformers Chat CLI](https://huggingface.co/docs/transformers/quicktour#chat-with-text-generation-models).
|
||||
|
||||
<pre><code>$ trl chat --model_name_or_path trl-lib/Qwen2-0.5B-OnlineDPO
|
||||
<pre><code>$ transformers chat trl-lib/Qwen2-0.5B-OnlineDPO
|
||||
<strong><span style="color: red;"><quentin_gallouedec>:</span></strong>
|
||||
What is the best programming language?
|
||||
|
||||
@ -65,7 +65,7 @@ The best programming language depends on your specific needs and priorities. Som
|
||||
|
||||
## Expected dataset type
|
||||
|
||||
Online DPO only requires a [prompt-only dataset](dataset_formats#prompt-only) (unlike offline DPO, that expects [preference dataset](dataset_formats#preference)). The [`OnlineDPOTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset format. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
|
||||
Online DPO only requires a [prompt-only dataset](dataset_formats#prompt-only) (unlike offline DPO, which expects a [preference dataset](dataset_formats#preference)). The [`OnlineDPOTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset formats. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
|
||||
|
||||
## Usage tips
|
||||
|
||||
@ -125,7 +125,6 @@ python examples/scripts/dpo_online.py \
|
||||
--judge pair_rm \
|
||||
--dataset_name trl-lib/ultrafeedback-prompt \
|
||||
--learning_rate 5.0e-7 \
|
||||
--logging_steps 25 \
|
||||
--output_dir Qwen2.5-0.5B-Online-DPO-PairRM \
|
||||
--warmup_ratio 0.1 \
|
||||
--push_to_hub
|
||||
@ -133,7 +132,7 @@ python examples/scripts/dpo_online.py \
|
||||
|
||||
## Logged metrics
|
||||
|
||||
The logged metrics are as follows. Here is an example [tracked run at Weights and Biases](https://wandb.ai/huggingface/trl/runs/w4apmsi9)
|
||||
While training and evaluating, we record the following reward metrics. Here is an example [tracked run at Weights and Biases](https://wandb.ai/huggingface/trl/runs/w4apmsi9)
|
||||
|
||||
* `objective/kl`: The mean Kullback-Leibler (KL) divergence between the current model and reference model.
|
||||
* `objective/entropy`: The mean entropy of the model, indicating the randomness of the actions chosen by the model.
|
||||
@ -171,7 +170,6 @@ accelerate launch --config_file examples/accelerate_configs/multi_gpu.yaml \
|
||||
--max_new_tokens 53 \
|
||||
--warmup_ratio 0.1 \
|
||||
--missing_eos_penalty 1.0 \
|
||||
--logging_steps 20 \
|
||||
--save_steps 0.1 \
|
||||
--push_to_hub
|
||||
|
||||
@ -190,8 +188,6 @@ accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml
|
||||
--max_new_tokens 53 \
|
||||
--warmup_ratio 0.1 \
|
||||
--missing_eos_penalty 1.0 \
|
||||
--bf16 \
|
||||
--logging_steps 20 \
|
||||
--save_steps 0.1 \
|
||||
--push_to_hub
|
||||
|
||||
@ -210,9 +206,7 @@ accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml
|
||||
--max_new_tokens 53 \
|
||||
--warmup_ratio 0.1 \
|
||||
--missing_eos_penalty 1.0 \
|
||||
--bf16 \
|
||||
--gradient_checkpointing \
|
||||
--logging_steps 20 \
|
||||
--save_steps 0.1 \
|
||||
--push_to_hub
|
||||
```
|
||||
@ -272,6 +266,9 @@ The online DPO checkpoint gets increasingly more win rate as we scale up the mod
|
||||
## OnlineDPOTrainer
|
||||
|
||||
[[autodoc]] OnlineDPOTrainer
|
||||
- train
|
||||
- save_model
|
||||
- push_to_hub
|
||||
|
||||
## OnlineDPOConfig
|
||||
|
||||
|
@ -41,7 +41,7 @@ model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
|
||||
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
|
||||
train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
|
||||
|
||||
training_args = ORPOConfig(output_dir="Qwen2-0.5B-ORPO", logging_steps=10)
|
||||
training_args = ORPOConfig(output_dir="Qwen2-0.5B-ORPO")
|
||||
trainer = ORPOTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset)
|
||||
trainer.train()
|
||||
```
|
||||
@ -56,9 +56,9 @@ Distributed across 8 GPUs, the training takes approximately 30 minutes. You can
|
||||
|
||||

|
||||
|
||||
To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-ORPO) performs, you can use the [TRL Chat CLI](clis#chat-interface).
|
||||
To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-ORPO) performs, you can use the [Transformers Chat CLI](https://huggingface.co/docs/transformers/quicktour#chat-with-text-generation-models).
|
||||
|
||||
<pre><code>$ trl chat --model_name_or_path trl-lib/Qwen2-0.5B-ORPO
|
||||
<pre><code>$ transformers chat trl-lib/Qwen2-0.5B-ORPO
|
||||
<strong><span style="color: red;"><quentin_gallouedec>:</span></strong>
|
||||
What is the best programming language?
|
||||
|
||||
@ -94,7 +94,6 @@ accelerate launch examples/scripts/orpo.py \
|
||||
--model_name_or_path Qwen/Qwen2-0.5B-Instruct \
|
||||
--dataset_name trl-lib/ultrafeedback_binarized \
|
||||
--num_train_epochs 1 \
|
||||
--logging_steps 25 \
|
||||
--output_dir Qwen2-0.5B-ORPO
|
||||
```
|
||||
|
||||
@ -110,7 +109,7 @@ To scale how much the auxiliary loss contributes to the total loss, use the hype
|
||||
|
||||
## Logged metrics
|
||||
|
||||
While training and evaluating we record the following reward metrics:
|
||||
While training and evaluating, we record the following reward metrics:
|
||||
|
||||
- `rewards/chosen`: the mean log probabilities of the policy model for the chosen responses scaled by beta
|
||||
- `rewards/rejected`: the mean log probabilities of the policy model for the rejected responses scaled by beta
|
||||
@ -123,6 +122,9 @@ While training and evaluating we record the following reward metrics:
|
||||
## ORPOTrainer
|
||||
|
||||
[[autodoc]] ORPOTrainer
|
||||
- train
|
||||
- save_model
|
||||
- push_to_hub
|
||||
|
||||
## ORPOConfig
|
||||
|
||||
|
docs/source/others.md (new file, 9 lines)
@ -0,0 +1,9 @@
|
||||
# Other
|
||||
|
||||
## profiling_decorator
|
||||
|
||||
[[autodoc]] extras.profiling.profiling_decorator
|
||||
|
||||
## profiling_context
|
||||
|
||||
[[autodoc]] extras.profiling.profiling_context
|
docs/source/paper_index.md (new file, 220 lines)
@ -0,0 +1,220 @@
|
||||
# Paper Index
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Section under construction. Feel free to contribute!
|
||||
|
||||
</Tip>
|
||||
|
||||
## Group Sequence Policy Optimization
|
||||
|
||||
**📜 Paper**: https://huggingface.co/papers/2507.18071
|
||||
|
||||
GSPO is a GRPO variant that computes importance sampling weights at the sequence level instead of per-token. To reproduce the paper's setting, use this configuration:
|
||||
|
||||
```python
|
||||
from trl import GRPOConfig
|
||||
|
||||
training_args = GRPOConfig(
|
||||
importance_sampling_level="sequence",
|
||||
loss_type="grpo",
|
||||
beta=0.0, # GSPO set KL regularization to zero: https://github.com/volcengine/verl/pull/2775#issuecomment-3131807306
|
||||
epsilon=3e-4, # GSPO paper (v2), section 5.1
|
||||
epsilon_high=4e-4, # GSPO paper (v2), section 5.1
|
||||
gradient_accumulation_steps=1,
|
||||
steps_per_generation=4, # partition rollout batch into 4 mini-batches. GSPO paper (v2), section 5.1. Must be 4 times gradient_accumulation_steps
|
||||
)
|
||||
```
|
||||
|
||||
Note that this method only has an effect when training goes slightly off-policy—for example, when `steps_per_generation > gradient_accumulation_steps` or `num_iterations > 1`. Otherwise, it is effectively equivalent to no modification.
|
||||
|
||||
### Policy ratio: GRPO vs. GSPO
|
||||
|
||||
In GSPO, the policy ratio is defined at the sequence-level. In other words, it is the ratio between the probability of the current policy generating a sequence over the old policy generating that same sequence.
|
||||
|
||||
The sequence likelihood is defined as:
|
||||
|
||||
$$
|
||||
\pi_\theta (o_i \mid q) = \prod_{t=1}^{|o_i|} \pi_\theta (o_{i,t} | q, o_{i, \lt t} ),
|
||||
$$
|
||||
|
||||
where \\( \pi_\theta \\) is the policy \\( \pi \\) with parameters \\( \theta \\), \\( o_i \\) is the \\( i \\)-th output sequence, \\( o_{i,t} \\) is the \\( t \\)-th token in this sequence, and \\( q \\) is the input query. The sequence likelihood ratio \\( s_i (\theta) \\) is defined as:
|
||||
|
||||
$$
|
||||
s_i (\theta) = \left(\frac{\pi_\theta (o_i | q)}{\pi_{\theta_{old}} (o_i | q)} \right)^{\frac{1}{|o_i|}}
|
||||
$$
|
||||
|
||||
The exponent \\( \frac{1}{|o_i|} \\) represents a sequence-length normalization, minimizing the influence of sequence length on the sequence likelihood. In other words, it computes the geometric mean of the per-token probability ratios, ensuring a fair comparison across sequences of varying lengths.
|
||||
|
||||
While GSPO defines the policy ratio at the sequence level, GRPO operates at the token level. Specifically, GRPO computes an importance ratio for each token in the sequence:
|
||||
|
||||
$$
|
||||
w_{i,t}(\theta) = \frac{\pi_\theta (o_{i,t} \mid q, o_{i,\lt t})}{\pi_{\theta_{\text{old}}} (o_{i,t} \mid q, o_{i,\lt t})}
|
||||
$$
|
||||
|
||||
This token-level ratio is then combined with a shared advantage \\( \hat{A}_i \\), and the GRPO objective clips and optimizes each token independently across the sequence.
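To make the contrast concrete, here is a small sketch computing both quantities from per-token log-probabilities (the tensors are illustrative only):

```python
import torch

# per-token log-probabilities of one completion under the current and old policies
logps_new = torch.tensor([-1.2, -0.8, -2.1])
logps_old = torch.tensor([-1.0, -0.9, -2.0])

# GRPO: one importance ratio per token, w_{i,t}(theta)
token_ratios = torch.exp(logps_new - logps_old)

# GSPO: a single sequence-level ratio s_i(theta), the geometric mean of the token ratios
seq_ratio = torch.exp((logps_new - logps_old).mean())
```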
|
||||
|
||||
## DAPO: An Open-Source LLM Reinforcement Learning System at Scale
|
||||
|
||||
**📜 Paper**: https://huggingface.co/papers/2503.14476
|
||||
|
||||
The DAPO algorithm includes 5 key components:
|
||||
|
||||
- Overlong Filtering
|
||||
- Clip-Higher
|
||||
- Soft Overlong Punishment
|
||||
- Token-level Loss
|
||||
- Dynamic Sampling (⚠️ Not supported in TRL)
|
||||
|
||||
To reproduce the paper's setting, use this configuration:
|
||||
|
||||
```python
|
||||
from trl import GRPOConfig, GRPOTrainer
|
||||
|
||||
training_args = GRPOConfig(
|
||||
# Overlong Filtering
|
||||
mask_truncated_completions=True,
|
||||
# Token-level Loss
|
||||
loss_type="dapo",
|
||||
# Clip-Higher
|
||||
epsilon_high=0.28, # DAPO paper: section 4.1
|
||||
epsilon=0.2, # DAPO paper: section 4.1
|
||||
# Other parameters used
|
||||
per_device_train_batch_size=512, # mini-batch size for training in the paper, DAPO paper: section 4.1
|
||||
num_generations=16, # number of sample responses in the paper, DAPO paper: section 4.1
|
||||
max_completion_length=20480, # maximum number of tokens for generation in the paper, DAPO paper: section 4.1
|
||||
beta=0.0 # section 2.3, DAPO paper
|
||||
|
||||
)
|
||||
# Soft Overlong Punishment
|
||||
sop_reward = get_soft_overlong_punishment(max_completion_len=20480, soft_punish_cache=4096) # DAPO paper: section 4.1
|
||||
trainer = GRPOTrainer(
|
||||
...,
|
||||
args=training_args,
|
||||
reward_funcs=[..., sop_reward],
|
||||
)
|
||||
```
|
||||
|
||||
## Dr. GRPO: Understanding R1-Zero-Like Training: A Critical Perspective
|
||||
|
||||
**📜 Paper**: https://huggingface.co/papers/2503.20783
|
||||
|
||||
A study of R1-Zero training identifies pretraining effects on RL performance and proposes Dr. GRPO to enhance token efficiency, achieving superior accuracy on AIME 2024. To reproduce the paper's setting, use this configuration:
|
||||
|
||||
```python
from trl import GRPOConfig

training_args = GRPOConfig(
    loss_type="dr_grpo",
    per_device_train_batch_size=1,  # train_batch_size_per_device in the Training section of the repository
    num_generations=8,  # num_samples in the Training section of the repository
    max_prompt_length=1024,  # prompt_max_length in the Training section of the repository
    max_completion_length=3000,  # generate_max_length in the Training section of the repository
    beta=0.0,  # beta in the Training section of the repository
)
```
|
||||
|
||||
## Direct Preference Optimization (DPO): Your Language Model is Secretly a Reward Model
|
||||
|
||||
**📜 Paper**: https://huggingface.co/papers/2305.18290
|
||||
|
||||
Direct Preference Optimization (DPO) fine-tunes language models more efficiently and with better performance compared to reinforcement learning from human feedback (RLHF), by directly optimizing policy training based on human preferences. To reproduce the paper's setting, use this configuration:
|
||||
|
||||
```python
from trl import DPOConfig

training_args = DPOConfig(
    loss_type="sigmoid",  # losses in Appendix B of the paper
    per_device_train_batch_size=64,  # batch size in Appendix B of the paper
    learning_rate=1e-6,  # learning rate in Appendix B of the paper
    beta=0.1,  # beta in Appendix B of the paper
)
```
|
||||
|
||||
## Back to Basics: Revisiting REINFORCE Style Optimization for Learning from Human Feedback in LLMs
|
||||
|
||||
**📜 Paper**: https://huggingface.co/papers/2402.14740
|
||||
|
||||
RLOO is a variant of REINFORCE that reduces variance by using leave-one-out baselines. It computes rewards by comparing each sample against the average of all other samples in the batch, providing more stable gradients than standard REINFORCE. To reproduce the paper's setting, use this configuration:
|
||||
|
||||
```python
from trl import RLOOConfig

training_args = RLOOConfig(
    per_device_train_batch_size=512,  # section C Training Detail of the paper
    steps_per_generation=2,  # section C Training Detail of the paper
    beta=0.03,  # section C Training Detail of the paper
    num_generations=2,  # the paper's experiments use num_generations={2,4}
    learning_rate=1e-6,  # section C Training Detail of the paper
)
```
|
||||
|
||||
## AlphaPO -- Reward shape matters for LLM alignment
|
||||
|
||||
**📜 Paper**: https://huggingface.co/papers/2501.03884
|
||||
|
||||
AlphaPO is a new Direct Alignment Algorithms (DAAs) method that leverages an alpha-parameter to help change the shape of the reward function beyond the standard log reward. AlphaPO helps maintain fine-grained control over likelihood displacement and over-optimization. To reproduce the paper's setting, use this configuration:
|
||||
|
||||
```python
from trl import CPOConfig

# Mistral-Instruct from Table 3 of the paper
training_args = CPOConfig(
    loss_type="alphapo",
    alpha=0.25,
    beta=2.5,
    simpo_gamma=0.1,
    learning_rate=7e-7,
    ...
)
```
|
||||
|
||||
## EMA Without the Lag: Bias-Corrected Iterate Averaging Schemes
|
||||
|
||||
**📜 Paper**: https://huggingface.co/papers/2508.00180
|
||||
|
||||
Bias-Corrected Exponential Moving Average (BEMA) improves the stability and efficiency of language model fine-tuning by reducing stochasticity and eliminating bias. To use BEMA with SFT as described in the paper, you can use the [`BEMACallback`]:
|
||||
|
||||
```python
from trl import BEMACallback, SFTTrainer

trainer = SFTTrainer(
    ...,
    callbacks=[BEMACallback()],
)
```
|
||||
|
||||
## Part I: Tricks or Traps? A Deep Dive into RL for LLM Reasoning (Lite PPO)
|
||||
|
||||
**📜 Paper**: https://huggingface.co/papers/2508.08221
|
||||
|
||||
The authors of this paper find that the combination of:
|
||||
|
||||
1. scaling rewards by the standard deviation computed over the entire batch and
|
||||
2. aggregating loss over the total number of tokens
|
||||
|
||||
can unlock the learning capability of critic-free policies using vanilla PPO loss. Their results demonstrate that this simple combination consistently improves performance, surpassing strategies like GRPO and [DAPO](https://huggingface.co/papers/2503.14476).
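
The first ingredient can be illustrated with a small sketch (toy numbers, and only an illustration of the idea rather than the exact TRL implementation): rewards are still centered per prompt group, but scaled by the standard deviation of the whole batch.

```python
import torch

# Toy rewards for 2 prompts x 4 completions each
rewards = torch.tensor([[0.1, 0.9, 0.4, 0.6],
                        [0.2, 0.3, 0.2, 0.5]])

group_mean = rewards.mean(dim=1, keepdim=True)  # per-prompt mean
batch_std = rewards.std()                        # std over the entire batch, not per group

advantages = (rewards - group_mean) / (batch_std + 1e-4)
```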
|
||||
|
||||
TRL supports using these learnings to train a GRPO model by:
|
||||
|
||||
```python
from trl import GRPOConfig

training_args = GRPOConfig(
    ...,
    scale_rewards="batch",
    loss_type="dapo",
    # Other parameters used
    beta=0.0,  # = init_kl_coef in the paper
    top_p=0.99,
    top_k=100,
    temperature=0.99,
    num_generations=8,  # = num_return_sequences in the paper
    num_iterations=1,  # = ppo_epochs in the paper
    per_device_train_batch_size=4,
    gradient_accumulation_steps=32,
    steps_per_generation=8,  # (rollout_batch_size*num_return_sequences) / (per_device_train_batch_size*gradient_accumulation_steps)
)
```
|
||||
|
||||
Note that when using gradient accumulation, the loss is aggregated over the total number of tokens in each batch, but not across the accumulated batches. For more details, see the [GRPO Trainer - Loss types](grpo_trainer#loss_types).
|
@ -1,6 +1,6 @@
|
||||
# Examples of using peft with trl to finetune 8-bit models with Low Rank Adaption (LoRA)
|
||||
|
||||
The notebooks and scripts in these examples show how to use Low Rank Adaptation (LoRA) to fine-tune models in a memory efficient manner. Most of the PEFT methods supported in the peft library are available, but note that some methods, such as prompt tuning, are not supported.
|
||||
For more information on LoRA, see the [original paper](https://huggingface.co/papers/2106.09685).
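
As a rough sketch of the pattern these examples follow (the model, dataset, and LoRA hyperparameters below are placeholders, and quantized loading is omitted), LoRA is enabled by passing a `peft_config` to the trainer:

```python
from datasets import load_dataset
from peft import LoraConfig
from trl import SFTConfig, SFTTrainer

trainer = SFTTrainer(
    model="Qwen/Qwen2.5-0.5B",  # placeholder model
    args=SFTConfig(output_dir="sft-lora"),
    train_dataset=load_dataset("trl-lib/Capybara", split="train"),
    peft_config=LoraConfig(r=16, lora_alpha=32, lora_dropout=0.05),  # illustrative values
)
trainer.train()
```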
|
||||
|
||||
Here's an overview of the `peft`-enabled notebooks and scripts in the [trl repository](https://github.com/huggingface/trl/tree/main/examples):
|
||||
|
@ -26,6 +26,8 @@ python examples/scripts/ppo/ppo.py \
|
||||
--gradient_accumulation_steps 1 \
|
||||
--total_episodes 10000 \
|
||||
--model_name_or_path EleutherAI/pythia-1b-deduped \
|
||||
--sft_model_path EleutherAI/pythia-1b-deduped \
|
||||
--reward_model_path EleutherAI/pythia-1b-deduped \
|
||||
--missing_eos_penalty 1.0
|
||||
```
|
||||
|
||||
@ -50,13 +52,13 @@ The logged metrics are as follows. Here is an example [tracked run at Weights an
|
||||
* `val/ratio_var`: The variance of the `val/ratio`, indicating the variability in policy changes.
|
||||
* `val/num_eos_tokens`: The number of end-of-sequence (EOS) tokens generated, which can indicate the number of complete responses.
|
||||
* `lr`: The current learning rate used by the optimizer.
|
||||
* `episode`: The current episode count in the training process.
|
||||
|
||||
|
||||
## Cookbook
|
||||
|
||||
* Debugging TIP: `objective/rlhf_reward`: this is the ultimate objective of the RLHF training. If training works as intended, this metric should keep going up.
|
||||
* Debugging TIP: `val/ratio`: this number should float around 1.0, and it gets clipped by `--cliprange 0.2` with PPO's surrogate loss. So if this `ratio` is too high, like 2.0 or 1000.0, or too small, like 0.1, it means the updates between consecutive policies are too drastic. You should try to understand why this is happening and try to fix it.
|
||||
* Memory TIP: If you are running out of memory, you can try to reduce the `--per_device_train_batch_size` or increase the `--gradient_accumulation_steps` to reduce the memory footprint.
|
||||
* Memory TIP: If you have multiple GPUs, you can also run training with DeepSpeed stage 3 to reduce the memory footprint `accelerate launch --config_file examples/accelerate_configs/deepspeed_zero3.yaml`.
|
||||
* Usage TIP: We recommend using the "EOS trick" via `--missing_eos_penalty`, which subtracts a static scalar penalty from the score of completions that do not end with an EOS token. This can help the model learn to generate more coherent completions.
|
||||
@ -231,7 +233,10 @@ python -m openrlbenchmark.rlops_multi_metrics \
|
||||
## PPOTrainer
|
||||
|
||||
[[autodoc]] PPOTrainer
|
||||
- train
|
||||
- save_model
|
||||
- push_to_hub
|
||||
|
||||
## PPOConfig
|
||||
|
||||
[[autodoc]] PPOConfig
|
||||
|
@ -42,7 +42,7 @@ model = AutoModelForTokenClassification.from_pretrained("Qwen/Qwen2-0.5B", num_l
|
||||
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")
|
||||
train_dataset = load_dataset("trl-lib/math_shepherd", split="train[:10%]")
|
||||
|
||||
training_args = PRMConfig(output_dir="Qwen2-0.5B-Reward-Math-Sheperd")
|
||||
trainer = PRMTrainer(model=model, args=training_args, processing_class=tokenizer, train_dataset=train_dataset)
|
||||
trainer.train()
|
||||
```
|
||||
@ -112,13 +112,15 @@ accelerate launch examples/scripts/prm.py \
|
||||
--model_name_or_path Qwen/Qwen2-0.5B \
|
||||
--dataset_name trl-lib/math_shepherd \
|
||||
--num_train_epochs 1 \
|
||||
--logging_steps 25 \
|
||||
--output_dir Qwen2-0.5B-Reward-Math-Sheperd
|
||||
```
|
||||
|
||||
## PRMTrainer
|
||||
|
||||
[[autodoc]] PRMTrainer
|
||||
- train
|
||||
- save_model
|
||||
- push_to_hub
|
||||
|
||||
## PRMConfig
|
||||
|
||||
|
@ -1,88 +1,125 @@
|
||||
# Quickstart
|
||||
|
||||
TRL is a comprehensive library for post-training foundation models using techniques like Supervised Fine-Tuning (SFT), Group Relative Policy Optimization (GRPO), and Direct Preference Optimization (DPO).

## Quick Examples

Get started instantly with TRL's most popular trainers. Each example uses compact models for quick experimentation.
|
||||
### Supervised Fine-Tuning
|
||||
|
||||
```python
from trl import SFTTrainer
from datasets import load_dataset

trainer = SFTTrainer(
    model="Qwen/Qwen2.5-0.5B",
    train_dataset=load_dataset("trl-lib/Capybara", split="train"),
)
trainer.train()
```
|
||||
|
||||
### Group Relative Policy Optimization

```python
from trl import GRPOTrainer
from datasets import load_dataset

# Define a simple reward function (count unique chars as example)
def reward_function(completions, **kwargs):
    return [len(set(completion.lower())) for completion in completions]

trainer = GRPOTrainer(
    model="Qwen/Qwen2.5-0.5B-Instruct",  # Start from SFT model
    train_dataset=load_dataset("trl-lib/tldr", split="train"),
    reward_funcs=reward_function,
)
trainer.train()
```
|
||||
|
||||
### Direct Preference Optimization
|
||||
|
||||
```python
from trl import DPOTrainer
from datasets import load_dataset

trainer = DPOTrainer(
    model="Qwen/Qwen2.5-0.5B-Instruct",  # Use your SFT model
    ref_model="Qwen/Qwen2.5-0.5B-Instruct",  # Original base model
    train_dataset=load_dataset("trl-lib/ultrafeedback_binarized", split="train"),
)
trainer.train()
```
|
||||
|
||||
## Command Line Interface
|
||||
|
||||
Skip the code entirely - train directly from your terminal:
|
||||
|
||||
```bash
# SFT: Fine-tune on instructions
trl sft --model_name_or_path Qwen/Qwen2.5-0.5B \
    --dataset_name trl-lib/Capybara

# DPO: Align with preferences
trl dpo --model_name_or_path Qwen/Qwen2.5-0.5B-Instruct \
    --dataset_name trl-lib/ultrafeedback_binarized
```
|
||||
|
||||
## What's Next?
|
||||
|
||||
### 📚 Learn More
|
||||
|
||||
- [SFT Trainer](sft_trainer) - Complete SFT guide
|
||||
- [DPO Trainer](dpo_trainer) - Preference alignment
|
||||
- [GRPO Trainer](grpo_trainer) - Group relative policy optimization
|
||||
- [Training FAQ](how_to_train) - Common questions
|
||||
|
||||
### 🚀 Scale Up
|
||||
|
||||
- [Distributed Training](distributing_training) - Multi-GPU setups
|
||||
- [Memory Optimization](reducing_memory_usage) - Efficient training
|
||||
- [PEFT Integration](peft_integration) - LoRA and QLoRA
|
||||
|
||||
### 💡 Examples
|
||||
|
||||
- [Example Scripts](https://github.com/huggingface/trl/tree/main/examples) - Production-ready code
|
||||
- [Community Tutorials](community_tutorials) - External guides
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Out of Memory?
|
||||
|
||||
Reduce batch size and enable optimizations:
|
||||
|
||||
<hfoptions id="batch_size">
|
||||
<hfoption id="SFT">
|
||||
|
||||
```python
training_args = SFTConfig(
    per_device_train_batch_size=1,  # Start small
    gradient_accumulation_steps=8,  # Maintain effective batch size
)
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="DPO">
|
||||
|
||||
```python
training_args = DPOConfig(
    per_device_train_batch_size=1,  # Start small
    gradient_accumulation_steps=8,  # Maintain effective batch size
)
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### Loss not decreasing?
|
||||
|
||||
Try adjusting the learning rate:
|
||||
|
||||
```python
training_args = SFTConfig(learning_rate=2e-5)  # Good starting point
```
|
||||
|
||||
For more help, see our [Training FAQ](how_to_train) or open an [issue on GitHub](https://github.com/huggingface/trl/issues).
|
||||
|
@ -11,18 +11,18 @@ Section under construction. Feel free to contribute!
|
||||
Sequence lengths in the dataset can vary widely. When data is batched, sequences are padded to match the longest one in the batch, which can cause high memory usage, even if most sequences are relatively short.
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/why_you_should_truncate.png" alt="Truncation prompt completion" width="600"/>
|
||||
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/why_you_should_truncate.png" alt="Truncation prompt-completion" width="600"/>
|
||||
</div>
|
||||
|
||||
To reduce memory usage, it's important to truncate sequences to a reasonable length. While TRL trainers truncate sequences by default, you may want to adjust the default truncation length to better align with your specific use case.
|
||||
|
||||
<hfoptions id="dpo">
|
||||
<hfoptions id="truncation">
|
||||
<hfoption id="DPO">
|
||||
|
||||
DPO truncation is applied first to the prompt and to the completion via the `max_prompt_length` and `max_completion_length` parameters. The `max_length` parameter is then used to truncate the resulting sequence.
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/truncation_prompt_completion.png" alt="Truncation prompt completion" width="600"/>
|
||||
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/truncation_prompt_completion.png" alt="Truncation prompt-completion" width="600"/>
|
||||
</div>
|
||||
|
||||
To set the truncation parameters, use the following code snippet:
|
||||
@ -44,7 +44,7 @@ training_args = DPOConfig(..., max_completion_length=...)
|
||||
</hfoption>
|
||||
<hfoption id="SFT">
|
||||
|
||||
SFT truncation is applied to the input sequence via the `max_length` parameter.
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/truncation_input_ids.png" alt="Truncation input ids" width="600"/>
|
||||
@ -55,12 +55,20 @@ To set the truncation parameter, use the following code snippet:
|
||||
```python
|
||||
from trl import SFTConfig
|
||||
|
||||
training_args = SFTConfig(..., max_length=...)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
### How to choose the `max_length` value?
|
||||
|
||||
If `max_length` is too small, a significant portion of your tokens will be discarded and won't contribute to training. If it's too large, memory usage can spike, potentially leading to OOM (Out-Of-Memory) errors. Without packing or padding-free, a large `max_length` may also result in inefficient training, as many tokens will be padding.
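
One practical way to pick a value is to tokenize a sample of your dataset and look at a high percentile of the resulting lengths. Here is a rough sketch; the model and the dataset columns (`prompt` and `completion`) are assumptions you should adapt to your own data:

```python
import numpy as np
from datasets import load_dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")  # placeholder model
dataset = load_dataset("trl-lib/tldr", split="train[:1000]")    # small sample for speed

# Token count per example (this dataset has "prompt" and "completion" columns)
lengths = [
    len(tokenizer(example["prompt"] + example["completion"])["input_ids"])
    for example in dataset
]

# A high percentile (e.g. the 95th) is often a reasonable starting point for max_length
print(int(np.percentile(lengths, 95)))
```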
|
||||
|
||||
To help you choose an appropriate value, we provide a utility to visualize the sequence length distribution in your dataset.
|
||||
|
||||
<iframe src="https://trl-lib-dataset-length-profiler.hf.space" frameborder="0" width="100%" height="1000"></iframe>
|
||||
|
||||
## Packing
|
||||
|
||||
<Tip>
|
||||
@ -77,15 +85,21 @@ This technique applies only to SFT.
|
||||
Packing, introduced in [Raffel et al., 2020](https://huggingface.co/papers/1910.10683), addresses these issues by grouping sequences instead of truncating. It concatenates and splits dataset sequences into the desired lengths.
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/packing.png" alt="Packing" width="600"/>
|
||||
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/packing_2.png" alt="Packing" width="600"/>
|
||||
</div>
|
||||
|
||||
Packing reduces padding by merging several sequences into one row when possible. We use an advanced method to pack the dataset in a near-optimal way. To enable packing, use `packing=True` in the [`SFTConfig`].
|
||||
|
||||
<Tip>
|
||||
|
||||
In TRL 0.18 and earlier, packing used a more aggressive method that reduced padding to almost nothing, but had the downside of breaking sequence continuity for a large fraction of the dataset. To revert to this strategy, use `packing_strategy="wrapped"` in `SFTConfig`.
|
||||
|
||||
</Tip>
|
||||
|
||||
```python
|
||||
from trl import SFTConfig
|
||||
|
||||
training_args = SFTConfig(..., packing=True, max_length=512)
|
||||
```
|
||||
|
||||
<Tip warning={true}>
|
||||
@ -94,6 +108,119 @@ Packing may cause batch contamination, where adjacent sequences influence one an
|
||||
|
||||
</Tip>
|
||||
|
||||
## Liger for reducing peak memory usage
|
||||
|
||||
> [Liger Kernel](https://github.com/linkedin/Liger-Kernel) is a collection of Triton kernels designed specifically for LLM training. It can effectively increase multi-GPU training throughput by 20% and reduces memory usage by 60%.
|
||||
|
||||
For more information, see [Liger Kernel Integration](liger_kernel_integration)
|
||||
|
||||
<hfoptions id="liger">
|
||||
<hfoption id="DPO">
|
||||
|
||||
To use Liger for reducing peak memory usage, use the following code snippet:
|
||||
|
||||
```python
|
||||
from trl import DPOConfig
|
||||
|
||||
training_args = DPOConfig(..., use_liger_loss=True)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="GRPO">
|
||||
|
||||
To use Liger for reducing peak memory usage, use the following code snippet:
|
||||
|
||||
```python
|
||||
from trl import GRPOConfig
|
||||
|
||||
training_args = GRPOConfig(..., use_liger_loss=True)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="KTO">
|
||||
|
||||
To use Liger for reducing peak memory usage, use the following code snippet:
|
||||
|
||||
```python
|
||||
from trl import KTOConfig
|
||||
|
||||
training_args = KTOConfig(..., use_liger_loss=True)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Padding-free
|
||||
|
||||
Padding-free batching is an alternative approach for reducing memory usage. In this method, a batch is first sampled and then flattened into a single sequence, avoiding padding. Unlike packing, which can result in incomplete sequences by combining parts of different samples, padding-free batching ensures that all sequences remain complete and intact.
|
||||
|
||||
<div class="flex justify-center">
|
||||
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/padding-free.png" alt="Padding-free batching" width="600"/>
|
||||
</div>
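
As a toy illustration of the idea (not the collator TRL uses internally), three sequences of different lengths can be flattened into a single row, with position ids restarting at each sequence boundary so the attention kernel can tell the sequences apart:

```python
import torch

sequences = [[101, 7, 8, 9], [101, 4, 5], [101, 2, 3, 6, 11]]  # toy token ids

# Flatten the batch into one row instead of padding to the longest sequence
input_ids = torch.tensor([tok for seq in sequences for tok in seq]).unsqueeze(0)

# Position ids restart at 0 for every sequence, marking the boundaries
position_ids = torch.tensor([i for seq in sequences for i in range(len(seq))]).unsqueeze(0)

print(input_ids.shape)        # torch.Size([1, 12])
print(position_ids.tolist())  # [[0, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 4]]
```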
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
It's highly recommended to use padding-free batching with **FlashAttention 2** or **FlashAttention 3**. Otherwise, you may encounter batch contamination issues.
|
||||
|
||||
</Tip>
|
||||
|
||||
<hfoptions id="padding-free">
|
||||
<hfoption id="DPO">
|
||||
|
||||
```python
|
||||
from trl import DPOConfig
|
||||
|
||||
training_args = DPOConfig(..., padding_free=True, model_init_kwargs={"attn_implementation": "flash_attention_2"})
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="SFT">
|
||||
|
||||
```python
|
||||
from trl import SFTConfig
|
||||
|
||||
training_args = SFTConfig(..., padding_free=True, model_init_kwargs={"attn_implementation": "flash_attention_2"})
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
## Activation offloading
|
||||
|
||||
Activation offloading is a memory efficiency technique that reduces GPU VRAM usage by temporarily moving activation tensors to CPU RAM during the forward pass and bringing them back only when needed for the backward pass. This significantly reduces peak memory usage at the cost of slightly increased training time.
|
||||
|
||||
To enable activation offloading in your SFT training configuration:
|
||||
|
||||
<hfoptions>
|
||||
<hfoption id="SFT">
|
||||
|
||||
```python
|
||||
from trl import SFTConfig
|
||||
|
||||
training_args = SFTConfig(..., activation_offloading=True)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
When using activation offloading with models that use Liger kernels, you must disable Liger cross entropy due to compatibility issues. The issue occurs specifically with `use_liger_kernel=True` because Liger cross entropy performs in-place operations which conflict with activation offloading. The default setting (`use_liger_kernel=False`) works:
|
||||
|
||||
```python
|
||||
# When using activation offloading with a model that uses Liger kernels:
|
||||
from trl import SFTConfig
|
||||
|
||||
training_args = SFTConfig(
|
||||
activation_offloading=True,
|
||||
use_liger_kernel=False, # Disable Liger cross entropy
|
||||
# Other parameters...
|
||||
)
|
||||
```
|
||||
</Tip>
|
||||
|
||||
Under the hood, activation offloading implements PyTorch's [`saved_tensors_hooks`](https://pytorch.org/tutorials/intermediate/autograd_saved_tensors_hooks_tutorial.html#hooks-for-autograd-saved-tensors) to intercept activations during the forward pass. It intelligently manages which tensors to offload based on size and context, avoiding offloading output tensors which would be inefficient. For performance optimization, it can optionally use CUDA streams to overlap computation with CPU-GPU transfers.
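
For intuition, here is a minimal, self-contained sketch of that mechanism (a simplified version, not TRL's actual implementation, which also filters tensors by size and can overlap transfers with CUDA streams):

```python
import torch
from torch.autograd.graph import saved_tensors_hooks

def pack_to_cpu(tensor):
    # Called during the forward pass: move the saved activation to CPU
    return tensor.device, tensor.to("cpu", non_blocking=True)

def unpack_from_cpu(packed):
    # Called during the backward pass: bring the activation back to its device
    device, tensor = packed
    return tensor.to(device, non_blocking=True)

model = torch.nn.Sequential(torch.nn.Linear(512, 512), torch.nn.ReLU(), torch.nn.Linear(512, 1))
x = torch.randn(8, 512, requires_grad=True)

with saved_tensors_hooks(pack_to_cpu, unpack_from_cpu):
    loss = model(x).sum()
loss.backward()  # activations are fetched back from CPU as needed
```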
|
||||
|
||||
## Disabling model gathering for generation in online methods
|
||||
|
||||
When using DeepSpeed ZeRO-3, model weights are sharded across multiple GPUs. Online methods involve generating completions from the model as part of the training process. During this step, the model weights are temporarily gathered on a single GPU for generation. For very large models, this gathering can lead to out-of-memory (OOM) errors, as described in this issue: [#2250](https://github.com/huggingface/trl/issues/2250#issue-2598304204).
|
||||
@ -101,6 +228,15 @@ When using DeepSpeed ZeRO-3, model weights are sharded across multiple GPUs. Onl
|
||||
If you encounter this issue, you can disable the gathering of model weights for generation by setting the following parameter:
|
||||
|
||||
<hfoptions id="ds3_gather_for_generation">
|
||||
<hfoption id="GRPO">
|
||||
|
||||
```python
|
||||
from trl import GRPOConfig
|
||||
|
||||
training_args = GRPOConfig(..., ds3_gather_for_generation=False)
|
||||
```
|
||||
|
||||
</hfoption>
|
||||
<hfoption id="Online DPO">
|
||||
|
||||
```python
|
||||
|
@ -84,6 +84,9 @@ For reference results, please refer PR [#1932](https://github.com/huggingface/tr
|
||||
## RewardTrainer
|
||||
|
||||
[[autodoc]] RewardTrainer
|
||||
- train
|
||||
- save_model
|
||||
- push_to_hub
|
||||
|
||||
## RewardConfig
|
||||
|
||||
|
15
docs/source/rewards.md
Normal file
15
docs/source/rewards.md
Normal file
@ -0,0 +1,15 @@
|
||||
# Reward Functions
|
||||
|
||||
This module contains some useful reward functions, primarily intended for use with the [`GRPOTrainer`] and [`RLOOTrainer`].
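
For example, a reward function from this module can be passed to a trainer alongside any custom ones. The snippet below is a minimal sketch; the model and dataset are placeholders:

```python
from datasets import load_dataset
from trl import GRPOTrainer
from trl.rewards import think_format_reward

trainer = GRPOTrainer(
    model="Qwen/Qwen2.5-0.5B-Instruct",  # placeholder model
    reward_funcs=[think_format_reward],
    train_dataset=load_dataset("trl-lib/ultrafeedback-prompt", split="train"),
)
trainer.train()
```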
|
||||
|
||||
## Format rewards
|
||||
|
||||
### think_format_reward
|
||||
|
||||
[[autodoc]] rewards.think_format_reward
|
||||
|
||||
## Other rewards
|
||||
|
||||
### get_soft_overlong_punishment
|
||||
|
||||
[[autodoc]] rewards.get_soft_overlong_punishment
|
@ -2,289 +2,572 @@
|
||||
|
||||
[](https://huggingface.co/models?other=rloo,trl)
|
||||
|
||||
## Overview

TRL supports the RLOO Trainer for training language models, as described in the paper [Back to Basics: Revisiting REINFORCE Style Optimization for Learning from Human Feedback in LLMs](https://huggingface.co/papers/2402.14740) by [Arash Ahmadian](https://huggingface.co/ArashAhmadian), Chris Cremer, [Matthias Gallé](https://huggingface.co/mgalle), [Marzieh Fadaee](https://huggingface.co/MarziehFadaee), [Julia Kreutzer](https://huggingface.co/JuliaKreutzerCohere), [Ahmet Üstün](https://huggingface.co/ahmetu) and [Sara Hooker](https://huggingface.co/sarahooker).

The abstract from the paper is the following:

> AI alignment in the shape of Reinforcement Learning from Human Feedback (RLHF) is increasingly treated as a crucial ingredient for high performance large language models. Proximal Policy Optimization (PPO) has been positioned by recent literature as the canonical method for the RL part of RLHF. However, it involves both high computational cost and sensitive hyperparameter tuning. We posit that most of the motivational principles that led to the development of PPO are less of a practical concern in RLHF and advocate for a less computationally expensive method that preserves and even increases performance. We revisit the formulation of alignment from human preferences in the context of RL. Keeping simplicity as a guiding principle, we show that many components of PPO are unnecessary in an RLHF context and that far simpler REINFORCE-style optimization variants outperform both PPO and newly proposed “RL-free” methods such as DPO and RAFT. Our work suggests that careful adaptation to LLMs alignment characteristics enables benefiting from online RL optimization at low cost.

This post-training method was contributed by [Costa Huang](https://github.com/vwxyzjn) and later refactored by [Shirin Yamani](https://huggingface.co/ShirinYamani).
|
||||
|
||||
## Quick start
|
||||
|
||||
## Explanation of the logged metrics
|
||||
This example demonstrates how to train a model using the RLOO method. We train a [Qwen 0.5B Instruct model](https://huggingface.co/Qwen/Qwen2-0.5B-Instruct) with the prompts from the [UltraFeedback prompts dataset](https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt). You can view the data in the dataset here:
|
||||
|
||||
The logged metrics are as follows. Here is an example [tracked run at Weights and Biases](https://wandb.ai/huggingface/trl/runs/u2sqci34)
|
||||
<iframe
|
||||
src="https://huggingface.co/datasets/trl-lib/ultrafeedback-prompt/embed/viewer/default/train?row=0"
|
||||
frameborder="0"
|
||||
width="100%"
|
||||
height="560px"
|
||||
></iframe>
|
||||
|
||||
<!-- * `rlhf_reward_var_per_prompt`: calculated by `rlhf_reward.var(0).mean()`. This is the variance of the rewards estimated across the `args.rloo_k` samples. Usually we expect it to go down (cause policy entropy goes down). -->
|
||||
|
||||
* `eps`: Tracks the number of episodes per second.
|
||||
* `objective/kl`: The mean Kullback-Leibler (KL) divergence between the current policy and reference policy.
|
||||
* `objective/entropy`: The mean entropy of the policy, indicating the randomness of the actions chosen by the policy.
|
||||
* `objective/non_score_reward`: The mean reward from non-score-related sources, basically `beta * kl.sum(1)`, where `beta` is the KL penalty coefficient and `kl` is the per-token KL divergence.
|
||||
* `objective/rlhf_reward`: The mean RLHF reward, which is `score - non_score_reward`.
|
||||
* `objective/scores`: The mean scores returned by the reward model / environment.
|
||||
* `policy/approxkl_avg`: The average approximate KL divergence between consecutive PPO policies. Note that this is not the same as `objective/kl`.
|
||||
* `policy/clipfrac_avg`: The average fraction of policy updates that are clipped, indicating how often the policy updates are constrained to prevent large changes.
|
||||
* `loss/policy_avg`: The average policy loss, indicating how well the policy is performing.
|
||||
* `val/clipfrac_avg`: The average fraction of value function updates that are clipped, similar to policy/clipfrac_avg but for the value function.
|
||||
* `policy/entropy_avg`: The average entropy of the policy during training, indicating how diverse the policy's actions are.
|
||||
* `val/ratio`: The mean ratio of the current policy probability to the old policy probability, providing a measure of how much the policy has changed.
|
||||
* `val/ratio_var`: The variance of the `val/ratio`, indicating the variability in policy changes.
|
||||
* `val/num_eos_tokens`: The number of end-of-sequence (EOS) tokens generated, which can indicate the number of complete responses.
|
||||
* `lr`: The current learning rate used by the optimizer.
|
||||
* `episode`: The current global step or episode count in the training process.
|
||||
|
||||
|
||||
## Cookbook
|
||||
|
||||
* Debugging TIP: `objective/rlhf_reward`: this is the ultimate objective of the RLHF training. If training works as intended, this metric should keep going up.
|
||||
* Debugging TIP: `val/ratio`: this number should float around 1.0, and it gets clipped by `--cliprange 0.2` with PPO's surrogate loss. So if this `ratio` is too high, like 2.0 or 1000.0, or too small, like 0.1, it means the updates between consecutive policies are too drastic. You should try to understand why this is happening and try to fix it.
|
||||
* Memory TIP: If you are running out of memory, you can try to reduce the `--per_device_train_batch_size` or increase the `--gradient_accumulation_steps` to reduce the memory footprint.
|
||||
* Memory TIP: If you have multiple GPUs, you can also run training with DeepSpeed stage 3 to reduce the memory footprint `accelerate launch --config_file examples/accelerate_configs/deepspeed_zero3.yaml`.
|
||||
* Usage TIP: We recommend using the "EOS trick" via `--missing_eos_penalty`, which subtracts a static scalar penalty from the score of completions that do not end with an EOS token. This can help the model learn to generate more coherent completions.
|
||||
|
||||
|
||||
## What is my model doing exactly?
|
||||
|
||||
To help you understand what your model is doing, we periodically log some sample completions from the model. Here is an example of a completion. In an example [tracked run at Weights and Biases](https://wandb.ai/huggingface/trl/runs/u2sqci34), it looks like the following, allowing you to see the model's response at different stages of training. By default we generate `--num_sample_generations 10` during training, but you can customize the number of generations.
|
||||
|
||||

|
||||
|
||||
|
||||
In the logs the sampled generations look like
|
||||
|
||||
```
|
||||
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓
|
||||
┃ query ┃ model response ┃ score ┃
|
||||
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩
|
||||
│ SUBREDDIT: r/AskReddit │ I'm in love with a friend, and │ 3.921875 │
|
||||
│ │ I don't know how to get rid of │ │
|
||||
│ TITLE: How do you get someone │ those feelings. I'm │ │
|
||||
│ out of your head? │ desperate.<|endoftext|>[PAD][P… │ │
|
||||
│ │ │ │
|
||||
│ POST: Hi, │ │ │
|
||||
│ I'm 22, and I have been with my │ │ │
|
||||
│ girlfriend for 5 years now. We │ │ │
|
||||
│ recently moved together. We've │ │ │
|
||||
│ always loved each other │ │ │
|
||||
│ intensely. │ │ │
|
||||
│ │ │ │
|
||||
│ Problem, I recently started to │ │ │
|
||||
│ have feelings for an other │ │ │
|
||||
│ person (a friend). This person │ │ │
|
||||
│ has had a boyfriend for now 3 │ │ │
|
||||
│ years, and has absolutely no │ │ │
|
||||
│ ideas. Those feelings were so │ │ │
|
||||
│ strong, it was hard to hide │ │ │
|
||||
│ them. After 2 months of me │ │ │
|
||||
│ being distant and really sad, │ │ │
|
||||
│ my girlfriend forced me to say │ │ │
|
||||
│ what was bothering me. I'm not │ │ │
|
||||
│ a good liar, and now she knows. │ │ │
|
||||
│ │ │ │
|
||||
│ We decided to give us a week │ │ │
|
||||
│ alone, I went to my parents. │ │ │
|
||||
│ │ │ │
|
||||
│ Now, I'm completely lost. I │ │ │
|
||||
│ keep on thinking about this │ │ │
|
||||
│ person, and I hate that. I │ │ │
|
||||
│ would like for those feelings │ │ │
|
||||
│ to go away, to leave me alone. │ │ │
|
||||
│ But I can't. │ │ │
|
||||
│ │ │ │
|
||||
│ What do I do? It's been 3 │ │ │
|
||||
│ months now, and I'm just │ │ │
|
||||
│ desperate. │ │ │
|
||||
│ │ │ │
|
||||
│ TL;DR: │ │ │
|
||||
├─────────────────────────────────┼─────────────────────────────────┼──────────┤
|
||||
│ SUBREDDIT: r/pettyrevenge │ My mom woke me up with a loud │ 6.84375 │
|
||||
│ │ TV. I blasted Gangnam Style on │ │
|
||||
│ TITLE: So, my mom woke me up │ repeat, with the bass cranked │ │
|
||||
│ with a loud TV. │ up as high as it could │ │
|
||||
│ │ go.<|endoftext|>[PAD][PAD][PAD… │ │
|
||||
│ POST: She was in her living │ │ │
|
||||
│ room, watching TV. This was at │ │ │
|
||||
│ about 8:30 in the morning, and │ │ │
|
||||
│ she was exercising. She turned │ │ │
|
||||
│ the TV up extra loud to hear it │ │ │
|
||||
│ over her excercycle, and woke │ │ │
|
||||
│ me up. I went in there asking │ │ │
|
||||
│ for her to turn it down. She │ │ │
|
||||
│ said she didn't have to; I │ │ │
|
||||
│ explained that I always used │ │ │
|
||||
│ headphones so she didn't have │ │ │
|
||||
│ to deal with my noise and that │ │ │
|
||||
│ she should give me a little │ │ │
|
||||
│ more respect, given that I paid │ │ │
|
||||
│ rent at the time. │ │ │
|
||||
│ │ │ │
|
||||
│ She disagreed. I went back to │ │ │
|
||||
│ my room, rather pissed off at │ │ │
|
||||
│ the lack of equality. I had no │ │ │
|
||||
│ lock on my door; but I had a │ │ │
|
||||
│ dresser right next to it, so I │ │ │
|
||||
│ pulled one of the drawers out │ │ │
|
||||
│ enough so that it caused the │ │ │
|
||||
│ door to not be openable. Then, │ │ │
|
||||
│ I turned my speakers up really │ │ │
|
||||
│ loud and blasted Gangnam Style │ │ │
|
||||
│ on repeat, with the bass │ │ │
|
||||
│ cranked up as high as it could │ │ │
|
||||
│ go. │ │ │
|
||||
│ │ │ │
|
||||
│ If you hate Gangnam Style for │ │ │
|
||||
│ being overplayed, you will see │ │ │
|
||||
│ why I chose that particular │ │ │
|
||||
│ song. I personally don't mind │ │ │
|
||||
│ it. But here's the thing about │ │ │
|
||||
│ my bass; it vibrates the walls, │ │ │
|
||||
│ making one hell of a lot of │ │ │
|
||||
│ noise. Needless to say, my mom │ │ │
|
||||
│ was not pleased and shut off │ │ │
|
||||
│ the internet. But it was oh so │ │ │
|
||||
│ worth it. │ │ │
|
||||
│ │ │ │
|
||||
│ TL;DR: │ │ │
|
||||
└─────────────────────────────────┴─────────────────────────────────┴──────────┘
|
||||
```
|
||||
|
||||
Below is the script to train the model.

```python
# train_rloo.py
from datasets import load_dataset
from trl import RLOOConfig, RLOOTrainer

dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train")

# Dummy reward function for demonstration purposes
def reward_num_unique_letters(completions, **kwargs):
    """Reward function that rewards completions with more unique letters."""
    completion_contents = [completion[0]["content"] for completion in completions]
    return [float(len(set(content))) for content in completion_contents]

training_args = RLOOConfig(output_dir="Qwen2-0.5B-RLOO")
trainer = RLOOTrainer(
    model="Qwen/Qwen2-0.5B-Instruct",
    reward_funcs=reward_num_unique_letters,
    args=training_args,
    train_dataset=dataset,
)
trainer.train()
```
|
||||
|
||||
## Benchmark experiments
|
||||
|
||||
To validate that the RLOO implementation works, we ran an experiment with a 1B parameter model. Here is the command we used to run it. We take the SFT / RM models directly from [The N+ Implementation Details of RLHF with PPO: A Case Study on TL;DR Summarization](https://huggingface.co/papers/2403.17031).
|
||||
|
||||
```
|
||||
accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml \
|
||||
--output_dir models/minimal/rloo_tldr \
|
||||
--dataset_name trl-internal-testing/tldr-preference-sft-trl-style \
|
||||
--dataset_test_split validation \
|
||||
--num_ppo_epochs 2 \
|
||||
--num_mini_batches 2 \
|
||||
--learning_rate 3e-6 \
|
||||
--per_device_train_batch_size 16 \
|
||||
--gradient_accumulation_steps 16 \
|
||||
--total_episodes 1000000 \
|
||||
--model_name_or_path EleutherAI/pythia-1b-deduped \
|
||||
--sft_model_path cleanrl/EleutherAI_pythia-1b-deduped__sft__tldr \
|
||||
--reward_model_path cleanrl/EleutherAI_pythia-1b-deduped__reward__tldr \
|
||||
--local_rollout_forward_batch_size 16 \
|
||||
--missing_eos_penalty 1.0 \
|
||||
--stop_token eos \
|
||||
--kl_coef 0.03
|
||||
```
|
||||
|
||||
Checkpoints and experiment tracking are available at:
|
||||
|
||||
- [🤗 Model checkpoint](https://huggingface.co/vwxyzjn/rloo_tldr)
|
||||
- [🐝 Tracked experiment](https://wandb.ai/huggingface/trl/runs/u2sqci34)
|
||||
|
||||
|
||||
To evaluate, we use [vLLM](https://github.com/vllm-project/vllm) to load the checkpoints and GPT-4o mini as a judge model to evaluate the generated TL;DR against the reference TL;DR.
|
||||
For more information on how to use judges, see [Judges](judges).
|
||||
Execute the script using the following command:
|
||||
|
||||
```bash
|
||||
$ python examples/scripts/evals/judge_tldr.py --model_name_or_path cleanrl/EleutherAI_pythia-1b-deduped__sft__tldr --judge_model gpt-4o-mini --num_examples 1000
|
||||
Model win rate: 33.00%
|
||||
$ python examples/scripts/evals/judge_tldr.py --model_name_or_path vwxyzjn/rloo_tldr --judge_model gpt-4o-mini --num_examples 1000
|
||||
Model win rate: 51.20%
|
||||
|
||||
```
|
||||
|
||||
The RLOO checkpoint gets a 51.2% preferred rate vs the 33.0% preference rate of the SFT checkpoint. This is a good sign that the RLOO training is working as intended.
|
||||
## Looking deeper into the RLOO method
|
||||
|
||||
RLOO is an online learning algorithm, meaning it improves iteratively by using the data generated by the trained model itself during training. The intuition behind the RLOO objective is to maximize the advantage of the generated completions, while ensuring that the model remains close to the reference policy. To understand how RLOO works, it can be broken down into four main steps: **Generating completions**, **computing the advantage**, **estimating the KL divergence**, and **computing the loss**.
|
||||
|
||||

|
||||
|
||||
### Generating completions
|
||||
|
||||
At each training step, we sample a batch of prompts and generate a set of \\( G \\) completions for each prompt (denoted as \\( o_i \\)).
|
||||
|
||||
### Computing the reward
|
||||
|
||||
In RLOO, the reward consists of two components: the reward provided by the reward model (or reward function) and a KL penalty that discourages the policy from deviating too far from a fixed reference policy.
|
||||
|
||||
1. For each of the \\( G \\) generated sequences \\( o_i = (o_{i,1}, \dots, o_{i,T}) \\) conditioned on a query \\( q \\), we compute a scalar reward using a reward model \\( R(o_i, q) \\).
|
||||
2. Concurrently, we estimate the KL divergence between the current policy \\( \pi_\theta \\) and the fixed reference policy \\( \pi_{\text{ref}} \\) over the sequence. The KL estimate for sequence \\( o_i \\) is:
|
||||
|
||||
$$
|
||||
\mathbb{D}_{\mathrm{KL}}\!\left[\pi_\theta\|\pi_{\mathrm{ref}}\right] = \sum_{t=1}^T \log \frac{\pi_\theta(o_{i,t} \mid q, o_{i,<t})}{\pi_{\mathrm{ref}}(o_{i,t} \mid q, o_{i,<t})}.
|
||||
$$
|
||||
|
||||
The final reward assigned to sequence \\( o_i \\) is then:
|
||||
|
||||
$$
|
||||
r_i = R(o_i, q) - \beta \, \mathbb{D}_{\mathrm{KL}}\!\left[\pi_\theta \|\pi_{\mathrm{ref}}\right],
|
||||
$$
|
||||
|
||||
where \\( \beta > 0 \\) controls the strength of the KL penalty.
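
A minimal sketch of this computation from per-token log-probabilities (toy tensors, not the trainer internals):

```python
import torch

beta = 0.05  # KL penalty coefficient (illustrative value)

# Assumed per-token log-probs of one completion under the sampling policy
# and the fixed reference policy (toy values).
logps_policy = torch.tensor([-1.1, -0.6, -2.0])
logps_ref = torch.tensor([-1.3, -0.5, -2.4])

kl_estimate = (logps_policy - logps_ref).sum()  # sequence-level KL estimate
score = 0.8                                     # R(o_i, q) from the reward model / function
reward = score - beta * kl_estimate             # final reward r_i
```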
|
||||
|
||||
<Tip>
|
||||
|
||||
In a purely online setting (`num_iterations = 1`, default), the data are generated by the current policy. In this case, the KL penalty is computed directly using the current policy.
|
||||
|
||||
In the more general setting (e.g., multiple gradient steps per batch), the data are instead generated by an earlier snapshot \\( \pi_{\text{old}} \\). To keep the penalty consistent with the sampling distribution, the KL is defined with respect to this policy:
|
||||
|
||||
$$
|
||||
\mathbb{D}_{\mathrm{KL}}\!\left[\pi_{\text{old}} \,\|\, \pi_{\text{ref}}\right].
|
||||
$$
|
||||
|
||||
Equivalently, for a sampled sequence $o$, the Monte Carlo estimate is
|
||||
|
||||
$$
|
||||
\mathbb{D}_{\mathrm{KL}}\!\left[\pi_{\text{old}} \|\pi_{\mathrm{ref}}\right] = \sum_{t=1}^T \log \frac{\pi_{\text{old}}(o_{i,t} \mid q, o_{i,<t})}{\pi_{\mathrm{ref}}(o_{i,t} \mid q, o_{i,<t})}.
|
||||
$$
|
||||
|
||||
</Tip>
|
||||
|
||||
### Computing the advantage
|
||||
|
||||
Once the rewards for each completion have been computed, we calculate a baseline as the average reward of all other samples in the same batch, excluding the current sample. This baseline is used to reduce the variance of the policy gradient estimate. The advantage for each completion is then obtained as the difference between its own reward and this leave-one-out baseline.
|
||||
|
||||
Formally, for a batch of \\( G \\) completions, the baseline for completion \\( i \\) is:
|
||||
$$
|
||||
b_i = \frac{1}{G-1} \sum_{j \neq i} r_j
|
||||
$$
|
||||
|
||||
|
||||
and then the advantage for each completion is computed as the difference between its reward and the baseline:
|
||||
$$
|
||||
A_i = r_i - b_i
|
||||
$$
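
In code, the leave-one-out baseline can be computed in a vectorized way; here is a small sketch over a toy group of rewards:

```python
import torch

rewards = torch.tensor([0.2, 1.0, 0.5, 0.9])  # toy rewards r_i for G = 4 completions
G = rewards.numel()

baseline = (rewards.sum() - rewards) / (G - 1)  # b_i: mean of the other G-1 rewards
advantages = rewards - baseline                  # A_i = r_i - b_i
```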
|
||||
|
||||
### Computing the loss
|
||||
|
||||
|
||||
The REINFORCE loss is simply defined as:
|
||||
|
||||
$$
|
||||
\mathcal{L}_{\text{RLOO}}(\theta) = - \frac{1}{G} \sum_{i=1}^G \hat{A}_i \, \log \pi_\theta(o_i \mid q)
|
||||
$$
|
||||
|
||||
In practice, performing multiple gradient steps on the same batch makes the actions effectively off-policy relative to the current parameters. To correct for this, we introduce the importance sampling ratio. To prevent excessively large updates when the policy changes between sampling and gradient steps, we clip this ratio:
|
||||
|
||||
$$
|
||||
\mathcal{L}_{\text{RLOO}}(\theta) = - \frac{1}{G} \sum_{i=1}^G \min \left( \frac{\pi_\theta(o_i \mid q)}{\pi_{\theta_\text{old}}(o_i \mid q)} \hat{A}_i, \, \text{clip}\left(\frac{\pi_\theta(o_i \mid q)}{\pi_{\theta_\text{old}}(o_i \mid q)}, 1-\epsilon, 1+\epsilon\right) \hat{A}_i \right)
|
||||
$$
|
||||
|
||||
In a fully online, single-step setting (default), \\( \frac{\pi_\theta(o_i \mid q)}{\pi_{\theta_\text{old}}(o_i \mid q)} = 1 \\) and this reduces to standard REINFORCE.
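
Here is a small sketch of the clipped objective computed from sequence log-probabilities and advantages (toy values; the actual trainer operates on batched, masked tensors):

```python
import torch

epsilon = 0.2
advantages = torch.tensor([0.45, -0.3, 0.15])    # A_i (toy values)
logps_new = torch.tensor([-12.3, -8.1, -15.0])   # log pi_theta(o_i | q)
logps_old = torch.tensor([-12.0, -8.4, -15.2])   # log pi_theta_old(o_i | q)

ratio = torch.exp(logps_new - logps_old)
unclipped = ratio * advantages
clipped = torch.clamp(ratio, 1 - epsilon, 1 + epsilon) * advantages
loss = -torch.min(unclipped, clipped).mean()
```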
|
||||
|
||||
## Logged metrics
|
||||
|
||||
While training and evaluating, we record the following reward metrics:
|
||||
|
||||
- `num_tokens`: The total number of tokens processed so far, including both prompts and completions.
|
||||
- `completions/mean_length`: The average length of generated completions.
|
||||
- `completions/min_length`: The minimum length of generated completions.
|
||||
- `completions/max_length`: The maximum length of generated completions.
|
||||
- `completions/mean_terminated_length`: The average length of generated completions that terminate with EOS.
|
||||
- `completions/min_terminated_length`: The minimum length of generated completions that terminate with EOS.
|
||||
- `completions/max_terminated_length`: The maximum length of generated completions that terminate with EOS.
|
||||
- `completions/clipped_ratio` : The ratio of truncated (clipped) completions.
|
||||
- `reward/{reward_func_name}/mean`: The average reward from a specific reward function.
|
||||
- `reward/{reward_func_name}/std`: The standard deviation of the reward from a specific reward function.
|
||||
- `reward`: The overall average reward after applying reward weights.
|
||||
- `reward_std`: The standard deviation of rewards after applying reward weights. This is the average of the per-group standard deviations.
|
||||
- `frac_reward_zero_std`: The fraction of samples in the generation batch with a reward std of zero, implying there is little diversity for that prompt (all answers are correct or incorrect).
|
||||
- `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, masked sequences tokens are excluded.)
|
||||
- `kl`: The average KL divergence between the model and the reference model, calculated over generated completions. Logged only if `beta` is nonzero.
|
||||
- `clip_ratio/region_mean`: The ratio of sequence probabilities where the RLOO objective is clipped to stay within the trust region:
|
||||
$$
|
||||
\text{clip}\left( r_{i}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \qquad r_{i}(\theta) = \frac{\pi_\theta(o_{i} \mid q)}{\pi_{\theta_{\text{old}}}(o_{i} \mid q)}\,.
|
||||
$$
|
||||
|
||||
A higher value means more samples are clipped, which constrains how much the policy $\pi_\theta$ can change.
|
||||
- `clip_ratio/low_mean`: The average ratio of sequence probabilities that were clipped on the lower bound of the trust region: \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\)
|
||||
- `clip_ratio/low_min`: The minimum ratio of sequence probabilities that were clipped on the lower bound of the trust region: \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\)
|
||||
- `clip_ratio/high_mean`: The average ratio of sequence probabilities that were clipped on the upper bound of the trust region: \\(r_{i,t}(\theta) > 1 + \epsilon_\mathrm{high}\\)
|
||||
- `clip_ratio/high_max`: The maximum ratio of sequence probabilities that were clipped on the upper bound of the trust region: \\(r_{i,t}(\theta) > 1 + \epsilon_\mathrm{high}\\).
|
||||
|
||||
## Customization
|
||||
|
||||
### Speed up training with vLLM-powered generation
|
||||
|
||||
Generation is often the main bottleneck when training with online methods. To accelerate generation, you can use [vLLM](https://github.com/vllm-project/vllm), a high-throughput, low-latency inference engine for LLMs. To enable it, first install the package with
|
||||
```shell
|
||||
pip install trl[vllm]
|
||||
```
|
||||
|
||||
We support two ways of using vLLM during training: **server mode** and **colocate mode**.

#### 🔌 Option 1: Server mode

In this mode, vLLM runs in a separate process (and on separate GPUs) and communicates with the trainer via HTTP. This is ideal if you have dedicated GPUs for inference.
|
||||
1. **Start the vLLM server**:
|
||||
```bash
|
||||
trl vllm-serve --model <model_name>
|
||||
```
|
||||
|
||||
2. **Enable server mode in your training script**:
|
||||
```python
|
||||
from trl import RLOOConfig
|
||||
|
||||
training_args = RLOOConfig(
|
||||
...,
|
||||
use_vllm=True,
|
||||
vllm_mode="server", # default value, can be omitted
|
||||
)
|
||||
```
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
Make sure that the server is using different GPUs than the trainer, otherwise you may run into NCCL errors. You can specify the GPUs to use with the `CUDA_VISIBLE_DEVICES` environment variable.
|
||||
|
||||
</Tip>
|
||||
|
||||
#### 🧩 Option 2: Colocate mode
|
||||
|
||||
In this mode, vLLM runs inside the trainer process and shares GPU memory with the training model. This avoids launching a separate server and can improve GPU utilization, but may lead to memory contention on the training GPUs.
|
||||
|
||||
```python
|
||||
from trl import RLOOConfig
|
||||
|
||||
training_args = RLOOConfig(
|
||||
...,
|
||||
use_vllm=True,
|
||||
vllm_mode="colocate",
|
||||
)
|
||||
```
|
||||
|
||||
<Tip>
|
||||
|
||||
Depending on the model size and the overall GPU memory requirements for training, you may need to adjust the `vllm_gpu_memory_utilization` parameter in [`RLOOConfig`] to avoid underutilization or out-of-memory errors.
|
||||
|
||||
We provide a [HF Space](https://huggingface.co/spaces/trl-lib/recommend-vllm-memory) to help estimate the recommended GPU memory utilization based on your model configuration and experiment settings. Simply use it as follows to get a `vllm_gpu_memory_utilization` recommendation:
|
||||
|
||||
<iframe
|
||||
src="https://trl-lib-recommend-vllm-memory.hf.space"
|
||||
frameborder="0"
|
||||
width="850"
|
||||
height="450"
|
||||
></iframe>
|
||||
|
||||
If the recommended value does not work in your environment, we suggest adding a small buffer (e.g., +0.05 or +0.1) to the recommended value to ensure stability.
|
||||
|
||||
</Tip>
|
||||
|
||||
<Tip>
|
||||
|
||||
By default, RLOO uses `MASTER_ADDR=localhost` and `MASTER_PORT=12345` for vLLM, but you can override these values by setting the environment variables accordingly.
|
||||
|
||||
</Tip>
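
For example, assuming your training entry point is `train_rloo.py`, you could override both values on the launch command (the address and port below are illustrative):

```sh
# Override the default rendezvous address/port used for vLLM communication
MASTER_ADDR=10.0.0.5 MASTER_PORT=29555 accelerate launch train_rloo.py
```
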
|
||||
|
||||
For more information, see [Speeding up training with vLLM](speeding_up_training#vllm-for-fast-generation-in-online-methods).
|
||||
|
||||
### RLOO at scale: train a 70B+ Model on multiple nodes
|
||||
|
||||
When training large models like **Qwen2.5-72B**, you need several key optimizations to make the training efficient and scalable across multiple GPUs and nodes. These include:
|
||||
|
||||
- **DeepSpeed ZeRO Stage 3**: ZeRO leverages data parallelism to distribute model states (weights, gradients, optimizer states) across multiple GPUs and CPUs, reducing memory and compute requirements on each device. Since large models cannot fit on a single GPU, using ZeRO Stage 3 is required for training such models. For more details, see [DeepSpeed Integration](deepspeed_integration).
|
||||
- **Accelerate**: Accelerate is a library that simplifies distributed training across multiple GPUs and nodes. It provides a simple API to launch distributed training and handles the complexities of distributed training, such as data parallelism, gradient accumulation, and distributed data loading. For more details, see [Distributing Training](distributing_training).
|
||||
- **vLLM**: See the previous section on how to use vLLM to speed up generation.
|
||||
|
||||
Below is an example SLURM script to train a 70B model with RLOO on multiple nodes. This script trains a model on 4 nodes and uses the 5th node for vLLM-powered generation.
|
||||
|
||||
```sh
|
||||
#!/bin/bash
|
||||
#SBATCH --nodes=5
|
||||
#SBATCH --gres=gpu:8
|
||||
|
||||
# Get the list of allocated nodes
|
||||
NODELIST=($(scontrol show hostnames $SLURM_JOB_NODELIST))
|
||||
|
||||
# Assign the first 4 nodes for training and the 5th node for vLLM
|
||||
TRAIN_NODES="${NODELIST[@]:0:4}" # Nodes 0, 1, 2, 3 for training
|
||||
VLLM_NODE="${NODELIST[4]}" # Node 4 for vLLM
|
||||
|
||||
# Run training on the first 4 nodes (Group 1)
|
||||
srun --nodes=4 --ntasks=4 --nodelist="${TRAIN_NODES}" accelerate launch \
|
||||
--config_file examples/accelerate_configs/deepspeed_zero3.yaml \
|
||||
--num_processes 32 \
|
||||
--num_machines 4 \
|
||||
--main_process_ip ${NODELIST[0]} \
|
||||
--machine_rank $SLURM_PROCID \
|
||||
--rdzv_backend c10d \
|
||||
train_rloo.py \
|
||||
--vllm_server_host $VLLM_NODE &
|
||||
|
||||
# Run vLLM server on the 5th node (Group 2)
|
||||
srun --nodes=1 --ntasks=1 --nodelist="${NODELIST[4]}" trl vllm-serve --model Qwen/Qwen2.5-72B --tensor_parallel_size 8 &
|
||||
|
||||
wait
|
||||
```
|
||||
|
||||
```python
|
||||
import argparse
|
||||
|
||||
from datasets import load_dataset
|
||||
from trl import RLOOTrainer, RLOOConfig
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument("--vllm_server_host", type=str, default="", help="The server IP")
|
||||
args = parser.parse_args()
|
||||
|
||||
# Example dataset from TLDR
|
||||
dataset = load_dataset("trl-lib/tldr", split="train")
|
||||
|
||||
# Dummy reward function: count the number of unique characters in the completions
|
||||
def reward_num_unique_chars(completions, **kwargs):
|
||||
return [len(set(c)) for c in completions]
|
||||
|
||||
training_args = RLOOConfig(
|
||||
output_dir="Qwen2.5-72B-RLOO",
|
||||
per_device_train_batch_size=4,
|
||||
bf16=True,
|
||||
gradient_checkpointing=True,
|
||||
use_vllm=True,
|
||||
vllm_server_host=args.vllm_server_host.replace("ip-", "").replace("-", "."), # from ip-X-X-X-X to X.X.X.X
|
||||
)
|
||||
|
||||
trainer = RLOOTrainer(model="Qwen/Qwen2.5-72B", args=training_args, reward_funcs=reward_num_unique_chars, train_dataset=dataset)
|
||||
trainer.train()
|
||||
|
||||
if __name__=="__main__":
|
||||
main()
|
||||
```
|
||||
|
||||
### Using a custom reward function
|
||||
|
||||
The [`RLOOTrainer`] supports using custom reward functions instead of dense reward models. To ensure compatibility, your reward function must satisfy the following requirements:
|
||||
|
||||
1. **Input arguments**:
|
||||
- The function must accept the following as keyword arguments:
|
||||
- `prompts` (contains the prompts),
|
||||
- `completions` (contains the generated completions),
|
||||
- `completions_ids` (contains the tokenized completions),
|
||||
- `trainer_state` ([`~transformers.TrainerState`]): The current state of the trainer. This can be used to implement dynamic reward functions, such as curriculum learning, where the reward is adjusted based on the training progress (see the sketch after this list).
|
||||
- All column names (except `prompt`) that the dataset may have. For example, if the dataset contains a column named `ground_truth`, the function will be called with `ground_truth` as a keyword argument.
|
||||
|
||||
The easiest way to comply with this requirement is to use `**kwargs` in the function signature.
|
||||
- Depending on the dataset format, the input will vary:
|
||||
- For [standard format](dataset_formats#standard), `prompts` and `completions` will be lists of strings.
|
||||
- For [conversational format](dataset_formats#conversational), `prompts` and `completions` will be lists of message dictionaries.
|
||||
|
||||
2. **Return value**: The function must return a list of floats. Each float represents the reward corresponding to a single completion.
|
||||
|
||||
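As a minimal, hedged sketch of how `trainer_state` can drive a dynamic reward (the function name and the linear schedule below are illustrative, not part of TRL):

```python
def scheduled_length_reward(completions_ids, trainer_state, **kwargs):
    """Hypothetical curriculum reward: penalize long completions more as training progresses."""
    progress = trainer_state.global_step / max(trainer_state.max_steps, 1)
    return [-progress * len(ids) for ids in completions_ids]
```
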
#### Example 1: Reward longer completions
|
||||
|
||||
Below is an example of a reward function for a standard format that rewards longer completions:
|
||||
|
||||
```python
|
||||
def reward_func(completions_ids, **kwargs):
|
||||
"""Reward function that assigns higher scores to longer completions (in terms of token count)."""
|
||||
return [float(len(ids)) for ids in completions_ids]
|
||||
```
|
||||
|
||||
You can test it as follows:
|
||||
|
||||
```python
|
||||
>>> prompts = ["The sky is", "The sun is"] # not used in the reward function, but the trainer will pass it
|
||||
>>> completions = [" blue.", " in the sky."] # not used in the reward function, but the trainer will pass it
|
||||
>>> completions_ids = [[6303, 13], [304, 279, 12884, 13]]
|
||||
>>> reward_func(prompts=prompts, completions=completions, completions_ids=completions_ids)
|
||||
[2.0, 4.0]
|
||||
```
|
||||
|
||||
#### Example 1.1: Reward longer completions (based on the number of characters)
|
||||
|
||||
Same as the previous example, but this time the reward function is based on the number of characters instead of tokens.
|
||||
|
||||
```python
|
||||
def reward_func(completions, **kwargs):
|
||||
"""Reward function that assigns higher scores to longer completions (in terms of character count)."""
|
||||
return [float(len(completion)) for completion in completions]
|
||||
```
|
||||
|
||||
You can test it as follows:
|
||||
|
||||
```python
|
||||
>>> prompts = ["The sky is", "The sun is"]
|
||||
>>> completions = [" blue.", " in the sky."]
|
||||
>>> completions_ids = [[6303, 13], [304, 279, 12884, 13]] # not used in the reward function, but the trainer will pass it
|
||||
>>> reward_func(prompts=prompts, completions=completions, completions_ids=completions_ids)
|
||||
[6.0, 12.0]
|
||||
```
|
||||
|
||||
#### Example 2: Reward completions with specific format
|
||||
|
||||
Below is an example of a reward function that checks if the completion has a specific format. This example is inspired by the _format reward_ function used in the paper [DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning](https://huggingface.co/papers/2501.12948).
|
||||
It is designed for conversational format, where prompts and completions consist of structured messages.
|
||||
|
||||
```python
|
||||
import re
|
||||
|
||||
def format_reward_func(completions, **kwargs):
|
||||
"""Reward function that checks if the completion has a specific format."""
|
||||
pattern = r"^<think>.*?</think><answer>.*?</answer>$"
|
||||
completion_contents = [completion[0]["content"] for completion in completions]
|
||||
matches = [re.match(pattern, content) for content in completion_contents]
|
||||
return [1.0 if match else 0.0 for match in matches]
|
||||
```
|
||||
|
||||
You can test this function as follows:
|
||||
|
||||
```python
|
||||
>>> prompts = [
|
||||
...     [{"role": "user", "content": "What is the result of (1 + 2) * 4?"}],
...     [{"role": "user", "content": "What is the result of (3 + 1) * 2?"}],
|
||||
... ]
|
||||
>>> completions = [
|
||||
... [{"role": "assistant", "content": "<think>The sum of 1 and 2 is 3, which we multiply by 4 to get 12.</think><answer>(1 + 2) * 4 = 12</answer>"}],
|
||||
... [{"role": "assistant", "content": "The sum of 3 and 1 is 4, which we multiply by 2 to get 8. So (3 + 1) * 2 = 8."}],
|
||||
... ]
|
||||
>>> format_reward_func(prompts=prompts, completions=completions)
|
||||
[1.0, 0.0]
|
||||
```
|
||||
|
||||
#### Example 3: Reward completions based on a reference
|
||||
|
||||
Below is an example of a reward function that checks if the completion is correct. This example is inspired by the _accuracy reward_ function used in the paper [DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning](https://huggingface.co/papers/2501.12948).
|
||||
This example is designed for [standard format](dataset_formats#standard), where the dataset contains a column named `ground_truth`.
|
||||
|
||||
```python
|
||||
import re
|
||||
|
||||
def reward_func(completions, ground_truth, **kwargs):
|
||||
# Regular expression to capture content inside \boxed{}
|
||||
matches = [re.search(r"\\boxed\{(.*?)\}", completion) for completion in completions]
|
||||
contents = [match.group(1) if match else "" for match in matches]
|
||||
# Reward 1 if the content is the same as the ground truth, 0 otherwise
|
||||
return [1.0 if c == gt else 0.0 for c, gt in zip(contents, ground_truth)]
|
||||
```
|
||||
|
||||
You can test this function as follows:
|
||||
|
||||
```python
|
||||
>>> prompts = ["Problem: Solve the equation $2x + 3 = 7$. Solution:", "Problem: Solve the equation $3x - 5 = 10$."]
|
||||
>>> completions = [r" The solution is \boxed{2}.", r" The solution is \boxed{6}."]
|
||||
>>> ground_truth = ["2", "5"]
|
||||
>>> reward_func(prompts=prompts, completions=completions, ground_truth=ground_truth)
|
||||
[1.0, 0.0]
|
||||
```
|
||||
#### Example 4: Multi-task reward functions
|
||||
|
||||
Below is an example of using multiple reward functions in the [`RLOOTrainer`]. In this example, we define two task-specific reward functions: `math_reward_func` and `coding_reward_func`. The `math_reward_func` rewards math problems based on their correctness, while the `coding_reward_func` rewards coding problems based on whether the solution works.
|
||||
|
||||
```python
|
||||
from datasets import Dataset
|
||||
from trl import RLOOTrainer
|
||||
|
||||
# Define a dataset that contains both math and coding problems
|
||||
dataset = Dataset.from_list(
|
||||
[
|
||||
{"prompt": "What is 2+2?", "task": "math"},
|
||||
{"prompt": "Write a function that returns the sum of two numbers.", "task": "code"},
|
||||
{"prompt": "What is 3*4?", "task": "math"},
|
||||
{"prompt": "Write a function that returns the product of two numbers.", "task": "code"},
|
||||
]
|
||||
)
|
||||
|
||||
# Math-specific reward function
|
||||
def math_reward_func(prompts, completions, task, **kwargs):
|
||||
rewards = []
|
||||
for prompt, completion, t in zip(prompts, completions, task):
|
||||
if t == "math":
|
||||
# Calculate math-specific reward
|
||||
correct = check_math_solution(prompt, completion)
|
||||
reward = 1.0 if correct else -1.0
|
||||
rewards.append(reward)
|
||||
else:
|
||||
# Return None for non-math tasks
|
||||
rewards.append(None)
|
||||
return rewards
|
||||
|
||||
# Coding-specific reward function
|
||||
def coding_reward_func(prompts, completions, task, **kwargs):
|
||||
rewards = []
|
||||
for prompt, completion, t in zip(prompts, completions, task):
|
||||
if t == "code":
|
||||
# Calculate coding-specific reward
|
||||
works = test_code_solution(prompt, completion)
|
||||
reward = 1.0 if works else -1.0
|
||||
rewards.append(reward)
|
||||
else:
|
||||
# Return None for non-coding tasks
|
||||
rewards.append(None)
|
||||
return rewards
|
||||
|
||||
# Use both task-specific reward functions
|
||||
trainer = RLOOTrainer(
|
||||
model="Qwen/Qwen2-0.5B-Instruct",
|
||||
reward_funcs=[math_reward_func, coding_reward_func],
|
||||
train_dataset=dataset,
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
```
|
||||
|
||||
In this example, the `math_reward_func` and `coding_reward_func` are designed to work with a mixed dataset that contains both math and coding problems. The `task` column in the dataset is used to determine which reward function to apply to each problem. If there is no relevant reward function for a sample in the dataset, the reward function will return `None` and the [`RLOOTrainer`] will continue with the valid functions and tasks. This allows the [`RLOOTrainer`] to handle multiple reward functions with different applicability.
|
||||
|
||||
Note that the [`RLOOTrainer`] will ignore the `None` rewards returned by the reward functions and only consider the rewards returned by the relevant functions. This ensures that the model is trained on the relevant tasks and ignores the tasks for which there is no relevant reward function.
|
||||
|
||||
|
||||
|
||||
#### Passing the reward function to the trainer
|
||||
|
||||
To use your custom reward function, pass it to the [`RLOOTrainer`] as follows:
|
||||
|
||||
```python
|
||||
from trl import RLOOTrainer
|
||||
|
||||
trainer = RLOOTrainer(
|
||||
reward_funcs=reward_func,
|
||||
...,
|
||||
)
|
||||
```
|
||||
|
||||
If you have multiple reward functions, you can pass them as a list:
|
||||
|
||||
```python
|
||||
from trl import RLOOTrainer
|
||||
|
||||
trainer = RLOOTrainer(
|
||||
reward_funcs=[reward_func1, reward_func2],
|
||||
...,
|
||||
)
|
||||
```
|
||||
|
||||
and the reward will be computed as the sum of the rewards from each function, or the weighted sum if `reward_weights` is provided in the config.
|
||||
|
||||
Note that [`RLOOTrainer`] supports multiple reward functions of different types. See the parameters documentation for more details.
|
||||
|
||||
## RLOOTrainer
|
||||
|
||||
[[autodoc]] RLOOTrainer
|
||||
- train
|
||||
- save_model
|
||||
- push_to_hub
|
||||
|
||||
## RLOOConfig
|
||||
|
||||
[[autodoc]] RLOOConfig
|
||||
|
||||
## References
|
||||
|
||||
1. [RLOO Paper](https://openreview.net/pdf?id=r1lgTGL5DE)
|
||||
2. [Paper Back to Basics: Revisiting REINFORCE Style Optimization for Learning from Human Feedback in LLMs](https://huggingface.co/papers/2402.14740)
|
||||
3. [Paper - REINFORCE++: A Simple and Efficient Approach for Aligning Large Language Models](https://huggingface.co/papers/2501.03262)
|
||||
4. [Blog Post - Putting RL back in RLHF](https://huggingface.co/blog/putting_rl_back_in_rlhf_with_rloo)
|
||||
5. [Blog Post - Unraveling RLHF and Its Variants: Progress and Practical Engineering Insights](https://hijkzzz.notion.site/unraveling-rlhf-and-its-variants-engineering-insights#147d9a33ecc9806090f3d5c749d31f05)
|
||||
6. [Youtube - RLOO: A Cost-Efficient Optimization for Learning from Human Feedback in LLMs](https://www.youtube.com/watch?v=86asXGPK6RU&ab_channel=BuzzRobot)
|
||||
|
||||
## Migration Guide from the old implementation (0.21 and below)
|
||||
|
||||
With the release of version 0.22.0, we have revamped the [`RLOOTrainer`] to be more aligned with other online trainers in the library, such as [`GRPOTrainer`]. This new implementation introduces several changes to the configuration parameters and the overall structure of the trainer.
|
||||
Below is a summary of the key changes for [`RLOOConfig`]:
|
||||
|
||||
| TRL ≤ 0.21.x | TRL ≥ 0.22.0 |
|
||||
| --- | --- |
|
||||
| `rloo_k` | renamed to `num_generations` |
|
||||
| `cliprange` | renamed to `epsilon` |
|
||||
| `kl_coef` | renamed to `beta` |
|
||||
| `exp_name` | renamed to `run_name`. Use `run_name = f"{exp_name}__{seed}__{int(time.time())}"` to replicate old behavior |
|
||||
| `normalize_reward` | renamed to `normalize_advantages`. Note: despite the old name, this option always normalized advantages |
|
||||
| `num_ppo_epochs` | renamed to `num_iterations` (default: `1`) |
|
||||
| `token_level_kl` | **removed** – KL is now computed only at the sequence level |
|
||||
| `dataset_num_proc` | **removed** – it was unused |
|
||||
| `num_mini_batches` | renamed to `steps_per_generation` |
|
||||
| `total_episodes` | use `max_steps=total_episodes / gradient_accumulation_steps` instead |
|
||||
| `local_rollout_forward_batch_size` | **removed** – now automatically set to `per_device_train_batch_size` (or `per_device_eval_batch_size` during evaluation) |
|
||||
| `num_sample_generations` | **removed** – use `logging_steps` to control generation logging frequency |
|
||||
| `response_length` | renamed to `max_completion_length` (default: `256`) |
|
||||
| `stop_token` | **removed** |
|
||||
| `stop_token_id` | **removed** – use `processing_class.eos_token_id` instead |
|
||||
| `missing_eos_penalty` | **removed** – replicate with a custom reward function checking if `eos_token_id` is in `completions_ids` (see the sketch below this table) |
|
||||
|
||||
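As a minimal, hedged sketch of that replacement (the penalty value and the hard-coded token id are illustrative; in practice take the id from your tokenizer, e.g. `tokenizer.eos_token_id`):

```python
def missing_eos_penalty_reward(completions_ids, **kwargs):
    """Illustrative stand-in for the old `missing_eos_penalty`: penalize completions without an EOS token."""
    eos_token_id = 151645  # e.g. tokenizer.eos_token_id for Qwen2.5 models
    return [0.0 if eos_token_id in ids else -1.0 for ids in completions_ids]
```
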
Below is a summary of the key changes for [`RLOOTrainer`]:
|
||||
|
||||
| TRL ≤ 0.21.x | TRL ≥ 0.22.0 |
|
||||
| --- | --- |
|
||||
| `config` | renamed to `args` |
|
||||
| `reward_model` | renamed to `reward_funcs`, which now supports both reward models and custom reward functions |
|
||||
| `policy` | renamed to `model` |
|
||||
| `ref_policy` | **removed** – the reference model is now created automatically from `model` |
|
||||
| `data_collator` | **removed** |
|
||||
|
@ -10,3 +10,15 @@
|
||||
- parse_args_and_config
|
||||
- parse_args_into_dataclasses
|
||||
- set_defaults_with_config
|
||||
|
||||
## get_dataset
|
||||
|
||||
[[autodoc]] get_dataset
|
||||
|
||||
## DatasetConfig
|
||||
|
||||
[[autodoc]] scripts.utils.DatasetConfig
|
||||
|
||||
## DatasetMixtureConfig
|
||||
|
||||
[[autodoc]] DatasetMixtureConfig
|
||||
|
@ -1,6 +1,6 @@
|
||||
# Sentiment Tuning Examples
|
||||
|
||||
|
||||
The notebooks and scripts in these examples show how to fine-tune a model with a sentiment classifier (such as `lvwerra/distilbert-imdb`).
|
||||
|
||||
Here's an overview of the notebooks and scripts in the [trl repository](https://github.com/huggingface/trl/tree/main/examples):
|
||||
|
||||
|
@ -37,7 +37,13 @@ training_args = OnlineDPOConfig(..., use_vllm=True)
|
||||
</hfoption>
|
||||
<hfoption id="GRPO">
|
||||
|
||||
|
||||
First, start a vLLM server by running:
|
||||
|
||||
```bash
|
||||
trl vllm-serve --model <model_name>
|
||||
```
|
||||
|
||||
Then, run the training script and pass `use_vllm=True` in the training arguments.
|
||||
|
||||
```python
|
||||
from trl import GRPOConfig
|
||||
@ -45,31 +51,58 @@ from trl import GRPOConfig
|
||||
training_args = GRPOConfig(..., use_vllm=True)
|
||||
```
|
||||
|
||||
You can customize the server configuration by passing additional arguments. For more information, see [vLLM integration](vllm_integration).

<Tip warning={true}>

When using vLLM, ensure that the GPUs assigned for training and generation are separate to avoid resource conflicts. For instance, if you plan to use 4 GPUs for training and another 4 for vLLM generation, you can specify the GPU allocation using `CUDA_VISIBLE_DEVICES`.

Set GPUs **0-3** for vLLM generation:

```sh
CUDA_VISIBLE_DEVICES=0,1,2,3 trl vllm-serve --model <model_name>
```

And GPUs **4-7** for training:

```sh
CUDA_VISIBLE_DEVICES=4,5,6,7 accelerate launch train.py
```

</Tip>
|
||||
</hfoption>
|
||||
<hfoption id="RLOO">
|
||||
|
||||
First, start a vLLM server by running:
|
||||
|
||||
```bash
|
||||
trl vllm-serve --model <model_name>
|
||||
```
|
||||
|
||||
Then, run the training script and pass `use_vllm=True` in the training arguments.
|
||||
|
||||
```python
|
||||
from trl import RLOOConfig

training_args = RLOOConfig(..., use_vllm=True)
|
||||
```
|
||||
|
||||
You can customize the server configuration by passing additional arguments. For more information, see [vLLM integration](vllm_integration).
|
||||
|
||||
<Tip warning={true}>
|
||||
|
||||
When using vLLM, ensure that the GPUs assigned for training and generation are separate to avoid resource conflicts. For instance, if you plan to use 4 GPUs for training and another 4 for vLLM generation, you can specify GPU allocation using `CUDA_VISIBLE_DEVICES`.
|
||||
|
||||
Set GPUs **0-3** for vLLM generation:
|
||||
```sh
|
||||
CUDA_VISIBLE_DEVICES=0,1,2,3 trl vllm-serve --model <model_name>
|
||||
```
|
||||
|
||||
And GPUs **4-7** for training:
|
||||
```sh
|
||||
CUDA_VISIBLE_DEVICES=4,5,6,7 accelerate launch train.py
|
||||
```
|
||||
|
||||
</Tip>
|
||||
|
||||
</hfoption>
|
||||
</hfoptions>
|
||||
|
@ -1,197 +0,0 @@
|
||||
# Text Environments
|
||||
|
||||
Text environments provide a learning ground for language agents. They allow a language model to use tools to accomplish a task, such as using a Python interpreter to answer math questions or using a search index for trivia questions. Having access to tools allows language models to solve tasks that would be very hard for the model itself but can be trivial with the appropriate tools. A good example is arithmetic with large numbers, which becomes a simple copy-paste task once you have access to a calculator.
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/textenv.png">
|
||||
</div>
|
||||
|
||||
Let's dive into how text environments work and start with tools!
|
||||
|
||||
## Tools
|
||||
|
||||
One of the core building blocks of text environments are the tools that the model can use to solve tasks. In general, tools can be any Python function that takes a string as input and returns a string. The `TextEnvironment` offers two options for tools: either use predefined tools from `transformers.Tool`, or define your own function or class with a `__call__` method. Let's have a look at both!
|
||||
|
||||
### `transformers.Tool`
|
||||
|
||||
Text environments fully support tools of the class `transformers.Tool`. The advantage of building tools in that framework is that they can easily be shared.
|
||||
|
||||
```Python
|
||||
from transformers import load_tool
|
||||
|
||||
# simple calculator tool that runs +-/* operations
|
||||
calc_tool = load_tool("ybelkada/simple-calculator")
|
||||
|
||||
# python interpreter that executes program and returns outputs
|
||||
py_tool = load_tool("lvwerra/python-interpreter")
|
||||
|
||||
# wikipedia search index that returns best search match
|
||||
wiki_tool = load_tool("vwxyzjn/pyserini-wikipedia-kilt-doc")
|
||||
```
|
||||
|
||||
These tools are either loaded from the hub or from a local folder. Using the tool is as simple as calling them with a text query:
|
||||
|
||||
```Python
|
||||
calc_tool("1/2")
|
||||
>>> "0.5"
|
||||
```
|
||||
|
||||
Note that both input and return values are strings to enable easy usage with a language model.
|
||||
|
||||
### Custom Tools
|
||||
|
||||
The following is an example of a tool that adds two integers:
|
||||
|
||||
```Python
|
||||
def add(text):
|
||||
int_1, int_2 = text.split("+")
|
||||
result = int(int_1) + int(int_2)
|
||||
return str(result)
|
||||
|
||||
print(add("1+1"))
|
||||
>>> "2"
|
||||
```
|
||||
|
||||
We looked at basic examples such as a calculator but the principle holds for more complex tools as well such as a web search tool where you input the query and get the search results in return. Now let's look at how the model can use the tools with the call syntax.
|
||||
|
||||
### Call syntax
|
||||
|
||||
In order to have a unified way for the model to call a tool we created a simple syntax that looks as follows:
|
||||
|
||||
```python
|
||||
"<request><TOOL_NAME>QUERY<call>TOOL_RESPONSE<response>"
|
||||
```
|
||||
|
||||
There are a few special tokens involved, so let's decompose it: First, the model can signal that it wants to use a tool by emitting the `<request>` token. After that, we want to know the name of the tool to call, which is done by enclosing the tool name in `<>` brackets. Once we know which tool to call, the tool query follows, in free-text form. The `<call>` token signifies the end of the query and stops the model generation. At this point, the model output is parsed and the query is sent to the tool. The environment appends the tool response to the string, followed by the `<response>` token to mark the end of the tool output.
|
||||
|
||||
Let's look at the concrete example of the calculator and assume its name is `Calculator` (more on how the name of a tool is inferred later):
|
||||
|
||||
```python
|
||||
"<request><Calculator>1/2<call>0.5<response>"
|
||||
```
|
||||
|
||||
Finally, the episode is ended and generation stops when the model generates `<submit>` which marks the interaction as completed.
|
||||
|
||||
Now let's have a look how we can create a new text environment!
|
||||
|
||||
## Create a `TextEnvironment`
|
||||
|
||||
|
||||
```python
|
||||
prompt = """\
|
||||
What is 13-3?
|
||||
<request><SimpleCalculatorTool>13-3<call>10.0<response>
|
||||
Result=10<submit>
|
||||
"""
|
||||
|
||||
def reward_fn(result, answer):
|
||||
"""Simplified reward function returning 1 if result matches answer and 0 otherwise."""
|
||||
result_parsed = result.split("=")[1].split("<")[0]
|
||||
return int(result_parsed==answer)
|
||||
|
||||
text_env = TextEnvironemnt(
|
||||
model=model,
|
||||
tokenizer=tokenizer,
|
||||
tools= {"SimpleCalculatorTool": load_tool("ybelkada/simple-calculator")},
|
||||
reward_fn=exact_match_reward,
|
||||
prompt=prompt,
|
||||
max_turns=1
|
||||
max_tool_response=100
|
||||
generation_kwargs={"do_sample": "true"}
|
||||
)
|
||||
```
|
||||
|
||||
Let's decompose the settings:
|
||||
|
||||
| Argument | Description |
|
||||
|:-------------------|:----------------|
|
||||
| `model` | Language model to interact with the environment and generate requests. |
|
||||
| `tokenizer` | Tokenizer of language model handling tokenization of strings. |
|
||||
| `tools` | A `list` or `dict` of tools. If a `list` is passed, the tool names are inferred from the class names; if a `dict` is passed, the keys are used as tool names.|
|
||||
| `reward_fn` | A function that takes a string as input and returns a reward. It can have extra arguments that are passed to `.run()`, such as the ground truth.|
|
||||
| `prompt` | Prompt to prepend to every task. Usually a few examples to demonstrate to the model how to use the tools in a few-shot fashion. |
|
||||
| `max_turns` | Maximum number of interactions between model and tools before episode ends.|
|
||||
| `max_tool_response`| The tool response is truncated to this number to avoid running out of model context.|
|
||||
| `max_length` | The maximum number of tokens to allow in an episode. |
|
||||
| `generation_kwargs`| Generation settings used by the language model. |
|
||||
|
||||
You can customize the environment to your needs and add custom tools and settings. Let's see how you can use the environment to have the model interact with the available tools!
|
||||
|
||||
|
||||
## Run an Episode
|
||||
|
||||
To run a set of queries through the text environment one can simply use the `run` method.
|
||||
|
||||
```python
|
||||
queries = ["What is 1/2?"]
|
||||
answers = ["0.5"]
|
||||
|
||||
queries, responses, masks, rewards, histories = text_env.run(queries, answers=answers)
|
||||
```
|
||||
|
||||
This will execute the model/tool feedback loop for each query until either no tool is called anymore, the maximum number of turns is reached, or the maximum number of tokens in an episode is exceeded. The extra `kwargs` (e.g. `answers=answers` above) passed to `run` will be passed on to the reward function.
|
||||
|
||||
There are five objects that are returned by `run`:
|
||||
|
||||
- `queries`: a list of the tokenized queries
|
||||
- `responses`: all tokens that have been generated within the environment, including model and tool tokens
|
||||
- `masks`: mask that indicates which tokens have been generated by the model and which tokens are generated by the tool
|
||||
- `rewards`: a list of rewards for each query/response
|
||||
- `histories`: list of `TextHistory` objects, which are useful objects containing all the above and also the text equivalents
|
||||
|
||||
The masks are crucial for training, as we don't want to optimize tokens that the model did not generate, which are the tokens produced by the tools.
|
||||
|
||||
Next, we'll train a PPO step with the generated responses!
|
||||
|
||||
|
||||
### Train
|
||||
Training on episodes from the `TextEnvironment` is straightforward and simply requires forwarding all the returned variables except the `TextHistory` objects to the `step` method:
|
||||
|
||||
```python
|
||||
train_stats = ppo_trainer.step(queries, responses, rewards, masks)
|
||||
```
|
||||
|
||||
## `TextHistory`
|
||||
|
||||
The `TextHistory` object stores the interactions between the model and the text environment. It stores tokens and text generated in each turn and their source in each turn (model or system) as well as rewards. Let's go through the class attributes and methods.
|
||||
|
||||
### Attributes
|
||||
|
||||
The following table summarises the available attributes of the `TextHistory` class:
|
||||
|
||||
| Attribute | Description |
|
||||
|:-------------------|:----------------|
|
||||
| `text` | The full string of the text generated in the text environment with both model and system generated text. |
|
||||
| `text_spans` | A list of tuples with the spans for each model or system generated text segment. |
|
||||
| `system_spans` | A list of boolean values indicating if the segment is model or system generated. |
|
||||
| `tokens` | All tokens generated in text environment with both model and system generated tokens. |
|
||||
| `token_spans` | Similar to `text_spans`, the `token_spans` indicate the boundaries of model and system generated tokens. |
|
||||
| `token_masks` | The token masks can be used to ignore system generated tokens by masking them. |
|
||||
| `completed` | Indicates if the interaction with the environment has completed. |
|
||||
| `truncated` | Indicates if the interaction with the environment has completed because max length was reached. |
|
||||
|
||||
With these attributes you can reconstruct every interaction of the model with the `TextEnvironment`. The `TextHistory` also lets you visualize the text history. Let's have a look!
|
||||
|
||||
### Visualization
|
||||
|
||||
When the model interacts inside the `TextEnvironment` it can be useful to visualize and separate which parts of the text outputs were generated by the model and which parts come from the system and tools. For that purpose there are the two methods [`TextHistory.show_text`] and [`TextHistory.show_tokens`]. They print the text and tokens respectively and highlight the various segments using the [`rich` library](https://github.com/Textualize/rich) (make sure to install it before using these methods).
|
||||
|
||||
You can see that the prompt is highlighted in gray, whereas system segments such as query and tool responses are highlighted in green. All segments generated by the model are highlighted in blue and, in addition to the pure text output, the reward is displayed as additional text in plum. Here is an example of `show_text`:
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/textenv_show_text.png" width=600>
|
||||
</div>
|
||||
|
||||
Sometimes there can be tricky tokenization related issues that are hidden when showing the decoded text. Thus `TextHistory` also offers an option to display the same highlighting on the tokens directly with `show_tokens`:
|
||||
|
||||
<div style="text-align: center">
|
||||
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/textenv_show_tokens.png" width=800>
|
||||
</div>
|
||||
|
||||
Note that you can turn on the colour legend by passing `show_legend=True`.
|
||||
|
||||
## API Documentation
|
||||
|
||||
[[autodoc]] TextEnvironment
|
||||
|
||||
[[autodoc]] TextHistory
|
@ -1,7 +1,125 @@
|
||||
# Unsloth Integration
|
||||
|
||||
Unsloth is an open‑source framework for fine‑tuning and reinforcement learning that trains LLMs (like Llama, OpenAI gpt-oss, Mistral, Gemma, DeepSeek, and more) up to 2× faster with up to 80% less VRAM. Unsloth allows [training](https://huggingface.co/docs/trl/en/unsloth_integration#Training), evaluation, running and [deployment](https://huggingface.co/docs/trl/en/unsloth_integration#Saving-the-model) with other inference engines like llama.cpp, Ollama and vLLM.

The library provides a streamlined, Hugging Face compatible workflow for training, evaluation, inference and deployment, and is fully compatible with [`SFTTrainer`].
|
||||
## Key Features
|
||||
|
||||
- Training support for all transformer compatible models: Text-to-speech (TTS), multimodal, BERT, RL and more
|
||||
- Supports full fine-tuning, pretraining, LoRA, QLoRA, 8-bit training & more
|
||||
- Works on Linux, Windows, Colab, Kaggle; NVIDIA GPUs, soon AMD & Intel setups
|
||||
- Supports most features TRL supports, including RLHF (GSPO, GRPO, DPO etc.)
|
||||
- Hand-written Triton kernels and a manual backprop engine ensure no accuracy degradation (0% approximation error)
|
||||
|
||||
## Installation
|
||||
|
||||
### pip install
|
||||
|
||||
Local Installation (Linux recommended):
|
||||
|
||||
```sh
|
||||
pip install unsloth
|
||||
```
|
||||
|
||||
You can also install `unsloth` according to the [official documentation](https://docs.unsloth.ai/get-started/installing-+-updating). Once installed, you can incorporate unsloth into your workflow in a very simple manner; instead of loading [`~transformers.AutoModelForCausalLM`], you just need to load a `FastLanguageModel` as follows:
|
||||
|
||||
```python
|
||||
import torch
|
||||
from trl import SFTConfig, SFTTrainer
|
||||
from unsloth import FastLanguageModel
|
||||
|
||||
max_length = 2048 # Supports automatic RoPE Scaling, so choose any number
|
||||
|
||||
# Load model
|
||||
model, tokenizer = FastLanguageModel.from_pretrained(
|
||||
model_name="unsloth/mistral-7b",
|
||||
max_seq_length=max_length,
|
||||
dtype=None, # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
|
||||
load_in_4bit=True, # Use 4bit quantization to reduce memory usage. Can be False
|
||||
)
|
||||
|
||||
# Do model patching and add fast LoRA weights
|
||||
model = FastLanguageModel.get_peft_model(
|
||||
model,
|
||||
r=16,
|
||||
target_modules=[
|
||||
"q_proj",
|
||||
"k_proj",
|
||||
"v_proj",
|
||||
"o_proj",
|
||||
"gate_proj",
|
||||
"up_proj",
|
||||
"down_proj",
|
||||
],
|
||||
lora_alpha=16,
|
||||
lora_dropout=0, # Dropout = 0 is currently optimized
|
||||
bias="none", # Bias = "none" is currently optimized
|
||||
use_gradient_checkpointing=True,
|
||||
random_state=3407,
|
||||
)
|
||||
|
||||
training_args = SFTConfig(output_dir="./output", max_length=max_length)
|
||||
|
||||
trainer = SFTTrainer(
|
||||
model=model,
|
||||
args=training_args,
|
||||
train_dataset=dataset,
|
||||
)
|
||||
trainer.train()
|
||||
```
|
||||
|
||||
The saved model is fully compatible with Hugging Face's transformers library. Learn more about unsloth in their [official repository](https://github.com/unslothai/unsloth).
|
||||
|
||||
### Docker Install
|
||||
|
||||
```sh
|
||||
docker run -d -e JUPYTER_PASSWORD="mypassword" \
|
||||
-p 8888:8888 -p 2222:22 \
|
||||
-v $(pwd)/work:/workspace/work \
|
||||
--gpus all \
|
||||
unsloth/unsloth
|
||||
```
|
||||
|
||||
Access Jupyter Lab at `http://localhost:8888` and start fine-tuning!
|
||||
|
||||
## Training
|
||||
|
||||
These are some core settings you can toggle before training:
|
||||
|
||||
- `max_seq_length = 2048` – Controls context length. While Llama-3 supports 8192, we recommend 2048 for testing. Unsloth enables 4× longer context fine-tuning.
- `dtype = None` – Defaults to None; use `torch.float16` or `torch.bfloat16` for newer GPUs.
- `load_in_4bit = True` – Enables 4-bit quantization, reducing memory use 4× for fine-tuning. Disabling it allows LoRA 16-bit fine-tuning.
- To enable full fine-tuning (FFT), set `full_finetuning = True`. For 8-bit fine-tuning, set `load_in_8bit = True`. Note: only one training method can be set to `True` at a time (see the sketch after this list).
|
||||
|
||||
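As a minimal sketch of how these toggles map onto the loader call shown earlier (the flag combination is illustrative; enable only one of the quantization/full fine-tuning options at a time):

```python
from unsloth import FastLanguageModel

# Illustrative: 8-bit fine-tuning instead of the default 4-bit QLoRA setup
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/mistral-7b",
    max_seq_length=2048,
    dtype=None,           # auto-detect; or torch.float16 / torch.bfloat16
    load_in_4bit=False,
    load_in_8bit=True,    # 8-bit fine-tuning
    # full_finetuning=True,  # alternatively, full fine-tuning (keep the quantization flags False)
)
```
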
For more information on configuring Unsloth's hyperparameters and features, read their [documentation guide here](https://docs.unsloth.ai/get-started/fine-tuning-llms-guide).
|
||||
|
||||
## Saving the model
|
||||
|
||||
Unsloth allows you to directly save the finetuned model as a small file called a LoRA adapter. You can instead push to the Hugging Face hub as well if you want to upload your model! Remember to get a [Hugging Face token](https://huggingface.co/settings/tokens) and add your token!
|
||||
|
||||
### Saving to GGUF
|
||||
|
||||
To save to GGUF, Unsloth uses llama.cpp. To save locally:
|
||||
|
||||
```python
|
||||
model.save_pretrained_gguf("directory", tokenizer, quantization_method = "q4_k_m")
|
||||
model.save_pretrained_gguf("directory", tokenizer, quantization_method = "q8_0")
|
||||
model.save_pretrained_gguf("directory", tokenizer, quantization_method = "f16")
|
||||
```
|
||||
|
||||
To push to the hub:
|
||||
|
||||
```python
|
||||
model.push_to_hub_gguf("hf_username/directory", tokenizer, quantization_method = "q4_k_m")
|
||||
model.push_to_hub_gguf("hf_username/directory", tokenizer, quantization_method = "q8_0")
|
||||
```
|
||||
|
||||
### Saving to vLLM
|
||||
|
||||
To save to 16-bit for vLLM, use:
|
||||
|
||||
```python
|
||||
model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
|
||||
model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")
|
||||
```
|
||||
|
@ -36,7 +36,7 @@ print(pipe("This movie was really")[0]["generated_text"])
|
||||
from peft import PeftConfig, PeftModel
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
base_model_name = "kashif/stack-llama-2" #path/to/your/model/or/name/on/hub"
|
||||
base_model_name = "kashif/stack-llama-2" #path/to/your/model/or/name/on/hub
|
||||
adapter_model_name = "path/to/my/adapter"
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(base_model_name)
|
||||
|
@ -7,7 +7,7 @@ We've begun rolling out examples to use Meta's LLaMA models in `trl` (see [Meta'
|
||||
Even training the smallest LLaMA model requires an enormous amount of memory. Some quick math: in bf16, every parameter uses 2 bytes (in fp32, 4 bytes) in addition to 8 bytes used, e.g., in the Adam optimizer (see the [performance docs](https://huggingface.co/docs/transformers/perf_train_gpu_one#optimizer) in Transformers for more info). So a 7B parameter model would use `(2+8)*7B=70GB` just to fit in memory and would likely need more when you compute intermediate values such as attention scores. So you couldn’t train the model even on a single 80GB A100 like that. You can use some tricks, like more efficient optimizers or half-precision training, to squeeze a bit more into memory, but you’ll run out sooner or later.
|
||||
|
||||
Another option is to use Parameter-Efficient Fine-Tuning (PEFT) techniques, such as the [`peft`](https://github.com/huggingface/peft) library, which can perform low-rank adaptation (LoRA) on a model loaded in 8-bit.
|
||||
|
||||
For more on `peft` + `trl`, see the [Peft integration](peft_integration) docs.
|
||||
|
||||
Loading the model in 8bit reduces the memory footprint drastically since you only need one byte per parameter for the weights (e.g. 7B LLaMA is 7GB in memory).
|
||||
Instead of training the original weights directly, LoRA adds small adapter layers on top of some specific layers (usually the attention layers); thus, the number of trainable parameters is drastically reduced.
|
||||
@ -36,14 +36,13 @@ The easiest way to achieve this is by continuing to train the language model wit
|
||||
The [StackExchange dataset](https://huggingface.co/datasets/HuggingFaceH4/stack-exchange-preferences) is enormous (over 10 million instructions), so we can easily train the language model on a subset of it.
|
||||
|
||||
There is nothing special about fine-tuning the model before doing RLHF - it’s just the causal language modeling objective from pretraining that we apply here.
|
||||
|
||||
To use the data efficiently, we use a technique called packing: instead of having one text per sample in the batch and then padding to either the longest text or the maximal context of the model, we concatenate a lot of texts with an EOS token in between and cut chunks of the context size to fill the batch without any padding.
|
||||
|
||||

|
||||
|
||||
With this approach the training is much more efficient, as each token that is passed through the model is also trained, in contrast to padding tokens, which are usually masked from the loss.
|
||||
If you don't have much data and are more concerned about occasionally cutting off some tokens that are overflowing the context you can also use a classical data loader.
|
||||
|
||||
The packing is handled by the `ConstantLengthDataset` and we can then use the `Trainer` after loading the model with `peft`. First, we load the model in int8, prepare it for training, and then add the LoRA adapters.
|
||||
|
||||
```python
|
||||
# load model in 8bit
|
||||
|
196
docs/source/vllm_integration.md
Normal file
@ -0,0 +1,196 @@
|
||||
# vLLM Integration
|
||||
|
||||
This document will guide you through the process of using vLLM with TRL for faster generation in online methods like GRPO and Online DPO. We first summarize a tl;dr on how to use vLLM with TRL, and then we will go into the details of how it works under the hood. Let's go! 🔥
|
||||
|
||||
## 🚀 How can I use vLLM with TRL to speed up training?
|
||||
|
||||
💡 **Note**: Resources required for this specific example: a single node with 8 GPUs.
|
||||
|
||||
<Tip warning={true}>
|
||||
vLLM server and TRL trainer must use different CUDA devices to avoid conflicts.
|
||||
</Tip>
|
||||
|
||||
First, install vLLM using the following command:
|
||||
|
||||
```bash
|
||||
pip install "trl[vllm]"
|
||||
```
|
||||
|
||||
Then run the server on specific GPUs (e.g., GPUs 0-3):
|
||||
|
||||
```sh
|
||||
CUDA_VISIBLE_DEVICES=0,1,2,3 trl vllm-serve --model Qwen/Qwen2.5-7B --tensor-parallel-size 2 --data-parallel-size 2
|
||||
```
|
||||
|
||||
Once the server is running, you can use it to generate completions for training. In the example below, we are using the `GRPOTrainer` to train a model using the vLLM server for generation. The `--tensor-parallel-size` and `--data-parallel-size` arguments control how the model and data are sharded across GPUs.
|
||||
|
||||
In this example, we are sharding two copies of the model across 4 GPUs. Increasing data parallelism increases throughput, while increasing tensor parallelism allows for serving larger models. Then, run the training script on different GPUs (e.g., GPUs 4-7) by passing `use_vllm=True` in the training arguments as follows:
|
||||
|
||||
Sample of a simple `train.py` script:
|
||||
|
||||
```python
|
||||
from datasets import load_dataset
|
||||
from trl import GRPOTrainer, GRPOConfig
|
||||
|
||||
dataset = load_dataset("trl-lib/tldr", split="train")
|
||||
|
||||
# Dummy reward function: count the number of unique characters in the completions
|
||||
def reward_num_unique_chars(completions, **kwargs):
|
||||
return [len(set(c)) for c in completions]
|
||||
|
||||
training_args = GRPOConfig(
|
||||
output_dir="my_test",
|
||||
use_vllm=True,
|
||||
bf16=True,
|
||||
gradient_checkpointing=True,
|
||||
)
|
||||
|
||||
trainer = GRPOTrainer(
|
||||
model="Qwen/Qwen2.5-7B",
|
||||
args=training_args,
|
||||
reward_funcs=reward_num_unique_chars,
|
||||
train_dataset=dataset,
|
||||
)
|
||||
|
||||
trainer.train()
|
||||
```
|
||||
|
||||
And the train command on separate GPUs from the server:
|
||||
|
||||
```sh
|
||||
CUDA_VISIBLE_DEVICES=4,5,6,7 accelerate launch train.py
|
||||
```
|
||||
|
||||
## 🎬 Flashback: Why do we need to use vLLM in online methods?
|
||||
|
||||
Online methods like GRPO or Online DPO require the model to generate completions during training, which are then used to compute reward signals. However, generation can be extremely time-consuming, especially with large or reasoning models. In the default setup (without vLLM), completions are generated using the [(unwrapped) model's `generate` method](https://github.com/huggingface/trl/blob/f3e8c2304428ef16e9ae5de9e5741ed84d533b7b/trl/trainer/grpo_trainer.py#L965C39-L965C66). This approach quickly becomes a major bottleneck — generation is slow and inefficient, particularly for large batches or models. As a result, training times increase significantly, and overall efficiency drops. To address this, we turn to vLLM, which enables much faster and more scalable generation, helping eliminate this bottleneck in online methods.
|
||||
|
||||
## 🤔 How does vLLM solve the slow generation issue?
|
||||
|
||||
If you've ever done autoregressive decoder training, you know all the input tokens to the LLM produce their attention key and value tensors, and these tensors are kept in GPU memory to later generate subsequent tokens based on them. These cached key and value tensors are often referred to as the KV cache. However, storing the KV cache occupies a lot of memory, so vLLM uses a technique called **PagedAttention** to solve this problem. PagedAttention, which is inspired by the OS’s virtual memory concept, stores continuous keys and values in **non-contiguous memory space**, which is much more efficient. The details of this are beyond the scope of this document, but in short, it allows the model to store the keys and values in a more efficient way, reducing the memory footprint and speeding up the generation process. If you are interested, make sure to check out the [vLLM PagedAttention](https://blog.vllm.ai/2023/06/20/vllm.html) for more details.
|
||||
|
||||
## 🤔 What exactly happens when you run `trl vllm-serve --model <model_name>`?
|
||||
|
||||
When you run for example
|
||||
|
||||
```sh
|
||||
CUDA_VISIBLE_DEVICES=0,1,2,3 trl vllm-serve --model Qwen/Qwen2.5-7B --tensor-parallel-size 1 --data-parallel-size 4
|
||||
```
|
||||
|
||||
the following happens:
|
||||
|
||||

|
||||
|
||||
1. vLLM first spawns multiple workers to handle incoming requests in parallel. The number of workers is determined by multiplying the `--tensor-parallel-size` and `--data-parallel-size` values. In this example, it spawns 4 workers (1 × 4).
|
||||
Each worker operates independently and processes a chunk of the incoming requests — which are basically the prompts sent to the server for generation. A key point to understand is that these 4 workers are running in parallel, and each one is responsible for handling a subset of the total incoming load.
|
||||
|
||||
2. Once the incoming requests (prompts) are distributed across the workers, the model starts generating completions. Internally, the model’s weights are split across multiple GPUs based on the `--tensor-parallel-size` argument — this is how tensor parallelism is handled. Meanwhile, data parallelism (controlled by `--data-parallel-size`) ensures that different sets of requests are processed independently across the workers. In short: tensor parallelism splits the model across GPUs, and data parallelism splits the batch of requests across different model replicas.
|
||||
|
||||
3. Although the GPUs process requests independently and in parallel, they still need to communicate with each other. Remember that each GPU handles only a slice of the incoming prompts (for example, with 4 GPUs and 8 prompts using `--data-parallel-size=4`, each GPU processes 2 prompts).
|
||||
This GPU-to-GPU communication is managed efficiently by NVIDIA’s NCCL library. The communication mainly ensures that each GPU gets its correct portion of the incoming requests — it’s lightweight and doesn’t interfere with generation itself.
|
||||
Separately, the number of completions to generate per prompt is controlled by the `num_generations` setting in the GRPO config. For instance, if you set `num_generations=2` (like in the picture above), each prompt will have 2 completions. So, with 8 prompts and `num_generations=2`, you would end up with 16 completions total — regardless of the number of GPUs or parallelism settings.
|
||||
|
||||
## 🥸 More detail on what happens under the hood when running the server
|
||||
|
||||
* The vLLM server starts by running the command: `trl vllm-serve --model Qwen/Qwen2.5-7B`.
|
||||
* Once the server is running, it generates completions based on requests from the client (trainer) using `vllm_client.generate` [here](https://github.com/huggingface/trl/blob/cc044e35b285be7dc062764b3364e1e684db4c7c/trl/trainer/grpo_trainer.py#L1025-L1035).
|
||||
* The client (trainer) then requests these completions from the server.
|
||||
* These completions are used to compute the reward signal.
|
||||
* Based on the reward signal and the model’s output, the loss is computed, and the backward pass is performed to update the model’s weights.
|
||||
* **Note**: The server only handles completion generation — it doesn’t train the model. Therefore, the model’s weights aren’t updated on the server. Once the backward pass is complete, the client sends the updated weights to the server using `vllm_client.update_named_param(name, param.data)` (a client-side sketch follows this list).
|
||||
|
||||
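As a hedged, client-side sketch of that flow (the constructor defaults and keyword names are simplified; check the `VLLMClient` documentation for the exact signatures in your TRL version):

```python
from trl.extras.vllm_client import VLLMClient

client = VLLMClient()  # assumes a server started with `trl vllm-serve --model <model_name>`

# 1. Ask the server for completions (token ids) for a batch of prompts
completion_ids = client.generate(["The sky is"], n=2, max_tokens=16)

# 2. ... compute rewards and the loss, then run the backward pass on the trainer side ...

# 3. After the optimizer step, push the updated weights back to the server, e.g.:
# for name, param in model.named_parameters():
#     client.update_named_param(name, param.data)
```
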
When using vLLM, ensure the GPUs assigned for training and generation are separate to avoid NCCL communication conflicts. If you do not set the `CUDA_VISIBLE_DEVICES` environment variable, the training script will use all available GPUs by default, which may lead to device conflicts. Starting from the first TRL release after v0.19.1, the code automatically detects and prevents same-device usage, raising an error in the vLLM server process:
|
||||
|
||||
```
|
||||
RuntimeError: Attempting to use the same CUDA device for multiple distinct roles/ranks within the same communicator.
|
||||
Ensure that trainer is using different devices than vLLM server.
|
||||
```
|
||||
|
||||
For example, if you want to use GPUs 4–7 for training while the server runs on GPUs 0-3, set:
|
||||
|
||||
```sh
|
||||
CUDA_VISIBLE_DEVICES=4,5,6,7 accelerate launch train.py
|
||||
```
|
||||
|
||||
## 🍷 More customization options with vLLM?
|
||||
|
||||
You can customize the server configuration by passing additional arguments.
|
||||
|
||||
```
|
||||
$ trl vllm-serve --help
|
||||
usage: trl vllm-serve [-h] --model MODEL [--revision REVISION] [--tensor_parallel_size TENSOR_PARALLEL_SIZE]
|
||||
[--data_parallel_size DATA_PARALLEL_SIZE] [--host HOST] [--port PORT]
|
||||
[--gpu_memory_utilization GPU_MEMORY_UTILIZATION] [--dtype DTYPE] [--max_model_len MAX_MODEL_LEN]
|
||||
[--enable_prefix_caching ENABLE_PREFIX_CACHING] [--enforce_eager ENFORCE_EAGER] [--log_level LOG_LEVEL]
|
||||
|
||||
options:
|
||||
-h, --help Show this help message and exit
|
||||
--model MODEL Model name or path to load the model from. (default: None)
|
||||
--revision REVISION Revision to use for the model. If not specified, the default branch will be used. (default: None)
|
||||
--tensor_parallel_size TENSOR_PARALLEL_SIZE, --tensor-parallel-size TENSOR_PARALLEL_SIZE
|
||||
Number of tensor parallel workers to use. (default: 1)
|
||||
--data_parallel_size DATA_PARALLEL_SIZE, --data-parallel-size DATA_PARALLEL_SIZE
|
||||
Number of data parallel workers to use. (default: 1)
|
||||
--host HOST Host address to run the server on. (default: 0.0.0.0)
|
||||
--port PORT Port to run the server on. (default: 8000)
|
||||
--gpu_memory_utilization GPU_MEMORY_UTILIZATION, --gpu-memory-utilization GPU_MEMORY_UTILIZATION
|
||||
Ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV cache on the device
|
||||
dedicated to generation powered by vLLM. Higher values will increase the KV cache size and thus improve the
|
||||
model's throughput. However, if the value is too high, it may cause out-of-memory (OOM) errors during
|
||||
initialization. (default: 0.9)
|
||||
--dtype DTYPE Data type to use for vLLM generation. If set to 'auto', the data type will be automatically determined based on
|
||||
the model configuration. Find the supported values in the vLLM documentation. (default: auto)
|
||||
--max_model_len MAX_MODEL_LEN, --max-model-len MAX_MODEL_LEN
|
||||
If set, the `max_model_len` to use for vLLM. This can be useful when running with reduced
|
||||
`vllm_gpu_memory_utilization`, leading to a reduced KV cache size. If not set, vLLM will use the model context
|
||||
size, which might be much larger than the KV cache, leading to inefficiencies. (default: None)
|
||||
--enable_prefix_caching ENABLE_PREFIX_CACHING, --enable-prefix-caching ENABLE_PREFIX_CACHING
|
||||
Whether to enable prefix caching in vLLM. If set to `True`, ensure that the model and the hardware support this
|
||||
feature. (default: None)
|
||||
--enforce_eager ENFORCE_EAGER, --enforce-eager ENFORCE_EAGER
|
||||
Whether to enforce eager execution. If set to `True`, we will disable CUDA graph and always execute the model
|
||||
in eager mode. If `False` (default behavior), we will use CUDA graph and eager execution in hybrid. (default:
|
||||
None)
|
||||
--log_level LOG_LEVEL, --log-level LOG_LEVEL
|
||||
Log level for uvicorn. Possible choices: 'critical', 'error', 'warning', 'info', 'debug', 'trace'. (default:
|
||||
info)
|
||||
```
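For example, a server with a tighter memory budget and a shorter context window (all values are illustrative) could be started with:

```sh
trl vllm-serve --model Qwen/Qwen2.5-7B --gpu-memory-utilization 0.85 --max-model-len 4096 --dtype bfloat16
```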
|
||||
|
||||
## 🥳 Okay, now that we have the server running, how can we use it to generate completions?
|
||||
|
||||
Run the training script and pass `use_vllm=True` in the training arguments:
|
||||
|
||||
```python
|
||||
from trl import GRPOConfig
|
||||
|
||||
training_args = GRPOConfig(..., use_vllm=True)
|
||||
```
|
||||
|
||||
## 💆🏻♀️ What's the best distributed setup?
|
||||
|
||||

|
||||

|
||||
|
||||
First and foremost, always remember that the optimal setup depends on:
|
||||
|
||||
* The model size
|
||||
* The number of GPUs you have
|
||||
* The GPU memory size
|
||||
* The batch size you are using
|
||||
* The number of requests you are sending to the server (prompts)
|
||||
* The `max_model_len` you are using (this is the max length of the input sequence that the model can process, a.k.a. the context window size)
|
||||
* The number of completions you are generating for each request (`num_generations`)
|
||||
|
||||
Given these factors, our experiments on the Qwen model family (3B, 7B, 14B, 32B) using 8 H100 GPUs show that:
|
||||
|
||||
* For reasonably sized models (3B–14B) and a moderate context window (`max_len < 8k`), using full capacity for data parallelism gives better throughput. The setup `(tp=1, dp=8)` yields the best results.
|
||||
* For larger models (32B) and longer context windows (`max_len > 8k`), a smaller DP size combined with some model-side parallelism performs better. For example, `(tp=2, dp=4)` is a good setup for 32B models with a larger context window.
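For instance, the second recommendation translates into a server command along these lines (the checkpoint name is only an example):

```sh
trl vllm-serve --model Qwen/Qwen2.5-32B --tensor-parallel-size 2 --data-parallel-size 4
```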
|
||||
|
||||
## vLLM with Transformers Backend
|
||||
|
||||
vLLM now supports the Transformers backend for model implementations. Setting `vllm_model_impl` to `transformers`, either in the configuration or through the argument parser, makes vLLM use the Transformers backend. This works for both LLMs and VLMs. See the example below; you can find more information [here](https://blog.vllm.ai/2025/04/11/transformers-backend.html).
|
||||
|
||||
```
|
||||
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen2.5-VL-3B-Instruct --tensor-parallel-size 1 --port 8000 --enforce_eager --vllm_model_impl transformers
|
||||
```
|
@ -4,7 +4,7 @@
|
||||
|
||||
## Overview
|
||||
|
||||
Exploratory Preference Optimization (XPO) was proposed in the paper [Exploratory Preference Optimization: Harnessing Implicit Q*-Approximation for Sample-Efficient RLHF](https://huggingface.co/papers/2405.21046) by Tengyang Xie, Dylan J. Foster, Akshay Krishnamurthy, [Corby Rosset](https://huggingface.co/corbyrosset), [Ahmed Awadallah](https://huggingface.co/AhmedAwadallah), and Alexander Rakhlin. It is a simple online preference tuning method based on the DPO loss together with a reward model (RM). XPO augments the DPO objective with an exploration bonus allowing the method to explore outside the support of the intitial model and human feedback data.
|
||||
Exploratory Preference Optimization (XPO) was proposed in the paper [Exploratory Preference Optimization: Harnessing Implicit Q*-Approximation for Sample-Efficient RLHF](https://huggingface.co/papers/2405.21046) by Tengyang Xie, Dylan J. Foster, Akshay Krishnamurthy, [Corby Rosset](https://huggingface.co/corbyrosset), [Ahmed Awadallah](https://huggingface.co/AhmedAwadallah), and Alexander Rakhlin. It is a simple online preference tuning method based on the DPO loss together with a reward model (RM). XPO augments the DPO objective with an exploration bonus allowing the method to explore outside the support of the initial model and human feedback data.
|
||||
|
||||
The abstract from the paper is the following:
|
||||
|
||||
@ -35,7 +35,7 @@ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
|
||||
judge = PairRMJudge()
|
||||
train_dataset = load_dataset("trl-lib/ultrafeedback-prompt", split="train")
|
||||
|
||||
training_args = XPOConfig(output_dir="Qwen2-0.5B-XPO", logging_steps=10)
|
||||
training_args = XPOConfig(output_dir="Qwen2-0.5B-XPO")
|
||||
trainer = XPOTrainer(
|
||||
model=model, judge=judge, args=training_args, processing_class=tokenizer, train_dataset=train_dataset
|
||||
)
|
||||
@ -50,9 +50,9 @@ accelerate launch train_xpo.py
|
||||
|
||||
Distributed across 8 GPUs, the training takes approximately 1 hour.
|
||||
|
||||
To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-XPO) performs, you can use the [TRL Chat CLI](clis#chat-interface).
|
||||
To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-XPO) performs, you can use the [Transformers Chat CLI](https://huggingface.co/docs/transformers/quicktour#chat-with-text-generation-models).
|
||||
|
||||
<pre><code>$ trl chat --model_name_or_path trl-lib/Qwen2-0.5B-XPO
|
||||
<pre><code>$ transformers chat trl-lib/Qwen2-0.5B-XPO
|
||||
<strong><span style="color: red;"><quentin_gallouedec>:</span></strong>
|
||||
What is the best programming language?
|
||||
|
||||
@ -124,7 +124,6 @@ python examples/scripts/xpo.py \
|
||||
--judge pair_rm \
|
||||
--dataset_name trl-lib/ultrafeedback-prompt \
|
||||
--learning_rate 5.0e-7 \
|
||||
--logging_steps 25 \
|
||||
--output_dir Qwen2.5-0.5B-XPO-PairRM \
|
||||
--warmup_ratio 0.1 \
|
||||
--push_to_hub
|
||||
@ -132,7 +131,7 @@ python examples/scripts/xpo.py \
|
||||
|
||||
## Logged metrics
|
||||
|
||||
The logged metrics are as follows:
|
||||
While training and evaluating we record the following reward metrics:
|
||||
|
||||
* `loss/xpo`: The mean xpo part of the full loss.
|
||||
* `loss/dpo`: The mean dpo part of the full loss.
|
||||
@ -156,6 +155,9 @@ The logged metrics are as follows:
|
||||
## XPOTrainer
|
||||
|
||||
[[autodoc]] XPOTrainer
|
||||
- train
|
||||
- save_model
|
||||
- push_to_hub
|
||||
|
||||
## XPOConfig
|
||||
|
||||
|
28
examples/accelerate_configs/fsdp1.yaml
Normal file
28
examples/accelerate_configs/fsdp1.yaml
Normal file
@ -0,0 +1,28 @@
|
||||
compute_environment: LOCAL_MACHINE
|
||||
debug: false
|
||||
distributed_type: FSDP
|
||||
downcast_bf16: 'no'
|
||||
enable_cpu_affinity: false
|
||||
fsdp_config:
|
||||
fsdp_activation_checkpointing: false
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_backward_prefetch: BACKWARD_PRE
|
||||
fsdp_cpu_ram_efficient_loading: true
|
||||
fsdp_forward_prefetch: true
|
||||
fsdp_offload_params: false
|
||||
fsdp_reshard_after_forward: FULL_SHARD
|
||||
fsdp_state_dict_type: FULL_STATE_DICT
|
||||
fsdp_sync_module_states: true
|
||||
fsdp_use_orig_params: true
|
||||
fsdp_version: 1
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: bf16
|
||||
num_machines: 1
|
||||
num_processes: 8
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
25
examples/accelerate_configs/fsdp2.yaml
Normal file
25
examples/accelerate_configs/fsdp2.yaml
Normal file
@ -0,0 +1,25 @@
|
||||
# Requires accelerate 1.7.0 or higher
|
||||
compute_environment: LOCAL_MACHINE
|
||||
debug: false
|
||||
distributed_type: FSDP
|
||||
downcast_bf16: 'no'
|
||||
enable_cpu_affinity: false
|
||||
fsdp_config:
|
||||
fsdp_activation_checkpointing: false
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_cpu_ram_efficient_loading: true
|
||||
fsdp_offload_params: false
|
||||
fsdp_reshard_after_forward: true
|
||||
fsdp_state_dict_type: FULL_STATE_DICT
|
||||
fsdp_version: 2
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: bf16
|
||||
num_machines: 1
|
||||
num_processes: 8
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
@ -1,25 +0,0 @@
|
||||
compute_environment: LOCAL_MACHINE
|
||||
debug: false
|
||||
distributed_type: FSDP
|
||||
downcast_bf16: 'no'
|
||||
fsdp_config:
|
||||
fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
|
||||
fsdp_backward_prefetch: BACKWARD_PRE
|
||||
fsdp_cpu_ram_efficient_loading: true
|
||||
fsdp_forward_prefetch: false
|
||||
fsdp_offload_params: true
|
||||
fsdp_sharding_strategy: FULL_SHARD
|
||||
fsdp_state_dict_type: SHARDED_STATE_DICT
|
||||
fsdp_sync_module_states: true
|
||||
fsdp_use_orig_params: false
|
||||
machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: 'bf16'
|
||||
num_machines: 1
|
||||
num_processes: 8
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
tpu_use_cluster: false
|
||||
tpu_use_sudo: false
|
||||
use_cpu: false
|
@ -7,7 +7,7 @@ machine_rank: 0
|
||||
main_training_function: main
|
||||
mixed_precision: 'bf16'
|
||||
num_machines: 1
|
||||
num_processes: 8
|
||||
num_processes: 1
|
||||
rdzv_backend: static
|
||||
same_network: true
|
||||
tpu_env: []
|
||||
|
@ -1,4 +1,4 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
119
examples/datasets/llava_instruct_mix.py
Normal file
119
examples/datasets/llava_instruct_mix.py
Normal file
@ -0,0 +1,119 @@
|
||||
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import ast
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
from datasets import load_dataset
|
||||
from huggingface_hub import ModelCard
|
||||
from transformers import HfArgumentParser
|
||||
|
||||
|
||||
@dataclass
|
||||
class ScriptArguments:
|
||||
r"""
|
||||
Arguments for the script.
|
||||
|
||||
Args:
|
||||
push_to_hub (`bool`, *optional*, defaults to `False`):
|
||||
Whether to push the dataset to the Hugging Face Hub.
|
||||
repo_id (`str`, *optional*, defaults to `"trl-lib/llava-instruct-mix"`):
|
||||
Hugging Face repository ID to push the dataset to.
|
||||
dataset_num_proc (`int` or `None`, *optional*, defaults to `None`):
|
||||
Number of workers to use for dataset processing.
|
||||
"""
|
||||
|
||||
push_to_hub: bool = field(
|
||||
default=False,
|
||||
metadata={"help": "Whether to push the dataset to the Hugging Face Hub."},
|
||||
)
|
||||
repo_id: str = field(
|
||||
default="trl-lib/llava-instruct-mix",
|
||||
metadata={"help": "Hugging Face repository ID to push the dataset to."},
|
||||
)
|
||||
dataset_num_proc: Optional[int] = field(
|
||||
default=None,
|
||||
metadata={"help": "Number of workers to use for dataset processing."},
|
||||
)
|
||||
|
||||
|
||||
def process_example(example):
|
||||
messages = []
|
||||
for message in ast.literal_eval(example["conversations"]):
|
||||
content = message["value"]
|
||||
content = content.replace("<image>", "").strip()
|
||||
role = "user" if message["from"] == "human" else "assistant"
|
||||
messages.append({"role": role, "content": content})
|
||||
return {"messages": messages, "images": [example["image"]]}
|
||||
|
||||
|
||||
def filter_long_examples(example):
|
||||
total_length = sum(len(msg["content"]) for msg in example["messages"])
|
||||
return total_length <= 1000
|
||||
|
||||
|
||||
def split_prompt_completion(example):
|
||||
"""
|
||||
Splits the messages into a prompt and a completion. The last message is considered the completion.
|
||||
"""
|
||||
assert len(example["messages"]) > 1
|
||||
example["prompt"] = example["messages"][:-1]
|
||||
example["completion"] = example["messages"][-1:]
|
||||
return example
|
||||
|
||||
|
||||
model_card = ModelCard("""
|
||||
---
|
||||
tags: [trl]
|
||||
---
|
||||
|
||||
# LLaVA Instruct Mix
|
||||
|
||||
## Summary
|
||||
|
||||
The LLaVA Instruct Mix dataset is a processed version of [LLaVA Instruct Mix](https://huggingface.co/datasets/theblackcat102/llava-instruct-mix).
|
||||
|
||||
## Data Structure
|
||||
|
||||
- **Format**: [Conversational](https://huggingface.co/docs/trl/main/dataset_formats#conversational)
|
||||
- **Type**: [Language-modeling](https://huggingface.co/docs/trl/main/dataset_formats#language-modeling)
|
||||
|
||||
Columns:
|
||||
- `"images"`: The image associated with the text.
|
||||
- `"prompt"`: A list of messages that form the context for the conversation.
|
||||
- `"completion"`: The last message in the conversation, which is the model's response.
|
||||
|
||||
This structure allows models to learn from the context of the conversation, enhancing their understanding of how to generate descriptive text based on visual inputs.
|
||||
|
||||
## Generation script
|
||||
|
||||
The script used to generate this dataset can be found [here](https://github.com/huggingface/trl/blob/main/examples/datasets/llava_instruct_mix.py).
|
||||
""")
|
||||
|
||||
if __name__ == "__main__":
|
||||
parser = HfArgumentParser(ScriptArguments)
|
||||
script_args = parser.parse_args_into_dataclasses()[0]
|
||||
|
||||
dataset = load_dataset("theblackcat102/llava-instruct-mix", split="train", num_proc=script_args.dataset_num_proc)
|
||||
|
||||
dataset = dataset.map(
|
||||
process_example, remove_columns=["conversations", "image"], num_proc=script_args.dataset_num_proc
|
||||
)
|
||||
dataset = dataset.filter(filter_long_examples, num_proc=script_args.dataset_num_proc)
|
||||
dataset = dataset.map(split_prompt_completion, remove_columns=["messages"], num_proc=script_args.dataset_num_proc)
|
||||
|
||||
if script_args.push_to_hub:
|
||||
dataset.push_to_hub(script_args.repo_id, num_proc=script_args.dataset_num_proc)
|
||||
model_card.push_to_hub(script_args.repo_id, repo_type="dataset")
|
@ -1,4 +1,4 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
@ -1,4 +1,4 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
@ -1,4 +1,4 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
@ -1,4 +1,4 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
@ -1,4 +1,4 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
@ -1,4 +1,4 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
@ -1,4 +1,4 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
|
@ -1,4 +1,4 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@ -81,7 +81,7 @@ Column:
|
||||
|
||||
## Generation script
|
||||
|
||||
The script used to generate this dataset can be found [here](https://github.com/huggingface/trl/blob/main/examples/datasets/ultafeedback-prompt.py).
|
||||
The script used to generate this dataset can be found [here](https://github.com/huggingface/trl/blob/main/examples/datasets/ultrafeedback-prompt.py).
|
||||
""")
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -1,4 +1,4 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@ -118,7 +118,7 @@ Column:
|
||||
|
||||
## Generation script
|
||||
|
||||
The script used to generate this dataset can be found [here](https://github.com/huggingface/trl/blob/main/examples/datasets/ultafeedback.py).
|
||||
The script used to generate this dataset can be found [here](https://github.com/huggingface/trl/blob/main/examples/datasets/ultrafeedback.py).
|
||||
""")
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
@ -4,4 +4,4 @@ This directory contains a collection of Jupyter notebooks that demonstrate how t
|
||||
|
||||
- [`best_of_n.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/best_of_n.ipynb): This notebook demonstrates how to use the "Best of N" sampling strategy using TRL when fine-tuning your model with PPO.
|
||||
- [`gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb): This notebook demonstrates how to reproduce the GPT2 imdb sentiment tuning example on a jupyter notebook.
|
||||
- [`gpt2-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment-control.ipynb): This notebook demonstrates how to reproduce the GPT2 sentiment control example on a jupyter notebook.
|
||||
- [`gpt2-sentiment-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment-control.ipynb): This notebook demonstrates how to reproduce the GPT2 sentiment control example on a jupyter notebook.
|
||||
|
@ -30,7 +30,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "M1s_iNm773hM"
|
||||
},
|
||||
@ -45,7 +45,8 @@
|
||||
"from trl import AutoModelForCausalLMWithValueHead\n",
|
||||
"from trl.core import LengthSampler\n",
|
||||
"\n",
|
||||
"device = \"cuda\" if torch.cuda.is_available() else \"cpu\""
|
||||
"device = torch.accelerator.current_accelerator().type if hasattr(torch, \"accelerator\") else \"cuda\"\n",
|
||||
"device = \"cpu\" if device is None else device"
|
||||
]
|
||||
},
|
||||
{
|
||||
@ -59,7 +60,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"execution_count": 2,
|
||||
"metadata": {
|
||||
"id": "MqS3OM6Q8x6g"
|
||||
},
|
||||
@ -83,63 +84,11 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "b855NrL181Hh"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/Users/kashif/Github/transformers/src/transformers/tokenization_utils_base.py:1617: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. For more details check this issue: https://github.com/huggingface/transformers/issues/31884\n",
|
||||
" warnings.warn(\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"AutoModelForCausalLMWithValueHead(\n",
|
||||
" (pretrained_model): GPT2LMHeadModel(\n",
|
||||
" (transformer): GPT2Model(\n",
|
||||
" (wte): Embedding(50257, 768)\n",
|
||||
" (wpe): Embedding(1024, 768)\n",
|
||||
" (drop): Dropout(p=0.1, inplace=False)\n",
|
||||
" (h): ModuleList(\n",
|
||||
" (0-11): 12 x GPT2Block(\n",
|
||||
" (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
|
||||
" (attn): GPT2SdpaAttention(\n",
|
||||
" (c_attn): Conv1D(nf=2304, nx=768)\n",
|
||||
" (c_proj): Conv1D(nf=768, nx=768)\n",
|
||||
" (attn_dropout): Dropout(p=0.1, inplace=False)\n",
|
||||
" (resid_dropout): Dropout(p=0.1, inplace=False)\n",
|
||||
" )\n",
|
||||
" (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
|
||||
" (mlp): GPT2MLP(\n",
|
||||
" (c_fc): Conv1D(nf=3072, nx=768)\n",
|
||||
" (c_proj): Conv1D(nf=768, nx=3072)\n",
|
||||
" (act): NewGELUActivation()\n",
|
||||
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
" )\n",
|
||||
" (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
|
||||
" )\n",
|
||||
" (lm_head): Linear(in_features=768, out_features=50257, bias=False)\n",
|
||||
" )\n",
|
||||
" (v_head): ValueHead(\n",
|
||||
" (dropout): Dropout(p=0.1, inplace=False)\n",
|
||||
" (summary): Linear(in_features=768, out_features=1, bias=True)\n",
|
||||
" (flatten): Flatten(start_dim=1, end_dim=-1)\n",
|
||||
" )\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"model = AutoModelForCausalLMWithValueHead.from_pretrained(model_name)\n",
|
||||
"\n",
|
||||
@ -151,7 +100,7 @@
|
||||
"\n",
|
||||
"tokenizer.pad_token = tokenizer.eos_token\n",
|
||||
"\n",
|
||||
"# cuda-ize models\n",
|
||||
"# put models to accelerator\n",
|
||||
"model.to(device)\n",
|
||||
"ref_model.to(device)"
|
||||
]
|
||||
@ -167,11 +116,24 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"execution_count": 4,
|
||||
"metadata": {
|
||||
"id": "LqLVEp5p_8XM"
|
||||
},
|
||||
"outputs": [],
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Generating train split: 100%|██████████| 25000/25000 [00:00<00:00, 113700.67 examples/s]\n",
|
||||
"Generating test split: 100%|██████████| 25000/25000 [00:00<00:00, 131049.39 examples/s]\n",
|
||||
"Generating unsupervised split: 100%|██████████| 50000/50000 [00:00<00:00, 126486.39 examples/s]\n",
|
||||
"Filter: 100%|██████████| 25000/25000 [00:00<00:00, 238843.61 examples/s]\n",
|
||||
"Map: 0%| | 0/24895 [00:00<?, ? examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (1168 > 1024). Running this sequence through the model will result in indexing errors\n",
|
||||
"Map: 100%|██████████| 24895/24895 [00:17<00:00, 1462.36 examples/s]\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"def build_dataset(\n",
|
||||
" tokenizer,\n",
|
||||
@ -201,7 +163,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"execution_count": 5,
|
||||
"metadata": {
|
||||
"id": "AqA2McjMAxNw"
|
||||
},
|
||||
@ -219,7 +181,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"execution_count": 6,
|
||||
"metadata": {
|
||||
"id": "L_q4qs35AxcR"
|
||||
},
|
||||
@ -255,19 +217,11 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"execution_count": null,
|
||||
"metadata": {
|
||||
"id": "-imZ7uEFBNbw"
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"for i in range(bs):\n",
|
||||
" gen_len = output_length_sampler()\n",
|
||||
@ -303,7 +257,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"execution_count": 8,
|
||||
"metadata": {
|
||||
"id": "PyDbbAQ0F_h7"
|
||||
},
|
||||
@ -325,7 +279,7 @@
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"execution_count": 9,
|
||||
"metadata": {
|
||||
"colab": {
|
||||
"base_uri": "https://localhost:8080/",
|
||||
@ -368,243 +322,243 @@
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>This movie</td>\n",
|
||||
" <td>This movie should have read some books, and</td>\n",
|
||||
" <td>1.411889</td>\n",
|
||||
" <td>This movie has plenty of extraordinary feature...</td>\n",
|
||||
" <td>2.735337</td>\n",
|
||||
" <td>This movie was unexpectedly funny and funny, you</td>\n",
|
||||
" <td>2.405301</td>\n",
|
||||
" <td>This movie is one of</td>\n",
|
||||
" <td>This movie is one of the most twisted films I</td>\n",
|
||||
" <td>2.094254</td>\n",
|
||||
" <td>This movie is one of the finest directors of the</td>\n",
|
||||
" <td>2.726879</td>\n",
|
||||
" <td>This movie is one of the best looking movies I</td>\n",
|
||||
" <td>2.705925</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>OK where do i begin?</td>\n",
|
||||
" <td>OK where do i begin? *** Acting is decent (not...</td>\n",
|
||||
" <td>1.555380</td>\n",
|
||||
" <td>OK where do i begin? For all of you who are no...</td>\n",
|
||||
" <td>0.019694</td>\n",
|
||||
" <td>OK where do i begin? i just wanted to add some...</td>\n",
|
||||
" <td>0.622912</td>\n",
|
||||
" <td>one may</td>\n",
|
||||
" <td>one may feel we are seeing more</td>\n",
|
||||
" <td>1.478813</td>\n",
|
||||
" <td>one may not have great assets,</td>\n",
|
||||
" <td>0.420451</td>\n",
|
||||
" <td>one may not be supported, terrible</td>\n",
|
||||
" <td>2.043730</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>I watched</td>\n",
|
||||
" <td>I watched one can compare themselves upon view...</td>\n",
|
||||
" <td>1.380120</td>\n",
|
||||
" <td>I watched it because of its excellent cast. Th...</td>\n",
|
||||
" <td>2.498309</td>\n",
|
||||
" <td>I watched the trial trial for teaches us a goo...</td>\n",
|
||||
" <td>2.057187</td>\n",
|
||||
" <td>This is an amazing film,</td>\n",
|
||||
" <td>This is an amazing film, one of our favorite g...</td>\n",
|
||||
" <td>2.871389</td>\n",
|
||||
" <td>This is an amazing film, with all thelike wond...</td>\n",
|
||||
" <td>2.918770</td>\n",
|
||||
" <td>This is an amazing film, very moving and this ...</td>\n",
|
||||
" <td>2.871694</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>It's been 19 years since Gordon</td>\n",
|
||||
" <td>It's been 19 years since Gordon finally left c...</td>\n",
|
||||
" <td>1.554914</td>\n",
|
||||
" <td>It's been 19 years since Gordon Tree has becom...</td>\n",
|
||||
" <td>1.632266</td>\n",
|
||||
" <td>It's been 19 years since Gordon Clarke put me ...</td>\n",
|
||||
" <td>2.783458</td>\n",
|
||||
" <td>just below</td>\n",
|
||||
" <td>just below)and makes it seem as</td>\n",
|
||||
" <td>0.861618</td>\n",
|
||||
" <td>just below the world capital is a man</td>\n",
|
||||
" <td>0.238322</td>\n",
|
||||
" <td>just below) in this beautiful comedy.</td>\n",
|
||||
" <td>2.760033</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>Just kidding</td>\n",
|
||||
" <td>Just kidding; I know a lot</td>\n",
|
||||
" <td>-0.069533</td>\n",
|
||||
" <td>Just kidding \"Third World Snopes</td>\n",
|
||||
" <td>0.944632</td>\n",
|
||||
" <td>Just kidding, I didn't even</td>\n",
|
||||
" <td>1.945202</td>\n",
|
||||
" <td>Return To the</td>\n",
|
||||
" <td>Return To the Museum. That film, called Bl</td>\n",
|
||||
" <td>0.017376</td>\n",
|
||||
" <td>Return To the East\" is a fascinating film,</td>\n",
|
||||
" <td>2.648028</td>\n",
|
||||
" <td>Return To the International: Miyazaki, by Ts</td>\n",
|
||||
" <td>1.072344</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>5</th>\n",
|
||||
" <td>shakespeare's plays have a way</td>\n",
|
||||
" <td>shakespeare's plays have a way of weaving into...</td>\n",
|
||||
" <td>1.656927</td>\n",
|
||||
" <td>shakespeare's plays have a way. It's the look ...</td>\n",
|
||||
" <td>1.444803</td>\n",
|
||||
" <td>shakespeare's plays have a way of getting back...</td>\n",
|
||||
" <td>1.834373</td>\n",
|
||||
" <td>Brando plays the ace jet</td>\n",
|
||||
" <td>Brando plays the ace jet fighter pilot, who stops</td>\n",
|
||||
" <td>0.565335</td>\n",
|
||||
" <td>Brando plays the ace jet pilot, who's a</td>\n",
|
||||
" <td>0.668954</td>\n",
|
||||
" <td>Brando plays the ace jet pilot Charlie; his fo...</td>\n",
|
||||
" <td>0.679582</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6</th>\n",
|
||||
" <td>This movie is wonderful. What</td>\n",
|
||||
" <td>This movie is wonderful. What could have been ...</td>\n",
|
||||
" <td>2.749068</td>\n",
|
||||
" <td>This movie is wonderful. What someone likes ab...</td>\n",
|
||||
" <td>2.759510</td>\n",
|
||||
" <td>This movie is wonderful. What a different look,</td>\n",
|
||||
" <td>2.695312</td>\n",
|
||||
" <td>And a rather U</td>\n",
|
||||
" <td>And a rather Utopian horror movie and with good</td>\n",
|
||||
" <td>2.245751</td>\n",
|
||||
" <td>And a rather Utop Congressional Movie, with a 45</td>\n",
|
||||
" <td>0.307100</td>\n",
|
||||
" <td>And a rather U of A complete combination of wh...</td>\n",
|
||||
" <td>2.209265</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>7</th>\n",
|
||||
" <td>I loved</td>\n",
|
||||
" <td>I loved this film. <br /><</td>\n",
|
||||
" <td>2.576181</td>\n",
|
||||
" <td>I loved it, and I really loved Audrey</td>\n",
|
||||
" <td>2.578412</td>\n",
|
||||
" <td>I loved this film. Reading reviews of it</td>\n",
|
||||
" <td>2.751773</td>\n",
|
||||
" <td>The plot of this movie hangs</td>\n",
|
||||
" <td>The plot of this movie hangs in the balance as...</td>\n",
|
||||
" <td>1.122540</td>\n",
|
||||
" <td>The plot of this movie hangs out well. The who...</td>\n",
|
||||
" <td>2.195263</td>\n",
|
||||
" <td>The plot of this movie hangs together within t...</td>\n",
|
||||
" <td>1.310783</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8</th>\n",
|
||||
" <td>A superb and</td>\n",
|
||||
" <td>A superb and very cool drama. The novel is</td>\n",
|
||||
" <td>2.910374</td>\n",
|
||||
" <td>A superb and super fun movie that removes all the</td>\n",
|
||||
" <td>2.783201</td>\n",
|
||||
" <td>A superb and most finely acted role that I will</td>\n",
|
||||
" <td>2.894923</td>\n",
|
||||
" <td>This isn't</td>\n",
|
||||
" <td>This isn't all that bad; as for my</td>\n",
|
||||
" <td>0.623968</td>\n",
|
||||
" <td>This isn't a good film because I loved it</td>\n",
|
||||
" <td>1.694601</td>\n",
|
||||
" <td>This isn't bad writing, powerful actors and sp...</td>\n",
|
||||
" <td>1.835901</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9</th>\n",
|
||||
" <td>I remember</td>\n",
|
||||
" <td>I remember.Very poor execution but good movies</td>\n",
|
||||
" <td>0.923775</td>\n",
|
||||
" <td>I remember when Shelter saw some girls on TV</td>\n",
|
||||
" <td>0.825408</td>\n",
|
||||
" <td>I remember thinking to myself how SOMEONE who</td>\n",
|
||||
" <td>1.634163</td>\n",
|
||||
" <td>This movie was for a</td>\n",
|
||||
" <td>This movie was for a good reason!' Uh, OK</td>\n",
|
||||
" <td>0.437566</td>\n",
|
||||
" <td>This movie was for a fun, and grand Robinson</td>\n",
|
||||
" <td>2.531890</td>\n",
|
||||
" <td>This movie was for a bastard.<br /><br</td>\n",
|
||||
" <td>2.311337</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>10</th>\n",
|
||||
" <td>This su*k</td>\n",
|
||||
" <td>This su*k camel down your kidd</td>\n",
|
||||
" <td>1.605957</td>\n",
|
||||
" <td>This su*k Dress! I loved it</td>\n",
|
||||
" <td>2.345865</td>\n",
|
||||
" <td>This su*k like a roll of crap</td>\n",
|
||||
" <td>2.422874</td>\n",
|
||||
" <td>witty. funny.</td>\n",
|
||||
" <td>witty. funny.<|endoftext|></td>\n",
|
||||
" <td>1.636344</td>\n",
|
||||
" <td>witty. funny. funnier. more funny. funnier. fu...</td>\n",
|
||||
" <td>2.132353</td>\n",
|
||||
" <td>witty. funny. In the first scene the comical n...</td>\n",
|
||||
" <td>2.164077</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>11</th>\n",
|
||||
" <td>One Stink</td>\n",
|
||||
" <td>One Stink Act...<br /><br</td>\n",
|
||||
" <td>1.456476</td>\n",
|
||||
" <td>One Stinkl was a great actor, particularly</td>\n",
|
||||
" <td>1.782818</td>\n",
|
||||
" <td>One Stink?: Invisible of Saint Barbara, poor</td>\n",
|
||||
" <td>1.667756</td>\n",
|
||||
" <td>It's very hard</td>\n",
|
||||
" <td>It's very hard to believe that anyone would en...</td>\n",
|
||||
" <td>1.003727</td>\n",
|
||||
" <td>It's very hard to wrap your mind around what h...</td>\n",
|
||||
" <td>0.778888</td>\n",
|
||||
" <td>It's very hard to wrap this up, due to lack of...</td>\n",
|
||||
" <td>1.598843</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>12</th>\n",
|
||||
" <td>I pulled down a VHS</td>\n",
|
||||
" <td>I pulled down a VHS copy and watched it with m...</td>\n",
|
||||
" <td>0.756151</td>\n",
|
||||
" <td>I pulled down a VHS looking a good looking, and a</td>\n",
|
||||
" <td>-0.008258</td>\n",
|
||||
" <td>I pulled down a VHS copy the other day and all I</td>\n",
|
||||
" <td>0.992919</td>\n",
|
||||
" <td>Absolutely fantastic trash....this one</td>\n",
|
||||
" <td>Absolutely fantastic trash....this one was hav...</td>\n",
|
||||
" <td>1.350834</td>\n",
|
||||
" <td>Absolutely fantastic trash....this one is a pe...</td>\n",
|
||||
" <td>2.177587</td>\n",
|
||||
" <td>Absolutely fantastic trash....this one ruins i...</td>\n",
|
||||
" <td>2.221997</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>13</th>\n",
|
||||
" <td>For some</td>\n",
|
||||
" <td>For some alone no more Buddy Trumbull would ha...</td>\n",
|
||||
" <td>0.790762</td>\n",
|
||||
" <td>For some enthraled time, the film will impress...</td>\n",
|
||||
" <td>2.455694</td>\n",
|
||||
" <td>For some reason, a bomb crashed on the rear of...</td>\n",
|
||||
" <td>0.857423</td>\n",
|
||||
" <td>Prior to</td>\n",
|
||||
" <td>Prior to this action film,</td>\n",
|
||||
" <td>0.242474</td>\n",
|
||||
" <td>Prior to Christian Kane's star</td>\n",
|
||||
" <td>0.297408</td>\n",
|
||||
" <td>Prior to his restoration, Passion</td>\n",
|
||||
" <td>1.655534</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>14</th>\n",
|
||||
" <td>This one features all</td>\n",
|
||||
" <td>This one features all the good elements of spi...</td>\n",
|
||||
" <td>1.452079</td>\n",
|
||||
" <td>This one features all kinds of wit and humor r...</td>\n",
|
||||
" <td>2.743043</td>\n",
|
||||
" <td>This one features all the best Birdprogram sup...</td>\n",
|
||||
" <td>2.343950</td>\n",
|
||||
" <td>i,</td>\n",
|
||||
" <td>i, Marty Rathbun, Damon Wayans, Mark Watney and</td>\n",
|
||||
" <td>0.105734</td>\n",
|
||||
" <td>i, perhaps the great movie the director should...</td>\n",
|
||||
" <td>1.336116</td>\n",
|
||||
" <td>i, Martin was a thrill of 70s---wow!lee and Heath</td>\n",
|
||||
" <td>2.277638</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>15</th>\n",
|
||||
" <td>Somehow a woman working with</td>\n",
|
||||
" <td>Somehow a woman working with Jim Wynorski prof...</td>\n",
|
||||
" <td>0.242172</td>\n",
|
||||
" <td>Somehow a woman working with her daughter play...</td>\n",
|
||||
" <td>0.092226</td>\n",
|
||||
" <td>Somehow a woman working with an overweight ins...</td>\n",
|
||||
" <td>1.415525</td>\n",
|
||||
" <td>The film</td>\n",
|
||||
" <td>The film takes a very grim craggy look</td>\n",
|
||||
" <td>0.069017</td>\n",
|
||||
" <td>The film is one of the best of that era</td>\n",
|
||||
" <td>2.737825</td>\n",
|
||||
" <td>The film's ambition was almost so great that its</td>\n",
|
||||
" <td>2.357480</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" query \\\n",
|
||||
"0 This movie \n",
|
||||
"1 OK where do i begin? \n",
|
||||
"2 I watched \n",
|
||||
"3 It's been 19 years since Gordon \n",
|
||||
"4 Just kidding \n",
|
||||
"5 shakespeare's plays have a way \n",
|
||||
"6 This movie is wonderful. What \n",
|
||||
"7 I loved \n",
|
||||
"8 A superb and \n",
|
||||
"9 I remember \n",
|
||||
"10 This su*k \n",
|
||||
"11 One Stink \n",
|
||||
"12 I pulled down a VHS \n",
|
||||
"13 For some \n",
|
||||
"14 This one features all \n",
|
||||
"15 Somehow a woman working with \n",
|
||||
" query \\\n",
|
||||
"0 This movie is one of \n",
|
||||
"1 one may \n",
|
||||
"2 This is an amazing film, \n",
|
||||
"3 just below \n",
|
||||
"4 Return To the \n",
|
||||
"5 Brando plays the ace jet \n",
|
||||
"6 And a rather U \n",
|
||||
"7 The plot of this movie hangs \n",
|
||||
"8 This isn't \n",
|
||||
"9 This movie was for a \n",
|
||||
"10 witty. funny. \n",
|
||||
"11 It's very hard \n",
|
||||
"12 Absolutely fantastic trash....this one \n",
|
||||
"13 Prior to \n",
|
||||
"14 i, \n",
|
||||
"15 The film \n",
|
||||
"\n",
|
||||
" response (ref) scores (ref) \\\n",
|
||||
"0 This movie should have read some books, and 1.411889 \n",
|
||||
"1 OK where do i begin? *** Acting is decent (not... 1.555380 \n",
|
||||
"2 I watched one can compare themselves upon view... 1.380120 \n",
|
||||
"3 It's been 19 years since Gordon finally left c... 1.554914 \n",
|
||||
"4 Just kidding; I know a lot -0.069533 \n",
|
||||
"5 shakespeare's plays have a way of weaving into... 1.656927 \n",
|
||||
"6 This movie is wonderful. What could have been ... 2.749068 \n",
|
||||
"7 I loved this film. <br />< 2.576181 \n",
|
||||
"8 A superb and very cool drama. The novel is 2.910374 \n",
|
||||
"9 I remember.Very poor execution but good movies 0.923775 \n",
|
||||
"10 This su*k camel down your kidd 1.605957 \n",
|
||||
"11 One Stink Act...<br /><br 1.456476 \n",
|
||||
"12 I pulled down a VHS copy and watched it with m... 0.756151 \n",
|
||||
"13 For some alone no more Buddy Trumbull would ha... 0.790762 \n",
|
||||
"14 This one features all the good elements of spi... 1.452079 \n",
|
||||
"15 Somehow a woman working with Jim Wynorski prof... 0.242172 \n",
|
||||
"0 This movie is one of the most twisted films I 2.094254 \n",
|
||||
"1 one may feel we are seeing more 1.478813 \n",
|
||||
"2 This is an amazing film, one of our favorite g... 2.871389 \n",
|
||||
"3 just below)and makes it seem as 0.861618 \n",
|
||||
"4 Return To the Museum. That film, called Bl 0.017376 \n",
|
||||
"5 Brando plays the ace jet fighter pilot, who stops 0.565335 \n",
|
||||
"6 And a rather Utopian horror movie and with good 2.245751 \n",
|
||||
"7 The plot of this movie hangs in the balance as... 1.122540 \n",
|
||||
"8 This isn't all that bad; as for my 0.623968 \n",
|
||||
"9 This movie was for a good reason!' Uh, OK 0.437566 \n",
|
||||
"10 witty. funny.<|endoftext|> 1.636344 \n",
|
||||
"11 It's very hard to believe that anyone would en... 1.003727 \n",
|
||||
"12 Absolutely fantastic trash....this one was hav... 1.350834 \n",
|
||||
"13 Prior to this action film, 0.242474 \n",
|
||||
"14 i, Marty Rathbun, Damon Wayans, Mark Watney and 0.105734 \n",
|
||||
"15 The film takes a very grim craggy look 0.069017 \n",
|
||||
"\n",
|
||||
" response (RLHF) scores (RLHF) \\\n",
|
||||
"0 This movie has plenty of extraordinary feature... 2.735337 \n",
|
||||
"1 OK where do i begin? For all of you who are no... 0.019694 \n",
|
||||
"2 I watched it because of its excellent cast. Th... 2.498309 \n",
|
||||
"3 It's been 19 years since Gordon Tree has becom... 1.632266 \n",
|
||||
"4 Just kidding \"Third World Snopes 0.944632 \n",
|
||||
"5 shakespeare's plays have a way. It's the look ... 1.444803 \n",
|
||||
"6 This movie is wonderful. What someone likes ab... 2.759510 \n",
|
||||
"7 I loved it, and I really loved Audrey 2.578412 \n",
|
||||
"8 A superb and super fun movie that removes all the 2.783201 \n",
|
||||
"9 I remember when Shelter saw some girls on TV 0.825408 \n",
|
||||
"10 This su*k Dress! I loved it 2.345865 \n",
|
||||
"11 One Stinkl was a great actor, particularly 1.782818 \n",
|
||||
"12 I pulled down a VHS looking a good looking, and a -0.008258 \n",
|
||||
"13 For some enthraled time, the film will impress... 2.455694 \n",
|
||||
"14 This one features all kinds of wit and humor r... 2.743043 \n",
|
||||
"15 Somehow a woman working with her daughter play... 0.092226 \n",
|
||||
"0 This movie is one of the finest directors of the 2.726879 \n",
|
||||
"1 one may not have great assets, 0.420451 \n",
|
||||
"2 This is an amazing film, with all thelike wond... 2.918770 \n",
|
||||
"3 just below the world capital is a man 0.238322 \n",
|
||||
"4 Return To the East\" is a fascinating film, 2.648028 \n",
|
||||
"5 Brando plays the ace jet pilot, who's a 0.668954 \n",
|
||||
"6 And a rather Utop Congressional Movie, with a 45 0.307100 \n",
|
||||
"7 The plot of this movie hangs out well. The who... 2.195263 \n",
|
||||
"8 This isn't a good film because I loved it 1.694601 \n",
|
||||
"9 This movie was for a fun, and grand Robinson 2.531890 \n",
|
||||
"10 witty. funny. funnier. more funny. funnier. fu... 2.132353 \n",
|
||||
"11 It's very hard to wrap your mind around what h... 0.778888 \n",
|
||||
"12 Absolutely fantastic trash....this one is a pe... 2.177587 \n",
|
||||
"13 Prior to Christian Kane's star 0.297408 \n",
|
||||
"14 i, perhaps the great movie the director should... 1.336116 \n",
|
||||
"15 The film is one of the best of that era 2.737825 \n",
|
||||
"\n",
|
||||
" response (best_of) scores (best_of) \n",
|
||||
"0 This movie was unexpectedly funny and funny, you 2.405301 \n",
|
||||
"1 OK where do i begin? i just wanted to add some... 0.622912 \n",
|
||||
"2 I watched the trial trial for teaches us a goo... 2.057187 \n",
|
||||
"3 It's been 19 years since Gordon Clarke put me ... 2.783458 \n",
|
||||
"4 Just kidding, I didn't even 1.945202 \n",
|
||||
"5 shakespeare's plays have a way of getting back... 1.834373 \n",
|
||||
"6 This movie is wonderful. What a different look, 2.695312 \n",
|
||||
"7 I loved this film. Reading reviews of it 2.751773 \n",
|
||||
"8 A superb and most finely acted role that I will 2.894923 \n",
|
||||
"9 I remember thinking to myself how SOMEONE who 1.634163 \n",
|
||||
"10 This su*k like a roll of crap 2.422874 \n",
|
||||
"11 One Stink?: Invisible of Saint Barbara, poor 1.667756 \n",
|
||||
"12 I pulled down a VHS copy the other day and all I 0.992919 \n",
|
||||
"13 For some reason, a bomb crashed on the rear of... 0.857423 \n",
|
||||
"14 This one features all the best Birdprogram sup... 2.343950 \n",
|
||||
"15 Somehow a woman working with an overweight ins... 1.415525 "
|
||||
"0 This movie is one of the best looking movies I 2.705925 \n",
|
||||
"1 one may not be supported, terrible 2.043730 \n",
|
||||
"2 This is an amazing film, very moving and this ... 2.871694 \n",
|
||||
"3 just below) in this beautiful comedy. 2.760033 \n",
|
||||
"4 Return To the International: Miyazaki, by Ts 1.072344 \n",
|
||||
"5 Brando plays the ace jet pilot Charlie; his fo... 0.679582 \n",
|
||||
"6 And a rather U of A complete combination of wh... 2.209265 \n",
|
||||
"7 The plot of this movie hangs together within t... 1.310783 \n",
|
||||
"8 This isn't bad writing, powerful actors and sp... 1.835901 \n",
|
||||
"9 This movie was for a bastard.<br /><br 2.311337 \n",
|
||||
"10 witty. funny. In the first scene the comical n... 2.164077 \n",
|
||||
"11 It's very hard to wrap this up, due to lack of... 1.598843 \n",
|
||||
"12 Absolutely fantastic trash....this one ruins i... 2.221997 \n",
|
||||
"13 Prior to his restoration, Passion 1.655534 \n",
|
||||
"14 i, Martin was a thrill of 70s---wow!lee and Heath 2.277638 \n",
|
||||
"15 The film's ambition was almost so great that its 2.357480 "
|
||||
]
|
||||
},
|
||||
"execution_count": 10,
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
@ -624,13 +578,6 @@
|
||||
"df_results = pd.DataFrame(output_data)\n",
|
||||
"df_results"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
@ -640,7 +587,7 @@
|
||||
},
|
||||
"gpuClass": "standard",
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"display_name": "Python 3",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
@ -654,7 +601,7 @@
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.10"
|
||||
"version": "3.12.3"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
|
@ -801,7 +801,7 @@
|
||||
"\n",
|
||||
"One can observe how the model starts to generate more positive outputs after a few optimisation steps.\n",
|
||||
"\n",
|
||||
"> Note: Investigating the KL-divergence will probably show that at this point the model has not converged to the target KL-divergence, yet. To get there would require longer training or starting with a higher inital coefficient."
|
||||
"> Note: Investigating the KL-divergence will probably show that at this point the model has not converged to the target KL-divergence, yet. To get there would require longer training or starting with a higher initial coefficient."
|
||||
]
|
||||
},
|
||||
{
|
||||
|
15
examples/research_projects/layer_skip/README.md
Normal file
15
examples/research_projects/layer_skip/README.md
Normal file
@ -0,0 +1,15 @@
|
||||
# LayerSkip Training Recipe
|
||||
|
||||
Implements the training recipe as described in the [LayerSkip paper](https://huggingface.co/papers/2404.16710).
|
||||
|
||||
## Run training
|
||||
```
|
||||
cd scripts
|
||||
python layer_skip_sft.py
|
||||
```
|
||||
|
||||
## Run benchmark
|
||||
```
|
||||
cd scripts
|
||||
python benchmark_layer_skip.py
|
||||
```
|
@ -0,0 +1,77 @@
|
||||
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import config
|
||||
import torch
|
||||
from torch.utils import benchmark
|
||||
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||
|
||||
|
||||
def generate_tokens(model, inputs):
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
do_sample=False,
|
||||
max_new_tokens=64,
|
||||
)
|
||||
return outputs
|
||||
|
||||
|
||||
def generate_tokens_with_assistance(model, inputs, assistant_early_exit):
|
||||
outputs = model.generate(
|
||||
**inputs,
|
||||
assistant_early_exit=assistant_early_exit,
|
||||
do_sample=False,
|
||||
max_new_tokens=64,
|
||||
)
|
||||
return outputs
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
ckpt = config.hub_model_id
|
||||
|
||||
model = AutoModelForCausalLM.from_pretrained(ckpt, device_map="auto", torch_dtype=torch.bfloat16)
|
||||
tokenizer = AutoTokenizer.from_pretrained(ckpt)
|
||||
|
||||
prompt = "### Instruction: What are my alarms for the rest of the day?\n ### Response: "
|
||||
|
||||
results = []
|
||||
label = "Generation Times"
|
||||
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
|
||||
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="generate_tokens(model, inputs)",
|
||||
setup="from __main__ import generate_tokens",
|
||||
globals={"model": model, "inputs": inputs},
|
||||
num_threads=torch.get_num_threads(),
|
||||
label=label,
|
||||
sub_label="no layer skip",
|
||||
description="generation",
|
||||
).blocked_autorange()
|
||||
)
|
||||
|
||||
for i in range(1, model.config.num_hidden_layers):
|
||||
results.append(
|
||||
benchmark.Timer(
|
||||
stmt="generate_tokens_with_assistance(model, inputs, assistant_early_exit)",
|
||||
setup="from __main__ import generate_assistant_tokens",
|
||||
globals={"model": model, "assistant_early_exit": i, "inputs": inputs},
|
||||
num_threads=torch.get_num_threads(),
|
||||
label=label,
|
||||
sub_label=f"layer skip {i}",
|
||||
description="generation",
|
||||
).blocked_autorange()
|
||||
)
|
||||
|
||||
benchmark.Compare(results).print()
|
@ -1,4 +1,4 @@
|
||||
# Copyright 2025 The HuggingFace Team. All rights reserved.
|
||||
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
@ -12,6 +12,17 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
################################################################################################
|
||||
# This file has been moved to https://github.com/huggingface/trl/blob/main/trl/scripts/chat.py #
|
||||
################################################################################################
|
||||
from huggingface_hub import whoami
|
||||
|
||||
|
||||
model_name = "unsloth/Llama-3.2-3B"
|
||||
tokenizer_name = "unsloth/Llama-3.2-3B"
|
||||
dataset_name = "WillHeld/top_v2"
|
||||
|
||||
output_root_dir = "./checkpoints/"
|
||||
hub_model_id = f"{whoami()['name']}/layerskip-{model_name.split('/')[1]}-{dataset_name.split('/')[1]}"
|
||||
output_dir = f"{output_root_dir}/{hub_model_id}"
|
||||
|
||||
per_device_train_batch_size = 8
|
||||
gradient_accumulation_steps = 1
|
||||
learning_rate = 2e-5
|
@ -0,0 +1,48 @@
|
||||
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from trl import SFTTrainer
|
||||
|
||||
|
||||
class LayerSkipSFTTrainer(SFTTrainer):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
self.early_exit_layer = 0 # initialize with 0
|
||||
self.always_last_layer = True
|
||||
self.early_exit_loss_scale = 1.0
|
||||
|
||||
def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
|
||||
self.early_exit_layer = (
|
||||
self.early_exit_layer % (model.config.num_hidden_layers - 1)
|
||||
) + 1 # rotates between [1, num_hidden_layers-1]
|
||||
bs, seqlen = inputs.input_ids.shape
|
||||
|
||||
labels = inputs.pop("labels")
|
||||
outputs = model(**inputs, output_hidden_states=True)
|
||||
|
||||
hidden_state = outputs["hidden_states"][self.early_exit_layer].to(model.dtype)
|
||||
if self.early_exit_layer != model.config.num_hidden_layers:
|
||||
hidden_state = model.model.norm(hidden_state)
|
||||
logits = model.lm_head(hidden_state)
|
||||
loss_early = model.loss_function(logits=logits, labels=labels, vocab_size=model.vocab_size)
|
||||
|
||||
if self.always_last_layer:
|
||||
loss_last = model.loss_function(logits=outputs["logits"], labels=labels, vocab_size=model.vocab_size)
|
||||
loss = self.early_exit_loss_scale * loss_early.to(loss_last.device) + 1.0 * loss_last
|
||||
# normalize loss scales
|
||||
loss = loss / (1.0 + self.early_exit_loss_scale)
|
||||
else:
|
||||
loss = loss_early
|
||||
|
||||
return loss
|
@@ -0,0 +1,90 @@
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import config
import torch
from custom_trainer import LayerSkipSFTTrainer
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from trl import DataCollatorForCompletionOnlyLM, SFTConfig


def formatting_prompts_func(example):
    text = f"### Instruction: {example['utterance']}\n ### Response: {example['semantic_parse']}"

    # Inject eos_token as a string before tokenization, because they are not always added
    # See: https://github.com/huggingface/transformers/issues/22794 and
    # https://github.com/huggingface/trl/issues/1623
    if tokenizer.eos_token:  # usually something like "</s>" for GPT2 or "<|endoftext|>"
        text += f"{tokenizer.eos_token}"

    return text


if __name__ == "__main__":
    # load the dataset
    print("[INFO] loading the dataset...")
    train_dataset = load_dataset(config.dataset_name, split="train")

    print(f"output_root_dir: {config.output_root_dir}")
    print(f"hub_model_id: {config.hub_model_id}")

    # load the model and tokenizer
    print("[INFO] loading the model and tokenizer...")
    model = AutoModelForCausalLM.from_pretrained(config.model_name, device_map="auto", torch_dtype=torch.bfloat16)
    tokenizer = AutoTokenizer.from_pretrained(config.tokenizer_name, add_eos_token=True)

    # adding pad and eos tokens if not provided in the tokenizer
    if tokenizer.pad_token is None:
        # Add '[PAD]' token if it doesn't exist
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})
        model.resize_token_embeddings(len(tokenizer))
        model.config.pad_token_id = tokenizer.pad_token_id

    if tokenizer.eos_token is None or tokenizer.eos_token == tokenizer.bos_token:
        # Add '[EOS]' token if it doesn't exist
        tokenizer.add_special_tokens({"eos_token": "[EOS]"})
        model.resize_token_embeddings(len(tokenizer))
        model.config.eos_token_id = tokenizer.eos_token_id

    response_template = " ### Response:"
    collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)

    args = SFTConfig(
        do_train=True,
        bf16=True,
        max_seq_length=None,
        per_device_train_batch_size=config.per_device_train_batch_size,
        gradient_accumulation_steps=config.gradient_accumulation_steps,
        learning_rate=config.learning_rate,
        packing=False,
        num_train_epochs=1.0,
        report_to="none",
        push_to_hub=True,
        hub_model_id=config.hub_model_id,
        output_dir=config.output_dir,
        save_steps=1000,
        save_total_limit=2,
    )

    trainer = LayerSkipSFTTrainer(
        model,
        train_dataset=train_dataset,
        args=args,
        formatting_func=formatting_prompts_func,
        data_collator=collator,
    )

    trainer.train()
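The script above imports a local `config` module of which this diff only shows fragments (the `gradient_accumulation_steps = 1` and `learning_rate = 2e-5` lines near the top of this section). A hypothetical sketch of what such a `config.py` would need to expose for the script to run; every value except those two is a placeholder, not taken from the diff:

# config.py -- hypothetical sketch; names match what the training script above imports.
model_name = "your-org/your-causal-lm"        # placeholder: any causal LM checkpoint
tokenizer_name = model_name
dataset_name = "your-org/your-dataset"        # placeholder: needs 'utterance' and 'semantic_parse' columns
output_root_dir = "./checkpoints"             # placeholder
hub_model_id = "your-username/layerskip-sft"  # placeholder
output_dir = f"{output_root_dir}/{hub_model_id}"
per_device_train_batch_size = 8               # placeholder
gradient_accumulation_steps = 1               # value shown earlier in this diff
learning_rate = 2e-5                          # value shown earlier in this diff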
@@ -1,4 +1,4 @@
-# Copyright 2025 The HuggingFace Team. All rights reserved.
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -1,4 +1,4 @@
-# Copyright 2025 The HuggingFace Team. All rights reserved.
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -148,7 +148,6 @@ training_args = TrainingArguments(
    label_names=[],
    bf16=script_args.bf16,
    logging_strategy="steps",
    logging_steps=10,
    optim=script_args.optim,
    lr_scheduler_type=script_args.lr_scheduler_type,
    seed=script_args.seed,
@@ -1,4 +1,4 @@
-# Copyright 2025 The HuggingFace Team. All rights reserved.
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -82,7 +82,7 @@ config = PPOConfig(
    batch_size=script_args.batch_size,
    mini_batch_size=script_args.mini_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
-   optimize_cuda_cache=True,
+   optimize_device_cache=True,
    early_stopping=script_args.early_stopping,
    target_kl=script_args.target_kl,
    ppo_epochs=script_args.ppo_epochs,
@@ -1,4 +1,4 @@
-# Copyright 2025 The HuggingFace Team. All rights reserved.
+# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -22,7 +22,6 @@ There were two main steps to the DPO training process:
accelerate launch examples/research_projects/stack_llama_2/scripts/sft_llama2.py \
    --output_dir="./sft" \
    --max_steps=500 \
    --logging_steps=10 \
    --save_steps=10 \
    --per_device_train_batch_size=4 \
    --per_device_eval_batch_size=1 \
Some files were not shown because too many files have changed in this diff.