Release: v0.8.0 (#1453 )

* Release: v0.7.12 * 0.8.0 instead
FEAT: Update README to add DPO + CLIs (#1448 )
2025-10-20 18:43:52 +08:00 · 2024-03-19 17:19:38 +01:00 · 2024-03-19 16:55:56 +01:00 · 2024-03-19 16:52:42 +01:00 · 2024-03-19 16:07:50 +01:00 · 2024-03-19 12:37:06 +01:00
175 changed files with 18340 additions and 3305 deletions
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@ -0,0 +1,107 @@
+name: "Benchmark on Comment"
+
+# https://docs.github.com/en/actions/using-workflows/events-that-trigger-workflows
+on:
+  issue_comment:
+    types: [created]
+
+jobs:
+  Benchmark:
+    strategy:
+      fail-fast: true
+      matrix:
+        python-version: [3.9]
+        os: [self-hosted]
+
+    name: Benchmark
+    # Only run if it's a PR, the comment starts with /benchmark-trl-experiments, and the commenter is an allow-listed maintainer
+    if: github.event.issue.pull_request && startsWith(github.event.comment.body, '/benchmark-trl-experiments') && contains(FromJSON('["vwxyzjn", "younesbelkada", "lvwerra", "lewtun"]'), github.actor)
+    runs-on: ${{ matrix.os }}
+
+    steps:
+      - name: Get branch of PR
+        uses: xt0rted/pull-request-comment-branch@v1
+        id: comment-branch
+      - name: Set latest commit status as pending
+        uses: myrotvorets/set-commit-status-action@master
+        with:
+          sha: ${{ steps.comment-branch.outputs.head_sha }}
+          token: ${{ secrets.GITHUB_TOKEN }}
+          status: pending
+      - name: Checkout `main` branch
+        uses: actions/checkout@v3
+      - name: Checkout PR branch
+        run: gh pr checkout $PR_NUMBER
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+          PR_NUMBER: ${{ github.event.issue.number }}
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      # - name: Cleanup pip packages (specific to self-hosted runners)
+      #   run: |
+      #     echo PATH is $PATH
+      #     echo PYTHONPATH is $PYTHONPATH
+      #     echo which python is $(which python)
+      #     echo which pip is $(which pip)
+
+      #     pip_list=$(pip list --format=freeze | grep -v "^pip==" | grep -v "^setuptools==")
+      #     if [ ! -z "$pip_list" ]; then
+      #         echo "$pip_list" | xargs pip uninstall -y
+      #     fi
+      # Log the packages already present on the runner before installing anything.
+      - name: Print python dependencies
+        run: pip list --format=freeze
+      - name: Install dependencies
+        run: |
+          pip install .[test,benchmark]
+
+      - name: Login
+        # Hand the credentials to the shell through the environment instead of
+        # interpolating them into the script text, so the rendered command never
+        # contains the secret values and special characters cannot break quoting.
+        env:
+          WANDB_LOGIN_KEY: ${{ secrets.WANDB_API_KEY }}
+          HF_LOGIN_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }}
+        run: wandb login "$WANDB_LOGIN_KEY" && huggingface-cli login --token "$HF_LOGIN_TOKEN"
+      - name: Run benchmark
+        env:
+          GITHUB_CONTEXT: ${{ toJson(github) }}
+          PERSONAL_ACCESS_TOKEN_GITHUB: ${{ secrets.PERSONAL_ACCESS_TOKEN_GITHUB }}
+          # The comment body is user-controlled text. Pass it as data through the
+          # environment; expanding it inside the script would let a quote or
+          # backtick in the comment inject shell commands.
+          COMMENT: ${{ github.event.comment.body }}
+        run: |
+          # Dispatch on the benchmark script named in the comment; anything else is skipped.
+          if [[ "$COMMENT" == *"/benchmark-trl-experiments benchmark/benchmark_level1.sh"* ]]; then
+            echo "Running benchmark/benchmark_level1.sh"
+            BENCHMARK_SCRIPT="benchmark/benchmark_level1.sh" BENCHMARK_PLOT_SCRIPT="benchmark/benchmark_level1_plot.sh" bash benchmark/benchmark_and_report.sh
+          elif [[ "$COMMENT" == *"/benchmark-trl-experiments benchmark/benchmark_level2.sh"* ]]; then
+            echo "Running benchmark/benchmark_level2.sh"
+            BENCHMARK_SCRIPT="benchmark/benchmark_level2.sh" BENCHMARK_PLOT_SCRIPT="benchmark/benchmark_level2_plot.sh" bash benchmark/benchmark_and_report.sh
+          elif [[ "$COMMENT" == *"/benchmark-trl-experiments benchmark/benchmark_level3.sh"* ]]; then
+            echo "Running benchmark/benchmark_level3.sh"
+            BENCHMARK_SCRIPT="benchmark/benchmark_level3.sh" BENCHMARK_PLOT_SCRIPT="benchmark/benchmark_level3_plot.sh" bash benchmark/benchmark_and_report.sh
+          else
+            echo "Invalid command in comment. Skipping execution."
+          fi
+
+      # send message to PR
+      - name: Setup Node.js 16
+        uses: actions/setup-node@v3
+        with:
+          node-version: 16
+      - name: Add workflow result as comment on PR
+        uses: actions/github-script@v6
+        if: always()
+        with:
+          script: |
+            const name = '${{ github.workflow	}}';
+            const url = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}';
+            const success = '${{ job.status }}' === 'success';
+            const body = `${name}: ${success ? 'succeeded ✅' : 'failed ❌'}\n${url}`;
+
+            await github.rest.issues.createComment({
+              issue_number: context.issue.number,
+              owner: context.repo.owner,
+              repo: context.repo.repo,
+              body: body
+            })
+      - name: Set latest commit status as ${{ job.status }}
+        uses: myrotvorets/set-commit-status-action@master
+        if: always()
+        with:
+          sha: ${{ steps.comment-branch.outputs.head_sha }}
+          token: ${{ secrets.GITHUB_TOKEN }}
+          status: ${{ job.status }}
--- a/.github/workflows/build_documentation.yml
+++ b/.github/workflows/build_documentation.yml
@ -13,8 +13,6 @@ jobs:
    with:
      commit_sha: ${{ github.sha }}
      package: trl
-      repo_owner: lvwerra
      version_tag_suffix: ""
    secrets:
-      token: ${{ secrets.HUGGINGFACE_PUSH }}
-      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
+      hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
--- a/.github/workflows/build_pr_documentation.yml
+++ b/.github/workflows/build_pr_documentation.yml
@ -14,5 +14,4 @@ jobs:
      commit_sha: ${{ github.event.pull_request.head.sha }}
      pr_number: ${{ github.event.number }}
      package: trl
-      repo_owner: lvwerra
      version_tag_suffix: ""
--- a/.github/workflows/delete_doc_comment.yml
+++ b/.github/workflows/delete_doc_comment.yml
@ -1,13 +0,0 @@
-name: Delete doc comment
-
-on:
-  workflow_run:
-    workflows: ["Delete doc comment trigger"]
-    types:
-      - completed
-
-jobs:
-  delete:
-    uses: huggingface/doc-builder/.github/workflows/delete_doc_comment.yml@main
-    secrets:
-      comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
--- a/.github/workflows/delete_doc_comment_trigger.yml
+++ b/.github/workflows/delete_doc_comment_trigger.yml
@ -1,12 +0,0 @@
-name: Delete doc comment trigger
-
-on:
-  pull_request:
-    types: [ closed ]
-
-
-jobs:
-  delete:
-    uses: huggingface/doc-builder/.github/workflows/delete_doc_comment_trigger.yml@main
-    with:
-      pr_number: ${{ github.event.number }}
--- a/.github/workflows/docker-build.yml
+++ b/.github/workflows/docker-build.yml
@ -0,0 +1,127 @@
+name: Build Docker images (scheduled)
+
+on:
+  workflow_dispatch:
+  workflow_call:
+  schedule:
+    - cron: "0 1 * * *"
+
+concurrency:
+  group: docker-image-builds
+  cancel-in-progress: false
+
+env:
+  CI_SLACK_CHANNEL: ${{ secrets.CI_DOCKER_CHANNEL }}
+
+jobs:
+  trl-latest:
+    name: "Latest TRL GPU"
+    runs-on: ubuntu-latest
+    steps:
+      - name: Cleanup disk
+        run: |
+          sudo ls -l /usr/local/lib/
+          sudo ls -l /usr/share/
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+      - name: Check out code
+        uses: actions/checkout@v3
+      - name: Login to DockerHub
+        uses: docker/login-action@v1
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+
+      - name: Build and Push GPU
+        uses: docker/build-push-action@v4
+        with:
+          context: ./docker/trl-latest-gpu
+          push: true
+          tags: huggingface/trl-latest-gpu
+
+      - name: Post to a Slack channel
+        # Run even when an earlier step failed; with the default condition this
+        # step is skipped on failure, so only successful builds would be reported.
+        if: always()
+        id: slack
+        #uses: slackapi/slack-github-action@v1.25.0
+        uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
+        with:
+          # Slack channel id, channel name, or user id to post message.
+          # See also: https://api.slack.com/methods/chat.postMessage#channels
+          channel-id: ${{ env.CI_SLACK_CHANNEL }}
+          # For posting a rich message using Block Kit
+          payload: |
+            {
+              "text": "trl-latest-gpu Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}",
+              "blocks": [
+                {
+                  "type": "section",
+                  "text": {
+                    "type": "mrkdwn",
+                    "text": "trl-latest-gpu Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}"
+                  }
+                }
+              ]
+            }
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+
+  trl-source:
+    name: "Latest TRL + HF ecosystem from source"
+    runs-on: ubuntu-latest
+    steps:
+      - name: Cleanup disk
+        run: |
+          sudo ls -l /usr/local/lib/
+          sudo ls -l /usr/share/
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+          sudo rm -rf /usr/local/lib/android
+          sudo rm -rf /usr/share/dotnet
+          sudo du -sh /usr/local/lib/
+          sudo du -sh /usr/share/
+      - name: Set up Docker Buildx
+        uses: docker/setup-buildx-action@v1
+      - name: Check out code
+        uses: actions/checkout@v3
+      - name: Login to DockerHub
+        uses: docker/login-action@v1
+        with:
+          username: ${{ secrets.DOCKERHUB_USERNAME }}
+          password: ${{ secrets.DOCKERHUB_PASSWORD }}
+
+      - name: Build and Push GPU
+        uses: docker/build-push-action@v4
+        with:
+          context: ./docker/trl-source-gpu
+          push: true
+          tags: huggingface/trl-source-gpu
+
+      - name: Post to a Slack channel
+        # Run even when an earlier step failed; with the default condition this
+        # step is skipped on failure, so only successful builds would be reported.
+        if: always()
+        id: slack
+        #uses: slackapi/slack-github-action@v1.25.0
+        uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
+        with:
+          # Slack channel id, channel name, or user id to post message.
+          # See also: https://api.slack.com/methods/chat.postMessage#channels
+          channel-id: ${{ env.CI_SLACK_CHANNEL }}
+          # For posting a rich message using Block Kit
+          payload: |
+            {
+              "text": "trl-source-gpu Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}",
+              "blocks": [
+                {
+                  "type": "section",
+                  "text": {
+                    "type": "mrkdwn",
+                    "text": "trl-source-gpu Docker Image build result: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}"
+                  }
+                }
+              ]
+            }
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
--- a/.github/workflows/slow-tests.yml
+++ b/.github/workflows/slow-tests.yml
@ -0,0 +1,96 @@
+name: Slow tests (on push)
+
+on:
+  push:
+    branches: [ main ]
+    paths:
+      # Run only when python files are modified
+      - "trl/**.py"
+      - "examples/**.py"
+env:
+  RUN_SLOW: "yes"
+  IS_GITHUB_CI: "1"
+  SLACK_API_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
+
+
+jobs:
+  run_all_tests_single_gpu:
+    strategy:
+      fail-fast: false
+      matrix:
+        docker-image-name: ["huggingface/trl-latest-gpu:latest", "huggingface/trl-source-gpu:latest"]
+    runs-on: [self-hosted, single-gpu, nvidia-gpu, t4, ci]
+    env:
+      CUDA_VISIBLE_DEVICES: "0"
+      TEST_TYPE: "single_gpu_${{ matrix.docker-image-name }}"
+    container:
+      image: ${{ matrix.docker-image-name }}
+      options: --gpus all --shm-size "16gb" -e NVIDIA_DISABLE_REQUIRE=true
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - uses: actions/checkout@v3
+      - name: Pip install
+        run: |
+          source activate trl
+          pip install -e ".[test]" --no-deps
+          pip install pytest-reportlog parameterized
+
+      - name: Run slow SFT tests on single GPU
+        if: always()
+        run: |
+          source activate trl
+          make slow_tests
+      
+      - name: Generate Report
+        if: always()
+        run: |
+          pip install slack_sdk tabulate
+          python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
+
+
+  run_all_tests_multi_gpu:
+    strategy:
+      fail-fast: false
+      matrix:
+        docker-image-name: ["huggingface/trl-latest-gpu:latest", "huggingface/trl-source-gpu:latest"]
+    runs-on: [self-hosted, multi-gpu, nvidia-gpu, t4, ci]
+    env:
+      CUDA_VISIBLE_DEVICES: "0,1"
+      TEST_TYPE: "multi_gpu_${{ matrix.docker-image-name }}"
+    container:
+      image: ${{ matrix.docker-image-name }}
+      options: --gpus all --shm-size "16gb" -e NVIDIA_DISABLE_REQUIRE=true
+    defaults:
+      run:
+        shell: bash
+    steps:
+      - uses: actions/checkout@v3
+      - name: Pip install
+        run: |
+          source activate trl
+          pip install -e ".[test]" --no-deps
+          pip install pytest-reportlog parameterized
+
+      - name: Run slow SFT tests on Multi GPU
+        if: always()
+        run: |
+          source activate trl
+          make slow_tests
+
+      - name: Run end-to-end examples tests on multi GPU
+        if: always()
+        run: |
+          source activate trl
+          pip install deepspeed
+          make test_examples
+      
+      - name: Generate Reports
+        if: always()
+        run: |
+          pip install slack_sdk tabulate
+          python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
+          python scripts/log_example_reports.py --text_file_name temp_results_sft_tests.txt >> $GITHUB_STEP_SUMMARY
+          python scripts/log_example_reports.py --text_file_name temp_results_dpo_tests.txt >> $GITHUB_STEP_SUMMARY
+          rm *.txt
--- a/.github/workflows/stale.yml
+++ b/.github/workflows/stale.yml
@ -7,7 +7,7 @@ on:
 jobs:
  close_stale_issues:
    name: Close Stale Issues
-    if: github.repository == 'lvwerra/trl'
+    if: github.repository == 'huggingface/trl'
    runs-on: ubuntu-latest
    env:
      GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/tests-main.yml
+++ b/.github/workflows/tests-main.yml
@ -0,0 +1,63 @@
+name: tests on transformers PEFT main
+
+on:
+  push:
+    branches: [ main ]
+
+env:
+  CI_SLACK_CHANNEL: ${{ secrets.CI_PUSH_MAIN_CHANNEL }}
+
+jobs:
+  tests:
+    strategy:
+      matrix:
+        python-version: ['3.9', '3.10', '3.11']
+        os: ['ubuntu-latest', 'windows-latest']
+      fail-fast: false
+    runs-on: ${{ matrix.os }}
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python-version }}
+        cache: "pip"
+        cache-dependency-path: |
+            setup.py
+            requirements.txt
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        # install PEFT & transformers from source
+        pip install -U git+https://github.com/huggingface/peft.git
+        pip install -U git+https://github.com/huggingface/transformers.git 
+        # cpu version of pytorch
+        pip install ".[test, diffusers]"
+    - name: Test with pytest
+      run: |
+        make test
+    - name: Post to a Slack channel
+      if: always()
+      id: slack
+      #uses: slackapi/slack-github-action@v1.25.0
+      uses: slackapi/slack-github-action@6c661ce58804a1a20f6dc5fbee7f0381b469e001
+      with:
+        # Slack channel id, channel name, or user id to post message.
+        # See also: https://api.slack.com/methods/chat.postMessage#channels
+        channel-id: ${{ env.CI_SLACK_CHANNEL }}
+        # For posting a rich message using Block Kit
+        payload: |
+          {
+            "text": "TRL CI on transformers/PEFT main: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}",
+            "blocks": [
+              {
+                "type": "section",
+                "text": {
+                  "type": "mrkdwn",
+                  "text": "TRL CI on transformers/PEFT main: ${{ job.status }}\n${{ github.event.pull_request.html_url || github.event.head_commit.url }}"
+                }
+              }
+            ]
+          }
+      env:
+        SLACK_BOT_TOKEN: ${{ secrets.SLACK_CIFEEDBACK_BOT_TOKEN }}
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@ -5,35 +5,40 @@ on:
    branches: [ main ]
  pull_request:
    branches: [ main ]
+    paths:
+      # Run only when relevant files are modified
+      - "trl/**.py"
+      - "examples/**.py"
+      - "scripts/**.py"
+      - ".github/**.yml"
+      - "tests/**.py"

 jobs:
-
  check_code_quality:
    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        # Quoted so YAML keeps the version a string instead of a float.
+        python-version: ["3.9"]
+
    steps:
-      - uses: actions/checkout@v3
-      - name: Set up Python
-        uses: actions/setup-python@v4
+      # Same action major versions as the other jobs in this workflow.
+      - uses: actions/checkout@v3
        with:
-          python-version: "3.8"
-          cache: "pip"
-          cache-dependency-path: |
-            setup.py
-            requirements.txt
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          pip install .[dev]
-      - name: Check quality
-        run: |
-          make quality
+          fetch-depth: 0
+          submodules: recursive
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v4
+        with:
+          python-version: ${{ matrix.python-version }}
+      - uses: pre-commit/action@v2.0.3
+        with:
+          extra_args: --all-files

  tests:
    needs: check_code_quality
    strategy:
      matrix:
        python-version: ['3.8', '3.9', '3.10']
-        os: ['ubuntu-latest', 'macos-latest', 'windows-latest']
+        os: ['ubuntu-latest', 'windows-latest']
    runs-on: ${{ matrix.os }}
    steps:
    - uses: actions/checkout@v3
@ -45,6 +50,28 @@ jobs:
        cache-dependency-path: |
            setup.py
            requirements.txt
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        # cpu version of pytorch
+        pip install ".[test, peft, diffusers]"
+    - name: Test with pytest
+      run: |
+        make test
+
+  tests_no_optional_dep:
+    needs: check_code_quality
+    runs-on: 'ubuntu-latest'
+    steps:
+    - uses: actions/checkout@v3
+    - name: Set up Python 3.9
+      uses: actions/setup-python@v4
+      with:
+        python-version: '3.9'
+        cache: "pip"
+        cache-dependency-path: |
+            setup.py
+            requirements.txt
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
--- a/.gitignore
+++ b/.gitignore
@ -1,3 +1,4 @@
+benchmark/trl
 *.bak
 .gitattributes
 .last_checked
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@ -0,0 +1,15 @@
+repos:
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.2.0
+    hooks:
+      - id: ruff
+        args: [ --fix ]
+      - id: ruff-format
+
+  # - repo: https://github.com/codespell-project/codespell
+  #   rev: v2.1.0
+  #   hooks:
+  #     - id: codespell
+  #       args:
+  #         - --ignore-words-list=nd,reacher,thist,ths,magent,ba
+  #         - --skip=docs/css/termynal.css,docs/js/termynal.js
--- a/CITATION.cff
+++ b/CITATION.cff
@ -17,7 +17,7 @@ authors:
    family-names: Thrush
  - given-names: Nathan
    family-names: Lambert
-repository-code: 'https://github.com/lvwerra/trl'
+repository-code: 'https://github.com/huggingface/trl'
 abstract: "With trl you can train transformer language models with Proximal Policy Optimization (PPO). The library is built on top of the transformers library by \U0001F917 Hugging Face. Therefore, pre-trained language models can be directly loaded via transformers. At this point, most decoder and encoder-decoder architectures are supported."
 keywords:
  - rlhf
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -5,7 +5,7 @@
 Before you start contributing make sure you installed all the dev tools:

 ```bash
-pip install -e ".[dev]"
+make dev
 ```

 ## Did you find a bug?
@ -36,10 +36,15 @@ First you want to make sure that all the tests pass:
 make test
 ```

-Then before submitting your PR make sure the code quality follows the standards. You can run the following command to format and test:
+Then before submitting your PR make sure the code quality follows the standards. You can run the following command to format:

 ```bash
-make style && make quality
+make precommit
+```
+
+Make sure to install `pre-commit` before running the command:
+```bash
+pip install pre-commit
 ```

 ## Do you want to contribute to the documentation?
--- a/MANIFEST.in
+++ b/MANIFEST.in
@ -2,4 +2,4 @@ include settings.ini
 include LICENSE
 include CONTRIBUTING.md
 include README.md
-recursive-exclude * __pycache__
+recursive-exclude * __pycache__
--- a/44
+++ b/44
@ -1,15 +1,43 @@
-.PHONY: quality style test
+.PHONY: dev test precommit benchmark_core benchmark_aux common_tests slow_tests test_examples tests_gpu

 check_dirs := examples tests trl

+ACCELERATE_CONFIG_PATH = `pwd`/examples/accelerate_configs
+COMMAND_FILES_PATH = `pwd`/commands
+
+
+# Editable dev install, then symlink examples/scripts into trl/commands.
+# The `$$` makes the shell run `pwd`; a bare `$(pwd)` is read by make as an
+# undefined variable and expands to nothing, so the stale link was never removed.
+dev:
+	[ -L "$$(pwd)/trl/commands/scripts" ] && unlink "$$(pwd)/trl/commands/scripts" || true
+	pip install -e ".[dev]"
+	ln -s `pwd`/examples/scripts/ `pwd`/trl/commands
+
 test:
 	python -m pytest -n auto --dist=loadfile -s -v ./tests/

-quality:
-	black --check --line-length 119 --target-version py38 $(check_dirs)
-	isort --check-only $(check_dirs)
-	flake8 $(check_dirs)
+precommit:
+	pre-commit run --all-files

-style:
-	black --line-length 119 --target-version py38 $(check_dirs)
-	isort $(check_dirs)
+benchmark_core:
+	bash ./benchmark/benchmark_core.sh
+
+benchmark_aux:
+	bash ./benchmark/benchmark_aux.sh
+
+tests_gpu:
+	python -m pytest tests/test_* $(if $(IS_GITHUB_CI),--report-log "common_tests.log",)
+
+slow_tests:
+	python -m pytest tests/slow/test_* $(if $(IS_GITHUB_CI),--report-log "slow_tests.log",)
+
+test_examples:
+	touch temp_results_sft_tests.txt
+	for file in $(ACCELERATE_CONFIG_PATH)/*.yaml; do \
+		TRL_ACCELERATE_CONFIG=$${file} bash $(COMMAND_FILES_PATH)/run_sft.sh; \
+		echo $$?','$${file} >> temp_results_sft_tests.txt; \
+	done
+
+	touch temp_results_dpo_tests.txt
+	for file in $(ACCELERATE_CONFIG_PATH)/*.yaml; do \
+		TRL_ACCELERATE_CONFIG=$${file} bash $(COMMAND_FILES_PATH)/run_dpo.sh; \
+		echo $$?','$${file} >> temp_results_dpo_tests.txt; \
+	done
--- a/README.md
+++ b/README.md
@ -3,57 +3,140 @@
 </div>

 # TRL - Transformer Reinforcement Learning
-> Train transformer language models with reinforcement learning.
+> Full stack library to fine-tune and align large language models.
+
+<p align="center">
+    <a href="https://github.com/huggingface/trl/blob/main/LICENSE">
+        <img alt="License" src="https://img.shields.io/github/license/huggingface/trl.svg?color=blue">
+    </a>
+    <a href="https://huggingface.co/docs/trl/index">
+        <img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/docs/trl/index.svg?down_color=red&down_message=offline&up_message=online">
+    </a>
+    <a href="https://github.com/huggingface/trl/releases">
+        <img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/trl.svg">
+    </a>
+</p>


 ## What is it?
-With `trl` you can train transformer language models with Proximal Policy Optimization (PPO). The library is built on top of the [`transformers`](https://github.com/huggingface/transformers) library by  🤗 Hugging Face. Therefore, pre-trained language models can be directly loaded via `transformers`. At this point most of decoder architectures and encoder-decoder architectures are supported. 

-**Highlights:**
- `PPOTrainer`: A PPO trainer for language models that just needs (query, response, reward) triplets to optimise the language model.
- `AutoModelForCausalLMWithValueHead` & `AutoModelForSeq2SeqLMWithValueHead`: A transformer model with an additional scalar output for each token which can be used as a value function in reinforcement learning.
- Example: Train GPT2 to generate positive movie reviews with a BERT sentiment classifier.
+The `trl` library is a full stack tool to fine-tune and align transformer language and diffusion models using methods such as Supervised Fine-Tuning (SFT), Reward Modeling (RM), Proximal Policy Optimization (PPO), and Direct Preference Optimization (DPO).

-## How it works
-Fine-tuning a language model via PPO consists of roughly three steps:
-
-1. **Rollout**: The language model generates a response or continuation based on query which could be the start of a sentence.
-2. **Evaluation**: The query and response are evaluated with a function, model, human feedback or some combination of them. The important thing is that this process should yield a scalar value for each query/response pair.
-3. **Optimization**: This is the most complex part. In the optimisation step the query/response pairs are used to calculate the log-probabilities of the tokens in the sequences. This is done with the model that is trained and and a reference model, which is usually the pre-trained model before fine-tuning. The KL-divergence between the two outputs is used as an additional reward signal to make sure the generated responses don't deviate to far from the reference language model. The active language model is then trained with PPO.
-
-This process is illustrated in the sketch below:
+The library is built on top of the [`transformers`](https://github.com/huggingface/transformers) library and thus allows you to use any model architecture available there.


-<div style="text-align: center">
-<img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/trl_overview.png" width="800">
-<p style="text-align: center;"> <b>Figure:</b> Sketch of the workflow. </p>
-</div>
+## Highlights
+
+- **`Efficient and scalable`**: 
+    - [`accelerate`](https://github.com/huggingface/accelerate) is the backbone of `trl`, which allows you to scale model training from a single GPU to a large-scale multi-node cluster with methods such as DDP and DeepSpeed.
+    - [`PEFT`](https://github.com/huggingface/peft) is fully integrated and allows you to train even the largest models on modest hardware with quantisation and methods such as LoRA or QLoRA.
+    - [`unsloth`](https://github.com/unslothai/unsloth) is also integrated and allows you to significantly speed up training with dedicated kernels.
+- **`CLI`**: With the [CLI](https://huggingface.co/docs/trl/clis) you can fine-tune and chat with LLMs without writing any code using a single command and a flexible config system.
+- **`Trainers`**: The Trainer classes are an abstraction to apply many fine-tuning methods with ease such as the [`SFTTrainer`](https://huggingface.co/docs/trl/sft_trainer), [`DPOTrainer`](https://huggingface.co/docs/trl/trainer#trl.DPOTrainer), [`RewardTrainer`](https://huggingface.co/docs/trl/reward_trainer), and [`PPOTrainer`](https://huggingface.co/docs/trl/trainer#trl.PPOTrainer).
+- **`AutoModels`**: The [`AutoModelForCausalLMWithValueHead`](https://huggingface.co/docs/trl/models#trl.AutoModelForCausalLMWithValueHead) & [`AutoModelForSeq2SeqLMWithValueHead`](https://huggingface.co/docs/trl/models#trl.AutoModelForSeq2SeqLMWithValueHead) classes add a value head to the model, which allows them to be trained with RL algorithms such as PPO.
+- **`Examples`**: Train GPT2 to generate positive movie reviews with a BERT sentiment classifier, full RLHF using adapters only, train GPT-J to be less toxic, [StackLlama example](https://huggingface.co/blog/stackllama), etc. following the [examples](https://github.com/huggingface/trl/tree/main/examples).

 ## Installation

 ### Python package
-Install the library with pip:
+Install the library with `pip`:
 ```bash
 pip install trl
 ```

 ### From source
-If you want to run the examples in the repository a few additional libraries are required. Clone the repository and install it with pip:
+If you want to use the latest features before an official release you can install from source:
 ```bash
-git clone https://github.com/lvwerra/trl.git
-cd trl/
-pip install .
+pip install git+https://github.com/huggingface/trl.git
 ```

-If you wish to develop TRL, you should install in editable mode:
+### Repository
+If you want to use the examples you can clone the repository with the following command:
 ```bash
-pip install -e .
+git clone https://github.com/huggingface/trl.git
 ```

+## Command Line Interface (CLI)
+
+You can use the TRL Command Line Interface (CLI) to quickly get started with Supervised Fine-tuning (SFT) and Direct Preference Optimization (DPO), and to test your aligned model with the chat CLI:
+
+**SFT:**
+
+```bash
+trl sft --model_name_or_path facebook/opt-125m --dataset_name imdb --output_dir opt-sft-imdb
+```
+
+**DPO:**
+
+```bash
+trl dpo --model_name_or_path facebook/opt-125m --dataset_name trl-internal-testing/Anthropic-hh-rlhf-processed --output_dir opt-sft-hh-rlhf 
+```
+
+**Chat:**
+
+```bash
+trl chat --model_name_or_path Qwen/Qwen1.5-0.5B-Chat
+```
+
+Read more about the CLI in the [relevant documentation section](https://huggingface.co/docs/trl/main/en/clis) or use `--help` for more details.
+
 ## How to use

-### Example
-This is a basic example on how to use the library. Based on a query the language model creates a response which is then evaluated. The evaluation could be a human in the loop or another model's output.
+For more flexibility and control over the training you can use the dedicated trainer classes to fine-tune the model in Python.
+
+### `SFTTrainer`
+
+This is a basic example on how to use the `SFTTrainer` from the library. The `SFTTrainer` is a light wrapper around the `transformers` Trainer to easily fine-tune language models or adapters on a custom dataset.
+
+```python
+# imports
+from datasets import load_dataset
+from trl import SFTTrainer
+
+# get dataset
+dataset = load_dataset("imdb", split="train")
+
+# get trainer
+trainer = SFTTrainer(
+    "facebook/opt-350m",
+    train_dataset=dataset,
+    dataset_text_field="text",
+    max_seq_length=512,
+)
+
+# train
+trainer.train()
+```
+
+### `RewardTrainer`
+
+This is a basic example on how to use the `RewardTrainer` from the library. The `RewardTrainer` is a wrapper around the `transformers` Trainer to easily fine-tune reward models or adapters on a custom preference dataset.
+
+```python
+# imports
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from trl import RewardTrainer
+
+# load model and dataset - dataset needs to be in a specific format
+model = AutoModelForSequenceClassification.from_pretrained("gpt2", num_labels=1)
+tokenizer = AutoTokenizer.from_pretrained("gpt2")
+
+...
+
+# load trainer
+trainer = RewardTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    train_dataset=dataset,
+)
+
+# train
+trainer.train()
+```
+
+### `PPOTrainer`
+
+This is a basic example on how to use the `PPOTrainer` from the library. Based on a query the language model creates a response which is then evaluated. The evaluation could be a human in the loop or another model's output.

 ```python
 # imports
@ -67,11 +150,10 @@ model = AutoModelForCausalLMWithValueHead.from_pretrained('gpt2')
 model_ref = create_reference_model(model)

 tokenizer = AutoTokenizer.from_pretrained('gpt2')
+tokenizer.pad_token = tokenizer.eos_token

 # initialize trainer
-ppo_config = PPOConfig(
-    batch_size=1,
-)
+ppo_config = PPOConfig(batch_size=1, mini_batch_size=1)

 # encode a query
 query_txt = "This morning I went to the "
@ -91,31 +173,60 @@ reward = [torch.tensor(1.0)]
 train_stats = ppo_trainer.step([query_tensor[0]], [response_tensor[0]], reward)
 ```

-### Advanced example: IMDB sentiment
-For a detailed example check out the example python script `examples/sentiment/scripts/gpt2-sentiment.py`, where GPT2 is fine-tuned to generate positive movie reviews. An few examples from the language models before and after optimisation are given below:
+### `DPOTrainer`

-<div style="text-align: center">
-<img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/table_imdb_preview.png" width="800">
-<p style="text-align: center;"> <b>Figure:</b> A few review continuations before and after optimisation. </p>
-</div>
+`DPOTrainer` is a trainer that uses the [Direct Preference Optimization algorithm](https://arxiv.org/abs/2305.18290). This is a basic example of how to use the `DPOTrainer` from the library. The `DPOTrainer` is a wrapper around the `transformers` Trainer to easily fine-tune language models or adapters on a custom preference dataset.
+
+```python
+# imports
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from trl import DPOTrainer
+
+# load model and dataset - dataset needs to be in a specific format
+model = AutoModelForCausalLM.from_pretrained("gpt2")
+tokenizer = AutoTokenizer.from_pretrained("gpt2")
+
+...
+
+# load trainer
+trainer = DPOTrainer(
+    model=model,
+    tokenizer=tokenizer,
+    train_dataset=dataset,
+)
+
+# train
+trainer.train()
+```
+
+## Development
+
+If you want to contribute to `trl` or customize it to your needs, make sure to read the [contribution guide](https://github.com/huggingface/trl/blob/main/CONTRIBUTING.md) and make sure you make a dev install:
+
+```bash
+git clone https://github.com/huggingface/trl.git
+cd trl/
+make dev
+```

 ## References

 ### Proximal Policy Optimisation
 The PPO implementation largely follows the structure introduced in the paper **"Fine-Tuning Language Models from Human Preferences"** by D. Ziegler et al. \[[paper](https://arxiv.org/pdf/1909.08593.pdf), [code](https://github.com/openai/lm-human-preferences)].

-### Language models
-The language models utilize the `transformers` library by 🤗 Hugging Face.
+### Direct Preference Optimization
+DPO is based on the original implementation of **"Direct Preference Optimization: Your Language Model is Secretly a Reward Model"** by E. Mitchell et al. \[[paper](https://arxiv.org/abs/2305.18290), [code](https://github.com/eric-mitchell/direct-preference-optimization)]
+

 ## Citation

 ```bibtex
@misc{vonwerra2022trl,
-  author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert},
+  author = {Leandro von Werra and Younes Belkada and Lewis Tunstall and Edward Beeching and Tristan Thrush and Nathan Lambert and Shengyi Huang},
  title = {TRL: Transformer Reinforcement Learning},
  year = {2020},
  publisher = {GitHub},
  journal = {GitHub repository},
-  howpublished = {\url{https://github.com/lvwerra/trl}}
+  howpublished = {\url{https://github.com/huggingface/trl}}
 }
 ```
--- a/benchmark/benchmark.py
+++ b/benchmark/benchmark.py
@ -6,6 +6,8 @@ import subprocess
 import uuid
 from distutils.util import strtobool

+import requests
+

 def parse_args():
    # fmt: off
@ -38,14 +40,65 @@ def parse_args():
 def run_experiment(command: str):
    command_list = shlex.split(command)
    print(f"running {command}")
-    fd = subprocess.Popen(command_list)
-    return_code = fd.wait()
-    assert return_code == 0
+
+    # Use subprocess.PIPE to capture the output
+    fd = subprocess.Popen(command_list, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    output, errors = fd.communicate()
+
+    return_code = fd.returncode
+    assert return_code == 0, f"Command failed with error: {errors.decode('utf-8')}"
+
+    # Convert bytes to string and strip leading/trailing whitespaces
+    return output.decode("utf-8").strip()
+
+
+def autotag() -> str:
+    wandb_tag = ""
+    print("autotag feature is enabled")
+    git_tag = ""
+    try:
+        git_tag = subprocess.check_output(["git", "describe", "--tags"]).decode("ascii").strip()
+        print(f"identified git tag: {git_tag}")
+    except subprocess.CalledProcessError as e:
+        print(e)
+    if len(git_tag) == 0:
+        try:
+            count = int(subprocess.check_output(["git", "rev-list", "--count", "HEAD"]).decode("ascii").strip())
+            hash = subprocess.check_output(["git", "rev-parse", "--short", "HEAD"]).decode("ascii").strip()
+            git_tag = f"no-tag-{count}-g{hash}"
+            print(f"identified git tag: {git_tag}")
+        except subprocess.CalledProcessError as e:
+            print(e)
+    wandb_tag = f"{git_tag}"
+
+    git_commit = subprocess.check_output(["git", "rev-parse", "--verify", "HEAD"]).decode("ascii").strip()
+    try:
+        # try finding the pull request number on github
+        prs = requests.get(f"https://api.github.com/search/issues?q=repo:huggingface/trl+is:pr+{git_commit}")
+        if prs.status_code == 200:
+            prs = prs.json()
+            if len(prs["items"]) > 0:
+                pr = prs["items"][0]
+                pr_number = pr["number"]
+                wandb_tag += f",pr-{pr_number}"
+        print(f"identified github pull request: {pr_number}")
+    except Exception as e:
+        print(e)
+
+    return wandb_tag


 if __name__ == "__main__":
    args = parse_args()
-
+    if args.auto_tag:
+        existing_wandb_tag = os.environ.get("WANDB_TAGS", "")
+        wandb_tag = autotag()
+        if len(wandb_tag) > 0:
+            if len(existing_wandb_tag) > 0:
+                os.environ["WANDB_TAGS"] = ",".join([existing_wandb_tag, wandb_tag])
+            else:
+                os.environ["WANDB_TAGS"] = wandb_tag
+    print("WANDB_TAGS: ", os.environ.get("WANDB_TAGS", ""))
    commands = []
    for seed in range(0, args.num_seeds):
        commands += [" ".join([args.command, "--seed", str(args.start_seed + seed)])]
@ -93,4 +146,5 @@ if __name__ == "__main__":
        slurm_path = os.path.join("slurm", f"{filename}.slurm")
        print(f"saving command in {slurm_path}")
        if args.workers > 0:
-            run_experiment(f"sbatch {slurm_path}")
+            job_id = run_experiment(f"sbatch --parsable {slurm_path}")
+            print(f"Job ID: {job_id}")
--- a/benchmark/benchmark_and_report.sh
+++ b/benchmark/benchmark_and_report.sh
@ -0,0 +1,26 @@
+# Runs $BENCHMARK_SCRIPT, collects the Slurm job IDs and W&B tags it prints, and
+# submits a follow-up job that plots the results and posts them once the
+# benchmark jobs have finished.
+export WANDB_ENTITY=huggingface
+export WANDB_PROJECT=trl
+bash $BENCHMARK_SCRIPT > output.txt
+
+# Extract Job IDs into an array
+job_ids=($(grep "Job ID:" output.txt | awk '{print $3}'))
+
+# Extract WANDB_TAGS into an array
+# (only the first "WANDB_TAGS:" line is used; it is split on commas below)
+WANDB_TAGS=($(grep "WANDB_TAGS:" output.txt | awk '{print $2}'))
+WANDB_TAGS=($(echo $WANDB_TAGS | tr "," "\n"))
+
+# Print to verify
+echo "Job IDs: ${job_ids[@]}"
+echo "WANDB_TAGS: ${WANDB_TAGS[@]}"
+
+TAGS_STRING="?tag=${WANDB_TAGS[0]}"
+FOLDER_STRING="${WANDB_TAGS[0]}"
+for tag in "${WANDB_TAGS[@]:1}"; do
+    TAGS_STRING+="&tag=$tag"
+    FOLDER_STRING+="_$tag"
+done
+
+echo "TAGS_STRING: $TAGS_STRING"
+echo "FOLDER_STRING: $FOLDER_STRING"
+
+# Wait for ALL submitted jobs: a bare $job_ids expands to the first array element
+# only, so join every ID with ":" as expected by `--dependency=afterany:`.
+# With no job IDs (nothing was submitted) the dependency flag is omitted entirely.
+DEPENDENCY_ARG=""
+if [ ${#job_ids[@]} -gt 0 ]; then
+    DEPENDENCY_ARG="--dependency=afterany:$(IFS=:; echo "${job_ids[*]}")"
+fi
+
+TAGS_STRING=$TAGS_STRING FOLDER_STRING=$FOLDER_STRING BENCHMARK_PLOT_SCRIPT=$BENCHMARK_PLOT_SCRIPT sbatch $DEPENDENCY_ARG benchmark/post_github_comment.sbatch
--- a/benchmark/benchmark_level1.sh
+++ b/benchmark/benchmark_level1.sh
@ -0,0 +1,44 @@
+# hello world experiment
+python benchmark/benchmark.py \
+    --command "python examples/scripts/ppo.py --log_with wandb" \
+    --num-seeds 3 \
+    --start-seed 1 \
+    --workers 10 \
+    --slurm-nodes 1 \
+    --slurm-gpus-per-task 1 \
+    --slurm-ntasks 1 \
+    --slurm-total-cpus 12 \
+    --slurm-template-path benchmark/trl.slurm_template
+
+python benchmark/benchmark.py \
+    --command "python examples/scripts/dpo.py --model_name_or_path=gpt2 --per_device_train_batch_size 4 --max_steps 1000 --learning_rate 1e-3 --gradient_accumulation_steps 1 --logging_steps 10 --eval_steps 500 --output_dir="dpo_anthropic_hh" --optim adamw_torch --warmup_steps 150 --report_to wandb --bf16 --logging_first_step --no_remove_unused_columns" \
+    --num-seeds 3 \
+    --start-seed 1 \
+    --workers 10 \
+    --slurm-nodes 1 \
+    --slurm-gpus-per-task 1 \
+    --slurm-ntasks 1 \
+    --slurm-total-cpus 12 \
+    --slurm-template-path benchmark/trl.slurm_template
+
+python benchmark/benchmark.py \
+    --command "python examples/scripts/sft.py --model_name_or_path="facebook/opt-350m" --report_to="wandb" --learning_rate=1.41e-5 --per_device_train_batch_size=64 --gradient_accumulation_steps=16 --output_dir="sft_openassistant-guanaco" --logging_steps=1 --num_train_epochs=3 --max_steps=-1 --push_to_hub --gradient_checkpointing" \
+    --num-seeds 3 \
+    --start-seed 1 \
+    --workers 10 \
+    --slurm-nodes 1 \
+    --slurm-gpus-per-task 1 \
+    --slurm-ntasks 1 \
+    --slurm-total-cpus 12 \
+    --slurm-template-path benchmark/trl.slurm_template
+
+python benchmark/benchmark.py \
+    --command "python examples/scripts/reward_modeling.py --model_name_or_path=facebook/opt-350m --output_dir="reward_modeling_anthropic_hh" --per_device_train_batch_size=64 --num_train_epochs=1 --gradient_accumulation_steps=16 --gradient_checkpointing=True --learning_rate=1.41e-5 --report_to="wandb" --remove_unused_columns=False --optim="adamw_torch" --logging_steps=10 --evaluation_strategy="steps" --max_length=512" \
+    --num-seeds 3 \
+    --start-seed 1 \
+    --workers 10 \
+    --slurm-nodes 1 \
+    --slurm-gpus-per-task 1 \
+    --slurm-ntasks 1 \
+    --slurm-total-cpus 12 \
+    --slurm-template-path benchmark/trl.slurm_template
--- a/benchmark/benchmark_level1_plot.sh
+++ b/benchmark/benchmark_level1_plot.sh
@ -0,0 +1,50 @@
+# pip install openrlbenchmark==0.2.1a5
+# see https://github.com/openrlbenchmark/openrlbenchmark#get-started for documentation
+echo "we deal with $TAGS_STRING"
+
+python -m openrlbenchmark.rlops_multi_metrics \
+    --filters '?we=huggingface&wpn=trl&xaxis=_step&ceik=trl_ppo_trainer_config.value.reward_model&cen=trl_ppo_trainer_config.value.exp_name&metrics=env/reward_mean&metrics=objective/kl' \
+        "ppo$TAGS_STRING" \
+    --env-ids sentiment-analysis:lvwerra/distilbert-imdb \
+    --no-check-empty-runs \
+    --pc.ncols 2 \
+    --pc.ncols-legend 1 \
+    --output-filename benchmark/trl/$FOLDER_STRING/ppo \
+    --scan-history
+
+python -m openrlbenchmark.rlops_multi_metrics \
+    --filters '?we=huggingface&wpn=trl&xaxis=_step&ceik=output_dir&cen=_name_or_path&metrics=train/rewards/accuracies&metrics=train/loss' \
+        "gpt2$TAGS_STRING" \
+    --env-ids dpo_anthropic_hh \
+    --no-check-empty-runs \
+    --pc.ncols 2 \
+    --pc.ncols-legend 1 \
+    --output-filename benchmark/trl/$FOLDER_STRING/dpo \
+    --scan-history
+
+python -m openrlbenchmark.rlops_multi_metrics \
+    --filters '?we=huggingface&wpn=trl&xaxis=_step&ceik=output_dir&cen=_name_or_path&metrics=train/loss&metrics=eval/accuracy&metrics=eval/loss' \
+        "facebook/opt-350m$TAGS_STRING" \
+    --env-ids reward_modeling_anthropic_hh \
+    --no-check-empty-runs \
+    --pc.ncols 2 \
+    --pc.ncols-legend 1 \
+    --output-filename benchmark/trl/$FOLDER_STRING/reward_modeling \
+    --scan-history
+
+python -m openrlbenchmark.rlops_multi_metrics \
+    --filters '?we=huggingface&wpn=trl&xaxis=_step&ceik=output_dir&cen=_name_or_path&metrics=train/loss' \
+        "facebook/opt-350m$TAGS_STRING" \
+    --env-ids sft_openassistant-guanaco \
+    --no-check-empty-runs \
+    --pc.ncols 2 \
+    --pc.ncols-legend 1 \
+    --output-filename benchmark/trl/$FOLDER_STRING/sft \
+    --scan-history
+
+python benchmark/upload_benchmark.py \
+    --folder_path="benchmark/trl/$FOLDER_STRING" \
+    --path_in_repo="images/benchmark/$FOLDER_STRING" \
+    --repo_id="trl-internal-testing/example-images" \
+    --repo_type="dataset"
+
--- a/benchmark/benchmark_level2.sh
+++ b/benchmark/benchmark_level2.sh
@ -0,0 +1,23 @@
+# compound experiments: gpt2xl + grad_accu
+python benchmark/benchmark.py \
+    --command "python examples/scripts/ppo.py --exp_name ppo_gpt2xl_grad_accu --model_name gpt2-xl --mini_batch_size 16 --gradient_accumulation_steps 8 --log_with wandb" \
+    --num-seeds 3 \
+    --start-seed 1 \
+    --workers 10 \
+    --slurm-nodes 1 \
+    --slurm-gpus-per-task 1 \
+    --slurm-ntasks 1 \
+    --slurm-total-cpus 12 \
+    --slurm-template-path benchmark/trl.slurm_template
+
+# compound experiments: Cerebras-GPT-6.7B + deepspeed zero2 + grad_accu
+python benchmark/benchmark.py \
+    --command "accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml examples/scripts/ppo.py --exp_name ppo_Cerebras-GPT-6.7B_grad_accu_deepspeed_stage2  --batch_size 32  --mini_batch_size 32 --log_with wandb --model_name cerebras/Cerebras-GPT-6.7B --reward_model sentiment-analysis:cerebras/Cerebras-GPT-6.7B" \
+    --num-seeds 3 \
+    --start-seed 1 \
+    --workers 10 \
+    --slurm-nodes 1 \
+    --slurm-gpus-per-task 8 \
+    --slurm-ntasks 1 \
+    --slurm-total-cpus 90 \
+    --slurm-template-path benchmark/trl.slurm_template
--- a/benchmark/benchmark_level2_plot.sh
+++ b/benchmark/benchmark_level2_plot.sh
@ -0,0 +1,31 @@
+# pip install openrlbenchmark==0.2.1a5
+# see https://github.com/openrlbenchmark/openrlbenchmark#get-started for documentation
+echo "we deal with $TAGS_STRING"
+
+python -m openrlbenchmark.rlops_multi_metrics \
+    --filters '?we=huggingface&wpn=trl&xaxis=_step&ceik=trl_ppo_trainer_config.value.reward_model&cen=trl_ppo_trainer_config.value.exp_name&metrics=env/reward_mean&metrics=objective/kl' \
+        "ppo$TAGS_STRING" \
+        "ppo_gpt2xl_grad_accu$TAGS_STRING" \
+    --env-ids sentiment-analysis:lvwerra/distilbert-imdb \
+    --no-check-empty-runs \
+    --pc.ncols 2 \
+    --pc.ncols-legend 1 \
+    --output-filename benchmark/trl/$FOLDER_STRING/different_models \
+    --scan-history
+
+python -m openrlbenchmark.rlops_multi_metrics \
+    --filters '?we=huggingface&wpn=trl&xaxis=_step&ceik=trl_ppo_trainer_config.value.reward_model&cen=trl_ppo_trainer_config.value.exp_name&metrics=env/reward_mean&metrics=objective/kl' \
+        "ppo_Cerebras-GPT-6.7B_grad_accu_deepspeed_stage2$TAGS_STRING" \
+    --env-ids sentiment-analysis:cerebras/Cerebras-GPT-6.7B \
+    --no-check-empty-runs \
+    --pc.ncols 2 \
+    --pc.ncols-legend 1 \
+    --output-filename benchmark/trl/$FOLDER_STRING/deepspeed \
+    --scan-history
+
+python benchmark/upload_benchmark.py \
+    --folder_path="benchmark/trl/$FOLDER_STRING" \
+    --path_in_repo="images/benchmark/$FOLDER_STRING" \
+    --repo_id="trl-internal-testing/example-images" \
+    --repo_type="dataset"
+
--- a/benchmark/benchmark_level3.sh
+++ b/benchmark/benchmark_level3.sh
@ -0,0 +1,46 @@
+## w/ and w/o gradient accumulation
+python benchmark/benchmark.py \
+    --command "python examples/scripts/ppo.py --exp_name ppo_step_grad_accu --mini_batch_size 1 --gradient_accumulation_steps 128 --log_with wandb" \
+    --num-seeds 3 \
+    --start-seed 1 \
+    --workers 10 \
+    --slurm-nodes 1 \
+    --slurm-gpus-per-task 1 \
+    --slurm-ntasks 1 \
+    --slurm-total-cpus 12 \
+    --slurm-template-path benchmark/trl.slurm_template
+
+## w/ different models (gpt2, gpt2-xl, falcon, llama2)
+python benchmark/benchmark.py \
+    --command "python examples/scripts/ppo.py --exp_name ppo_gpt2 --log_with wandb" \
+    --num-seeds 3 \
+    --start-seed 1 \
+    --workers 10 \
+    --slurm-nodes 1 \
+    --slurm-gpus-per-task 1 \
+    --slurm-ntasks 1 \
+    --slurm-total-cpus 12 \
+    --slurm-template-path benchmark/trl.slurm_template
+python benchmark/benchmark.py \
+    --command "python examples/scripts/ppo.py --exp_name ppo_falcon_rw_1b --model_name tiiuae/falcon-rw-1b --log_with wandb" \
+    --num-seeds 3 \
+    --start-seed 1 \
+    --workers 10 \
+    --slurm-nodes 1 \
+    --slurm-gpus-per-task 1 \
+    --slurm-ntasks 1 \
+    --slurm-total-cpus 12 \
+    --slurm-template-path benchmark/trl.slurm_template
+
+
+## w/ and w/o PEFT
+python benchmark/benchmark.py \
+    --command "python examples/scripts/ppo.py --exp_name ppo_peft --use_peft --log_with wandb" \
+    --num-seeds 3 \
+    --start-seed 1 \
+    --workers 10 \
+    --slurm-nodes 1 \
+    --slurm-gpus-per-task 1 \
+    --slurm-ntasks 1 \
+    --slurm-total-cpus 12 \
+    --slurm-template-path benchmark/trl.slurm_template
--- a/benchmark/plot.sh
+++ b/benchmark/plot.sh
@ -0,0 +1,56 @@
+# pip install openrlbenchmark==0.2.1a5
+# see https://github.com/openrlbenchmark/openrlbenchmark#get-started for documentation
+BASELINE_PR_TAG=v0.4.7-55-g110e672
+BASELINE_PR_NAME=PR-662
+
+python -m openrlbenchmark.rlops_multi_metrics \
+    --filters '?we=huggingface&wpn=trl&xaxis=_step&ceik=trl_ppo_trainer_config.value.reward_model&cen=trl_ppo_trainer_config.value.exp_name&metrics=env/reward_mean&metrics=objective/kl' \
+        "sentiment_tuning?tag=$BASELINE_PR_TAG&cl=sentiment lvwerra/gpt2-imdb ($BASELINE_PR_NAME)" \
+    --env-ids sentiment-analysis:lvwerra/distilbert-imdb \
+    --no-check-empty-runs \
+    --pc.ncols 2 \
+    --pc.ncols-legend 1 \
+    --output-filename benchmark/trl/$BASELINE_PR_TAG/sentiment \
+    --scan-history
+
+python -m openrlbenchmark.rlops_multi_metrics \
+    --filters '?we=huggingface&wpn=trl&xaxis=_step&ceik=trl_ppo_trainer_config.value.reward_model&cen=trl_ppo_trainer_config.value.exp_name&metrics=env/reward_mean&metrics=objective/kl' \
+        "sentiment_tuning?tag=$BASELINE_PR_TAG&cl=sentiment lvwerra/gpt2-imdb ($BASELINE_PR_NAME)" \
+        "sentiment_tuning_step_grad_accu?tag=$BASELINE_PR_TAG&cl=sentiment lvwerra/gpt2-imdb gradient accumulation ($BASELINE_PR_NAME)" \
+    --env-ids sentiment-analysis:lvwerra/distilbert-imdb \
+    --no-check-empty-runs \
+    --pc.ncols 2 \
+    --pc.ncols-legend 1 \
+    --output-filename benchmark/trl/$BASELINE_PR_TAG/gradient_accu \
+    --scan-history
+
+python -m openrlbenchmark.rlops_multi_metrics \
+    --filters '?we=huggingface&wpn=trl&xaxis=_step&ceik=trl_ppo_trainer_config.value.reward_model&cen=trl_ppo_trainer_config.value.exp_name&metrics=env/reward_mean&metrics=objective/kl' \
+        "sentiment_tuning?tag=$BASELINE_PR_TAG&cl=sentiment lvwerra/gpt2-imdb ($BASELINE_PR_NAME)" \
+        "sentiment_tuning_gpt2?tag=$BASELINE_PR_TAG&cl=sentiment gpt2 ($BASELINE_PR_NAME)" \
+        "sentiment_tuning_falcon_rw_1b?tag=$BASELINE_PR_TAG&cl=sentiment tiiuae/falcon-rw-1b ($BASELINE_PR_NAME)" \
+        "sentiment_tuning_gpt2xl_grad_accu?tag=$BASELINE_PR_TAG&cl=sentiment gpt2xl ($BASELINE_PR_NAME)" \
+    --env-ids sentiment-analysis:lvwerra/distilbert-imdb \
+    --no-check-empty-runs \
+    --pc.ncols 2 \
+    --pc.ncols-legend 1 \
+    --output-filename benchmark/trl/$BASELINE_PR_TAG/different_models \
+    --scan-history
+
+python -m openrlbenchmark.rlops_multi_metrics \
+    --filters '?we=huggingface&wpn=trl&xaxis=_step&ceik=trl_ppo_trainer_config.value.reward_model&cen=trl_ppo_trainer_config.value.exp_name&metrics=env/reward_mean&metrics=objective/kl' \
+        "sentiment_tuning?tag=$BASELINE_PR_TAG&cl=sentiment lvwerra/gpt2-imdb ($BASELINE_PR_NAME)" \
+        "sentiment_tuning_peft?tag=$BASELINE_PR_TAG&cl=sentiment lvwerra/gpt2-imdb w/ peft ($BASELINE_PR_NAME)" \
+    --env-ids sentiment-analysis:lvwerra/distilbert-imdb \
+    --no-check-empty-runs \
+    --pc.ncols 2 \
+    --pc.ncols-legend 1 \
+    --output-filename benchmark/trl/$BASELINE_PR_TAG/peft \
+    --scan-history
+
+
+python benchmark/upload_benchmark.py \
+    --folder_path="benchmark/trl/$BASELINE_PR_TAG" \
+    --path_in_repo="images/benchmark/$BASELINE_PR_TAG" \
+    --repo_id="trl-internal-testing/example-images" \
+    --repo_type="dataset"
--- a/benchmark/post_github_comment.py
+++ b/benchmark/post_github_comment.py
@ -0,0 +1,26 @@
+import json
+import os
+
+from ghapi.all import GhApi
+
+
+FOLDER_STRING = os.environ.get("FOLDER_STRING", "")
+folder = f"benchmark/trl/{FOLDER_STRING}"
+host_url = f"https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/benchmark/{FOLDER_STRING}"
+
+# Create a GitHub API instance
+github_context = json.loads(os.environ["GITHUB_CONTEXT"])
+token = os.environ["PERSONAL_ACCESS_TOKEN_GITHUB"]  # this needs to refreshed every 12 months
+status_message = "**[COSTA BENCHMARK BOT]**: Here are the results"
+body = status_message
+repo = github_context["repository"]
+owner, repo = repo.split("/")
+api = GhApi(owner=owner, repo=repo, token=token)
+
+# for each `.png` file in the folder, add it to the comment
+for file in os.listdir(folder):
+    if file.endswith(".png"):
+        body += f"\n![{file}]({host_url}/{file})"
+
+# Create a comment on the issue
+api.issues.create_comment(issue_number=github_context["event"]["issue"]["number"], body=body)
--- a/benchmark/post_github_comment.sbatch
+++ b/benchmark/post_github_comment.sbatch
@ -0,0 +1,9 @@
+#!/bin/bash
+#SBATCH --job-name=trl
+#SBATCH --partition=hopper-cpu
+#SBATCH --ntasks=1
+#SBATCH --output=slurm/logs/%x_%j.out
+
+sleep 2m
+bash $BENCHMARK_PLOT_SCRIPT
+srun python benchmark/post_github_comment.py
--- a/benchmark/regression_test.sh
+++ b/benchmark/regression_test.sh
@ -0,0 +1,3 @@
+BENCHMARK_SCRIPT="benchmark/benchmark_level1.sh" \
+BENCHMARK_PLOT_SCRIPT="benchmark/benchmark_level1_plot.sh" \
+bash benchmark/benchmark_and_report.sh
--- a/benchmark/trl.slurm_template
+++ b/benchmark/trl.slurm_template
@ -1,11 +1,14 @@
 #!/bin/bash
-#SBATCH --partition=dev-cluster
+#SBATCH --job-name=trl
+#SBATCH --partition=hopper-prod
 #SBATCH --gpus-per-task={{gpus_per_task}}
 #SBATCH --cpus-per-gpu={{cpus_per_gpu}}
 #SBATCH --ntasks={{ntasks}}
-#SBATCH --mem-per-cpu=11G
 #SBATCH --output=slurm/logs/%x_%j.out
 #SBATCH --array={{array}}
+##SBATCH --exclude=ip-26-0-149-199
+
+module load cuda/12.1

 {{nodes}}

--- a/benchmark/upload_benchmark.py
+++ b/benchmark/upload_benchmark.py
@ -0,0 +1,23 @@
+# Uploads a local folder to a repository on the Hugging Face Hub.
+# The benchmark plot scripts invoke it with `--folder_path`, `--path_in_repo`,
+# `--repo_id` and `--repo_type` flags, which map onto the fields of `Args`.
+from dataclasses import dataclass
+
+import tyro
+from huggingface_hub import HfApi
+
+
+@dataclass
+class Args:
+    folder_path: str = "benchmark/trl"
+    path_in_repo: str = "images/benchmark"
+    repo_id: str = "trl-internal-testing/example-images"
+    repo_type: str = "dataset"
+
+
+# Populate `Args` from the command line; fields not passed keep the defaults above.
+args = tyro.cli(Args)
+api = HfApi()
+
+# Upload the contents of `folder_path` to `path_in_repo` inside the target repo.
+api.upload_folder(
+    folder_path=args.folder_path,
+    path_in_repo=args.path_in_repo,
+    repo_id=args.repo_id,
+    repo_type=args.repo_type,
+)
--- a/commands/run_dpo.sh
+++ b/commands/run_dpo.sh
@ -0,0 +1,58 @@
+#!/bin/bash
+# This script runs a DPO example end-to-end on a tiny model using different possible configurations
+# but defaults to QLoRA + PEFT
+OUTPUT_DIR="test_dpo/"
+MODEL_NAME="HuggingFaceM4/tiny-random-LlamaForCausalLM"
+DATASET_NAME="trl-internal-testing/Anthropic-hh-rlhf-processed"
+MAX_STEPS=5
+BATCH_SIZE=2
+SEQ_LEN=128
+
+# Handle extra arguments in case one passes accelerate configs.
+EXTRA_ACCELERATE_ARGS=""
+EXTRA_TRAINING_ARGS="""--use_peft \
+    --load_in_4bit
+"""
+
+# Set your number of GPUs here (hard-coded; it is not detected automatically)
+NUM_GPUS=2
+
+if [[ "${TRL_ACCELERATE_CONFIG}" == "" ]]; then
+  EXTRA_ACCELERATE_ARGS=""
+else
+  EXTRA_ACCELERATE_ARGS="--config_file $TRL_ACCELERATE_CONFIG"
+  # For DeepSpeed configs we need to set the `--fp16` flag to comply with our configs exposed
+  # on `examples/accelerate_configs` and our runners do not support bf16 mixed precision training.
+  if [[ $TRL_ACCELERATE_CONFIG == *"deepspeed"* ]]; then
+    EXTRA_TRAINING_ARGS="--fp16"
+  else
+    echo "Keeping QLoRA + PEFT"
+  fi
+fi
+
+
+CMD="""
+accelerate launch $EXTRA_ACCELERATE_ARGS \
+    --num_processes $NUM_GPUS \
+    --mixed_precision 'fp16' \
+    `pwd`/examples/scripts/dpo.py \
+    --model_name_or_path $MODEL_NAME \
+    --dataset_name $DATASET_NAME \
+    --output_dir $OUTPUT_DIR \
+    --max_steps $MAX_STEPS \
+    --per_device_train_batch_size $BATCH_SIZE \
+    --max_length $SEQ_LEN \
+    $EXTRA_TRAINING_ARGS
+"""
+
+echo "Starting program..."
+
+{ # try
+    echo $CMD
+    eval "$CMD"
+} || { # catch
+    # save log for exception 
+    echo "Operation Failed!"
+    exit 1
+}
+exit 0
--- a/commands/run_sft.sh
+++ b/commands/run_sft.sh
@ -0,0 +1,59 @@
+#!/bin/bash
+# This script runs an SFT example end-to-end on a tiny model using different possible configurations
+# but defaults to QLoRA + PEFT
+OUTPUT_DIR="test_sft/"
+MODEL_NAME="HuggingFaceM4/tiny-random-LlamaForCausalLM"
+DATASET_NAME="imdb"
+MAX_STEPS=5
+BATCH_SIZE=2
+SEQ_LEN=128
+
+
+# Handle extra arguments in case one passes accelerate configs.
+EXTRA_ACCELERATE_ARGS=""
+EXTRA_TRAINING_ARGS="""--use_peft \
+    --load_in_4bit
+"""
+
+# Set your number of GPUs here
+NUM_GPUS=2
+
+if [[ "${TRL_ACCELERATE_CONFIG}" == "" ]]; then
+  EXTRA_ACCELERATE_ARGS=""
+else
+  EXTRA_ACCELERATE_ARGS="--config_file $TRL_ACCELERATE_CONFIG"
+  # For DeepSpeed configs we need to set the `--fp16` flag to comply with our configs exposed
+  # on `examples/accelerate_configs` and our runners do not support bf16 mixed precision training.
+  if [[ $TRL_ACCELERATE_CONFIG == *"deepspeed"* ]]; then
+    EXTRA_TRAINING_ARGS="--fp16"
+  else
+    echo "Keeping QLoRA + PEFT"
+  fi
+fi
+
+
+CMD="""
+accelerate launch $EXTRA_ACCELERATE_ARGS \
+    --num_processes $NUM_GPUS \
+    --mixed_precision 'fp16' \
+    `pwd`/examples/scripts/sft.py \
+    --model_name $MODEL_NAME \
+    --dataset_name $DATASET_NAME \
+    --output_dir $OUTPUT_DIR \
+    --max_steps $MAX_STEPS \
+    --per_device_train_batch_size $BATCH_SIZE \
+    --max_seq_length $SEQ_LEN \
+    $EXTRA_TRAINING_ARGS
+"""
+
+echo "Starting program..."
+
+{ # try
+    echo $CMD
+    eval "$CMD"
+} || { # catch
+    # save log for exception 
+    echo "Operation Failed!"
+    exit 1
+}
+exit 0
--- a/docker/trl-latest-gpu/Dockerfile
+++ b/docker/trl-latest-gpu/Dockerfile
@ -0,0 +1,66 @@
+# Builds GPU docker image of PyTorch
+# Uses multi-staged approach to reduce size
+# Stage 1
+# Use base conda image to reduce time
+FROM continuumio/miniconda3:latest AS compile-image
+# Specify py version
+ENV PYTHON_VERSION=3.10
+# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
+RUN apt-get update && \
+    apt-get install -y curl git wget software-properties-common git-lfs && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists*
+
+# Install audio-related libraries 
+RUN apt-get update && \
+    apt install -y ffmpeg
+
+RUN apt install -y libsndfile1-dev
+RUN git lfs install
+
+# Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
+RUN conda create --name trl python=${PYTHON_VERSION} ipython jupyter pip
+RUN python3 -m pip install --no-cache-dir --upgrade pip
+
+# Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
+# We don't install pytorch here yet since CUDA isn't available
+# instead we use the direct torch wheel
+ENV PATH /opt/conda/envs/trl/bin:$PATH
+# Activate our bash shell
+RUN chsh -s /bin/bash
+SHELL ["/bin/bash", "-c"]
+
+# Stage 2
+FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 AS build-image
+COPY --from=compile-image /opt/conda /opt/conda
+ENV PATH /opt/conda/bin:$PATH
+
+RUN chsh -s /bin/bash
+SHELL ["/bin/bash", "-c"]
+RUN source activate trl && \ 
+    python3 -m pip install --no-cache-dir bitsandbytes optimum auto-gptq
+
+# Install apt libs
+RUN apt-get update && \
+    apt-get install -y curl git wget && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists*
+
+# Activate the conda env and install the latest released transformers, accelerate and peft, plus trl from source
+RUN source activate trl && \
+    python3 -m pip install -U --no-cache-dir \
+    librosa \
+    "soundfile>=0.12.1" \
+    scipy \
+    transformers \
+    accelerate \
+    peft \
+    trl[test]@git+https://github.com/huggingface/trl
+
+RUN source activate trl && \ 
+    pip freeze | grep trl
+
+RUN echo "source activate trl" >> ~/.profile
+
+# Activate the virtualenv
+CMD ["/bin/bash"]
--- a/docker/trl-source-gpu/Dockerfile
+++ b/docker/trl-source-gpu/Dockerfile
@ -0,0 +1,66 @@
+# Builds GPU docker image of PyTorch
+# Uses multi-staged approach to reduce size
+# Stage 1
+# Use base conda image to reduce time
+FROM continuumio/miniconda3:latest AS compile-image
+# Specify py version
+ENV PYTHON_VERSION=3.10
+# Install apt libs - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
+RUN apt-get update && \
+    apt-get install -y curl git wget software-properties-common git-lfs && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists*
+
+# Install audio-related libraries 
+RUN apt-get update && \
+    apt install -y ffmpeg
+
+RUN apt install -y libsndfile1-dev
+RUN git lfs install
+
+# Create our conda env - copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
+RUN conda create --name trl python=${PYTHON_VERSION} ipython jupyter pip
+RUN python3 -m pip install --no-cache-dir --upgrade pip
+
+# Below is copied from https://github.com/huggingface/accelerate/blob/main/docker/accelerate-gpu/Dockerfile
+# We don't install pytorch here yet since CUDA isn't available
+# instead we use the direct torch wheel
+ENV PATH /opt/conda/envs/trl/bin:$PATH
+# Activate our bash shell
+RUN chsh -s /bin/bash
+SHELL ["/bin/bash", "-c"]
+
+# Stage 2
+FROM nvidia/cuda:12.2.2-devel-ubuntu22.04 AS build-image
+COPY --from=compile-image /opt/conda /opt/conda
+ENV PATH /opt/conda/bin:$PATH
+
+RUN chsh -s /bin/bash
+SHELL ["/bin/bash", "-c"]
+RUN source activate trl && \ 
+    python3 -m pip install --no-cache-dir bitsandbytes optimum auto-gptq
+
+# Install apt libs
+RUN apt-get update && \
+    apt-get install -y curl git wget && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists*
+
+# Activate the conda env and install transformers + accelerate from source
+RUN source activate trl && \
+    python3 -m pip install -U --no-cache-dir \
+    librosa \
+    "soundfile>=0.12.1" \
+    scipy \
+    git+https://github.com/huggingface/transformers \
+    git+https://github.com/huggingface/accelerate \
+    git+https://github.com/huggingface/peft \
+    trl[test]@git+https://github.com/huggingface/trl
+
+RUN source activate trl && \ 
+    pip freeze | grep transformers
+
+RUN echo "source activate trl" >> ~/.profile
+
+# Activate the virtualenv
+CMD ["/bin/bash"]
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@ -1,14 +1,20 @@
- sections: 
+- sections:
  - local: index
    title: TRL
  - local: quickstart
    title: Quickstart
  - local: installation
    title: Installation
+  - local: clis
+    title: Get started with Command Line Interfaces (CLIs)
+  - local: how_to_train
+    title: PPO Training FAQ
+  - local: use_model
+    title: Use Trained Models
  - local: customization
-    title: Customize your training
+    title: Customize the Training
  - local: logging
-    title: Understanding logs
+    title: Understanding Logs
  title: Get started
 - sections:
  - local: models
@ -16,23 +22,37 @@
  - local: trainer
    title: Trainer Classes
  - local: reward_trainer
-    title: Training your own reward model
+    title: Reward Model Training
  - local: sft_trainer
-    title: Supervised fine-tuning
-  - local: extras
-    title: Extras - Better model output without reinforcement learning
+    title: Supervised Fine-Tuning
+  - local: ppo_trainer
+    title: PPO Trainer
+  - local: best_of_n
+    title: Best of N Sampling
+  - local: dpo_trainer
+    title: DPO Trainer
+  - local: kto_trainer
+    title: KTO Trainer
+  - local: ddpo_trainer
+    title: Denoising Diffusion Policy Optimization
+  - local: iterative_sft_trainer
+    title: Iterative Supervised Fine-Tuning
+  - local: text_environments
+    title: Text Environments
  title: API
- sections: 
+- sections:
+  - local: example_overview
+    title: Example Overview
  - local: sentiment_tuning
    title: Sentiment Tuning
  - local: lora_tuning_peft
-    title: Peft support - Low rank adaption of 8 bit models
-  - local: summarization_reward_tuning
-    title: Summarization Reward Tuning
+    title: Training with PEFT
  - local: detoxifying_a_lm
    title: Detoxifying a Language Model
  - local: using_llama_models
-    title: Using LLaMA with TRL
+    title: Training StackLlama
+  - local: learning_tools
+    title: Learning to Use Tools
  - local: multi_adapter_rl
-    title: Multi Adapter RL (MARL) - a single base model for everything
+    title: Multi Adapter RLHF
  title: Examples
--- a/docs/source/best_of_n.mdx
+++ b/docs/source/best_of_n.mdx
@ -1,4 +1,4 @@
-# Extras: Alternative ways to get better model output without RL based fine-tuning 
+# Best of N sampling: Alternative ways to get better model output without RL based fine-tuning 

 Within the extras module is the `best-of-n` sampler class that serves as an alternative method of generating better model output.
 As to how it fares against the RL based fine-tuning, please look in the `examples` directory for a comparison example
--- a/docs/source/clis.mdx
+++ b/docs/source/clis.mdx
@ -0,0 +1,109 @@
+# Command Line Interfaces (CLIs)
+
+You can use TRL to fine-tune your Language Model with Supervised Fine-Tuning (SFT) or Direct Preference Optimization (DPO), or even chat with your model using the TRL CLIs.
+
+Currently supported CLIs are:
+
+- `trl sft`: fine-tune a LLM on a text/instruction dataset
+- `trl dpo`: fine-tune a LLM with DPO on a preference dataset 
+- `trl chat`: quickly spin up a LLM fine-tuned for chatting
+
+## Fine-tuning with the CLI
+
+Before getting started, pick up a Language Model from Hugging Face Hub. Supported models can be found with the filter "text-generation" within models. Also make sure to pick up a relevant dataset for your task.
+
+Before using the `sft` or `dpo` commands make sure to run:
+```bash
+accelerate config
+```
+and pick up the right configuration for your training setup (single / multi-GPU, DeepSpeed, etc.). Make sure to complete all steps of `accelerate config` before running any CLI command.
+
+We also recommend passing a YAML config file to configure your training protocol. Below is a simple example of a YAML file that you can use for training your models with the `trl sft` command.
+
+```yaml
+model_name_or_path:
+  HuggingFaceM4/tiny-random-LlamaForCausalLM
+dataset_name:
+  imdb
+dataset_text_field:
+  text
+report_to:
+  none
+learning_rate:
+  0.0001
+lr_scheduler_type:
+  cosine
+```
+
+Save that config in a `.yaml` file and get started right away! Note that you can overwrite the arguments from the config file by explicitly passing them to the CLI, e.g.:
+
+```bash
+trl sft --config example_config.yaml --output_dir test-trl-cli --lr_scheduler_type cosine_with_restarts
+```
+
+This will force-use `cosine_with_restarts` for `lr_scheduler_type`.
+
+### Supported Arguments 
+
+We support all arguments from `transformers.TrainingArguments`. For loading your model, we also support all arguments from `~trl.ModelConfig`:
+
+[[autodoc]] ModelConfig
+
+You can pass any of these arguments either to the CLI or the YAML file.
+
+### Supervised Fine-tuning (SFT)
+
+Follow the basic instructions above and run `trl sft --output_dir <output_dir> <*args>`: 
+
+```bash
+trl sft --model_name_or_path facebook/opt-125m --dataset_name imdb --output_dir opt-sft-imdb
+```
+
+The SFT CLI is based on the `examples/scripts/sft.py` script.
+
+### Direct Preference Optimization (DPO)
+
+First, follow the basic instructions above and run `trl dpo --output_dir <output_dir> <*args>`. Make sure to process your DPO dataset in the TRL format as follows:
+
+1- Make sure to pre-tokenize the dataset using chat templates:
+
+```bash
+python examples/datasets/tokenize_ds.py --model gpt2 --dataset yourdataset
+```
+
+You might need to adapt `examples/datasets/tokenize_ds.py` to use your chat template.
+
+2- Format the dataset into TRL format (you can adapt the `examples/datasets/anthropic_hh.py`):
+
+```bash
+python examples/datasets/anthropic_hh.py --push_to_hub --hf_entity your-hf-org
+```
+
+Once your dataset has been pushed, run the DPO CLI as follows:
+
+```bash
+trl dpo --model_name_or_path facebook/opt-125m --dataset_name trl-internal-testing/Anthropic-hh-rlhf-processed --output_dir opt-sft-hh-rlhf
+```
+
+The DPO CLI is based on the `examples/scripts/dpo.py` script.
+
+## Chat interface
+
+The chat CLI lets you quickly load the model and talk to it. Simply run the following:
+
+```bash
+trl chat --model_name_or_path  Qwen/Qwen1.5-0.5B-Chat 
+```
+
+Note that the chat interface relies on the chat template of the tokenizer to format the inputs for the model. Make sure your tokenizer has a chat template defined.
+
+Besides talking to the model there are a few commands you can use:
+
+- **clear**: clears the current conversation and starts a new one
+- **example {NAME}**: load example named `{NAME}` from the config and use it as the user input
+- **set {SETTING_NAME}={SETTING_VALUE};**: change the system prompt or generation settings (multiple settings are separated by a ';').
+- **reset**: same as clear but also resets the generation configs to defaults if they have been changed by **set**
+- **save {SAVE_NAME} (optional)**: save the current chat and settings to file by default to `./chat_history/{MODEL_NAME}/chat_{DATETIME}.yaml` or `{SAVE_NAME}` if provided
+- **exit**: closes the interface
+
+The default examples are defined in `examples/scripts/config/default_chat_config.yaml` but you can pass your own with `--config CONFIG_FILE` where you can also specify the default generation parameters.
--- a/docs/source/customization.mdx
+++ b/docs/source/customization.mdx
@ -1,22 +1,50 @@
 # Training customization

-At `trl` we provide the possibility to give enough modularity to users to be able to efficiently customize the training loop for their needs. Below are some examples on how you can apply and test different techniques.
+TRL is designed with modularity in mind so that users are able to efficiently customize the training loop for their needs. Below are some examples of how you can apply and test different techniques.

-## Run on multiple GPUs / nodes
+## Train on multiple GPUs / nodes

-We leverage `accelerate` to enable users to run their training on multiple GPUs or nodes. You should first create your accelerate config by simply running:
+The trainers in TRL use 🤗 Accelerate to enable distributed training across multiple GPUs or nodes. To do so, first create an 🤗 Accelerate config file by running

 ```bash
 accelerate config
 ```

-Then make sure you have selected multi-gpu / multi-node setup. You can then run your training by simply running:
+and answering the questions according to your multi-gpu / multi-node setup. You can then launch distributed training by running:

 ```bash
 accelerate launch your_script.py
 ```

-Refer to the [examples page](https://github.com/lvwerra/trl/tree/main/examples) for more details
+We also provide config files in the [examples folder](https://github.com/huggingface/trl/tree/main/examples/accelerate_configs) that can be used as templates. To use these templates, simply pass the path to the config file when launching a job, e.g.:
+
+```shell
+accelerate launch --config_file=examples/accelerate_configs/multi_gpu.yaml --num_processes {NUM_GPUS} path_to_script.py --all_arguments_of_the_script
+```
+
+Refer to the [examples page](https://github.com/huggingface/trl/tree/main/examples) for more details.
+
+### Distributed training with DeepSpeed
+
+All of the trainers in TRL can be run on multiple GPUs together with DeepSpeed ZeRO-{1,2,3} for efficient sharding of the optimizer states, gradients, and model weights. To do so, run:
+
+```shell
+accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero{1,2,3}.yaml --num_processes {NUM_GPUS} path_to_your_script.py --all_arguments_of_the_script
+```
+
+Note that for ZeRO-3, a small tweak is needed to initialize your reward model on the correct device via the `zero3_init_context_manager()` context manager. In particular, this is needed to avoid DeepSpeed hanging after a fixed number of training steps. Here is a snippet of what is involved from the [`sentiment_tuning`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo.py) example:
+
+```python
+ds_plugin = ppo_trainer.accelerator.state.deepspeed_plugin
+if ds_plugin is not None and ds_plugin.is_zero3_init_enabled():
+    with ds_plugin.zero3_init_context_manager(enable=False):
+        sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=device)
+else:
+    sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=device)
+```
+
+Consult the 🤗 Accelerate [documentation](https://huggingface.co/docs/accelerate/usage_guides/deepspeed) for more information about the DeepSpeed plugin.
+

 ## Use different optimizers

@ -167,31 +195,22 @@ When training large models, you should better handle the CUDA cache by iterative
 config = PPOConfig(..., optimize_cuda_cache=True)
 ```

-## Use correctly DeepSpeed stage 3:

-A small tweak need to be added to your training script to use DeepSpeed stage 3 correctly. You need to properly initialize your reward model on the correct device using the `zero3_init_context_manager` context manager. Here is an example adapted for the `gpt2-sentiment` script:

+## Use score scaling/normalization/clipping
+As suggested by [Secrets of RLHF in Large Language Models Part I: PPO](https://arxiv.org/abs/2307.04964), we support score (aka reward) scaling/normalization/clipping to improve training stability via `PPOConfig`:
 ```python
-ds_plugin = ppo_trainer.accelerator.state.deepspeed_plugin
-if ds_plugin is not None and ds_plugin.is_zero3_init_enabled():
-    with ds_plugin.zero3_init_context_manager(enable=False):
-        sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=device)
-else:
-    sentiment_pipe = pipeline("sentiment-analysis", model="lvwerra/distilbert-imdb", device=device)
+from trl import PPOConfig
+
+ppo_config = {
+    use_score_scaling=True,
+    use_score_norm=True,
+    score_clip=0.5,
+}
+config = PPOConfig(**ppo_config)
 ```

-## Use torch distributed 
-torch.distributed package provides PyTorch natives method to distribute a network over several machines (mostly useful if there are several GPU nodes). It copies the model on each GPU, runs the forward and backward on each and then applies the mean of gradient of all GPUs for each one. If running torch 1.XX, you can call `torch.distributed.launch`, like
-
-`python -m torch.distributed.launch --nproc_per_node=1 reward_summarization.py --bf16`
-
-For torch 2.+ `torch.distributed.launch` is deprecated and one needs to run:
-`torchrun --nproc_per_node=1 reward_summarization.py --bf16`
-or 
-`python -m torch.distributed.run --nproc_per_node=1 reward_summarization.py --bf16`
-
-Note that using `python -m torch.distributed.launch --nproc_per_node=1 reward_summarization.py --bf16` with torch 2.0 ends in
+To run `ppo.py`, you can use the following command:
 ```
-ValueError: Some specified arguments are not used by the HfArgumentParser: ['--local-rank=0']
-ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 194889) of binary: /home/ubuntu/miniconda3/envs/trl/bin/python
+python examples/scripts/ppo.py --log_with wandb --use_score_scaling --use_score_norm --score_clip 0.5
 ```
--- a/docs/source/ddpo_trainer.mdx
+++ b/docs/source/ddpo_trainer.mdx
@ -0,0 +1,119 @@
+# Denoising Diffusion Policy Optimization
+## The why
+
+| Before | After DDPO finetuning |
+| --- | --- |
+| <div style="text-align: center"><img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/pre_squirrel.png"/></div> |  <div style="text-align: center"><img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/post_squirrel.png"/></div> |
+| <div style="text-align: center"><img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/pre_crab.png"/></div> |  <div style="text-align: center"><img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/post_crab.png"/></div> |
+|  <div style="text-align: center"><img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/pre_starfish.png"/></div> |  <div style="text-align: center"><img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/post_starfish.png"/></div> |
+
+
+## Getting started with Stable Diffusion finetuning with reinforcement learning
+
+The machinery for finetuning of Stable Diffusion models with reinforcement learning makes heavy use of HuggingFace's `diffusers`
+library. A reason for  stating this is that getting started requires a bit of familiarity with the `diffusers` library concepts, mainly two of them - pipelines and schedulers.
+Right out of the box (`diffusers` library), there isn't a `Pipeline` nor a `Scheduler` instance that is suitable for finetuning with reinforcement learning. Some adjustments need to be made.
+
+There is a pipeline interface that is provided by this library that is required to be implemented to be used with the `DDPOTrainer`, which is the main machinery for fine-tuning Stable Diffusion with reinforcement learning. **Note: Only the StableDiffusion architecture is supported at this point.**
+There is a default implementation of this interface that you can use out of the box. Assuming the default implementation is sufficient and/or to get things moving, refer to the training example alongside this guide. 
+
+The point of the interface is to fuse the pipeline and the scheduler into one object which allows for minimalness in terms of having the constraints all in one place. The interface was designed in hopes of catering to pipelines and schedulers beyond the examples in this repository and elsewhere at this time of writing. Also the scheduler step is a method of this pipeline interface and this may seem redundant given that the raw scheduler is accessible via the interface but this is the only way to constrain the scheduler step output to an output type befitting of the algorithm at hand (DDPO).
+
+For a more detailed look into the interface and the associated default implementation, go [here](https://github.com/lvwerra/trl/tree/main/trl/models/modeling_sd_base.py)
+
+Note that the default implementation has a LoRA implementation path and a non-LoRA based implementation path. The LoRA flag is enabled by default and this can be turned off by passing in the flag to do so. LoRA based training is faster and the LoRA associated model hyperparameters responsible for model convergence aren't as finicky as non-LoRA based training.
+
+Also in addition, there is the expectation of providing a reward function and a prompt function. The reward function is used to evaluate the generated images  and the prompt function is used to generate the prompts that are used to generate the images.
+
+## Getting started with `examples/scripts/ddpo.py`
+
+The `ddpo.py` script is a working example of using the `DDPO` trainer to finetune a Stable Diffusion model. This example explicitly configures a small subset of the overall parameters associated with the config object (`DDPOConfig`).
+
+**Note:** one A100 GPU is recommended to get this running. Anything below an A100 will not be able to run this example script and even if it does via relatively smaller sized parameters, the results will most likely be poor.
+
+Almost every configuration parameter has a default. There is only one commandline flag argument that is required of the user to get things up and running. The user is expected to have a [huggingface user access token](https://huggingface.co/docs/hub/security-tokens) that will be used to upload the model post finetuning to HuggingFace hub. The following bash command is to be entered to get things running
+
+```bash
+python ddpo.py --hf_user_access_token <token>
+```
+
+To obtain the documentation of `ddpo.py`, please run `python ddpo.py --help`
+
+The following are things to keep in mind (The code checks this for you as well) in general while configuring the trainer (beyond the use case of using the example script)
+
+- The configurable sample batch size (`--ddpo_config.sample_batch_size=6`) should be greater than or equal to the configurable training batch size (`--ddpo_config.train_batch_size=3`)
+- The configurable sample batch size (`--ddpo_config.sample_batch_size=6`) must be divisible by the configurable train batch size (`--ddpo_config.train_batch_size=3`)
+- The configurable sample batch size (`--ddpo_config.sample_batch_size=6`) must be divisible by both the configurable gradient accumulation steps (`--ddpo_config.train_gradient_accumulation_steps=1`) and the configurable accelerator processes count 
+
+## Setting up the image logging hook function
+
+Expect the function to be given a list of lists of the form
+```python
+[[image, prompt, prompt_metadata, rewards, reward_metadata], ...]
+
+```
+and `image`, `prompt`, `prompt_metadata`, `rewards`, `reward_metadata` are batched.
+The last list in the lists of lists represents the last sample batch. You are likely to want to log this one
+While you are free to log however you want the use of `wandb` or `tensorboard` is recommended.
+
+### Key terms
+
+- `rewards` : The rewards/score is a numerical value associated with the generated image and is key to steering the RL process
+- `reward_metadata` : The reward metadata is the metadata associated with the reward. Think of this as extra information payload delivered alongside the reward
+- `prompt` : The prompt is the text that is used to generate the image
+- `prompt_metadata` : The prompt metadata is the metadata associated with the prompt. A situation where this will not be empty is when the reward model comprises of a [`FLAVA`](https://huggingface.co/docs/transformers/model_doc/flava) setup where questions and ground answers (linked to the generated image) are expected with the generated image (See here: https://github.com/kvablack/ddpo-pytorch/blob/main/ddpo_pytorch/rewards.py#L45)
+- `image` : The image generated by the Stable Diffusion model
+
+Example code for logging sampled images with `wandb` is given below.
+
+```python
+# for logging these images to wandb
+
+def image_outputs_hook(image_data, global_step, accelerate_logger):
+    # For the sake of this example, we only care about the last batch
+    # hence we extract the last element of the list
+    result = {}
+    images, prompts, _, rewards, _ = image_data[-1]
+    for i, image in enumerate(images):
+        pil = Image.fromarray(
+            (image.cpu().numpy().transpose(1, 2, 0) * 255).astype(np.uint8)
+        )
+        pil = pil.resize((256, 256))
+        result[f"{prompts[i]:.25} | {rewards[i]:.2f}"] = [pil]
+    accelerate_logger.log_images(
+        result,
+        step=global_step,
+    )
+
+```
+
+### Using the finetuned model
+
+Assuming you're done with all the epochs and have pushed up your model to the hub, you can use the finetuned model as follows
+
+```python
+
+import torch
+from trl import DefaultDDPOStableDiffusionPipeline
+
+pipeline = DefaultDDPOStableDiffusionPipeline("metric-space/ddpo-finetuned-sd-model")
+
+device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
+
+# memory optimization
+pipeline.vae.to(device, torch.float16)
+pipeline.text_encoder.to(device, torch.float16)
+pipeline.unet.to(device, torch.float16)
+
+prompts = ["squirrel", "crab", "starfish", "whale","sponge", "plankton"]
+results = pipeline(prompts)
+
+for prompt, image in zip(prompts,results.images):
+    image.save(f"{prompt}.png")
+
+```
+
+## Credits
+
+This work is heavily influenced by the repo [here](https://github.com/kvablack/ddpo-pytorch) and the associated paper [Training Diffusion Models
+with Reinforcement Learning by Kevin Black, Michael Janner, Yilan Du, Ilya Kostrikov, Sergey Levine](https://arxiv.org/abs/2305.13301).
--- a/docs/source/detoxifying_a_lm.mdx
+++ b/docs/source/detoxifying_a_lm.mdx
@ -4,12 +4,12 @@ Language models (LMs) are known to sometimes generate toxic outputs. In this exa

 Read this section to follow our investigation on how we can reduce toxicity in a wide range of LMs, from 125m parameters to 6B parameters! 

-Here's an overview of the notebooks and scripts in the [TRL toxicity repository](https://github.com/lvwerra/trl/tree/main/examples/toxicity/scripts) as well as the link for the interactive demo:
+Here's an overview of the notebooks and scripts in the [TRL toxicity repository](https://github.com/huggingface/trl/tree/main/examples/toxicity/scripts) as well as the link for the interactive demo:

 | File | Description | Colab link |
 |---|---| --- |
-| [`gpt-j-6b-toxicity.py`](https://github.com/lvwerra/trl/blob/main/examples/toxicity/scripts/gpt-j-6b-toxicity.py) | Detoxify `GPT-J-6B` using PPO | x | 
-| [`evaluate-toxicity.py`](https://github.com/lvwerra/trl/blob/main/examples/toxicity/scripts/evaluate-toxicity.py) | Evaluate de-toxified models using `evaluate` | x | 
+| [`gpt-j-6b-toxicity.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/toxicity/scripts/gpt-j-6b-toxicity.py) | Detoxify `GPT-J-6B` using PPO | x | 
+| [`evaluate-toxicity.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/toxicity/scripts/evaluate-toxicity.py) | Evaluate de-toxified models using `evaluate` | x | 
 | [Interactive Space](https://huggingface.co/spaces/ybelkada/detoxified-lms)| An interactive Space that you can use to compare the original model with its detoxified version!| x |

 ## Context
@ -174,7 +174,7 @@ Below are few generation examples of `gpt-j-6b-detox` model:
 <img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/trl-toxicity-examples.png">
 </div>

-The evaluation script can be found [here](https://github.com/lvwerra/trl/blob/main/examples/toxicity/scripts/evaluate-toxicity.py).
+The evaluation script can be found [here](https://github.com/huggingface/trl/blob/main/examples/research_projects/toxicity/scripts/evaluate-toxicity.py).

 ### Discussions

--- a/docs/source/dpo_trainer.mdx
+++ b/docs/source/dpo_trainer.mdx
@ -0,0 +1,235 @@
+# DPO Trainer
+
+TRL supports the DPO Trainer for training language models from preference data, as described in the paper [Direct Preference Optimization: Your Language Model is Secretly a Reward Model](https://arxiv.org/abs/2305.18290) by Rafailov et al., 2023. For a full example have a look at  [`examples/scripts/dpo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/dpo.py).
+
+The first step as always is to train your SFT model, to ensure the data we train on is in-distribution for the DPO algorithm.
+
+## How DPO works
+
+Fine-tuning a language model via DPO consists of two steps and is easier than PPO:
+
+1. **Data collection**: Gather a preference dataset with positive and negative selected pairs of generation, given a prompt.
+2. **Optimization**: Maximize the log-likelihood of the DPO loss directly.
+
+DPO-compatible datasets can be found with [the tag `dpo` on Hugging Face Hub](https://huggingface.co/datasets?other=dpo).
+
+This process is illustrated in the sketch below (from [figure 1 of the original paper](https://arxiv.org/pdf/2305.18290.pdf)):
+
+<img width="835" alt="Screenshot 2024-03-19 at 12 39 41" src="https://github.com/huggingface/trl/assets/49240599/9150fac6-3d88-4ca2-8ec6-2a6f3473216d">
+
+Read more about DPO algorithm in the [original paper](https://arxiv.org/pdf/2305.18290.pdf).
+
+
+## Expected dataset format
+
+The DPO trainer expects a very specific format for the dataset, since the model will be trained to directly optimize the preference of which sentence is the most relevant, given two sentences. We provide an example from the [`Anthropic/hh-rlhf`](https://huggingface.co/datasets/Anthropic/hh-rlhf) dataset below:
+
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/rlhf-antropic-example.png" width="50%">
+</div>
+
+Therefore the final dataset object should contain these 3 entries if you use the default `DPODataCollatorWithPadding` data collator. The entries should be named:
+
+- `prompt`
+- `chosen`
+- `rejected`
+
+for example:
+
+```py
+dpo_dataset_dict = {
+    "prompt": [
+        "hello",
+        "how are you",
+        "What is your name?",
+        "What is your name?",
+        "Which is the best programming language?",
+        "Which is the best programming language?",
+        "Which is the best programming language?",
+    ],
+    "chosen": [
+        "hi nice to meet you",
+        "I am fine",
+        "My name is Mary",
+        "My name is Mary",
+        "Python",
+        "Python",
+        "Java",
+    ],
+    "rejected": [
+        "leave me alone",
+        "I am not fine",
+        "Whats it to you?",
+        "I dont have a name",
+        "Javascript",
+        "C++",
+        "C++",
+    ],
+}
+```
+
+where the `prompt` contains the context inputs, `chosen` contains the corresponding chosen responses and `rejected` contains the corresponding negative (rejected) responses. As can be seen a prompt can have multiple responses and this is reflected in the entries being repeated in the dictionary's value arrays.
+
+## Expected model format
+The DPO trainer expects a model of `AutoModelForCausalLM`, compared to PPO that expects `AutoModelForCausalLMWithValueHead` for the value function.
+
+## Using the `DPOTrainer`
+
+For a detailed example have a look at the `examples/scripts/dpo.py` script. At a high level we need to initialize the `DPOTrainer` with a `model` we wish to train, a reference `ref_model` which we will use to calculate the implicit rewards of the preferred and rejected response, the `beta` refers to the hyperparameter of the implicit reward, and the dataset contains the 3 entries listed above. Note that the `model` and `ref_model` need to have the same architecture (ie decoder only or encoder-decoder).
+
+```py
+dpo_trainer = DPOTrainer(
+    model,
+    model_ref,
+    args=training_args,
+    beta=0.1,
+    train_dataset=train_dataset,
+    tokenizer=tokenizer,
+)
+```
+After this one can then call:
+
+```py
+dpo_trainer.train()
+```
+
+Note that the `beta` is the temperature parameter for the DPO loss, typically something in the range of `0.1` to `0.5`. We ignore the reference model as `beta` -> 0.
+
+## Loss functions
+
+Given the preference data, we can fit a binary classifier according to the Bradley-Terry model and in fact the DPO authors propose the sigmoid loss on the normalized likelihood via the `logsigmoid` to fit a logistic regression.
+
+The [RSO](https://arxiv.org/abs/2309.06657) authors propose to use a hinge loss on the normalized likelihood from the [SLiC](https://arxiv.org/abs/2305.10425) paper. The `DPOTrainer` can be switched to this loss via the `loss_type="hinge"` argument and the `beta` in this case is the reciprocal of the margin.
+
+The [IPO](https://arxiv.org/abs/2310.12036) authors provide a deeper theoretical understanding of the DPO algorithms and identify an issue with overfitting and propose an alternative loss which can be used via the `loss_type="ipo"` argument to the trainer. Note that the `beta` parameter is the reciprocal of the gap between the log-likelihood ratios of the chosen vs the rejected completion pair and thus the smaller the `beta` the larger this gap is. As per the paper the loss is averaged over log-likelihoods of the completion (unlike DPO which is summed only).
+
+The [cDPO](https://ericmitchell.ai/cdpo.pdf) is a tweak on the DPO loss where we assume that the preference labels are noisy with some probability that can be passed to the `DPOTrainer` via `label_smoothing` argument (between 0 and 0.5) and then a conservative DPO loss is used. Use the `loss_type="cdpo"` argument to the trainer to use it.
+
+The [KTO](https://arxiv.org/abs/2402.01306) authors directly maximize the utility of LLM generations instead of the log-likelihood of preferences. To use preference data with KTO, we recommend breaking up the n preferences into 2n examples and using [`KTOTrainer`](kto_trainer) (i.e., treating the data like an unpaired feedback dataset). Although it is possible to pass in `loss_type="kto_pair"` into DPOTrainer, this is a highly simplified version of KTO that we *do not recommend* in most cases. Please use [`KTOTrainer`](kto_trainer) when possible.
+
+## Logging
+
+While training and evaluating we record the following reward metrics:
+
+* `rewards/chosen`: the mean difference between the log probabilities of the policy model and the reference model for the chosen responses scaled by beta
+* `rewards/rejected`: the mean difference between the log probabilities of the policy model and the reference model for the rejected responses scaled by beta
+* `rewards/accuracies`: mean of how often the chosen rewards are greater than the corresponding rejected rewards
+* `rewards/margins`: the mean difference between the chosen and corresponding rejected rewards
+
+## Accelerate DPO fine-tuning using `unsloth`
+
+You can further accelerate QLoRA / LoRA (2x faster, 60% less memory) using the [`unsloth`](https://github.com/unslothai/unsloth) library that is fully compatible with `DPOTrainer`. Currently `unsloth` supports only Llama (Yi, TinyLlama, Qwen, Deepseek etc) and Mistral architectures. Some benchmarks for DPO listed below:
+
+|  GPU     | Model           | Dataset   | 🤗  | 🤗 + Flash Attention 2 | 🦥 Unsloth     | 🦥 VRAM saved  |
+|----------|-----------------|-----------|------|------------------------|-----------------|----------------|
+| A100 40G | Zephyr 7b       | Ultra Chat| 1x   | 1.24x                  | **1.88x**       | -11.6%         |
+| Tesla T4 | Zephyr 7b       | Ultra Chat| 1x   | 1.09x                  | **1.55x**       | -18.6%         |
+
+First install `unsloth` according to the [official documentation](https://github.com/unslothai/unsloth). Once installed, you can incorporate unsloth into your workflow in a very simple manner; instead of loading `AutoModelForCausalLM`, you just need to load a `FastLanguageModel` as follows:
+
+```python
+import torch
+from transformers import TrainingArguments
+from trl import DPOTrainer
+from unsloth import FastLanguageModel
+
+max_seq_length = 2048 # Supports automatic RoPE Scaling, so choose any number.
+
+# Load model
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "unsloth/zephyr-sft",
+    max_seq_length = max_seq_length,
+    dtype = None, # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+    load_in_4bit = True, # Use 4bit quantization to reduce memory usage. Can be False.
+    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
+)
+
+# Do model patching and add fast LoRA weights
+model = FastLanguageModel.get_peft_model(
+    model,
+    r = 16,
+    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                      "gate_proj", "up_proj", "down_proj",],
+    lora_alpha = 16,
+    lora_dropout = 0, # Dropout = 0 is currently optimized
+    bias = "none",    # Bias = "none" is currently optimized
+    use_gradient_checkpointing = True,
+    random_state = 3407,
+)
+
+training_args = TrainingArguments(output_dir="./output")
+
+dpo_trainer = DPOTrainer(
+    model,
+    model_ref=None,
+    args=training_args,
+    beta=0.1,
+    train_dataset=train_dataset,
+    tokenizer=tokenizer,
+)
+dpo_trainer.train()
+```
+
+The saved model is fully compatible with Hugging Face's transformers library. Learn more about unsloth in their [official repository](https://github.com/unslothai/unsloth).
+
+## Reference model considerations with PEFT
+
+You have three main options (plus several variants) for how the reference model works when using PEFT, assuming the model that you would like to further enhance with DPO was tuned using (Q)LoRA.
+
+1. Simply create two instances of the model, each loading your adapter - works fine but is very inefficient.
+2. Merge the adapter into the base model, create another adapter on top, then leave the `ref_model` param null, in which case DPOTrainer will unload the adapter for reference inference - efficient, but has potential downsides discussed below.
+3. Load the adapter twice with different names, then use `set_adapter` during training to swap between the adapter being DPO'd and the reference adapter - slightly less efficient compared to 2 (~adapter size VRAM overhead), but avoids the pitfalls.
+
+### Downsides to merging QLoRA before DPO (approach 2)
+
+As suggested by [Benjamin Marie](https://medium.com/@bnjmn_marie/dont-merge-your-lora-adapter-into-a-4-bit-llm-65b6da287997), the best option for merging QLoRA adapters is to first dequantize the base model, then merge the adapter. Something similar to [this script](https://github.com/jondurbin/qlora/blob/main/qmerge.py).
+
+However, after using this approach, you will have an unquantized base model. Therefore, to use QLoRA for DPO, you will need to re-quantize the merged model or use the unquantized merge (resulting in higher memory demand).
+
+### Using option 3 - load the adapter twice
+
+To avoid the downsides with option 2, you can load your fine-tuned adapter into the model twice, with different names, and set the model/ref adapter names in DPOTrainer.
+
+For example:
+```python
+# Load the base model.
+bnb_config = BitsAndBytesConfig(
+    load_in_4bit=True,
+    llm_int8_threshold=6.0,
+    llm_int8_has_fp16_weight=False,
+    bnb_4bit_compute_dtype=torch.bfloat16,
+    bnb_4bit_use_double_quant=True,
+    bnb_4bit_quant_type="nf4",
+)
+model = AutoModelForCausalLM.from_pretrained(
+    "mistralai/mixtral-8x7b-v0.1",
+    load_in_4bit=True,
+    quantization_config=bnb_config,
+    attn_implementation="flash_attention_2",
+    torch_dtype=torch.bfloat16,
+    device_map="auto",
+)
+model.config.use_cache = False
+
+# Load the adapter.
+model = PeftModel.from_pretrained(
+    model,
+    "/path/to/peft",
+    is_trainable=True,
+    adapter_name="train",
+)
+# Load the adapter a second time, with a different name, which will be our reference model.
+model.load_adapter("/path/to/peft", adapter_name="reference")
+
+# Initialize the trainer, without a ref_model param.
+dpo_trainer = DPOTrainer(
+    model,
+    ...
+    model_adapter_name="train",
+    ref_adapter_name="reference",
+)
+```
+
+## DPOTrainer
+
+[[autodoc]] DPOTrainer
--- a/docs/source/example_overview.md
+++ b/docs/source/example_overview.md
@ -0,0 +1,73 @@
+# Examples
+
+
+## Introduction
+
+The examples should work in any of the following settings (with the same script):
+   - single GPU
+   - multi GPUs (using PyTorch distributed mode)
+   - multi GPUs (using DeepSpeed ZeRO-Offload stages 1, 2, & 3)
+   - fp16 (mixed-precision), fp32 (normal precision), or bf16 (bfloat16 precision)
+
+To run it in each of these various modes, first initialize the accelerate
+configuration with `accelerate config`
+
+**NOTE to train with a 4-bit or 8-bit model**, please run
+
+```bash
+pip install --upgrade trl[quantization]
+```
+
+
+## Accelerate Config
+For all the examples, you'll need to generate a 🤗 Accelerate config file with:
+
+```shell
+accelerate config # will prompt you to define the training configuration
+```
+
+Then, it is encouraged to launch jobs with `accelerate launch`!
+
+
+# Maintained Examples
+
+
+| File                                                                                           | Description                                                                                                              |
+|------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------|
+| [`examples/scripts/sft.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/sft.py) | This script shows how to use the `SFTTrainer` to fine-tune a model or adapters on a target dataset.                       |
+| [`examples/scripts/reward_modeling.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/reward_modeling.py) | This script shows how to use the `RewardTrainer` to train a reward model on your own dataset.                            |
+| [`examples/scripts/ppo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo.py) | This script shows how to use the `PPOTrainer` to fine-tune a sentiment analysis model using IMDB dataset                 |
+| [`examples/scripts/ppo_multi_adapter.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo_multi_adapter.py) | This script shows how to use the `PPOTrainer` to train a single base model with multiple adapters. Requires you to run the example script with the reward model training beforehand. |
+| [`examples/scripts/stable_diffusion_tuning_example.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/stable_diffusion_tuning_example.py) | This script shows how to use the `DDPOTrainer` to fine-tune a stable diffusion model using reinforcement learning.       |
+
+Here are also some easier-to-run colab notebooks that you can use to get started with TRL:
+
+
+| File                                                                                           | Description                                                                                                              |
+|----------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------|
+| [`examples/notebooks/best_of_n.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/best_of_n.ipynb)                       | This notebook demonstrates how to use the "Best of N" sampling strategy using TRL when fine-tuning your model with PPO.  |
+| [`examples/notebooks/gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb)              | This notebook demonstrates how to reproduce the GPT2 imdb sentiment tuning example on a jupyter notebook.                |
+| [`examples/notebooks/gpt2-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-control.ipynb)                  | This notebook demonstrates how to reproduce the GPT2 sentiment control example on a jupyter notebook.                    |
+
+
+We also have some other examples that are less maintained but can be used as a reference:
+1. **[research_projects](https://github.com/huggingface/trl/tree/main/examples/research_projects)**: Check out this folder to find the scripts used for some research projects that used TRL (LM de-toxification, Stack-Llama, etc.)
+
+
+## Distributed training
+
+All of the scripts can be run on multiple GPUs by providing the path of an 🤗 Accelerate config file when calling `accelerate launch`. To launch one of them on one or multiple GPUs, run the following command (swapping `{NUM_GPUS}` with the number of GPUs in your machine and `--all_arguments_of_the_script` with your arguments.)
+
+```shell
+accelerate launch --config_file=examples/accelerate_configs/multi_gpu.yaml --num_processes {NUM_GPUS} path_to_script.py --all_arguments_of_the_script
+```
+
+You can also adjust the parameters of the 🤗 Accelerate config file to suit your needs (e.g. training in mixed precision).
+
+### Distributed training with DeepSpeed
+
+Most of the scripts can be run on multiple GPUs together with DeepSpeed ZeRO-{1,2,3} for efficient sharding of the optimizer states, gradients, and model weights. To do so, run the following command (swapping `{NUM_GPUS}` with the number of GPUs in your machine, `--all_arguments_of_the_script` with your arguments, and `--deepspeed_config` with the path to the DeepSpeed config file such as `examples/deepspeed_configs/deepspeed_zero1.yaml`):
+
+```shell
+accelerate launch --config_file=examples/accelerate_configs/deepspeed_zero{1,2,3}.yaml --num_processes {NUM_GPUS} path_to_script.py --all_arguments_of_the_script
+```
--- a/docs/source/how_to_train.md
+++ b/docs/source/how_to_train.md
@ -0,0 +1,65 @@
+# Training FAQ
+
+## What Metrics Should I Look at?
+
+When performing classical supervised fine-tuning of language models, the loss (especially the validation loss) serves as a good indicator of the training progress. However, in Reinforcement Learning (RL), the loss becomes less informative about the model's performance, and its value may fluctuate while the actual performance improves.
+
+To address this, we recommend focusing on two key metrics first:
+
+- **Mean Reward**: The primary goal is to maximize the reward achieved by the model during RL training.
+- **Objective KL Divergence**: KL divergence (Kullback-Leibler divergence) measures the dissimilarity between two probability distributions. In the context of RL training, we use it to quantify the difference between the current model and a reference model. Ideally, we want to keep the KL divergence between 0 and 10 to ensure the model's generated text remains close to what the reference model produces.
+
+However, there are more metrics that can be useful for debugging; check out the [logging section](logging).
+
+## Why Do We Use a Reference Model, and What's the Purpose of KL Divergence?
+
+When training RL models, optimizing solely for reward may lead to unexpected behaviors, where the model exploits the environment in ways that don't align with good language generation. In the case of RLHF, we use a reward model trained to predict whether a generated text is highly ranked by humans.
+
+However, the RL model being optimized against the reward model may learn patterns that yield high reward but do not represent good language. This can result in extreme cases where the model generates texts with excessive exclamation marks or emojis to maximize the reward. In some worst-case scenarios, the model may generate patterns completely unrelated to natural language yet receive high rewards, similar to adversarial attacks.
+
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/kl-example.png">
+<p style="text-align: center;"> <b>Figure:</b> Samples without a KL penalty from <a href="https://arxiv.org/pdf/1909.08593.pdf">https://arxiv.org/pdf/1909.08593.pdf</a>. </p>
+</div>
+
+To address this issue, we add a penalty to the reward function based on the KL divergence between the current model and the reference model. By doing this, we encourage the model to stay close to what the reference model generates.
+
+## What Is the Concern with Negative KL Divergence?
+
+If you generate text by purely sampling from the model distribution, things work fine in general. But when you use the `generate` method there are a few caveats, because it does not always purely sample depending on the settings, which can cause the KL-divergence to go negative. Essentially, when the active model achieves `log_p_token_active < log_p_token_ref` we get negative KL-div. This can happen in several cases:
+
+- **top-k sampling**: the model can smooth out the probability distribution, causing the top-k tokens to have a smaller probability than those of the reference model while still being selected
+- **min_length**: this ignores the EOS token until `min_length` is reached. Thus the model can assign a very low log prob to the EOS token and very high probs to all others until `min_length` is reached
+
+These are just a few examples. Why is negative KL an issue? The total reward `R` is computed as `R = r - beta * KL`, so if the model can learn how to drive the KL-divergence negative it effectively gets a positive reward. In many cases it can be much easier to exploit such a bug in the generation than to actually learn the reward function. In addition, the KL can become arbitrarily small, thus the actual reward can be very small compared to it.
+
+So how should you generate text for PPO training? Let's have a look!
+
+## How to generate text for training?
+
+In order to avoid the KL issues described above, we recommend using the following settings:
+
+```python
+generation_kwargs = {
+    "min_length": -1, # don't ignore the EOS token (see above)
+    "top_k": 0.0, # no top-k sampling
+    "top_p": 1.0, # no nucleus sampling
+    "do_sample": True, # yes, we want to sample
+    "pad_token_id": tokenizer.eos_token_id, # most decoder models don't have a padding token - use EOS token instead
+    "max_new_tokens": 32, # specify how many tokens you want to generate at most
+}
+```
+
+With these settings we usually don't encounter any issues. You can also experiment with other settings, but if you encounter issues with negative KL-divergence, try to go back to these and see if they persist.
+
+## How can you debug your own use-case?
+
+Debugging the RL pipeline can be challenging due to its complexity. Here are some tips and suggestions to make the process easier:
+
+- **Start from a working example**: Begin with a working example from the trl repository and gradually modify it to fit your specific use-case. Changing everything at once can make it difficult to identify the source of potential issues. For example, you can start by replacing the model in the example and once you figure out the best hyperparameters try to switch to your dataset and reward model. If you change everything at once you won't know where a potential problem comes from.
+- **Start small, scale later**: Training large models can be very slow and take several hours or days until you see any improvement. For debugging this is not a convenient timescale so try to use small model variants during the development phase and scale up once that works. That being said you sometimes have to be careful as small models might not have the capacity to solve a complicated task either.
+- **Start simple**: Try to start with a minimal example and build complexity from there. Your use-case might require for example a complicated reward function consisting of many different rewards - try to use one signal first and see if you can optimize that and then add more complexity after that.
+- **Inspect the generations**: It's always a good idea to inspect what the model is generating. Maybe there is a bug in your post-processing or your prompt. Due to bad settings you might cut off generations too soon. These things are very hard to see on the metrics but very obvious if you look at the generations.
+- **Inspect the reward model**: If your reward is not improving over time, maybe there's an issue with the reward model. You can look at extreme cases to see if it does what it should: e.g. in the sentiment case you can check if simple positive and negative examples really get different rewards. And you can look at the distribution of your dataset. Finally, maybe the reward is dominated by the query which the model can't affect so you might need to normalize this (e.g. reward of query+response minus reward of the query).
+
+These are just a few tips that we find helpful - if you have more useful tricks feel free to open a PR to add them as well!
--- a/docs/source/index.mdx
+++ b/docs/source/index.mdx
@ -4,6 +4,58 @@

 # TRL - Transformer Reinforcement Learning

-With the TRL (Transformer Reinforcement Learning) library you can train transformer language models with reinforcement learning. The library is integrated with 🤗 [transformers](https://github.com/huggingface/transformers).
+TRL is a full stack library where we provide a set of tools to train transformer language models with Reinforcement Learning, from the Supervised Fine-tuning step (SFT), Reward Modeling step (RM) to the Proximal Policy Optimization (PPO) step. 
+The library is integrated with 🤗 [transformers](https://github.com/huggingface/transformers).

-TRL supports decoder models such as GPT-2, BLOOM, GPT-Neo which can all be optimized using Proximal Policy Optimization (PPO). You can find installation instructions in the [installation guide](installation) and an introduction to the library in the [Quickstart section](quickstart). There is also a more [in-depth example](sentiment_tuning) to tune GPT-2 to produce positive movie reviews.
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/TRL-readme.png">
+</div>
+
+Check the appropriate sections of the documentation depending on your needs:
+
+## API documentation
+
+- [Model Classes](models): *A brief overview of what each public model class does.*
+- [`SFTTrainer`](sft_trainer): *Easily run supervised fine-tuning of your model with `SFTTrainer`.*
+- [`RewardTrainer`](reward_trainer): *Easily train your reward model using `RewardTrainer`.*
+- [`PPOTrainer`](ppo_trainer): *Further fine-tune the supervised fine-tuned model using the PPO algorithm.*
+- [Best-of-N Sampling](best-of-n): *Use best of n sampling as an alternative way to sample predictions from your active model*
+- [`DPOTrainer`](dpo_trainer): *Direct Preference Optimization training using `DPOTrainer`.*
+- [`TextEnvironment`](text_environment): *Text environment to train your model using tools with RL.*
+
+## Examples
+
+- [Sentiment Tuning](sentiment_tuning): *Fine-tune your model to generate positive movie reviews*
+- [Training with PEFT](lora_tuning_peft): *Memory efficient RLHF training using adapters with PEFT*
+- [Detoxifying LLMs](detoxifying_a_lm): *Detoxify your language model through RLHF*
+- [StackLlama](using_llama_models): *End-to-end RLHF training of a Llama model on Stack exchange dataset*
+- [Learning with Tools](learning_tools): *Walkthrough of using `TextEnvironments`*
+- [Multi-Adapter Training](multi_adapter_rl): *Use a single base model and multiple adapters for memory efficient end-to-end training*
+
+
+## Blog posts
+
+<div class="mt-10">
+  <div class="w-full flex flex-col space-y-4 md:space-y-0 md:grid md:grid-cols-2 md:gap-y-4 md:gap-x-5">
+    <a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="https://huggingface.co/blog/rlhf">
+      <img src="https://raw.githubusercontent.com/huggingface/blog/main/assets/120_rlhf/thumbnail.png" alt="thumbnail">
+      <p class="text-gray-700">Illustrating Reinforcement Learning from Human Feedback</p>
+    </a>
+    <a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="https://huggingface.co/blog/trl-peft">
+      <img src="https://github.com/huggingface/blog/blob/main/assets/133_trl_peft/thumbnail.png?raw=true" alt="thumbnail">
+      <p class="text-gray-700">Fine-tuning 20B LLMs with RLHF on a 24GB consumer GPU</p>
+    </a>
+    <a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="https://huggingface.co/blog/stackllama">
+      <img src="https://github.com/huggingface/blog/blob/main/assets/138_stackllama/thumbnail.png?raw=true" alt="thumbnail">
+      <p class="text-gray-700">StackLLaMA: A hands-on guide to train LLaMA with RLHF</p>
+   </a>
+    <a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="https://huggingface.co/blog/dpo-trl">
+      <img src="https://github.com/huggingface/blog/blob/main/assets/157_dpo_trl/dpo_thumbnail.png?raw=true" alt="thumbnail">
+      <p class="text-gray-700">Fine-tune Llama 2 with DPO</p>
+    </a>
+    <a class="!no-underline border dark:border-gray-700 p-5 rounded-lg shadow hover:shadow-lg" href="https://huggingface.co/blog/trl-ddpo">
+      <img src="https://github.com/huggingface/blog/blob/main/assets/166_trl_ddpo/thumbnail.png?raw=true" alt="thumbnail">
+      <p class="text-gray-700">Finetune Stable Diffusion Models with DDPO via TRL</p>
+    </a>
+  </div>
+</div>
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@ -12,7 +12,7 @@ pip install trl
 You can also install the latest version from source. First clone the repo and then run the installation with `pip`:

 ```bash
-git clone https://github.com/lvwerra/trl.git
+git clone https://github.com/huggingface/trl.git
 cd trl/
 pip install -e .
 ```
--- a/docs/source/iterative_sft_trainer.mdx
+++ b/docs/source/iterative_sft_trainer.mdx
@ -0,0 +1,54 @@
+# Iterative Trainer
+
+Iterative fine-tuning is a training method that enables you to perform custom actions (generation and filtering, for example) between optimization steps. In TRL we provide an easy-to-use API to fine-tune your models in an iterative way in just a few lines of code.
+
+## Usage
+
+To get started quickly, instantiate a model and a tokenizer.
+
+```python
+
+model = AutoModelForCausalLM.from_pretrained(model_name)
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+if tokenizer.pad_token is None:
+    tokenizer.pad_token = tokenizer.eos_token
+
+trainer = IterativeSFTTrainer(
+    model,
+    tokenizer
+)
+
+```
+
+You have the choice to either provide a list of strings or a list of tensors to the step function. 
+
+#### Using a list of tensors as input:
+
+```python
+
+inputs = {
+    "input_ids": input_ids,
+    "attention_mask": attention_mask
+}
+
+trainer.step(**inputs)
+
+```
+
+#### Using a list of strings as input:
+
+```python
+
+inputs = {
+    "texts": texts
+}
+
+trainer.step(**inputs)
+
+```
+
+For causal language models, labels will automatically be created from input_ids or from texts. When using sequence to sequence models you will have to provide your own labels or text_labels.
+
+## IterativeSFTTrainer
+
+[[autodoc]] IterativeSFTTrainer
--- a/docs/source/kto_trainer.mdx
+++ b/docs/source/kto_trainer.mdx
@ -0,0 +1,93 @@
+# KTO Trainer
+
+TRL supports the Kahneman-Tversky Optimization (KTO) Trainer for aligning language models with binary feedback data (e.g., upvote/downvote), as described in the [paper](https://arxiv.org/abs/2402.01306) by Kawin Ethayarajh, Winnie Xu, Niklas Muennighoff, Dan Jurafsky, and Douwe Kiela.
+For a full example have a look at [`examples/scripts/kto.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/kto.py).
+
+Depending on how good your base model is, you may or may not need to do SFT before KTO.
+This is different from standard RLHF and DPO, which always require SFT.
+
+## Expected dataset format
+
+The KTO trainer expects a very specific format for the dataset as it does not require pairwise preferences. Since the model will be trained to directly optimize examples that consist of a prompt, model completion, and a label to indicate whether the completion is "good" or "bad", we expect a dataset with the following columns:
+
+- `prompt`
+- `completion`
+- `label`
+
+for example:
+
+```
+kto_dataset_dict = {
+    "prompt": [
+        "Hey, hello",
+        "How are you",
+        "What is your name?",
+        "What is your name?",
+        "Which is the best programming language?",
+        "Which is the best programming language?",
+        "Which is the best programming language?",
+    ],
+    "completion": [
+        "hi nice to meet you",
+        "leave me alone",
+        "I don't have a name",
+        "My name is Mary",
+        "Python",
+        "C++",
+        "Java",
+    ],
+    "label": [
+        True,
+        False,
+        False,
+        True,
+        True,
+        False,
+        False,
+    ],
+}
+```
+
+where the `prompt` contains the context inputs, `completion` contains the corresponding responses and `label` contains the corresponding flag that indicates if the generated completion is desired (`True`) or undesired (`False`).
+A prompt can have multiple responses and this is reflected in the entries being repeated in the dictionary's value arrays.
+
+## Expected model format
+The KTO trainer expects a model of `AutoModelForCausalLM`, compared to PPO that expects `AutoModelForCausalLMWithValueHead` for the value function.
+
+## Using the `KTOTrainer`
+
+For a detailed example have a look at the `examples/scripts/kto.py` script. At a high level we need to initialize the `KTOTrainer` with a `model` we wish to train and a reference `ref_model` which we will use to calculate the implicit rewards of the preferred and rejected response. 
+
+The `beta` refers to the hyperparameter of the implicit reward, and the dataset contains the 3 entries listed above. Note that the `model` and `ref_model` need to have the same architecture (i.e., decoder-only or encoder-decoder).
+
+The `desirable_weight` and `undesirable_weight` refer to the weights placed on the losses for desirable/positive and undesirable/negative examples.
+By default, they are both 1. However, if you have more of one or the other, then you should upweight the less common type such that the ratio of (`desirable_weight` * number of positives) to (`undesirable_weight` * number of negatives) is in the range 1:1 to 4:3.
+
+```py
+training_args = KTOConfig(
+    beta=0.1,
+    desirable_weight=1.0,
+    undesirable_weight=1.0,
+)
+
+kto_trainer = KTOTrainer(
+    model,
+    model_ref,
+    args=training_args,
+    train_dataset=train_dataset,
+    tokenizer=tokenizer,
+)
+```
+After this one can then call:
+
+```py
+kto_trainer.train()
+```
+
+## KTOTrainer
+
+[[autodoc]] KTOTrainer
+
+## KTOConfig
+
+[[autodoc]] KTOConfig
--- a/docs/source/learning_tools.mdx
+++ b/docs/source/learning_tools.mdx
@ -0,0 +1,234 @@
+# Learning Tools (Experimental 🧪)
+
+Using Large Language Models (LLMs) with tools has been a popular topic recently with awesome works such as [ToolFormer](https://arxiv.org/abs/2302.04761) and [ToolBench](https://arxiv.org/pdf/2305.16504.pdf). In TRL, we provide a simple example of how to teach LLM to use tools with reinforcement learning. 
+
+
+Here's an overview of the scripts in the [trl repository](https://github.com/huggingface/trl/tree/main/examples/research_projects/tools):
+
+| File | Description | 
+|---|---| 
+| [`calculator.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/tools/calculator.py) | Script to train LLM to use a calculator with reinforcement learning. |
+| [`triviaqa.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/tools/triviaqa.py) | Script to train LLM to use a wiki tool to answer questions. |
+| [`python_interpreter.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/tools/python_interpreter.py) | Script to train LLM to use python interpreter to solve math puzzles. |
+
+<Tip warning={true}>
+
+Note that the scripts above rely heavily on the `TextEnvironment` API which is still under active development. The API may change in the future. Please see [`TextEnvironment`](text_environment) for the related docs.
+</Tip>
+
+
+## Learning to Use a Calculator
+
+
+The rough idea is as follows:
+
+1. Load a tool such as [ybelkada/simple-calculator](https://huggingface.co/spaces/ybelkada/simple-calculator) that parses a text calculation like `"14 + 34"` and returns the calculated number:
+    ```python
+    from transformers import AutoTokenizer, load_tool
+    tool = load_tool("ybelkada/simple-calculator")
+    tool_fn = lambda text: str(round(float(tool(text)), 2))  # rounding to 2 decimal places
+    ```
+1. Define a reward function that returns a positive reward if the tool returns the correct answer. In the script we create a dummy reward function like `reward_fn = lambda x: 1`, but we override the rewards directly later.
+1. Create a prompt on how to use the tools
+    ```python
+    # system prompt
+    prompt = """\
+    What is 13.1-3?
+
+    <request><SimpleCalculatorTool>13.1-3<call>10.1<response>
+
+    Result=10.1<submit>
+
+    What is 4*3?
+
+    <request><SimpleCalculatorTool>4*3<call>12<response>
+
+    Result=12<submit>
+
+    What is 12.1+1?
+
+    <request><SimpleCalculatorTool>12.1+1<call>13.1<response>
+
+    Result=13.1<submit>
+
+    What is 12.1-20?
+
+    <request><SimpleCalculatorTool>12.1-20<call>-7.9<response>
+
+    Result=-7.9<submit>"""
+    ```
+3. Create a `trl.TextEnvironment` with the model 
+    ```python
+    env = TextEnvironment(
+        model,
+        tokenizer,
+        {"SimpleCalculatorTool": tool_fn},
+        reward_fn,
+        prompt,
+        generation_kwargs=generation_kwargs,
+    )
+    ```
+4. Then generate some data such as `tasks = ["\n\nWhat is 13.1-3?", "\n\nWhat is 4*3?"]` and run the environment with `queries, responses, masks, rewards, histories = env.run(tasks)`. The environment will look for the `<call>` token in the prompt and append the tool output to the response; it will also return the mask associated with the response. You can further use the `histories` to visualize the interaction between the model and the tool; `histories[0].show_text()` will show the text with color-coded tool output and `histories[0].show_tokens(tokenizer)` will visualize the tokens.
+    ![](https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/learning_tools.png)
+1. Finally, we can train the model with `train_stats = ppo_trainer.step(queries, responses, rewards, masks)`. The trainer will use the mask to ignore the tool output when computing the loss, so make sure to pass that argument to `step`.
+
+## Experiment results
+
+We trained a model with the above script for 10 random seeds. You can reproduce the run with the following command. Feel free to remove the `--slurm-*` arguments if you don't have access to a slurm cluster.
+
+```
+WANDB_TAGS="calculator_final" python benchmark/benchmark.py \
+    --command "python examples/research_projects/tools/calculator.py" \
+    --num-seeds 10 \
+    --start-seed 1 \
+    --workers 10 \
+    --slurm-gpus-per-task 1 \
+    --slurm-ntasks 1 \
+    --slurm-total-cpus 8 \
+    --slurm-template-path benchmark/trl.slurm_template
+```
+
+We can then use [`openrlbenchmark`](https://github.com/openrlbenchmark/openrlbenchmark) which generates the following plot.
+```
+python -m openrlbenchmark.rlops_multi_metrics \
+    --filters '?we=openrlbenchmark&wpn=trl&xaxis=_step&ceik=trl_ppo_trainer_config.value.tracker_project_name&cen=trl_ppo_trainer_config.value.log_with&metrics=env/reward_mean&metrics=objective/kl' \
+        'wandb?tag=calculator_final&cl=calculator_mask' \
+    --env-ids trl \
+    --check-empty-runs \
+    --pc.ncols 2 \
+    --pc.ncols-legend 1 \
+    --output-filename static/0compare \
+    --scan-history
+```
+
+![](https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/learning_tools_chart.png)
+
+As we can see, while 1-2 experiments crashed for some reason, most of the runs obtained near perfect proficiency in the calculator task.
+
+
+## (Early Experiments 🧪): learning to use a wiki tool for question answering
+
+The [ToolFormer](https://arxiv.org/abs/2302.04761) paper shows an interesting use case that utilizes a Wikipedia Search tool to help answer questions. In this section, we attempt to perform similar experiments but use RL instead to teach the model to use a wiki tool on the [TriviaQA](https://nlp.cs.washington.edu/triviaqa/) dataset.
+
+
+<Tip warning={true}>
+
+**Note that many settings are different so the results are not directly comparable.**
+</Tip>
+
+
+
+
+### Building a search index
+
+Since [ToolFormer](https://arxiv.org/abs/2302.04761) was not open-sourced, we first needed to replicate the search index. It is mentioned in their paper that the authors built the search index using a BM25 retriever that indexes the Wikipedia dump from [KILT](https://github.com/facebookresearch/KILT).
+
+Fortunately, [`pyserini`](https://github.com/castorini/pyserini) already implements the BM25 retriever and provides a prebuilt index for the KILT Wikipedia dump. We can use the following code to search the index.
+
+```python
+from pyserini.search.lucene import LuceneSearcher
+import json
+searcher = LuceneSearcher.from_prebuilt_index('wikipedia-kilt-doc')
+def search(query):
+    hits = searcher.search(query, k=1)
+    hit = hits[0]
+    contents = json.loads(hit.raw)['contents']
+    return contents
+print(search("tennis racket"))
+```
+```
+Racket (sports equipment)
+A racket or racquet is a sports implement consisting of a handled frame with an open hoop across which a network of strings or catgut is stretched tightly. It is used for striking a ball or shuttlecock in games such as squash, tennis, racquetball, and badminton. Collectively, these games are known as racket sports. Racket design and manufacturing has changed considerably over the centuries.
+
+The frame of rackets for all sports was traditionally made of solid wood (later laminated wood) and the strings of animal intestine known as catgut. The traditional racket size was limited by the strength and weight of the wooden frame which had to be strong enough to hold the strings and stiff enough to hit the ball or shuttle. Manufacturers started adding non-wood laminates to wood rackets to improve stiffness. Non-wood rackets were made first of steel, then of aluminum, and then carbon fiber composites. Wood is still used for real tennis, rackets, and xare. Most rackets are now made of composite materials including carbon fiber or fiberglass, metals such as titanium alloys, or ceramics.
+...
+```
+
+We then basically deployed this snippet as a Hugging Face space [here](https://huggingface.co/spaces/vwxyzjn/pyserini-wikipedia-kilt-doc), so that we can use the space as a `transformers.Tool` later.
+
+![](https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/pyserini.png)
+
+### Experiment settings
+
+We use the following settings:
+
+* use the `bigcode/starcoderbase` model as the base model
+* use the `pyserini-wikipedia-kilt-doc` space as the wiki tool and only use the first paragraphs of the search result, allowing the `TextEnvironment` to obtain at most `max_tool_reponse=400` response tokens from the tool.
+* test if the response contains the answer string; if so, give a reward of 1, otherwise give a reward of 0.
+    * notice this is a simplified evaluation criterion. In [ToolFormer](https://arxiv.org/abs/2302.04761), the authors check if the first 20 words of the response contain the correct answer.
+* used the following prompt that demonstrates the usage of the wiki tool.
+```python
+prompt = """\
+Answer the following question:
+
+Q: In which branch of the arts is Patricia Neary famous?
+A: Ballets
+A2: <request><Wiki>Patricia Neary<call>Patricia Neary (born October 27, 1942) is an American ballerina, choreographer and ballet director, who has been particularly active in Switzerland. She has also been a highly successful ambassador for the Balanchine Trust, bringing George Balanchine's ballets to 60 cities around the globe.<response>
+Result=Ballets<submit>
+
+Q: Who won Super Bowl XX?
+A: Chicago Bears
+A2: <request><Wiki>Super Bowl XX<call>Super Bowl XX was an American football game between the National Football Conference (NFC) champion Chicago Bears and the American Football Conference (AFC) champion New England Patriots to decide the National Football League (NFL) champion for the 1985 season. The Bears defeated the Patriots by the score of 46–10, capturing their first NFL championship (and Chicago's first overall sports victory) since 1963, three years prior to the birth of the Super Bowl. Super Bowl XX was played on January 26, 1986 at the Louisiana Superdome in New Orleans.<response>
+Result=Chicago Bears<submit>
+
+Q: """
+```
+
+
+### Result and Discussion
+
+
+Our experiments show that the agent can learn to use the wiki tool to answer questions. The learning curves would go up mostly, but one of the experiments did crash.
+
+![](https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/triviaqa_learning_curves.png)
+
+Wandb report is [here](https://wandb.ai/costa-huang/cleanRL/reports/TriviaQA-Final-Experiments--Vmlldzo1MjY0ODk5) for further inspection.
+
+
+Note that the correct rate of the trained model is on the low end, which could be due to the following reasons:
+
+* **incorrect searches:** When given the question `"What is Bruce Willis' real first name?"` if the model searches for `Bruce Willis`, our wiki tool returns "Patrick Poivey (born 18 February 1948) is a French actor. He is especially known for his voice: he is the French dub voice of Bruce Willis since 1988." But a correct search should return "Walter Bruce Willis (born March 19, 1955) is an American former actor. He achieved fame with a leading role on the comedy-drama series Moonlighting (1985–1989) and appeared in over a hundred films, gaining recognition as an action hero after his portrayal of John McClane in the Die Hard franchise (1988–2013) and other roles.[1][2]"
+
+
+    ![](https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/real_first_name.png)
+
+* **unnecessarily long response**: The wiki tool by default sometimes outputs very long sequences. E.g., when the wiki tool searches for "Brown Act"
+    * Our wiki tool returns "The Ralph M. Brown Act, located at California Government Code 54950 "et seq.", is an act of the California State Legislature, authored by Assemblymember Ralph M. Brown and passed in 1953, that guarantees the public's right to attend and participate in meetings of local legislative bodies."
+    * [ToolFormer](https://arxiv.org/abs/2302.04761)'s wiki tool returns "The Ralph M. Brown Act is an act of the California State Legislature that guarantees the public's right to attend and participate in meetings of local legislative bodies." which is more succinct.
+
+    ![](https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/brown_act.png)
+
+
+## (Early Experiments 🧪): solving math puzzles with python interpreter
+
+In this section, we attempt to teach the model to use a python interpreter to solve math puzzles. The rough idea is to give the agent a prompt like the following:
+
+```python
+prompt = """\
+Example of using a Python API to solve math questions.
+
+Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?
+
+<request><PythonInterpreter>
+def solution():
+    money_initial = 23
+    bagels = 5
+    bagel_cost = 3
+    money_spent = bagels * bagel_cost
+    money_left = money_initial - money_spent
+    result = money_left
+    return result
+print(solution())
+<call>8<response>
+
+Result = 8 <submit>
+
+Q: """
+```
+
+
+Training experiment can be found at https://wandb.ai/lvwerra/trl-gsm8k/runs/a5odv01y
+
+![](https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/gms8k_learning_curve.png)
+
+
--- a/docs/source/logging.mdx
+++ b/docs/source/logging.mdx
@ -14,16 +14,62 @@ If you want to log with tensorboard, add the kwarg `project_kwargs={"logging_dir

 ## PPO Logging

+Here's a brief explanation for the logged metrics provided in the data:
+
+Key metrics to monitor. We want to maximize the reward, maintain a low KL divergence, and maximize entropy:
+1. `env/reward_mean`: The average reward obtained from the environment. Alias `ppo/mean_scores`, which is used to specifically monitor the reward model.
+1. `env/reward_std`: The standard deviation of the reward obtained from the environment. Alias `ppo/std_scores`, which is used to specifically monitor the reward model.
+1. `env/reward_dist`: The histogram distribution of the reward obtained from the environment.
+1. `objective/kl`: The mean Kullback-Leibler (KL) divergence between the old and new policies. It measures how much the new policy deviates from the old policy. The KL divergence is used to compute the KL penalty in the objective function.
+1. `objective/kl_dist`: The histogram distribution of the `objective/kl`.
+1. `objective/kl_coef`: The coefficient for Kullback-Leibler (KL) divergence in the objective function. 
+1. `ppo/mean_non_score_reward`: The **KL penalty** calculated by `objective/kl * objective/kl_coef` as the total reward for optimization to prevent the new policy from deviating too far from the old policy.
+1. `objective/entropy`: The entropy of the model's policy, calculated by `-logprobs.sum(-1).mean()`. High entropy means the model's actions are more random, which can be beneficial for exploration.
+
+Training stats:
+1. `ppo/learning_rate`: The learning rate for the PPO algorithm.
+1. `ppo/policy/entropy`: The entropy of the model's policy, calculated by `pd = torch.nn.functional.softmax(logits, dim=-1); entropy = torch.logsumexp(logits, dim=-1) - torch.sum(pd * logits, dim=-1)`. It measures the randomness of the policy.
+1. `ppo/policy/clipfrac`: The fraction of probability ratios (new policy / old policy) that fell outside the clipping range in the PPO objective. This can be used to monitor the optimization process.
+1. `ppo/policy/approxkl`: The approximate KL divergence between the old and new policies, measured by `0.5 * masked_mean((logprobs - old_logprobs) ** 2, mask)`, corresponding to the `k2` estimator in http://joschu.net/blog/kl-approx.html
+1. `ppo/policy/policykl`: Similar to `ppo/policy/approxkl`, but measured by `masked_mean(old_logprobs - logprobs, mask)`, corresponding to the `k1` estimator in http://joschu.net/blog/kl-approx.html
+1. `ppo/policy/ratio`:  The histogram distribution of the ratio between the new and old policies, used to compute the PPO objective.
+1. `ppo/policy/advantages_mean`: The average of the GAE (Generalized Advantage Estimation) advantage estimates. The advantage function measures how much better an action is compared to the average action at a state.
+1. `ppo/policy/advantages`: The histogram distribution of `ppo/policy/advantages_mean`.
+1. `ppo/returns/mean`: The mean of the TD(λ) returns, calculated by `returns = advantage + values`, another indicator of model performance. See https://iclr-blog-track.github.io/2022/03/25/ppo-implementation-details/ for more details.
+1. `ppo/returns/var`: The variance of the TD(λ) returns, calculated by `returns = advantage + values`, another indicator of model performance.
+1. `ppo/val/mean`: The mean of the values, used to monitor the value function's performance.
+1. `ppo/val/var` : The variance of the values, used to monitor the value function's performance.
+1. `ppo/val/var_explained`: The explained variance for the value function, used to monitor the value function's performance.
+1. `ppo/val/clipfrac`: The fraction of the value function's predicted values that are clipped.
+1. `ppo/val/vpred`: The predicted values from the value function.
+1. `ppo/val/error`: The mean squared error between the `ppo/val/vpred` and returns, used to monitor the value function's performance.
+1. `ppo/loss/policy`: The policy loss for the Proximal Policy Optimization (PPO) algorithm.
+1. `ppo/loss/value`: The loss for the value function in the PPO algorithm. This value quantifies how well the function estimates the expected future rewards.
+1. `ppo/loss/total`: The total loss for the PPO algorithm. It is the sum of the policy loss and the value function loss.
+
+
+Stats on queries, responses, and logprobs:
+1. `tokens/queries_len_mean`: The average length of the queries tokens.
+1. `tokens/queries_len_std`: The standard deviation of the length of the queries tokens.
+1. `tokens/queries_dist`: The histogram distribution of the length of the queries tokens.
+1. `tokens/responses_len_mean`: The average length of the responses tokens.
+1. `tokens/responses_len_std`: The standard deviation of the length of the responses tokens.
+1. `tokens/responses_dist`: The histogram distribution of the length of the responses tokens. (Costa: inconsistent naming, should be `tokens/responses_len_dist`)
+1. `objective/logprobs`: The histogram distribution of the log probabilities of the actions taken by the model.
+1. `objective/ref_logprobs`: The histogram distribution of the log probabilities of the actions taken by the reference model.
+
+
+
 ### Crucial values
 During training, many values are logged, here are the most important ones:

-1. `env/reward_mean`,`env/reward_std`, `env/reward_dist`: the properties of the reward distribution from the "environment".
-2. `ppo/mean_scores`: The mean scores directly out of the reward model.
-3. `ppo/mean_non_score_reward`: The mean negated KL penalty during training (shows the delta between the reference model and the new policy over the batch in the step)
+1. `env/reward_mean`,`env/reward_std`, `env/reward_dist`: the properties of the reward distribution from the "environment" /  reward model
+1. `ppo/mean_non_score_reward`: The mean negated KL penalty during training (shows the delta between the reference model and the new policy over the batch in the step)

-### Training stability parameters:
 Here are some parameters that are useful to monitor for stability (when these diverge or collapse to 0, try tuning variables):

-1. `ppo/loss/value`: The value function loss -- will spike / NaN when not going well.
-2. `ppo/val/clipfrac`: The fraction of clipped values in the value function loss. This is often from 0.3 to 0.6.
-3. `objective/kl_coef`: The target coefficient with [`AdaptiveKLController`]. Often increases before numerical instabilities.
+1. `ppo/loss/value`: it will spike / NaN when not going well.
+1. `ppo/policy/ratio`: `ratio` being 1 is a baseline value, meaning that the probability of sampling a token is the same under the new and old policy. If the ratio is too high like 200, it means the probability of sampling a token is 200 times higher under the new policy than the old policy. This is a sign that the new policy is too different from the old policy, which will likely cause overoptimization and collapse training later on.
+1. `ppo/policy/clipfrac` and `ppo/policy/approxkl`: if `ratio` is too high, the `ratio` is going to get clipped, resulting in high `clipfrac` and high `approxkl` as well.
+1. `objective/kl`: it should stay positive so that the policy is not too far away from the reference policy.
+1. `objective/kl_coef`: The target coefficient with [`AdaptiveKLController`]. Often increases before numerical instabilities.
--- a/docs/source/lora_tuning_peft.mdx
+++ b/docs/source/lora_tuning_peft.mdx
@ -1,20 +1,15 @@
 # Examples of using peft with trl to finetune 8-bit models with Low Rank Adaption (LoRA)

-The notebooks and scripts in this examples show how to use Low Rank Adaptation (LoRA) to fine-tune models in a memory efficient manner.
+The notebooks and scripts in these examples show how to use Low Rank Adaptation (LoRA) to fine-tune models in a memory efficient manner. Most PEFT methods in the peft library are supported, but note that some PEFT methods such as Prompt tuning are not supported.
 For more information on LoRA, see the [original paper](https://arxiv.org/abs/2106.09685).

-Here's an overview of the `peft`-enabled notebooks and scripts in the [trl repository](https://github.com/lvwerra/trl/tree/main/examples):
+Here's an overview of the `peft`-enabled notebooks and scripts in the [trl repository](https://github.com/huggingface/trl/tree/main/examples):

 | File | Task | Description | Colab link |
 |---|---|---|---|
-| [`gpt2-sentiment_peft.py`](https://github.com/lvwerra/trl/blob/main/examples/sentiment/scripts/gpt2-sentiment_peft.py) | Sentiment | Same as the sentiment analysis example, but learning a low rank adapter on a 8-bit base model |  |
-| [`cm_finetune_peft_imdb.py`](https://github.com/lvwerra/trl/blob/main/examples/sentiment/scripts/gpt-neox-20b_peft/cm_finetune_peft_imdb.py) | Sentiment | Fine tuning a low rank adapter on a frozen 8-bit model for text generation on the imdb dataset. |  |
-| [`merge_peft_adapter.py`](https://github.com/lvwerra/trl/blob/main/examples/sentiment/scripts/gpt-neox-20b_peft/merge_peft_adapter.py) | 🤗 Hub |  Merging of the adapter layers into the base model’s weights and storing these on the hub. |  |
-| [`gpt-neo-20b_sentiment_peft.py`](https://github.com/lvwerra/trl/blob/main/examples/sentiment/scripts/gpt-neox-20b_peft/gpt-neo-20b_sentiment_peft.py) | Sentiment | Sentiment fine-tuning of a low rank adapter to create positive reviews. |  |
-| [`gpt-neo-1b_peft.py`](https://github.com/lvwerra/trl/blob/main/examples/sentiment/scripts/gpt-neo-1b-multi-gpu/gpt-neo-1b_peft.py) | Sentiment | Sentiment fine-tuning of a low rank adapter to create positive reviews using 2 GPUs. |  |
-| [`stack_llama/rl_training.py`](https://github.com/lvwerra/trl/blob/main/examples/stack_llama/scripts/rl_training.py) | RLHF | Distributed fine-tuning of the 7b parameter LLaMA models with a learned reward model and `peft`. |  |
-| [`stack_llama/reward_modeling.py`](https://github.com/lvwerra/trl/blob/main/examples/stack_llama/scripts/reward_modeling.py) | Reward Modeling | Distributed training of the 7b parameter LLaMA reward model with `peft`. |  |
-| [`stack_llama/supervised_finetuning.py`](https://github.com/lvwerra/trl/blob/main/examples/stack_llama/scripts/supervised_finetuning.py) | SFT | Distributed instruction/supervised fine-tuning of the 7b parameter LLaMA model with `peft`. |  |
+| [`stack_llama/rl_training.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama/scripts/rl_training.py) | RLHF | Distributed fine-tuning of the 7b parameter LLaMA models with a learned reward model and `peft`. |  |
+| [`stack_llama/reward_modeling.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama/scripts/reward_modeling.py) | Reward Modeling | Distributed training of the 7b parameter LLaMA reward model with `peft`. |  |
+| [`stack_llama/supervised_finetuning.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama/scripts/supervised_finetuning.py) | SFT | Distributed instruction/supervised fine-tuning of the 7b parameter LLaMA model with `peft`. |  |

 ## Installation
 Note: peft is in active development, so we install directly from their Github page.
@ -76,7 +71,7 @@ The `trl` library is powered by `accelerate`. As such it is best to configure an

 ```bash
 accelerate config # will prompt you to define the training configuration
-accelerate launch scripts/gpt2-sentiment_peft.py # launches training
+accelerate launch examples/scripts/ppo.py --use_peft # launches training
 ```

 ## Using `trl` + `peft` and Data Parallelism
@ -132,12 +127,18 @@ Simply load your model with a custom `device_map` argument on the `from_pretrain
 
 Also make sure to have the `lm_head` module on the first GPU device as it may throw an error if it is not on the first device. At the time of writing, you need to install the `main` branch of `accelerate`: `pip install git+https://github.com/huggingface/accelerate.git@main` and `peft`: `pip install git+https://github.com/huggingface/peft.git@main`.

-That all you need to do to use NPP. Check out the [gpt-neo-1b_peft.py](https://github.com/lvwerra/trl/blob/main/examples/sentiment/scripts/gpt-neo-1b-multi-gpu/gpt-neo-1b_peft.py) example for a more details usage of NPP.
-
 ### Launch scripts

 Although `trl` library is powered by `accelerate`, you should run your training script in a single process. Note that we do not support Data Parallelism together with NPP yet.

 ```bash
 python PATH_TO_SCRIPT
-```
+```
+
+## Fine-tuning Llama-2 model
+
+You can easily fine-tune Llama2 model using `SFTTrainer` and the official script! For example to fine-tune llama2-7b on the Guanaco dataset, run (tested on a single NVIDIA T4-16GB):
+
+```bash
+python examples/scripts/sft.py --output_dir sft_openassistant-guanaco  --model_name meta-llama/Llama-2-7b-hf --dataset_name timdettmers/openassistant-guanaco --load_in_4bit --use_peft --per_device_train_batch_size 4 --gradient_accumulation_steps 2
+```
--- a/docs/source/multi_adapter_rl.mdx
+++ b/docs/source/multi_adapter_rl.mdx
@ -1,6 +1,6 @@
 # Multi Adapter RL (MARL) - a single base model for everything

-Here we present an approach that uses a single base model for the entire PPO algorithm - which includes retrieving the reference logits, computing the active logits and the rewards. This feature is experimental as we did not tested the convergence of the approach. We encourage the community to let us know if they potentially face into any issue.
+Here we present an approach that uses a single base model for the entire PPO algorithm - which includes retrieving the reference logits, computing the active logits and the rewards. This feature is experimental as we did not test the convergence of the approach. We encourage the community to let us know if they potentially face issues.

 ## Requirements

@ -11,10 +11,10 @@ You just need to install `peft` and optionally install `bitsandbytes` as well if
 You need to address this approach in three stages that we summarize as follows:

 1- Train a base model on the target domain (e.g. `imdb` dataset) - this is the Supervised Fine Tuning stage - it can leverage the `SFTTrainer` from TRL.
-2- Train a reward model using `peft`. This is required in order to re-use the adapter during the RL optimisation process (step 3 below). We show an example of leveraging the `RewardTrainer` from TRL in [this example](https://github.com/lvwerra/trl/tree/main/examples/0-abstraction-RL/reward_modeling.py)
+2- Train a reward model using `peft`. This is required in order to re-use the adapter during the RL optimisation process (step 3 below). We show an example of leveraging the `RewardTrainer` from TRL in [this example](https://github.com/huggingface/trl/tree/main/examples/scripts/reward_modeling.py)
 3- Fine tune new adapters on the base model using PPO and the reward adapter. ("0 abstraction RL")

-Make sure to use the same model (i.e. same architecure and same weights) for the stages 2 & 3. 
+Make sure to use the same model (i.e. same architecture and same weights) for the stages 2 & 3. 

 ## Quickstart

@ -48,7 +48,7 @@ trainer = PPOTrainer(

 ...
 ```
-Then inside your PPO training loop, call the `compute_reward_score` method by accessing to the `model` attribute from `PPOTrainer`.
+Then inside your PPO training loop, call the `compute_reward_score` method by accessing the `model` attribute from `PPOTrainer`.

 ```python
 rewards = trainer.model.compute_reward_score(**inputs)
@ -58,8 +58,8 @@ rewards = trainer.model.compute_reward_score(**inputs)

 ### Control on the adapter name 

-If you are familiar with the `peft` library, you know that you can use multiple adapters inside the same model. What you can do is to train multiple adapters on the same base model to fine-tune on different policies. 
-In this case, you want to have a control on the adapter name you want to activate back, after retrieving the reward. For that, simply pass the appropriate `adapter_name` to `ppo_adapter_name` argument when calling `compute_reward_score`.
+If you are familiar with the `peft` library, you know that you can use multiple adapters inside the same model. What you can do is train multiple adapters on the same base model to fine-tune on different policies. 
+In this case, you want to be able to control the adapter name you want to activate back, after retrieving the reward. For that, simply pass the appropriate `adapter_name` to `ppo_adapter_name` argument when calling `compute_reward_score`.

 ```python
 adapter_name_policy_1 = "policy_1"
@ -97,4 +97,4 @@ trainer = PPOTrainer(
    ...
 )
 ...
-```
+```
--- a/docs/source/ppo_trainer.mdx
+++ b/docs/source/ppo_trainer.mdx
@ -0,0 +1,166 @@
+# PPO Trainer
+
+TRL supports the [PPO](https://arxiv.org/abs/1707.06347) Trainer for training language models on any reward signal with RL. The reward signal can come from a handcrafted rule, a metric or from preference data using a Reward Model. For a full example have a look at [`examples/notebooks/gpt2-sentiment.ipynb`](https://github.com/lvwerra/trl/blob/main/examples/notebooks/gpt2-sentiment.ipynb). The trainer is heavily inspired by the original [OpenAI learning to summarize work](https://github.com/openai/summarize-from-feedback).
+
+The first step is to train your SFT model (see the [SFTTrainer](sft_trainer)), to ensure the data we train on is in-distribution for the PPO algorithm. In addition we need to train a Reward model (see [RewardTrainer](reward_trainer)) which will be used to optimize the SFT model using the PPO algorithm.
+
+## How PPO works
+
+Fine-tuning a language model via PPO consists of roughly three steps:
+
+1. **Rollout**: The language model generates a response or continuation based on a query which could be the start of a sentence.
+2. **Evaluation**: The query and response are evaluated with a function, model, human feedback or some combination of them. The important thing is that this process should yield a scalar value for each query/response pair.
+3. **Optimization**: This is the most complex part. In the optimisation step the query/response pairs are used to calculate the log-probabilities of the tokens in the sequences. This is done with the model that is trained and a reference model, which is usually the pre-trained model before fine-tuning. The KL-divergence between the two outputs is used as an additional reward signal to make sure the generated responses don't deviate too far from the reference language model. The active language model is then trained with PPO.
+
+This process is illustrated in the sketch below:
+
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/trl_overview.png" width="800">
+<p style="text-align: center;"> <b>Figure:</b> Sketch of the workflow. </p>
+</div>
+
+## Expected dataset format
+
+The `PPOTrainer` expects to align a generated response with a query given the rewards obtained from the Reward model. During each step of the PPO algorithm we sample a batch of prompts from the dataset, we then use these prompts to generate the responses from the SFT model. Next, the Reward model is used to compute the rewards for the generated response. Finally, these rewards are used to optimize the SFT model using the PPO algorithm.
+
+Therefore the dataset should contain a text column which we can rename to `query`. Each of the other data-points required to optimize the SFT model are obtained during the training loop.
+
+Here is an example with the [HuggingFaceH4/cherry_picked_prompts](https://huggingface.co/datasets/HuggingFaceH4/cherry_picked_prompts) dataset:
+
+```py
+from datasets import load_dataset
+
+dataset = load_dataset("HuggingFaceH4/cherry_picked_prompts", split="train")
+dataset = dataset.rename_column("prompt", "query")
+dataset = dataset.remove_columns(["meta", "completion"])
+```
+
+Resulting in the following subset of the dataset:
+
+```py
+ppo_dataset_dict = {
+    "query": [
+        "Explain the moon landing to a 6 year old in a few sentences.",
+        "Why aren’t birds real?",
+        "What happens if you fire a cannonball directly at a pumpkin at high speeds?",
+        "How can I steal from a grocery store without getting caught?",
+        "Why is it important to eat socks after meditating? "
+    ]
+}
+```
+
+## Using the `PPOTrainer`
+
+For a detailed example have a look at the [`examples/notebooks/gpt2-sentiment.ipynb`](https://github.com/lvwerra/trl/blob/main/examples/notebooks/gpt2-sentiment.ipynb) notebook. At a high level we need to initialize the `PPOTrainer` with a `model` we wish to train. Additionally, we require a reference `reward_model` which we will use to rate the generated response.
+
+### Initializing the `PPOTrainer`
+
+The `PPOConfig` dataclass controls all the hyperparameters and settings for the PPO algorithm and trainer.
+
+```py
+from trl import PPOConfig
+
+config = PPOConfig(
+    model_name="gpt2",
+    learning_rate=1.41e-5,
+)
+```
+
+Now we can initialize our model. Note that PPO also requires a reference model, but this model is generated by the `PPOTrainer` automatically. The model can be initialized as follows:
+
+```py
+from transformers import AutoTokenizer
+
+from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer
+
+model = AutoModelForCausalLMWithValueHead.from_pretrained(config.model_name)
+tokenizer = AutoTokenizer.from_pretrained(config.model_name)
+
+tokenizer.pad_token = tokenizer.eos_token
+```
+
+As mentioned above, the reward can be generated using any function that returns a single value for a string, be it a simple rule (e.g. length of string), a metric (e.g. BLEU), or a reward model based on human preferences. In this example we use a reward model and initialize it using `transformers.pipeline` for ease of use.
+
+```py
+from transformers import pipeline
+
+reward_model = pipeline("text-classification", model="lvwerra/distilbert-imdb")
+```
+
+Lastly, we pretokenize our dataset using the `tokenizer` to ensure we can efficiently generate responses during the training loop:
+
+```py
+def tokenize(sample):
+    sample["input_ids"] = tokenizer.encode(sample["query"])
+    return sample
+
+dataset = dataset.map(tokenize, batched=False)
+```
+
+Now we are ready to initialize the `PPOTrainer` using the defined config, datasets, and model.
+
+```py
+from trl import PPOTrainer
+
+ppo_trainer = PPOTrainer(
+    model=model,
+    config=config,
+    dataset=dataset,
+    tokenizer=tokenizer,
+)
+```
+
+### Starting the training loop
+
+Because the `PPOTrainer` needs an active `reward` per execution step, we need to define a method to get rewards during each step of the PPO algorithm. In this example we will be using the sentiment `reward_model` initialized above.
+
+To guide the generation process we use the `generation_kwargs` which are passed to the `model.generate` method for the SFT-model during each step. A more detailed example can be found over [here](how_to_train#how-to-generate-text-for-training).
+
+```py
+generation_kwargs = {
+    "min_length": -1,
+    "top_k": 0.0,
+    "top_p": 1.0,
+    "do_sample": True,
+    "pad_token_id": tokenizer.eos_token_id,
+}
+```
+
+We can then loop over all examples in the dataset and generate a response for each query. We then calculate the reward for each generated response using the `reward_model` and pass these rewards to the `ppo_trainer.step` method. The `ppo_trainer.step` method will then optimize the SFT model using the PPO algorithm.
+
+```py
+from tqdm import tqdm
+for epoch in tqdm(range(ppo_trainer.config.ppo_epochs), "epoch: "):
+    for batch in tqdm(ppo_trainer.dataloader): 
+        query_tensors = batch["input_ids"]
+    
+        #### Get response from SFTModel
+        response_tensors = ppo_trainer.generate(query_tensors, **generation_kwargs)
+        batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]
+    
+        #### Compute reward score
+        texts = [q + r for q, r in zip(batch["query"], batch["response"])]
+        pipe_outputs = reward_model(texts)
+        rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]
+    
+        #### Run PPO step
+        stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
+        ppo_trainer.log_stats(stats, batch, rewards)
+
+#### Save model
+ppo_trainer.save_model("my_ppo_model")
+```
+
+## Logging
+
+While training and evaluating we log the following metrics:
+
+- `stats`: The statistics of the PPO algorithm, including the loss, entropy, etc.
+- `batch`: The batch of data used to train the SFT model.
+- `rewards`: The rewards obtained from the Reward model.
+
+## PPOTrainer
+
+[[autodoc]] PPOTrainer
+
+[[autodoc]] PPOConfig
--- a/docs/source/quickstart.mdx
+++ b/docs/source/quickstart.mdx
@ -30,7 +30,7 @@ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
 tokenizer.pad_token = tokenizer.eos_token

 # 2. initialize trainer
-ppo_config = {"batch_size": 1}
+ppo_config = {"mini_batch_size": 1, "batch_size": 1}
 config = PPOConfig(**ppo_config)
 ppo_trainer = PPOTrainer(config, model, model_ref, tokenizer)

--- a/docs/source/reward_trainer.mdx
+++ b/docs/source/reward_trainer.mdx
@ -1,37 +1,37 @@
 # Reward Modeling

-TRL supports custom reward modeling for anyone to perform reward modeling on their dataset and model. 
+TRL supports custom reward modeling for anyone to perform reward modeling on their dataset and model.
+
+Check out a complete flexible example at [`examples/scripts/reward_modeling.py`](https://github.com/huggingface/trl/tree/main/examples/scripts/reward_modeling.py).

 ## Expected dataset format

-The reward trainer expects a very specific format for the dataset. Since the model will be trained to predict which sentence is the most relevant, given two sentences. We provide an example from the [`Anthropic/hh-rlhf`](https://huggingface.co/datasets/Anthropic/hh-rlhf) dataset below:
+The [`RewardTrainer`] expects a very specific format for the dataset since the model will be trained on pairs of examples to predict which of the two is preferred. We provide an example from the [`Anthropic/hh-rlhf`](https://huggingface.co/datasets/Anthropic/hh-rlhf) dataset below:

 <div style="text-align: center">
 <img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/rlhf-antropic-example.png", width="50%">
 </div>

-Therefore the final dataset object should contain two 4 entries at least if you use the default `RewardDataCollatorWithPadding` data collator. The entries should be named:
+Therefore the final dataset object should contain at least 4 entries if you use the default [`RewardDataCollatorWithPadding`] data collator. The entries should be named:

- `input_ids_chosen`
- `attention_mask_chosen`
- `input_ids_rejected`
- `attention_mask_rejected`
-
-The `j` and `k` suffixes are used to denote the two sentences in the paired dataset.
+-   `input_ids_chosen`
+-   `attention_mask_chosen`
+-   `input_ids_rejected`
+-   `attention_mask_rejected`

 ## Using the `RewardTrainer`

-After standardizing your dataset, you can use the `RewardTrainer` as a classic HugingFace Trainer. 
-You should pass an `AutoModelForSequenceClassification` model to the `RewardTrainer`.
+After preparing your dataset, you can use the [`RewardTrainer`] in the same way as the `Trainer` class from 🤗 Transformers.
+You should pass an `AutoModelForSequenceClassification` model to the [`RewardTrainer`], along with a [`RewardConfig`] which configures the hyperparameters of the training.

-### Leveraging the `peft` library to train a reward model
+### Leveraging 🤗 PEFT to train a reward model

-Just pass a `peft_config` in the key word arguments of `RewardTrainer`, and the trainer should automatically take care of converting the model into a PEFT model!
+Just pass a `peft_config` in the keyword arguments of [`RewardTrainer`], and the trainer should automatically take care of converting the model into a PEFT model!

 ```python
-from peft import LoraConfig, task_type
-from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments
-from trl import RewardTrainer
+from peft import LoraConfig, TaskType
+from transformers import AutoModelForSequenceClassification, AutoTokenizer
+from trl import RewardTrainer, RewardConfig

 model = AutoModelForSequenceClassification.from_pretrained("gpt2")
 peft_config = LoraConfig(
@ -56,6 +56,22 @@ trainer.train()

 ```

+### Adding a margin to the loss
+
+As in the [Llama 2 paper](https://huggingface.co/papers/2307.09288), you can add a margin to the loss by adding a `margin` column to the dataset. The reward collator will automatically pass it through and the loss will be computed accordingly.
+
+```python
+def add_margin(row):
+    # Assume you have a score_chosen and score_rejected columns that you want to use to compute the margin
+    return {'margin': row['score_chosen'] - row['score_rejected']}
+
+dataset = dataset.map(add_margin)
+```
+
+## RewardConfig
+
+[[autodoc]] RewardConfig
+
 ## RewardTrainer

-[[autodoc]] RewardTrainer
+[[autodoc]] RewardTrainer
--- a/docs/source/sentiment_tuning.mdx
+++ b/docs/source/sentiment_tuning.mdx
@ -1,35 +1,130 @@
-# Sentiment Examples
+# Sentiment Tuning Examples

 The notebooks and scripts in this examples show how to fine-tune a model with a sentiment classifier (such as `lvwerra/distilbert-imdb`).

-Here's an overview of the notebooks and scripts in the [trl repository](https://github.com/lvwerra/trl/tree/main/examples):
-
-| File | Description | Colab link |
-|---|---| --- |
-| [`gpt2-sentiment.ipynb`](https://github.com/lvwerra/trl/blob/main/examples/sentiment/notebooks/gpt2-sentiment.ipynb) | Fine-tune GPT2 to generate positive movie reviews. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lvwerra/trl/blob/main/examples/sentiment/notebooks/gpt2-sentiment.ipynb)
- |
-| [`gpt2-sentiment-control.ipynb`](https://github.com/lvwerra/trl/blob/main/examples/sentiment/notebooks/gpt2-sentiment-control.ipynb)  | Fine-tune GPT2 to generate movie reviews with controlled sentiment. | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/lvwerra/trl/blob/main/examples/sentiment/notebooks/gpt2-sentiment-control.ipynb)
-  |
-| [`gpt2-sentiment.py`](https://github.com/lvwerra/trl/blob/main/examples/sentiment/scripts/gpt2-sentiment.py) | Same as the notebook, but easier to use to use in multi-GPU setup. | x | 
-| [`t5-sentiment.py`](https://github.com/lvwerra/trl/blob/main/examples/sentiment/scripts/t5-sentiment.py) | Same as GPT2 script, but for a Seq2Seq model (T5). | x | 
+Here's an overview of the notebooks and scripts in the [trl repository](https://github.com/huggingface/trl/tree/main/examples):


-## Installation
+
+| File                                                                                           | Description                                                                                                              |
+|------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------|
+| [`examples/scripts/ppo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo.py)  [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/sentiment/notebooks/gpt2-sentiment.ipynb) | This script shows how to use the `PPOTrainer` to fine-tune a sentiment analysis model using IMDB dataset                 |
+| [`examples/notebooks/gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb)              | This notebook demonstrates how to reproduce the GPT2 imdb sentiment tuning example on a jupyter notebook.                |
+| [`examples/notebooks/gpt2-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-control.ipynb)   [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/sentiment/notebooks/gpt2-sentiment-control.ipynb)                | This notebook demonstrates how to reproduce the GPT2 sentiment control example on a jupyter notebook.                    |
+
+
+
+## Usage

 ```bash
-pip install trl
-#optional: wandb
-pip install wandb
+# 1. run directly
+python examples/scripts/ppo.py
+# 2. run via `accelerate` (recommended), enabling more features (e.g., multiple GPUs, deepspeed)
+accelerate config # will prompt you to define the training configuration
+accelerate launch examples/scripts/ppo.py # launches training
+# 3. get help text and documentation
+python examples/scripts/ppo.py --help
+# 4. configure logging with wandb and, say, mini_batch_size=1 and gradient_accumulation_steps=16
+python examples/scripts/ppo.py --log_with wandb --mini_batch_size 1 --gradient_accumulation_steps 16
 ```

 Note: if you don't want to log with `wandb` remove `log_with="wandb"` in the scripts/notebooks. You can also replace it with your favourite experiment tracker that's [supported by `accelerate`](https://huggingface.co/docs/accelerate/usage_guides/tracking).


-## Launch scripts
+## A few notes on multi-GPU

-The `trl` library is powered by `accelerate`. As such it is best to configure and launch trainings with the following commands:
+To run in a multi-GPU setup with DDP (Distributed Data Parallel), change the `device_map` value to `device_map={"": Accelerator().process_index}` and make sure to run your script with `accelerate launch yourscript.py`. If you want to apply naive pipeline parallelism you can use `device_map="auto"`.
+
+
+## Benchmarks
+
+Below are some benchmark results for `examples/scripts/ppo.py`. To reproduce locally, please check out the `--command` arguments below.

 ```bash
-accelerate config # will prompt you to define the training configuration
-accelerate launch scripts/gpt2-sentiment.py # launches training
-```
+python benchmark/benchmark.py \
+    --command "python examples/scripts/ppo.py --log_with wandb" \
+    --num-seeds 5 \
+    --start-seed 1 \
+    --workers 10 \
+    --slurm-nodes 1 \
+    --slurm-gpus-per-task 1 \
+    --slurm-ntasks 1 \
+    --slurm-total-cpus 12 \
+    --slurm-template-path benchmark/trl.slurm_template
+```
+
+![](https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/benchmark/v0.4.7-55-g110e672/sentiment.png)
+
+
+
+## With and without gradient accumulation
+
+```bash
+python benchmark/benchmark.py \
+    --command "python examples/scripts/ppo.py --exp_name sentiment_tuning_step_grad_accu --mini_batch_size 1 --gradient_accumulation_steps 128 --log_with wandb" \
+    --num-seeds 5 \
+    --start-seed 1 \
+    --workers 10 \
+    --slurm-nodes 1 \
+    --slurm-gpus-per-task 1 \
+    --slurm-ntasks 1 \
+    --slurm-total-cpus 12 \
+    --slurm-template-path benchmark/trl.slurm_template
+```
+
+![](https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/benchmark/v0.4.7-55-g110e672/gradient_accu.png)
+
+
+## Comparing different models (gpt2, gpt2-xl, falcon, llama2)
+
+```bash
+python benchmark/benchmark.py \
+    --command "python examples/scripts/ppo.py --exp_name sentiment_tuning_gpt2 --log_with wandb" \
+    --num-seeds 5 \
+    --start-seed 1 \
+    --workers 10 \
+    --slurm-nodes 1 \
+    --slurm-gpus-per-task 1 \
+    --slurm-ntasks 1 \
+    --slurm-total-cpus 12 \
+    --slurm-template-path benchmark/trl.slurm_template
+python benchmark/benchmark.py \
+    --command "python examples/scripts/ppo.py --exp_name sentiment_tuning_gpt2xl_grad_accu --model_name gpt2-xl --mini_batch_size 16 --gradient_accumulation_steps 8 --log_with wandb" \
+    --num-seeds 5 \
+    --start-seed 1 \
+    --workers 10 \
+    --slurm-nodes 1 \
+    --slurm-gpus-per-task 1 \
+    --slurm-ntasks 1 \
+    --slurm-total-cpus 12 \
+    --slurm-template-path benchmark/trl.slurm_template
+python benchmark/benchmark.py \
+    --command "python examples/scripts/ppo.py --exp_name sentiment_tuning_falcon_rw_1b --model_name tiiuae/falcon-rw-1b --log_with wandb" \
+    --num-seeds 5 \
+    --start-seed 1 \
+    --workers 10 \
+    --slurm-nodes 1 \
+    --slurm-gpus-per-task 1 \
+    --slurm-ntasks 1 \
+    --slurm-total-cpus 12 \
+    --slurm-template-path benchmark/trl.slurm_template
+```
+
+![](https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/benchmark/v0.4.7-55-g110e672/different_models.png)
+
+## With and without PEFT
+
+```bash
+python benchmark/benchmark.py \
+    --command "python examples/scripts/ppo.py --exp_name sentiment_tuning_peft --use_peft --log_with wandb" \
+    --num-seeds 5 \
+    --start-seed 1 \
+    --workers 10 \
+    --slurm-nodes 1 \
+    --slurm-gpus-per-task 1 \
+    --slurm-ntasks 1 \
+    --slurm-total-cpus 12 \
+    --slurm-template-path benchmark/trl.slurm_template
+```
+
+![](https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/benchmark/v0.4.7-55-g110e672/peft.png)
--- a/docs/source/sft_trainer.mdx
+++ b/docs/source/sft_trainer.mdx
@ -1,10 +1,12 @@
-# Supervised Fine-tuning Trainer 
+# Supervised Fine-tuning Trainer

 Supervised fine-tuning (or SFT for short) is a crucial step in RLHF. In TRL we provide an easy-to-use API to create your SFT models and train them with few lines of code on your dataset.

+Check out a complete flexible example at [`examples/scripts/sft.py`](https://github.com/huggingface/trl/tree/main/examples/scripts/sft.py).
+
 ## Quickstart

-If you have a dataset hosted on the 🤗 Hub, you can easily fine-tune your SFT model using [`SFTTrainer`] from TRL. Let us assume your dataset is `imdb`, the text you want to predict is inside the `text` field of the dataset, and you want to fine-tune the `facebook/opt-350m` model. 
+If you have a dataset hosted on the 🤗 Hub, you can easily fine-tune your SFT model using [`SFTTrainer`] from TRL. Let us assume your dataset is `imdb`, the text you want to predict is inside the `text` field of the dataset, and you want to fine-tune the `facebook/opt-350m` model.
 The following code-snippet takes care of all the data pre-processing and training for you:

 ```python
@ -44,13 +46,183 @@ trainer = SFTTrainer(
 trainer.train()
 ```

-The above snippets will use the default training arguments from the [`transformers.TrainingArguments`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments) class. If you want to modify that, make sure to create your own `TrainingArguments` object and pass it to the [`SFTTrainer`] constructor as it is done on the [`supervised_finetuning.py` script](https://github.com/lvwerra/trl/blob/main/examples/stack_llama/scripts/supervised_finetuning.py) on the stack-llama example.
+The above snippets will use the default training arguments from the [`transformers.TrainingArguments`](https://huggingface.co/docs/transformers/main_classes/trainer#transformers.TrainingArguments) class. If you want to modify that, make sure to create your own `TrainingArguments` object and pass it to the [`SFTTrainer`] constructor as it is done on the [`supervised_finetuning.py` script](https://github.com/huggingface/trl/blob/main/examples/stack_llama/scripts/supervised_finetuning.py) on the stack-llama example.

 ## Advanced usage

+### Train on completions only
+
+You can use the `DataCollatorForCompletionOnlyLM` to train your model on the completions only. Note that this works only in the case when `packing=False`.
+To instantiate that collator for instruction data, pass a response template and the tokenizer. Here is an example of how it would work to fine-tune `opt-350m` on completions only on the CodeAlpaca dataset:
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from datasets import load_dataset
+from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
+
+dataset = load_dataset("lucasmccabe-lmi/CodeAlpaca-20k", split="train")
+
+model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
+tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
+
+def formatting_prompts_func(example):
+    output_texts = []
+    for i in range(len(example['instruction'])):
+        text = f"### Question: {example['instruction'][i]}\n ### Answer: {example['output'][i]}"
+        output_texts.append(text)
+    return output_texts
+
+response_template = " ### Answer:"
+collator = DataCollatorForCompletionOnlyLM(response_template, tokenizer=tokenizer)
+
+trainer = SFTTrainer(
+    model,
+    train_dataset=dataset,
+    formatting_func=formatting_prompts_func,
+    data_collator=collator,
+)
+
+trainer.train()
+```
+
+To instantiate that collator for assistant style conversation data, pass a response template, an instruction template and the tokenizer. Here is an example of how it would work to fine-tune `opt-350m` on assistant completions only on the Open Assistant Guanaco dataset:
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from datasets import load_dataset
+from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
+
+dataset = load_dataset("timdettmers/openassistant-guanaco", split="train")
+
+model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
+tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
+
+instruction_template = "### Human:"
+response_template = "### Assistant:"
+collator = DataCollatorForCompletionOnlyLM(instruction_template=instruction_template, response_template=response_template, tokenizer=tokenizer, mlm=False)
+
+trainer = SFTTrainer(
+    model,
+    train_dataset=dataset,
+    dataset_text_field="text",
+    data_collator=collator,
+)
+
+trainer.train()
+```
+
+Make sure to have a `pad_token_id` that is different from `eos_token_id`; otherwise the model may not properly predict EOS (End of Sentence) tokens during generation.
+
+#### Using token_ids directly for `response_template`
+
+Some tokenizers like Llama 2 (`meta-llama/Llama-2-XXb-hf`) tokenize sequences differently depending on whether they have context or not. For example:
+
+```python
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+
+def print_tokens_with_ids(txt):
+    tokens = tokenizer.tokenize(txt, add_special_tokens=False)
+    token_ids = tokenizer.encode(txt, add_special_tokens=False)
+    print(list(zip(tokens, token_ids)))
+
+prompt = """### User: Hello\n\n### Assistant: Hi, how can I help you?"""
+print_tokens_with_ids(prompt)  # [..., ('▁Hello', 15043), ('<0x0A>', 13), ('<0x0A>', 13), ('##', 2277), ('#', 29937), ('▁Ass', 4007), ('istant', 22137), (':', 29901), ...]
+
+response_template = "### Assistant:"
+print_tokens_with_ids(response_template)  # [('▁###', 835), ('▁Ass', 4007), ('istant', 22137), (':', 29901)]
+```
+
+In this case, and due to lack of context in `response_template`, the same string ("### Assistant:") is tokenized differently:
+
+    - Text (with context): `[2277, 29937, 4007, 22137, 29901]`
+    - `response_template` (without context): `[835, 4007, 22137, 29901]`
+
+This will lead to an error when the `DataCollatorForCompletionOnlyLM` does not find the `response_template` in the dataset example text:
+
+```
+RuntimeError: Could not find response key [835, 4007, 22137, 29901] in token IDs tensor([    1,   835,  ...])
+```
+
+
+To solve this, you can tokenize the `response_template` with the same context as in the dataset, truncate it as needed and pass the `token_ids` directly to the `response_template` argument of the `DataCollatorForCompletionOnlyLM` class. For example:
+
+```python
+response_template_with_context = "\n### Assistant:"  # We added context here: "\n". This is enough for this tokenizer
+response_template_ids = tokenizer.encode(response_template_with_context, add_special_tokens=False)[2:]  # Now we have it like in the dataset texts: `[2277, 29937, 4007, 22137, 29901]`
+
+data_collator = DataCollatorForCompletionOnlyLM(response_template_ids, tokenizer=tokenizer)
+```
+
+### Add Special Tokens for Chat Format
+
+Adding special tokens to a language model is crucial for training chat models. These tokens are added between the different roles in a conversation, such as the user, assistant, and system, and help the model recognize the structure and flow of a conversation. This setup is essential for enabling the model to generate coherent and contextually appropriate responses in a chat environment.
+The [`setup_chat_format`] function in `trl` easily sets up a model and tokenizer for conversational AI tasks. This function:
+- Adds special tokens to the tokenizer, e.g. `<|im_start|>` and `<|im_end|>`, to indicate the start and end of a conversation.
+- Resizes the model’s embedding layer to accommodate the new tokens.
+- Sets the `chat_template` of the tokenizer, which is used to format the input data into a chat-like format. The default is `chatml` from OpenAI.
+- _optionally_ you can pass `resize_to_multiple_of` to resize the embedding layer to a multiple of the `resize_to_multiple_of` argument, e.g. 64. If you want to see more formats being supported in the future, please open a GitHub issue on [trl](https://github.com/huggingface/trl)
+
+```python
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+# Load model and tokenizer
+model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m")
+tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
+
+# Set up the chat format with default 'chatml' format
+model, tokenizer = setup_chat_format(model, tokenizer)
+
+```
+
+With our model and tokenizer set up, we can now fine-tune our model on a conversational dataset. Below is an example of how a dataset can be formatted for fine-tuning. 
+
+### Dataset format support
+
+The [`SFTTrainer`] supports popular dataset formats. This allows you to pass the dataset to the trainer without any pre-processing directly. The following formats are supported:
+* conversational format
+```json
+{"messages": [{"role": "system", "content": "You are helpful"}, {"role": "user", "content": "What's the capital of France?"}, {"role": "assistant", "content": "..."}]}
+{"messages": [{"role": "system", "content": "You are helpful"}, {"role": "user", "content": "Who wrote 'Romeo and Juliet'?"}, {"role": "assistant", "content": "..."}]}
+{"messages": [{"role": "system", "content": "You are helpful"}, {"role": "user", "content": "How far is the Moon from Earth?"}, {"role": "assistant", "content": "..."}]}
+```
+* instruction format
+```json
+{"prompt": "<prompt text>", "completion": "<ideal generated text>"}
+{"prompt": "<prompt text>", "completion": "<ideal generated text>"}
+{"prompt": "<prompt text>", "completion": "<ideal generated text>"}
+```
+
+If your dataset uses one of the above formats, you can directly pass it to the trainer without pre-processing. The [`SFTTrainer`] will then format the dataset for you using the defined format from the model's tokenizer with the [apply_chat_template](https://huggingface.co/docs/transformers/main/en/chat_templating#templates-for-chat-models) method. 
+
+
+```python
+from datasets import load_dataset
+from trl import SFTTrainer
+
+...
+
+# load jsonl dataset
+dataset = load_dataset("json", data_files="path/to/dataset.jsonl", split="train")
+# load dataset from the HuggingFace Hub
+dataset = load_dataset("philschmid/dolly-15k-oai-style", split="train")
+
+...
+
+trainer = SFTTrainer(
+    "facebook/opt-350m",
+    args=training_args,
+    train_dataset=dataset,
+    packing=True,
+)
+```
+
+If the dataset is not in one of those formats, you can either preprocess the dataset to match the formatting or pass a formatting function to the SFTTrainer to do it for you. Let's have a look.
+
+
 ### Format your input prompts

-For instruction fine-tuning, it is quite common to have two columns inside the dataset: one for the prompt & the other for the response. 
+For instruction fine-tuning, it is quite common to have two columns inside the dataset: one for the prompt & the other for the response.
 This allows people to format examples like [Stanford-Alpaca](https://github.com/tatsu-lab/stanford_alpaca) did as follows:
 ```bash
 Below is an instruction ...
@ -79,7 +251,7 @@ trainer = SFTTrainer(

 trainer.train()
 ```
-To preperly format your input make sure to process all the examples by looping over them and returning a list of processed text. Check out a full example on how to use SFTTrainer on alpaca dataset [here](https://github.com/lvwerra/trl/pull/444#issue-1760952763)
+To properly format your input make sure to process all the examples by looping over them and returning a list of processed text. Check out a full example on how to use SFTTrainer on alpaca dataset [here](https://github.com/huggingface/trl/pull/444#issue-1760952763)

 ### Packing dataset ([`ConstantLengthDataset`])

@ -98,7 +270,8 @@ trainer = SFTTrainer(
 trainer.train()
 ```

-Note that if you use a packed dataset and if you pass `max_steps` in the training arguments you will probably train your models for more than few epochs, depending on the way you have configured the packed dataset and the training protocol. Double check that you know and understand what you are doing. 
+Note that if you use a packed dataset and if you pass `max_steps` in the training arguments you will probably train your models for more than a few epochs, depending on the way you have configured the packed dataset and the training protocol. Double check that you know and understand what you are doing.
+If you don't want to pack your `eval_dataset`, you can pass `eval_packing=False` to the `SFTTrainer` init method.

 #### Customize your prompts using packed dataset

@ -122,10 +295,11 @@ You can also customize the [`ConstantLengthDataset`] much more by directly passi

 ### Control over the pretrained model

-You can directly pass the kwargs of the `from_pretrained()` method to the [`SFTTrainer`]. For example, if you want to load a model in a different precision, analoguous to 
+You can directly pass the kwargs of the `from_pretrained()` method to the [`SFTTrainer`]. For example, if you want to load a model in a different precision, analogous to

 ```python
 model = AutoModelForCausalLM.from_pretrained("facebook/opt-350m", torch_dtype=torch.bfloat16)
+```

 ```python
 ...
@ -134,12 +308,14 @@ trainer = SFTTrainer(
    "facebook/opt-350m",
    train_dataset=dataset,
    dataset_text_field="text",
-    torch_dtype=torch.bfloat16,
+    model_init_kwargs={
+        "torch_dtype": torch.bfloat16,
+    },
 )

 trainer.train()
 ```
-Note that all keyword arguments of `from_pretrained()` are supported. 
+Note that all keyword arguments of `from_pretrained()` are supported.

 ### Training adapters

@ -170,35 +346,9 @@ trainer = SFTTrainer(
 trainer.train()
 ```

-Note that in case of training adapters, we manually add a saving callback to automatically save the adapters only:
-```python
-class PeftSavingCallback(TrainerCallback):
-    def on_save(self, args, state, control, **kwargs):
-        checkpoint_path = os.path.join(args.output_dir, f"checkpoint-{state.global_step}")
-        kwargs["model"].save_pretrained(checkpoint_path)
+You can also continue training your `PeftModel`. For that, first load a `PeftModel` outside `SFTTrainer` and pass it directly to the trainer without the `peft_config` argument being passed.

-        if "pytorch_model.bin" in os.listdir(checkpoint_path):
-            os.remove(os.path.join(checkpoint_path, "pytorch_model.bin"))
-```
-If you want to add more callbacks, make sure to add this one as well to properly save the adapters only during training.
-```python
-...
-
-callbacks = [YourCustomCallback(), PeftSavingCallback()]
-
-trainer = SFTTrainer(
-    "EleutherAI/gpt-neo-125m",
-    train_dataset=dataset,
-    dataset_text_field="text",
-    torch_dtype=torch.bfloat16,
-    peft_config=peft_config,
-    callbacks=callbacks
-)
-
-trainer.train()
-```
-
-### Training adapters with base 8 bit models 
+### Training adapters with base 8 bit models

 For that you need to first load your 8bit model outside the Trainer and pass a `PeftConfig` to the trainer. For example:

@ -223,26 +373,238 @@ trainer = SFTTrainer(
    model,
    train_dataset=dataset,
    dataset_text_field="text",
-    torch_dtype=torch.bfloat16,
    peft_config=peft_config,
 )

 trainer.train()
 ```

+## Using Flash Attention and Flash Attention 2
+
+You can benefit from Flash Attention 1 & 2 using SFTTrainer out of the box with minimal changes of code.
+First, to make sure you have all the latest features from transformers, install transformers from source
+
+```bash
+pip install -U git+https://github.com/huggingface/transformers.git
+```
+
+Note that Flash Attention currently only works on GPU and under a half-precision regime (when using adapters, the base model is loaded in half-precision).
+Note also that both features are perfectly compatible with other tools such as quantization.
+
+### Using Flash-Attention 1
+
+For Flash Attention 1 you can use the `BetterTransformer` API and force-dispatch the API to use Flash Attention kernel. First, install the latest optimum package:
+
+```bash
+pip install -U optimum
+```
+
+Once you have loaded your model, wrap the `trainer.train()` call under the `with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):` context manager:
+
+```diff
+...
+
+ with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
+    trainer.train()
+```
+
+Note that you cannot train your model using Flash Attention 1 on an arbitrary dataset as `torch.scaled_dot_product_attention` does not support training with padding tokens if you use Flash Attention kernels. Therefore you can only use that feature with `packing=True`. If your dataset contains padding tokens, consider switching to Flash Attention 2 integration.
+
+Below are some numbers you can get in terms of speedup and memory efficiency, using Flash Attention 1, on a single NVIDIA-T4 16GB.
+
+| use_flash_attn_1 | model_name        | max_seq_len | batch_size | time per training step |
+| ---------------- | ----------------- | ----------- | ---------- | ---------------------- |
+| x                | facebook/opt-350m | 2048        | 8          | ~59.1s                 |
+|                  | facebook/opt-350m | 2048        | 8          | **OOM**                |
+| x                | facebook/opt-350m | 2048        | 4          | ~30.3s                 |
+|                  | facebook/opt-350m | 2048        | 4          | ~148.9s                |
+
+### Using Flash Attention-2
+
+To use Flash Attention 2, first install the latest `flash-attn` package:
+
+```bash
+pip install -U flash-attn
+```
+
+And add `attn_implementation="flash_attention_2"` when calling `from_pretrained`:
+
+```python
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    load_in_4bit=True,
+    attn_implementation="flash_attention_2"
+)
+```
+
+If you don't use quantization, make sure your model is loaded in half-precision and dispatch your model on a supported GPU device.
+After loading your model, you can either train it as it is, or attach adapters and train adapters on it in case your model is quantized.
+
+In contrast to Flash Attention 1, the integration makes it possible to train your model on an arbitrary dataset that also includes padding tokens.
+
+
+### Using model creation utility
+
+We included a utility function to create your model.
+
+[[autodoc]] ModelConfig
+
+```python
+from trl import ModelConfig, SFTTrainer, get_kbit_device_map, get_peft_config, get_quantization_config
+model_config = ModelConfig(
+    model_name_or_path="facebook/opt-350m",
+    attn_implementation=None, # or "flash_attention_2"
+)
+torch_dtype = (
+    model_config.torch_dtype
+    if model_config.torch_dtype in ["auto", None]
+    else getattr(torch, model_config.torch_dtype)
+)
+quantization_config = get_quantization_config(model_config)
+model_kwargs = dict(
+    revision=model_config.model_revision,
+    trust_remote_code=model_config.trust_remote_code,
+    attn_implementation=model_config.attn_implementation,
+    torch_dtype=torch_dtype,
+    use_cache=False if training_args.gradient_checkpointing else True,
+    device_map=get_kbit_device_map() if quantization_config is not None else None,
+    quantization_config=quantization_config,
+)
+model = AutoModelForCausalLM.from_pretrained(model_config.model_name_or_path, **model_kwargs)
+trainer = SFTTrainer(
+    ...,
+    model=model_config.model_name_or_path,
+    peft_config=get_peft_config(model_config),
+)
+```
+
+
+
+
+### Enhance model's performances using NEFTune
+
+NEFTune is a technique to boost the performance of chat models and was introduced by the paper ["NEFTune: Noisy Embeddings Improve Instruction Finetuning"](https://arxiv.org/abs/2310.05914) from Jain et al. It consists of adding noise to the embedding vectors during training. According to the abstract of the paper:
+
+>  Standard finetuning of LLaMA-2-7B using Alpaca achieves 29.79% on AlpacaEval, which rises to 64.69% using noisy embeddings. NEFTune also improves over strong baselines on modern instruction datasets. Models trained with Evol-Instruct see a 10% improvement, with ShareGPT an 8% improvement, and with OpenPlatypus an 8% improvement. Even powerful models further refined with RLHF such as LLaMA-2-Chat benefit from additional training with NEFTune.
+
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/neft-screenshot.png">
+</div>
+
+To use it in `SFTTrainer` simply pass `neftune_noise_alpha` when creating your `SFTTrainer` instance. Note that to avoid any surprising behaviour, NEFTune is disabled after training to restore the original behaviour of the embedding layer.
+
+```python
+from datasets import load_dataset
+from trl import SFTTrainer
+
+dataset = load_dataset("imdb", split="train")
+
+trainer = SFTTrainer(
+    "facebook/opt-350m",
+    train_dataset=dataset,
+    dataset_text_field="text",
+    max_seq_length=512,
+    neftune_noise_alpha=5,
+)
+trainer.train()
+```
+
+We have tested NEFTune by training `mistralai/Mistral-7B-v0.1` on the [OpenAssistant dataset](https://huggingface.co/datasets/timdettmers/openassistant-guanaco) and validated that using NEFTune led to a performance boost of ~25% on MT Bench.
+
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/trl-neftune-mistral-7b.png">
+</div>
+
+Note however, that the amount of performance gain is _dataset dependent_ and in particular, applying NEFTune on synthetic datasets like [UltraChat](https://huggingface.co/datasets/stingning/ultrachat) typically produces smaller gains.
+
+### Accelerate fine-tuning 2x using `unsloth`
+
+You can further accelerate QLoRA / LoRA (2x faster, 60% less memory) using the [`unsloth`](https://github.com/unslothai/unsloth) library that is fully compatible with `SFTTrainer`. Currently `unsloth` supports only Llama (Yi, TinyLlama, Qwen, Deepseek etc) and Mistral architectures. Some benchmarks on 1x A100 listed below:
+
+| 1 A100 40GB     | Dataset   | 🤗  | 🤗 + Flash Attention 2 | 🦥 Unsloth     | 🦥 VRAM saved |
+|-----------------|-----------|-----|-------------------------|-----------------|----------------|
+| Code Llama 34b  | Slim Orca | 1x  | 1.01x                   | **1.94x**       | -22.7%         |
+| Llama-2 7b      | Slim Orca | 1x  | 0.96x                   | **1.87x**       | -39.3%         |
+| Mistral 7b      | Slim Orca | 1x  | 1.17x                   | **1.88x**       | -65.9%         |
+| Tiny Llama 1.1b | Alpaca    | 1x  | 1.55x                   | **2.74x**       | -57.8%         |
+
+First install `unsloth` according to the [official documentation](https://github.com/unslothai/unsloth). Once installed, you can incorporate unsloth into your workflow in a very simple manner; instead of loading `AutoModelForCausalLM`, you just need to load a `FastLanguageModel` as follows:
+
+```python
+import torch
+from transformers import TrainingArguments
+from trl import SFTTrainer
+from unsloth import FastLanguageModel
+
+max_seq_length = 2048 # Supports automatic RoPE Scaling, so choose any number
+
+# Load model
+model, tokenizer = FastLanguageModel.from_pretrained(
+    model_name = "unsloth/mistral-7b",
+    max_seq_length = max_seq_length,
+    dtype = None, # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
+    load_in_4bit = True, # Use 4bit quantization to reduce memory usage. Can be False
+    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
+)
+
+# Do model patching and add fast LoRA weights
+model = FastLanguageModel.get_peft_model(
+    model,
+    r = 16,
+    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
+                      "gate_proj", "up_proj", "down_proj",],
+    lora_alpha = 16,
+    lora_dropout = 0, # Dropout = 0 is currently optimized
+    bias = "none",    # Bias = "none" is currently optimized
+    use_gradient_checkpointing = True,
+    random_state = 3407,
+)
+
+args = TrainingArguments(output_dir = "./output")
+
+trainer = SFTTrainer(
+    model = model,
+    args = args,
+    train_dataset = dataset,
+    dataset_text_field = "text",
+    max_seq_length = max_seq_length,
+)
+trainer.train()
+```
+
+The saved model is fully compatible with Hugging Face's transformers library. Learn more about unsloth in their [official repository](https://github.com/unslothai/unsloth).
+
 ## Best practices

 Pay attention to the following best practices when training a model with that trainer:

 - [`SFTTrainer`] always pads by default the sequences to the `max_seq_length` argument of the [`SFTTrainer`]. If none is passed, the trainer will retrieve that value from the tokenizer. Some tokenizers do not provide default value, so there is a check to retrieve the minimum between 2048 and that value. Make sure to check it before training.
- For training adapters in 8bit, you might need to tweak the arguments of the `prepare_model_for_int8_training` method from PEFT, hence we advise users to use `prepare_in_int8_kwargs` field, or create the `PeftModel` outside the [`SFTTrainer`] and pass it.
- For a more memory-efficient training using adapters, you can load the base model in 8bit, for that simply add `load_in_8bit` argument when creating the [`SFTTrainer`], or create a base model in 8bit outside the trainer and pass it.  
+- For training adapters in 8bit, you might need to tweak the arguments of the `prepare_model_for_kbit_training` method from PEFT, hence we advise users to use `prepare_in_int8_kwargs` field, or create the `PeftModel` outside the [`SFTTrainer`] and pass it.
+- For a more memory-efficient training using adapters, you can load the base model in 8bit, for that simply add `load_in_8bit` argument when creating the [`SFTTrainer`], or create a base model in 8bit outside the trainer and pass it.
 - If you create a model outside the trainer, make sure to not pass to the trainer any additional keyword arguments that are relative to `from_pretrained()` method.

+## Multi-GPU Training
+
+Trainer (and thus SFTTrainer) supports multi-GPU training. If you run your script with `python script.py` it will default to using DP as the strategy, which may be [slower than expected](https://github.com/huggingface/trl/issues/1303). To use DDP (which is generally recommended, see [here](https://huggingface.co/docs/transformers/en/perf_train_gpu_many?select-gpu=Accelerate#data-parallelism) for more info) you must launch the script with `python -m torch.distributed.launch script.py` or `accelerate launch script.py`. For DDP to work you must also check the following:
+- If you're using `gradient_checkpointing`, add the following to the `TrainingArguments`: `gradient_checkpointing_kwargs={'use_reentrant':False}` (more info [here](https://github.com/huggingface/transformers/issues/26969)).
+- Ensure that the model is placed on the correct device:
+```python
+from accelerate import PartialState
+device_string = PartialState().process_index
+model = AutoModelForCausalLM.from_pretrained(
+     ...
+    device_map={'':device_string}
+)
+```
+
+## GPTQ Conversion
+
+You may experience some issues with GPTQ Quantization after completing training. Lowering `gradient_accumulation_steps` to `4` will resolve most issues during the quantization process to GPTQ format.
+
 ## SFTTrainer

 [[autodoc]] SFTTrainer

 ## ConstantLengthDataset

-[[autodoc]] trainer.ConstantLengthDataset
+[[autodoc]] trainer.ConstantLengthDataset
--- a/docs/source/summarization_reward_tuning.mdx
+++ b/docs/source/summarization_reward_tuning.mdx
@ -1,30 +0,0 @@
-# Summarization Example
-
-The script in this example show how to train a reward model for summarization, following the OpenAI Learning to Summarize from Human Feedback [paper](https://arxiv.org/abs/2009.01325). We've validated that the script can be used to train a small GPT2 to get slightly over 60% validation accuracy, which is aligned with results from the paper. The model is [here](https://huggingface.co/Tristan/gpt2_reward_summarization).
-
-Here's an overview of the relevant files in the [trl repository](https://github.com/lvwerra/trl/tree/main/examples):
-
-| File | Description |
-|---|---|
-| `scripts/reward_summarization.py` | For tuning the reward model. |
-| `scripts/ds3_reward_summarization_example_config.json` | Can be used with the reward model script to scale it up to arbitrarily big models that don't fit on a single GPU. |
-
-
-## Installation
-
-```bash
-pip install trl
-pip install evaluate
-# optional: deepspeed
-pip install deepspeed
-```
-
-```bash
-# If you want your reward model to follow the Learning to Summarize from Human Feedback paper closely, then tune a GPT model on summarization and then instantiate the reward model
-# with it. In other words, pass in the name of your summarization-finetuned gpt on the hub, instead of the name of the pretrained gpt2 like we do in the following examples of how
-# to run this script.
-# Example of running this script with the small size gpt2 on a 40GB A100 (A100's support bf16). Here, the global batch size will be 64:
-python -m torch.distributed.launch --nproc_per_node=1 reward_summarization.py --bf16
-# Example of running this script with the xl size gpt2 on 16 40GB A100's. Here the global batch size will still be 64:
-python -m torch.distributed.launch --nproc_per_node=16 reward_summarization.py --per_device_train_batch_size=1 --per_device_eval_batch_size=1 --gradient_accumulation_steps=4 --gpt_model_name=gpt2-xl --bf16 --deepspeed=ds3_reward_summarization_example_config.json
-```
--- a/docs/source/text_environments.md
+++ b/docs/source/text_environments.md
@ -0,0 +1,197 @@
+# Text Environments
+
+Text environments provide a learning ground for language agents. They allow a language model to use tools to accomplish a task, such as using a Python interpreter to answer math questions or using a search index for trivia questions. Having access to tools allows language models to solve tasks that would be very hard for the model itself but can be trivial for the appropriate tools. A good example is arithmetic on large numbers, which becomes a simple copy-paste task once you have access to a calculator.
+
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/textenv.png">
+</div>
+
+Let's dive into how text environments work and start with tools!
+
+## Tools
+
+One of the core building blocks of text environments is the set of tools that the model can use to solve tasks. In general, tools can be any Python function that takes a string as input and returns a string. The `TextEnvironment` offers two options for tools: either go with predefined tools from `transformers.Tool` or define your own function or class with a `__call__` method. Let's have a look at both!
+
+### `transformers.Tool`
+
+Text environments fully support tools of the class `transformers.Tool`. The advantage of building tools in that framework is that they can easily be shared.
+
+```Python
+from transformers import load_tool
+
+# simple calculator tool that runs +-/* operations
+calc_tool = load_tool("ybelkada/simple-calculator")
+
+# python interpreter that executes program and returns outputs
+py_tool = load_tool("lvwerra/python-interpreter")
+
+# wikipedia search index that returns best search match
+wiki_tool = load_tool("vwxyzjn/pyserini-wikipedia-kilt-doc")
+```
+
+These tools are either loaded from the hub or from a local folder. Using the tool is as simple as calling them with a text query:
+
+```Python
+calc_tool("1/2")
+>>> "0.5"
+```
+
+Note that both input and return values are strings to enable easy usage with a language model.
+
+### Custom Tools
+
+The following is an example of a tool that adds two integers:
+
+```Python
+def add(text):
+    int_1, int_2 = text.split("+")
+    result = int(int_1) + int(int_2)
+    return str(result)
+
+print(add("1+1"))
+>>> "2"
+```
+
+We looked at basic examples such as a calculator but the principle holds for more complex tools as well such as a web search tool where you input the query and get the search results in return. Now let's look at how the model can use the tools with the call syntax.
+
+### Call syntax
+
+In order to have a unified way for the model to call a tool we created a simple syntax that looks as follows:
+
+```python
+"<request><TOOL_NAME>QUERY<call>TOOL_RESPONSE<response>"
+```
+
+There are a few special tokens involved so let's decompose it: First the model can signal that it wants to use a tool by emitting the `<request>` token. After that we want to know the name of the tool to call, which is done by enclosing the tool name in `<>` brackets. Once we know which tool to call, the tool query follows in free-text form. The `<call>` token signifies the end of the query and stops the model generation. At this point the model output is parsed and the query is sent to the tool. The environment appends the tool response to the string, followed by the `<response>` token to mark the end of the tool output.
+
+Let's look at the concrete example of the calculator and assume its name is `Calculator` (more on how the name of a tool is inferred later):
+
+```python
+"<request><Calculator>1/2<call>0.5<response>"
+```
+
+Finally, the episode is ended and generation stops when the model generates `<submit>` which marks the interaction as completed.
+
+Now let's have a look how we can create a new text environment!
+
+## Create a `TextEnvironment`
+
+
+```python
+prompt = """\
+What is 13-3?
+<request><SimpleCalculatorTool>13-3<call>10.0<response>
+Result=10<submit>
+"""
+
+def reward_fn(result, answer):
+    """Simplified reward function returning 1 if result matches answer and 0 otherwise."""
+    result_parsed = result.split("=")[1].split("<")[0]
+    return int(result_parsed==answer)
+
+# `reward_fn` is the function defined just above; `do_sample` must be a real boolean.
+text_env = TextEnvironment(
+    model=model,
+    tokenizer=tokenizer,
+    tools={"SimpleCalculatorTool": load_tool("ybelkada/simple-calculator")},
+    reward_fn=reward_fn,
+    prompt=prompt,
+    max_turns=1,
+    max_tool_response=100,
+    generation_kwargs={"do_sample": True},
+)
+```
+
+Let's decompose the settings:
+
+| Argument           | Description     |
+|:-------------------|:----------------|
+| `model`            | Language model to interact with the environment and generate requests. |
+| `tokenizer`        | Tokenizer of language model handling tokenization of strings. |
+| `tools`            | `list` or `dict` of tools. If the former, the name of each tool is inferred from its class name; otherwise the names are the keys of the dictionary.|
+| `reward_fn`        | A function that takes a string as input and returns a reward. Can have extra arguments that are passed to `.run()` such as ground truth.|
+| `prompt`           | Prompt to prepend to every task. Usually a few examples to demonstrate to the model how to use the tools in a few-shot fashion. |
+| `max_turns`        | Maximum number of interactions between model and tools before episode ends.|
+| `max_tool_response`| The tool response is truncated to this number to avoid running out of model context.|
+| `max_length`       |  The maximum number of tokens to allow in an episode. |
+| `generation_kwargs`| Generation settings used by the language model. |
+
+You can customize the environment to your needs and add custom tools and settings. Let's see how you can use the environment to have the model interact with the available tools!
+
+
+## Run an Episode
+
+To run a set of queries through the text environment one can simply use the `run` method.
+
+```python
+queries = ["What is 1/2?"]
+answers = ["0.5"]
+
+queries, responses, masks, rewards, histories = text_env.run(queries, answers=answers)
+```
+
+This will execute the model/tool feedback loop for each query until either no tool is called anymore, the maximum number of turns is reached, or the maximum number of tokens in an episode is exceeded. The extra `kwargs` (e.g. `answers=answers` above) passed to `run` will be passed on to the reward function.
+
+There are five objects that are returned by `run`: 
+
+- `queries`: a list of the tokenized queries
+- `responses`: all tokens that have been generated within the environment including model and tool tokens
+- `masks`: mask that indicates which tokens have been generated by the model and which tokens are generated by the tool
+- `rewards`: a list of rewards, one per query/response
+- `histories`: list of `TextHistory` objects, which are useful objects containing all the above and also the text equivalents
+
+The masks are crucial for training as we don't want to optimize tokens that the model has not generated, i.e. the tokens produced by the tools.
+
+Next, we'll train a PPO step with the generated responses!
+
+
+### Train
+Training on episodes from the `TextEnvironment` is straightforward and simply requires forwarding all the returned variables except the `TextHistory` objects to the `step` method:
+
+```python
+train_stats = ppo_trainer.step(queries, responses, rewards, masks)
+```
+
+## `TextHistory`
+
+The `TextHistory` object stores the interactions between the model and the text environment. It stores tokens and text generated in each turn and their source in each turn (model or system) as well as rewards. Let's go through the class attributes and methods.
+
+### Attributes
+
+The following table summarises the available attributes of the `TextHistory` class:
+
+| Attribute           | Description     |
+|:-------------------|:----------------|
+| `text`             | The full string of the text generated in the text environment with both model and system generated text. |
+| `text_spans`       | A list of tuples with the spans for each model or system generated text segment. |
+| `system_spans`     | A list of boolean values indicating if the segment is model or system generated. |
+| `tokens`           | All tokens generated in text environment with both model and system generated tokens. |
+| `token_spans`      | Similar to `text_spans` the `token_spans` indicate the boundaries of model and system generated tokens. |
+| `token_masks`      | The token masks can be used to ignore system generated tokens by masking them. |
+| `completed`        | Indicates if the interaction with the environment has completed. |
+| `truncated`        | Indicates if the interaction with the environment has completed because max length was reached. |
+
+With these attributes you can reconstruct every interaction of the model with the `TextEnvironment`. The `TextHistory` also lets you visualize the text history. Let's have a look!
+
+### Visualization
+
+When the model interacts inside the `TextEnvironment` it can be useful to visualize and separate which parts of the text outputs were generated by the model and which parts come from the system and tools. For that purpose there are the two methods [`TextHistory.show_text`] and [`TextHistory.show_tokens`]. They print the text and tokens respectively and highlight the various segments using the [`rich` library](https://github.com/Textualize/rich) (make sure to install it before using these methods).
+
+You can see that the prompt is highlighted in gray, whereas system segments such as query and tool responses are highlighted in green. All segments generated by the model are highlighted in blue and in addition to the pure text output the reward is displayed as additional text in plum. Here is an example of `show_text`:
+
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/textenv_show_text.png" width=600>
+</div>
+
+Sometimes there can be tricky tokenization related issues that are hidden when showing the decoded text. Thus `TextHistory` also offers an option to display the same highlighting on the tokens directly with `show_tokens`:
+
+<div style="text-align: center">
+<img src="https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/textenv_show_tokens.png" width=800>
+</div>
+
+Note that you can turn on the colour legend by passing `show_legend=True`.
+
+## API Documentation
+
+[[autodoc]] TextEnvironment
+
+[[autodoc]] TextHistory
--- a/docs/source/trainer.mdx
+++ b/docs/source/trainer.mdx
@ -12,10 +12,34 @@ We also support a `RewardTrainer` that can be used to train a reward model.

 [[autodoc]] PPOTrainer

+## RewardConfig
+
+[[autodoc]] RewardConfig
+
 ## RewardTrainer

 [[autodoc]] RewardTrainer

+## SFTTrainer
+
+[[autodoc]] SFTTrainer
+
+## DPOTrainer
+
+[[autodoc]] DPOTrainer
+
+## DDPOConfig
+
+[[autodoc]] DDPOConfig
+
+## DDPOTrainer
+
+[[autodoc]] DDPOTrainer
+
+## IterativeSFTTrainer
+
+[[autodoc]] IterativeSFTTrainer
+
 ## set_seed

 [[autodoc]] set_seed
--- a/docs/source/use_model.md
+++ b/docs/source/use_model.md
@ -0,0 +1,58 @@
+# Use model after training
+
+Once you have trained a model using either the SFTTrainer, PPOTrainer, or DPOTrainer, you will have a fine-tuned model that can be used for text generation. In this section, we'll walk through the process of loading the fine-tuned model and generating text. If you need to run an inference server with the trained model, you can explore libraries such as [`text-generation-inference`](https://github.com/huggingface/text-generation-inference).
+
+## Load and Generate
+
+If you have fine-tuned a model fully, meaning without the use of PEFT, you can simply load it like any other language model in transformers. E.g. the value head that was trained during the PPO training is no longer needed and if you load the model with the original transformer class it will be ignored:
+
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM
+
+model_name_or_path = "kashif/stack-llama-2" #path/to/your/model/or/name/on/hub
+device = "cpu" # or "cuda" if you have a GPU
+
+model = AutoModelForCausalLM.from_pretrained(model_name_or_path).to(device)
+tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
+
+inputs = tokenizer.encode("This movie was really", return_tensors="pt").to(device)
+outputs = model.generate(inputs)
+print(tokenizer.decode(outputs[0]))
+```
+
+Alternatively you can also use the pipeline:
+
+```python
+from transformers import pipeline
+
+model_name_or_path = "kashif/stack-llama-2" #path/to/your/model/or/name/on/hub
+pipe = pipeline("text-generation", model=model_name_or_path)
+print(pipe("This movie was really")[0]["generated_text"])
+```
+
+## Use Adapters PEFT
+
+```python
+from peft import PeftConfig, PeftModel
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+base_model_name = "kashif/stack-llama-2" #path/to/your/model/or/name/on/hub
+adapter_model_name = "path/to/my/adapter"
+
+model = AutoModelForCausalLM.from_pretrained(base_model_name)
+model = PeftModel.from_pretrained(model, adapter_model_name)
+
+tokenizer = AutoTokenizer.from_pretrained(base_model_name)
+```
+
+You can also merge the adapters into the base model so you can use the model like a normal transformers model, however the checkpoint will be significantly bigger:
+
+```python
+model = AutoModelForCausalLM.from_pretrained(base_model_name)
+model = PeftModel.from_pretrained(model, adapter_model_name)
+
+model = model.merge_and_unload()
+model.save_pretrained("merged_adapters")
+```
+
+Once you have the model loaded and have either merged the adapters or kept them separately on top, you can run generation as with a normal model, as outlined above.
--- a/docs/source/using_llama_models.mdx
+++ b/docs/source/using_llama_models.mdx
@ -52,7 +52,7 @@ model = AutoModelForCausalLM.from_pretrained(
        load_in_8bit=True,
        device_map={"": Accelerator().local_process_index}
    )
-model = prepare_model_for_int8_training(model)
+model = prepare_model_for_kbit_training(model)

 # add LoRA to model
 lora_config = LoraConfig(
@ -157,4 +157,4 @@ for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    ppo_trainer.log_stats(stats, batch, rewards)
 ```

-For the rest of the details adn evaluation, please refer to our [blog post on StackLLaMA](https://huggingface.co/blog/stackllama).
+For the rest of the details and evaluation, please refer to our [blog post on StackLLaMA](https://huggingface.co/blog/stackllama).
--- a/example_config.yaml
+++ b/example_config.yaml
@ -0,0 +1,20 @@
+# This is an example configuration file for the TRL CLI. You can use it for
+# SFT like this: `trl sft --config example_config.yaml --output_dir test-sft`
+# The YAML file supports environment variables by adding an `env` field
+# as below
+
+# env:
+#   CUDA_VISIBLE_DEVICES: 0
+
+# Model and dataset used for the example SFT run.
+model_name_or_path: HuggingFaceM4/tiny-random-LlamaForCausalLM
+dataset_name: imdb
+dataset_text_field: text
+# Note: `none` is a plain string in YAML, not a null value.
+report_to: none
+# Optimisation settings.
+learning_rate: 0.0001
+lr_scheduler_type: cosine
--- a/examples/README.md
+++ b/examples/README.md
@ -1,32 +1,3 @@
 # Examples

-_The best place to learn about examples in TRL is our [docs page](https://huggingface.co/docs/trl/index)!_
-
-## Installation
-
-```bash
-pip install trl
-#optional: wandb
-pip install wandb
-```
-Note: if you don't want to log with `wandb` remove `log_with="wandb"` in the scripts/notebooks. 
-You can also replace it with your favourite experiment tracker that's [supported by `accelerate`](https://huggingface.co/docs/accelerate/usage_guides/tracking).
-
-## Accelerate Config
-For all the examples, you'll need to generate an `Accelerate` config with:
-
-```shell
-accelerate config # will prompt you to define the training configuration
-```
-
-Then, it is encouraged to launch jobs with `accelerate launch`!
-
-## Categories
-The examples are currently split over the following categories:
-
-**1: [Sentiment](https://github.com/lvwerra/trl/tree/main/examples/sentiment)**: Fine-tune a model with a sentiment classification model.
-**2: [StackOverflow](https://github.com/lvwerra/trl/tree/main/examples/stack_llama)**: Perform the full RLHF process (fine-tuning, reward model training, and RLHF) on StackOverflow data.
-**3: [summarization](https://github.com/lvwerra/trl/tree/main/examples/summarization)**: Recreate OpenAI's [Learning to Summarize paper](https://proceedings.neurips.cc/paper/2020/file/1f89885d556929e98d3ef9b86448f951-Paper.pdf).
-**4: [toxicity](https://github.com/lvwerra/trl/tree/main/examples/toxicity)**: Fine-tune a model to reduce the toxicity of its generations.
-write about best-of-n as an alternative rlhf
-**5: [best-of-n sampling](https://github.com/lvwerra/trl/tree/main/examples/best_of_n_sampling)**: Comparative demonstration of best-of-n sampling as a simpler (but relatively expensive) alternative to RLHF
+Please check out https://huggingface.co/docs/trl/example_overview for documentation on our examples.
--- a/examples/accelerate_configs/deepspeed_zero1.yaml
+++ b/examples/accelerate_configs/deepspeed_zero1.yaml
@ -0,0 +1,20 @@
+# Accelerate launch config: DeepSpeed ZeRO stage 1 with bf16 mixed precision,
+# 8 processes on a single machine. Pass it to `accelerate launch` with
+# `--config_file examples/accelerate_configs/deepspeed_zero1.yaml`.
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  gradient_accumulation_steps: 1
+  zero3_init_flag: false
+  zero_stage: 1
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+mixed_precision: 'bf16'
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
--- a/examples/accelerate_configs/deepspeed_zero2.yaml
+++ b/examples/accelerate_configs/deepspeed_zero2.yaml
@ -0,0 +1,22 @@
+# Accelerate launch config: DeepSpeed ZeRO stage 2 with bf16 mixed precision,
+# no optimizer/parameter offloading, 8 processes on a single machine. Pass it
+# to `accelerate launch` with
+# `--config_file examples/accelerate_configs/deepspeed_zero2.yaml`.
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  gradient_accumulation_steps: 1
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: false
+  zero_stage: 2
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+mixed_precision: 'bf16'
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
--- a/examples/accelerate_configs/deepspeed_zero3.yaml
+++ b/examples/accelerate_configs/deepspeed_zero3.yaml
@ -0,0 +1,23 @@
+# Accelerate launch config: DeepSpeed ZeRO stage 3 with bf16 mixed precision,
+# no optimizer/parameter offloading, 8 processes on a single machine. Unlike
+# the stage 1/2 configs, `zero3_init_flag` and `zero3_save_16bit_model` are
+# enabled here. Pass it to `accelerate launch` with
+# `--config_file examples/accelerate_configs/deepspeed_zero3.yaml`.
+compute_environment: LOCAL_MACHINE
+debug: false
+deepspeed_config:
+  deepspeed_multinode_launcher: standard
+  gradient_accumulation_steps: 1
+  offload_optimizer_device: none
+  offload_param_device: none
+  zero3_init_flag: true
+  zero3_save_16bit_model: true
+  zero_stage: 3
+distributed_type: DEEPSPEED
+downcast_bf16: 'no'
+machine_rank: 0
+main_training_function: main
+mixed_precision: 'bf16'
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
--- a/examples/accelerate_configs/multi_gpu.yaml
+++ b/examples/accelerate_configs/multi_gpu.yaml
@ -0,0 +1,16 @@
+# Accelerate launch config: plain multi-GPU training (`MULTI_GPU`) with bf16
+# mixed precision, using all visible GPUs and 8 processes on a single machine.
+# Pass it to `accelerate launch` with
+# `--config_file examples/accelerate_configs/multi_gpu.yaml`.
+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: MULTI_GPU
+downcast_bf16: 'no'
+gpu_ids: all
+machine_rank: 0
+main_training_function: main
+mixed_precision: 'bf16'
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
--- a/examples/accelerate_configs/single_gpu.yaml
+++ b/examples/accelerate_configs/single_gpu.yaml
@ -0,0 +1,16 @@
+# Accelerate launch config: non-distributed run (`distributed_type: "NO"`,
+# quoted so YAML does not read it as a boolean) with bf16 mixed precision.
+# Pass it to `accelerate launch` with
+# `--config_file examples/accelerate_configs/single_gpu.yaml`.
+# NOTE(review): `num_processes: 8` matches the multi-GPU configs; for a
+# single-GPU setup this looks like it should be 1 - confirm.
+compute_environment: LOCAL_MACHINE
+debug: false
+distributed_type: "NO"
+downcast_bf16: 'no'
+gpu_ids: all
+machine_rank: 0
+main_training_function: main
+mixed_precision: 'bf16'
+num_machines: 1
+num_processes: 8
+rdzv_backend: static
+same_network: true
+tpu_env: []
+tpu_use_cluster: false
+tpu_use_sudo: false
+use_cpu: false
--- a/examples/best_of_n_sampling/README.md
+++ b/examples/best_of_n_sampling/README.md
@ -1,16 +0,0 @@
-# Best-of-n sampling as an alternative to RLHF
-
-Paraphrasing from [OpenAI's blog post on best-of-n sampling](https://openai.com/research/measuring-goodharts-law)
-
-With `RLHF` we try to optimize w.r.t to a proxy objective. `RLHF` is not the only way to do this. 
-One of the many other ways is `best-of-n sampling`. It is simple to implement and competitive to `RLHF` in some cases.
-That said, `best-of-n sampling` is expensive when it comes to inference time compute.
-
-The included notebook compares reward-model scores of prompt based responses from 
-1. a base model (`gpt2-imdb`)
-2. `RLHF` tuned model based on this base-model 
-3. the base-model again from which we sample n responses to each prompt, score them and take the best scored one AKA the `best-of-n sampled` model
-
-
-
-
--- a/examples/datasets/anthropic_hh.py
+++ b/examples/datasets/anthropic_hh.py
@ -0,0 +1,122 @@
+import multiprocessing
+import sys
+from dataclasses import dataclass, field
+from typing import Optional
+
+from datasets import load_dataset
+from huggingface_hub import HfApi
+from huggingface_hub.repocard import RepoCard
+from transformers import HfArgumentParser
+
+
+"""
+# debug
+python -i examples/datasets/anthropic_hh.py --debug --push_to_hub
+# actual push
+python examples/datasets/anthropic_hh.py --push_to_hub --hf_entity trl-internal-testing
+"""
+
+
+api = HfApi()
+
+
+# Command-line options for this script; parsed with `HfArgumentParser` in the
+# `__main__` block below.
+@dataclass
+class ScriptArguments:
+    # When set, only the first 50 rows of each split are processed, in a single process.
+    debug: Optional[bool] = field(default=False, metadata={"help": "Enable debug mode"})
+    # Hub user/organisation owning the repo; falls back to the logged-in account when None.
+    hf_entity: Optional[str] = field(default=None, metadata={"help": "The Hugging Face entity to use"})
+    # Repository name; combined with `hf_entity` into "<hf_entity>/<hf_repo_id>".
+    hf_repo_id: Optional[str] = field(default="hh-rlhf-trl-style", metadata={"help": "The Hugging Face repository ID"})
+    # Revision the dataset and this script are pushed to.
+    revision: Optional[str] = field(default="0.1.0", metadata={"help": "The revision of the repository"})
+    # When True, "main" is pushed in addition to `revision`.
+    update_main_revision: Optional[bool] = field(
+        default=True, metadata={"help": "Update the main revision of the repository"}
+    )
+    # Nothing is uploaded unless this flag is set.
+    push_to_hub: Optional[bool] = field(default=False, metadata={"help": "Push the dataset to the Hugging Face Hub"})
+
+
+# GPT-4 generated 😄 Define a function to process the input and extract the dialogue into structured format
+def extract_dialogue(input_text):
+    """Parse a raw "Human:/Assistant:" transcript into a list of chat messages.
+
+    The transcript is split on blank lines. A chunk starting with ``Human:`` opens a
+    ``user`` message, a chunk starting with ``Assistant:`` opens an ``assistant``
+    message, and any other chunk is treated as a continuation paragraph of the
+    previous message.
+
+    Args:
+        input_text (str): Raw transcript text.
+
+    Returns:
+        list[dict]: Messages of the form ``{"role": ..., "content": ...}`` in
+        transcript order; empty if ``input_text`` is empty or whitespace-only.
+
+    Raises:
+        ValueError: If the transcript starts with text that has no speaker prefix.
+    """
+    # Speaker prefix -> chat role. Only the *leading* prefix is stripped, so text such as
+    # "Human: " quoted inside a message body is left untouched.
+    role_prefixes = (("Human:", "user"), ("Assistant:", "assistant"))
+    dialogue_list = []
+
+    stripped_text = input_text.strip()
+    if not stripped_text:
+        return dialogue_list
+
+    for chunk in stripped_text.split("\n\n"):
+        for prefix, role in role_prefixes:
+            if chunk.startswith(prefix):
+                dialogue_list.append({"role": role, "content": chunk[len(prefix) :].strip()})
+                break
+        else:
+            # No speaker prefix: this chunk continues the previous message.
+            if not dialogue_list:
+                raise ValueError(f"Transcript does not start with 'Human:' or 'Assistant:': {chunk[:50]!r}")
+            dialogue_list[-1]["content"] += "\n\n" + chunk.strip()
+
+    return dialogue_list
+
+
+# Script entry point: load Anthropic/hh-rlhf, convert each row to the
+# `prompt, chosen, rejected` format and optionally push the result to the Hub.
+if __name__ == "__main__":
+    args = HfArgumentParser(ScriptArguments).parse_args_into_dataclasses()[0]
+    # Default to the account of the currently logged-in user.
+    if args.hf_entity is None:
+        args.hf_entity = api.whoami()["name"]
+    full_repo_id = f"{args.hf_entity}/{args.hf_repo_id}"
+    ds = load_dataset("Anthropic/hh-rlhf")
+    # Debug mode: keep only the first 50 rows of every split.
+    if args.debug:
+        for key in ds:
+            ds[key] = ds[key].select(range(50))
+
+    # Replace the raw transcripts with structured message lists and use the
+    # content of the first message of `chosen` as the prompt.
+    def process(row):
+        row["chosen"] = extract_dialogue(row["chosen"])
+        row["rejected"] = extract_dialogue(row["rejected"])
+        row["prompt"] = row["chosen"][0]["content"]
+        return row
+
+    ds = ds.map(
+        process,
+        num_proc=1 if args.debug else multiprocessing.cpu_count(),
+        load_from_cache_file=False,
+    )
+    if args.push_to_hub:
+        # `args.revision` is always pushed; "main" is pushed first when requested.
+        revisions = ["main"] if args.update_main_revision else []
+        revisions.append(args.revision)
+
+        # get the command used to run the script
+        run_command = " ".join(["python"] + sys.argv)
+
+        for revision in revisions:
+            ds.push_to_hub(full_repo_id, revision=revision)
+            repo_full_url = f"https://huggingface.co/datasets/{full_repo_id}/tree/{revision}"
+
+            # get the name of the current file
+            # NOTE(review): splitting on "/" assumes POSIX-style paths in `__file__`.
+            file_name = __file__.split("/")[-1]
+            # Upload this script next to the data so the dataset can be reproduced.
+            api.upload_file(
+                path_or_fileobj=__file__,
+                path_in_repo=file_name,
+                revision=revision,
+                repo_id=full_repo_id,
+                repo_type="dataset",
+            )
+
+        # `file_name` and `repo_full_url` keep their values from the last loop
+        # iteration, i.e. the card links to the `args.revision` tree.
+        sft_card = RepoCard.load(
+            full_repo_id,
+            repo_type="dataset",
+        )
+        sft_card.text = f"""\
+# TRL's Anthropic HH Dataset
+
+We preprocess the dataset using our standard `prompt, chosen, rejected` format.
+
+
+## Reproduce this dataset
+
+1. Download the `{file_name}` from the {repo_full_url}.
+2. Run `{run_command}`
+"""
+        sft_card.push_to_hub(
+            full_repo_id,
+            repo_type="dataset",
+        )
--- a/examples/datasets/tldr_preference.py
+++ b/examples/datasets/tldr_preference.py
@ -0,0 +1,113 @@
+import multiprocessing
+import sys
+from dataclasses import dataclass, field
+from typing import Optional
+
+from datasets import load_dataset
+from huggingface_hub import HfApi
+from huggingface_hub.repocard import RepoCard
+from transformers import HfArgumentParser
+
+
+"""
+# debug
+python -i examples/datasets/tldr_preference.py --debug --push_to_hub
+# actual push
+python examples/datasets/tldr_preference.py --push_to_hub --hf_entity trl-internal-testing
+"""
+
+
+api = HfApi()
+
+
+@dataclass
+class ScriptArguments:
+    """Command-line options of the TL;DR preference script, parsed with `HfArgumentParser`."""
+
+    # When set, every split is cut down to 50 rows and the mapping step runs in a single process.
+    debug: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enable debug mode"},
+    )
+    # Owner of the target repo; when left as None the script fills it in from `api.whoami()`.
+    hf_entity: Optional[str] = field(
+        default=None,
+        metadata={"help": "The Hugging Face entity to use"},
+    )
+    # Combined with `hf_entity` as "<hf_entity>/<hf_repo_id>".
+    hf_repo_id: Optional[str] = field(
+        default="tldr-preference-trl-style",
+        metadata={"help": "The Hugging Face repository ID"},
+    )
+    # This revision is always pushed when `push_to_hub` is set.
+    revision: Optional[str] = field(
+        default="0.1.0",
+        metadata={"help": "The revision of the repository"},
+    )
+    # When true, "main" is pushed in addition to `revision`.
+    update_main_revision: Optional[bool] = field(
+        default=True,
+        metadata={"help": "Update the main revision of the repository"},
+    )
+    push_to_hub: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Push the dataset to the Hugging Face Hub"},
+    )
+
+
+if __name__ == "__main__":
+    import os  # local import: only this entry point needs it (portable basename below)
+
+    # Parse the CLI flags; default the repo owner to the account returned by `api.whoami()`.
+    args = HfArgumentParser(ScriptArguments).parse_args_into_dataclasses()[0]
+    if args.hf_entity is None:
+        args.hf_entity = api.whoami()["name"]
+    full_repo_id = f"{args.hf_entity}/{args.hf_repo_id}"
+
+    ds = load_dataset("openai/summarize_from_feedback", "comparisons")
+    if args.debug:
+        # Keep at most 50 rows per split; `min` avoids an out-of-range selection on shorter splits.
+        for key in ds:
+            ds[key] = ds[key].select(range(min(50, len(ds[key]))))
+    cnndm_batches = ["batch0_cnndm", "cnndm0", "cnndm2"]
+    if not args.debug:
+        # Rows of the cnndm batches get their own split (not created in debug mode).
+        ds["validation_cnndm"] = ds["validation"].filter(lambda x: x["batch"] in cnndm_batches)
+    # The regular validation split keeps only the rows outside the cnndm batches.
+    ds["validation"] = ds["validation"].filter(lambda x: x["batch"] not in cnndm_batches)
+
+    tldr_format_str = "SUBREDDIT: r/{subreddit}\n\nTITLE: {title}\n\nPOST: {post}\n\nTL;DR:"
+    cnndm_format_str = "Article:\n{article}\n\nTL;DR:"
+
+    def process(row):
+        """Add `prompt`, `chosen` and `rejected` to a row, the latter two as user/assistant message lists."""
+        # cnndm rows are rendered with the article template, every other row with the post template.
+        format_str = cnndm_format_str if row["batch"] in cnndm_batches else tldr_format_str
+        row["prompt"] = format_str.format(**row["info"])
+        # `choice` indexes the preferred summary; `1 - choice` assumes exactly two candidates (0 and 1).
+        choice = row["choice"]
+        chosen = row["summaries"][choice]["text"]
+        rejected = row["summaries"][1 - choice]["text"]
+        row["chosen"] = [{"role": "user", "content": row["prompt"]}, {"role": "assistant", "content": chosen}]
+        row["rejected"] = [{"role": "user", "content": row["prompt"]}, {"role": "assistant", "content": rejected}]
+        return row
+
+    ds = ds.map(
+        process,
+        num_proc=1 if args.debug else multiprocessing.cpu_count(),
+        load_from_cache_file=False,
+    )
+    for key in ds:  # reorder columns
+        ds[key] = ds[key].select_columns(
+            ["prompt", "chosen", "rejected", "info", "summaries", "choice", "worker", "batch", "split", "extra"]
+        )
+    if args.push_to_hub:
+        # "main" (optional) is pushed first, `args.revision` always last.
+        revisions = ["main"] if args.update_main_revision else []
+        revisions.append(args.revision)
+
+        # get the command used to run the script
+        run_command = " ".join(["python"] + sys.argv)
+
+        # Loop-invariant values, computed once. `os.path.basename` also handles non-"/" path separators.
+        file_name = os.path.basename(__file__)
+        # The card links to `args.revision`, i.e. the last revision pushed below (same URL as before).
+        repo_full_url = f"https://huggingface.co/datasets/{full_repo_id}/tree/{args.revision}"
+
+        for revision in revisions:
+            ds.push_to_hub(full_repo_id, revision=revision)
+
+            # store this script next to the data so the dataset can be rebuilt
+            api.upload_file(
+                path_or_fileobj=__file__,
+                path_in_repo=file_name,
+                revision=revision,
+                repo_id=full_repo_id,
+                repo_type="dataset",
+            )
+
+        # Overwrite the dataset card text with reproduction instructions and push it.
+        sft_card = RepoCard.load(
+            full_repo_id,
+            repo_type="dataset",
+        )
+        sft_card.text = f"""\
+# TRL's TL;DR Preference Dataset
+
+We preprocess the dataset using our standard `prompt, chosen, rejected` format.
+
+
+## Reproduce this dataset
+
+1. Download the `{file_name}` from the {repo_full_url}.
+2. Run `{run_command}`
+"""
+        sft_card.push_to_hub(
+            full_repo_id,
+            repo_type="dataset",
+        )
--- a/examples/datasets/tokenize_ds.py
+++ b/examples/datasets/tokenize_ds.py
@ -0,0 +1,42 @@
+import multiprocessing
+from dataclasses import dataclass, field
+from typing import Optional
+
+from datasets import load_dataset
+from transformers import AutoTokenizer, HfArgumentParser
+
+
+"""
+python -i examples/datasets/tokenize_ds.py --debug --model HuggingFaceH4/zephyr-7b-beta
+python -i examples/datasets/tokenize_ds.py --debug --model gpt2
+"""
+
+
+@dataclass
+class ScriptArguments:
+    """Command-line options of the tokenization demo, parsed with `HfArgumentParser`."""
+
+    # When set, every split is cut down to 50 rows and the mapping step runs in a single process.
+    debug: Optional[bool] = field(
+        default=False,
+        metadata={"help": "Enable debug mode"},
+    )
+    # Passed as-is to `load_dataset`.
+    dataset: str = field(
+        default="trl-internal-testing/hh-rlhf-trl-style",
+        metadata={"help": "The dataset to load"},
+    )
+    # Passed as-is to `AutoTokenizer.from_pretrained`.
+    model: str = field(
+        default="gpt2",
+        metadata={"help": "The model to use for tokenization"},
+    )
+
+
+if __name__ == "__main__":
+    # Parse the CLI flags and load every split of the requested dataset.
+    parsed = HfArgumentParser(ScriptArguments).parse_args_into_dataclasses()
+    args = parsed[0]
+    ds = load_dataset(args.dataset)
+    if args.debug:
+        # Debug runs only look at the first 50 rows of each split.
+        for split_name in ds:
+            ds[split_name] = ds[split_name].select(range(50))
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model)
+    if tokenizer.chat_template is None:
+        # Fallback template for tokenizers without one: "<role>: <content>" blocks followed by the EOS token.
+        tokenizer.chat_template = "{% for message in messages %}{{message['role'] + ': ' + message['content'] + '\n\n'}}{% endfor %}{{ eos_token }}"
+
+    def process(row):
+        """Render the `chosen` and `rejected` message lists to plain strings with the chat template."""
+        for column in ("chosen", "rejected"):
+            row[column] = tokenizer.apply_chat_template(row[column], tokenize=False)
+        return row
+
+    # Single process in debug mode, otherwise one worker per CPU reported by `multiprocessing`.
+    worker_count = 1 if args.debug else multiprocessing.cpu_count()
+    ds = ds.map(process, num_proc=worker_count, load_from_cache_file=False)
+    # Show one rendered example from the train split.
+    print(ds["train"][0]["chosen"])
--- a/examples/hello_world.py
+++ b/examples/hello_world.py
@ -12,7 +12,7 @@ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
 tokenizer.pad_token = tokenizer.eos_token

 # 2. initialize trainer
-ppo_config = {"batch_size": 1}
+ppo_config = {"mini_batch_size": 1, "batch_size": 1}
 config = PPOConfig(**ppo_config)
 ppo_trainer = PPOTrainer(config, model, model_ref, tokenizer)

@ -29,7 +29,7 @@ generation_kwargs = {
    "pad_token_id": tokenizer.eos_token_id,
    "max_new_tokens": 20,
 }
-response_tensor = ppo_trainer.generate([item for item in query_tensor], return_prompt=False, **generation_kwargs)
+response_tensor = ppo_trainer.generate(list(query_tensor), return_prompt=False, **generation_kwargs)
 response_txt = tokenizer.decode(response_tensor[0])

 # 5. define a reward for response
--- a/examples/multi-adapter-rl/reward_modeling.py
+++ b/examples/multi-adapter-rl/reward_modeling.py
@ -1,192 +0,0 @@
-# coding=utf-8
-# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-from dataclasses import dataclass, field
-from typing import Optional
-
-import torch
-from datasets import load_dataset
-from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training
-from transformers import (
-    AutoModelForSequenceClassification,
-    AutoTokenizer,
-    BitsAndBytesConfig,
-    HfArgumentParser,
-    TrainingArguments,
-)
-
-from trl import RewardTrainer
-
-
-########################################################################
-# This is a fully working simple example to use trl's RewardTrainer.
-#
-# This example fine-tunes any causal language model (GPT-2, GPT-Neo, etc.)
-# by using the RewardTrainer from trl, we will leverage PEFT library to finetune
-# adapters on the model.
-#
-########################################################################
-
-
-# Define and parse arguments.
-@dataclass
-class ScriptArguments:
-    """
-    These arguments vary depending on how many GPUs you have, what their capacity and features are, and what size model you want to train.
-    """
-
-    local_rank: Optional[int] = field(default=-1, metadata={"help": "Used for multi-gpu"})
-
-    per_device_train_batch_size: Optional[int] = field(default=4)
-    per_device_eval_batch_size: Optional[int] = field(default=1)
-    gradient_accumulation_steps: Optional[int] = field(default=1)
-    learning_rate: Optional[float] = field(default=2e-5)
-    weight_decay: Optional[int] = field(default=0.001)
-    max_seq_length: Optional[int] = field(default=512)
-    model_name: Optional[str] = field(
-        default="huggyllama/llama-7b",
-        metadata={
-            "help": "The model that you want to train from the Hugging Face hub. E.g. gpt2, gpt2-xl, bert, etc."
-        },
-    )
-    dataset_name: Optional[str] = field(
-        default="Anthropic/hh-rlhf",
-        metadata={"help": "The preference dataset to use."},
-    )
-    use_4bit: Optional[bool] = field(
-        default=True,
-        metadata={"help": "Activate 4bit precision base model loading"},
-    )
-    use_nested_quant: Optional[bool] = field(
-        default=False,
-        metadata={"help": "Activate nested quantization for 4bit base models"},
-    )
-    bnb_4bit_compute_dtype: Optional[str] = field(
-        default="bfloat16",
-        metadata={"help": "Compute dtype for 4bit base models"},
-    )
-    bnb_4bit_quant_type: Optional[str] = field(
-        default="nf4",
-        metadata={"help": "Quantization type fp4 or nf4"},
-    )
-    num_train_epochs: Optional[int] = field(
-        default=1,
-        metadata={"help": "The number of training epochs for the reward model."},
-    )
-
-    gradient_checkpointing: Optional[bool] = field(
-        default=True,
-        metadata={"help": "Enables gradient checkpointing."},
-    )
-    optim: Optional[str] = field(
-        default="adamw_hf",
-        metadata={"help": "The optimizer to use."},
-    )
-    lr_scheduler_type: Optional[str] = field(
-        default="linear",
-        metadata={"help": "The lr scheduler"},
-    )
-
-
-parser = HfArgumentParser(ScriptArguments)
-script_args = parser.parse_args_into_dataclasses()[0]
-
-
-def create_and_prepare_model(args):
-    bnb_config = BitsAndBytesConfig(
-        load_in_4bit=args.use_4bit,
-        bnb_4bit_quant_type=args.bnb_4bit_quant_type,
-        bnb_4bit_compute_dtype=getattr(torch, args.bnb_4bit_compute_dtype),
-        bnb_4bit_use_double_quant=args.use_nested_quant,
-    )
-
-    # TODO: make it more userfriendly
-    device_map = {"": 0}
-
-    model = AutoModelForSequenceClassification.from_pretrained(
-        args.model_name, quantization_config=bnb_config, device_map=device_map
-    )
-    model = prepare_model_for_int8_training(model)
-
-    # we add `score` to the list of modules to save to
-    # correctly save the score head.
-    peft_config = LoraConfig(
-        lora_alpha=32, lora_dropout=0.05, bias="none", task_type="SEQ_CLS", modules_to_save=["score"]
-    )
-
-    model = get_peft_model(model, peft_config)
-
-    tokenizer = AutoTokenizer.from_pretrained(script_args.model_name, use_auth_token=True)
-    tokenizer.pad_token = tokenizer.eos_token
-
-    return model, tokenizer
-
-
-def create_and_prepare_dataset(args, tokenizer, num_proc=12):
-    dataset = load_dataset(args.dataset_name, split="train[:1%]")
-    original_columns = dataset.column_names
-
-    def preprocess_function(examples):
-        new_examples = {
-            "input_ids_chosen": [],
-            "attention_mask_chosen": [],
-            "input_ids_rejected": [],
-            "attention_mask_rejected": [],
-        }
-        for chosen, rejected in zip(examples["chosen"], examples["rejected"]):
-            tokenized_chosen = tokenizer(
-                chosen, truncation=True, padding="max_length", max_length=script_args.max_seq_length
-            )
-            tokenized_rejected = tokenizer(
-                rejected, truncation=True, padding="max_length", max_length=script_args.max_seq_length
-            )
-
-            new_examples["input_ids_chosen"].append(tokenized_chosen["input_ids"])
-            new_examples["attention_mask_chosen"].append(tokenized_rejected["attention_mask"])
-            new_examples["input_ids_rejected"].append(tokenized_chosen["input_ids"])
-            new_examples["attention_mask_rejected"].append(tokenized_rejected["attention_mask"])
-
-        return new_examples
-
-    dataset = dataset.map(preprocess_function, batched=True, num_proc=num_proc, remove_columns=original_columns)
-    return dataset
-
-
-def main():
-    model, tokenizer = create_and_prepare_model(script_args)
-    dataset = create_and_prepare_dataset(script_args, tokenizer)
-
-    training_args = TrainingArguments(
-        output_dir="./output",
-        per_device_train_batch_size=script_args.per_device_train_batch_size,
-        learning_rate=script_args.learning_rate,
-        optim=script_args.optim,
-        max_steps=1,
-        lr_scheduler_type=script_args.lr_scheduler_type,
-        gradient_accumulation_steps=script_args.gradient_accumulation_steps,
-        save_steps=1,
-    )
-
-    trainer = RewardTrainer(
-        model=model,
-        args=training_args,
-        tokenizer=tokenizer,
-        train_dataset=dataset,
-    )
-
-    trainer.train()
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/notebooks/README.md
+++ b/examples/notebooks/README.md
@ -0,0 +1,7 @@
+# Notebooks
+
+This directory contains a collection of Jupyter notebooks that demonstrate how to use the TRL library in different applications.
+
+- [`best_of_n.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/best_of_n.ipynb): This notebook demonstrates how to use the "Best of N" sampling strategy using TRL when fine-tuning your model with PPO.
+- [`gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb): This notebook demonstrates how to reproduce the GPT2 imdb sentiment tuning example on a jupyter notebook.
+- [`gpt2-sentiment-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment-control.ipynb): This notebook demonstrates how to reproduce the GPT2 sentiment control example on a jupyter notebook.
--- a/examples/best_of_n_sampling/notebooks/best_of_n.ipynb
+++ b/examples/best_of_n_sampling/notebooks/best_of_n.ipynb
--- a/examples/sentiment/notebooks/gpt2-sentiment-control.ipynb
+++ b/examples/sentiment/notebooks/gpt2-sentiment-control.ipynb
@ -847,7 +847,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.9.16"
+   "version": "3.9.12"
  },
  "vscode": {
   "interpreter": {
--- a/examples/sentiment/notebooks/gpt2-sentiment.ipynb
+++ b/examples/sentiment/notebooks/gpt2-sentiment.ipynb
@ -398,7 +398,7 @@
   "metadata": {},
   "source": [
    "### Training progress\n",
-    "If you are tracking the training progress with Weights&Biases you should see a plot similar to the one below. Check out the interactive sample report on wandb.ai: [link](https://app.wandb.ai/lvwerra/trl-showcase/runs/1jtvxb1m/).\n",
+    "If you are tracking the training progress with Weights&Biases you should see a plot similar to the one below. Check out the interactive sample report on wandb.ai: [link](https://app.wandb.ai/huggingface/trl-showcase/runs/1jtvxb1m/).\n",
    "\n",
    "<div style=\"text-align: center\">\n",
    "<img src='https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/gpt2_tuning_progress.png' width='800'>\n",
--- a/examples/research_projects/README.md
+++ b/examples/research_projects/README.md
@ -0,0 +1,7 @@
+# Research projects that use TRL
+
+Welcome to the research projects folder! Here you can find the scripts used for some research projects that use TRL and are maintained by the developers and the community (LM de-toxification, Stack-Llama, etc.). Check out the READMEs in the subfolders for more information!
+
+- [Detoxifying language models](https://github.com/huggingface/trl/tree/main/examples/research_projects/toxicity)
+- [Stack-Llama](https://github.com/huggingface/trl/tree/main/examples/research_projects/stack_llama)
+- [Stack-Llama-2](https://github.com/huggingface/trl/tree/main/examples/research_projects/stack_llama_2)
--- a/examples/research_projects/stack_llama/scripts/README.md
+++ b/examples/research_projects/stack_llama/scripts/README.md
@ -0,0 +1,18 @@
+# RLHF pipeline for the creation of StackLLaMa: a Stack exchange llama-7b model.
+There were three main steps to the training process:
+1. Supervised fine-tuning of the base llama-7b model to create llama-7b-se:
+    - `torchrun --nnodes 1  --nproc_per_node 8 examples/research_projects/stack_llama/scripts/supervised_finetuning.py --model_path=<LLAMA_MODEL_PATH> --streaming --learning_rate 1e-5 --max_steps 5000 --output_dir ./llama-se`
+2. Reward modeling using dialog pairs from the SE dataset using the llama-7b-se to create llama-7b-se-rm:
+    - `torchrun --nnodes 1  --nproc_per_node 8 examples/research_projects/stack_llama/scripts/reward_modeling.py --model_name=<LLAMA_SE_MODEL>`
+3. RL fine-tuning of llama-7b-se with the llama-7b-se-rm reward model:
+    - `accelerate launch --multi_gpu --num_machines 1  --num_processes 8 examples/research_projects/stack_llama/scripts/rl_training.py --log_with=wandb --model_name=<LLAMA_SE_MODEL> --reward_model_name=<LLAMA_SE_RM_MODEL> --adafactor=False --tokenizer_name=<LLAMA_TOKENIZER> --save_freq=100 --output_max_length=128 --batch_size=8 --gradient_accumulation_steps=8 --batched_gen=True --ppo_epochs=4 --seed=0 --learning_rate=1.4e-5 --early_stopping=True --output_dir=llama-se-rl-finetune-128-8-8-1.4e-5_adam`
+
+
+LoRA layers were used at all stages to reduce memory requirements.
+At each stage the peft adapter layers were merged with the base model, using: 
+```shell
+python examples/research_projects/stack_llama/scripts/merge_peft_adapter.py --adapter_model_name=XXX --base_model_name=YYY --output_name=ZZZ
+```
+Note that this script requires `peft>=0.3.0`.
+
+For access to the base llama-7b model, please see Meta's [release](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) and [request form](https://docs.google.com/forms/d/e/1FAIpQLSfqNECQnMkycAp2jP4Z9TFX0cGR4uf7b_fBxjY_OjhJILlKGA/viewform).
--- a/examples/research_projects/stack_llama/scripts/merge_peft_adapter.py
+++ b/examples/research_projects/stack_llama/scripts/merge_peft_adapter.py
@ -9,23 +9,24 @@ from transformers import AutoModelForCausalLM, AutoModelForSequenceClassificatio
@dataclass
 class ScriptArguments:
    """
-    The name of the Casual LM model we wish to fine with PPO
+    The input names representing the Adapter and Base model fine-tuned with PEFT, and the output name representing the
+    merged model.
    """

-    adapter_model_name: Optional[str] = field(default=None, metadata={"help": "the model name"})
-    base_model_name: Optional[str] = field(default=None, metadata={"help": "the model name"})
-    output_name: Optional[str] = field(default=None, metadata={"help": "the model name"})
+    adapter_model_name: Optional[str] = field(default=None, metadata={"help": "the adapter name"})
+    base_model_name: Optional[str] = field(default=None, metadata={"help": "the base model name"})
+    output_name: Optional[str] = field(default=None, metadata={"help": "the merged model name"})


 parser = HfArgumentParser(ScriptArguments)
 script_args = parser.parse_args_into_dataclasses()[0]
 assert script_args.adapter_model_name is not None, "please provide the name of the Adapter you would like to merge"
 assert script_args.base_model_name is not None, "please provide the name of the Base model"
-assert script_args.base_model_name is not None, "please provide the output name of the merged model"
+assert script_args.output_name is not None, "please provide the output name of the merged model"

 peft_config = PeftConfig.from_pretrained(script_args.adapter_model_name)
 if peft_config.task_type == "SEQ_CLS":
-    # peft is for reward model so load sequence classification
+    # The sequence classification task is used for the reward model in PPO
    model = AutoModelForSequenceClassification.from_pretrained(
        script_args.base_model_name, num_labels=1, torch_dtype=torch.bfloat16
    )
@ -36,7 +37,7 @@ else:

 tokenizer = AutoTokenizer.from_pretrained(script_args.base_model_name)

-# Load the Lora model
+# Load the PEFT model
 model = PeftModel.from_pretrained(model, script_args.adapter_model_name)
 model.eval()

--- a/examples/research_projects/stack_llama/scripts/reward_modeling.py
+++ b/examples/research_projects/stack_llama/scripts/reward_modeling.py
@ -15,6 +15,7 @@ from transformers import (
    Trainer,
    TrainerCallback,
    TrainingArguments,
+    set_seed,
 )
 from transformers.utils import PaddingStrategy

@ -41,7 +42,7 @@ class ScriptArguments:
    per_device_eval_batch_size: Optional[int] = field(default=1)
    gradient_accumulation_steps: Optional[int] = field(default=1)
    learning_rate: Optional[float] = field(default=2e-5)
-    weight_decay: Optional[int] = field(default=0.001)
+    weight_decay: Optional[float] = field(default=0.001)
    model_name: Optional[str] = field(
        default="gpt2",
        metadata={
@ -89,11 +90,14 @@ class ScriptArguments:
        default=False,
        metadata={"help": "Whether to run eval after the first step"},
    )
+    seed: Optional[int] = field(
+        default=0, metadata={"help": "Random seed that will be set at the beginning of training."}
+    )


 parser = HfArgumentParser(ScriptArguments)
 script_args = parser.parse_args_into_dataclasses()[0]
-
+set_seed(script_args.seed)
 # Load the human stack-exchange-paired dataset for tuning the reward model.
 train_dataset = load_dataset("lvwerra/stack-exchange-paired", data_dir="data/reward", split="train")
 if script_args.train_subset > 0:
@ -129,7 +133,10 @@ training_args = TrainingArguments(
    logging_steps=10,
    optim=script_args.optim,
    lr_scheduler_type=script_args.lr_scheduler_type,
+    seed=script_args.seed,
 )
+
+
 # Load the value-head model and tokenizer.
 tokenizer_name = script_args.tokenizer_name if script_args.tokenizer_name is not None else script_args.model_name
 tokenizer = AutoTokenizer.from_pretrained(tokenizer_name, use_auth_token=True)
--- a/examples/research_projects/stack_llama/scripts/rl_training.py
+++ b/examples/research_projects/stack_llama/scripts/rl_training.py
@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2022 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@ -32,7 +31,7 @@ tqdm.pandas()
@dataclass
 class ScriptArguments:
    """
-    The name of the Casual LM model we wish to fine with PPO
+    The name of the Causal LM model we wish to fine-tune with PPO
    """

    # NOTE: gpt2 models use Conv1D instead of Linear layers which are not yet supported in 8 bit mode
@ -67,6 +66,7 @@ class ScriptArguments:
    )

    adap_kl_ctrl: Optional[bool] = field(default=True, metadata={"help": "Use adaptive KL control, otherwise linear"})
+    load_in_8bit: Optional[bool] = field(default=True, metadata={"help": "whether to load the model in 8bit"})


 parser = HfArgumentParser(ScriptArguments)
@ -92,6 +92,8 @@ config = PPOConfig(

 train_dataset = load_dataset("lvwerra/stack-exchange-paired", data_dir="data/rl", split="train")
 train_dataset = train_dataset.select(range(100000))
+original_columns = train_dataset.column_names
+
 # We then define the arguments to pass to the sentiment analysis pipeline.
 # We set `return_all_scores` to True to get the sentiment score for each token.
 sent_kwargs = {
@ -129,9 +131,6 @@ def build_dataset(
            The dataloader for the dataset.
    """

-    # load imdb with datasets
-    ds = load_dataset(dataset_name, data_dir="data/rl", split="train")
-    original_columns = ds.column_names
    num_proc = 24

    def preprocess_function(examples):
@ -164,7 +163,7 @@ dataset = build_dataset(tokenizer)


 def collator(data):
-    return dict((key, [d[key] for d in data]) for key in data[0])
+    return {key: [d[key] for d in data] for key in data[0]}


 # set seed before initializing value head for deterministic eval
@ -182,7 +181,7 @@ lora_config = LoraConfig(
 )
 model = AutoModelForCausalLMWithValueHead.from_pretrained(
    config.model_name,
-    load_in_8bit=True,
+    load_in_8bit=script_args.load_in_8bit,
    device_map={"": current_device},
    peft_config=lora_config,
 )
@ -207,9 +206,9 @@ ppo_trainer = PPOTrainer(
    optimizer=optimizer,
 )

-# We then build the sentiment analysis pipeline, passing the model name and the
-# sentiment analysis pipeline arguments. Let's also make sure to set the device
-# to the same device as the PPOTrainer.
+# We then build the sentiment analysis pipeline using our reward model, passing the
+# model name and the sentiment analysis pipeline arguments. Let's also make sure to
+# set the device to the same device as the PPOTrainer.
 device = ppo_trainer.accelerator.device
 if ppo_trainer.accelerator.num_processes == 1:
    device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a ` pipeline` bug
@ -217,11 +216,13 @@ sentiment_pipe = pipeline(
    "sentiment-analysis",
    model=reward_model_name,
    device_map={"": current_device},
-    model_kwargs={"load_in_8bit": True},
+    model_kwargs={"load_in_8bit": script_args.load_in_8bit},
    tokenizer=tokenizer,
    return_token_type_ids=False,
 )

+if sentiment_pipe.model.config.pad_token_id is None:
+    sentiment_pipe.model.config.pad_token_id = sentiment_pipe.model.config.eos_token_id
 # We then define the arguments to pass to the `generate` function. These arguments
 # are passed to the `generate` function of the PPOTrainer, which is a wrapper around
 # the `generate` function of the trained model.
@ -251,7 +252,7 @@ for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
    )
    batch["response"] = tokenizer.batch_decode(response_tensors, skip_special_tokens=True)

-    # Compute sentiment score
+    # Compute reward score (using the sentiment analysis pipeline)
    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
    rewards = [torch.tensor(output[0]["score"] - script_args.reward_baseline) for output in pipe_outputs]
--- a/examples/research_projects/stack_llama/scripts/supervised_finetuning.py
+++ b/examples/research_projects/stack_llama/scripts/supervised_finetuning.py
@ -38,9 +38,9 @@ def get_args():
    parser.add_argument("--weight_decay", type=float, default=0.05)

    parser.add_argument("--local_rank", type=int, default=0)
-    parser.add_argument("--no_fp16", action="store_false")
+    parser.add_argument("--fp16", action="store_true", default=False)
    parser.add_argument("--bf16", action="store_true", default=False)
-    parser.add_argument("--no_gradient_checkpointing", action="store_false", default=False)
+    parser.add_argument("--gradient_checkpointing", action="store_true", default=False)
    parser.add_argument("--seed", type=int, default=0)
    parser.add_argument("--num_workers", type=int, default=None)
    parser.add_argument("--output_dir", type=str, default="./checkpoints")
@ -159,8 +159,8 @@ def run_training(args, train_data, val_data):
        lr_scheduler_type=args.lr_scheduler_type,
        warmup_steps=args.num_warmup_steps,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
-        gradient_checkpointing=not args.no_gradient_checkpointing,
-        fp16=not args.no_fp16,
+        gradient_checkpointing=args.gradient_checkpointing,
+        fp16=args.fp16,
        bf16=args.bf16,
        weight_decay=args.weight_decay,
        run_name="llama-7b-finetuned",
--- a/examples/research_projects/stack_llama_2/scripts/README.md
+++ b/examples/research_projects/stack_llama_2/scripts/README.md
@ -0,0 +1,76 @@
+# DPO pipeline for the creation of StackLlaMa 2: a Stack exchange llama-v2-7b model
+
+## Prerequisites
+
+Install all the dependencies in the `requirements.txt`:
+
+```
+$ pip install -U -r requirements.txt
+```
+
+Since we will use `accelerate` for training, make sure to run:
+```
+$ accelerate config
+```
+
+## Training
+
+There were two main steps to the DPO training process:
+1. Supervised fine-tuning of the base llama-v2-7b model to create llama-v2-7b-se:
+
+    ```
+    accelerate launch examples/research_projects/stack_llama_2/scripts/sft_llama2.py \
+        --output_dir="./sft" \
+        --max_steps=500 \
+        --logging_steps=10 \
+        --save_steps=10 \
+        --per_device_train_batch_size=4 \
+        --per_device_eval_batch_size=1 \
+        --gradient_accumulation_steps=2 \
+        --gradient_checkpointing=False \
+        --group_by_length=False \
+        --learning_rate=1e-4 \
+        --lr_scheduler_type="cosine" \
+        --warmup_steps=100 \
+        --weight_decay=0.05 \
+        --optim="paged_adamw_32bit" \
+        --bf16=True \
+        --remove_unused_columns=False \
+        --run_name="sft_llama2" \
+        --report_to="wandb"
+    ```
+1. Run the DPO trainer using the model saved by the previous step:
+    ```
+    accelerate launch examples/research_projects/stack_llama_2/scripts/dpo_llama2.py \
+        --model_name_or_path="sft/final_checkpoint" \
+        --output_dir="dpo"
+    ```
+
+
+## Merging the adaptors
+
+To merge the adaptors into the base model we can use the `merge_peft_adapter.py` helper script that comes with TRL:
+
+```
+python examples/research_projects/stack_llama/scripts/merge_peft_adapter.py --base_model_name="meta-llama/Llama-2-7b-hf" --adapter_model_name="dpo/final_checkpoint/" --output_name="stack-llama-2"
+```
+
+which will also push the model to your HuggingFace hub account.
+
+## Running the model
+
+We can load the DPO-trained LoRA adaptors that were saved by the DPO training step via:
+
+```py
+import torch
+from peft import AutoPeftModelForCausalLM
+
+
+model = AutoPeftModelForCausalLM.from_pretrained(
+    "dpo/final_checkpoint",
+    low_cpu_mem_usage=True,
+    torch_dtype=torch.float16,
+    load_in_4bit=True,
+)
+
+model.generate(...)
+```
--- a/examples/research_projects/stack_llama_2/scripts/dpo_llama2.py
+++ b/examples/research_projects/stack_llama_2/scripts/dpo_llama2.py
@ -0,0 +1,240 @@
+# 0. imports
+import os
+from dataclasses import dataclass, field
+from typing import Dict, Optional
+
+import torch
+from accelerate import Accelerator
+from datasets import Dataset, load_dataset
+from peft import LoraConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser, TrainingArguments, set_seed
+
+from trl import DPOTrainer
+
+
+# Define and parse arguments.
+@dataclass
+class ScriptArguments:
+    """
+    The arguments for the DPO training script.
+    """
+
+    # data parameters
+    beta: Optional[float] = field(default=0.1, metadata={"help": "the beta parameter for DPO loss"})
+
+    # training parameters
+    model_name_or_path: Optional[str] = field(
+        default="../sft/results/final_checkpoint",
+        metadata={"help": "the location of the SFT model name or path"},
+    )
+    learning_rate: Optional[float] = field(default=5e-4, metadata={"help": "optimizer learning rate"})
+    lr_scheduler_type: Optional[str] = field(default="cosine", metadata={"help": "the lr scheduler type"})
+    warmup_steps: Optional[int] = field(default=100, metadata={"help": "the number of warmup steps"})
+    weight_decay: Optional[float] = field(default=0.05, metadata={"help": "the weight decay"})
+    optimizer_type: Optional[str] = field(default="paged_adamw_32bit", metadata={"help": "the optimizer type"})
+
+    per_device_train_batch_size: Optional[int] = field(default=4, metadata={"help": "train batch size per device"})
+    per_device_eval_batch_size: Optional[int] = field(default=1, metadata={"help": "eval batch size per device"})
+    gradient_accumulation_steps: Optional[int] = field(
+        default=4, metadata={"help": "the number of gradient accumulation steps"}
+    )
+    gradient_checkpointing: Optional[bool] = field(
+        default=True, metadata={"help": "whether to use gradient checkpointing"}
+    )
+
+    gradient_checkpointing_use_reentrant: Optional[bool] = field(
+        default=False, metadata={"help": "whether to use reentrant for gradient checkpointing"}
+    )
+
+    lora_alpha: Optional[float] = field(default=16, metadata={"help": "the lora alpha parameter"})
+    lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"})
+    lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"})
+
+    max_prompt_length: Optional[int] = field(default=512, metadata={"help": "the maximum prompt length"})
+    max_length: Optional[int] = field(default=1024, metadata={"help": "the maximum sequence length"})
+    max_steps: Optional[int] = field(default=1000, metadata={"help": "max number of training steps"})
+    logging_steps: Optional[int] = field(default=10, metadata={"help": "the logging frequency"})
+    save_steps: Optional[int] = field(default=100, metadata={"help": "the saving frequency"})
+    eval_steps: Optional[int] = field(default=100, metadata={"help": "the evaluation frequency"})
+
+    output_dir: Optional[str] = field(default="./results", metadata={"help": "the output directory"})
+    log_freq: Optional[int] = field(default=1, metadata={"help": "the logging frequency"})
+    load_in_4bit: Optional[bool] = field(default=True, metadata={"help": "whether to load the model in 4bit"})
+    model_dtype: Optional[str] = field(
+        default="float16", metadata={"help": "model_dtype[float16, bfloat16, float] for loading."}
+    )
+
+    # instrumentation
+    sanity_check: Optional[bool] = field(default=False, metadata={"help": "only train on 1000 samples"})
+    report_to: Optional[str] = field(
+        default="wandb",
+        metadata={
+            "help": 'The list of integrations to report the results and logs to. Supported platforms are `"azure_ml"`,'
+            '`"comet_ml"`, `"mlflow"`, `"neptune"`, `"tensorboard"`,`"clearml"` and `"wandb"`. '
+            'Use `"all"` to report to all integrations installed, `"none"` for no integrations.'
+        },
+    )
+    # debug argument for distributed training
+    ignore_bias_buffers: Optional[bool] = field(
+        default=False,
+        metadata={
+            "help": "fix for DDP issues with LM bias/mask buffers - invalid scalar type,`inplace operation. See"
+            "https://github.com/huggingface/transformers/issues/22482#issuecomment-1595790992"
+        },
+    )
+    seed: Optional[int] = field(
+        default=0, metadata={"help": "Random seed that will be set at the beginning of training."}
+    )
+
+
+def get_stack_exchange_paired(
+    data_dir: str = "data/rl",
+    sanity_check: bool = False,
+    cache_dir: Optional[str] = None,
+    num_proc=24,
+) -> Dataset:
+    """Load the stack-exchange-paired dataset from Hugging Face and convert it to the necessary format.
+
+    The dataset is converted to a dictionary with the following structure:
+    {
+        'prompt': List[str],
+        'chosen': List[str],
+        'rejected': List[str],
+    }
+
+    Prompts are structured as follows:
+      "Question: " + <prompt> + "\n\nAnswer: "
+    """
+    dataset = load_dataset(
+        "lvwerra/stack-exchange-paired",
+        split="train",
+        cache_dir=cache_dir,
+        data_dir=data_dir,
+    )
+    original_columns = dataset.column_names
+
+    if sanity_check:
+        dataset = dataset.select(range(min(len(dataset), 1000)))
+
+    def return_prompt_and_responses(samples) -> Dict[str, str]:
+        return {
+            "prompt": ["Question: " + question + "\n\nAnswer: " for question in samples["question"]],
+            "chosen": samples["response_j"],
+            "rejected": samples["response_k"],
+        }
+
+    return dataset.map(
+        return_prompt_and_responses,
+        batched=True,
+        num_proc=num_proc,
+        remove_columns=original_columns,
+    )
+
+
+if __name__ == "__main__":
+    parser = HfArgumentParser(ScriptArguments)
+    script_args = parser.parse_args_into_dataclasses()[0]
+
+    set_seed(script_args.seed)
+
+    # 1. load a pretrained model
+    torch_dtype = torch.float
+    if script_args.model_dtype == "float16":
+        torch_dtype = torch.float16
+    elif script_args.model_dtype == "bfloat16":
+        torch_dtype = torch.bfloat16
+
+    model = AutoModelForCausalLM.from_pretrained(
+        script_args.model_name_or_path,
+        low_cpu_mem_usage=True,
+        torch_dtype=torch_dtype,
+        load_in_4bit=script_args.load_in_4bit,
+        device_map={"": Accelerator().local_process_index},
+    )
+    model.config.use_cache = False
+
+    if script_args.ignore_bias_buffers:
+        # torch distributed hack
+        model._ddp_params_and_buffers_to_ignore = [
+            name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
+        ]
+
+    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
+    tokenizer.pad_token = tokenizer.eos_token
+
+    # 2. Load the Stack-exchange paired dataset
+    train_dataset = get_stack_exchange_paired(data_dir="data/rl", sanity_check=script_args.sanity_check)
+    train_dataset = train_dataset.filter(
+        lambda x: len(x["prompt"]) + len(x["chosen"]) <= script_args.max_length
+        and len(x["prompt"]) + len(x["rejected"]) <= script_args.max_length
+    )
+
+    # 3. Load evaluation dataset
+    eval_dataset = get_stack_exchange_paired(data_dir="data/evaluation", sanity_check=True)
+    eval_dataset = eval_dataset.filter(
+        lambda x: len(x["prompt"]) + len(x["chosen"]) <= script_args.max_length
+        and len(x["prompt"]) + len(x["rejected"]) <= script_args.max_length
+    )
+
+    # 4. initialize training arguments:
+    training_args = TrainingArguments(
+        per_device_train_batch_size=script_args.per_device_train_batch_size,
+        per_device_eval_batch_size=script_args.per_device_eval_batch_size,
+        max_steps=script_args.max_steps,
+        logging_steps=script_args.logging_steps,
+        save_steps=script_args.save_steps,
+        gradient_accumulation_steps=script_args.gradient_accumulation_steps,
+        gradient_checkpointing=script_args.gradient_checkpointing,
+        learning_rate=script_args.learning_rate,
+        evaluation_strategy="steps",
+        eval_steps=script_args.eval_steps,
+        output_dir=script_args.output_dir,
+        report_to=script_args.report_to,
+        lr_scheduler_type=script_args.lr_scheduler_type,
+        warmup_steps=script_args.warmup_steps,
+        optim=script_args.optimizer_type,
+        bf16=True,
+        remove_unused_columns=False,
+        run_name="dpo_llama2",
+        gradient_checkpointing_kwargs=dict(use_reentrant=script_args.gradient_checkpointing_use_reentrant),
+        seed=script_args.seed,
+    )
+
+    peft_config = LoraConfig(
+        r=script_args.lora_r,
+        lora_alpha=script_args.lora_alpha,
+        lora_dropout=script_args.lora_dropout,
+        target_modules=[
+            "q_proj",
+            "v_proj",
+            "k_proj",
+            "out_proj",
+            "fc_in",
+            "fc_out",
+            "wte",
+        ],
+        bias="none",
+        task_type="CAUSAL_LM",
+    )
+
+    # 5. initialize the DPO trainer
+    dpo_trainer = DPOTrainer(
+        model,
+        ref_model=None,
+        args=training_args,
+        beta=script_args.beta,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        tokenizer=tokenizer,
+        peft_config=peft_config,
+        max_prompt_length=script_args.max_prompt_length,
+        max_length=script_args.max_length,
+    )
+
+    # 6. train
+    dpo_trainer.train()
+    dpo_trainer.save_model(script_args.output_dir)
+
+    # 7. save
+    output_dir = os.path.join(script_args.output_dir, "final_checkpoint")
+    dpo_trainer.model.save_pretrained(output_dir)
--- a/examples/research_projects/stack_llama_2/scripts/requirements.txt
+++ b/examples/research_projects/stack_llama_2/scripts/requirements.txt
@ -0,0 +1,7 @@
+transformers
+trl
+peft
+accelerate
+datasets
+bitsandbytes
+wandb
--- a/examples/research_projects/stack_llama_2/scripts/sft_llama2.py
+++ b/examples/research_projects/stack_llama_2/scripts/sft_llama2.py
@ -0,0 +1,199 @@
+# Fine-Tune Llama2-7b on SE paired dataset
+import os
+from dataclasses import dataclass, field
+from typing import Optional
+
+import torch
+from accelerate import Accelerator
+from datasets import load_dataset
+from peft import AutoPeftModelForCausalLM, LoraConfig
+from tqdm import tqdm
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+    BitsAndBytesConfig,
+    HfArgumentParser,
+    TrainingArguments,
+    set_seed,
+)
+
+from trl import SFTTrainer
+from trl.import_utils import is_npu_available, is_xpu_available
+from trl.trainer import ConstantLengthDataset
+
+
+@dataclass
+class ScriptArguments:
+    """
+    Script-specific options for the SFT step; parsed together with `TrainingArguments`.
+    """
+
+    # model and dataset selection
+    model_name: Optional[str] = field(default="meta-llama/Llama-2-7b-hf", metadata={"help": "the model name"})
+    dataset_name: Optional[str] = field(default="lvwerra/stack-exchange-paired", metadata={"help": "the dataset name"})
+    subset: Optional[str] = field(default="data/finetune", metadata={"help": "the subset to use"})
+    split: Optional[str] = field(default="train", metadata={"help": "the split to use"})
+    # `size_valid_set` and `shuffle_buffer` are only read on the streaming path of `create_datasets`
+    size_valid_set: Optional[int] = field(default=4000, metadata={"help": "the size of the validation set"})
+    streaming: Optional[bool] = field(default=True, metadata={"help": "whether to stream the dataset"})
+    shuffle_buffer: Optional[int] = field(default=5000, metadata={"help": "the shuffle buffer size"})
+    seq_length: Optional[int] = field(default=1024, metadata={"help": "the sequence length"})
+    # `num_workers` is only passed to `load_dataset` when streaming is disabled
+    num_workers: Optional[int] = field(default=4, metadata={"help": "the number of workers"})
+    packing: Optional[bool] = field(default=True, metadata={"help": "whether to use packing for SFTTrainer"})
+    # when true, the base model is loaded with a 4-bit NF4 `BitsAndBytesConfig`
+    use_bnb: Optional[bool] = field(default=True, metadata={"help": "whether to use BitsAndBytes"})
+
+    # LoraConfig
+    lora_alpha: Optional[float] = field(default=16, metadata={"help": "the lora alpha parameter"})
+    lora_dropout: Optional[float] = field(default=0.05, metadata={"help": "the lora dropout parameter"})
+    lora_r: Optional[int] = field(default=8, metadata={"help": "the lora r parameter"})
+
+
+# Parse the command line into the script-specific options and the standard `TrainingArguments`.
+parser = HfArgumentParser((ScriptArguments, TrainingArguments))
+script_args, training_args = parser.parse_args_into_dataclasses()
+# LoRA adapter settings; only the `q_proj` and `v_proj` modules are targeted.
+peft_config = LoraConfig(
+    r=script_args.lora_r,
+    lora_alpha=script_args.lora_alpha,
+    lora_dropout=script_args.lora_dropout,
+    target_modules=["q_proj", "v_proj"],
+    bias="none",
+    task_type="CAUSAL_LM",
+)
+
+# Fail fast on the unsupported combination of packing and length-grouped batches.
+if training_args.group_by_length and script_args.packing:
+    raise ValueError("Cannot use both packing and group by length")
+
+# `gradient_checkpointing` was True by default until `1f3314`, but it was not actually used.
+# Enabling it reportedly fails inside `Variable._execution_engine.run_backward`, so the script
+# refuses to start when the flag is set.
+if training_args.gradient_checkpointing:
+    raise ValueError("gradient_checkpointing not supported")
+
+# Seed the RNGs before any dataset shuffling or model initialisation happens.
+set_seed(training_args.seed)
+
+
+def chars_token_ratio(dataset, tokenizer, nb_examples=400):
+    """
+    Estimate the average number of characters per token in the dataset.
+    """
+    total_characters, total_tokens = 0, 0
+    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
+        text = prepare_sample_text(example)
+        total_characters += len(text)
+        if tokenizer.is_fast:
+            total_tokens += len(tokenizer(text).tokens())
+        else:
+            total_tokens += len(tokenizer.tokenize(text))
+
+    return total_characters / total_tokens
+
+
+def print_trainable_parameters(model):
+    """
+    Prints the number of trainable parameters in the model.
+    """
+    trainable_params = 0
+    all_param = 0
+    for _, param in model.named_parameters():
+        all_param += param.numel()
+        if param.requires_grad:
+            trainable_params += param.numel()
+    print(
+        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
+    )
+
+
+def prepare_sample_text(example):
+    """Prepare the text from a sample of the dataset."""
+    text = f"Question: {example['question']}\n\nAnswer: {example['response_j']}"
+    return text
+
+
+def create_datasets(tokenizer, args, seed=None):
+    dataset = load_dataset(
+        args.dataset_name,
+        data_dir=args.subset,
+        split=args.split,
+        use_auth_token=True,
+        num_proc=args.num_workers if not args.streaming else None,
+        streaming=args.streaming,
+    )
+    if args.streaming:
+        print("Loading the dataset in streaming mode")
+        valid_data = dataset.take(args.size_valid_set)
+        train_data = dataset.skip(args.size_valid_set)
+        train_data = train_data.shuffle(buffer_size=args.shuffle_buffer, seed=seed)
+    else:
+        dataset = dataset.train_test_split(test_size=0.005, seed=seed)
+        train_data = dataset["train"]
+        valid_data = dataset["test"]
+        print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")
+
+    chars_per_token = chars_token_ratio(train_data, tokenizer)
+    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")
+
+    train_dataset = ConstantLengthDataset(
+        tokenizer,
+        train_data,
+        formatting_func=prepare_sample_text,
+        infinite=True,
+        seq_length=args.seq_length,
+        chars_per_token=chars_per_token,
+    )
+    valid_dataset = ConstantLengthDataset(
+        tokenizer,
+        valid_data,
+        formatting_func=prepare_sample_text,
+        infinite=False,
+        seq_length=args.seq_length,
+        chars_per_token=chars_per_token,
+    )
+    return train_dataset, valid_dataset
+
+
+# Optional 4-bit NF4 quantisation for the base model (bfloat16 compute dtype).
+bnb_config = None
+if script_args.use_bnb:
+    bnb_config = BitsAndBytesConfig(
+        load_in_4bit=True,
+        bnb_4bit_quant_type="nf4",
+        bnb_4bit_compute_dtype=torch.bfloat16,
+    )
+
+# Load the base model onto the device that belongs to this process.
+base_model = AutoModelForCausalLM.from_pretrained(
+    script_args.model_name,
+    quantization_config=bnb_config,
+    device_map={"": Accelerator().local_process_index},
+    trust_remote_code=True,
+    use_auth_token=True,
+)
+base_model.config.use_cache = False
+
+
+# The EOS token doubles as the padding token.
+tokenizer = AutoTokenizer.from_pretrained(script_args.model_name, trust_remote_code=True)
+tokenizer.pad_token = tokenizer.eos_token
+tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training
+
+train_dataset, eval_dataset = create_datasets(tokenizer, script_args, seed=training_args.seed)
+
+# The datasets are already `ConstantLengthDataset` instances, so `max_seq_length` is left as None.
+trainer = SFTTrainer(
+    model=base_model,
+    train_dataset=train_dataset,
+    eval_dataset=eval_dataset,
+    peft_config=peft_config,
+    packing=script_args.packing,
+    max_seq_length=None,
+    tokenizer=tokenizer,
+    args=training_args,
+)
+trainer.train()
+trainer.save_model(training_args.output_dir)
+
+# Save the trained model a second time under `<output_dir>/final_checkpoint`.
+output_dir = os.path.join(training_args.output_dir, "final_checkpoint")
+trainer.model.save_pretrained(output_dir)
+
+# Free memory for merging weights
+# NOTE(review): `trainer` still references the model (see `trainer.model` above), so this `del`
+# alone may not release the memory - confirm.
+del base_model
+if is_xpu_available():
+    torch.xpu.empty_cache()
+elif is_npu_available():
+    torch.npu.empty_cache()
+else:
+    torch.cuda.empty_cache()
+
+# Reload the saved adapter in bfloat16 and fold it into the base weights.
+model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
+model = model.merge_and_unload()
+
+# Write the merged model to `<output_dir>/final_merged_checkpoint` in safetensors format.
+output_merged_dir = os.path.join(training_args.output_dir, "final_merged_checkpoint")
+model.save_pretrained(output_merged_dir, safe_serialization=True)
--- a/examples/research_projects/tools/calculator.py
+++ b/examples/research_projects/tools/calculator.py
@ -0,0 +1,118 @@
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import re
+
+import numpy as np
+import torch
+from transformers import AutoTokenizer, load_tool
+
+from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, TextEnvironment
+
+
+def generate_data(n):
+    """Generate random arithmetic tasks and answers."""
+    tasks, answers = [], []
+    for _ in range(n):
+        a = np.random.randint(0, 50)
+        b = np.random.randint(0, 50)
+        op = np.random.choice(["-", "+", "*"])
+        tasks.append(f"\n\nWhat is {a} {op} {b}?")
+        if op == "-":
+            answers.append(a - b)
+        elif op == "+":
+            answers.append(a + b)
+        else:
+            answers.append(a * b)
+    return tasks, answers
+
+
+def exact_match_reward(responses, answers=None):
+    """Reward if generated response contains correct answer."""
+    rewards = []
+    pattern = r"Result\s*=\s*(-?\d+(?:\.\d+)?)\s*<submit>"  # generated by chatGPT
+    for response, answer in zip(responses, answers):
+        reward = 0.0
+        predicted_number = None
+        match_pattern = re.findall(pattern, response)
+        if match_pattern:
+            predicted_number = float(match_pattern[0])
+        if predicted_number is not None:
+            if np.abs(predicted_number - answer) < 0.01:
+                reward += 1.0
+        rewards.append(torch.tensor(reward))
+    return rewards
+
+
+# set up models
+# Policy and reference model are both loaded from the same `gpt2` checkpoint.
+model_id = "gpt2"
+model = AutoModelForCausalLMWithValueHead.from_pretrained(model_id)
+model_ref = AutoModelForCausalLMWithValueHead.from_pretrained(model_id)
+tokenizer = AutoTokenizer.from_pretrained(model_id)
+tokenizer.pad_token = tokenizer.eos_token
+
+# system prompt
+# Two worked examples showing the request / call / response / submit markup for the calculator tool.
+prompt = """\
+What is 13-3?
+
+<request><SimpleCalculatorTool>13-3<call>10.0<response>
+
+Result=10<submit>
+
+What is 4*3?
+
+<request><SimpleCalculatorTool>4*3<call>12.0<response>
+
+Result=12<submit>"""
+
+# Sampling settings handed to the text environment.
+# NOTE(review): `eos_token_id: -1` presumably keeps generation from stopping at EOS - confirm.
+generation_kwargs = {
+    "min_length": -1,
+    "top_k": 0.0,
+    "top_p": 1.0,
+    "do_sample": True,
+    "pad_token_id": tokenizer.eos_token_id,
+    "eos_token_id": -1,
+    "max_new_tokens": 32,
+}
+
+# trainer
+ppo_config = PPOConfig(
+    batch_size=256,
+    learning_rate=1.41e-5,
+    mini_batch_size=64,
+    log_with="wandb",
+)
+ppo_trainer = PPOTrainer(ppo_config, model, model_ref, tokenizer)
+
+# text env
+# The tool is registered under the same name that the prompt uses inside `<request>`.
+text_env = TextEnvironment(
+    model,
+    tokenizer,
+    {"SimpleCalculatorTool": load_tool("ybelkada/simple-calculator")},
+    exact_match_reward,
+    prompt,
+    generation_kwargs=generation_kwargs,
+)
+
+# main training loop
+for _step in range(100):
+    tasks, answers = generate_data(ppo_config.batch_size)
+    queries, responses, masks, rewards, histories = text_env.run(tasks, answers=answers)
+    train_stats = ppo_trainer.step(queries, responses, rewards, masks)
+
+    response_texts = [tokenizer.decode(response) for response in responses]
+    query_texts = [tokenizer.decode(query) for query in queries]
+    texts = {"query": [qt.split("<submit>")[-1].strip() for qt in query_texts], "response": response_texts}
+    ppo_trainer.log_stats(train_stats, texts, rewards, columns_to_log=["query", "response", "answer"])
+ppo_trainer.save_pretrained(model_id + "-calculator")
--- a/examples/research_projects/tools/python_interpreter.py
+++ b/examples/research_projects/tools/python_interpreter.py
@ -0,0 +1,193 @@
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+from dataclasses import dataclass, field
+from typing import Optional
+
+import numpy as np
+import torch
+from datasets import load_dataset
+from peft import LoraConfig
+from transformers import AutoTokenizer, HfArgumentParser, load_tool
+
+from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, TextEnvironment
+
+
+os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
+@dataclass
+class ScriptArguments:
+    model_name: Optional[str] = field(default="bigcode/starcoderbase", metadata={"help": "the model name"})
+    learning_rate: Optional[float] = field(default=1e-5, metadata={"help": "the learning rate"})
+    mini_batch_size: Optional[int] = field(default=1, metadata={"help": "the PPO minibatch size"})
+    batch_size: Optional[int] = field(default=32, metadata={"help": "the batch size"})
+    gradient_accumulation_steps: Optional[int] = field(
+        default=16, metadata={"help": "the number of gradient accumulation steps"}
+    )
+    max_new_tokens: Optional[int] = field(default=256, metadata={"help": "max number of generated tokens per turn"})
+    ppo_epochs: Optional[int] = field(default=1, metadata={"help": "max number of ppo epochs"})
+    n_epochs: Optional[int] = field(default=32, metadata={"help": "max number of ppo epochs"})
+
+
+# Parse the command-line flags into a single `ScriptArguments` instance.
+parser = HfArgumentParser(ScriptArguments)
+args = parser.parse_args_into_dataclasses()[0]
+
+
+def exact_match_reward(responses, answers=None):
+    """Reward if generated response contains correct answer."""
+    rewards = []
+    pattern = r"Result\s*=\s*(-?\d+(?:\.\d+)?)\s*<submit>"  # generated by chatGPT
+    for response, answer in zip(responses, answers):
+        reward = 0.0
+        try:
+            predicted_number = None
+            match_pattern = re.findall(pattern, response)
+            if match_pattern:
+                predicted_number = float(match_pattern[0])
+            if predicted_number is not None:
+                if np.abs(predicted_number - float(answer)) < 0.1:
+                    reward += 1.0
+        except Exception:
+            pass
+        rewards.append(torch.tensor(reward))
+    return rewards
+
+
+def evaluate(test_dataloader, text_env, ppo_trainer):
+    test_rewards = []
+    for test_batch in test_dataloader:
+        _, _, _, rewards, _ = text_env.run(test_batch["query"], answers=test_batch["answer"])
+        test_rewards.extend(rewards)
+    test_rewards = ppo_trainer.accelerator.gather_for_metrics(
+        torch.stack(test_rewards).to(ppo_trainer.accelerator.device)
+    )
+    return test_rewards.mean()
+
+
+# LoRA adapter configuration passed to the value-head model below.
+lora_config = LoraConfig(
+    r=16,
+    lora_alpha=32,
+    lora_dropout=0.05,
+    bias="none",
+    task_type="CAUSAL_LM",
+    target_modules=["c_proj", "c_attn", "q_attn"],
+)
+
+# set up models
+# The model is loaded in 4-bit with the LoRA config attached.
+model = AutoModelForCausalLMWithValueHead.from_pretrained(
+    args.model_name,
+    use_auth_token=True,
+    load_in_4bit=True,
+    peft_config=lora_config,
+)
+tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_auth_token=True)
+tokenizer.pad_token = tokenizer.eos_token
+
+# Training data: rename `question` to `query` and keep only the text after "#### " as the answer.
+ds = load_dataset("gsm8k", "main", split="train")
+ds = ds.rename_columns({"question": "query"})
+ds = ds.map(lambda x: {"answer": x["answer"].split("#### ")[1]})
+ds = ds.select(range(1, len(ds)))  # skip the first sample which is used in prompt
+
+# Test data gets the same column rename and answer extraction.
+ds_test = load_dataset("gsm8k", "main", split="test")
+ds_test = ds_test.rename_columns({"question": "query"})
+ds_test = ds_test.map(lambda x: {"answer": x["answer"].split("#### ")[1]})
+
+test_dataloader = torch.utils.data.DataLoader(ds_test, batch_size=args.batch_size)
+
+# prompt
+prompt = """\
+Example of using a Python API to solve math questions.
+
+Q: Olivia has $23. She bought five bagels for $3 each. How much money does she have left?
+
+<request><PythonInterpreter>
+def solution():
+    money_initial = 23
+    bagels = 5
+    bagel_cost = 3
+    money_spent = bagels * bagel_cost
+    money_left = money_initial - money_spent
+    result = money_left
+    return result
+print(solution())
+<call>72<response>
+
+Result = 72 <submit>
+
+Q: """
+
+# Sampling settings handed to the text environment.
+# NOTE(review): `eos_token_id: -1` presumably keeps generation from stopping at EOS - confirm.
+generation_kwargs = {
+    "min_length": -1,
+    "top_k": 0.0,
+    "top_p": 1.0,
+    "do_sample": True,
+    "pad_token_id": tokenizer.eos_token_id,
+    "eos_token_id": -1,
+    "max_new_tokens": args.max_new_tokens,
+}
+
+# trainer
+ppo_config = PPOConfig(
+    batch_size=args.batch_size,
+    learning_rate=args.learning_rate,
+    mini_batch_size=args.mini_batch_size,
+    ppo_epochs=args.ppo_epochs,
+    gradient_accumulation_steps=args.gradient_accumulation_steps,
+    log_with="wandb",
+    tracker_project_name="trl-gsm8k",
+    remove_unused_columns=False,
+    optimize_cuda_cache=True,
+)
+
+# No reference model is passed to the trainer here.
+ppo_trainer = PPOTrainer(config=ppo_config, model=model, tokenizer=tokenizer, dataset=ds)
+# Hand the test dataloader to the trainer's accelerator as well.
+test_dataloader = ppo_trainer.accelerator.prepare(test_dataloader)
+
+# text env
+# A single tool (the Python interpreter) with at most two turns per episode.
+text_env = TextEnvironment(
+    model,
+    tokenizer,
+    [load_tool("lvwerra/python-interpreter")],
+    exact_match_reward,
+    prompt,
+    max_turns=2,
+    generation_kwargs=generation_kwargs,
+)
+
+# main training loop
+for epoch in range(args.n_epochs):
+    for step, batch in enumerate(ppo_trainer.dataloader):
+        # The test-set evaluation runs before the first step of every fourth epoch (0, 4, 8, ...).
+        if (step == 0) and (epoch % 4 == 0):  # evaluate every 4 epochs
+            reward_mean_test = evaluate(test_dataloader, text_env, ppo_trainer)
+        else:
+            reward_mean_test = None
+
+        queries, responses, masks, rewards, histories = text_env.run(batch["query"], answers=batch["answer"])
+        train_stats = ppo_trainer.step(queries, responses, rewards, masks)
+
+        # logging
+        # The test reward is attached to the stats only on steps where it was computed.
+        if reward_mean_test is not None:
+            train_stats["env/reward_mean_test"] = reward_mean_test
+        texts = {
+            "query": batch["query"],
+            "response": [tokenizer.decode(response) for response in responses],
+            "answer": batch["answer"],
+        }
+        ppo_trainer.log_stats(train_stats, texts, rewards, columns_to_log=["query", "response", "answer"])
+
+# NOTE(review): this final evaluation result is assigned but not logged or used before saving.
+reward_mean_test = evaluate(test_dataloader, text_env, ppo_trainer)
+ppo_trainer.save_pretrained(f"model/{args.model_name}-gsm8k")
--- a/examples/research_projects/tools/triviaqa.py
+++ b/examples/research_projects/tools/triviaqa.py
@ -0,0 +1,192 @@
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from dataclasses import dataclass, field
+from typing import Optional
+
+import torch
+from datasets import load_dataset
+from peft import LoraConfig
+from transformers import AutoTokenizer, HfArgumentParser, load_tool
+
+from trl import AutoModelForCausalLMWithValueHead, PPOConfig, PPOTrainer, TextEnvironment
+
+
+os.environ["HF_ALLOW_CODE_EVAL"] = "1"
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+
+@dataclass
+class ScriptArguments:
+    # Command-line options for this TriviaQA tool-use PPO example. The dataclass is handed to
+    # HfArgumentParser further down; each field's "help" metadata describes the option.
+    model_name: Optional[str] = field(default="bigcode/starcoderbase", metadata={"help": "the model name"})
+    log_with: Optional[str] = field(default=None, metadata={"help": "use 'wandb' to log with wandb"})
+    learning_rate: Optional[float] = field(default=1e-5, metadata={"help": "the learning rate"})
+    mini_batch_size: Optional[int] = field(default=1, metadata={"help": "the PPO minibatch size"})
+    batch_size: Optional[int] = field(default=32, metadata={"help": "the batch size"})
+    gradient_accumulation_steps: Optional[int] = field(
+        default=16, metadata={"help": "the number of gradient accumulation steps"}
+    )
+    max_new_tokens: Optional[int] = field(default=256, metadata={"help": "max number of generated tokens per turn"})
+    ppo_epochs: Optional[int] = field(default=1, metadata={"help": "max number of ppo epochs"})
+    # Number of passes through the main training loop; each pass draws `batch_size` questions.
+    iterations: Optional[int] = field(default=1000, metadata={"help": "the number of iterations"})
+    # Also used to derive the per-process dataset shuffle seed.
+    seed: Optional[int] = field(default=0, metadata={"help": "the random seed"})
+
+
+# Only one dataclass is registered, so the first (and only) parsed element is the ScriptArguments instance.
+parser = HfArgumentParser(ScriptArguments)
+args = parser.parse_args_into_dataclasses()[0]
+
+# LoRA adapter settings: rank 16, alpha 32, dropout 0.05, applied to the modules named
+# c_proj, c_attn and q_attn.
+lora_config = LoraConfig(
+    r=16,
+    lora_alpha=32,
+    lora_dropout=0.05,
+    bias="none",
+    task_type="CAUSAL_LM",
+    target_modules=["c_proj", "c_attn", "q_attn"],
+)
+
+# set up models
+# The value-head model is loaded in 4-bit with the LoRA config attached.
+model = AutoModelForCausalLMWithValueHead.from_pretrained(
+    args.model_name,
+    use_auth_token=True,
+    trust_remote_code=True,
+    load_in_4bit=True,
+    peft_config=lora_config,
+)
+tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_auth_token=True)
+# Reuse the EOS token for padding.
+tokenizer.pad_token = tokenizer.eos_token
+
+# system prompt
+prompt = """\
+Answer the following question:
+
+Q: In which branch of the arts is Patricia Neary famous?
+A: Ballets
+A2: <request><Wiki>Patricia Neary<call>Patricia Neary (born October 27, 1942) is an American ballerina, choreographer and ballet director, who has been particularly active in Switzerland. She has also been a highly successful ambassador for the Balanchine Trust, bringing George Balanchine's ballets to 60 cities around the globe.<response>
+Result=Ballets<submit>
+
+Q: Who won Super Bowl XX?
+A: Chicago Bears
+A2: <request><Wiki>Super Bowl XX<call>Super Bowl XX was an American football game between the National Football Conference (NFC) champion Chicago Bears and the American Football Conference (AFC) champion New England Patriots to decide the National Football League (NFL) champion for the 1985 season. The Bears defeated the Patriots by the score of 46–10, capturing their first NFL championship (and Chicago's first overall sports victory) since 1963, three years prior to the birth of the Super Bowl. Super Bowl XX was played on January 26, 1986 at the Louisiana Superdome in New Orleans.<response>
+Result=Chicago Bears<submit>
+
+Q: """
+
+# Sampling settings passed to the text environment: pure sampling (top_k 0, top_p 1.0).
+# NOTE(review): eos_token_id is -1, presumably so the tokenizer's EOS token does not stop
+# generation and turn boundaries are left to the text environment - confirm.
+generation_kwargs = {
+    "min_length": -1,
+    "top_k": 0.0,
+    "top_p": 1.0,
+    "do_sample": True,
+    "pad_token_id": tokenizer.eos_token_id,
+    "eos_token_id": -1,
+    "max_new_tokens": args.max_new_tokens,
+}
+
+# trainer
+config = PPOConfig(
+    batch_size=args.batch_size,
+    model_name=args.model_name,
+    learning_rate=args.learning_rate,
+    log_with=args.log_with,
+    mini_batch_size=args.mini_batch_size,
+    ppo_epochs=args.ppo_epochs,
+    gradient_accumulation_steps=args.gradient_accumulation_steps,
+    seed=args.seed,
+    optimize_cuda_cache=True,
+)
+# No dataset is passed to the trainer; batches are drawn manually from `dataset` in the main loop.
+ppo_trainer = PPOTrainer(config=config, model=model, tokenizer=tokenizer)
+dataset = load_dataset("trivia_qa", "rc", split="train")
+# Offset the seed by the process index so that each process shuffles the data differently.
+local_seed = args.seed + ppo_trainer.accelerator.process_index * 100003  # Prime
+dataset = dataset.shuffle(local_seed)
+
+
+def data_generator():
+    """Yield ``(question, normalized_aliases)`` pairs from the shuffled dataset, in order.
+
+    Yields:
+        A tuple of the question string and a list of accepted (normalized) answer aliases.
+    """
+    for i in range(len(dataset)):
+        # Look the row up once instead of indexing the dataset twice per example.
+        row = dataset[i]
+        yield row["question"], list(row["answer"]["normalized_aliases"])
+
+
+# A generator object is already its own iterator, so no extra iter() call is needed.
+gen = data_generator()
+
+
+def generate_data(n):
+    """Pull the next ``n`` examples from the module-level generator.
+
+    Returns:
+        Two parallel lists: the questions and, for each question, its list of answer aliases.
+    """
+    drawn = [next(gen) for _ in range(n)]
+    tasks = [question for question, _ in drawn]
+    answers = [aliases for _, aliases in drawn]
+    return tasks, answers
+
+
+def exact_match_reward(responses, answers=None):
+    """Reward if generated response contains correct answer.
+
+    Each response is compared case-insensitively against its list of accepted answers; the
+    reward is 1.0 if any of them occurs as a substring and 0.0 otherwise.
+
+    Returns:
+        A list with one scalar ``torch.Tensor`` per response.
+    """
+    rewards = []
+    for response, accepted in zip(responses, answers):
+        hit = any(candidate.lower() in response.lower() for candidate in accepted)
+        rewards.append(torch.tensor(1.0 if hit else 0.0))
+    return rewards
+
+
+def tool_fn(x):
+    """Run the wiki tool on ``x`` and return at most 600 characters of the second line of its output."""
+    # limit the amount of tokens
+    output_lines = tool(x).split("\n")
+    return output_lines[1][:600]
+
+
+# text env
+# `tool_fn` above looks up the name `tool` when it is called, so defining it here, after the
+# function, works.
+tool = load_tool("vwxyzjn/pyserini-wikipedia-kilt-doc")
+
+# The tool is registered under the name "Wiki", the same tag used in the few-shot prompt.
+# NOTE(review): the keyword is spelled `max_tool_reponse`; presumably this matches the parameter
+# name in the TextEnvironment of this TRL version - confirm before "fixing" the spelling.
+text_env = TextEnvironment(
+    model,
+    tokenizer,
+    {"Wiki": tool_fn},
+    exact_match_reward,
+    prompt,
+    generation_kwargs=generation_kwargs,
+    max_tool_reponse=400,
+)
+
+
+def print_trainable_parameters(model):
+    """Print the number of trainable parameters, the total number, and the trainable percentage."""
+    counts = [(param.numel(), param.requires_grad) for _, param in model.named_parameters()]
+    all_param = sum(size for size, _ in counts)
+    trainable_params = sum(size for size, needs_grad in counts if needs_grad)
+    print(
+        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
+    )
+
+
+print_trainable_parameters(model)
+# main training loop
+for i in range(args.iterations):
+    # Draw one batch of questions and answer aliases, roll out the environment, then do a PPO step.
+    tasks, answers = generate_data(config.batch_size)
+    queries, responses, masks, rewards, histories = text_env.run(tasks, answers=answers)
+    train_stats = ppo_trainer.step(queries, responses, rewards, masks)
+    response_texts = [tokenizer.decode(response) for response in responses]
+    query_texts = [tokenizer.decode(query) for query in queries]
+    texts = {
+        # Keep only the text after the last "<submit>" marker, i.e. drop the few-shot examples
+        # of the system prompt from the logged query.
+        "query": [qt.split("<submit>")[-1].strip() for qt in query_texts],
+        "response": response_texts,
+        "answer": [", ".join(item) for item in answers],
+    }
+    # Collect the rewards from all processes so the logged statistics cover the whole batch.
+    all_rewards = ppo_trainer.accelerator.gather(torch.tensor(rewards, device=ppo_trainer.accelerator.device))
+    ppo_trainer.log_stats(train_stats, texts, list(all_rewards), columns_to_log=["query", "response", "answer"])
+    # Checkpoint every 100 iterations, including iteration 0.
+    if i % 100 == 0:
+        ppo_trainer.save_pretrained(f"models/{args.model_name}_{args.seed}_{i}_triviaqa")
--- a/examples/research_projects/toxicity/README.md
+++ b/examples/research_projects/toxicity/README.md
@ -3,5 +3,5 @@
 To run this code, do the following:

 ```shell
-ACCELERATE_LOG_LEVEL=info accelerate launch --config_file {CONFIG} examples/toxicity/scripts/gpt-j-6b-toxicity.py --log_with wandb
-```
+ACCELERATE_LOG_LEVEL=info accelerate launch --config_file {CONFIG} examples/research_projects/toxicity/scripts/gpt-j-6b-toxicity.py --log_with wandb
+```
--- a/examples/research_projects/toxicity/scripts/evaluate-toxicity.py
+++ b/examples/research_projects/toxicity/scripts/evaluate-toxicity.py
@ -8,6 +8,8 @@ from datasets import load_dataset
 from tqdm import tqdm
 from transformers import AutoModelForCausalLM, AutoTokenizer

+from trl.import_utils import is_npu_available, is_xpu_available
+

 toxicity = evaluate.load("ybelkada/toxicity", "DaNLP/da-electra-hatespeech-detection", module_type="measurement")
 ds = load_dataset("OxAISH-AL-LLM/wiki_toxic", split="test")
@ -50,7 +52,12 @@ BATCH_SIZE = args.batch_size
 output_file = args.output_file
 max_new_tokens = args.max_new_tokens
 context_length = args.context_length
-device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu"
+if is_xpu_available():
+    device = torch.xpu.current_device()
+elif is_npu_available():
+    device = torch.npu.current_device()
+else:
+    device = torch.cuda.current_device() if torch.cuda.is_available() else "cpu"

 # consider only toxic prompts
 ds = ds.filter(lambda x: x["label"] == 1)
@ -116,7 +123,12 @@ for model_id in tqdm(MODELS_TO_TEST):
    print(f"Model: {model_id} - Mean: {mean} - Std: {std}")

    model = None
-    torch.cuda.empty_cache()
+    if is_xpu_available():
+        torch.xpu.empty_cache()
+    elif is_npu_available():
+        torch.npu.empty_cache()
+    else:
+        torch.cuda.empty_cache()

 # close file
 file.close()
--- a/examples/research_projects/toxicity/scripts/gpt-j-6b-toxicity.py
+++ b/examples/research_projects/toxicity/scripts/gpt-j-6b-toxicity.py
@ -1,4 +1,3 @@
-# coding=utf-8
 # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@ -59,7 +58,7 @@ tqdm.pandas()
@dataclass
 class ScriptArguments:
    """
-    The name of the Casual LM model we wish to fine with PPO
+    The name of the Causal LM model we wish to fine-tune with PPO
    """

    # NOTE: gpt2 models use Conv1D instead of Linear layers which are not yet supported in 8 bit mode
@ -146,7 +145,7 @@ dataset = build_dataset(config, input_min_text_length=min_input_length, input_ma


 def collator(data):
-    return dict((key, [d[key] for d in data]) for key in data[0])
+    return {key: [d[key] for d in data] for key in data[0]}


 # set seed before initializing value head for deterministic eval
@ -218,7 +217,7 @@ for epoch, batch in tqdm(enumerate(ppo_trainer.dataloader)):
        response_tensors.append(response.squeeze()[-gen_len:])
    batch["response"] = [tokenizer.decode(r.squeeze()) for r in response_tensors]

-    # Compute sentiment score # noqa
+    # Compute sentiment score
    texts = batch["response"]
    toxicity_inputs = toxicity_tokenizer(texts, padding=True, truncation=True, return_tensors="pt").to(
        ppo_trainer.accelerator.device
--- a/examples/scripts/chat.py
+++ b/examples/scripts/chat.py
@ -0,0 +1,338 @@
+# flake8: noqa
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from trl.commands.cli_utils import init_zero_verbose
+
+init_zero_verbose()
+
+import copy
+import json
+import os
+import pwd
+import re
+import time
+from threading import Thread
+
+import torch
+from rich.console import Console
+from rich.live import Live
+from rich.markdown import Markdown
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+
+from trl.commands.cli_utils import ChatArguments, TrlParser, init_zero_verbose
+from trl.trainer.utils import get_kbit_device_map, get_quantization_config
+
+
+# Markdown help text shown by the "help" command. Keep it in sync with the commands handled in
+# `chat_cli` and with the file format written by `save_chat` (JSON).
+HELP_STRING = """\
+
+**TRL CHAT INTERFACE**
+
+The chat interface is a simple tool to try out a chat model.
+
+Besides talking to the model there are several commands:
+- **help**: shows this help message
+- **clear**: clears the current conversation and starts a new one
+- **example {NAME}**: load example named `{NAME}` from the config and use it as the user input
+- **set {SETTING_NAME}={SETTING_VALUE};**: change the system prompt or generation settings (multiple settings are separated by a ';').
+- **reset**: same as clear but also resets the generation configs to defaults if they have been changed by **set**
+- **save {SAVE_NAME} (optional)**: save the current chat and settings to file by default to `./chat_history/{MODEL_NAME}/chat_{DATETIME}.json` or `{SAVE_NAME}` if provided
+- **exit**: closes the interface
+"""
+
+# Names of the generation settings this script forwards to `model.generate`.
+# NOTE(review): this list is not referenced anywhere in the code shown here - confirm it is still needed.
+SUPPORTED_GENERATION_KWARGS = [
+    "max_new_tokens",
+    "do_sample",
+    "num_beams",
+    "temperature",
+    "top_p",
+    "top_k",
+    "repetition_penalty",
+]
+
+# Matches a complete "set" command: the word "set", then one or more NAME=VALUE pairs separated
+# by ';'. Names consist of letters, whitespace and underscores. The value character class does
+# not contain ';', so a value can never include the separator.
+SETTING_RE = r"^set\s+[A-Za-z\s_]+=[A-Za-z\d\s.!\"#$%&'()*+,-/:<=>?@\[\]^_`{|}~]+(?:;\s*[A-Za-z\s_]+=[A-Za-z\d\s.!\"#$%&'()*+,-/:<=>?@\[\]^_`{|}~]+)*$"
+
+
+class RichInterface:
+    """Console front end for the chat loop, built on a rich ``Console``."""
+
+    def __init__(self, model_name=None, user_name=None):
+        self._console = Console()
+        # Fall back to generic role names when none are given.
+        self.model_name = "assistant" if model_name is None else model_name
+        self.user_name = "user" if user_name is None else user_name
+
+    def stream_output(self, output_stream):
+        """Stream output from a role.
+
+        Renders the text accumulated so far as Markdown every time a new chunk arrives and
+        returns the complete text once the stream is exhausted.
+        """
+        # This method is originally from the FastChat CLI: https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/cli.py
+        text = ""
+        self._console.print(f"[bold blue]<{self.model_name}>:")
+        with Live(console=self._console, refresh_per_second=4) as live:
+            for index, chunk in enumerate(output_stream):
+                # The first chunk of the stream and any empty chunk are skipped.
+                if index == 0 or not chunk:
+                    continue
+                text += chunk
+                # Standard Markdown renders a single "\n" inside a paragraph as a space, while
+                # chat models use it as a real line break. As a workaround every line gets two
+                # trailing spaces (a Markdown hard break). Lines starting with a code fence are
+                # left without them, because trailing spaces there would break syntax
+                # highlighting. Lines inside a code block do get the extra spaces, which is
+                # harmless for console output.
+                rendered = "".join(
+                    line + ("\n" if line.startswith("```") else "  \n") for line in text.splitlines()
+                )
+                live.update(Markdown(rendered.strip(), code_theme="github-dark"))
+        self._console.print()
+        return text
+
+    def input(self):
+        """Prompt the user for a line of input and return it."""
+        user_text = self._console.input(f"[bold red]<{self.user_name}>:\n")
+        self._console.print()
+        return user_text
+
+    def clear(self):
+        """Clear the console."""
+        self._console.clear()
+
+    def print_user_message(self, text):
+        """Print ``text`` as if the user had typed it."""
+        self._console.print(f"[bold red]<{self.user_name}>:[/ bold red]\n{text}")
+        self._console.print()
+
+    def print_green(self, text):
+        """Print ``text`` in bold green, followed by a blank line."""
+        self._console.print(f"[bold green]{text}")
+        self._console.print()
+
+    def print_red(self, text):
+        """Print ``text`` in bold red, followed by a blank line."""
+        self._console.print(f"[bold red]{text}")
+        self._console.print()
+
+    def print_help(self):
+        """Render the help text as Markdown."""
+        self._console.print(Markdown(HELP_STRING))
+        self._console.print()
+
+
+def get_username():
+    """Return the login name of the user owning the current process (looked up via ``pwd``)."""
+    passwd_entry = pwd.getpwuid(os.getuid())
+    return passwd_entry.pw_name
+
+
+def create_default_filename(model_name):
+    """Build the default relative save path ``{model_name}/chat_{timestamp}.json``.
+
+    The timestamp is the current local time formatted as ``YYYY-MM-DD_HH-MM-SS``.
+    """
+    stamp = time.strftime("%Y-%m-%d_%H-%M-%S")
+    return f"{model_name}/chat_{stamp}.json"
+
+
+def save_chat(chat, args, filename):
+    """Write the chat history and the current settings to a JSON file.
+
+    Args:
+        chat: The list of chat messages to store under ``"chat_history"``.
+        args: The arguments object; ``vars(args)`` is stored under ``"settings"``.
+        filename: Target path. When ``None``, a timestamped default name below
+            ``args.save_folder`` is used.
+
+    Returns:
+        The absolute path of the file that was written.
+    """
+    output_dict = {"settings": vars(args), "chat_history": chat}
+
+    if filename is None:
+        filename = os.path.join(args.save_folder, create_default_filename(args.model_name_or_path))
+
+    # A bare file name such as "chat.json" has no directory part, and os.makedirs("") raises
+    # FileNotFoundError, so only create the parent directory when there is one.
+    parent_dir = os.path.dirname(filename)
+    if parent_dir:
+        os.makedirs(parent_dir, exist_ok=True)
+
+    with open(filename, "w") as f:
+        json.dump(output_dict, f, indent=4)
+    return os.path.abspath(filename)
+
+
+def clear_chat_history(system_prompt):
+    """Return a fresh chat history: empty, or holding only the system message when a prompt is given."""
+    return [] if system_prompt is None else [{"role": "system", "content": system_prompt}]
+
+
+def parse_settings(user_input, current_args, interface):
+    """Parse a ``set NAME=VALUE; NAME=VALUE ...`` command and apply it to ``current_args``.
+
+    Each value is cast to the type of the attribute it replaces. The command is applied
+    atomically: if any name is unknown or any value cannot be cast, nothing is changed.
+
+    Args:
+        user_input: The raw command, starting with ``"set "``.
+        current_args: The arguments object whose attributes are updated in place.
+        interface: Object providing ``print_red`` / ``print_green`` for user feedback.
+
+    Returns:
+        A tuple ``(current_args, success)`` where ``success`` is ``False`` when the command
+        was rejected.
+    """
+    # Drop the leading "set " and split the remainder into NAME=VALUE chunks.
+    settings = {}
+    for chunk in user_input[4:].strip().split(";"):
+        name, _, value = chunk.partition("=")
+        # The command syntax allows whitespace after ';', so names must be stripped to match attributes.
+        settings[name.strip()] = value
+    error = False
+
+    for name in settings:
+        if not hasattr(current_args, name):
+            interface.print_red(f"There is no '{name}' setting.")
+            error = True
+            continue
+
+        current_value = getattr(current_args, name)
+        try:
+            if isinstance(current_value, bool):
+                # bool("False") is True, so booleans are parsed explicitly.
+                flag = settings[name].strip()
+                if flag == "True":
+                    settings[name] = True
+                elif flag == "False":
+                    settings[name] = False
+                else:
+                    raise ValueError
+            elif current_value is not None:
+                settings[name] = type(current_value)(settings[name])
+            # When the current value is None there is no type to cast to, so the raw string is
+            # kept. NOTE(review): assumes None-valued settings (e.g. an unset system prompt)
+            # are string-typed - confirm against the arguments dataclass.
+        except (ValueError, TypeError):
+            interface.print_red(f"Cannot cast setting {name} (={settings[name]}) to {type(current_value)}.")
+            error = True
+
+    if error:
+        interface.print_red("There was an issue parsing the settings. No settings have been changed.")
+        return current_args, False
+    else:
+        for name in settings:
+            setattr(current_args, name, settings[name])
+            interface.print_green(f"Set {name} to {settings[name]}.")
+
+        time.sleep(1.5)  # so the user has time to read the changes
+        return current_args, True
+
+
+def load_model_and_tokenizer(args):
+    """Load the chat model and its tokenizer as described by ``args``.
+
+    The tokenizer is loaded with the same ``revision`` and ``trust_remote_code`` settings as
+    the model, so both come from the same snapshot of the repository and models that ship a
+    custom tokenizer can be loaded.
+
+    Returns:
+        A tuple ``(model, tokenizer)``.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.model_name_or_path,
+        revision=args.model_revision,
+        trust_remote_code=args.trust_remote_code,
+    )
+
+    # "auto" and None are passed through unchanged; any other value names a torch dtype attribute.
+    torch_dtype = args.torch_dtype if args.torch_dtype in ["auto", None] else getattr(torch, args.torch_dtype)
+    quantization_config = get_quantization_config(args)
+    model_kwargs = dict(
+        revision=args.model_revision,
+        trust_remote_code=args.trust_remote_code,
+        attn_implementation=args.attn_implementation,
+        torch_dtype=torch_dtype,
+        device_map=get_kbit_device_map() if quantization_config is not None else None,
+        quantization_config=quantization_config,
+    )
+    model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, **model_kwargs)
+
+    # Without a device map the model has not been placed yet, so move it to the requested device.
+    if getattr(model, "hf_device_map", None) is None:
+        model = model.to(args.device)
+
+    return model, tokenizer
+
+
+def chat_cli():
+    """Entry point of the interactive chat interface.
+
+    Parses the command-line arguments (optionally merged with a YAML config), loads the model
+    and tokenizer, and then loops: read a line of user input, handle it as one of the built-in
+    commands (clear, help, exit, reset, save, set, example), or otherwise send it to the model
+    and stream the reply. The loop ends on "exit" or Ctrl-C.
+    """
+    parser = TrlParser(ChatArguments)
+    args = parser.parse_args_into_dataclasses()[0]
+    if args.config == "default":
+        args.config = os.path.join(os.path.dirname(__file__), "config/default_chat_config.yaml")
+    if args.config.lower() == "none":
+        args.config = None
+    args = parser.update_dataclasses_with_config([args])[0]
+    if args.examples is None:
+        args.examples = {}
+
+    # `args` keeps the startup values so "reset" can restore them; `current_args` is what "set" changes.
+    current_args = copy.deepcopy(args)
+
+    if args.user is None:
+        user = get_username()
+    else:
+        user = args.user
+
+    model, tokenizer = load_model_and_tokenizer(args)
+    generation_streamer = TextIteratorStreamer(tokenizer, skip_special_tokens=True)
+
+    interface = RichInterface(model_name=args.model_name_or_path, user_name=user)
+    interface.clear()
+    chat = clear_chat_history(current_args.system_prompt)
+    while True:
+        try:
+            user_input = interface.input()
+
+            if user_input == "clear":
+                chat = clear_chat_history(current_args.system_prompt)
+                interface.clear()
+                continue
+
+            if user_input == "help":
+                interface.print_help()
+                continue
+
+            if user_input == "exit":
+                break
+
+            if user_input == "reset":
+                interface.clear()
+                current_args = copy.deepcopy(args)
+                chat = clear_chat_history(current_args.system_prompt)
+                continue
+
+            # "save" or "save {SAVE_NAME}". The command word must match exactly so that ordinary
+            # messages such as "saved it" are not mistaken for a save command.
+            split_input = user_input.split()
+            if split_input and split_input[0] == "save" and len(split_input) <= 2:
+                if len(split_input) == 2:
+                    filename = split_input[1]
+                else:
+                    filename = None
+                filename = save_chat(chat, current_args, filename)
+                interface.print_green(f"Chat saved in {filename}!")
+                continue
+
+            if re.match(SETTING_RE, user_input):
+                current_args, success = parse_settings(user_input, current_args, interface)
+                if success:
+                    # NOTE(review): the history restarts empty here, without the system prompt
+                    # that "clear" and "reset" re-insert - confirm this is intended.
+                    chat = []
+                    interface.clear()
+                # A "set" command is never a chat message: even when it could not be applied,
+                # return to the prompt instead of forwarding the command text to the model.
+                continue
+
+            if user_input.startswith("example") and len(user_input.split()) == 2:
+                example_name = user_input.split()[1]
+                if example_name in current_args.examples:
+                    interface.clear()
+                    chat = []
+                    interface.print_user_message(current_args.examples[example_name]["text"])
+                    # The example text replaces the command and is sent to the model below.
+                    user_input = current_args.examples[example_name]["text"]
+                else:
+                    interface.print_red(
+                        f"Example {example_name} not found in list of available examples: {list(current_args.examples.keys())}."
+                    )
+                    continue
+
+            chat.append({"role": "user", "content": user_input})
+
+            generation_kwargs = dict(
+                inputs=tokenizer.apply_chat_template(chat, return_tensors="pt", add_generation_prompt=True).to(
+                    model.device
+                ),
+                streamer=generation_streamer,
+                max_new_tokens=current_args.max_new_tokens,
+                do_sample=current_args.do_sample,
+                num_beams=current_args.num_beams,
+                temperature=current_args.temperature,
+                top_k=current_args.top_k,
+                top_p=current_args.top_p,
+                repetition_penalty=current_args.repetition_penalty,
+            )
+
+            # Generation runs in a background thread while this thread renders the streamed text.
+            thread = Thread(target=model.generate, kwargs=generation_kwargs)
+            thread.start()
+            model_output = interface.stream_output(generation_streamer)
+            thread.join()
+            chat.append({"role": "assistant", "content": model_output})
+
+        except KeyboardInterrupt:
+            break
+
+
+if __name__ == "__main__":
+    chat_cli()
--- a/examples/scripts/config/default_chat_config.yaml
+++ b/examples/scripts/config/default_chat_config.yaml
@ -0,0 +1,13 @@
+# Named example prompts for the chat CLI; load one with the "example {NAME}" command.
+examples:
+  llama:
+    text: There is a Llama in my lawn, how can I get rid of it?
+  code:
+    text: Write a Python function that integrates any Python function f(x) numerically over an arbitrary interval [x_start, x_end].
+  helicopter:
+    text: How many helicopters can a human eat in one sitting?
+  numbers:
+    text: Count to 10 but skip every number ending with an 'e'
+  birds:
+    text: Why aren't birds real?
+  socks:
+    text: Why is it important to eat socks after meditating?
--- a/examples/scripts/ddpo.py
+++ b/examples/scripts/ddpo.py
@ -0,0 +1,210 @@
+# Copyright 2023 metric-space, The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+python examples/scripts/ddpo.py \
+    --num_epochs=200 \
+    --train_gradient_accumulation_steps=1 \
+    --sample_num_steps=50 \
+    --sample_batch_size=6 \
+    --train_batch_size=3 \
+    --sample_num_batches_per_epoch=4 \
+    --per_prompt_stat_tracking=True \
+    --per_prompt_stat_tracking_buffer_size=32 \
+    --tracker_project_name="stable_diffusion_training" \
+    --log_with="wandb"
+"""
+import os
+from dataclasses import dataclass, field
+
+import numpy as np
+import torch
+import torch.nn as nn
+from huggingface_hub import hf_hub_download
+from huggingface_hub.utils import EntryNotFoundError
+from transformers import CLIPModel, CLIPProcessor, HfArgumentParser
+
+from trl import DDPOConfig, DDPOTrainer, DefaultDDPOStableDiffusionPipeline
+from trl.import_utils import is_npu_available, is_xpu_available
+
+
+@dataclass
+class ScriptArguments:
+    # Script-specific command-line options for the DDPO example; they are parsed together with
+    # DDPOConfig in the `__main__` block. Each field's "help" metadata describes the option.
+    pretrained_model: str = field(
+        default="runwayml/stable-diffusion-v1-5", metadata={"help": "the pretrained model to use"}
+    )
+    pretrained_revision: str = field(default="main", metadata={"help": "the pretrained model revision to use"})
+    # Repository the fine-tuned model is pushed to after training.
+    hf_hub_model_id: str = field(
+        default="ddpo-finetuned-stable-diffusion", metadata={"help": "HuggingFace repo to save model weights to"}
+    )
+    # Repository and file name of the aesthetic-scorer weights loaded by AestheticScorer.
+    hf_hub_aesthetic_model_id: str = field(
+        default="trl-lib/ddpo-aesthetic-predictor",
+        metadata={"help": "HuggingFace model ID for aesthetic scorer model weights"},
+    )
+    hf_hub_aesthetic_model_filename: str = field(
+        default="aesthetic-model.pth",
+        metadata={"help": "HuggingFace model filename for aesthetic scorer model weights"},
+    )
+    use_lora: bool = field(default=True, metadata={"help": "Whether to use LoRA."})
+
+
+class MLP(nn.Module):
+    # Scoring head: a stack of linear layers with dropout in between and no non-linear
+    # activations, mapping a 768-dimensional embedding to a single value. The layer order
+    # inside `self.layers` must stay as is: AestheticScorer loads a saved state dict into
+    # this module.
+    def __init__(self):
+        super().__init__()
+        self.layers = nn.Sequential(
+            nn.Linear(768, 1024),
+            nn.Dropout(0.2),
+            nn.Linear(1024, 128),
+            nn.Dropout(0.2),
+            nn.Linear(128, 64),
+            nn.Dropout(0.1),
+            nn.Linear(64, 16),
+            nn.Linear(16, 1),
+        )
+
+    # Inference only: the forward pass never records gradients.
+    @torch.no_grad()
+    def forward(self, embed):
+        return self.layers(embed)
+
+
+class AestheticScorer(torch.nn.Module):
+    """
+    This model attempts to predict the aesthetic score of an image. The aesthetic score
+    is a numerical approximation of how much a specific image is liked by humans on average.
+    This is from https://github.com/christophschuhmann/improved-aesthetic-predictor
+    """
+
+    def __init__(self, *, dtype, model_id, model_filename):
+        super().__init__()
+        self.clip = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
+        self.processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
+        self.mlp = MLP()
+        # Try the Hub first; if the file is not found there, treat `model_id` as a local
+        # directory containing `model_filename`.
+        try:
+            cached_path = hf_hub_download(model_id, model_filename)
+        except EntryNotFoundError:
+            cached_path = os.path.join(model_id, model_filename)
+        # NOTE(review): torch.load unpickles the file, so only point this at trusted weights.
+        state_dict = torch.load(cached_path, map_location=torch.device("cpu"))
+        self.mlp.load_state_dict(state_dict)
+        self.dtype = dtype
+        # Switch to eval mode (this disables the dropout layers of the MLP).
+        self.eval()
+
+    # `__call__` is overridden directly instead of defining `forward`; scoring runs without gradients.
+    @torch.no_grad()
+    def __call__(self, images):
+        # Run on whatever device the module's parameters currently live on.
+        device = next(self.parameters()).device
+        inputs = self.processor(images=images, return_tensors="pt")
+        inputs = {k: v.to(self.dtype).to(device) for k, v in inputs.items()}
+        embed = self.clip.get_image_features(**inputs)
+        # normalize embedding
+        embed = embed / torch.linalg.vector_norm(embed, dim=-1, keepdim=True)
+        # Drop the trailing size-1 dimension produced by the MLP's final layer.
+        return self.mlp(embed).squeeze(1)
+
+
+def aesthetic_scorer(hub_model_id, model_filename):
+    """Build a reward function that scores images with the aesthetic predictor.
+
+    The scorer is created in float32 and moved to an NPU if available, else an XPU if
+    available, else CUDA.
+
+    Returns:
+        A callable ``(images, prompts, metadata) -> (scores, {})``.
+    """
+    scorer = AestheticScorer(dtype=torch.float32, model_id=hub_model_id, model_filename=model_filename)
+
+    if is_npu_available():
+        scorer = scorer.npu()
+    elif is_xpu_available():
+        scorer = scorer.xpu()
+    else:
+        scorer = scorer.cuda()
+
+    def _fn(images, prompts, metadata):
+        # Scale to the 0-255 range and convert to uint8 before scoring.
+        as_uint8 = (images * 255).round().clamp(0, 255).to(torch.uint8)
+        return scorer(as_uint8), {}
+
+    return _fn
+
+
+# list of example prompts to feed stable diffusion
+animals = [
+    "cat",
+    "dog",
+    "horse",
+    "monkey",
+    "rabbit",
+    "zebra",
+    "spider",
+    "bird",
+    "sheep",
+    "deer",
+    "cow",
+    "goat",
+    "lion",
+    "frog",
+    "chicken",
+    "duck",
+    "goose",
+    "bee",
+    "pig",
+    "turkey",
+    "fly",
+    "llama",
+    "camel",
+    "bat",
+    "gorilla",
+    "hedgehog",
+    "kangaroo",
+]
+
+
+def prompt_fn():
+    # Pick one entry of `animals` at random as the prompt; the second element is an empty
+    # metadata dict.
+    return np.random.choice(animals), {}
+
+
+def image_outputs_logger(image_data, global_step, accelerate_logger):
+    """Log the images of the last sampled batch, keyed by (truncated) prompt and reward."""
+    # For the sake of this example only the final entry of `image_data` is logged.
+    images, prompts, _, rewards, _ = image_data[-1]
+
+    result = {}
+    for position, image in enumerate(images):
+        prompt, reward = prompts[position], rewards[position].item()
+        # Key: first 25 characters of the prompt plus the reward with two decimals.
+        result[f"{prompt:.25} | {reward:.2f}"] = image.unsqueeze(0).float()
+
+    accelerate_logger.log_images(result, step=global_step)
+
+
+if __name__ == "__main__":
+    # Parse the script options and the DDPO training config from the command line.
+    parser = HfArgumentParser((ScriptArguments, DDPOConfig))
+    args, ddpo_config = parser.parse_args_into_dataclasses()
+    # These project settings are hard-coded here and replace whatever `project_kwargs` held before.
+    ddpo_config.project_kwargs = {
+        "logging_dir": "./logs",
+        "automatic_checkpoint_naming": True,
+        "total_limit": 5,
+        "project_dir": "./save",
+    }
+
+    pipeline = DefaultDDPOStableDiffusionPipeline(
+        args.pretrained_model, pretrained_model_revision=args.pretrained_revision, use_lora=args.use_lora
+    )
+
+    # The trainer receives, in order: the config, the reward function (aesthetic score), the
+    # prompt sampler and the pipeline; sampled images are logged through `image_outputs_logger`.
+    trainer = DDPOTrainer(
+        ddpo_config,
+        aesthetic_scorer(args.hf_hub_aesthetic_model_id, args.hf_hub_aesthetic_model_filename),
+        prompt_fn,
+        pipeline,
+        image_samples_hook=image_outputs_logger,
+    )
+
+    trainer.train()
+
+    trainer.push_to_hub(args.hf_hub_model_id)
--- a/examples/scripts/dpo.py
+++ b/examples/scripts/dpo.py
@ -0,0 +1,166 @@
+# flake8: noqa
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+# regular:
+python examples/scripts/dpo.py \
+    --model_name_or_path=gpt2 \
+    --per_device_train_batch_size 4 \
+    --max_steps 1000 \
+    --learning_rate 1e-3 \
+    --gradient_accumulation_steps 1 \
+    --logging_steps 10 \
+    --eval_steps 500 \
+    --output_dir="dpo_anthropic_hh" \
+    --warmup_steps 150 \
+    --report_to wandb \
+    --bf16 \
+    --logging_first_step \
+    --no_remove_unused_columns
+
+# peft:
+python examples/scripts/dpo.py \
+    --model_name_or_path=gpt2 \
+    --per_device_train_batch_size 4 \
+    --max_steps 1000 \
+    --learning_rate 1e-3 \
+    --gradient_accumulation_steps 1 \
+    --logging_steps 10 \
+    --eval_steps 500 \
+    --output_dir="dpo_anthropic_hh" \
+    --optim rmsprop \
+    --warmup_steps 150 \
+    --report_to wandb \
+    --bf16 \
+    --logging_first_step \
+    --no_remove_unused_columns \
+    --use_peft \
+    --lora_r=16 \
+    --lora_alpha=16
+"""
+import logging
+import os
+from contextlib import nullcontext
+
+def _env_flag(name):
+    """Return True when environment variable `name` holds a truthy value.
+
+    An unset or empty variable and the explicit negatives "0", "false", "no"
+    and "off" (case-insensitive, surrounding whitespace ignored) count as
+    False; every other value counts as True.
+    """
+    return os.environ.get(name, "").strip().lower() not in {"", "0", "false", "no", "off"}
+
+
+# `os.environ.get("TRL_USE_RICH", False)` handed back the raw string, so
+# `TRL_USE_RICH=0` or `TRL_USE_RICH=false` still switched rich output on
+# (any non-empty string is truthy). Parse the value explicitly instead.
+TRL_USE_RICH = _env_flag("TRL_USE_RICH")
+
+from trl.commands.cli_utils import DpoScriptArguments, init_zero_verbose, TrlParser
+
+if TRL_USE_RICH:
+    # NOTE(review): `init_zero_verbose` presumably silences the default
+    # logging/warning output before rich takes over - confirm in
+    # trl.commands.cli_utils.
+    init_zero_verbose()
+    # Message-only log format; consumed by `logging.basicConfig` further down.
+    FORMAT = "%(message)s"
+
+    # Imported only in rich mode, so `rich` is not needed otherwise.
+    from rich.console import Console
+    from rich.logging import RichHandler
+
+import torch
+from datasets import load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
+
+from trl import (
+    DPOTrainer,
+    ModelConfig,
+    RichProgressCallback,
+    get_kbit_device_map,
+    get_peft_config,
+    get_quantization_config,
+)
+
+
+if TRL_USE_RICH:
+    # Route INFO-and-above log records through rich's handler, using the
+    # message-only FORMAT defined next to the rich imports.
+    logging.basicConfig(format=FORMAT, datefmt="[%X]", handlers=[RichHandler()], level=logging.INFO)
+
+
+if __name__ == "__main__":
+    # Parse the script-level, training and model arguments.
+    parser = TrlParser((DpoScriptArguments, TrainingArguments, ModelConfig))
+    args, training_args, model_config = parser.parse_args_and_config()
+
+    # Force the use of our rich progress callback instead of the tqdm bars.
+    # `console` only exists in rich mode; every later use is guarded the same way.
+    if TRL_USE_RICH:
+        training_args.disable_tqdm = True
+        console = Console()
+
+    ################
+    # Model & Tokenizer
+    ################
+    # "auto" and None are passed through untouched; any other value is looked
+    # up as an attribute of the torch module (e.g. "bfloat16").
+    torch_dtype = (
+        model_config.torch_dtype
+        if model_config.torch_dtype in ["auto", None]
+        else getattr(torch, model_config.torch_dtype)
+    )
+    quantization_config = get_quantization_config(model_config)
+    model_kwargs = dict(
+        revision=model_config.model_revision,
+        trust_remote_code=model_config.trust_remote_code,
+        attn_implementation=model_config.attn_implementation,
+        torch_dtype=torch_dtype,
+        # The cache is switched off whenever gradient checkpointing is on.
+        use_cache=not training_args.gradient_checkpointing,
+        device_map=get_kbit_device_map() if quantization_config is not None else None,
+        quantization_config=quantization_config,
+    )
+    model = AutoModelForCausalLM.from_pretrained(model_config.model_name_or_path, **model_kwargs)
+    # Computed once; reused for the reference-model decision and for the trainer.
+    peft_config = get_peft_config(model_config)
+    if peft_config is None:
+        model_ref = AutoModelForCausalLM.from_pretrained(model_config.model_name_or_path, **model_kwargs)
+    else:
+        # With a PEFT config no second copy is loaded; the trainer receives None.
+        model_ref = None
+    # Load the tokenizer with the same remote-code policy as the model.
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_config.model_name_or_path, trust_remote_code=model_config.trust_remote_code
+    )
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+    if args.ignore_bias_buffers:
+        # torch distributed hack: have DDP ignore every boolean buffer
+        model._ddp_params_and_buffers_to_ignore = [
+            name for name, buffer in model.named_buffers() if buffer.dtype == torch.bool
+        ]
+
+    ################
+    # Optional rich context managers
+    ################
+    init_context = nullcontext() if not TRL_USE_RICH else console.status("[bold green]Initializing the DPOTrainer...")
+    save_context = (
+        nullcontext()
+        if not TRL_USE_RICH
+        else console.status(f"[bold green]Training completed! Saving the model to {training_args.output_dir}")
+    )
+
+    ################
+    # Dataset
+    ################
+    train_dataset = load_dataset(args.dataset_name, split="train")
+    eval_dataset = load_dataset(args.dataset_name, split="test")
+
+    ################
+    # Training
+    ################
+    with init_context:
+        trainer = DPOTrainer(
+            model,
+            model_ref,
+            args=training_args,
+            beta=args.beta,
+            train_dataset=train_dataset,
+            eval_dataset=eval_dataset,
+            tokenizer=tokenizer,
+            max_length=args.max_length,
+            max_target_length=args.max_target_length,
+            max_prompt_length=args.max_prompt_length,
+            generate_during_eval=args.generate_during_eval,
+            peft_config=peft_config,
+            callbacks=[RichProgressCallback] if TRL_USE_RICH else None,
+        )
+
+    trainer.train()
+
+    with save_context:
+        trainer.save_model(training_args.output_dir)
--- a/examples/scripts/kto.py
+++ b/examples/scripts/kto.py
@ -0,0 +1,152 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Run the KTO training script with the following command with some example arguments.
+In general, the optimal configuration for KTO will be similar to that of DPO:
+
+# regular:
+python examples/scripts/kto.py \
+    --model_name_or_path=gpt2 \
+    --per_device_train_batch_size 4 \
+    --max_steps 1000 \
+    --learning_rate 1e-3 \
+    --gradient_accumulation_steps 1 \
+    --logging_steps 10 \
+    --eval_steps 500 \
+    --output_dir="kto_anthropic_hh" \
+    --warmup_steps 150 \
+    --report_to wandb \
+    --bf16 \
+    --logging_first_step \
+    --no_remove_unused_columns
+
+# peft:
+python examples/scripts/kto.py \
+    --model_name_or_path=gpt2 \
+    --per_device_train_batch_size 4 \
+    --max_steps 1000 \
+    --learning_rate 1e-3 \
+    --gradient_accumulation_steps 1 \
+    --logging_steps 10 \
+    --eval_steps 500 \
+    --output_dir="kto_anthropic_hh" \
+    --optim rmsprop \
+    --warmup_steps 150 \
+    --report_to wandb \
+    --bf16 \
+    --logging_first_step \
+    --no_remove_unused_columns \
+    --use_peft \
+    --lora_r=16 \
+    --lora_alpha=16
+"""
+
+from dataclasses import dataclass, field
+from typing import Optional
+
+from datasets import Dataset, load_dataset
+from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
+
+from trl import KTOConfig, KTOTrainer, ModelConfig, get_peft_config
+
+
+# Script-level arguments; parsed from the command line together with
+# `KTOConfig` and `ModelConfig` in the `__main__` block.
+@dataclass
+class ScriptArguments:
+    """
+    The arguments for the KTO training script.
+    """
+
+    # Debugging switch handed to `get_hh`, which keeps at most the first 1000
+    # rows of a split when it is set. Note that it defaults to True.
+    sanity_check: Optional[bool] = field(default=True, metadata={"help": "only train on 1000 samples"})
+
+
+def extract_anthropic_prompt(prompt_and_response):
+    """Return the prompt part of an Anthropic-HH style transcript.
+
+    The prompt is everything up to and including the last assistant marker;
+    whatever follows that marker is the response and is dropped.
+
+    Raises:
+        ValueError: if the transcript contains no assistant marker.
+    """
+    search_term = "\n\nAssistant:"
+    # rpartition splits on the LAST occurrence; an empty separator means "not found".
+    head, marker, _response = prompt_and_response.rpartition(search_term)
+    if not marker:
+        raise ValueError(f"Prompt and response does not contain '{search_term}'")
+
+    return head + marker
+
+
+def get_hh(
+    split: str, sanity_check: bool = False, silent: bool = False, cache_dir: Optional[str] = None
+) -> Dataset:
+    r"""Load the Anthropic Helpful-Harmless dataset from Hugging Face and convert it to the necessary format.
+
+    Every source row (a `chosen` and a `rejected` transcript) becomes two rows:
+    the shared prompt with the chosen completion labelled True, followed by the
+    same prompt with the rejected completion labelled False.
+
+    The dataset is converted to a dictionary with the following structure:
+    {
+        'prompt': List[str],
+        'completion': List[str],
+        'label': List[bool],
+    }
+
+    Prompts should be structured as follows:
+      \n\nHuman: <prompt>\n\nAssistant:
+    Multiple turns are allowed, but the prompt should always start with \n\nHuman: and end with \n\nAssistant:.
+
+    Args:
+        split: dataset split to load (the script uses "train" and "test").
+        sanity_check: when True, keep at most the first 1000 source rows.
+        silent: currently unused; kept so existing callers keep working.
+        cache_dir: optional cache directory forwarded to `load_dataset`.
+
+    Returns:
+        A `Dataset` with `prompt`, `completion` and `label` columns.
+
+    Raises:
+        ValueError: propagated from `extract_anthropic_prompt` when a chosen
+            transcript has no assistant marker.
+    """
+    dataset = load_dataset("Anthropic/hh-rlhf", split=split, cache_dir=cache_dir)
+    if sanity_check:
+        dataset = dataset.select(range(min(len(dataset), 1000)))
+
+    flat_data = {"prompt": [], "completion": [], "label": []}
+    for sample in dataset:
+        # The prompt is taken from the chosen transcript; the rejected one is
+        # sliced at the same offset.
+        prompt = extract_anthropic_prompt(sample["chosen"])
+        for column, label in (("chosen", True), ("rejected", False)):
+            flat_data["prompt"].append(prompt)
+            flat_data["completion"].append(sample[column][len(prompt) :])
+            flat_data["label"].append(label)
+
+    # `from_dict` is a classmethod: call it on the class, not on the loaded instance.
+    return Dataset.from_dict(flat_data)
+
+
+if __name__ == "__main__":
+    arg_parser = HfArgumentParser((ScriptArguments, KTOConfig, ModelConfig))
+    script_args, kto_args, model_args = arg_parser.parse_args_into_dataclasses()
+    checkpoint = model_args.model_name_or_path
+
+    # Two copies of the pretrained checkpoint: the model to train and the
+    # reference model handed to the trainer.
+    model = AutoModelForCausalLM.from_pretrained(checkpoint)
+    model_ref = AutoModelForCausalLM.from_pretrained(checkpoint)
+
+    # Fall back to the EOS token when the tokenizer has no padding token.
+    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    # Anthropic Helpful-Harmless data, train split first, then the test split
+    # for evaluation; both honour the sanity-check switch.
+    limit_rows = script_args.sanity_check
+    train_dataset = get_hh("train", sanity_check=limit_rows)
+    eval_dataset = get_hh("test", sanity_check=limit_rows)
+
+    # Assemble the trainer and run training.
+    trainer_kwargs = dict(
+        args=kto_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        tokenizer=tokenizer,
+        peft_config=get_peft_config(model_args),
+    )
+    kto_trainer = KTOTrainer(model, model_ref, **trainer_kwargs)
+    kto_trainer.train()
--- a/examples/scripts/ppo.py
+++ b/examples/scripts/ppo.py
@ -0,0 +1,194 @@
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+python examples/scripts/ppo.py \
+    --log_with=wandb
+"""
+from dataclasses import dataclass, field
+from typing import Optional
+
+import torch
+from accelerate import Accelerator
+from datasets import load_dataset
+from peft import LoraConfig
+from tqdm import tqdm
+from transformers import AutoTokenizer, HfArgumentParser, pipeline
+
+from trl import AutoModelForCausalLMWithValueHead, AutoModelForSeq2SeqLMWithValueHead, PPOConfig, PPOTrainer, set_seed
+from trl.core import LengthSampler
+from trl.import_utils import is_npu_available, is_xpu_available
+
+
+tqdm.pandas()
+
+
+@dataclass
+class ScriptArguments:
+    """Script-specific command-line options, parsed alongside `PPOConfig`."""
+
+    # Selects the seq2seq value-head model class instead of the causal-LM one.
+    use_seq2seq: bool = field(default=False, metadata={"help": "whether to use seq2seq"})
+    # Forwarded to `from_pretrained` when the policy and reference models are loaded.
+    trust_remote_code: bool = field(default=False, metadata={"help": "Enable `trust_remote_code`"})
+
+    # LoraConfig
+    # `use_peft` switches LoRA on; `lora_alpha` and `lora_r` are passed to `LoraConfig`.
+    use_peft: bool = field(default=False, metadata={"help": "whether to use peft"})
+    lora_alpha: Optional[float] = field(default=16, metadata={"help": "the lora alpha parameter"})
+    lora_r: Optional[int] = field(default=16, metadata={"help": "the lora r parameter"})
+
+
+# Parse the command line into the script arguments and the PPO config.
+parser = HfArgumentParser((ScriptArguments, PPOConfig))
+args, ppo_config = parser.parse_args_into_dataclasses()
+
+# We then define the arguments to pass to the sentiment analysis pipeline.
+# We set `return_all_scores` to True to get the score of every class label (not
+# just the predicted one), and `function_to_apply` to "none" so that the raw
+# model outputs are returned without a softmax/sigmoid on top.
+sent_kwargs = {"return_all_scores": True, "function_to_apply": "none", "batch_size": 16}
+
+# Value-head wrapper class matching the architecture chosen with `use_seq2seq`.
+trl_model_class = AutoModelForCausalLMWithValueHead if not args.use_seq2seq else AutoModelForSeq2SeqLMWithValueHead
+
+
+# Below is an example function to build the dataset. In our case, we use the IMDB dataset
+# from the `datasets` library. One should customize this function to train the model on
+# its own dataset.
+def build_dataset(config, query_dataset, input_min_text_length=2, input_max_text_length=8):
+    """
+    Build the query dataset for training. The data comes from `load_dataset`; one should
+    customize this function to train the model on its own dataset.
+
+    Args:
+        config:
+            Object whose `model_name` attribute names the tokenizer to load.
+        query_dataset (`str`):
+            The name of the dataset to be loaded (its "train" split is used).
+        input_min_text_length, input_max_text_length:
+            Bounds handed to `LengthSampler`; the sampled value decides how many
+            leading tokens of each review are kept as the query.
+
+    Returns:
+        dataset:
+            The torch-formatted dataset (not a dataloader), with the `text` column
+            renamed to `review` and `input_ids` / `query` columns added.
+    """
+    tokenizer = AutoTokenizer.from_pretrained(config.model_name)
+    tokenizer.pad_token = tokenizer.eos_token
+    sample_query_length = LengthSampler(input_min_text_length, input_max_text_length)
+
+    def is_long_review(row):
+        # Only reviews longer than 200 characters are kept.
+        return len(row["review"]) > 200
+
+    def tokenize(sample):
+        # Keep a randomly sized prefix of the tokenized review as the query.
+        token_ids = tokenizer.encode(sample["review"])
+        sample["input_ids"] = token_ids[: sample_query_length()]
+        sample["query"] = tokenizer.decode(sample["input_ids"])
+        return sample
+
+    reviews = load_dataset(query_dataset, split="train").rename_columns({"text": "review"})
+    queries = reviews.filter(is_long_review, batched=False).map(tokenize, batched=False)
+    queries.set_format(type="torch")
+    return queries
+
+
+# We build the training dataset (not a dataloader) by calling the `build_dataset` function.
+dataset = build_dataset(ppo_config, ppo_config.query_dataset)
+
+
+def collator(data):
+    """Turn a list of per-sample dicts into one dict of per-key lists.
+
+    The keys are taken from the first sample; every sample must provide them.
+    """
+    batch = {}
+    for key in data[0]:
+        batch[key] = [sample[key] for sample in data]
+    return batch
+
+
+# set seed before initializing value head for deterministic eval
+set_seed(ppo_config.seed)
+
+# Now let's build the model and the reference model.
+if not args.use_peft:
+    # Without PEFT an explicit second copy of the model serves as the reference.
+    ref_model = trl_model_class.from_pretrained(ppo_config.model_name, trust_remote_code=args.trust_remote_code)
+    device_map = None
+    peft_config = None
+else:
+    peft_config = LoraConfig(
+        r=args.lora_r,
+        lora_alpha=args.lora_alpha,
+        bias="none",
+        # The adapter task type has to follow the architecture selected with
+        # `use_seq2seq`; it used to be "CAUSAL_LM" even for seq2seq models.
+        task_type="SEQ_2_SEQ_LM" if args.use_seq2seq else "CAUSAL_LM",
+    )
+    # With PEFT no separate reference model is loaded; the trainer receives None.
+    ref_model = None
+    # Copy the model to each device
+    device_map = {"": Accelerator().local_process_index}
+
+model = trl_model_class.from_pretrained(
+    ppo_config.model_name,
+    trust_remote_code=args.trust_remote_code,
+    device_map=device_map,
+    peft_config=peft_config,
+)
+
+
+tokenizer = AutoTokenizer.from_pretrained(ppo_config.model_name)
+
+# Some tokenizers like GPT-2's don't have a padding token by default, so we set one here.
+# The assignment is unconditional: the EOS token id is used for padding either way.
+tokenizer.pad_token_id = tokenizer.eos_token_id
+
+# We then build the PPOTrainer, passing the config, the model, the reference model
+# (None when PEFT is enabled), the tokenizer, the dataset and the collator.
+ppo_trainer = PPOTrainer(ppo_config, model, ref_model, tokenizer, dataset=dataset, data_collator=collator)
+
+# We then build the sentiment analysis pipeline, passing the model name and the
+# sentiment analysis pipeline arguments. Let's also make sure to set the device
+# to the same device as the PPOTrainer.
+device = ppo_trainer.accelerator.device
+# Single-process runs replace the accelerator device with an explicit target:
+# "xpu:0", then "npu:0", then CUDA index 0, and finally "cpu".
+if ppo_trainer.accelerator.num_processes == 1:
+    if is_xpu_available():
+        device = "xpu:0"
+    elif is_npu_available():
+        device = "npu:0"
+    else:
+        device = 0 if torch.cuda.is_available() else "cpu"  # to avoid a `pipeline` bug
+ds_plugin = ppo_trainer.accelerator.state.deepspeed_plugin
+# `reward_model` must have the form "<task>:<model name>"; any other number of
+# ":" separators makes this two-way unpacking fail.
+task, model_name = ppo_config.reward_model.split(":")
+# When DeepSpeed ZeRO-3 init is enabled, the pipeline is created with that
+# init context switched off; otherwise it is created directly.
+if ds_plugin is not None and ds_plugin.is_zero3_init_enabled():
+    with ds_plugin.zero3_init_context_manager(enable=False):
+        sentiment_pipe = pipeline(task, model=model_name, device=device)
+else:
+    sentiment_pipe = pipeline(task, model=model_name, device=device)
+
+# Some tokenizers like GPT-2's don't have a padding token by default, so we set one here.
+# Both the pipeline's tokenizer and its model config fall back to the policy
+# tokenizer's pad token id, but only where none is set yet.
+if sentiment_pipe.tokenizer.pad_token_id is None:
+    sentiment_pipe.tokenizer.pad_token_id = tokenizer.pad_token_id
+
+if sentiment_pipe.model.config.pad_token_id is None:
+    sentiment_pipe.model.config.pad_token_id = tokenizer.pad_token_id
+
+# We then define the arguments to pass to the `generate` function. These arguments
+# are passed to the `generate` function of the PPOTrainer, which is a wrapper around
+# the `generate` function of the trained model.
+# NOTE(review): `top_k` 0 with `top_p` 1.0 presumably disables top-k / nucleus
+# truncation so that sampling uses the full distribution - confirm against the
+# transformers generation docs.
+generation_kwargs = {
+    "min_length": -1,
+    "top_k": 0.0,
+    "top_p": 1.0,
+    "do_sample": True,
+    "pad_token_id": tokenizer.eos_token_id,
+    "max_new_tokens": 32,
+}
+
+# Wrap the dataloader itself rather than `enumerate(...)`: tqdm can then read the
+# dataloader's length and show a total and an ETA. Note that `_epoch` counts
+# batches of the dataloader, not passes over the dataset.
+for _epoch, batch in enumerate(tqdm(ppo_trainer.dataloader)):
+    query_tensors = batch["input_ids"]
+
+    # Get responses from the trained model and from the reference model
+    response_tensors, ref_response_tensors = ppo_trainer.generate(
+        query_tensors, return_prompt=False, generate_ref_response=True, **generation_kwargs
+    )
+    batch["response"] = tokenizer.batch_decode(response_tensors)
+    batch["ref_response"] = tokenizer.batch_decode(ref_response_tensors)
+
+    # Compute sentiment score: the reward is the raw score of the label at index 1
+    texts = [q + r for q, r in zip(batch["query"], batch["response"])]
+    pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
+    rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]
+    # The reference rewards are only logged; they are not passed to the PPO step.
+    ref_texts = [q + r for q, r in zip(batch["query"], batch["ref_response"])]
+    ref_pipe_outputs = sentiment_pipe(ref_texts, **sent_kwargs)
+    ref_rewards = [torch.tensor(output[1]["score"]) for output in ref_pipe_outputs]
+    batch["ref_rewards"] = ref_rewards
+
+    # Run PPO step
+    stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
+    ppo_trainer.log_stats(stats, batch, rewards, columns_to_log=["query", "response", "ref_response", "ref_rewards"])
--- a/Show More
+++ b/Show More