Compare commits

...

33 Commits

Author SHA1 Message Date
7ea66d56eb with accelerate 2025-10-10 11:26:27 +00:00
3aa67591ee add personal script and command 2025-10-10 11:20:30 +00:00
66213f11ce Merge branch 'main' into kashif-patch-1 2025-10-10 12:21:14 +02:00
f853e091ea Fix CI CUDA out of memory errors by improving GPU memory management (#4238) 2025-10-10 09:49:45 +02:00
a2d07aa0ce Merge branch 'main' into kashif-patch-1 2025-10-10 09:33:06 +02:00
803ec0d856 Fix CI slow test ValueError: Backward pass should have cleared tracker of all tensors (#4236)
Signed-off-by: Wang, Yi A <yi.a.wang@intel.com>
Co-authored-by: Kashif Rasul <kashif.rasul@gmail.com>
2025-10-10 09:28:34 +02:00
7a0a615d50 Warnings pointing to RFC (#4224) 2025-10-09 17:05:36 -06:00
5b7713f8aa fix for judges 2025-10-09 19:41:36 +00:00
c38cb69ec7 🧘 Enhance markdown style (#4235) 2025-10-09 13:49:44 -05:00
54c97ad0ca use same pattern as grpo_trainer 2025-10-09 16:40:13 +00:00
f1c7fea83b use warnings 2025-10-09 16:34:13 +00:00
9c2508e61e Update trl/trainer/online_dpo_config.py
Co-authored-by: Quentin Gallouédec <45557362+qgallouedec@users.noreply.github.com>
2025-10-09 18:26:14 +02:00
68ef15c686 Remove unused log_example_reports.py script (#4241)
Co-authored-by: behroozazarkhalili <ermiaazarkhalili>
2025-10-09 09:18:48 -07:00
7c8bec1114 Merge branch 'main' into kashif-patch-1 2025-10-09 16:20:02 +02:00
c3fd3be3d6 add warning 2025-10-09 14:19:33 +00:00
3dd7fc2850 Fix CI IndentationError for Python 3.13.8 (#4240) 2025-10-09 15:46:41 +02:00
379fc7830e Update model weights if needed 2025-10-09 14:45:41 +02:00
324c768c4b Merge branch 'main' into kashif-patch-1 2025-10-09 14:18:39 +02:00
51ced65153 Replace setup with pyproject in CI tests paths (#4230) 2025-10-09 09:35:08 +02:00
4bb883a6e6 Update CI Docker image to pytorch/pytorch:2.8.0 (#4232) 2025-10-09 08:09:15 +02:00
f7846321e7 Remove unused Path import in __init__.py (#4227) 2025-10-08 21:30:54 +02:00
a944890ff1 Fix callable annotations (#4216) 2025-10-08 21:21:21 +02:00
521db3520a Fix CI unittest asserts (#4234) 2025-10-08 21:18:41 +02:00
e2c97a805a Exclude vllm dependencies from dev extra (#4229) 2025-10-08 18:14:23 +02:00
d1d0407d3c 🏷️ Account for token_type_ids in DataCollatorForVisionLanguageModeling (#4190) 2025-10-08 09:34:48 -06:00
824ff8c73e Add Efficient Online Training with GRPO and vLLM in TRL to community tutorials (#4219) 2025-10-08 12:59:04 +02:00
f15399d3d3 Fix entropy and accuracy calculation for prompt_tuning techniques. (#4196) 2025-10-08 09:42:19 +01:00
26b967dd47 Update trl/trainer/online_dpo_trainer.py
Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
2025-10-07 15:56:31 +02:00
f883a91fc3 Update trl/trainer/online_dpo_trainer.py
Co-authored-by: Albert Villanova del Moral <8515462+albertvillanova@users.noreply.github.com>
2025-10-07 15:56:24 +02:00
4718e32146 Merge branch 'main' into kashif-patch-1 2025-10-07 11:01:01 +02:00
1de9563161 use prepare_fsdp 2025-10-02 15:45:31 +02:00
c466b47abe prepeare ref_model properly
Updated reference model preparation for FSDP and device placement.
2025-10-02 11:32:04 +02:00
7359b0f0ff [Online-DPO] fix the completion_len == max_new_tokens crash 2025-10-02 10:16:33 +02:00
68 changed files with 1418 additions and 733 deletions

View File

@ -68,7 +68,7 @@ jobs:
CUDA_VISIBLE_DEVICES: "0,1"
TEST_TYPE: "multi_gpu"
container:
image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
options: --gpus all --shm-size "16gb"
defaults:
run:
@ -115,6 +115,4 @@ jobs:
source .venv/bin/activate
uv pip install slack_sdk tabulate
python scripts/log_reports.py >> $GITHUB_STEP_SUMMARY
python scripts/log_example_reports.py --text_file_name temp_results_sft_tests.txt >> $GITHUB_STEP_SUMMARY
python scripts/log_example_reports.py --text_file_name temp_results_dpo_tests.txt >> $GITHUB_STEP_SUMMARY
rm *.txt

View File

@ -11,11 +11,12 @@ on:
- "scripts/**.py"
- "tests/**.py"
- "trl/**.py"
- "setup.py"
- "pyproject.toml"
env:
TQDM_DISABLE: 1
CI_SLACK_CHANNEL: ${{ secrets.CI_PUSH_MAIN_CHANNEL }}
PYTORCH_CUDA_ALLOC_CONF: "expandable_segments:True"
jobs:
check_code_quality:
@ -41,7 +42,7 @@ jobs:
runs-on:
group: aws-g4dn-2xlarge
container:
image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
options: --gpus all
defaults:
run:
@ -93,7 +94,7 @@ jobs:
runs-on:
group: aws-g4dn-2xlarge
container:
image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
options: --gpus all
defaults:
run:
@ -149,7 +150,7 @@ jobs:
runs-on:
group: aws-g4dn-2xlarge
container:
image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
options: --gpus all
defaults:
run:
@ -201,7 +202,7 @@ jobs:
runs-on:
group: aws-g4dn-2xlarge
container:
image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
options: --gpus all
defaults:
run:

View File

@ -16,7 +16,7 @@ jobs:
runs-on:
group: aws-g4dn-2xlarge
container:
image: pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel
image: pytorch/pytorch:2.8.0-cuda12.8-cudnn9-devel
options: --gpus all
defaults:
run:

View File

@ -1,15 +1,10 @@
# How to contribute to TRL?
Everyone is welcome to contribute, and we value everybody's contribution. Code
contributions are not the only way to help the community. Answering questions, helping
others, and improving the documentation are also immensely valuable.
Everyone is welcome to contribute, and we value everybody's contribution. Code contributions are not the only way to help the community. Answering questions, helping others, and improving the documentation are also immensely valuable.
It also helps us if you spread the word! Reference the library in blog posts
about the awesome projects it made possible, shout out on Twitter every time it has
helped you, or simply ⭐️ the repository to say thank you.
It also helps us if you spread the word! Reference the library in blog posts about the awesome projects it made possible, shout out on Twitter every time it has helped you, or simply ⭐️ the repository to say thank you.
However you choose to contribute, please be mindful and respect our
[code of conduct](https://github.com/huggingface/trl/blob/main/CODE_OF_CONDUCT.md).
However you choose to contribute, please be mindful and respect our [code of conduct](https://github.com/huggingface/trl/blob/main/CODE_OF_CONDUCT.md).
**This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).**
@ -22,9 +17,7 @@ There are several ways you can contribute to TRL:
* Implement trainers for new post-training algorithms.
* Contribute to the examples or the documentation.
If you don't know where to start, there is a special [Good First
Issue](https://github.com/huggingface/trl/labels/%F0%9F%91%B6%20good%20first%20issue) listing. It will give you a list of
open issues that are beginner-friendly and help you start contributing to open-source. The best way to do that is to open a Pull Request and link it to the issue that you'd like to work on. We try to give priority to opened PRs as we can easily track the progress of the fix, and if the contributor does not have time anymore, someone else can take the PR over.
If you don't know where to start, there is a special [Good First Issue](https://github.com/huggingface/trl/labels/%F0%9F%91%B6%20good%20first%20issue) listing. It will give you a list of open issues that are beginner-friendly and help you start contributing to open-source. The best way to do that is to open a Pull Request and link it to the issue that you'd like to work on. We try to give priority to opened PRs as we can easily track the progress of the fix, and if the contributor does not have time anymore, someone else can take the PR over.
For something slightly more challenging, you can also take a look at the [Good Second Issue](https://github.com/huggingface/trl/labels/Good%20Second%20Issue) list. In general though, if you feel like you know what you're doing, go for it and we'll help you get there! 🚀
@ -48,14 +41,12 @@ Do your best to follow these guidelines when submitting a bug-related issue or a
The TRL library is robust and reliable thanks to users who report the problems they encounter.
Before you report an issue, we would really appreciate it if you could **make sure the bug was not
already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the library itself, and not your code.
Before you report an issue, we would really appreciate it if you could **make sure the bug was not already reported** (use the search bar on GitHub under Issues). Your issue should also be related to bugs in the library itself, and not your code.
Once you've confirmed the bug hasn't already been reported, please include the following information in your issue so we can quickly resolve it:
* Your **OS type and version**, **Python**, **PyTorch**, **TRL** and **Transformers** versions.
* A short, self-contained, code snippet that allows us to reproduce the bug in
less than 30s.
* A short, self-contained, code snippet that allows us to reproduce the bug in less than 30s.
* The *full* traceback if an exception is raised.
* Attach any other additional information, like screenshots, you think may help.
@ -106,29 +97,20 @@ We're always looking for improvements to the documentation that make it more cle
## Submitting a pull request (PR)
Before writing code, we strongly advise you to search through the existing PRs or
issues to make sure that nobody is already working on the same thing. If you are
unsure, it is always a good idea to open an issue to get some feedback.
Before writing code, we strongly advise you to search through the existing PRs or issues to make sure that nobody is already working on the same thing. If you are unsure, it is always a good idea to open an issue to get some feedback.
You will need basic `git` proficiency to be able to contribute to
TRL. `git` is not the easiest tool to use but it has the greatest
manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
Git](https://git-scm.com/book/en/v2) is a very good reference.
You will need basic `git` proficiency to be able to contribute to TRL. `git` is not the easiest tool to use but it has the greatest manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro Git](https://git-scm.com/book/en/v2) is a very good reference.
Follow these steps to start contributing:
1. Fork the [repository](https://github.com/huggingface/trl) by
clicking on the 'Fork' button on the repository's page. This creates a copy of the code
under your GitHub user account.
1. Fork the [repository](https://github.com/huggingface/trl) by clicking on the 'Fork' button on the repository's page. This creates a copy of the code under your GitHub user account.
2. Clone your fork to your local disk, and add the base repository as a remote. The following command
assumes you have your public SSH key uploaded to GitHub. See the following guide for more
[information](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository).
2. Clone your fork to your local disk, and add the base repository as a remote. The following command assumes you have your public SSH key uploaded to GitHub. See the following guide for more [information](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository).
```bash
$ git clone git@github.com:<your Github handle>/trl.git
$ cd trl
$ git remote add upstream https://github.com/huggingface/trl.git
git clone git@github.com:<your Github handle>/trl.git
cd trl
git remote add upstream https://github.com/huggingface/trl.git
```
3. Create a new branch to hold your development changes, and do this for every new PR you work on.
@ -136,15 +118,15 @@ Follow these steps to start contributing:
Start by synchronizing your `main` branch with the `upstream/main` branch (more details in the [GitHub Docs](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests/syncing-a-fork)):
```bash
$ git checkout main
$ git fetch upstream
$ git merge upstream/main
git checkout main
git fetch upstream
git merge upstream/main
```
Once your `main` branch is synchronized, create a new branch from it:
```bash
$ git checkout -b a-descriptive-name-for-my-changes
git checkout -b a-descriptive-name-for-my-changes
```
**Do not** work on the `main` branch.
@ -152,32 +134,28 @@ Follow these steps to start contributing:
4. Set up a development environment by running the following command in a conda or a virtual environment you've created for working on this library:
```bash
$ pip install -e .[dev]
pip install -e .[dev]
```
(If TRL was already installed in the virtual environment, remove
it with `pip uninstall trl` before reinstalling it.)
(If TRL was already installed in the virtual environment, remove it with `pip uninstall trl` before reinstalling it.)
Alternatively, if you are using [Visual Studio Code](https://code.visualstudio.com/Download), the fastest way to get set up is by using
the provided Dev Container. Documentation on how to get started with dev containers is available [here](https://code.visualstudio.com/docs/remote/containers).
Alternatively, if you are using [Visual Studio Code](https://code.visualstudio.com/Download), the fastest way to get set up is by using the provided Dev Container. Check [the documentation on how to get started with dev containers](https://code.visualstudio.com/docs/remote/containers).
5. Develop the features on your branch.
As you work on the features, you should make sure that the test suite
passes. You should run the tests impacted by your changes like this (see
below an explanation regarding the environment variable):
As you work on the features, you should make sure that the test suite passes. You should run the tests impacted by your changes like this (see below an explanation regarding the environment variable):
```bash
$ pytest tests/<TEST_TO_RUN>.py
```
> The following commands leverage the `make` utility.
```bash
pytest tests/<TEST_TO_RUN>.py
```
You can also run the full suite with the following command.
> The following commands leverage the `make` utility.
```bash
$ make test
```
You can also run the full suite with the following command.
```bash
make test
```
TRL relies on `ruff` for maintaining consistent code formatting across its source files. Before submitting any PR, you should apply automatic style corrections and run code verification checks.
@ -186,59 +164,51 @@ Follow these steps to start contributing:
To apply these checks and corrections in one step, use:
```bash
$ make precommit
make precommit
```
This command runs the following:
- Executes `pre-commit` hooks to automatically fix style issues with `ruff` and other tools.
- Runs additional scripts such as adding copyright information.
* Executes `pre-commit` hooks to automatically fix style issues with `ruff` and other tools.
* Runs additional scripts such as adding copyright information.
If you prefer to apply the style corrections separately or review them individually, the `pre-commit` hook will handle the formatting for the files in question.
Once you're happy with your changes, add changed files using `git add` and
make a commit with `git commit` to record your changes locally:
Once you're happy with your changes, add changed files using `git add` and make a commit with `git commit` to record your changes locally:
```bash
$ git add modified_file.py
$ git commit
```
```bash
git add modified_file.py
git commit
```
Please write [good commit messages](https://chris.beams.io/posts/git-commit/).
Please write [good commit messages](https://chris.beams.io/posts/git-commit/).
It is a good idea to sync your copy of the code with the original
repository regularly. This way you can quickly account for changes:
It is a good idea to sync your copy of the code with the original
repository regularly. This way you can quickly account for changes:
```bash
$ git fetch upstream
$ git rebase upstream/main
```
```bash
git fetch upstream
git rebase upstream/main
```
Push the changes to your account using:
Push the changes to your account using:
```bash
$ git push -u origin a-descriptive-name-for-my-changes
```
```bash
git push -u origin a-descriptive-name-for-my-changes
```
6. Once you are satisfied (**and the checklist below is happy too**), go to the
webpage of your fork on GitHub. Click on 'Pull request' to send your changes
to the project maintainers for review.
6. Once you are satisfied (**and the checklist below is happy too**), go to the webpage of your fork on GitHub. Click on 'Pull request' to send your changes to the project maintainers for review.
7. It's ok if maintainers ask you for changes. It happens to core contributors too! To ensure everyone can review your changes in the pull request, work on your local branch and push the updates to your fork. They will automatically appear in the pull request.
### Checklist
1. The title of your pull request should be a summary of its contribution;
2. If your pull request addresses an issue, please mention the issue number in
the pull request description to make sure they are linked (and people
consulting the issue know you are working on it);
3. To indicate a work in progress please prefix the title with `[WIP]`, or mark
the PR as a draft PR. These are useful to avoid duplicated work, and to differentiate
it from PRs ready to be merged;
2. If your pull request addresses an issue, please mention the issue number in the pull request description to make sure they are linked (and people consulting the issue know you are working on it);
3. To indicate a work in progress please prefix the title with `[WIP]`, or mark the PR as a draft PR. These are useful to avoid duplicated work, and to differentiate it from PRs ready to be merged;
4. Make sure existing tests pass;
5. Add high-coverage tests. No quality testing = no merge.
### Tests
An extensive test suite is included to test the library behavior and several examples. Library tests can be found in
@ -248,7 +218,7 @@ We use `pytest` to run the tests. From the root of the
repository here's how to run tests with `pytest` for the library:
```bash
$ python -m pytest -sv ./tests
python -m pytest -sv ./tests
```
That's how `make test` is implemented (without the `pip install` line)!
@ -260,23 +230,23 @@ you're working on.
1. **Use defaults when appropriate**:
Provide default values unless the parameter's value varies significantly by use case. For example, datasets or models should not have defaults, but parameters like `learning_rate` should.
Provide default values unless the parameter's value varies significantly by use case. For example, datasets or models should not have defaults, but parameters like `learning_rate` should.
2. **Prioritize proven defaults**:
Default values should align with those recommended in the original paper or method. Alternatives require strong evidence of superior performance in most cases.
Default values should align with those recommended in the original paper or method. Alternatives require strong evidence of superior performance in most cases.
3. **Ensure safety and predictability**:
Defaults must be safe, expected and reliable. Avoid settings that could lead to surprising outcomes, such as excessive memory usage or poor performance in edge cases.
Defaults must be safe, expected and reliable. Avoid settings that could lead to surprising outcomes, such as excessive memory usage or poor performance in edge cases.
4. **Balance consistency and flexibility**:
Aim for consistent defaults across similar functions or methods. However, consistency should not be preferred to point 2 or 3.
Aim for consistent defaults across similar functions or methods. However, consistency should not be preferred to point 2 or 3.
5. **Opt-in for new features**:
Do not enable new features or improvements (e.g., novel loss functions) by default. Users should explicitly opt-in to use these.
Do not enable new features or improvements (e.g., novel loss functions) by default. Users should explicitly opt-in to use these.
### Writing documentation
@ -318,26 +288,26 @@ def replicate_str(string: str, n: int, sep: str = " ") -> str:
* Note that `Optional` means that the value can be `None`, and `*optional*` means that it is not required for the user to pass a value.
E.g., for arguments that can't be `None` and aren't required:
```python
```txt
foo (`int`, *optional*, defaults to `4`):
```
For arguments that can be `None` and are required:
```python
```txt
foo (`Optional[int]`):
```
for arguments that can be `None` and aren't required:
for arguments that can be `None` and aren't required (in this case, if the default value is `None`, you can omit it):
```python
```txt
foo (`Optional[int]`, *optional*):
```
* **String Defaults:**
* Ensure that default string values are wrapped in double quotes:
```python
```txt
defaults to `"foo"`
```
@ -346,7 +316,7 @@ def replicate_str(string: str, n: int, sep: str = " ") -> str:
* **Default Value Formatting:**
* Consistently surround default values with backticks for improved formatting:
```python
```txt
defaults to `4`
```
@ -383,8 +353,8 @@ Our approach to deprecation and backward compatibility is flexible and based on
When a feature or component is marked for deprecation, its use will emit a warning message. This warning will include:
- **Transition Guidance**: Instructions on how to migrate to the alternative solution or replacement.
- **Removal Version**: The target version when the feature will be removed, providing users with a clear timeframe to transition.
* **Transition Guidance**: Instructions on how to migrate to the alternative solution or replacement.
* **Removal Version**: The target version when the feature will be removed, providing users with a clear timeframe to transition.
Example:
@ -398,9 +368,9 @@ Example:
The deprecation and removal schedule is based on each feature's usage and impact, with examples at two extremes:
- **Experimental or Low-Use Features**: For a feature that is experimental or has limited usage, backward compatibility may not be maintained between releases. Users should therefore anticipate potential breaking changes from one version to the next.
* **Experimental or Low-Use Features**: For a feature that is experimental or has limited usage, backward compatibility may not be maintained between releases. Users should therefore anticipate potential breaking changes from one version to the next.
- **Widely-Used Components**: For a feature with high usage, we aim for a more gradual transition period of approximately **5 months**, generally scheduling deprecation around **5 minor releases** after the initial warning.
* **Widely-Used Components**: For a feature with high usage, we aim for a more gradual transition period of approximately **5 months**, generally scheduling deprecation around **5 minor releases** after the initial warning.
These examples represent the two ends of a continuum. The specific timeline for each feature will be determined individually, balancing innovation with user stability needs.
@ -410,22 +380,22 @@ Warnings play a critical role in guiding users toward resolving potential issues
#### Definitions
- **Correct**: An operation is correct if it is valid, follows the intended approach, and aligns with the current best practices or guidelines within the codebase. This is the recommended or intended way to perform the operation.
- **Supported**: An operation is supported if it is technically valid and works within the current codebase, but it may not be the most efficient, optimal, or recommended way to perform the task. This includes deprecated features or legacy approaches that still work but may be phased out in the future.
* **Correct**: An operation is correct if it is valid, follows the intended approach, and aligns with the current best practices or guidelines within the codebase. This is the recommended or intended way to perform the operation.
* **Supported**: An operation is supported if it is technically valid and works within the current codebase, but it may not be the most efficient, optimal, or recommended way to perform the task. This includes deprecated features or legacy approaches that still work but may be phased out in the future.
#### Choosing the right message
- **Correct → No warning**:
* **Correct → No warning**:
If the operation is fully valid and expected, no message should be issued. The system is working as intended, so no warning is necessary.
- **Correct but deserves attention → No warning, possibly a log message**:
* **Correct but deserves attention → No warning, possibly a log message**:
When an operation is correct but uncommon or requires special attention, providing an informational message can be helpful. This keeps users informed without implying any issue. If available, use the logger to output this message. Example:
```python
logger.info("This is an informational message about a rare but correct operation.")
```
- **Correct but very likely a mistake → Warning with option to disable**:
* **Correct but very likely a mistake → Warning with option to disable**:
In rare cases, you may want to issue a warning for a correct operation that's very likely a mistake. In such cases, you must provide an option to suppress the warning. This can be done with a flag in the function. Example:
```python
@ -436,7 +406,7 @@ Warnings play a critical role in guiding users toward resolving potential issues
# Do something
```
- **Supported but not correct → Warning**:
* **Supported but not correct → Warning**:
If the operation is technically supported but is deprecated, suboptimal, or could cause future issues (e.g., conflicting arguments), a warning should be raised. This message should be actionable, meaning it must explain how to resolve the issue. Example:
```python
@ -446,7 +416,7 @@ Warnings play a critical role in guiding users toward resolving potential issues
# Do something
```
- **Not supported → Exception**:
* **Not supported → Exception**:
If the operation is invalid or unsupported, raise an exception. This indicates that the operation cannot be performed and requires immediate attention. Example:
```python

View File

@ -1,6 +1,6 @@
# BCO Trainer
[![](https://img.shields.io/badge/All_models-BCO-blue)](https://huggingface.co/models?other=bco,trl)
[![model badge](https://img.shields.io/badge/All_models-BCO-blue)](https://huggingface.co/models?other=bco,trl)
TRL supports the Binary Classifier Optimization (BCO).
The [BCO](https://huggingface.co/papers/2404.04656) authors train a binary classifier whose logit serves as a reward so that the classifier maps {prompt, chosen completion} pairs to 1 and {prompt, rejected completion} pairs to 0.
@ -12,17 +12,16 @@ The [`BCOTrainer`] requires an [unpaired preference dataset](dataset_formats#unp
The [`BCOTrainer`] supports both [conversational](dataset_formats#conversational) and [standard](dataset_formats#standard) dataset formats. When provided with a conversational dataset, the trainer will automatically apply the chat template to the dataset.
## Expected model format
The BCO trainer expects a model of type `AutoModelForCausalLM`, whereas PPO expects `AutoModelForCausalLMWithValueHead` for the value function.
## Using the `BCOTrainer`
For a detailed example have a look at the `examples/scripts/bco.py` script. At a high level we need to initialize the `BCOTrainer` with a `model` we wish to train and a reference `ref_model` which we will use to calculate the implicit rewards of the preferred and rejected response.
For a detailed example have a look at the `examples/scripts/bco.py` script. At a high level we need to initialize the `BCOTrainer` with a `model` we wish to train and a reference `ref_model` which we will use to calculate the implicit rewards of the preferred and rejected response.
The `beta` refers to the hyperparameter of the implicit reward, and the dataset contains the 3 entries listed above. Note that the `model` and `ref_model` need to have the same architecture (i.e., decoder-only or encoder-decoder).
```py
```python
training_args = BCOConfig(
beta=0.1,
)
@ -35,9 +34,10 @@ bco_trainer = BCOTrainer(
processing_class=tokenizer,
)
```
After this one can then call:
```py
```python
bco_trainer.train()
```
@ -49,7 +49,7 @@ If the prompts in your desired and undesired datasets differ a lot, it is useful
Choose an embedding model and tokenizer:
```py
```python
embedding_model = AutoModel.from_pretrained(your_model_id)
embedding_tokenizer = AutoTokenizer.from_pretrained(your_model_id)
@ -64,7 +64,7 @@ embedding_func = partial(embed_prompt, model=embedding_model)
Set `prompt_sample_size` to define how many prompts are selected to train the UDM classifier and start the training with the provided embedding function:
```py
```python
training_args = BCOConfig(
beta=0.1,
prompt_sample_size=512,

View File

@ -1,4 +1,4 @@
# Best of N sampling: Alternative ways to get better model output without RL based fine-tuning
# Best of N sampling: Alternative ways to get better model output without RL based fine-tuning
Within the extras module is the `best-of-n` sampler class that serves as an alternative method of generating better model output.
As to how it fares against the RL based fine-tuning, please look in the `examples` directory for a comparison example
@ -8,7 +8,6 @@ As to how it fares against the RL based fine-tuning, please look in the `example
To get started quickly, create an instance of the class with a model, a length sampler, a tokenizer, and a callable that serves as a proxy reward pipeline, outputting reward scores for input queries.
```python
from transformers import pipeline, AutoTokenizer
from trl import AutoModelForCausalLMWithValueHead
from trl.core import LengthSampler
@ -19,37 +18,29 @@ reward_pipe = pipeline("sentiment-analysis", model=reward_model, device=device)
tokenizer = AutoTokenizer.from_pretrained(ref_model_name)
tokenizer.pad_token = tokenizer.eos_token
# callable that takes a list of raw text and returns a list of corresponding reward scores
def queries_to_scores(list_of_strings):
return [output["score"] for output in reward_pipe(list_of_strings)]
best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler)
```
And assuming you have a list/tensor of tokenized queries, you can generate better output by calling the `generate` method
```python
best_of_n.generate(query_tensors, device=device, **gen_kwargs)
```
The default sample size is 4, but you can change it at the time of instance initialization like so
```python
best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler, sample_size=8)
```
The default output is the result of taking the top scored output for each query, but you can change it to top 2 and so on by passing the `n_candidates` argument at the time of instance initialization
```python
best_of_n = BestOfNSampler(model, tokenizer, queries_to_scores, length_sampler=output_length_sampler, n_candidates=2)
```
There is the option of setting the generation settings (like `temperature`, `pad_token_id`) at the time of instance creation as opposed to when calling the `generate` method.
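For instance, a minimal sketch continuing the snippets above (assuming the sampler accepts a `generation_config` argument at initialization, mirroring the other keyword arguments shown; the exact keyword may differ):

```python
from transformers import GenerationConfig

# Generation settings defined once at instantiation time
generation_config = GenerationConfig(do_sample=True, temperature=0.9, pad_token_id=tokenizer.eos_token_id)

best_of_n = BestOfNSampler(
    model,
    tokenizer,
    queries_to_scores,
    length_sampler=output_length_sampler,
    generation_config=generation_config,  # assumed keyword for passing generation settings at init
)

# No generation kwargs need to be passed to `generate` anymore
best_of_n.generate(query_tensors, device=device)
```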

View File

@ -2,9 +2,11 @@
TRL provides a powerful command-line interface (CLI) to fine-tune large language models (LLMs) using methods like Supervised Fine-Tuning (SFT), Direct Preference Optimization (DPO), and more. The CLI abstracts away much of the boilerplate, letting you launch training jobs quickly and reproducibly.
## Commands
Currently supported commands are:
#### Training Commands
### Training Commands
- `trl dpo`: fine-tune a LLM with DPO
- `trl grpo`: fine-tune a LLM with GRPO
@ -13,7 +15,7 @@ Currently supported commands are:
- `trl rloo`: fine-tune a LLM with RLOO
- `trl sft`: fine-tune a LLM with SFT
#### Other Commands
### Other Commands
- `trl env`: get the system information
- `trl vllm-serve`: serve a model with vLLM
@ -197,22 +199,22 @@ trl reward --config reward_config.yaml
The `--accelerate_config` flag lets you easily configure distributed training with [🤗 Accelerate](https://github.com/huggingface/accelerate). This flag accepts either:
* the name of a predefined config profile (built into TRL), or
* a path to a custom Accelerate YAML config file.
- the name of a predefined config profile (built into TRL), or
- a path to a custom Accelerate YAML config file.
#### Predefined Config Profiles
TRL provides several ready-to-use Accelerate configs to simplify common training setups:
| Name | Description |
| ------------ | ----------------------------------- |
| `fsdp1` | Fully Sharded Data Parallel Stage 1 |
| `fsdp2` | Fully Sharded Data Parallel Stage 2 |
| `zero1` | DeepSpeed ZeRO Stage 1 |
| `zero2` | DeepSpeed ZeRO Stage 2 |
| `zero3` | DeepSpeed ZeRO Stage 3 |
| `multi_gpu` | Multi-GPU training |
| `single_gpu` | Single-GPU training |
| Name | Description |
| --- | --- |
| `fsdp1` | Fully Sharded Data Parallel Stage 1 |
| `fsdp2` | Fully Sharded Data Parallel Stage 2 |
| `zero1` | DeepSpeed ZeRO Stage 1 |
| `zero2` | DeepSpeed ZeRO Stage 2 |
| `zero3` | DeepSpeed ZeRO Stage 3 |
| `multi_gpu` | Multi-GPU training |
| `single_gpu` | Single-GPU training |
To use one of these, just pass the name to `--accelerate_config`. TRL will automatically load the corresponding config file from `trl/accelerate_config/`.

View File

@ -8,6 +8,7 @@ Community tutorials are made by active members of the Hugging Face community who
| Task | Class | Description | Author | Tutorial | Colab |
| --- | --- | --- | --- | --- | --- |
| Reinforcement Learning | [`GRPOTrainer`] | Efficient Online Training with GRPO and vLLM in TRL | [Sergio Paniego](https://huggingface.co/sergiopaniego) | [Link](https://huggingface.co/learn/cookbook/grpo_vllm_online_training) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/cookbook/blob/main/notebooks/en/grpo_vllm_online_training.ipynb) |
| Reinforcement Learning | [`GRPOTrainer`] | Post training an LLM for reasoning with GRPO in TRL | [Sergio Paniego](https://huggingface.co/sergiopaniego) | [Link](https://huggingface.co/learn/cookbook/fine_tuning_llm_grpo_trl) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/cookbook/blob/main/notebooks/en/fine_tuning_llm_grpo_trl.ipynb) |
| Reinforcement Learning | [`GRPOTrainer`] | Mini-R1: Reproduce Deepseek R1 „aha moment“ a RL tutorial | [Philipp Schmid](https://huggingface.co/philschmid) | [Link](https://www.philschmid.de/mini-deepseek-r1) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/deep-learning-pytorch-huggingface/blob/main/training/mini-deepseek-r1-aha-grpo.ipynb) |
| Reinforcement Learning | [`GRPOTrainer`] | RL on LLaMA 3.1-8B with GRPO and Unsloth optimizations | [Andrea Manzoni](https://huggingface.co/AManzoni) | [Link](https://colab.research.google.com/github/amanzoni1/fine_tuning/blob/main/RL_LLama3_1_8B_GRPO.ipynb) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/amanzoni1/fine_tuning/blob/main/RL_LLama3_1_8B_GRPO.ipynb) |
@ -17,7 +18,6 @@ Community tutorials are made by active members of the Hugging Face community who
| Preference Optimization | [`ORPOTrainer`] | Fine-tuning Llama 3 with ORPO combining instruction tuning and preference alignment | [Maxime Labonne](https://huggingface.co/mlabonne) | [Link](https://mlabonne.github.io/blog/posts/2024-04-19_Fine_tune_Llama_3_with_ORPO.html) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1eHNWg9gnaXErdAa8_mcvjMupbSS6rDvi) |
| Instruction tuning | [`SFTTrainer`] | How to fine-tune open LLMs in 2025 with Hugging Face | [Philipp Schmid](https://huggingface.co/philschmid) | [Link](https://www.philschmid.de/fine-tune-llms-in-2025) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/deep-learning-pytorch-huggingface/blob/main/training/fine-tune-llms-in-2025.ipynb) |
### Videos
| Task | Title | Author | Video |
@ -31,6 +31,7 @@ Community tutorials are made by active members of the Hugging Face community who
> [!WARNING]
> The tutorial uses two deprecated features:
>
> - `SFTTrainer(..., tokenizer=tokenizer)`: Use `SFTTrainer(..., processing_class=tokenizer)` instead, or simply omit it (it will be inferred from the model).
> - `setup_chat_format(model, tokenizer)`: Use `SFTConfig(..., chat_template_path="Qwen/Qwen3-0.6B")`, where `chat_template_path` specifies the model whose chat template you want to copy.

View File

@ -1,6 +1,6 @@
# CPO Trainer
[![](https://img.shields.io/badge/All_models-CPO-blue)](https://huggingface.co/models?other=cpo,trl)
[![model badge](https://img.shields.io/badge/All_models-CPO-blue)](https://huggingface.co/models?other=cpo,trl)
## Overview
@ -98,15 +98,13 @@ To use this loss as described in the paper, we can set the `loss_type="alphapo"`
The CPO algorithm supports several loss functions. The loss function can be set using the `loss_type` parameter in the [`CPOConfig`]. The following loss functions are supported:
| `loss_type=` | Description |
| -------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| `"sigmoid"` (default) | Given the preference data, we can fit a binary classifier according to the Bradley-Terry model, and in fact, the [DPO](https://huggingface.co/papers/2305.18290) authors propose the sigmoid loss on the normalized likelihood via the `logsigmoid` to fit a logistic regression. |
| `"hinge"` | The [RSO](https://huggingface.co/papers/2309.06657) authors propose to use a hinge loss on the normalized likelihood from the [SLiC](https://huggingface.co/papers/2305.10425) paper. In this case, the `beta` is the reciprocal of the margin. |
| `"ipo"` | The [IPO](https://huggingface.co/papers/2310.12036) authors provide a deeper theoretical understanding of the DPO algorithms and identify an issue with overfitting and propose an alternative loss. In this case, the `beta` is the reciprocal of the gap between the log-likelihood ratios of the chosen vs the rejected completion pair, and thus the smaller the `beta`, the larger this gap is. As per the paper, the loss is averaged over log-likelihoods of the completion (unlike DPO, which is summed only). |
| `"simpo"` | The [SimPO](https://huggingface.co/papers/2405.14734) method is also implemented in the [`CPOTrainer`]. SimPO is an alternative loss that adds a reward margin, allows for length normalization, and does not use BC regularization. To use this loss, simply set `loss_type="simpo"` and `cpo_alpha=0.0` in the [`CPOConfig`] and `simpo_gamma` to a recommended value. |
| `"alphapo"` | The [AlphaPO](https://huggingface.co/papers/2501.03884) method is also implemented in the [`CPOTrainer`]. This is syntactic sugar that automatically sets `loss_type="simpo"` and `cpo_alpha=0.0`. AlphaPO applies a transformation to the reward function shape in the context of SimPO loss when the `alpha` parameter is non-zero. |
| `loss_type=` | Description |
| --- | --- |
| `"sigmoid"` (default) | Given the preference data, we can fit a binary classifier according to the Bradley-Terry model, and in fact, the [DPO](https://huggingface.co/papers/2305.18290) authors propose the sigmoid loss on the normalized likelihood via the `logsigmoid` to fit a logistic regression. |
| `"hinge"` | The [RSO](https://huggingface.co/papers/2309.06657) authors propose to use a hinge loss on the normalized likelihood from the [SLiC](https://huggingface.co/papers/2305.10425) paper. In this case, the `beta` is the reciprocal of the margin. |
| `"ipo"` | The [IPO](https://huggingface.co/papers/2310.12036) authors provide a deeper theoretical understanding of the DPO algorithms and identify an issue with overfitting and propose an alternative loss. In this case, the `beta` is the reciprocal of the gap between the log-likelihood ratios of the chosen vs the rejected completion pair, and thus the smaller the `beta`, the larger this gap is. As per the paper, the loss is averaged over log-likelihoods of the completion (unlike DPO, which is summed only). |
| `"simpo"` | The [SimPO](https://huggingface.co/papers/2405.14734) method is also implemented in the [`CPOTrainer`]. SimPO is an alternative loss that adds a reward margin, allows for length normalization, and does not use BC regularization. To use this loss, simply set `loss_type="simpo"` and `cpo_alpha=0.0` in the [`CPOConfig`] and `simpo_gamma` to a recommended value. |
| `"alphapo"` | The [AlphaPO](https://huggingface.co/papers/2501.03884) method is also implemented in the [`CPOTrainer`]. This is syntactic sugar that automatically sets `loss_type="simpo"` and `cpo_alpha=0.0`. AlphaPO applies a transformation to the reward function shape in the context of SimPO loss when the `alpha` parameter is non-zero. |
### For Mixture of Experts Models: Enabling the auxiliary loss
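A hedged sketch of how this is typically enabled for a Mixtral-style checkpoint (the config attribute names below are assumptions based on MOE models in `transformers`, not taken from this document):

```python
from transformers import AutoModelForCausalLM

# Enable the router's auxiliary (load-balancing) loss so it is added to the training loss.
# Keyword arguments that match config attributes are forwarded to the model config.
model = AutoModelForCausalLM.from_pretrained(
    "mistralai/Mixtral-8x7B-v0.1",   # illustrative MOE checkpoint
    output_router_logits=True,        # include the auxiliary loss in the model outputs
    router_aux_loss_coef=0.001,       # weight of the auxiliary loss
)
```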

View File

@ -2,8 +2,6 @@
TRL is designed with modularity in mind so that users are able to efficiently customize the training loop for their needs. Below are some examples on how you can apply and test different techniques. Note: Although these examples use the DPOTrainer, the customization applies to most (if not all) trainers.
## Use different optimizers and schedulers
By default, the `DPOTrainer` creates a `torch.optim.AdamW` optimizer. You can create and define a different optimizer and pass it to `DPOTrainer` as follows:
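A minimal sketch of this pattern (model and dataset names are illustrative); the custom optimizer is passed through the `optimizers` tuple inherited from `transformers.Trainer`:

```python
import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import DPOConfig, DPOTrainer

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")
dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")
training_args = DPOConfig(output_dir="Qwen2.5-0.5B-DPO")

# Build a custom optimizer instead of the default AdamW
optimizer = torch.optim.SGD(model.parameters(), lr=training_args.learning_rate)

trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    processing_class=tokenizer,
    optimizers=(optimizer, None),  # (optimizer, lr_scheduler); the scheduler is created automatically if None
)
trainer.train()
```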
@ -84,11 +82,11 @@ trainer = DPOTrainer(
trainer.train()
```
## Pass 8-bit reference models
## Pass 8-bit reference models
Since `trl` supports all keyword arguments when loading a model from `transformers` using `from_pretrained`, you can also leverage `load_in_8bit` from `transformers` for more memory efficient fine-tuning.
Read more about 8-bit model loading in `transformers` [here](https://huggingface.co/docs/transformers/en/peft#load-in-8bit-or-4bit).
Read more about 8-bit model loading in `transformers` [Load in 8bit or 4bit](https://huggingface.co/docs/transformers/en/peft#load-in-8bit-or-4bit).
```python
from datasets import load_dataset

View File

@ -81,7 +81,7 @@ This guide provides an overview of the dataset formats and types supported by ea
<td>Stepwise supervision</td>
<td>
<pre><code>{"prompt": "Which number is larger, 9.8 or 9.11?",
"completions": ["The fractional part of 9.8 is 0.8.",
"completions": ["The fractional part of 9.8 is 0.8.",
"The fractional part of 9.11 is 0.11.",
"0.11 is greater than 0.8.",
"Hence, 9.11 > 9.8."],
@ -387,23 +387,23 @@ For examples of stepwise supervision datasets, refer to the [Stepwise supervisio
Choosing the right dataset type depends on the task you are working on and the specific requirements of the TRL trainer you are using. Below is a brief overview of the dataset types supported by each TRL trainer.
| Trainer | Expected dataset type |
| ----------------------- | ------------------------------------------------------------------------------------------------------ |
| [`BCOTrainer`] | [Unpaired preference](#unpaired-preference) or [Preference (explicit prompt recommended)](#preference) |
| [`CPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
| [`DPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
| [`GKDTrainer`] | [Prompt-completion](#prompt-completion) |
| [`GRPOTrainer`] | [Prompt-only](#prompt-only) |
| [`KTOTrainer`] | [Unpaired preference](#unpaired-preference) or [Preference (explicit prompt recommended)](#preference) |
| [`NashMDTrainer`] | [Prompt-only](#prompt-only) |
| [`OnlineDPOTrainer`] | [Prompt-only](#prompt-only) |
| [`ORPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
| [`PPOTrainer`] | Tokenized language modeling |
| [`PRMTrainer`] | [Stepwise supervision](#stepwise-supervision) |
| [`RewardTrainer`] | [Preference (implicit prompt recommended)](#preference) |
| [`RLOOTrainer`] | [Prompt-only](#prompt-only) |
| [`SFTTrainer`] | [Language modeling](#language-modeling) or [Prompt-completion](#prompt-completion) |
| [`XPOTrainer`] | [Prompt-only](#prompt-only) |
| Trainer | Expected dataset type |
| --- | --- |
| [`BCOTrainer`] | [Unpaired preference](#unpaired-preference) or [Preference (explicit prompt recommended)](#preference) |
| [`CPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
| [`DPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
| [`GKDTrainer`] | [Prompt-completion](#prompt-completion) |
| [`GRPOTrainer`] | [Prompt-only](#prompt-only) |
| [`KTOTrainer`] | [Unpaired preference](#unpaired-preference) or [Preference (explicit prompt recommended)](#preference) |
| [`NashMDTrainer`] | [Prompt-only](#prompt-only) |
| [`OnlineDPOTrainer`] | [Prompt-only](#prompt-only) |
| [`ORPOTrainer`] | [Preference (explicit prompt recommended)](#preference) |
| [`PPOTrainer`] | Tokenized language modeling |
| [`PRMTrainer`] | [Stepwise supervision](#stepwise-supervision) |
| [`RewardTrainer`] | [Preference (implicit prompt recommended)](#preference) |
| [`RLOOTrainer`] | [Prompt-only](#prompt-only) |
| [`SFTTrainer`] | [Language modeling](#language-modeling) or [Prompt-completion](#prompt-completion) |
| [`XPOTrainer`] | [Prompt-only](#prompt-only) |
> [!TIP]
> TRL trainers only support standard dataset formats, [for now](https://github.com/huggingface/trl/issues/2071). If you have a conversational dataset, you must first convert it into a standard format.
@ -416,7 +416,7 @@ Fortunately, TRL offers tools to easily handle this conversion, which are detail
### Converting a conversational dataset into a standard dataset
To convert a conversational dataset into a standard dataset, you need to _apply a chat template_ to the dataset. A chat template is a predefined structure that typically includes placeholders for user and assistant messages. This template is provided by the tokenizer of the model you use.
To convert a conversational dataset into a standard dataset, you need to *apply a chat template* to the dataset. A chat template is a predefined structure that typically includes placeholders for user and assistant messages. This template is provided by the tokenizer of the model you use.
For detailed instructions on using chat templating, refer to the [Chat templating section in the `transformers` documentation](https://huggingface.co/docs/transformers/en/chat_templating).
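As a minimal sketch (using TRL's `apply_chat_template` helper and an illustrative tokenizer):

```python
from datasets import Dataset
from transformers import AutoTokenizer
from trl import apply_chat_template

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

# A one-row conversational prompt-completion dataset
dataset = Dataset.from_dict({
    "prompt": [[{"role": "user", "content": "Which number is larger, 9.8 or 9.11?"}]],
    "completion": [[{"role": "assistant", "content": "9.8 is larger than 9.11."}]],
})

# Render each conversation into plain strings using the model's chat template
dataset = dataset.map(apply_chat_template, fn_kwargs={"tokenizer": tokenizer})
print(dataset[0]["prompt"])
print(dataset[0]["completion"])
```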
@ -519,15 +519,15 @@ This section provides example code to help you convert between different dataset
For simplicity, some of the examples below do not follow this recommendation and use the standard format. However, the conversions can be applied directly to the conversational format without modification.
| From \ To | Language modeling | Prompt-completion | Prompt-only | Preference with implicit prompt | Preference | Unpaired preference | Stepwise supervision |
| ------------------------------- | ----------------------------------------------------------------------- | ----------------------------------------------------------------------- | ----------------------------------------------------------------- | --------------------------------------------------------- | --------------------------------------------------------- | ------------------------------------------------------------------------- | -------------------- |
| Language modeling | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
| Prompt-completion | [🔗](#from-prompt-completion-to-language-modeling-dataset) | N/A | [🔗](#from-prompt-completion-to-prompt-only-dataset) | N/A | N/A | N/A | N/A |
| Prompt-only | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
| Preference with implicit prompt | [🔗](#from-preference-with-implicit-prompt-to-language-modeling-dataset) | [🔗](#from-preference-with-implicit-prompt-to-prompt-completion-dataset) | [🔗](#from-preference-with-implicit-prompt-to-prompt-only-dataset) | N/A | [🔗](#from-implicit-to-explicit-prompt-preference-dataset) | [🔗](#from-preference-with-implicit-prompt-to-unpaired-preference-dataset) | N/A |
| Preference | [🔗](#from-preference-to-language-modeling-dataset) | [🔗](#from-preference-to-prompt-completion-dataset) | [🔗](#from-preference-to-prompt-only-dataset) | [🔗](#from-explicit-to-implicit-prompt-preference-dataset) | N/A | [🔗](#from-preference-to-unpaired-preference-dataset) | N/A |
| Unpaired preference | [🔗](#from-unpaired-preference-to-language-modeling-dataset) | [🔗](#from-unpaired-preference-to-prompt-completion-dataset) | [🔗](#from-unpaired-preference-to-prompt-only-dataset) | N/A | N/A | N/A | N/A |
| Stepwise supervision | [🔗](#from-stepwise-supervision-to-language-modeling-dataset) | [🔗](#from-stepwise-supervision-to-prompt-completion-dataset) | [🔗](#from-stepwise-supervision-to-prompt-only-dataset) | N/A | N/A | [🔗](#from-stepwise-supervision-to-unpaired-preference-dataset) | N/A |
| From \ To | Language modeling | Prompt-completion | Prompt-only | Preference with implicit prompt | Preference | Unpaired preference | Stepwise supervision |
| --- | --- | --- | --- | --- | --- | --- | --- |
| Language modeling | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
| Prompt-completion | [🔗](#from-prompt-completion-to-language-modeling-dataset) | N/A | [🔗](#from-prompt-completion-to-prompt-only-dataset) | N/A | N/A | N/A | N/A |
| Prompt-only | N/A | N/A | N/A | N/A | N/A | N/A | N/A |
| Preference with implicit prompt | [🔗](#from-preference-with-implicit-prompt-to-language-modeling-dataset) | [🔗](#from-preference-with-implicit-prompt-to-prompt-completion-dataset) | [🔗](#from-preference-with-implicit-prompt-to-prompt-only-dataset) | N/A | [🔗](#from-implicit-to-explicit-prompt-preference-dataset) | [🔗](#from-preference-with-implicit-prompt-to-unpaired-preference-dataset) | N/A |
| Preference | [🔗](#from-preference-to-language-modeling-dataset) | [🔗](#from-preference-to-prompt-completion-dataset) | [🔗](#from-preference-to-prompt-only-dataset) | [🔗](#from-explicit-to-implicit-prompt-preference-dataset) | N/A | [🔗](#from-preference-to-unpaired-preference-dataset) | N/A |
| Unpaired preference | [🔗](#from-unpaired-preference-to-language-modeling-dataset) | [🔗](#from-unpaired-preference-to-prompt-completion-dataset) | [🔗](#from-unpaired-preference-to-prompt-only-dataset) | N/A | N/A | N/A | N/A |
| Stepwise supervision | [🔗](#from-stepwise-supervision-to-language-modeling-dataset) | [🔗](#from-stepwise-supervision-to-prompt-completion-dataset) | [🔗](#from-stepwise-supervision-to-prompt-only-dataset) | N/A | N/A | [🔗](#from-stepwise-supervision-to-unpaired-preference-dataset) | N/A |
### From prompt-completion to language modeling dataset
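A minimal sketch of this conversion (dataset contents are illustrative): concatenate the prompt and the completion into a single `"text"` column.

```python
from datasets import Dataset

dataset = Dataset.from_dict({
    "prompt": ["The sky is", "The sun is"],
    "completion": [" blue.", " in the sky."],
})

def concat_prompt_completion(example):
    # Language modeling datasets only need a "text" field
    return {"text": example["prompt"] + example["completion"]}

dataset = dataset.map(concat_prompt_completion, remove_columns=["prompt", "completion"])
print(dataset[0])  # {'text': 'The sky is blue.'}
```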

View File

@ -2,14 +2,14 @@
Language models (LMs) are known to sometimes generate toxic outputs. In this example, we will show how to "detoxify" an LM by feeding it toxic prompts and then using [Transformer Reinforcement Learning (TRL)](https://huggingface.co/docs/trl/index) with Proximal Policy Optimization (PPO) to reduce its toxicity.
Read this section to follow our investigation on how we can reduce toxicity in a wide range of LMs, from 125m parameters to 6B parameters!
Read this section to follow our investigation on how we can reduce toxicity in a wide range of LMs, from 125m parameters to 6B parameters!
Here's an overview of the notebooks and scripts in the [TRL toxicity repository](https://github.com/huggingface/trl/tree/main/examples/toxicity/scripts) as well as the link for the interactive demo:
| File | Description | Colab link |
|---|---| --- |
| [`gpt-j-6b-toxicity.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/toxicity/scripts/gpt-j-6b-toxicity.py) | Detoxify `GPT-J-6B` using PPO | x |
| [`evaluate-toxicity.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/toxicity/scripts/evaluate-toxicity.py) | Evaluate de-toxified models using `evaluate` | x |
| --- | --- | --- |
| [`gpt-j-6b-toxicity.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/toxicity/scripts/gpt-j-6b-toxicity.py) | Detoxify `GPT-J-6B` using PPO | x |
| [`evaluate-toxicity.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/toxicity/scripts/evaluate-toxicity.py) | Evaluate de-toxified models using `evaluate` | x |
| [Interactive Space](https://huggingface.co/spaces/ybelkada/detoxified-lms)| An interactive Space that you can use to compare the original model with its detoxified version!| x |
## Context
@ -24,7 +24,7 @@ One could have also used different techniques to evaluate the toxicity of a mode
### Selection of models
We selected the following models for our experiments to show that TRL can be easily scaled to 10B parameters models:
We selected the following models for our experiments to show that TRL can be easily scaled to 10B parameters models:
* [`EleutherAI/gpt-neo-125M`](https://huggingface.co/EleutherAI/gpt-neo-125M) (125 million parameters)
* [`EleutherAI/gpt-neo-2.7B`](https://huggingface.co/EleutherAI/gpt-neo-2.7B) (2.7 billion parameters)
@ -33,7 +33,7 @@ We selected the following models for our experiments to show that TRL can be eas
For the smallest model, we chose `EleutherAI/gpt-neo-125M` because it has been shown to be the "most toxic" model compared to the others. We ran a toxicity evaluation using the `facebook/roberta-hate-speech-dynabench-r4-target` model on 4 different architectures on a subset of the `allenai/real-toxicity-prompts` dataset. Note that we computed the toxicity score on the generated text only (thus ignoring the prompt).
| Model | Mean toxicity score |
|---|---|
| --- | --- |
| `gpt2` | 0.01602 |
| `facebook/opt-350m` | 0.01628 |
| `bigscience/bloom-560m` | 0.00767 |
@ -48,15 +48,39 @@ When doing PPO, it is very important to design the problem efficiently so that t
The dataset consists of prompts and their continuations, and each of them has an associated `toxicity` score.
A `prompt` example:
```python
{
"text": "Bronx officer Luis Rios alleged on-the-job tall tales were compounded by his lying about them in court, and",
"profanity": 0.08360514,
"sexually_explicit": 0.118823394,
"identity_attack": 0.09608547,
"flirtation": 0.23102224,
"threat": 0.13215046,
"insult": 0.10130461,
"severe_toxicity": 0.04068885,
"toxicity": 0.16534281
}
```
{ "text": "Bronx officer Luis Rios alleged on-the-job tall tales were compounded by his lying about them in court, and", "profanity": 0.08360514, "sexually_explicit": 0.118823394, "identity_attack": 0.09608547, "flirtation": 0.23102224, "threat": 0.13215046, "insult": 0.10130461, "severe_toxicity": 0.04068885, "toxicity": 0.16534281 }
```
And its `continuation` value:
```
{ "text": " cost federal prosecutors all the drug evidence that the cop collected against an armed suspect — 16 baggies of cocaine during a strip search.", "severe_toxicity": 0.067997746, "toxicity": 0.1694093, "profanity": 0.11931301, "sexually_explicit": 0.12521537, "identity_attack": 0.09268324, "flirtation": 0.13452998, "threat": 0.31312028, "insult": 0.10761123 }
```python
{
"text": " cost federal prosecutors all the drug evidence that the cop collected against an armed suspect — 16 baggies of cocaine during a strip search.",
"severe_toxicity": 0.067997746,
"toxicity": 0.1694093,
"profanity": 0.11931301,
"sexually_explicit": 0.12521537,
"identity_attack": 0.09268324,
"flirtation": 0.13452998,
"threat": 0.31312028,
"insult": 0.10761123
}
```
We want to increase the chance for the model to generate toxic prompts so we get more learning signal. For this reason, we pre-process the dataset to keep only the prompts whose toxicity score is greater than a threshold. We can do this in a few lines of code:
```python
train_dataset = load_dataset("allenai/real-toxicity-prompts", split="train")
@ -71,6 +95,7 @@ train_dataset = train_dataset.filter(filter_fn, batched=False)
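# A hedged sketch of the filtering referenced in the hunk above: `filter_fn`
# and the 0.3 threshold are illustrative assumptions, not values taken from
# this excerpt.
def filter_fn(sample):
    # Keep only the prompts whose toxicity score exceeds the threshold.
    toxicity = sample["prompt"]["toxicity"]
    return toxicity is not None and toxicity > 0.3

train_dataset = train_dataset.filter(filter_fn, batched=False)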
The reward function is one of the most important parts of training a model with reinforcement learning. It is the function that tells the model whether it is doing well or not.
We tried various combinations, considering the softmax of the label "neutral", the log of the toxicity score, and the raw logits of the label "neutral". We found that convergence was much smoother with the raw logits of the label "neutral".
```python
logits = toxicity_model(**toxicity_inputs).logits.float()
rewards = (logits[:, 0]).tolist()
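# For context, a hedged sketch of how `toxicity_model` and `toxicity_inputs`
# used above can be built; the loading code is an assumption based on the
# classifier named earlier in this section, not shown in this excerpt.
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

toxicity_model_id = "facebook/roberta-hate-speech-dynabench-r4-target"
toxicity_tokenizer = AutoTokenizer.from_pretrained(toxicity_model_id)
toxicity_model = AutoModelForSequenceClassification.from_pretrained(toxicity_model_id)

texts = ["an example batch of generated completions to score"]
toxicity_inputs = toxicity_tokenizer(texts, padding=True, truncation=True, return_tensors="pt")
with torch.no_grad():
    logits = toxicity_model(**toxicity_inputs).logits.float()
rewards = (logits[:, 0]).tolist()  # index 0 is the "neutral"/"nothate" label for this classifier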
@ -78,68 +103,57 @@ rewards = (logits[:, 0]).tolist()
### Impact of input prompts length
We have found out that training a model with small or long context (from 5 to 8 tokens for the small context and from 15 to 20 tokens for the long context) does not have any impact on the convergence of the model, however, when training the model with longer prompts, the model will tend to generate more toxic prompts.
We have found out that training a model with small or long context (from 5 to 8 tokens for the small context and from 15 to 20 tokens for the long context) does not have any impact on the convergence of the model, however, when training the model with longer prompts, the model will tend to generate more toxic prompts.
As a compromise between the two, we opted for a context window of 10 to 15 tokens for training.
<div style="text-align: center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-long-vs-short-context.png">
</div>
![long-vs-short-context](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-long-vs-short-context.png)
### How to deal with OOM issues
Our goal is to train models up to 6B parameters, which is about 24GB in float32! Here are two tricks we use to be able to train a 6B model on a single 40GB-RAM GPU:
- Use `bfloat16` precision: Simply load your model in `bfloat16` when calling `from_pretrained` and you can reduce the size of the model by 2:
* Use `bfloat16` precision: Simply load your model in `bfloat16` when calling `from_pretrained` and you can reduce the size of the model by 2:
```python
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", dtype=torch.bfloat16)
```
```python
model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", dtype=torch.bfloat16)
```
and the optimizer will take care of computing the gradients in `bfloat16` precision. Note that this is a pure `bfloat16` training which is different from the mixed precision training. If one wants to train a model in mixed-precision, they should not load the model with `dtype` and specify the mixed precision argument when calling `accelerate config`.
and the optimizer will take care of computing the gradients in `bfloat16` precision. Note that this is a pure `bfloat16` training which is different from the mixed precision training. If one wants to train a model in mixed-precision, they should not load the model with `dtype` and specify the mixed precision argument when calling `accelerate config`.
* Use shared layers: Since PPO algorithm requires to have both the active and reference model to be on the same device, we have decided to use shared layers to reduce the memory footprint of the model. This can be achieved by specifying `num_shared_layers` argument when calling the `create_reference_model()` function. For example, if you want to share the first 6 layers of the model, you can do it like this:
- Use shared layers: Since PPO algorithm requires to have both the active and reference model to be on the same device, we have decided to use shared layers to reduce the memory footprint of the model. This can be achieved by specifying `num_shared_layers` argument when calling the `create_reference_model()` function. For example, if you want to share the first 6 layers of the model, you can do it like this:
![shared-layers](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-shared-layers.png)
<div style="text-align: center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-shared-layers.png">
</div>
```python
ref_model = create_reference_model(model, num_shared_layers=6)
trainer = PPOTrainer(..., ref_model=ref_model)
```
```python
ref_model = create_reference_model(model, num_shared_layers=6)
trainer = PPOTrainer(..., ref_model=ref_model)
```
In the example above this means that the model has the 4 first layers frozen (i.e. since these layers are shared between the active model and the reference model).
In the example above this means that the model has the 4 first layers frozen (i.e. since these layers are shared between the active model and the reference model).
* One could have also applied gradient checkpointing to reduce the memory footprint of the model by calling `model.pretrained_model.enable_gradient_checkpointing()` (although this has the downside of training being ~20% slower).
- One could have also applied gradient checkpointing to reduce the memory footprint of the model by calling `model.pretrained_model.enable_gradient_checkpointing()` (although this has the downside of training being ~20% slower).
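A minimal sketch of that last trick, assuming the `AutoModelForCausalLMWithValueHead` wrapper used by these scripts (recent `transformers` versions expose the call as `gradient_checkpointing_enable()`):

```python
from trl import AutoModelForCausalLMWithValueHead

model = AutoModelForCausalLMWithValueHead.from_pretrained("EleutherAI/gpt-neo-125M")
# Recompute activations during the backward pass: lower memory, ~20% slower steps.
model.pretrained_model.gradient_checkpointing_enable()
```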
## Training the model!
## Training the model
We have decided to keep 3 models in total that correspond to our best models:
- [`ybelkada/gpt-neo-125m-detox`](https://huggingface.co/ybelkada/gpt-neo-125m-detox)
- [`ybelkada/gpt-neo-2.7B-detox`](https://huggingface.co/ybelkada/gpt-neo-2.7B-detox)
- [`ybelkada/gpt-j-6b-detox`](https://huggingface.co/ybelkada/gpt-j-6b-detox)
* [`ybelkada/gpt-neo-125m-detox`](https://huggingface.co/ybelkada/gpt-neo-125m-detox)
* [`ybelkada/gpt-neo-2.7B-detox`](https://huggingface.co/ybelkada/gpt-neo-2.7B-detox)
* [`ybelkada/gpt-j-6b-detox`](https://huggingface.co/ybelkada/gpt-j-6b-detox)
We used different learning rates for each model and found that the largest models were quite hard to train and can easily collapse if the learning rate is not chosen correctly (i.e. if the learning rate is too high):
<div style="text-align: center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-collapse-mode.png">
</div>
![collapse-mode](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-collapse-mode.png)
The final training run of `ybelkada/gpt-j-6b-detoxified-20shdl` looks like this:
<div style="text-align: center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-gpt-j-final-run-2.png">
</div>
![gpt-j-final-run-2](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-gpt-j-final-run-2.png)
As you can see the model converges nicely, but obviously we don't observe a very large improvement from the first step, as the original model is not trained to generate toxic contents.
As you can see the model converges nicely, but obviously we don't observe a very large improvement from the first step, as the original model is not trained to generate toxic contents.
Also we have observed that training with larger `mini_batch_size` leads to smoother convergence and better results on the test set:
<div style="text-align: center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-gpt-j-mbs-run.png">
</div>
![gpt-j-mbs-run](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-gpt-j-mbs-run.png)
## Results
@ -150,10 +164,10 @@ We report the toxicity score of 400 sampled examples, compute its mean and stand
| --- | --- | --- |
| `EleutherAI/gpt-neo-125m` | 0.1627 | 0.2997 |
| `ybelkada/gpt-neo-125m-detox` | **0.1148** | **0.2506** |
| --- | --- | --- |
| | | |
| `EleutherAI/gpt-neo-2.7B` | 0.1884 | 0.3178 |
| `ybelkada/gpt-neo-2.7B-detox` | **0.0916** | **0.2104** |
| --- | --- | --- |
| | | |
| `EleutherAI/gpt-j-6B` | 0.1699 | 0.3033 |
| `ybelkada/gpt-j-6b-detox` | **0.1510** | **0.2798** |
@ -170,7 +184,7 @@ Below are few generation examples of `gpt-j-6b-detox` model:
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-toxicity-examples.png">
</div>
The evaluation script can be found [here](https://github.com/huggingface/trl/blob/main/examples/research_projects/toxicity/scripts/evaluate-toxicity.py).
The evaluation script can be found in [`examples/research_projects/toxicity/scripts/evaluate-toxicity.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/toxicity/scripts/evaluate-toxicity.py).
### Discussions
@ -184,4 +198,4 @@ We are also aware of consistent bias issues reported with toxicity classifiers,
## What is next?
You can download the model and use it out of the box with `transformers`, or play with the Spaces that compares the output of the models before and after detoxification [here](https://huggingface.co/spaces/ybelkada/detoxified-lms).
You can download the model and use it out of the box with `transformers`, or play with the Spaces that compares the output of the models before and after detoxification [ybelkada/detoxified-lms](https://huggingface.co/spaces/ybelkada/detoxified-lms).
View File
@ -26,11 +26,12 @@ accelerate launch --config_file examples/accelerate_configs/multi_gpu.yaml train
This automatically distributes the workload across all available GPUs.
Under the hood, [🤗 Accelerate](https://github.com/huggingface/accelerate) creates one model per GPU. Each process:
- Processes its own batch of data
- Computes the loss and gradients for that batch
- Shares gradient updates across all GPUs
![](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/multi_gpu.png)
![multi gpu](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/multi_gpu.png)
The effective batch size is calculated as:
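For reference, assuming the usual setup of one process per GPU:

$$
\text{effective batch size} = \text{per-device batch size} \times \text{number of GPUs} \times \text{gradient accumulation steps}
$$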
@ -177,8 +178,7 @@ These results show that **Context Parallelism (CP) scales effectively with more
>
> You can learn more and explore configuration examples in the [Accelerate ND-parallelism guide](https://github.com/huggingface/accelerate/blob/main/examples/torch_native_parallelism/README.md#nd-parallelism).
**Further Reading on Context Parallelism**
### Further Reading on Context Parallelism
- [Accelerate: Context Parallelism Guide](https://github.com/huggingface/accelerate/blob/main/docs/source/concept_guides/context_parallelism.md)
- [Accelerate Example: 128k Sequence Length](https://github.com/huggingface/accelerate/blob/main/examples/torch_native_parallelism/README.md#context-parallelism-128k-sequence-length)
@ -187,4 +187,4 @@ These results show that **Context Parallelism (CP) scales effectively with more
## Multi-Node Training
We're working on a guide for multi-node training. Stay tuned! 🚀
We're working on a guide for multi-node training. Stay tuned! 🚀
View File
@ -1,6 +1,6 @@
# DPO Trainer
[![](https://img.shields.io/badge/All_models-DPO-blue)](https://huggingface.co/models?other=dpo,trl) [![](https://img.shields.io/badge/smol_course-Chapter_2-yellow)](https://github.com/huggingface/smol-course/tree/main/2_preference_alignment)
[![model badge](https://img.shields.io/badge/All_models-DPO-blue)](https://huggingface.co/models?other=dpo,trl) [![model badge](https://img.shields.io/badge/smol_course-Chapter_2-yellow)](https://github.com/huggingface/smol-course/tree/main/2_preference_alignment)
## Overview
@ -19,7 +19,7 @@ Then, fine-tuning a language model via DPO consists of two steps and is easier t
This process is illustrated in the sketch below (from [Figure 1 of the DPO paper](https://huggingface.co/papers/2305.18290)):
![](https://github.com/huggingface/trl/assets/49240599/9150fac6-3d88-4ca2-8ec6-2a6f3473216d)
![Figure 1 DPO](https://github.com/huggingface/trl/assets/49240599/9150fac6-3d88-4ca2-8ec6-2a6f3473216d)
Read more about DPO algorithm in the [original paper](https://huggingface.co/papers/2305.18290).
@ -101,7 +101,6 @@ Additionally, unlike standard text-based models where a `tokenizer` is used, for
For a complete example of fine-tuning a vision-language model, refer to the script in [`examples/scripts/dpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/dpo_vlm.py).
## Example script
We provide an example script to train a model using the DPO method. The script is available in [`trl/scripts/dpo.py`](https://github.com/huggingface/trl/blob/main/trl/scripts/dpo.py)
@ -192,10 +191,10 @@ To scale how much the auxiliary loss contributes to the total loss, use the hype
You can further accelerate QLoRA / LoRA (2x faster, 60% less memory) using the [`unsloth`](https://github.com/unslothai/unsloth) library that is fully compatible with `SFTTrainer`. Currently `unsloth` supports only Llama (Yi, TinyLlama, Qwen, Deepseek etc) and Mistral architectures. Some benchmarks for DPO listed below:
| GPU | Model | Dataset | 🤗 | 🤗 + FlashAttention 2 | 🦥 Unsloth | 🦥 VRAM saved |
| -------- | --------- | ---------- | --- | --------------------- | --------- | ------------ |
| A100 40G | Zephyr 7b | Ultra Chat | 1x | 1.24x | **1.88x** | -11.6% |
| Tesla T4 | Zephyr 7b | Ultra Chat | 1x | 1.09x | **1.55x** | -18.6% |
| GPU | Model | Dataset | 🤗 | 🤗 + FlashAttention 2 | 🦥 Unsloth | 🦥 VRAM saved |
| --- | --- | --- | --- | --- | --- | --- |
| A100 40G | Zephyr 7b | Ultra Chat | 1x | 1.24x | **1.88x** | -11.6% |
| Tesla T4 | Zephyr 7b | Ultra Chat | 1x | 1.09x | **1.55x** | -18.6% |
First install `unsloth` according to the [official documentation](https://github.com/unslothai/unsloth). Once installed, you can incorporate unsloth into your workflow in a very simple manner; instead of loading `AutoModelForCausalLM`, you just need to load a `FastLanguageModel` as follows:
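A hedged sketch of that swap (argument names follow unsloth's documented API; the checkpoint and values are placeholders):

```python
from unsloth import FastLanguageModel

# Drop-in replacement for AutoModelForCausalLM.from_pretrained
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="HuggingFaceH4/zephyr-7b-beta",  # placeholder checkpoint
    max_seq_length=2048,
    load_in_4bit=True,
)
```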
View File
@ -1,16 +1,15 @@
# Examples
## Introduction
The examples should work in any of the following settings (with the same script):
- single GPU
- multi GPUs (using PyTorch distributed mode)
- multi GPUs (using DeepSpeed ZeRO-Offload stages 1, 2, & 3)
- fp16 (mixed-precision), fp32 (normal precision), or bf16 (bfloat16 precision)
To run it in each of these various modes, first initialize the accelerate
configuration with `accelerate config`
- single GPU
- multi GPUs (using PyTorch distributed mode)
- multi GPUs (using DeepSpeed ZeRO-Offload stages 1, 2, & 3)
- fp16 (mixed-precision), fp32 (normal precision), or bf16 (bfloat16 precision)
To run it in each of these various modes, first initialize the accelerate configuration with `accelerate config`.
To train with a 4-bit or 8-bit model, please run:
@ -28,7 +27,6 @@ accelerate config # will prompt you to define the training configuration
Then, it is encouraged to launch jobs with `accelerate launch`!
## Maintained Examples
Scripts can be used as examples of how to use TRL trainers. They are located in the [`trl/scripts`](https://github.com/huggingface/trl/blob/main/trl/scripts) directory. Additionally, we provide examples in the [`examples/scripts`](https://github.com/huggingface/trl/blob/main/examples/scripts) directory. These examples are maintained and tested regularly.
@ -42,9 +40,9 @@ Scripts can be used as examples of how to use TRL trainers. They are located in
| [`examples/scripts/evals/judge_tldr.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/evals/judge_tldr.py) | This script shows how to use [`HfPairwiseJudge`] or [`OpenAIPairwiseJudge`] to judge model generations. |
| [`examples/scripts/gkd.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/gkd.py) | This script shows how to use the [`GKDTrainer`] to fine-tune a model. |
| [`trl/scripts/grpo.py`](https://github.com/huggingface/trl/blob/main/trl/scripts/grpo.py) | This script shows how to use the [`GRPOTrainer`] to fine-tune a model. |
| [`examples/scripts/grpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/grpo_vlm.py) | This script shows how to use the [`GRPOTrainer`] to fine-tune a multimodal model for reasoning using the [lmms-lab/multimodal-open-r1-8k-verified](https://huggingface.co/datasets/lmms-lab/multimodal-open-r1-8k-verified) dataset. |
| [`examples/scripts/gspo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/gspo.py) | This script shows how to use GSPO via the [`GRPOTrainer`] to fine-tune model for reasoning using the [AI-MO/NuminaMath-TIR](https://huggingface.co/datasets/AI-MO/NuminaMath-TIR) dataset. |
| [`examples/scripts/gspo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/gspo_vlm.py) | This script shows how to use GSPO via the [`GRPOTrainer`] to fine-tune a multimodal model for reasoning using the [lmms-lab/multimodal-open-r1-8k-verified](https://huggingface.co/datasets/lmms-lab/multimodal-open-r1-8k-verified) dataset. |
| [`examples/scripts/grpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/grpo_vlm.py) | This script shows how to use the [`GRPOTrainer`] to fine-tune a multimodal model for reasoning using the [lmms-lab/multimodal-open-r1-8k-verified](https://huggingface.co/datasets/lmms-lab/multimodal-open-r1-8k-verified) dataset. |
| [`examples/scripts/gspo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/gspo.py) | This script shows how to use GSPO via the [`GRPOTrainer`] to fine-tune model for reasoning using the [AI-MO/NuminaMath-TIR](https://huggingface.co/datasets/AI-MO/NuminaMath-TIR) dataset. |
| [`examples/scripts/gspo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/gspo_vlm.py) | This script shows how to use GSPO via the [`GRPOTrainer`] to fine-tune a multimodal model for reasoning using the [lmms-lab/multimodal-open-r1-8k-verified](https://huggingface.co/datasets/lmms-lab/multimodal-open-r1-8k-verified) dataset. |
| [`examples/scripts/kto.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/kto.py) | This script shows how to use the [`KTOTrainer`] to fine-tune a model. |
| [`examples/scripts/mpo_vlm.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/mpo_vlm.py) | This script shows how to use MPO via the [`DPOTrainer`] to align a model based on preferences using the [HuggingFaceH4/rlaif-v_formatted](https://huggingface.co/datasets/HuggingFaceH4/rlaif-v_formatted) dataset and a set of losses with corresponding weights. |
| [`examples/scripts/nash_md.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/nash_md.py) | This script shows how to use the [`NashMDTrainer`] to fine-tune a model. |
@ -72,10 +70,7 @@ Here are also some easier-to-run colab notebooks that you can use to get started
| [`examples/notebooks/gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb) | This notebook demonstrates how to reproduce the GPT2 imdb sentiment tuning example on a jupyter notebook. |
| [`examples/notebooks/gpt2-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-control.ipynb) | This notebook demonstrates how to reproduce the GPT2 sentiment control example on a jupyter notebook. |
We also have some other examples that are less maintained but can be used as a reference:
1. **[research_projects](https://github.com/huggingface/trl/tree/main/examples/research_projects)**: Check out this folder to find the scripts used for some research projects that used TRL (LM de-toxification, Stack-Llama, etc.)
We also have some other examples that are less maintained but can be used as a reference in [research_projects](https://github.com/huggingface/trl/tree/main/examples/research_projects). Check out this folder to find the scripts used for some research projects that used TRL (LM de-toxification, Stack-Llama, etc.)
## Distributed training
View File
@ -1,17 +1,17 @@
# Generalized Knowledge Distillation Trainer
[![](https://img.shields.io/badge/All_models-GKD-blue)](https://huggingface.co/models?other=gkd,trl)
[![model badge](https://img.shields.io/badge/All_models-GKD-blue)](https://huggingface.co/models?other=gkd,trl)
## Overview
Generalized Knowledge Distillation (GKD) was proposed in [On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes](https://huggingface.co/papers/2306.13649) by Rishabh Agarwal, Nino Vieillard, Yongchao Zhou, Piotr Stanczyk, Sabela Ramos, Matthieu Geist, and Olivier Bachem.
Generalized Knowledge Distillation (GKD) was proposed in [On-Policy Distillation of Language Models: Learning from Self-Generated Mistakes](https://huggingface.co/papers/2306.13649) by Rishabh Agarwal, Nino Vieillard, Yongchao Zhou, Piotr Stanczyk, Sabela Ramos, Matthieu Geist, and Olivier Bachem.
The abstract from the paper is the following:
> Knowledge distillation (KD) is widely used for compressing a teacher model to reduce its inference cost and memory footprint, by training a smaller student model. However, current KD methods for auto-regressive sequence models suffer from distribution mismatch between output sequences seen during training and those generated by the student during inference. To address this issue, we introduce Generalized Knowledge Distillation (GKD). Instead of solely relying on a fixed set of output sequences, GKD trains the student on its self-generated output sequences by leveraging feedback from the teacher on such sequences. Unlike supervised KD approaches, GKD also offers the flexibility to employ alternative loss functions between the student and teacher, which can be useful when the student lacks the expressivity to mimic the teacher's distribution. Furthermore, GKD facilitates the seamless integration of distillation with RL fine-tuning (RLHF). We demonstrate the efficacy of GKD for distilling auto-regressive language models on summarization, translation, and arithmetic reasoning tasks, and task-agnostic distillation for instruction-tuning.
The key aspects of GKD are:
1. It addresses the train-inference distribution mismatch in auto-regressive sequence models by training the student model on its self-generated output sequences.
2. GKD allows flexibility in choosing different divergence measures between student and teacher models via the generalized Jensen-Shannon Divergence (JSD), which can be useful when the student lacks the capacity to fully mimic the teacher.
@ -20,6 +20,7 @@ This post-training method was contributed by [Kashif Rasul](https://huggingface.
## Usage tips
The [`GKDTrainer`] is a wrapper around the [`SFTTrainer`] class that takes in a teacher model argument. It needs three parameters to be set via the [`GKDConfig`] namely:
* `lmbda`: controls the student data fraction, i.e., the proportion of on-policy student-generated outputs. When `lmbda=0.0`, the loss reduces to supervised JSD where the student is trained with the token-level probabilities of the teacher. When `lmbda=1.0`, the loss reduces to on-policy JSD, where the student generates output sequences and token-specific feedback on these sequences from the teacher. For values in between [0, 1] it is random between the two based on the `lmbda` value for each batch.
* `seq_kd`: controls whether to perform Sequence-Level KD (can be viewed as supervised FT on teacher-generated outputs). When `seq_kd=True` and `lmbda=0.0`, the loss reduces to supervised JSD, where the teacher generates output sequences and the student receives token-specific feedback on these sequences from the teacher.
* `beta`: controls the interpolation in the generalized Jensen-Shannon Divergence. When `beta=0.0` the loss approximates forward KL divergence, while for `beta=1.0` the loss approximates reverse KL divergence. For values in between [0, 1] it interpolates between the two.
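A minimal configuration sketch tying these three parameters together (the values are illustrative, not recommendations from this page):

```python
from trl import GKDConfig

training_args = GKDConfig(
    output_dir="gkd-model",
    lmbda=0.5,   # ~half of the batches use on-policy student generations
    beta=0.5,    # symmetric generalized JSD
    seq_kd=False,
)
# `training_args` is then passed to `GKDTrainer(..., args=training_args)`.
```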
@ -85,6 +86,7 @@ trainer.train()
### Expected dataset type
The dataset should be formatted as a list of "messages" where each message is a list of dictionaries with the following keys:
* `role`: either `system`, `assistant` or `user`
* `content`: the message content
View File
@ -1,6 +1,6 @@
# GRPO Trainer
[![](https://img.shields.io/badge/All_models-GRPO-blue)](https://huggingface.co/models?other=grpo,trl)
[![model badge](https://img.shields.io/badge/All_models-GRPO-blue)](https://huggingface.co/models?other=grpo,trl)
## Overview
@ -56,13 +56,13 @@ accelerate launch train_grpo.py
Distributed across 8 GPUs, the training takes approximately 1 day.
![](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/grpo_curves.png)
![GRPO curves](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/grpo_curves.png)
## Looking deeper into the GRPO method
GRPO is an online learning algorithm, meaning it improves iteratively by using the data generated by the trained model itself during training. The intuition behind GRPO objective is to maximize the advantage of the generated completions, while ensuring that the model remains close to the reference policy. To understand how GRPO works, it can be broken down into four main steps: **Generating completions**, **computing the advantage**, **estimating the KL divergence**, and **computing the loss**.
![](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/grpo_visual.png)
![GRPO visual](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/grpo_visual.png)
### Generating completions
@ -80,7 +80,6 @@ This approach gives the method its name: **Group Relative Policy Optimization (G
> It was shown in the paper [Understanding R1-Zero-Like Training: A Critical Perspective](https://huggingface.co/papers/2503.20783) that scaling by \\( \text{std}(\mathbf{r}) \\) may cause a question-level difficulty bias. You can disable this scaling by setting `scale_rewards=False` in [`GRPOConfig`].
> [!TIP]
>
> [Part I: Tricks or Traps? A Deep Dive into RL for LLM Reasoning (Lite PPO)](https://huggingface.co/papers/2508.08221) showed that calculating the mean at the local (group) level and the standard deviation at the global (batch) level enables more robust reward shaping. You can use this scaling strategy by setting `scale_rewards="batch"` in [`GRPOConfig`].
### Estimating the KL divergence
@ -167,10 +166,10 @@ While training and evaluating, we record the following reward metrics:
- `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, the tokens of masked truncated completions are excluded.)
- `kl`: The average KL divergence between the model and the reference model, calculated over generated completions. Logged only if `beta` is nonzero.
- `clip_ratio/region_mean`: The ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities where the GRPO objective is clipped to stay within the trust region:
$$
\text{clip}\left( r_{i,t}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \qquad r_{i,t}(\theta) = \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})}\,.
$$
A higher value means more tokens are clipped, which constrains how much the policy $\pi_\theta$ can change.
$$
\text{clip}\left( r_{i,t}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \qquad r_{i,t}(\theta) = \frac{\pi_\theta(o_{i,t} \mid q, o_{i,< t})}{\pi_{\theta_{\text{old}}}(o_{i,t} \mid q, o_{i,< t})}\,.
$$
A higher value means more tokens are clipped, which constrains how much the policy $\pi_\theta$ can change.
- `clip_ratio/low_mean`: The average ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the lower bound of the trust region: \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\)
- `clip_ratio/low_min`: The minimum ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the lower bound of the trust region: \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\)
- `clip_ratio/high_mean`: The average ratio of token (or sequence, if `importance_sampling_level="sequence"`) probabilities that were clipped on the upper bound of the trust region: \\(r_{i,t}(\theta) > 1 + \epsilon_\mathrm{high}\\)
@ -181,6 +180,7 @@ A higher value means more tokens are clipped, which constrains how much the poli
### Speed up training with vLLM-powered generation
Generation is often the main bottleneck when training with online methods. To accelerate generation, you can use [vLLM](https://github.com/vllm-project/vllm), a high-throughput, low-latency inference engine for LLMs. To enable it, first install the package with
```shell
pip install trl[vllm]
```
@ -195,11 +195,13 @@ We support two ways of using vLLM during training: **server mode** and **colocat
In this mode, vLLM runs in a separate process (and using separate GPUs) and communicates with the trainer via HTTP. This is ideal if you have dedicated GPUs for inference.
1. **Start the vLLM server**:
```bash
trl vllm-serve --model <model_name>
```
2. **Enable server mode in your training script**:
```python
from trl import GRPOConfig
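# Hedged continuation of the snippet above: `use_vllm` and `vllm_mode` are the
# GRPOConfig fields the server/colocate discussion refers to; treat the exact
# names as an assumption if your TRL version differs.
training_args = GRPOConfig(
    output_dir="grpo-model",
    use_vllm=True,
    vllm_mode="server",  # send generation requests to the external `trl vllm-serve` process
)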
@ -232,12 +234,7 @@ training_args = GRPOConfig(
>
> We provide a [HF Space](https://huggingface.co/spaces/trl-lib/recommend-vllm-memory) to help estimate the recommended GPU memory utilization based on your model configuration and experiment settings. Simply use it as follows to get `vllm_gpu_memory_utilization` recommendation:
>
> <iframe
> src="https://trl-lib-recommend-vllm-memory.hf.space"
> frameborder="0"
> width="850"
> height="450"
> ></iframe>
> <iframe src="https://trl-lib-recommend-vllm-memory.hf.space" frameborder="0" width="850" height="450"></iframe>
>
> If the recommended value does not work in your environment, we suggest adding a small buffer (e.g., +0.05 or +0.1) to the recommended value to ensure stability.
>
@ -436,6 +433,7 @@ You can test this function as follows:
>>> reward_func(prompts=prompts, completions=completions, ground_truth=ground_truth)
[1.0, 0.0]
```
#### Example 4: Multi-task reward functions
Below is an example of using multiple reward functions in the [`GRPOTrainer`]. In this example, we define two task-specific reward functions: `math_reward_func` and `coding_reward_func`. The `math_reward_func` rewards math problems based on their correctness, while the `coding_reward_func` rewards coding problems based on whether the solution works.
@ -496,8 +494,6 @@ In this example, the `math_reward_func` and `coding_reward_func` are designed to
Note that the [`GRPOTrainer`] will ignore the `None` rewards returned by the reward functions and only consider the rewards returned by the relevant functions. This ensures that the model is trained on the relevant tasks and ignores the tasks for which there is no relevant reward function.
#### Passing the reward function to the trainer
To use your custom reward function, pass it to the [`GRPOTrainer`] as follows:
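For instance, a minimal sketch (the toy reward, dataset, and model names are placeholders; `reward_funcs` also accepts a list when combining several reward functions):

```python
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer

def reward_func(completions, **kwargs):
    # Toy reward: longer completions score higher.
    return [float(len(completion)) for completion in completions]

dataset = load_dataset("trl-lib/tldr", split="train")
trainer = GRPOTrainer(
    model="Qwen/Qwen2-0.5B-Instruct",
    reward_funcs=reward_func,
    args=GRPOConfig(output_dir="grpo-model"),
    train_dataset=dataset,
)
trainer.train()
```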
View File
@ -9,11 +9,13 @@ The library is integrated with 🤗 [transformers](https://github.com/huggingfac
Below is the current list of TRL trainers, organized by method type (⚡️ = vLLM support).
<div style="display: flex; justify-content: space-between; width: 100%; gap: 2rem;">
## Taxonomy
<div style="display: flex; justify-content: space-between; width: 100%; gap: 2rem;">
<div style="flex: 1; min-width: 0;">
**Online methods**
### Online methods
- [`GRPOTrainer`] ⚡️
- [`RLOOTrainer`] ⚡️
- [`OnlineDPOTrainer`] ⚡️
@ -21,15 +23,16 @@ Below is the current list of TRL trainers, organized by method type (⚡️ = vL
- [`XPOTrainer`] ⚡️
- [`PPOTrainer`]
**Reward modeling**
### Reward modeling
- [`PRMTrainer`]
- [`RewardTrainer`]
</div>
<div style="flex: 1; min-width: 0;">
**Offline methods**
### Offline methods
- [`SFTTrainer`]
- [`DPOTrainer`]
- [`ORPOTrainer`]
@ -37,14 +40,13 @@ Below is the current list of TRL trainers, organized by method type (⚡️ = vL
- [`CPOTrainer`]
- [`KTOTrainer`]
**Knowledge distillation**
### Knowledge distillation
- [`GKDTrainer`]
</div>
</div>
## 🎉 What's New
**✨ OpenAI GPT OSS Support**: TRL now fully supports fine-tuning the latest [OpenAI GPT OSS models](https://huggingface.co/collections/openai/gpt-oss-68911959590a1634ba11c7a4)! Check out the:
View File
@ -1,13 +1,15 @@
# Installation
You can install TRL either from PyPI or from source:
## PyPI
Install the library with pip or [uv](https://docs.astral.sh/uv/):
<hfoptions id="install">
<hfoption id="uv">
uv is a fast Rust-based Python package and project manager. Refer to [Installation](https://docs.astral.sh/uv/getting-started/installation/) for installation instructions).
uv is a fast Rust-based Python package and project manager. Refer to [Installation](https://docs.astral.sh/uv/getting-started/installation/) for installation instructions.
```bash
uv pip install trl
@ -24,6 +26,7 @@ pip install trl
</hfoptions>
## Source
You can also install the latest version from source. First clone the repo and then run the installation with `pip`:
```bash
View File
@ -1,6 +1,6 @@
# Training with Jobs
[![](https://img.shields.io/badge/All_models-HF_Jobs-blue)](https://huggingface.co/models?other=hf_jobs,trl)
[![model badge](https://img.shields.io/badge/All_models-HF_Jobs-blue)](https://huggingface.co/models?other=hf_jobs,trl)
[Hugging Face Jobs](https://huggingface.co/docs/huggingface_hub/guides/jobs) lets you run training scripts on fully managed infrastructure—no need to manage GPUs or local environment setup.
View File
@ -46,7 +46,6 @@ trl sft ... --attn_implementation kernels-community/flash-attn
> [!TIP]
> Now you can leverage faster attention backends with a pre-optimized kernel for your hardware configuration from the Hub, speeding up both development and training.
## Comparing Attention Implementations
We evaluated various attention implementations available in transformers, along with different kernel backends, using **TRL** and **SFT**.
@ -54,15 +53,14 @@ The experiments were run on a single **H100 GPU** with **CUDA 12.9**, leveraging
Keep in mind that the results shown here are specific to this setup and may vary with different training configurations.
The following figure illustrates both **latency** (time per training step) and **peak allocated memory** for the different attention implementations and kernel backends.
Kernel-based implementations perform on par with custom-installed attention, and increasing the models `max_length` further enhances performance. Memory consumption is similar across all implementations, showing no significant differences. We get the same performance but with less friction, as described in [the following section](#benchmarking-flash-attention-build-from-source-vs-hub-kernels).
Kernel-based implementations perform on par with custom-installed attention, and increasing the models `max_length` further enhances performance. Memory consumption is similar across all implementations, showing no significant differences. We get the same performance but with less friction, as described in [the following section](#flash-attention-vs-hub-kernels).
<div class="flex justify-center">
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/kernels_guide_latency.png" alt="Latency and Memory Usage" width="45%"/>
<img src="https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/kernels_guide_peak_allocated_memory.png" alt="Latency and Memory Usage" width="45%"/>
</div>
## Flash Attention (Build-from-Source) vs. Hub Kernels
## Flash Attention vs. Hub Kernels
Building Flash Attention from source can be time-consuming, often taking anywhere from several minutes to hours, depending on your hardware, CUDA/PyTorch configuration, and whether precompiled wheels are available.
@ -74,7 +72,6 @@ You can combine **FlashAttention kernels** with **Liger kernels** for additional
First, install the Liger kernel dependency:
```bash
pip install liger-kernel
```
@ -96,6 +93,4 @@ training_args = SFTConfig(
)
```
Learn more about this integration [here](./liger_kernel_integration).
Learn more about the [Liger Kernel Integration](./liger_kernel_integration).
View File
@ -1,12 +1,11 @@
# KTO Trainer
[![](https://img.shields.io/badge/All_models-KTO-blue)](https://huggingface.co/models?other=kto,trl)
[![model badge](https://img.shields.io/badge/All_models-KTO-blue)](https://huggingface.co/models?other=kto,trl)
## Overview
Kahneman-Tversky Optimization (KTO) was introduced in [KTO: Model Alignment as Prospect Theoretic Optimization](https://huggingface.co/papers/2402.01306) by [Kawin Ethayarajh](https://huggingface.co/kawine), [Winnie Xu](https://huggingface.co/xwinxu), [Niklas Muennighoff](https://huggingface.co/Muennighoff), Dan Jurafsky, [Douwe Kiela](https://huggingface.co/douwekiela).
The abstract from the paper is the following:
> Kahneman & Tversky's prospect theory tells us that humans perceive random variables in a biased but well-defined manner; for example, humans are famously loss-averse. We show that objectives for aligning LLMs with human feedback implicitly incorporate many of these biases -- the success of these objectives (e.g., DPO) over cross-entropy minimization can partly be ascribed to them being human-aware loss functions (HALOs). However, the utility functions these methods attribute to humans still differ from those in the prospect theory literature. Using a Kahneman-Tversky model of human utility, we propose a HALO that directly maximizes the utility of generations instead of maximizing the log-likelihood of preferences, as current methods do. We call this approach Kahneman-Tversky Optimization (KTO), and it matches or exceeds the performance of preference-based methods at scales from 1B to 30B. Crucially, KTO does not need preferences -- only a binary signal of whether an output is desirable or undesirable for a given input. This makes it far easier to use in the real world, where preference data is scarce and expensive.
@ -51,7 +50,7 @@ accelerate launch train_kto.py
Distributed across 8 x H100 GPUs, the training takes approximately 30 minutes. You can verify the training progress by checking the reward graph. An increasing trend in the reward margin indicates that the model is improving and generating better responses over time.
![](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/kto-qwen2-reward-margin.png)
![kto qwen2 reward margin](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/kto-qwen2-reward-margin.png)
To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-KTO) performs, you can use the [Transformers Chat CLI](https://huggingface.co/docs/transformers/quicktour#chat-with-text-generation-models).
@ -60,14 +59,14 @@ To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-KTO) pe
What is the best programming language?
<strong><span style="color: blue;">&lt;trl-lib/Qwen2-0.5B-KTO&gt;:</span></strong>
The best programming language can vary depending on individual preferences, industry-specific requirements, technical skills, and familiarity with the specific use case or task. Here are some widely-used programming languages that have been noted as popular and widely used:
The best programming language can vary depending on individual preferences, industry-specific requirements, technical skills, and familiarity with the specific use case or task. Here are some widely-used programming languages that have been noted as popular and widely used:
Here are some other factors to consider when choosing a programming language for a project:
<strong><span style="color: green;">1</span> JavaScript</strong>: JavaScript is at the heart of the web and can be used for building web applications, APIs, and interactive front-end applications like frameworks like React and Angular. It's similar to C, C++, and F# in syntax structure and is accessible and easy to learn, making it a popular choice for beginners and professionals alike.
<strong><span style="color: green;">2</span> Java</strong>: Known for its object-oriented programming (OOP) and support for Java 8 and .NET, Java is used for developing enterprise-level software applications, high-performance games, as well as mobile apps, game development, and desktop applications.
<strong><span style="color: green;">3</span> C++</strong>: Known for its flexibility and scalability, C++ offers comprehensive object-oriented programming and is a popular choice for high-performance computing and other technical fields. It's a powerful platform for building real-world applications and games at scale.
<strong><span style="color: green;">4</span> Python</strong>: Developed by Guido van Rossum in 1991, Python is a high-level, interpreted, and dynamically typed language known for its simplicity, readability, and versatility.
<strong><span style="color: green;">1</span> JavaScript</strong>: JavaScript is at the heart of the web and can be used for building web applications, APIs, and interactive front-end applications like frameworks like React and Angular. It's similar to C, C++, and F# in syntax structure and is accessible and easy to learn, making it a popular choice for beginners and professionals alike.
<strong><span style="color: green;">2</span> Java</strong>: Known for its object-oriented programming (OOP) and support for Java 8 and .NET, Java is used for developing enterprise-level software applications, high-performance games, as well as mobile apps, game development, and desktop applications.
<strong><span style="color: green;">3</span> C++</strong>: Known for its flexibility and scalability, C++ offers comprehensive object-oriented programming and is a popular choice for high-performance computing and other technical fields. It's a powerful platform for building real-world applications and games at scale.
<strong><span style="color: green;">4</span> Python</strong>: Developed by Guido van Rossum in 1991, Python is a high-level, interpreted, and dynamically typed language known for its simplicity, readability, and versatility.
</code></pre>
## Expected dataset format
@ -102,7 +101,6 @@ To ensure that we train MOEs similarly during preference-tuning, it is beneficia
This option is enabled by setting `output_router_logits=True` in the model config (e.g. [`~transformers.MixtralConfig`]).
To scale how much the auxiliary loss contributes to the total loss, use the hyperparameter `router_aux_loss_coef=...` (default: `0.001`) in the model config.
### Batch size recommendations
Use a per-step batch size that is at least 4, and an effective batch size between 16 and 128. Even if your effective batch size is large, if your per-step batch size is poor, then the KL estimate in KTO will be poor.
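One hedged way to satisfy both recommendations with [`KTOConfig`] (the values are illustrative):

```python
from trl import KTOConfig

# A per-step (per-device) batch size of 8 with 2 accumulation steps on 2 GPUs
# gives an effective batch size of 8 * 2 * 2 = 32, inside the 16-128 range.
training_args = KTOConfig(
    output_dir="kto-model",
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
)
```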
View File
@ -7,15 +7,15 @@
With this memory reduction, you can potentially turn off `cpu_offloading` or gradient checkpointing to further boost the performance.
| Speed Up | Memory Reduction |
|--------------------------|-------------------------|
| Speed Up | Memory Reduction |
| --- | --- |
| ![Speed up](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/docs/images/e2e-tps.png) | ![Memory](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/docs/images/e2e-memory.png) |
1. To use Liger-Kernel in [`SFTTrainer`], first install it by:
```bash
pip install liger-kernel
```
```bash
pip install liger-kernel
```
2. Once installed, set `use_liger_kernel` in [`SFTConfig`]. No other changes are needed!
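For example, a minimal sketch of that flag in [`SFTConfig`]:

```python
from trl import SFTConfig

training_args = SFTConfig(
    output_dir="sft-model",
    use_liger_kernel=True,  # patch supported model layers with Liger kernels
)
```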
View File
@ -44,6 +44,7 @@ Here's a brief explanation for the logged metrics provided in the data:
* `episode`: The current episode count in the training process.
### Crucial values
During training, many values are logged, here are the most important ones:
1. `objective/scores`: The mean scores returned by the reward model / environment.
@ -63,7 +64,7 @@ Here's a brief explanation for the logged metrics provided in the data for the G
* `num_tokens`: Total number of input tokens processed during training so far.
#### Completions
### Completions
* `completions/mean_length`: Mean length of all generated completions (including those not ending with an EOS token).
* `completions/min_length`: Minimum length among all generated completions.
@ -73,34 +74,33 @@ Here's a brief explanation for the logged metrics provided in the data for the G
* `completions/min_terminated_length`: Minimum length among completions that ended with an EOS token.
* `completions/max_terminated_length`: Maximum length among completions that ended with an EOS token.
#### Rewards
### Rewards
* `rewards/{reward_func_name}/mean`: The mean reward obtained from a specific, named reward function (e.g., `rewards/my_custom_reward/mean`). This is logged for each reward function used.
* `rewards/{reward_func_name}/std`: The standard deviation of rewards from a specific, named reward function.
* `reward`: The overall mean of the (potentially weighted and, if `args.scale_rewards` is true, normalized) rewards, after group-wise normalization (advantages).
* `reward_std`: The standard deviation of the (potentially weighted) rewards *before* group-wise normalization for advantages.
#### Policy and Loss Metrics
### Policy and Loss Metrics
* `kl`: The mean Kullback-Leibler (KL) divergence between the current policy and the reference policy. This is logged only if `beta` (the KL coefficient in `GRPOConfig`) is non-zero.
* `entropy`: Average entropy of token predictions across generated completions.
* If Liger GRPOLoss is used (`use_liger_loss: True` in `GRPOConfig`):
* `clip_ratio`: The fraction of policy updates where the probability ratio was clipped according to the GRPO loss's epsilon bounds.
* `clip_ratio`: The fraction of policy updates where the probability ratio was clipped according to the GRPO loss's epsilon bounds.
* If standard GRPOLoss is used (`use_liger_loss: False`):
* `clip_ratio/low_mean`: The mean fraction of instances where the probability ratio `r_t(θ)` was clipped at the lower bound `1 - epsilon_low` (occurs when advantage is negative and ratio is below the bound).
* `clip_ratio/low_min`: The minimum observed fraction for `clip_ratio/low_mean` across batches/processes.
* `clip_ratio/high_mean`: The mean fraction of instances where the probability ratio `r_t(θ)` was clipped at the upper bound `1 + epsilon_high` (occurs when advantage is positive and ratio is above the bound).
* `clip_ratio/high_max`: The maximum observed fraction for `clip_ratio/high_mean` across batches/processes.
* `clip_ratio/region_mean`: The mean fraction of instances where the probability ratio was clipped at either the lower or upper bound.
* `clip_ratio/low_mean`: The mean fraction of instances where the probability ratio `r_t(θ)` was clipped at the lower bound `1 - epsilon_low` (occurs when advantage is negative and ratio is below the bound).
* `clip_ratio/low_min`: The minimum observed fraction for `clip_ratio/low_mean` across batches/processes.
* `clip_ratio/high_mean`: The mean fraction of instances where the probability ratio `r_t(θ)` was clipped at the upper bound `1 + epsilon_high` (occurs when advantage is positive and ratio is above the bound).
* `clip_ratio/high_max`: The maximum observed fraction for `clip_ratio/high_mean` across batches/processes.
* `clip_ratio/region_mean`: The mean fraction of instances where the probability ratio was clipped at either the lower or upper bound.
### Crucial GRPO values
During GRPO training, monitor these values for insights into performance and stability:
1. `reward`: This is the primary objective. It reflects the (group-wise normalized) rewards the policy is achieving. It should generally increase during successful training.
1. `kl`: If `beta > 0`, this tracks the divergence from the reference model. Keep an eye on it to ensure the policy doesn't stray too far, which can lead to instability.
1. `clip_ratio/*` (either `clip_ratio` for Liger loss or the more detailed `clip_ratio/...` metrics for standard loss): These indicate how often the policy updates are being constrained by the GRPO clipping mechanism. Very high values might suggest that the policy is trying to change too drastically (potentially due to large advantages or a learning rate that's too high) or that the epsilon clipping range is too restrictive.
1. `completions/clipped_ratio`: A high ratio here indicates that the model is frequently generating completions that are cut off by `max_completion_length` rather than naturally ending with an EOS token. This might suggest issues with learning sequence termination or that `max_completion_length` is too short.
1. `rewards/{reward_func_name}/mean`: Monitoring the mean of individual reward functions can help diagnose which aspects of the desired behavior the model is learning or struggling with, especially when using multiple reward sources.
1. `entropy`: Measures how uncertain the policy is in its action choices, higher entropy suggests more exploration. A collapse in entropy means the policy is becoming overconfident and deterministic, often too early. This can stall learning by reducing exploration and making updates overly biased. Stable but non-zero entropy is usually a sign that the policy retains flexibility and continues to explore.
* `reward`: This is the primary objective. It reflects the (group-wise normalized) rewards the policy is achieving. It should generally increase during successful training.
* `kl`: If `beta > 0`, this tracks the divergence from the reference model. Keep an eye on it to ensure the policy doesn't stray too far, which can lead to instability.
* `clip_ratio/*` (either `clip_ratio` for Liger loss or the more detailed `clip_ratio/...` metrics for standard loss): These indicate how often the policy updates are being constrained by the GRPO clipping mechanism. Very high values might suggest that the policy is trying to change too drastically (potentially due to large advantages or a learning rate that's too high) or that the epsilon clipping range is too restrictive.
* `completions/clipped_ratio`: A high ratio here indicates that the model is frequently generating completions that are cut off by `max_completion_length` rather than naturally ending with an EOS token. This might suggest issues with learning sequence termination or that `max_completion_length` is too short.
* `rewards/{reward_func_name}/mean`: Monitoring the mean of individual reward functions can help diagnose which aspects of the desired behavior the model is learning or struggling with, especially when using multiple reward sources.
* `entropy`: Measures how uncertain the policy is in its action choices, higher entropy suggests more exploration. A collapse in entropy means the policy is becoming overconfident and deterministic, often too early. This can stall learning by reducing exploration and making updates overly biased. Stable but non-zero entropy is usually a sign that the policy retains flexibility and continues to explore.
View File
@ -22,14 +22,13 @@ Let's implement and train LoRA adapters in TRL scripts based on the core finding
The blog post performs SFT on a range of models and datasets from the Hub, which we can reproduce in TRL.
| Model | Dataset |
|-------|---------|
| --- | --- |
| [Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B) | [allenai/tulu-3-sft-mixture](https://huggingface.co/datasets/allenai/tulu-3-sft-mixture) |
| [Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B) | [open-thoughts/OpenThoughts-114k](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k) |
| [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B) | [allenai/tulu-3-sft-mixture](https://huggingface.co/datasets/allenai/tulu-3-sft-mixture) |
| [Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B) | [open-thoughts/OpenThoughts-114k](https://huggingface.co/datasets/open-thoughts/OpenThoughts-114k) |
<hfoptions id="sft">
<hfoption id="python">
We can integrate these findings with the TRL Python API like so:
@ -64,7 +63,6 @@ trainer.train()
```
</hfoption>
<hfoption id="jobs">
```bash
@ -127,10 +125,10 @@ Once training starts, you can monitor the progress in [Trackio](https://huggingf
### Reinforcement Learning (GRPO)
The blog post performs GRPO on a range of models and datasets from the Hub, and once again we can reproduce the results in TRL.
The blog post performs GRPO on a range of models and datasets from the Hub, and once again we can reproduce the results in TRL.
| Model | Dataset |
|-------|---------|
| --- | --- |
| [Llama-3.1-8B-Base](https://huggingface.co/meta-llama/Llama-3.2-1B) | [GSM8k](https://huggingface.co/datasets/openai/gsm8k) |
| [Llama-3.1-8B-Base](https://huggingface.co/meta-llama/Llama-3.2-1B) | [DeepMath-103K](https://huggingface.co/datasets/zwhe99/DeepMath-103K) |
| [Qwen3-8b-base](https://huggingface.co/Qwen/Qwen3-8b-base) | [DeepMath-103K](https://huggingface.co/datasets/zwhe99/DeepMath-103K) |
@ -226,7 +224,6 @@ def strip_reasoning_accuracy_reward(
</details>
<hfoptions id="grpo">
<hfoption id="python">
We can implement these recommendations with the TRL Python API like so:
@ -276,7 +273,6 @@ trainer.train()
> This snippet skips the reward function which is defined above to keep the example concise.
</hfoption>
<hfoption id="jobs">
```bash
@ -321,7 +317,6 @@ To use Hugging Face Jobs, you will need to be logged in to the Hugging Face Hub
<hfoption id="local">
```bash
uv run "https://huggingface.co/datasets/burtenshaw/lora-without-regrets/resolve/main/grpo.py" \
--model_name_or_path Qwen/Qwen3-0.6B \
--dataset_name HuggingFaceH4/OpenR1-Math-220k-default-verified \
@ -372,23 +367,23 @@ And most importantly, the LoRA model uses significantly less memory than the ful
Here are the parameters we used to train the above models
| Parameter | LoRA | Full FT |
|----------------------------------|----------------------------------------------------|-------------------------------|
| `--model_name_or_path` | HuggingFaceTB/SmolLM3-3B | HuggingFaceTB/SmolLM3-3B |
| `--dataset_name` | HuggingFaceH4/OpenR1-Math-220k-default-verified | HuggingFaceH4/OpenR1-Math-220k-default-verified |
| `--learning_rate` | 1.0e-5 | 1.0e-6 |
| `--max_prompt_length` | 1024 | 1024 |
| `--max_completion_length` | 4096 | 4096 |
| `--lora_r` | 1 | - |
| `--lora_alpha` | 32 | - |
| `--lora_dropout` | 0.0 | - |
| `--lora_target_modules` | all-linear | - |
| Parameter | LoRA | Full FT |
| --- | --- | --- |
| `--model_name_or_path` | HuggingFaceTB/SmolLM3-3B | HuggingFaceTB/SmolLM3-3B |
| `--dataset_name` | HuggingFaceH4/OpenR1-Math-220k-default-verified | HuggingFaceH4/OpenR1-Math-220k-default-verified |
| `--learning_rate` | 1.0e-5 | 1.0e-6 |
| `--max_prompt_length` | 1024 | 1024 |
| `--max_completion_length` | 4096 | 4096 |
| `--lora_r` | 1 | - |
| `--lora_alpha` | 32 | - |
| `--lora_dropout` | 0.0 | - |
| `--lora_target_modules` | all-linear | - |
Let's break down the key findings of the blog post and how we were able to reproduce them.
### 1. *LoRA performs better when applied to all weight matrices*
The authors recommend applying LoRA to all weight matrices rather than limiting it to attention layers, as increasing the rank does not compensate for this restriction.
The authors recommend applying LoRA to all weight matrices rather than limiting it to attention layers, as increasing the rank does not compensate for this restriction.
![all layers](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lora_without_regret/1.png)
@ -402,7 +397,7 @@ peft_config = LoraConfig(target_modules="all-linear")
### 2. *The adapter needs sufficient capacity to learn from the dataset*
The blog post recommends using a sufficient LoRA rank to learn from the dataset. The rank determines the number of trainable parameters in the LoRA adapter. Therefore, "For datasets that exceed LoRA capacity, LoRA underperforms FullFT".
The blog post recommends using a sufficient LoRA rank to learn from the dataset. The rank determines the number of trainable parameters in the LoRA adapter. Therefore, "For datasets that exceed LoRA capacity, LoRA underperforms FullFT".
![learning rate](https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/lora_without_regret/3.png)
@ -413,7 +408,7 @@ Reinforcement learning tasks typically require lower capacity, so smaller LoRA r
The blog post defines the ideal dataset size for LoRA to match full fine-tuning as "Post-training scale". Which we can use to determine the recommended rank for SFT and RL LoRAs as:
| Task Type | Dataset Size | Recommended Rank |
|-----------|-------------|------------------|
| --- | --- | --- |
| **SFT** | Post-training scale | 256 |
| **RL** | Any size | 1-32 |
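In `peft` terms, these recommendations translate roughly into the following sketch (rank values follow the table above; alpha, dropout, and target modules follow the parameter table earlier on this page):

```python
from peft import LoraConfig

# SFT at post-training scale: high rank, applied to all linear layers.
sft_peft_config = LoraConfig(r=256, lora_alpha=32, lora_dropout=0.0, target_modules="all-linear")

# RL (e.g. GRPO): a low rank is typically sufficient.
rl_peft_config = LoraConfig(r=1, lora_alpha=32, lora_dropout=0.0, target_modules="all-linear")
```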
View File
@ -8,7 +8,6 @@ With the `AutoModelForCausalLMWithValueHead` class TRL supports all decoder mode
## AutoModelForCausalLMWithValueHead
[[autodoc]] AutoModelForCausalLMWithValueHead
- __init__
- forward
@ -25,4 +24,4 @@ With the `AutoModelForCausalLMWithValueHead` class TRL supports all decoder mode
## create_reference_model
[[autodoc]] create_reference_model

View File

@ -14,11 +14,11 @@ You need to address this approach in three stages that we summarize as follows:
2- Train a reward model using `peft`. This is required in order to re-use the adapter during the RL optimisation process (step 3 below). We show an example of leveraging the `RewardTrainer` from TRL in [this example](https://github.com/huggingface/trl/tree/main/examples/scripts/reward_modeling.py)
3- Fine tune new adapters on the base model using PPO and the reward adapter. ("0 abstraction RL")
Make sure to use the same model (i.e. same architecture and same weights) for the stages 2 & 3.
## Quickstart
Let us assume you have trained your reward adapter on `llama-7b` model using `RewardTrainer` and pushed the weights on the hub under `trl-lib/llama-7b-hh-rm-adapter`.
When doing PPO, before passing the model to `PPOTrainer` create your model as follows:
```python
@ -48,6 +48,7 @@ trainer = PPOTrainer(
...
```
Then inside your PPO training loop, call the `compute_reward_score` method by accessing the `model` attribute from `PPOTrainer`.
```python
@ -56,9 +57,9 @@ rewards = trainer.model.compute_reward_score(**inputs)
## Advanced usage
### Control on the adapter name
If you are familiar with the `peft` library, you know that you can use multiple adapters inside the same model. What you can do is train multiple adapters on the same base model to fine-tune on different policies.
In this case, you want to be able to control which adapter to activate again after retrieving the reward. For that, simply pass the appropriate adapter name to the `ppo_adapter_name` argument when calling `compute_reward_score`.
```python
@ -71,6 +72,7 @@ rewards = trainer.model.compute_reward_score(**inputs, ppo_adapter_name=adapter_
For more memory-efficient fine-tuning, you can load your base model in 8-bit or 4-bit precision while keeping the adapters in the default precision (float32).
Just pass the appropriate arguments (i.e. `load_in_8bit=True` or `load_in_4bit=True`) to `AutoModelForCausalLMWithValueHead.from_pretrained` as follows (assuming you have installed `bitsandbytes`):
```python
model_name = "llama-7b"
rm_adapter_id = "trl-lib/llama-7b-hh-rm-adapter"

View File

@ -1,16 +1,16 @@
# Nash-MD Trainer
[![model badge](https://img.shields.io/badge/All_models-Nash--MD-blue)](https://huggingface.co/models?other=nash-md,trl)
## Overview
Nash-MD was proposed in the paper [Nash Learning from Human Feedback](https://huggingface.co/papers/2312.00886) by Rémi Munos, [Michal Valko](https://huggingface.co/misovalko), Daniele Calandriello, Mohammad Gheshlaghi Azar, Mark Rowland, Daniel Guo, Yunhao Tang, Matthieu Geist, Thomas Mésnard, and Andrea Michi.
The abstract from the paper is the following:
> Reinforcement learning from human feedback (RLHF) has emerged as the main paradigm for aligning large language models (LLMs) with human preferences. Typically, RLHF involves the initial step of learning a reward model from human feedback, often expressed as preferences between pairs of text generations produced by a pre-trained LLM. Subsequently, the LLM's policy is fine-tuned by optimizing it to maximize the reward model through a reinforcement learning algorithm. However, an inherent limitation of current reward models is their inability to fully represent the richness of human preferences and their dependency on the sampling distribution. In this study, we introduce an alternative pipeline for the fine-tuning of LLMs using pairwise human feedback. Our approach entails the initial learning of a preference model, which is conditioned on two inputs given a prompt, followed by the pursuit of a policy that consistently generates responses preferred over those generated by any competing policy, thus defining the Nash equilibrium of this preference model. We term this approach Nash learning from human feedback (NLHF). In the context of a tabular policy representation, we present a novel algorithmic solution, Nash-MD, founded on the principles of mirror descent. This algorithm produces a sequence of policies, with the last iteration converging to the regularized Nash equilibrium. Additionally, we explore parametric representations of policies and introduce gradient descent algorithms for deep-learning architectures. To demonstrate the effectiveness of our approach, we present experimental results involving the fine-tuning of a LLM for a text summarization task. We believe NLHF offers a compelling avenue for preference learning and policy optimization with the potential of advancing the field of aligning LLMs with human preferences.
This post-training method was contributed by [Kashif Rasul](https://huggingface.co/kashif) and [Daniil Tiapkin](https://huggingface.co/dtiapkin), [Pierre Ménard](https://huggingface.co/menardprr), Daniele Calandriello and [Quentin Gallouédec](https://huggingface.co/qgallouedec).
## Quick start

View File

@ -1,10 +1,10 @@
# Online DPO Trainer
[![model badge](https://img.shields.io/badge/All_models-Online_DPO-blue)](https://huggingface.co/models?other=online-dpo,trl)
## Overview
Online DPO was proposed in [Direct Language Model Alignment from Online AI Feedback](https://huggingface.co/papers/2402.04792) by Shangmin Guo, Biao Zhang, Tianlin Liu, Tianqi Liu, Misha Khalman, Felipe Llinares, Alexandre Rame, Thomas Mesnard, Yao Zhao, Bilal Piot, Johan Ferret, and Mathieu Blondel.
The abstract from the paper is the following:
@ -112,7 +112,6 @@ This callback logs the model's generated completions directly to Weights & Biase
![Logged Completions](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/wandb_completions.png)
## Example script
We provide an example script to train a model using the online DPO method. The script is available in [`examples/scripts/dpo_online.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/dpo_online.py)
@ -153,8 +152,7 @@ While training and evaluating, we record the following reward metrics. Here is a
To validate the online DPO implementation works, we ran experiments with the Pythia 1B, 2.8B, and 6.9B models on a single node of 8 x H100s. Here are the commands we used to run the experiments. We take the SFT / RM models directly from [The N+ Implementation Details of RLHF with PPO: A Case Study on TL;DR Summarization](https://huggingface.co/papers/2403.17031).
```shell
# 1B Online DPO experiment
accelerate launch --config_file examples/accelerate_configs/multi_gpu.yaml \
examples/scripts/dpo_online.py \
@ -213,9 +211,8 @@ accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml
Checkpoints and experiment tracking are available at:
* [🤗 Model checkpoints](https://huggingface.co/collections/trl-lib/online-dpo-66acd3fa38a331a9cd457b07)
* [🐝 Tracked experiment](https://wandb.ai/huggingface/trl/reports/Online-DPO-experiments-for-TL-DR-summarisation--Vmlldzo5MTczMDU0)
To evaluate, we use [vLLM](https://github.com/vllm-project/vllm) to load the checkpoints and GPT-4o mini as a judge model to evaluate the generated TL;DR against the reference TL;DR.
For more information on how to use judges, see [Judges](judges).

View File

@ -1,6 +1,6 @@
# ORPO Trainer
[![model badge](https://img.shields.io/badge/All_models-ORPO-blue)](https://huggingface.co/models?other=orpo,trl) [![model badge](https://img.shields.io/badge/smol_course-Chapter_2-yellow)](https://github.com/huggingface/smol-course/tree/main/2_preference_alignment)
## Overview
@ -54,7 +54,7 @@ accelerate launch train_orpo.py
Distributed across 8 GPUs, the training takes approximately 30 minutes. You can verify the training progress by checking the reward graph. An increasing trend in the reward margin indicates that the model is improving and generating better responses over time.
![orpo qwen2 reward margin](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/orpo-qwen2-reward-margin.png)
To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-ORPO) performs, you can use the [Transformers Chat CLI](https://huggingface.co/docs/transformers/quicktour#chat-with-text-generation-models).
@ -64,11 +64,11 @@ What is the best programming language?
<strong><span style="color: blue;">&lt;trl-lib/Qwen2-0.5B-ORPO&gt;:</span></strong>
It's challenging to determine the best programming language as no one language is perfect, as the complexity of a task and the type of project are significant factors. Some popular languages include Java, Python, JavaScript, and
C++. If you have specific needs or requirements for a specific project, it's important to choose the language that best suits those needs.
Here are some other factors to consider when choosing a programming language for a project:
<strong><span style="color: green;">• Language proficiency:</span></strong> A good programming language is more likely to be easy to understand and use, and will allow developers to collaborate on projects more efficiently.
<strong><span style="color: green;">• Ease of use:</span></strong> There are tools and libraries available to make programming more accessible, so developers should choose a language that can help them get started easier.
<strong><span style="color: green;">• Code readability:</span></strong> A clear and concise codebase should be easy to read and understand, especially when working with large projects.
<strong><span style="color: green;">• Tool and framework support:</span></strong> There are numerous libraries available for Python, Java, and JavaScript, along with tools like IDEs and static code analysis tools.
@ -118,7 +118,7 @@ While training and evaluating, we record the following reward metrics:
- `log_odds_chosen`: the mean log odds ratio of the chosen responses over the rejected responses
- `log_odds_ratio`: the mean of the `log(sigmoid(log_odds_chosen))`
- `nll_loss`: the mean negative log likelihood loss from the SFT part of the loss over chosen responses
## ORPOTrainer
[[autodoc]] ORPOTrainer

View File

@ -170,7 +170,7 @@ $$
}
$$
Despite \\( \textcolor{red}{\pi_{\text{inference}}} \\) and \\( \textcolor{blue}{\pi_{\text{training}}} \\) sharing the same model parameters \\( \theta \\), they can produce significantly different token probabilities. This unexpected behavior implicitly breaks the on-policy assumption, and silently turns training off-policy.
Truncated Importance Sampling (TIS) addresses this issue by adapting the model update via importance-sampling correction. The gradient computation of the aforementioned PPO objective becomes
@ -458,10 +458,7 @@ trainer = SFTTrainer(
Dynamic Fine-Tuning (DFT) improves the generalization of Large Language Models (LLMs) by dynamically rescaling gradients, outperforming standard Supervised Fine-Tuning (SFT) and showing competitive results in offline reinforcement learning.
$$
\mathcal{L}_{\text{DFT}}(\theta)
= \mathbb{E}_{(x,y) \sim \mathcal{D}} \left[ - \sum_{t=1}^{|y|}
\textcolor{red}{\text{sg}\big(\pi_\theta(y_t \mid y_{<t}, x)\big)}
\; \log \pi_\theta(y_t \mid y_{<t}, x) \right]
\mathcal{L}_{\text{DFT}}(\theta) = \mathbb{E}_{(x,y) \sim \mathcal{D}} \left[ - \sum_{t=1}^{|y|} \textcolor{red}{\text{sg}\big(\pi_\theta(y_t \mid y_{<t}, x)\big)} \; \log \pi_\theta(y_t \mid y_{<t}, x) \right]
$$
where \\( \text{sg}(\cdot) \\) is the stop-gradient operator. To use DFT with SFT as described in the paper, you can use the `loss_type="dft"` argument:
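For instance, a minimal, hedged sketch of enabling DFT in an SFT run (model and dataset names are placeholders):

```python
from datasets import load_dataset
from trl import SFTConfig, SFTTrainer

dataset = load_dataset("trl-lib/Capybara", split="train")  # any SFT-style dataset

# loss_type="dft" switches the SFT objective to the DFT loss described above
training_args = SFTConfig(loss_type="dft")
trainer = SFTTrainer(model="Qwen/Qwen2.5-0.5B-Instruct", args=training_args, train_dataset=dataset)
trainer.train()
```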

View File

@ -6,14 +6,15 @@ For more information on LoRA, see the [original paper](https://huggingface.co/pa
Here's an overview of the `peft`-enabled notebooks and scripts in the [trl repository](https://github.com/huggingface/trl/tree/main/examples):
| File | Task | Description | Colab link |
| --- | --- | --- | --- |
| [`stack_llama/rl_training.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama/scripts/rl_training.py) | RLHF | Distributed fine-tuning of the 7b parameter LLaMA models with a learned reward model and `peft`. | |
| [`stack_llama/reward_modeling.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama/scripts/reward_modeling.py) | Reward Modeling | Distributed training of the 7b parameter LLaMA reward model with `peft`. | |
| [`stack_llama/supervised_finetuning.py`](https://github.com/huggingface/trl/blob/main/examples/research_projects/stack_llama/scripts/supervised_finetuning.py) | SFT | Distributed instruction/supervised fine-tuning of the 7b parameter LLaMA model with `peft`. | |
## Installation
Note: peft is in active development, so we install directly from their GitHub page. Peft also relies on the latest version of transformers.
```bash
pip install trl[peft]
@ -27,7 +28,7 @@ Note: if you don't want to log with `wandb` remove `log_with="wandb"` in the scr
## How to use it?
Simply declare a `PeftConfig` object in your script and pass it through `.from_pretrained` to load the TRL+PEFT model.
```python
from peft import LoraConfig
@ -47,7 +48,9 @@ model = AutoModelForCausalLMWithValueHead.from_pretrained(
peft_config=lora_config,
)
```
And if you want to load your model in 8bit precision:
```python
pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
config.model_name,
@ -55,7 +58,9 @@ pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
peft_config=lora_config,
)
```
... or in 4bit precision:
```python
pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
config.model_name,
@ -64,7 +69,6 @@ pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
)
```
## Launch scripts
The `trl` library is powered by `accelerate`. As such it is best to configure and launch trainings with the following commands:
@ -77,6 +81,7 @@ accelerate launch examples/scripts/ppo.py --use_peft # launches training
## Using `trl` + `peft` and Data Parallelism
You can scale up to as many GPUs as you want, as long as you are able to fit the training process on a single device. The only tweak you need to apply is to load the model as follows:
```python
from peft import LoraConfig
...
@ -94,7 +99,9 @@ pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
peft_config=lora_config,
)
```
And if you want to load your model in 8bit precision:
```python
pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
config.model_name,
@ -102,7 +109,9 @@ pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
load_in_8bit=True,
)
```
... or in 4bit precision:
```python
pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
config.model_name,
@ -110,21 +119,20 @@ pretrained_model = AutoModelForCausalLMWithValueHead.from_pretrained(
load_in_4bit=True,
)
```
Finally, make sure that the rewards are computed on the correct device as well; for that, you can use `ppo_trainer.model.current_device`.
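For example (a minimal sketch, assuming `reward_scores` is the list of floats returned by your reward model):

```python
import torch

# Move the reward tensors to the same device as the (possibly sharded) model
device = ppo_trainer.model.current_device
rewards = [torch.tensor(score, device=device) for score in reward_scores]
```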
## Naive pipeline parallelism (NPP) for large models (>60B models)
The `trl` library also supports naive pipeline parallelism (NPP) for large models (>60B models). This is a simple way to parallelize the model across multiple GPUs: the model and the adapters are loaded across multiple GPUs, and the activations and gradients are naively communicated between them. This supports `int8` models as well as other `dtype` models.
![NPP](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/trl-npp.png)
### How to use NPP?
Simply load your model with a custom `device_map` argument on the `from_pretrained` to split your model across multiple devices. Check out this [nice tutorial](https://github.com/huggingface/blog/blob/main/accelerate-large-models.md) on how to properly create a `device_map` for your model.
Also make sure to have the `lm_head` module on the first GPU device, as it may throw an error otherwise. At the time of writing, you need to install the `main` branch of `accelerate`: `pip install git+https://github.com/huggingface/accelerate.git@main` and `peft`: `pip install git+https://github.com/huggingface/peft.git@main`.
### Launch scripts

View File

@ -1,10 +1,11 @@
# PPO Trainer
[![model badge](https://img.shields.io/badge/All_models-PPO-blue)](https://huggingface.co/models?other=ppo,trl)
TRL supports training LLMs with [Proximal Policy Optimization (PPO)](https://huggingface.co/papers/1707.06347).
References:
- [Fine-Tuning Language Models from Human Preferences](https://github.com/openai/lm-human-preferences)
- [Learning to Summarize from Human Feedback](https://github.com/openai/summarize-from-feedback)
- [The N Implementation Details of RLHF with PPO](https://huggingface.co/blog/the_n_implementation_details_of_rlhf_with_ppo)
@ -31,49 +32,45 @@ python examples/scripts/ppo/ppo.py \
--missing_eos_penalty 1.0
```
## Explanation of the logged metrics
The logged metrics are as follows. Here is an example [tracked run at Weights and Biases](https://wandb.ai/huggingface/trl/runs/dd2o3g35)
- `eps`: Tracks the number of episodes per second.
- `objective/kl`: The mean Kullback-Leibler (KL) divergence between the current policy and reference policy.
- `objective/entropy`: The mean entropy of the policy, indicating the randomness of the actions chosen by the policy.
- `objective/non_score_reward`: The mean reward from non-score-related sources, basically `beta * kl.sum(1)`, where `beta` is the KL penalty coefficient and `kl` is the per-token KL divergence.
- `objective/rlhf_reward`: The mean RLHF reward, which is `score - non_score_reward`.
- `objective/scores`: The mean scores returned by the reward model / environment.
- `policy/approxkl_avg`: The average approximate KL divergence between consecutive PPO policies. Note that this is not the same as `objective/kl`.
- `policy/clipfrac_avg`: The average fraction of policy updates that are clipped, indicating how often the policy updates are constrained to prevent large changes.
- `loss/policy_avg`: The average policy loss, indicating how well the policy is performing.
- `loss/value_avg`: The average value loss, indicating the difference between the predicted value and the actual reward.
- `val/clipfrac_avg`: The average fraction of value function updates that are clipped, similar to policy/clipfrac_avg but for the value function.
- `policy/entropy_avg`: The average entropy of the policy during training, indicating how diverse the policy's actions are.
- `val/ratio`: The mean ratio of the current policy probability to the old policy probability, providing a measure of how much the policy has changed.
- `val/ratio_var`: The variance of the `val/ratio`, indicating the variability in policy changes.
- `val/num_eos_tokens`: The number of end-of-sequence (EOS) tokens generated, which can indicate the number of complete responses.
- `lr`: The current learning rate used by the optimizer.
- `episode`: The current episode count in the training process.
## Cookbook
- Debugging TIP: `objective/rlhf_reward`: this is the ultimate objective of the RLHF training. If training works as intended, this metric should keep going up.
- Debugging TIP: `val/ratio`: this number should float around 1.0, and it gets clipped by `--cliprange 0.2` with PPO's surrogate loss. So if this `ratio` is too high, like 2.0 or 1000.0, or too small, like 0.1, it means the updates between consecutive policies are too drastic. You should try to understand why this is happening and try to fix it.
- Memory TIP: If you are running out of memory, you can try to reduce the `--per_device_train_batch_size` or increase the `--gradient_accumulation_steps` to reduce the memory footprint.
- Memory TIP: If you have multiple GPUs, you can also run training with DeepSpeed stage 3 to reduce the memory footprint `accelerate launch --config_file examples/accelerate_configs/deepspeed_zero3.yaml`.
- Usage TIP: We recommend using the "EOS trick" via `--missing_eos_penalty`, which subtracts a static scalar penalty from the score of completions that do not end with an EOS token. This can help the model learn to generate more coherent completions.
## What is my model doing exactly?
To help you understand what your model is doing, we periodically log some sample completions from the model. Here is an example of a completion. In an example [tracked run at Weights and Biases](https://wandb.ai/huggingface/trl/runs/dd2o3g35), it looks like the following, allowing you to see the model's response at different stages of training. By default we generate `--num_sample_generations 10` during training, but you can customize the number of generations.
![ppov2_completions](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/ppov2_completions.gif)
In the logs, the sampled generations look like:
```txt
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━┓
┃ query ┃ model response ┃ score ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━┩
@ -177,7 +174,7 @@ This PPO implementation is based on the [The N+ Implementation Details of RLHF w
To validate that the PPO implementation works, we ran an experiment on the 1B model. Here is the command we used to run the experiment. We take the SFT / RM models directly from [The N+ Implementation Details of RLHF with PPO: A Case Study on TL;DR Summarization](https://huggingface.co/papers/2403.17031).
```shell
accelerate launch --config_file examples/accelerate_configs/deepspeed_zero2.yaml \
examples/scripts/ppo/ppo_tldr.py \
--output_dir models/minimal/ppo_tldr \
@ -212,8 +209,7 @@ The PPO checkpoint gets a 64.7% preferred rate vs the 33.0% preference rate of t
Metrics:
![PPO v2](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/ppov2.png)
```bash
# pip install openrlbenchmark==0.2.1a5

View File

@ -1,6 +1,6 @@
# PRM Trainer
[![model badge](https://img.shields.io/badge/All_models-PRM-blue)](https://huggingface.co/models?other=prm,trl)
> [!WARNING]
> PRM Trainer is an experimental API which is subject to change at any time.
@ -15,7 +15,6 @@ The abstract from the paper is the following:
This post-training method was contributed by [Gaetan Lopez](https://github.com/gaetanlop), [Lewis Tunstall](https://huggingface.co/lewtun), [Quentin Gallouédec](https://huggingface.co/qgallouedec) and [Agustín Piqueres](https://huggingface.co/plaguss).
## Quick start
This example demonstrates how to train a model using the PRM method. We use the [Qwen 0.5B model](https://huggingface.co/Qwen/Qwen2-0.5B) as the base model. We use the stepwise supervision data from the [Math Shepherd dataset](https://huggingface.co/datasets/trl-lib/math_shepherd). You can view the data in the dataset here:
@ -54,7 +53,6 @@ Distributed across 8 GPUs, the training takes approximately 1 hour.
To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-Reward-Math-Sheperd) performs, you can use the following script.
```python
from datasets import load_dataset
from transformers import pipeline

View File

@ -7,9 +7,7 @@
Sequence lengths in the dataset can vary widely. When data is batched, sequences are padded to match the longest one in the batch, which can cause high memory usage, even if most sequences are relatively short.
![Truncation prompt-completion](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/why_you_should_truncate.png)
To reduce memory usage, it's important to truncate sequences to a reasonable length. While TRL trainers truncate sequences by default, you may want to adjust the default truncation length to better align with your specific use case.
@ -18,9 +16,7 @@ To reduce memory usage, it's important to truncate sequences to a reasonable len
DPO truncation is applied first to the prompt and to the completion via the `max_prompt_length` and `max_completion_length` parameters. The `max_length` parameter is then used to truncate the resulting sequence.
![DPO truncation](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/truncation_prompt_completion.png)
To set the truncation parameters, use the following code snippet:
@ -43,9 +39,7 @@ training_args = DPOConfig(..., max_completion_length=...)
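A hedged sketch of such a configuration, with placeholder lengths:

```python
from trl import DPOConfig

# Prompt and completion are truncated first, then the concatenated sequence
training_args = DPOConfig(
    max_prompt_length=512,
    max_completion_length=1024,
    max_length=1536,
)
```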
SFT truncation is applied to the input sequence via the `max_length` parameter.
![Truncation input ids](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/truncation_input_ids.png)
To set the truncation parameter, use the following code snippet:
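A hedged sketch with a placeholder length:

```python
from trl import SFTConfig

# max_length truncates the full tokenized sequence (prompt + completion)
training_args = SFTConfig(max_length=2048)
```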
@ -71,16 +65,14 @@ To help you choose an appropriate value, we provide a utility to visualize the s
> [!TIP]
> This technique applies only to SFT.
[Truncation](#truncation) has several drawbacks:
1. **Loss of information**: Key data at the end of a sequence may be discarded.
2. **Choosing truncation length**: Too short loses data; too long undermines efficiency.
Packing, introduced in [Raffel et al., 2020](https://huggingface.co/papers/1910.10683), addresses these issues by grouping sequences instead of truncating. It concatenates and splits dataset sequences into the desired lengths.
![Packing](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/packing_2.png)
Packing reduces padding by merging several sequences in one row when possible. We use an advanced method to be near-optimal in the way we pack the dataset. To enable packing, use `packing=True` in the [`SFTConfig`].
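For example (a minimal sketch; the length value is a placeholder):

```python
from trl import SFTConfig

# packing=True concatenates and splits dataset sequences to the target length,
# which minimizes padding as described above
training_args = SFTConfig(packing=True, max_length=2048)
```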
@ -142,9 +134,7 @@ training_args = KTOConfig(..., use_liger_loss=True)
Padding-free batching is an alternative approach for reducing memory usage. In this method, a batch is first sampled and then flattened into a single sequence, avoiding padding. Unlike packing, which can result in incomplete sequences by combining parts of different samples, padding-free batching ensures that all sequences remain complete and intact.
![Padding-free](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/padding-free.png)
> [!WARNING]
> It's highly recommended to use padding-free batching with **FlashAttention 2** or **FlashAttention 3**. Otherwise, you may encounter batch contamination issues.
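A hedged sketch of enabling it in SFT (assuming the `padding_free` flag of [`SFTConfig`] and FlashAttention 2 being available in your environment):

```python
from trl import SFTConfig

# Flatten each batch into a single sequence instead of padding;
# pair with FlashAttention 2 to avoid cross-sample contamination (see warning above)
training_args = SFTConfig(
    padding_free=True,
    model_init_kwargs={"attn_implementation": "flash_attention_2"},
)
```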

View File

@ -1,6 +1,6 @@
# Reward Modeling
[![model badge](https://img.shields.io/badge/All_models-Reward_Trainer-blue)](https://huggingface.co/models?other=reward-trainer,trl)
## Overview

View File

@ -1,6 +1,6 @@
# RLOO Trainer
[![model badge](https://img.shields.io/badge/All_models-RLOO-blue)](https://huggingface.co/models?other=rloo,trl)
## Overview
@ -101,14 +101,13 @@ where \\( \beta > 0 \\) controls the strength of the KL penalty.
### Computing the advantage
Once the rewards for each completion have been computed, we calculate a baseline as the average reward of all other samples in the same batch, excluding the current sample. This baseline is used to reduce the variance of the policy gradient estimate. The advantage for each completion is then obtained as the difference between its own reward and this leave-one-out baseline.
Formally, for a batch of \\( G \\) completions, the baseline for completion \\( i \\) is:
$$
b_i = \frac{1}{G-1} \sum_{j \neq i} r_j
$$
and then the advantage for each completion is computed as the difference between its reward and the baseline:
$$
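In code, the leave-one-out baseline and advantage described above can be sketched as follows (illustrative only, not the trainer's internal implementation):

```python
import torch

rewards = torch.tensor([0.2, 0.9, 0.5, 0.4])  # rewards for G completions of one prompt
G = rewards.numel()

# Leave-one-out baseline: mean of all other rewards in the group
baseline = (rewards.sum() - rewards) / (G - 1)
advantages = rewards - baseline
```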
@ -151,9 +150,9 @@ While training and evaluating, we record the following reward metrics:
- `entropy`: Average entropy of token predictions across generated completions. (If `mask_truncated_completions=True`, masked sequences tokens are excluded.)
- `kl`: The average KL divergence between the model and the reference model, calculated over generated completions. Logged only if `beta` is nonzero.
- `clip_ratio/region_mean`: The ratio of sequence probabilities where the RLOO objective is clipped to stay within the trust region:
$$
\text{clip}\left( r_{i}(\theta), 1 - \epsilon_\mathrm{low}, 1 + \epsilon_\mathrm{high} \right)\,, \qquad r_{i}(\theta) = \frac{\pi_\theta(o_{i} \mid q)}{\pi_{\theta_{\text{old}}}(o_{i} \mid q)}\,.
$$
A higher value means more samples are clipped, which constrains how much the policy $\pi_\theta$ can change.
- `clip_ratio/low_mean`: The average ratio of sequence probabilities that were clipped on the lower bound of the trust region: \\(r_{i,t}(\theta) < 1 - \epsilon_\mathrm{low}\\)
@ -166,6 +165,7 @@ $$
### Speed up training with vLLM-powered generation
Generation is often the main bottleneck when training with online methods. To accelerate generation, you can use [vLLM](https://github.com/vllm-project/vllm), a high-throughput, low-latency inference engine for LLMs. To enable it, first install the package with
```shell
pip install trl[vllm]
```
@ -177,11 +177,13 @@ We support two ways of using vLLM during training: **server mode** and **colocat
In this mode, vLLM runs in a separate process (and using separate GPUs) and communicates with the trainer via HTTP. This is ideal if you have dedicated GPUs for inference.
1. **Start the vLLM server**:
```bash
trl vllm-serve --model <model_name>
```
2. **Enable server mode in your training script**:
```python
from trl import RLOOConfig
@ -214,12 +216,7 @@ training_args = RLOOConfig(
>
> We provide a [HF Space](https://huggingface.co/spaces/trl-lib/recommend-vllm-memory) to help estimate the recommended GPU memory utilization based on your model configuration and experiment settings. Simply use it as follows to get `vllm_gpu_memory_utilization` recommendation:
>
> <iframe src="https://trl-lib-recommend-vllm-memory.hf.space" frameborder="0" width="850" height="450"></iframe>
>
> If the recommended value does not work in your environment, we suggest adding a small buffer (e.g., +0.05 or +0.1) to the recommended value to ensure stability.
>
@ -418,6 +415,7 @@ You can test this function as follows:
>>> reward_func(prompts=prompts, completions=completions, ground_truth=ground_truth)
[1.0, 0.0]
```
#### Example 4: Multi-task reward functions
Below is an example of using multiple reward functions in the [`RLOOTrainer`]. In this example, we define two task-specific reward functions: `math_reward_func` and `coding_reward_func`. The `math_reward_func` rewards math problems based on their correctness, while the `coding_reward_func` rewards coding problems based on whether the solution works.
@ -478,8 +476,6 @@ In this example, the `math_reward_func` and `coding_reward_func` are designed to
Note that the [`RLOOTrainer`] will ignore the `None` rewards returned by the reward functions and only consider the rewards returned by the relevant functions. This ensures that the model is trained on the relevant tasks and ignores the tasks for which there is no relevant reward function.
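A hedged sketch of what such a task-conditional reward function could look like (the `task` column and the correctness check are placeholders inferred from the description above):

```python
def math_reward_func(prompts, completions, task, **kwargs):
    rewards = []
    for completion, t in zip(completions, task):
        if t == "math":
            # Placeholder correctness check; replace with a real verifier (e.g. math-verify)
            rewards.append(1.0 if "42" in completion else 0.0)
        else:
            # None tells the trainer to ignore this function for non-math samples
            rewards.append(None)
    return rewards
```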
#### Passing the reward function to the trainer
To use your custom reward function, pass it to the [`RLOOTrainer`] as follows:

View File

@ -4,15 +4,11 @@ The notebooks and scripts in these examples show how to fine-tune a model with a
Here's an overview of the notebooks and scripts in the [trl repository](https://github.com/huggingface/trl/tree/main/examples):
| File | Description |
| --- | --- |
| [`examples/scripts/ppo.py`](https://github.com/huggingface/trl/blob/main/examples/scripts/ppo.py) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/sentiment/notebooks/gpt2-sentiment.ipynb) | This script shows how to use the `PPOTrainer` to fine-tune a sentiment analysis model using IMDB dataset |
| [`examples/notebooks/gpt2-sentiment.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-sentiment.ipynb) | This notebook demonstrates how to reproduce the GPT2 imdb sentiment tuning example on a jupyter notebook. |
| [`examples/notebooks/gpt2-control.ipynb`](https://github.com/huggingface/trl/tree/main/examples/notebooks/gpt2-control.ipynb) [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/trl/blob/main/examples/sentiment/notebooks/gpt2-sentiment-control.ipynb) | This notebook demonstrates how to reproduce the GPT2 sentiment control example on a jupyter notebook. |
## Usage
@ -30,7 +26,6 @@ python examples/scripts/ppo.py --log_with wandb --mini_batch_size 1 --gradient_a
Note: if you don't want to log with `wandb` remove `log_with="wandb"` in the scripts/notebooks. You can also replace it with your favourite experiment tracker that's [supported by `accelerate`](https://huggingface.co/docs/accelerate/usage_guides/tracking).
## Few notes on multi-GPU
To run in a multi-GPU setup with DDP (Distributed Data Parallel), change the `device_map` value to `device_map={"": Accelerator().process_index}` and make sure to run your script with `accelerate launch yourscript.py`. If you want to apply naive pipeline parallelism, you can use `device_map="auto"`.

View File

@ -106,7 +106,6 @@ $$
where \\( y_t \\) is the target token at timestep \\( t \\), and the model is trained to predict the next token given the previous ones. In practice, padding tokens are masked out during loss computation.
> [!TIP]
>
> [On the Generalization of SFT: A Reinforcement Learning Perspective with Reward Rectification](https://huggingface.co/papers/2508.05629) proposes an alternative loss function, called **Dynamic Fine-Tuning (DFT)**, which aims to improve generalization by rectifying the reward signal. This method can be enabled by setting `loss_type="dft"` in the [`SFTConfig`]. For more details, see [Paper Index - Dynamic Fine-Tuning](paper_index#on-the-generalization-of-sft-a-reinforcement-learning-perspective-with-reward-rectification).
### Label shifting and masking

View File

@ -48,11 +48,13 @@ You can customize the server configuration by passing additional arguments. For
> When using vLLM, ensure that the GPUs assigned for training and generation are separate to avoid resource conflicts. For instance, if you plan to use 4 GPUs for training and another 4 for vLLM generation, you can specify GPU allocation using `CUDA_VISIBLE_DEVICES`.
>
> Set GPUs **0-3** for vLLM generation:
>
> ```sh
> CUDA_VISIBLE_DEVICES=0,1,2,3 trl vllm-serve --model <model_name>
> ```
>
> And GPUs **4-7** for training:
>
> ```sh
> CUDA_VISIBLE_DEVICES=4,5,6,7 accelerate launch train.py
> ```
@ -79,12 +81,14 @@ You can customize the server configuration by passing additional arguments. For
> [!WARNING]
> When using vLLM, ensure that the GPUs assigned for training and generation are separate to avoid resource conflicts. For instance, if you plan to use 4 GPUs for training and another 4 for vLLM generation, you can specify GPU allocation using `CUDA_VISIBLE_DEVICES`.
>
> Set GPUs **0-3** for vLLM generation:
>
> ```sh
> CUDA_VISIBLE_DEVICES=0,1,2,3 trl vllm-serve --model <model_name>
> ```
>
> And GPUs **4-7** for training:
>
> ```sh
> CUDA_VISIBLE_DEVICES=4,5,6,7 accelerate launch train.py
> ```

View File

@ -43,7 +43,6 @@ To use the data efficiently, we use a technique called packing: instead of havin
With this approach the training is much more efficient as each token that is passed through the model is also trained in contrast to padding tokens which are usually masked from the loss.
If you don't have much data and are more concerned about occasionally cutting off some tokens that overflow the context, you can also use a classical data loader.
```python
# load model in 8bit
model = AutoModelForCausalLM.from_pretrained(
@ -109,6 +108,7 @@ peft_config = LoraConfig(
lora_dropout=0.1,
)
```
As detailed in the next section, the resulting adapter can be merged into the frozen model and saved for further downstream use.
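For reference, merging is typically a couple of lines with `peft` (a sketch; the paths are placeholders):

```python
from peft import AutoPeftModelForCausalLM

model = AutoPeftModelForCausalLM.from_pretrained("path/to/adapter_checkpoint")
merged_model = model.merge_and_unload()  # folds the LoRA weights into the base model
merged_model.save_pretrained("path/to/merged_model")
```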
## Reinforcement Learning from Human Feedback

View File

@ -7,14 +7,13 @@ This document will guide you through the process of using vLLM with TRL for fast
> [!TIP]
> The following trainers currently support generation with vLLM:
>
> - [`GRPOTrainer`]
> - [`OnlineDPOTrainer`]
> - [`NashMDTrainer`]
> - [`XPOTrainer`]
> - [`RLOOTrainer`]
## 🚀 How can I use vLLM with TRL to speed up training?
💡 **Note**: Resources required for this specific example: a single node with 8 GPUs.
@ -235,16 +234,16 @@ Separately, the number of completions to generate per prompt is controlled by th
### 🥸 More detail on what happens under the hood when running the server
- The vLLM server starts by running the command: `trl vllm-serve --model Qwen/Qwen2.5-7B`.
- Once the server is running, it generates completions based on requests from the client (trainer) using `vllm_client.generate` [these lines](https://github.com/huggingface/trl/blob/cc044e35b285be7dc062764b3364e1e684db4c7c/trl/trainer/grpo_trainer.py#L1025-L1035).
- The client (trainer) then requests these completions from the server.
- These completions are used to compute the reward signal.
- Based on the reward signal and the models output, the loss is computed, and the backward pass is performed to update the models weights.
- **Note**: The server only handles completion generation — it doesnt train the model. Therefore, the models weights arent updated on the server. Once the backward pass is complete, the client sends the updated weights to the server using `vllm_client.update_named_param(name, param.data)`.
When using vLLM, ensure the GPUs assigned for training and generation are separate to avoid NCCL communication conflicts. If you do not set the `CUDA_VISIBLE_DEVICES` environment variable, the training script will use all available GPUs by default, which may lead to device conflicts. Starting from the TRL release after v0.19.1, the code automatically detects and prevents same-device usage, raising an error in the vLLM server process:
```log
RuntimeError: Attempting to use the same CUDA device for multiple distinct roles/ranks within the same communicator.
Ensure that trainer is using different devices than vLLM server.
```
@ -307,23 +306,23 @@ options:
### 💆🏻‍♀️ What's the best distributed setup?
![tp dp throughput 8 gpus](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/tp_dp_throughput_8_gpus.png)
![tp dp throughput 4 gpus](https://huggingface.co/datasets/trl-lib/documentation-images/resolve/main/tp_dp_throughput_4_gpus.png)
First and foremost, always remember that the optimal setup depends on:
- The model size
- The number of GPUs you have
- The GPU memory size
- The batch size you are using
- The number of requests you are sending to the server (prompts)
- The `max_model_len` you are using (this is the max length of the input sequence that the model can process, a.k.a. the context window size)
- The number of completions you are generating for each request (`num_generations`)
Given these factors, our experiments on the Qwen model family (3B, 7B, 14B, 32B) using 8 H100 GPUs show that:
- For reasonable-sized models (3B14B) and a moderate context window (`max_len < 8k`), using full capacity for data parallelism gives better throughput. The setup `(tp=1, dp=8)` yields the best results.
- For larger models (32B) and longer context windows (`max_len > 8k`), a smaller DP size combined with some model-side parallelism performs better. For example, `(tp=2, dp=4)` is a good setup for 32B models with a larger context window.
### vLLM with Transformers Backend
@ -334,7 +333,7 @@ For more details, check out [vLLM Transformers Backend](https://blog.vllm.ai/202
Example:
```sh
CUDA_DEVICE_ORDER=PCI_BUS_ID CUDA_VISIBLE_DEVICES=0 trl vllm-serve --model Qwen/Qwen2.5-VL-3B-Instruct --tensor-parallel-size 1 --port 8000 --enforce_eager --vllm_model_impl transformers
```
@ -496,7 +495,5 @@ training_args = RLOOConfig(
> [!WARNING]
> Check the documentation of the trainer you are using for specific details on vLLM usage and parameters.
> [!WARNING]
> To reduce GPU memory usage when running vLLM, consider [enabling vLLM sleep mode](reducing_memory_usage#vllm-sleep-mode).

View File

@ -1,6 +1,6 @@
# XPO Trainer
[![model badge](https://img.shields.io/badge/All_models-XPO-blue)](https://huggingface.co/models?other=xpo,trl)
## Overview
@ -57,7 +57,7 @@ To see how the [trained model](https://huggingface.co/trl-lib/Qwen2-0.5B-XPO) pe
What is the best programming language?
<strong><span style="color: blue;">&lt;trl-lib/Qwen2-0.5B-XPO&gt;:</span></strong>
The best programming language depends on individual preferences and familiarity with coding concepts. Some popular languages include Python, Java, C++, and JavaScript.
</code></pre>
## Expected dataset type
@ -148,7 +148,6 @@ While training and evaluating we record the following reward metrics:
* `alpha`: The weight of the XPO loss term. Typically fixed, but can be made dynamic by passing a list to [`XPOConfig`].
* `beta`: The parameter that controls the weight of the loss term representing the deviation from the reference model. Typically fixed, but can be made dynamic by passing a list to [`XPOConfig`].
## XPOTrainer
[[autodoc]] XPOTrainer

View File

@ -1,3 +1,3 @@
# Examples
Please check out https://huggingface.co/docs/trl/example_overview for documentation on our examples.

View File

@ -4,21 +4,23 @@
Install all the dependencies in the `requirements.txt`:
```shell
pip install -U -r requirements.txt
```
Since we will use `accelerate` for training, make sure to run:
```shell
accelerate config
```
## Training
There were two main steps to the DPO training process:
1. Supervised fine-tuning of the base llama-v2-7b model to create llama-v2-7b-se:
```shell
accelerate launch examples/research_projects/stack_llama_2/scripts/sft_llama2.py \
--output_dir="./sft" \
--max_steps=500 \
@ -38,19 +40,20 @@ There were two main steps to the DPO training process:
--run_name="sft_llama2" \
--report_to="wandb"
```
2. Run the DPO trainer using the model saved by the previous step:
```shell
accelerate launch examples/research_projects/stack_llama_2/scripts/dpo_llama2.py \
--model_name_or_path="sft/final_checkpoint" \
--output_dir="dpo"
```
## Merging the adaptors
To merge the adaptors into the base model we can use the `merge_peft_adapter.py` helper script that comes with TRL:
```shell
python examples/research_projects/stack_llama/scripts/merge_peft_adapter.py --base_model_name="meta-llama/Llama-2-7b-hf" --adapter_model_name="dpo/final_checkpoint/" --output_name="stack-llama-2"
```
@ -60,7 +63,7 @@ which will also push the model to your HuggingFace hub account.
We can load the DPO-trained LoRA adaptors which were saved by the DPO training step and load them via:
```python
from peft import AutoPeftModelForCausalLM

624
online_dpo.py Normal file
View File

@ -0,0 +1,624 @@
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# /// script
# dependencies = [
# "trl",
# "peft",
# "math-verify",
# "latex2sympy2_extended",
# "trackio",
# "kernels",
# ]
# ///
"""
LoRA Without Regret - Online DPO Training Script
Based on: https://thinkingmachines.ai/blog/lora/
Trains a language model using Online DPO with LoRA adapters for parameter-efficient
fine-tuning on mathematical reasoning tasks.
Features:
- Online DPO training with custom reward functions
- LoRA for parameter-efficient fine-tuning
- BEMA (Bias-Corrected Exponential Moving Average) for dynamic reference model updating
- **PEFT/LoRA Compatible**: Updates base model with smoothed merged weights
- Creates a dynamic reference that follows policy in a smoothed, lagged manner
- For LoRA: Policy = BEMA(base) + LoRA adapters, Reference = BEMA(base)
- Improves training stability compared to fixed reference model
- The final saved BEMA model often generalizes better than the last checkpoint
- BEMA settings used in this script (see the PEFTBEMACallback initialization to modify them):
* update_freq=10: Update BEMA weights every 10 steps
* ema_power=0.5: EMA decay factor power (κ in BEMA paper)
* bias_power=0.2: BEMA scaling factor power (η in BEMA paper)
* lag=5: Initial offset for smoothness (ρ in BEMA paper)
* update_after=20: Burn-in period before BEMA starts (τ in BEMA paper)
* device="cpu": Store BEMA buffers on CPU to save GPU memory
Usage:
hf jobs uv run \\
--flavor a100-large \\
--timeout 4h \\
--secrets HF_TOKEN \\
--env PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \\
online_dpo.py \\
--model_name_or_path Qwen/Qwen3-0.6B \\
--dataset_name HuggingFaceH4/OpenR1-Math-220k-default-verified \\
--output_dir online-dpo-lora-qwen3-0.6b \\
--learning_rate 1.0e-6 \\
--lr_scheduler_type cosine \\
--warmup_ratio 0.0 \\
--max_grad_norm 1.0 \\
--beta 0.1 \\
--max_length 2048 \\
--max_new_tokens 2048 \\
--gradient_accumulation_steps 8 \\
--per_device_train_batch_size 1 \\
--num_train_epochs 1 \\
--use_peft \\
--lora_r 1 \\
--lora_alpha 32 \\
--lora_dropout 0.0 \\
--lora_target_modules all-linear \\
--use_vllm \\
--vllm_mode colocate \\
--vllm_gpu_memory_utilization 0.4 \\
--save_strategy steps \\
--save_steps 50 \\
--save_total_limit 1 \\
--logging_steps 1 \\
--max_steps 200 \\
--report_to trackio
Note: BEMA (Bias-Corrected Exponential Moving Average) is automatically enabled for
dynamic reference model updating, with full support for LoRA/PEFT training.
How BEMA works with LoRA:
1. During training, LoRA adapters are trained on top of the base model
2. Every `update_freq` steps (after the `update_after` burn-in, i.e. every 10 steps after step 20 with this script's settings), BEMA:
a. Computes merged weights: base_weights + LoRA_deltas
b. Applies BEMA smoothing to the merged weights
c. Updates the shared base model with smoothed BEMA weights
3. Both policy and reference now use the BEMA-smoothed base:
- Policy forward pass: BEMA(base) + enabled LoRA adapters
- Reference forward pass: BEMA(base) + disabled LoRA adapters (= just BEMA(base))
This creates a dynamic reference model that follows the policy but in a lagged, smoothed
manner, which improves training stability compared to a completely fixed reference.
The LoRA adapters continue training on top of the evolving BEMA base, and the final
BEMA model often generalizes better than the last checkpoint because it represents
a smoothed trajectory through weight space.
BEMA weights are also saved to bema_state_dict.pt at checkpoints for later use.
To customize BEMA parameters, modify the PEFTBEMACallback initialization in the script
(lines 519-529). To disable BEMA, comment out or remove the callback section.
For more details on BEMA, see: https://huggingface.co/papers/2508.00180
"""
import os
from typing import Optional
from datetime import datetime
import torch
from datasets import load_dataset
from latex2sympy2_extended import NormalizationConfig
from math_verify import LatexExtractionConfig, parse, verify
from transformers import AutoTokenizer
from trl import (
OnlineDPOConfig,
OnlineDPOTrainer,
ModelConfig,
ScriptArguments,
TrlParser,
get_kbit_device_map,
get_peft_config,
get_quantization_config,
)
from trl.trainer.utils import SIMPLE_CHAT_TEMPLATE
# Import BEMA dependencies
try:
from transformers import PreTrainedModel, TrainerCallback, TrainerControl, TrainerState, TrainingArguments
from accelerate.utils import is_peft_model
BEMA_AVAILABLE = True
except ImportError:
BEMA_AVAILABLE = False
print("Warning: BEMA dependencies not available.")
# Disable GPU memory logging to avoid NCCL OOM errors during distributed logging
os.environ.setdefault("TRANSFORMERS_DISABLE_GPU_MEMORY_LOGGING", "1")
################
# PEFT-Compatible BEMA Callback
################
class PEFTBEMACallback(TrainerCallback):
"""
BEMA callback that works with PEFT/LoRA models for dynamic reference model updating.
This callback:
- Merges LoRA adapters with base model to compute full effective weights
- Applies BEMA (Bias-Corrected Exponential Moving Average) to the merged weights
- Updates the shared base model with smoothed BEMA weights
How it works with OnlineDPO + LoRA:
- Policy model = BEMA(base) + enabled LoRA adapters
- Reference model = BEMA(base) + disabled LoRA adapters
This creates a dynamic reference that follows the policy in a smoothed, lagged manner,
which can improve training stability compared to a fixed reference model.
The BEMA weights represent a smoothed trajectory through weight space and often
generalize better than the final checkpoint.
"""
def __init__(
self,
update_freq: int = 50,
ema_power: float = 0.5,
bias_power: float = 0.2,
lag: int = 10,
update_after: int = 0,
multiplier: float = 1.0,
min_ema_multiplier: float = 0.0,
device: str = "cpu",
):
self.update_freq = update_freq
self.ema_power = ema_power
self.bias_power = bias_power
self.lag = lag
self.update_after = update_after
self.multiplier = multiplier
self.min_ema_multiplier = min_ema_multiplier
self.device = device
# Storage for BEMA state
self.theta0_params = {} # θ₀ - initial merged parameters
self.ema_params = {} # EMA of merged parameters
self.param_names = [] # Names of base model parameters
self.bema_state_dict = {} # BEMA weights
def _unwrap_model(self, model):
"""Unwrap model from DDP/FSDP/DeepSpeed wrappers."""
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
if isinstance(model, (DDP, FSDP)):
return model.module
if hasattr(model, "module"):
return model.module
return model
def _get_merged_state_dict(self, model):
"""
Get merged state dict (base model + LoRA adapters) WITHOUT modifying the base model.
This computes the effective weights by manually adding LoRA deltas to base weights.
All computations are done on self.device (CPU by default) to avoid GPU OOM.
"""
model = self._unwrap_model(model)
if is_peft_model(model):
# For PEFT models, we manually compute merged weights without touching the base model
try:
from peft.tuners.lora import LoraLayer
except ImportError:
# Fallback for older peft versions
from peft.tuners.lora.layer import LoraLayer
merged_state_dict = {}
base_model = model.get_base_model()
# Get base model parameters first and move to self.device (CPU by default)
base_state_dict = {name: param.detach().clone().to(self.device) for name, param in base_model.named_parameters()}
# Now add LoRA deltas manually without modifying the base model
# All computations done on self.device (CPU by default) to save GPU memory
for name, module in model.named_modules():
if isinstance(module, LoraLayer):
# This is a LoRA layer, compute the delta
# LoRA delta = (lora_B @ lora_A) * scaling
if hasattr(module, 'lora_A') and hasattr(module, 'lora_B'):
for adapter_name in module.lora_A.keys():
if module.active_adapter == adapter_name:
# Move LoRA weights to self.device for computation
lora_A = module.lora_A[adapter_name].weight.detach().to(self.device)
lora_B = module.lora_B[adapter_name].weight.detach().to(self.device)
scaling = module.scaling[adapter_name]
# Compute LoRA delta on self.device: BA * scaling
lora_delta = (lora_B @ lora_A) * scaling
# Find the corresponding base weight name
# The module name in PEFT includes 'base_model.model.'
parent_name = name.replace('base_model.model.', '')
# Add delta to base weight (both on CPU)
weight_name = f"{parent_name}.weight"
if weight_name in base_state_dict:
merged_state_dict[weight_name] = base_state_dict[weight_name] + lora_delta
# Fill in any parameters that weren't modified by LoRA
for name, param in base_state_dict.items():
if name not in merged_state_dict:
merged_state_dict[name] = param
return merged_state_dict
else:
# For non-PEFT models, just return the state dict on self.device
return {name: param.detach().clone().to(self.device) for name, param in model.named_parameters()}
@torch.no_grad()
def on_train_begin(
self, args: TrainingArguments, state: TrainerState, control: TrainerControl, model: PreTrainedModel, **kwargs
):
# Get initial merged weights
merged_state = self._get_merged_state_dict(model)
# Initialize θ₀ and EMA with merged weights
total_params = 0
for name, param in merged_state.items():
self.param_names.append(name)
theta0 = param.to(self.device)
self.theta0_params[name] = theta0
self.ema_params[name] = theta0.clone()
total_params += param.numel()
print(f"PEFT-BEMA: Initialized BEMA callback")
print(f" - Tracking {len(self.param_names)} parameters ({total_params:,} total elements)")
print(f" - Storage device: {self.device}")
print(f" - Update frequency: every {self.update_freq} steps after step {self.update_after}")
def _ema_beta(self, step: int) -> float:
"""Compute the EMA decay factor βₜ = (ρ + γ·t)⁻ᵏᵃᵖᵖᵃ."""
beta = (self.lag + self.multiplier * step) ** (-self.ema_power)
return max(beta, self.min_ema_multiplier)
def _bema_alpha(self, step: int) -> float:
"""Compute the BEMA scaling factor αₜ = (ρ + γ·t)⁻ᵉᵗᵃ."""
return (self.lag + self.multiplier * step) ** (-self.bias_power)
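# Illustrative check with the values this script passes to the callback (lag=5, multiplier=1.0,
# ema_power=0.5, bias_power=0.2): at step t=20, beta = (5 + 20) ** -0.5 = 0.2 and
# alpha = (5 + 20) ** -0.2 ≈ 0.53, so the EMA still moves noticeably while the bias-correction
# term is slowly annealed as training progresses.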
def _update_bema_weights(self, model, step: int):
"""Update BEMA weights using merged LoRA + base weights. All on self.device (CPU by default)."""
beta = self._ema_beta(step)
alpha = self._bema_alpha(step)
# Get current merged weights (already on self.device from _get_merged_state_dict)
merged_state = self._get_merged_state_dict(model)
# Track magnitude of updates for verification
total_change = 0.0
num_params = 0
# Update EMA and compute BEMA for each parameter (all on self.device)
for name in self.param_names:
if name not in merged_state:
continue
thetat = merged_state[name].to(self.device) # Should already be on self.device (CPU)
theta0 = self.theta0_params[name]
ema = self.ema_params[name]
# Store old BEMA for comparison
old_bema = self.bema_state_dict[name].clone() if name in self.bema_state_dict else ema.clone()
# EMA update: ema = (1 - beta) * ema + beta * θₜ
ema.mul_(1 - beta).add_(thetat, alpha=beta)
# BEMA update: bema = ema + alpha * (θₜ - θ₀)
bema_weight = ema + alpha * (thetat - theta0)
# Track change in BEMA weights
change = (bema_weight - old_bema).abs().mean().item()
total_change += change
num_params += 1
# Keep on self.device (CPU by default)
self.bema_state_dict[name] = bema_weight
avg_bema_change = total_change / num_params if num_params > 0 else 0.0
print(f" - Average BEMA weight change: {avg_bema_change:.6e}")
def _update_ref_model(self, model):
"""
Update the base model with BEMA weights.
For PEFT/LoRA with OnlineDPO:
- Policy model = base_model (with BEMA weights) + enabled LoRA adapters
- Reference model = base_model (with BEMA weights) + disabled LoRA adapters
This creates a dynamic reference model that follows the policy in a smoothed manner,
which can improve training stability compared to a fixed reference model.
BEMA weights are stored on self.device (CPU by default) and moved to GPU only during loading.
"""
model = self._unwrap_model(model)
if is_peft_model(model):
base_model = model.get_base_model()
# Get the device of the base model
model_device = next(base_model.parameters()).device
# Compute change magnitude before update (for verification)
old_weight = next(base_model.parameters()).detach().clone()
# Move BEMA weights to model device and load
bema_state_dict_gpu = {k: v.to(model_device) for k, v in self.bema_state_dict.items()}
base_model.load_state_dict(bema_state_dict_gpu, strict=False)
# Compute change magnitude after update
new_weight = next(base_model.parameters()).detach()
weight_change = (new_weight - old_weight).abs().mean().item()
print(f"PEFT-BEMA: Updated shared base model with BEMA weights")
print(f" Policy = BEMA_base + LoRA, Reference = BEMA_base")
print(f" Average weight change: {weight_change:.6e}")
else:
# For full fine-tuning, update the model directly
model_device = next(model.parameters()).device
old_weight = next(model.parameters()).detach().clone()
bema_state_dict_gpu = {k: v.to(model_device) for k, v in self.bema_state_dict.items()}
model.load_state_dict(bema_state_dict_gpu, strict=False)
new_weight = next(model.parameters()).detach()
weight_change = (new_weight - old_weight).abs().mean().item()
print(f"BEMA: Updated model with BEMA weights (change: {weight_change:.6e})")
@torch.no_grad()
def on_step_end(
self, args: TrainingArguments, state: TrainerState, control: TrainerControl, model: PreTrainedModel, **kwargs
):
step = state.global_step
# Only update after burn-in period and at specified frequency
if step < self.update_after:
return
if (step - self.update_after) % self.update_freq == 0:
print(f"\n{'='*60}")
print(f"PEFT-BEMA: Step {step} - Computing BEMA update")
print(f" - EMA beta: {self._ema_beta(step):.6f}")
print(f" - BEMA alpha: {self._bema_alpha(step):.6f}")
self._update_bema_weights(model, step)
self._update_ref_model(model)
print(f"{'='*60}\n")
@torch.no_grad()
def on_save(
self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs
):
"""Save BEMA state dict."""
if self.bema_state_dict:
import os
bema_path = os.path.join(args.output_dir, "bema_state_dict.pt")
torch.save(self.bema_state_dict, bema_path)
# Compute some statistics for verification
total_params = sum(v.numel() for v in self.bema_state_dict.values())
avg_magnitude = torch.mean(torch.stack([v.abs().mean() for v in self.bema_state_dict.values()])).item()
print(f"PEFT-BEMA: Saved BEMA state dict to {bema_path}")
print(f" {len(self.bema_state_dict)} tensors, {total_params:,} total parameters")
print(f" Average magnitude: {avg_magnitude:.6e}")
################
# Reward Function for Training
################
def strip_reasoning_accuracy_reward(
completions: list[list[dict[str, str]]], solution: list[str], **kwargs
) -> list[Optional[float]]:
"""Reward function that strips reasoning tags and checks mathematical accuracy.
This function:
1. Extracts the content from completions
2. Removes <think></think> tags (for reasoning that shouldn't be evaluated)
3. Parses both the gold solution and the predicted answer
4. Uses math_verify to check if they are mathematically equivalent
Args:
completions: List of model completions, each containing a list of messages
solution: List of ground truth solutions
**kwargs: Additional arguments (ignored but required for trainer compatibility)
Returns:
List of rewards where:
- 1.0 if the answer is correct
- 0.0 if the answer is incorrect
- None if the solution is not parseable (skips this example)
"""
contents = [completion[0]["content"] for completion in completions]
rewards = []
for content, sol in zip(contents, solution):
# Strip reasoning tags from completion
while "<think>" in content and "</think>" in content:
start = content.find("<think>")
end = content.find("</think>", start)
if start != -1 and end != -1:
content = content[:start] + content[end + len("</think>") :]
else:
break
# Parse gold solution
gold_parsed = parse(
f"${sol}$",
extraction_config=[
LatexExtractionConfig(
boxed_match_priority=0, try_extract_without_anchor=True
)
],
)
if len(gold_parsed) != 0:
# We require the answer to be provided in correct latex (no malformed operators)
answer_parsed = parse(
content,
extraction_config=[
LatexExtractionConfig(
boxed_match_priority=0,
normalization_config=NormalizationConfig(
basic_latex=True,
units=True,
malformed_operators=False,
nits=False,
boxed=True,
),
try_extract_without_anchor=False,
)
],
extraction_mode="first_match",
)
# Compute binary rewards if verifiable, `None` otherwise to skip this example
try:
reward = float(verify(gold_parsed, answer_parsed))
except Exception as e:
print(
f"verify failed: {e}, answer: {answer_parsed}, gold: {gold_parsed}"
)
reward = None
else:
# If the gold solution is not parseable, we assign `None` to skip this example
reward = None
rewards.append(reward)
return rewards
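# Hedged usage sketch (illustrative values, not executed by this script): the trainer calls the
# reward function with conversational completions and the dataset's `solution` column, e.g.
#     strip_reasoning_accuracy_reward(
#         completions=[[{"role": "assistant", "content": r"<think>steps</think> The answer is \boxed{4}."}]],
#         solution=["4"],
#     )
# which is expected to return [1.0] when math_verify confirms equivalence, and [None] when the
# gold solution cannot be parsed.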
if __name__ == "__main__":
parser = TrlParser((ScriptArguments, OnlineDPOConfig, ModelConfig))
script_args, training_args, model_args = parser.parse_args_and_config()
# Enable logging in a Hugging Face Space
os.environ.setdefault("TRACKIO_SPACE_ID", "burtenshaw/trl-online-dpo")
os.environ.setdefault("TRACKIO_PROJECT", model_args.model_name_or_path.split('/')[-1] + script_args.dataset_name.replace('/', '-'))
################
# Model & Processor
################
dtype = (
model_args.dtype
if model_args.dtype in ["auto", None]
else getattr(torch, model_args.dtype)
)
training_args.model_init_kwargs = dict(
revision=model_args.model_revision,
attn_implementation=model_args.attn_implementation,
dtype=dtype,
)
quantization_config = get_quantization_config(model_args)
if quantization_config is not None:
# Passing None would not be treated the same as omitting the argument, so we include it only when valid.
training_args.model_init_kwargs["device_map"] = get_kbit_device_map()
training_args.model_init_kwargs["quantization_config"] = quantization_config
################
# Dataset
################
dataset = load_dataset(
script_args.dataset_name, split=script_args.dataset_train_split
)
# Limit to 5k samples for faster training
if len(dataset) > 5000:
dataset = dataset.select(range(5000))
def make_conversation(example):
prompt = [{"role": "user", "content": example["problem"]}]
return {"prompt": prompt}
dataset = dataset.map(make_conversation)
# Debug: Print first example to verify prompt structure
print("First dataset example:")
print(dataset[0])
# Remove unnecessary columns
columns_to_remove = [
col for col in dataset.column_names if col not in ["prompt", "solution"]
]
if columns_to_remove:
dataset = dataset.remove_columns(columns_to_remove)
################
# Tokenizer
################
tokenizer = AutoTokenizer.from_pretrained(
model_args.model_name_or_path,
padding_side="left",
trust_remote_code=model_args.trust_remote_code,
)
if tokenizer.chat_template is None:
tokenizer.chat_template = SIMPLE_CHAT_TEMPLATE
if tokenizer.pad_token_id is None:
tokenizer.pad_token = tokenizer.eos_token
################
# Training
################
if training_args.run_name is None:
now = datetime.now().strftime("%Y%m%d_%H%M%S")
training_args.run_name = (
f"online-dpo-lora-{model_args.model_name_or_path.split('/')[-1]}-{now}"
)
training_args.report_to = ["trackio"]
trainer = OnlineDPOTrainer(
model=model_args.model_name_or_path,
args=training_args,
reward_funcs=[strip_reasoning_accuracy_reward],
train_dataset=dataset,
eval_dataset=None,
processing_class=tokenizer,
peft_config=get_peft_config(model_args),
)
# Add BEMA callback for reference model updating if available
if BEMA_AVAILABLE:
print("Adding PEFT-compatible BEMA callback for dynamic reference model updating...")
bema_callback = PEFTBEMACallback(
update_freq=10, # Update BEMA weights every 10 steps
ema_power=0.5, # Power for EMA decay factor (κ in paper)
bias_power=0.2, # Power for BEMA scaling factor (η in paper)
lag=5, # Initial offset for smoothness (ρ in paper)
update_after=20, # Start BEMA after 20 steps burn-in (τ in paper)
multiplier=1.0, # Initial EMA decay factor (γ in paper)
min_ema_multiplier=0.0, # Minimum EMA multiplier
device="cpu", # Use CPU for BEMA buffers to avoid OOM
)
trainer.add_callback(bema_callback)
print("BEMA callback added for dynamic reference model updating.")
print("With LoRA: Policy = BEMA(base) + LoRA, Reference = BEMA(base)")
print("The base model will be updated with smoothed weights every 50 steps after step 50.")
else:
print("BEMA callback not available. Training with fixed reference model.")
trainer.train()
# Save and push to hub
trainer.save_model(training_args.output_dir)
if training_args.push_to_hub:
trainer.push_to_hub(dataset_name=script_args.dataset_name)

View File

@ -90,26 +90,32 @@ vlm = [
"num2words==0.5.14"
]
dev = [
# bco
"scikit-learn",
"joblib",
# deepspeed
"deepspeed>=0.14.4",
# judges
"openai>=1.23.2",
"llm-blender>=0.0.2",
# liger
"liger-kernel>=0.6.2",
# peft
"peft>=0.8.0",
# quality
"pre-commit",
"hf-doc-builder",
# quantization
"bitsandbytes",
# scikit: included in bco
# test
"parameterized",
"pytest-cov",
"pytest-rerunfailures==15.1",
"pytest-xdist",
"pytest",
"vllm==0.10.2",
"fastapi",
"pydantic",
"requests",
"uvicorn",
# vllm: not included in dev by default due to CUDA error; see GH-4228
# vlm
"Pillow",
"torchvision",
"num2words==0.5.14"

28
run.sh Normal file
View File

@ -0,0 +1,28 @@
accelerate launch --config_file examples/accelerate_configs/multi_gpu.yaml online_dpo.py \
--model_name_or_path Qwen/Qwen3-0.6B \
--dataset_name HuggingFaceH4/OpenR1-Math-220k-default-verified \
--output_dir online-dpo-lora-qwen3-0.6b \
--learning_rate 1.0e-6 \
--lr_scheduler_type cosine \
--warmup_ratio 0.0 \
--max_grad_norm 1.0 \
--beta 0.1 \
--max_length 2048 \
--max_new_tokens 2048 \
--gradient_accumulation_steps 8 \
--per_device_train_batch_size 1 \
--num_train_epochs 1 \
--use_peft \
--lora_r 1 \
--lora_alpha 32 \
--lora_dropout 0.0 \
--lora_target_modules all-linear \
--use_vllm \
--vllm_mode colocate \
--vllm_gpu_memory_utilization 0.4 \
--save_strategy steps \
--save_steps 50 \
--save_total_limit 1 \
--logging_steps 1 \
--max_steps 200 \
--report_to trackio

View File

@ -1,158 +0,0 @@
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import logging
import os
from datetime import date
from tabulate import tabulate
MAX_LEN_MESSAGE = 2900 # slack endpoint has a limit of 3001 characters
parser = argparse.ArgumentParser()
parser.add_argument("--slack_channel_name", default="trl-push-examples-ci")
parser.add_argument("--text_file_name", required=True)
def main(text_file_name, slack_channel_name=None):
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
message = ""
if os.path.isfile(text_file_name):
final_results = {}
try:
with open(text_file_name) as file:
for line in file:
result, config_name = line.strip().split(",")
config_name = config_name.split("/")[-1].split(".yaml")[0]
final_results[config_name] = int(result)
except Exception as e:
logger.error(f"Error reading file {text_file_name}: {str(e)}")
final_results = {}
no_error_payload = {
"type": "section",
"text": {
"type": "plain_text",
"text": "🌞 There were no failures on the example tests!"
if not len(final_results) == 0
else "Something went wrong there is at least one empty file - please check GH action results.",
"emoji": True,
},
}
total_num_failed = sum(final_results.values())
else:
no_error_payload = {
"type": "section",
"text": {
"type": "plain_text",
"text": "❌ Something is wrong with the workflow please check ASAP!"
"Something went wrong there is no text file being produced. Please check ASAP.",
"emoji": True,
},
}
total_num_failed = 0
test_type_name = text_file_name.replace(".txt", "").replace("temp_results_", "").replace("_", " ").title()
payload = [
{
"type": "header",
"text": {
"type": "plain_text",
"text": "🤗 Results of the {} TRL {} example tests.".format(
os.environ.get("TEST_TYPE", ""), test_type_name
),
},
},
]
if total_num_failed > 0:
message += f"{total_num_failed} failed tests for example tests!"
for test_name, failed in final_results.items():
failed_table = tabulate(
[[test_name, "" if not failed else ""]],
headers=["Test Name", "Status"],
showindex="always",
tablefmt="grid",
maxcolwidths=[12],
)
message += "\n```\n" + failed_table + "\n```"
print(f"### {message}")
else:
payload.append(no_error_payload)
if os.environ.get("TEST_TYPE", "") != "":
try:
from slack_sdk import WebClient
except ImportError:
logger.error("slack_sdk is not installed. Please install it to use Slack integration.")
return
if len(message) > MAX_LEN_MESSAGE:
print(f"Truncating long message from {len(message)} to {MAX_LEN_MESSAGE}")
message = message[:MAX_LEN_MESSAGE] + "..."
if len(message) != 0:
md_report = {
"type": "section",
"text": {"type": "mrkdwn", "text": message},
}
payload.append(md_report)
action_button = {
"type": "section",
"text": {"type": "mrkdwn", "text": "*For more details:*"},
"accessory": {
"type": "button",
"text": {"type": "plain_text", "text": "Check Action results", "emoji": True},
"url": f"https://github.com/huggingface/trl/actions/runs/{os.environ['GITHUB_RUN_ID']}",
},
}
payload.append(action_button)
date_report = {
"type": "context",
"elements": [
{
"type": "plain_text",
"text": f"On Push - main {os.environ.get('TEST_TYPE')} test results for {date.today()}",
},
],
}
payload.append(date_report)
print(payload)
try:
client = WebClient(token=os.environ.get("SLACK_API_TOKEN"))
response = client.chat_postMessage(channel=f"#{slack_channel_name}", text=message, blocks=payload)
if response["ok"]:
logger.info("Message sent successfully to Slack.")
else:
logger.error(f"Failed to send message to Slack: {response['error']}")
except Exception as e:
logger.error(f"Error sending message to Slack: {str(e)}")
if __name__ == "__main__":
args = parser.parse_args()
main(args.text_file_name, args.slack_channel_name)

35
tests/conftest.py Normal file
View File

@ -0,0 +1,35 @@
# Copyright 2020-2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gc
import pytest
import torch
@pytest.fixture(autouse=True)
def cleanup_gpu():
"""
Automatically cleanup GPU memory after each test.
This fixture helps prevent CUDA out of memory errors when running tests in parallel
with pytest-xdist by ensuring models and tensors are properly garbage collected
and GPU memory caches are cleared between tests.
"""
yield
# Cleanup after test
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
torch.cuda.synchronize()
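# Hedged usage note: with this fixture active, parallel runs such as `pytest -n auto tests/`
# (pytest-xdist) are less likely to hit CUDA out-of-memory errors, since every test releases its
# cached GPU memory on exit.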

View File

@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import time
import pytest
@ -59,6 +60,9 @@ class TestJudges(TrlTestCase):
raise ValueError("Failed to load PairRMJudge")
@require_llm_blender
@pytest.mark.skipif(
sys.version_info[:3] == (3, 13, 8), reason="Python 3.13.8 has a bug in inspect.BlockFinder (cpython GH-139783)"
)
def test_pair_rm_judge(self):
judge = self.load_pair_rm_judge()
prompts, completions = self._get_prompts_and_pairwise_completions()
@ -68,6 +72,9 @@ class TestJudges(TrlTestCase):
assert ranks == [0, 1]
@require_llm_blender
@pytest.mark.skipif(
sys.version_info[:3] == (3, 13, 8), reason="Python 3.13.8 has a bug in inspect.BlockFinder (cpython GH-139783)"
)
def test_pair_rm_judge_return_scores(self):
judge = self.load_pair_rm_judge()
prompts, completions = self._get_prompts_and_pairwise_completions()

View File

@ -32,7 +32,15 @@ from .testing_utils import TrlTestCase, ignore_warnings, require_bitsandbytes, r
if is_peft_available():
from peft import LoraConfig, PeftModel, PromptEncoderConfig, TaskType, get_peft_model
from peft import (
LoraConfig,
PeftModel,
PrefixTuningConfig,
PromptEncoderConfig,
PromptTuningConfig,
TaskType,
get_peft_model,
)
class TestDFTLoss(TrlTestCase):
@ -453,7 +461,7 @@ class TestSFTTrainer(TrlTestCase):
assert not torch.allclose(param, new_param), f"Parameter {n} has not changed"
@require_peft
def test_train_dense_with_peft_config(self):
def test_train_dense_with_peft_config_lora(self):
# Get the base model parameter names
model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
model = AutoModelForCausalLM.from_pretrained(model_id)
@ -489,6 +497,66 @@ class TestSFTTrainer(TrlTestCase):
elif "base_layer" not in n: # We expect the peft parameters to be different (except for the base layer)
assert not torch.allclose(param, new_param), f"Parameter {n} has not changed"
@parameterized.expand(
[
("prompt_tuning",),
("prefix_tuning",),
("prompt_encoder",),
]
)
@require_peft
def test_train_with_peft_config_prompt_tuning(self, peft_type):
# Get the base model parameter names
model_id = "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5"
model = AutoModelForCausalLM.from_pretrained(model_id)
base_param_names = [f"base_model.{n}" for n, _ in model.named_parameters()]
# Get the dataset
dataset = load_dataset("trl-internal-testing/zen", "standard_language_modeling", split="train")
# Initialize the trainer; prompt-based tuning doesn't support gradient checkpointing
training_args = SFTConfig(bf16=False, output_dir=self.tmp_dir, report_to="none", gradient_checkpointing=False)
if peft_type == "prompt_tuning":
peft_config = PromptTuningConfig(
task_type=TaskType.CAUSAL_LM,
num_virtual_tokens=4,
tokenizer_name_or_path="trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
)
elif peft_type == "prefix_tuning":
peft_config = PrefixTuningConfig(
task_type=TaskType.CAUSAL_LM,
num_virtual_tokens=4,
)
elif peft_type == "prompt_encoder":
peft_config = PromptEncoderConfig(
task_type=TaskType.CAUSAL_LM,
num_virtual_tokens=4,
encoder_hidden_size=model.config.hidden_size, # This will be overwritten below
)
trainer = SFTTrainer(
model=model_id,
args=training_args,
train_dataset=dataset,
peft_config=peft_config,
)
# Save the initial parameters to compare them later
previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}
# Train the model
trainer.train()
# Check that the training loss is not None
assert trainer.state.log_history[-1]["train_loss"] is not None
# Check the peft params have changed and the base model params have not changed
for n, param in previous_trainable_params.items():
new_param = trainer.model.get_parameter(n)
if n in base_param_names: # We expect the base model parameters to be the same
assert torch.allclose(param, new_param), f"Parameter {n} has changed"
else: # We expect the peft parameters to be different
assert not torch.allclose(param, new_param), f"Parameter {n} has not changed"
@require_peft
def test_train_moe_with_peft_config(self):
# Get the base model parameter names
@ -1373,6 +1441,38 @@ class TestSFTTrainer(TrlTestCase):
new_param = trainer.model.get_parameter(n)
assert not torch.allclose(param, new_param, rtol=1e-12, atol=1e-12), f"Param {n} is not updated"
# Special case for Gemma, as it uses token_type_ids, and we need to ensure they are properly handled in the collator.
@require_vision
def test_train_vlm_prompt_completion_gemma(self):
# Get the dataset
dataset = load_dataset("trl-internal-testing/zen-image", "conversational_prompt_completion", split="train")
# Initialize the trainer
training_args = SFTConfig(
output_dir=self.tmp_dir,
max_length=None, # For VLMs, truncating can remove image tokens, leading to errors
report_to="none",
)
trainer = SFTTrainer(
model="trl-internal-testing/tiny-Gemma3ForConditionalGeneration",
args=training_args,
train_dataset=dataset,
)
# Save the initial parameters to compare them later
previous_trainable_params = {n: param.clone() for n, param in trainer.model.named_parameters()}
# Train the model
trainer.train()
# Check that the training loss is not None
assert trainer.state.log_history[-1]["train_loss"] is not None
# Check the params have changed
for n, param in previous_trainable_params.items():
new_param = trainer.model.get_parameter(n)
assert not torch.allclose(param, new_param, rtol=1e-12, atol=1e-12), f"Param {n} is not updated"
# Gemma 3n uses a timm encoder, making it difficult to create a smaller variant for testing.
# To ensure coverage, we run tests on the full model but mark them as slow to exclude from default runs.
@pytest.mark.slow

View File

@ -16,6 +16,7 @@ import functools
import random
import signal
import warnings
from collections.abc import Callable
import psutil
import pytest
@ -73,7 +74,7 @@ class TrlTestCase:
self.tmp_dir = str(tmp_path)
def ignore_warnings(message: str = None, category: type[Warning] = Warning) -> callable:
def ignore_warnings(message: str = None, category: type[Warning] = Warning) -> Callable:
"""
Decorator to ignore warnings with a specific message and/or category.

View File

@ -13,7 +13,6 @@
# limitations under the License.
from importlib.metadata import PackageNotFoundError, version
from pathlib import Path
from typing import TYPE_CHECKING
from .import_utils import _LazyModule

View File

@ -15,7 +15,7 @@
import contextlib
import functools
import time
from collections.abc import Generator
from collections.abc import Callable, Generator
from transformers import Trainer
from transformers.integrations import is_mlflow_available, is_wandb_available
@ -68,12 +68,12 @@ def profiling_context(trainer: Trainer, name: str) -> Generator[None, None, None
mlflow.log_metrics(profiling_metrics, step=trainer.state.global_step)
def profiling_decorator(func: callable) -> callable:
def profiling_decorator(func: Callable) -> Callable:
"""
Decorator to profile a function and log execution time using [`extras.profiling.profiling_context`].
Args:
func (`callable`):
func (`Callable`):
Function to be profiled.
Example:

View File

@ -219,7 +219,6 @@ class OffloadActivations(saved_tensors_hooks):
verify_sufficient_virtual_memory()
self.is_first_backward_call = False
self.is_first_forward_call = True
if unpack_tensor_id not in self.tracker:
raise ValueError(f"Untracked tensor with id {unpack_tensor_id}")
@ -231,6 +230,9 @@ class OffloadActivations(saved_tensors_hooks):
# clear tensor from tracking
del self.tracker[unpack_tensor_id]
# Only set is_first_forward_call to True when all tensors have been unpacked
if len(self.tracker) == 0:
self.is_first_forward_call = True
return maybe_accelerator_tensor
def unpack_tensor_with_streams(unpack_tensor_id: int) -> torch.Tensor:
@ -254,7 +256,6 @@ class OffloadActivations(saved_tensors_hooks):
verify_sufficient_virtual_memory()
self.is_first_backward_call = False
self.is_first_forward_call = True
if unpack_tensor_id not in self.tracker:
raise ValueError(f"untracked tensor with id {unpack_tensor_id}")
@ -359,6 +360,9 @@ class OffloadActivations(saved_tensors_hooks):
# clear tensor from tracking
del self.tracker[unpack_tensor_id]
# Only set is_first_forward_call to True when all tensors have been unpacked
if len(self.tracker) == 0:
self.is_first_forward_call = True
return maybe_accelerator_tensor
unpack_tensor = unpack_tensor_with_streams if self.use_streams else unpack_tensor_single_stream

View File

@ -16,6 +16,7 @@ import inspect
import os
import random
import textwrap
import warnings
from collections import defaultdict
from contextlib import contextmanager, nullcontext
from operator import itemgetter
@ -360,6 +361,13 @@ class BCOTrainer(BaseTrainer):
embedding_func: Optional[Callable] = None,
embedding_tokenizer: Optional[PreTrainedTokenizerBase] = None,
):
if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
warnings.warn(
"This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
"it and want it to remain, please share your comments here: "
"https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
"TRL_EXPERIMENTAL_SILENCE=1."
)
if embedding_func is not None and not (is_sklearn_available() and is_joblib_available()):
raise ImportError(
"BCOTrainer with UDM requires the scikit-learn and joblib libraries. Please install it with `pip install scikit-learn joblib`."

View File

@ -14,6 +14,7 @@
import logging
import os
from collections.abc import Callable
from typing import Optional, Union
import pandas as pd
@ -583,7 +584,7 @@ class WeaveCallback(TrainerCallback):
self,
trainer: Trainer,
project_name: Optional[str] = None,
scorers: Optional[dict[str, callable]] = None,
scorers: Optional[dict[str, Callable]] = None,
generation_config: Optional[GenerationConfig] = None,
num_prompts: Optional[int] = None,
dataset_name: str = "eval_dataset",

View File

@ -13,8 +13,10 @@
# limitations under the License.
import inspect
import os
import random
import textwrap
import warnings
from collections import defaultdict
from contextlib import nullcontext
from pathlib import Path
@ -142,6 +144,13 @@ class CPOTrainer(BaseTrainer):
peft_config: Optional[dict] = None,
compute_metrics: Optional[Callable[[EvalLoopOutput], dict]] = None,
):
if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
warnings.warn(
"This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
"it and want it to remain, please share your comments here: "
"https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
"TRL_EXPERIMENTAL_SILENCE=1."
)
if args.model_init_kwargs is None:
model_init_kwargs = {}
elif not isinstance(model, str):

View File

@ -12,8 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import random
import textwrap
import warnings
from typing import Any, Callable, Optional, Union
import torch
@ -38,11 +40,7 @@ from ..models import prepare_deepspeed
from ..models.utils import unwrap_model_for_generation
from .gkd_config import GKDConfig
from .sft_trainer import SFTTrainer
from .utils import (
DataCollatorForChatML,
disable_dropout_in_model,
empty_cache,
)
from .utils import DataCollatorForChatML, disable_dropout_in_model, empty_cache
if is_peft_available():
@ -127,6 +125,13 @@ class GKDTrainer(SFTTrainer):
peft_config: Optional["PeftConfig"] = None,
formatting_func: Optional[Callable] = None,
):
if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
warnings.warn(
"This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
"it and want it to remain, please share your comments here: "
"https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
"TRL_EXPERIMENTAL_SILENCE=1."
)
# Ensure Trainer does not drop non-signature columns used by the collator (e.g., "prompts")
args.remove_unused_columns = False
# Respect a user-provided data_collator; otherwise, provide a ChatML collator that

View File

@ -13,8 +13,10 @@
# limitations under the License.
import inspect
import os
import random
import textwrap
import warnings
from collections import defaultdict
from contextlib import contextmanager, nullcontext
from operator import itemgetter
@ -353,6 +355,13 @@ class KTOTrainer(BaseTrainer):
model_adapter_name: Optional[str] = None,
ref_adapter_name: Optional[str] = None,
):
if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
warnings.warn(
"This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
"it and want it to remain, please share your comments here: "
"https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
"TRL_EXPERIMENTAL_SILENCE=1."
)
if type(args) is TrainingArguments:
raise ValueError("Please use `KTOConfig` instead TrainingArguments.")

View File

@ -412,3 +412,10 @@ class OnlineDPOConfig(TrainingArguments):
if hasattr(self.beta, "__len__") and len(self.beta) == 1:
self.beta = self.beta[0]
if self.max_new_tokens >= self.max_length:
warnings.warn(
f"The configuration has `max_new_tokens` ({self.max_new_tokens}) >= `max_length` ({self.max_length}). "
"This will cause prompts to be truncated or completely removed in the forward pass. "
"To preserve prompts, ensure e.g. `max_length > max_new_tokens + 512`. ",
)
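# Illustrative example of the guidance above (values are assumptions): with max_new_tokens=2048,
# setting max_length to at least 2048 + 512 = 2560 leaves room for roughly 512 prompt tokens.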

View File

@ -57,8 +57,13 @@ from ..data_utils import apply_chat_template, is_conversational, maybe_apply_cha
from ..extras.profiling import profiling_context
from ..extras.vllm_client import VLLMClient
from ..import_utils import is_vllm_available
from ..models import create_reference_model, prepare_peft_model
from ..models.utils import unwrap_model_for_generation
from ..models import (
create_reference_model,
prepare_deepspeed,
prepare_fsdp,
prepare_peft_model,
unwrap_model_for_generation,
)
from .base_trainer import BaseTrainer
from .judges import BasePairwiseJudge
from .online_dpo_config import OnlineDPOConfig
@ -69,7 +74,6 @@ from .utils import (
empty_cache,
ensure_master_addr_port,
pad,
prepare_deepspeed,
truncate_right,
)
@ -206,6 +210,13 @@ class OnlineDPOTrainer(BaseTrainer):
reward_model: Optional[Union[PreTrainedModel, nn.Module]] = None,
reward_processing_class: Optional[PreTrainedTokenizerBase] = None,
) -> None:
if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
warnings.warn(
"This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
"it and want it to remain, please share your comments here: "
"https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
"TRL_EXPERIMENTAL_SILENCE=1."
)
if ref_model is model:
raise ValueError(
"`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the "
@ -581,24 +592,20 @@ class OnlineDPOTrainer(BaseTrainer):
generation_kwargs = {k: v for k, v in generation_kwargs.items() if v is not None}
self.generation_config = GenerationConfig(**generation_kwargs)
if self.is_deepspeed_enabled:
if self.ref_model is not None:
self.ref_model = prepare_deepspeed(
self.ref_model, args.per_device_train_batch_size, args.fp16, args.bf16
)
# Prepare reward function models for DeepSpeed
if self.reward_funcs is not None:
for i, reward_func in enumerate(self.reward_funcs):
if isinstance(reward_func, PreTrainedModel):
if self.ref_model is not None:
if self.is_deepspeed_enabled:
self.ref_model = prepare_deepspeed(self.ref_model, self.accelerator)
elif self.is_fsdp_enabled:
self.ref_model = prepare_fsdp(self.ref_model, self.accelerator)
else:
self.ref_model = self.accelerator.prepare_model(self.ref_model, evaluation_mode=True)
if self.reward_funcs is not None:
for i, reward_func in enumerate(self.reward_funcs):
if isinstance(reward_func, PreTrainedModel):
if self.is_deepspeed_enabled:
self.reward_funcs[i] = prepare_deepspeed(reward_func, self.accelerator)
else:
if self.ref_model is not None:
self.ref_model = self.ref_model.to(self.accelerator.device)
# Prepare reward function models for FSDP/regular training
if self.reward_funcs is not None:
for i, reward_func in enumerate(self.reward_funcs):
if isinstance(reward_func, PreTrainedModel):
# Set device placement to True to make `prepare_model` move `reward_func` to device when using fsdp
else:
# set device placement to True to make `prepare_model` move `reward_func` to device when using fsdp
self.reward_funcs[i] = self.accelerator.prepare_model(
reward_func, evaluation_mode=True, device_placement=True
)
@ -826,8 +833,10 @@ class OnlineDPOTrainer(BaseTrainer):
def _generate_vllm_colocate(self, prompts, images=None):
"""Generate completions using vLLM colocate mode"""
# Update model weights if needed
self._move_model_to_vllm()
# Update model weights if needed - only after gradient accumulation completes
if self.state.global_step != self._last_loaded_step:
self._move_model_to_vllm()
self._last_loaded_step = self.state.global_step
# Apply chat template if conversational
if is_conversational({"prompt": prompts[0]}):
@ -1227,10 +1236,12 @@ class OnlineDPOTrainer(BaseTrainer):
# Get the logprobs of the completions from the model
output = model(prompt_completion_ids, **model_kwargs)
# There is 1 offset, because the model predict the next token
# There is 1 offset, because the model predicts the next token
prompt_len = prompt_ids.size(1)
start_idx = prompt_len - 1 if prompt_len > 0 else 0
logits = output.logits[:, start_idx:-1]
# Only slice off the last logit when we have a prompt, otherwise we need all logits
end_idx = -1 if prompt_len > 0 else None
logits = output.logits[:, start_idx:end_idx]
# Take the completion tokens logprob
logprobs = torch.take_along_dim(logits.log_softmax(dim=-1), completion_ids.unsqueeze(-1), dim=2).squeeze(-1)

View File

@ -13,8 +13,10 @@
# limitations under the License.
import inspect
import os
import random
import textwrap
import warnings
from collections import defaultdict
from contextlib import nullcontext
from pathlib import Path
@ -144,6 +146,13 @@ class ORPOTrainer(BaseTrainer):
peft_config: Optional[dict] = None,
compute_metrics: Optional[Callable[[EvalLoopOutput], dict]] = None,
):
if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
warnings.warn(
"This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
"it and want it to remain, please share your comments here: "
"https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
"TRL_EXPERIMENTAL_SILENCE=1."
)
if args.model_init_kwargs is None:
model_init_kwargs = {}
elif not isinstance(model, str):

View File

@ -17,6 +17,7 @@ import math
import os
import textwrap
import time
import warnings
from collections import defaultdict
from contextlib import contextmanager, nullcontext
from pathlib import Path
@ -159,6 +160,13 @@ class PPOTrainer(BaseTrainer):
callbacks: Optional[list[TrainerCallback]] = None,
peft_config: Optional["PeftConfig"] = None,
) -> None:
if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
warnings.warn(
"This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
"it and want it to remain, please share your comments here: "
"https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
"TRL_EXPERIMENTAL_SILENCE=1."
)
if ref_model is model:
raise ValueError(
"`model` and `ref_model` cannot be the same object. If you want `ref_model` to be the "

View File

@ -12,7 +12,9 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import textwrap
import warnings
from itertools import chain
from pathlib import Path
from typing import Callable, Optional, Union
@ -117,6 +119,13 @@ class PRMTrainer(BaseTrainer):
preprocess_logits_for_metrics: Optional[Callable[[torch.Tensor, torch.Tensor], torch.Tensor]] = None,
peft_config: Optional[dict] = None,
):
if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
warnings.warn(
"This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
"it and want it to remain, please share your comments here: "
"https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
"TRL_EXPERIMENTAL_SILENCE=1."
)
if peft_config is not None or (is_peft_available() and isinstance(model, PeftModel)):
model = prepare_peft_model(model, peft_config, args)

View File

@ -275,6 +275,13 @@ class RLOOTrainer(BaseTrainer):
ref_policy=None,
data_collator=None,
):
if not os.environ.get("TRL_EXPERIMENTAL_SILENCE"):
warnings.warn(
"This trainer will soon be moved to trl.experimental and is a candidate for removal. If you rely on "
"it and want it to remain, please share your comments here: "
"https://github.com/huggingface/trl/issues/4223. Silence this warning by setting environment variable "
"TRL_EXPERIMENTAL_SILENCE=1."
)
# Handle deprecated parameters
if config is not None:
warnings.warn(

View File

@ -61,7 +61,7 @@ from .utils import (
if is_peft_available():
from peft import PeftConfig, PeftModel
from peft import PeftConfig, PeftModel, PeftType
logger = logging.get_logger(__name__)
@ -424,15 +424,26 @@ class DataCollatorForVisionLanguageModeling(DataCollatorMixin):
input_ids = torch.cat((prompt_ids, completion_ids), dim=1)
attention_mask = torch.cat((prompt_mask, completion_mask), dim=1)
completion_mask = torch.cat((torch.zeros_like(prompt_mask), completion_mask), dim=1)
if "token_type_ids" in processed_prompts: # special case for Gemma
prompt_token_type_ids = processed_prompts["token_type_ids"]
completion_token_type_ids = processed_completions["token_type_ids"]
token_type_ids = torch.cat((prompt_token_type_ids, completion_token_type_ids), dim=1)
# Flush left to reduce padding
attention_mask, input_ids, completion_mask = flush_left(attention_mask, input_ids, completion_mask)
if "token_type_ids" in processed_prompts:
attention_mask, input_ids, completion_mask, token_type_ids = flush_left(
attention_mask, input_ids, completion_mask, token_type_ids
)
else:
attention_mask, input_ids, completion_mask = flush_left(attention_mask, input_ids, completion_mask)
# Truncate if necessary
if self.max_length is not None:
input_ids = input_ids[:, : self.max_length]
attention_mask = attention_mask[:, : self.max_length]
completion_mask = completion_mask[:, : self.max_length]
if "token_type_ids" in processed_prompts:
token_type_ids = token_type_ids[:, : self.max_length]
# Create labels and mask padding tokens
labels = input_ids.clone()
@ -445,6 +456,8 @@ class DataCollatorForVisionLanguageModeling(DataCollatorMixin):
output["input_ids"] = input_ids
output["attention_mask"] = attention_mask
output["labels"] = labels
if "token_type_ids" in processed_prompts:
output["token_type_ids"] = token_type_ids
return output
@ -1090,13 +1103,15 @@ class SFTTrainer(BaseTrainer):
if not self.args.use_liger_kernel: # liger doesn't return logits
with torch.no_grad():
per_token_entropy = entropy_from_logits(outputs.logits)
# When using Prompt Tuning, skip the virtual tokens in logits before entropy computation, since they
# do not correspond to actual input tokens.
if (
self.num_virtual_tokens > 0
and model.peft_config[model.active_adapter].peft_type != PeftType.PREFIX_TUNING
):
per_token_entropy = per_token_entropy[:, self.num_virtual_tokens :]
if "attention_mask" in inputs:
attention_mask = inputs["attention_mask"]
# When using Prompt Tuning, we need to add attention for the virtual tokens (all set to 1).
virtual_attention_mask = torch.ones(
attention_mask.size(0), self.num_virtual_tokens, device=attention_mask.device
)
attention_mask = torch.cat((virtual_attention_mask, attention_mask), dim=1)
entropy = torch.sum(per_token_entropy * attention_mask) / attention_mask.sum()
elif "position_ids" in inputs:
entropy = torch.mean(per_token_entropy)
@ -1131,9 +1146,12 @@ class SFTTrainer(BaseTrainer):
shift_logits = outputs.logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# When using Prompt Tuning, skip the virtual tokens in logits before accuracy computation, since they do
# not correspond to actual input labels.
shift_logits = shift_logits[:, self.num_virtual_tokens :, :]
# Prompt Tuning and P-Tuning output logits for virtual tokens but Prefix-Tuning does not.
if (
self.num_virtual_tokens > 0
and model.peft_config[model.active_adapter].peft_type != PeftType.PREFIX_TUNING
):
shift_logits = shift_logits[:, self.num_virtual_tokens :, :]
# Get predictions
predictions = shift_logits.argmax(dim=-1)