Mirror of https://github.com/huggingface/trl.git
corrects loss function for Self-play Preference Optimization hard label version (#1615)
* corrects sppo hard label version
* formatting
* formatting
committed by GitHub
parent 7075cec94d
commit 75de236c09
@@ -111,7 +111,7 @@ The [KTO](https://arxiv.org/abs/2402.01306) authors directly maximize the utility
 The [BCO](https://arxiv.org/abs/2404.04656) authors train a binary classifier whose logit serves as a reward so that the classifier maps {prompt, chosen completion} pairs to 1 and {prompt, rejected completion} pairs to 0. The `DPOTrainer` can be switched to this loss via the `loss_type="bco_pair"` argument.
-The [SPPO](https://arxiv.org/abs/2405.00675) authors claim that SPPO is capable of solving the Nash equilibrium iteratively by pushing the chosen rewards to be as large as 1/2 and the rejected rewards to be as small as -1/2 and can alleviate data sparsity issues.
+The [SPPO](https://arxiv.org/abs/2405.00675) authors claim that SPPO is capable of solving the Nash equilibrium iteratively by pushing the chosen rewards to be as large as 1/2 and the rejected rewards to be as small as -1/2 and can alleviate data sparsity issues. The implementation using `loss_type="sppo_hard"` approximates this algorithm by employing hard label probabilities, assigning 1 to the winner and 0 to the loser.
 ## Logging
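Before the code changes below, it may help to see the hard-label objective on its own. The following is a minimal, illustrative sketch (not part of TRL's public API) of the loss this commit wires in; it only assumes per-sequence log-probability tensors, and the variable names mirror the trainer hunk further down.

import torch

def sppo_hard_loss(policy_chosen_logps, policy_rejected_logps,
                   reference_chosen_logps, reference_rejected_logps, beta=0.1):
    # Implicit DPO-style rewards are beta * (log pi_theta - log pi_ref).
    # The hard-label SPPO objective pushes the chosen reward toward +1/2 and the
    # rejected reward toward -1/2 (P = 1 for the winner, 0 for the loser), which
    # in log-ratio space means targets of +0.5/beta and -0.5/beta.
    a = policy_chosen_logps - reference_chosen_logps      # chosen log-ratio
    b = policy_rejected_logps - reference_rejected_logps  # rejected log-ratio
    return (a - 0.5 / beta) ** 2 + (b + 0.5 / beta) ** 2

# example call with batch-shaped tensors
pc, pr, rc, rr = (torch.randn(4) for _ in range(4))
print(sppo_hard_loss(pc, pr, rc, rr).shape)  # torch.Size([4])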
@@ -90,8 +90,8 @@ class DPOTrainerTester(unittest.TestCase):
             ["t5", "kto_pair", False],
             ["gpt2", "bco_pair", False],
             ["t5", "bco_pair", True],
-            ["gpt2", "sppo", False],
-            ["t5", "sppo", True],
+            ["gpt2", "sppo_hard", False],
+            ["t5", "sppo_hard", True],
         ]
     )
     def test_dpo_trainer(self, name, loss_type, pre_compute):
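The rows above are (model name, loss_type, precompute-reference-log-probs) triples consumed by a parameterized test; the decorator that feeds them to `test_dpo_trainer` sits just above the hunk and is not shown. A hedged sketch of that shape, assuming a `parameterized.expand`-style decorator (only `DPOTrainerTester`, `test_dpo_trainer`, and the row values come from the diff; the rest is illustrative):

import unittest

from parameterized import parameterized  # assumption: the decorator is not visible in the hunk

class DPOTrainerTester(unittest.TestCase):
    @parameterized.expand(
        [
            ["gpt2", "sppo_hard", False],
            ["t5", "sppo_hard", True],
        ]
    )
    def test_dpo_trainer(self, name, loss_type, pre_compute):
        # each row becomes its own test case: model name, DPO loss_type,
        # and whether reference log-probs are precomputed
        self.assertEqual(loss_type, "sppo_hard")

if __name__ == "__main__":
    unittest.main()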
@@ -69,7 +69,7 @@ class DPOConfig(TrainingArguments):
     beta: float = 0.1
     label_smoothing: float = 0
-    loss_type: Literal["sigmoid", "hinge", "ipo", "kto_pair", "bco_pair", "sppo"] = "sigmoid"
+    loss_type: Literal["sigmoid", "hinge", "ipo", "kto_pair", "bco_pair", "sppo_hard"] = "sigmoid"
     label_pad_token_id: int = -100
     padding_value: int = 0
     truncation_mode: str = "keep_end"
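With "sppo_hard" now part of the `loss_type` literal, switching to the hard-label SPPO loss is a one-line config change. A minimal sketch, assuming the usual pattern of passing a `DPOConfig` to `DPOTrainer` as its training arguments (the output directory is a placeholder, and model/dataset setup is omitted):

from trl import DPOConfig

# beta, label_smoothing and loss_type are the DPOConfig fields shown in the hunk above
training_args = DPOConfig(
    output_dir="./dpo-sppo-hard",  # placeholder path
    beta=0.1,
    loss_type="sppo_hard",
)
# training_args is then handed to DPOTrainer(..., args=training_args, ...);
# the exact trainer signature depends on the installed TRL version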
@@ -1028,14 +1028,15 @@ class DPOTrainer(Trainer):
             losses = -F.logsigmoid((self.beta * chosen_logratios) - delta) - F.logsigmoid(
                 -(self.beta * rejected_logratios - delta)
             )
-        elif self.loss_type == "sppo":
-            a = self.beta * (policy_chosen_logps - reference_chosen_logps)
-            b = self.beta * (policy_rejected_logps - reference_rejected_logps)
+        elif self.loss_type == "sppo_hard":
+            # In the paper (https://arxiv.org/pdf/2405.00675), SPPO employs a soft probability approach, estimated using the PairRM score. The probability calculation is conducted outside of the trainer class. The version described here is the hard probability version, where P in Equation (4.7) of Algorithm 1 is set to 1 for the winner and 0 for the loser.
+            a = policy_chosen_logps - reference_chosen_logps
+            b = policy_rejected_logps - reference_rejected_logps

-            losses = (a - 0.5) ** 2 + (b + 0.5) ** 2
+            losses = (a - 0.5 / self.beta) ** 2 + (b + 0.5 / self.beta) ** 2
         else:
             raise ValueError(
-                f"Unknown loss type: {self.loss_type}. Should be one of ['sigmoid', 'hinge', 'ipo', 'kto_pair', 'bco_pair', 'sppo']"
+                f"Unknown loss type: {self.loss_type}. Should be one of ['sigmoid', 'hinge', 'ipo', 'kto_pair', 'bco_pair', 'sppo_hard']"
             )

         chosen_rewards = (
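The substance of the fix is the `0.5 / self.beta` scaling: `a` and `b` are now raw log-ratios, while the SPPO targets of ±1/2 live in reward space, i.e. after multiplying by `beta`. A quick sanity check (plain Python, illustrative only) showing that the corrected expression is minimized exactly where the implicit rewards reach ±1/2, whereas the previous one was minimized at rewards of ±beta/2:

beta = 0.1
a = 0.5 / beta   # chosen log-ratio whose implicit reward beta * a is exactly +1/2
b = -0.5 / beta  # rejected log-ratio whose implicit reward beta * b is exactly -1/2

old_loss = (a - 0.5) ** 2 + (b + 0.5) ** 2                # minimum sits at rewards of +/- beta/2
new_loss = (a - 0.5 / beta) ** 2 + (b + 0.5 / beta) ** 2  # minimum sits at rewards of +/- 1/2

print(old_loss, new_loss)  # 40.5 0.0 for beta = 0.1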