Mirror of https://github.com/volcengine/verl.git (synced 2025-10-20 13:43:50 +08:00)
[install] fix: revert pyproj.toml and fix tensordict req (#59)

* [megatron] chore: remove unused code
* fix lint
* ci install to user
* test
* fix requirements.txt
* bump tensordict to 0.5
.github/workflows/gpu_test.yml
@@ -25,7 +25,7 @@ jobs:
           fetch-depth: 0
       - name: Install the current repository
         run: |
-          pip install -e .[test]
+          pip install -e .[test] --user
      - name: Running dataset tests
        run: |
          [ ! -d "$HOME/verl-data" ] && git clone --depth 1 https://github.com/eric-haibin-lin/verl-data ~/verl-data
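Note on the change above: `pip install --user` writes packages into the invoking user's site-packages rather than the interpreter-wide location, which avoids permission errors on shared CI runners. A minimal standard-library sketch (illustrative only, not part of this commit) to see where such installs land:

import site
import sys

# Location that `pip install --user` targets, versus the interpreter-wide prefix.
print("user site-packages:", site.getusersitepackages())
print("user site enabled :", site.ENABLE_USER_SITE)
print("interpreter prefix:", sys.prefix)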
pyproject.toml
@@ -1,3 +0,0 @@
-[build-system]
-requires = ["setuptools"]
-build-backend = "setuptools.build_meta"
requirements.txt
@@ -8,5 +8,5 @@ hydra-core
 numpy
 pybind11
 ray==2.10
-tensordict
-transformers
+tensordict < 0.6
+transformers
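Note on the pin above: constraining `tensordict < 0.6` keeps the resolver on the 0.5.x line mentioned in the commit message. A small illustrative check that the installed version satisfies the same specifier (assumes the third-party `packaging` distribution is available; not part of this commit):

from importlib.metadata import version

from packaging.specifiers import SpecifierSet
from packaging.version import Version

# Hypothetical sanity check mirroring the "tensordict < 0.6" pin in requirements.txt.
installed = Version(version("tensordict"))
if installed not in SpecifierSet("<0.6"):
    raise RuntimeError(f"tensordict {installed} does not satisfy '<0.6'")
print(f"tensordict {installed} satisfies '<0.6'")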
@@ -103,74 +103,6 @@ class MegatronPPOCritic(BasePPOCritic):

         return values

-    def compute_advantages(self, data: DataProto) -> DataProto:
-        # data.batch = data.batch.to(self.critic_module.device)
-        # TODO: in general, we should compute reward of ref_log_prob here
-        responses = data.batch['responses']
-        response_length = responses.size(1)
-        token_level_rewards = data.batch['token_level_rewards']
-        batch_size = data.batch.batch_size[0]
-        dp_size = mpu.get_data_parallel_world_size()
-        attention_mask = data.batch['attention_mask']
-        eos_mask = attention_mask[:, -response_length:]
-
-        # compute kl between ref_policy and current policy
-        if 'ref_log_prob' in data.batch.keys():
-            kld = core_algos.kl_penalty(data.batch['old_log_probs'],
-                                        data.batch['ref_log_prob'],
-                                        kl_penalty=self.config.kl_ctrl.kl_penalty_type)  # (batch_size, response_length)
-            kld = kld * eos_mask
-            beta = self.kl_ctrl.value
-            rewards = token_level_rewards - beta * kld
-
-            # per token kld
-            current_kl = masked_mean(kld, mask=eos_mask).item()
-            # according to https://github.com/huggingface/trl/blob/951ca1841f29114b969b57b26c7d3e80a39f75a0/trl/trainer/ppo_trainer.py#L837
-            self.kl_ctrl.update(current_kl=current_kl, n_steps=batch_size * dp_size)
-        else:
-            beta = 0
-            current_kl = 0
-            rewards = token_level_rewards
-
-        values = data.batch['values']
-
-        gamma = self.config.gamma
-        lam = self.config.lam
-
-        advantages, returns = core_algos.compute_gae_advantage_return(token_level_rewards=rewards,
-                                                                      values=values,
-                                                                      eos_mask=eos_mask,
-                                                                      gamma=gamma,
-                                                                      lam=lam)
-        data.batch['advantages'] = advantages
-        data.batch['returns'] = returns
-
-        sequence_reward = torch.sum(token_level_rewards, dim=-1)
-
-        metrics = {
-            'critic/rewards/mean': torch.mean(sequence_reward).detach().item(),
-            'critic/rewards/max': torch.max(sequence_reward[eos_mask]).detach().item(),
-            'critic/rewards/min': torch.min(sequence_reward[eos_mask]).detach().item(),
-            'critic/advantages/mean': masked_mean(advantages, eos_mask).detach().item(),
-            'critic/advantages/max': torch.max(advantages[eos_mask]).detach().item(),
-            'critic/advantages/min': torch.min(advantages[eos_mask]).detach().item(),
-            'critic/returns/mean': masked_mean(returns, eos_mask).detach().item(),
-            'critic/returns/max': torch.max(returns[eos_mask]).detach().item(),
-            'critic/returns/min': torch.min(returns[eos_mask]).detach().item(),
-            'critic/values/mean': masked_mean(values, eos_mask).detach().item(),
-            'critic/values/max': torch.max(values[eos_mask]).detach().item(),
-            'critic/values/min': torch.min(values[eos_mask]).detach().item(),
-            'critic/kld': current_kl,
-            'critic/kl_coef': beta,
-        }
-
-        data.meta_info['metrics'] = metrics
-
-        # add empty cache after each compute
-        torch.cuda.empty_cache()
-
-        return data
-
     def make_minibatch_iterator(self, data: DataProto) -> Iterable[DataProto]:
         select_keys = ['input_ids', 'responses', 'attention_mask', 'position_ids', 'values', 'returns']
         data = data.select(batch_keys=select_keys)
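For readers tracing what the removed compute_advantages did: it turned token-level rewards into KL-penalized rewards (subtracting beta times the per-token KL against the reference policy) and then ran Generalized Advantage Estimation over the response tokens. The self-contained sketch below reimplements that recurrence in plain PyTorch under assumed (batch, response_length) shapes; it is an illustration, not verl's core_algos.compute_gae_advantage_return, and the whitening details may differ from the library's implementation.

import torch


def kl_penalized_rewards(token_level_rewards, log_probs, ref_log_probs, beta, mask):
    # Simple per-token KL estimate (log pi - log pi_ref), masked to response tokens,
    # subtracted from the environment reward with coefficient beta.
    kld = (log_probs - ref_log_probs) * mask
    return token_level_rewards - beta * kld, kld


def gae_advantage_return(rewards, values, mask, gamma=1.0, lam=0.95):
    # Backward GAE recurrence over the response dimension; all inputs are
    # (batch, response_length) float tensors and mask is 1.0 on valid tokens.
    advantages = torch.zeros_like(rewards)
    last_gae = torch.zeros(rewards.size(0), device=rewards.device)
    next_value = torch.zeros(rewards.size(0), device=rewards.device)
    for t in reversed(range(rewards.size(1))):
        delta = rewards[:, t] + gamma * next_value - values[:, t]
        last_gae = delta + gamma * lam * last_gae
        advantages[:, t] = last_gae
        next_value = values[:, t]
    returns = (advantages + values) * mask
    # whiten advantages over valid tokens only
    n = mask.sum().clamp(min=1.0)
    mean = (advantages * mask).sum() / n
    var = (((advantages - mean) * mask) ** 2).sum() / n
    advantages = (advantages - mean) / (var.sqrt() + 1e-8) * mask
    return advantages, returns


if __name__ == "__main__":
    batch, resp_len = 2, 5
    mask = torch.ones(batch, resp_len)
    rewards, _ = kl_penalized_rewards(torch.randn(batch, resp_len),
                                      torch.randn(batch, resp_len),
                                      torch.randn(batch, resp_len),
                                      beta=0.05, mask=mask)
    adv, ret = gae_advantage_return(rewards, torch.randn(batch, resp_len), mask)
    print(adv.shape, ret.shape)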