[install] fix: revert pyproj.toml and fix tensordict req (#59)

* [megatron] chore: remove unused code

* fix lint

* ci install to user

* test

* fix requirements.txt

* bump tensordict to 0.5
HL authored 2024-12-21 10:50:03 -07:00, committed by GitHub
parent f12179e900
commit 09568e60ea
4 changed files with 3 additions and 74 deletions


@@ -25,7 +25,7 @@ jobs:
           fetch-depth: 0
       - name: Install the current repository
         run: |
-          pip install -e .[test]
+          pip install -e .[test] --user
      - name: Running dataset tests
        run: |
          [ ! -d "$HOME/verl-data" ] && git clone --depth 1 https://github.com/eric-haibin-lin/verl-data ~/verl-data
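The only functional change above is the `--user` flag: pip then places the editable install in the per-user site-packages instead of the runner's system prefix, which sidesteps permission issues when the system site-packages is not writable. As a rough illustration (not part of the repo), the two locations can be compared with the standard library:

import site
import sysconfig

# Illustrative check of where `pip install ... --user` lands versus a plain install.
print("user site-packages:  ", site.getusersitepackages())
print("system site-packages:", sysconfig.get_paths()["purelib"])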


@@ -1,3 +0,0 @@
-[build-system]
-requires = ["setuptools"]
-build-backend = "setuptools.build_meta"


@@ -8,5 +8,5 @@ hydra-core
 numpy
 pybind11
 ray==2.10
-tensordict
-transformers
+tensordict < 0.6
+transformers
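The requirement is now pinned to `tensordict < 0.6`, matching the "bump tensordict to 0.5" note in the commit message. For context, a minimal sketch (not from this repo) of the TensorDict batch semantics that structures like `data.batch` in the critic diff below depend on:

import torch
from tensordict import TensorDict

# Keys share a leading batch dimension; batch.batch_size[0] is the local batch
# size, mirroring data.batch['responses'] and data.batch.batch_size[0] below.
batch = TensorDict(
    {
        "responses": torch.randint(0, 100, (4, 16)),
        "token_level_rewards": torch.zeros(4, 16),
        "attention_mask": torch.ones(4, 32, dtype=torch.long),
    },
    batch_size=[4],
)
print(batch.batch_size[0])       # 4
print(batch["responses"].shape)  # torch.Size([4, 16])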

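The hunk below removes the unused MegatronPPOCritic.compute_advantages, which subtracted a KL penalty (weighted by self.kl_ctrl.value) from the token-level rewards and then updated that coefficient adaptively, following the TRL PPOTrainer code linked in its comment. A sketch of such an adaptive controller, in the spirit of TRL's AdaptiveKLController rather than this repo's own implementation:

class AdaptiveKLController:
    """Proportional control of the KL coefficient (Ziegler et al., 2019)."""

    def __init__(self, init_kl_coef: float, target_kl: float, horizon: int):
        self.value = init_kl_coef  # read as self.kl_ctrl.value in the critic
        self.target = target_kl
        self.horizon = horizon

    def update(self, current_kl: float, n_steps: int) -> None:
        # Grow the coefficient when the observed KL overshoots the target and
        # shrink it otherwise; the error is clipped to keep each step small.
        proportional_error = max(min(current_kl / self.target - 1.0, 0.2), -0.2)
        self.value *= 1.0 + proportional_error * n_steps / self.horizon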

@@ -103,74 +103,6 @@ class MegatronPPOCritic(BasePPOCritic):
         return values
 
-    def compute_advantages(self, data: DataProto) -> DataProto:
-        # data.batch = data.batch.to(self.critic_module.device)
-        # TODO: in general, we should compute reward of ref_log_prob here
-        responses = data.batch['responses']
-        response_length = responses.size(1)
-        token_level_rewards = data.batch['token_level_rewards']
-        batch_size = data.batch.batch_size[0]
-        dp_size = mpu.get_data_parallel_world_size()
-        attention_mask = data.batch['attention_mask']
-        eos_mask = attention_mask[:, -response_length:]
-
-        # compute kl between ref_policy and current policy
-        if 'ref_log_prob' in data.batch.keys():
-            kld = core_algos.kl_penalty(data.batch['old_log_probs'],
-                                        data.batch['ref_log_prob'],
-                                        kl_penalty=self.config.kl_ctrl.kl_penalty_type)  # (batch_size, response_length)
-            kld = kld * eos_mask
-            beta = self.kl_ctrl.value
-
-            rewards = token_level_rewards - beta * kld
-
-            # per token kld
-            current_kl = masked_mean(kld, mask=eos_mask).item()
-            # according to https://github.com/huggingface/trl/blob/951ca1841f29114b969b57b26c7d3e80a39f75a0/trl/trainer/ppo_trainer.py#L837
-            self.kl_ctrl.update(current_kl=current_kl, n_steps=batch_size * dp_size)
-        else:
-            beta = 0
-            current_kl = 0
-            rewards = token_level_rewards
-
-        values = data.batch['values']
-        gamma = self.config.gamma
-        lam = self.config.lam
-
-        advantages, returns = core_algos.compute_gae_advantage_return(token_level_rewards=rewards,
-                                                                      values=values,
-                                                                      eos_mask=eos_mask,
-                                                                      gamma=gamma,
-                                                                      lam=lam)
-        data.batch['advantages'] = advantages
-        data.batch['returns'] = returns
-
-        sequence_reward = torch.sum(token_level_rewards, dim=-1)
-
-        metrics = {
-            'critic/rewards/mean': torch.mean(sequence_reward).detach().item(),
-            'critic/rewards/max': torch.max(sequence_reward[eos_mask]).detach().item(),
-            'critic/rewards/min': torch.min(sequence_reward[eos_mask]).detach().item(),
-            'critic/advantages/mean': masked_mean(advantages, eos_mask).detach().item(),
-            'critic/advantages/max': torch.max(advantages[eos_mask]).detach().item(),
-            'critic/advantages/min': torch.min(advantages[eos_mask]).detach().item(),
-            'critic/returns/mean': masked_mean(returns, eos_mask).detach().item(),
-            'critic/returns/max': torch.max(returns[eos_mask]).detach().item(),
-            'critic/returns/min': torch.min(returns[eos_mask]).detach().item(),
-            'critic/values/mean': masked_mean(values, eos_mask).detach().item(),
-            'critic/values/max': torch.max(values[eos_mask]).detach().item(),
-            'critic/values/min': torch.min(values[eos_mask]).detach().item(),
-            'critic/kld': current_kl,
-            'critic/kl_coef': beta,
-        }
-        data.meta_info['metrics'] = metrics
-
-        # add empty cache after each compute
-        torch.cuda.empty_cache()
-        return data
-
     def make_minibatch_iterator(self, data: DataProto) -> Iterable[DataProto]:
         select_keys = ['input_ids', 'responses', 'attention_mask', 'position_ids', 'values', 'returns']
         data = data.select(batch_keys=select_keys)
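The removed method delegated the estimation itself to core_algos.compute_gae_advantage_return(token_level_rewards, values, eos_mask, gamma, lam). For reference, a minimal GAE sketch with the same argument names, assuming (batch, response_length) tensors; it is not the repo's implementation, which may additionally whiten the advantages:

import torch

def compute_gae_advantage_return(token_level_rewards, values, eos_mask, gamma, lam):
    """Generalized Advantage Estimation over response tokens via a reverse scan."""
    with torch.no_grad():
        gen_len = token_level_rewards.shape[-1]
        lastgaelam = torch.zeros_like(values[:, 0])
        advantages_reversed = []
        for t in reversed(range(gen_len)):
            # delta_t = r_t + gamma * V_{t+1} - V_t, with V past the last token taken as 0
            nextvalues = values[:, t + 1] if t < gen_len - 1 else 0.0
            delta = token_level_rewards[:, t] + gamma * nextvalues - values[:, t]
            lastgaelam = delta + gamma * lam * lastgaelam
            advantages_reversed.append(lastgaelam)
        advantages = torch.stack(advantages_reversed[::-1], dim=1)
        returns = advantages + values
        advantages = advantages * eos_mask  # zero out positions past the response
    return advantages, returns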