mirror of
https://github.com/volcengine/verl.git
synced 2025-10-20 13:43:50 +08:00
set use_kl_in_reward=True in reinforce_plus_plus (#2580)
set use_kl_in_reward=True in reinforce_plus_plus
This commit is contained in:
@@ -23,7 +23,7 @@ python3 -m verl.trainer.main_ppo \
|
||||
actor_rollout_ref.model.use_remove_padding=True \
|
||||
actor_rollout_ref.actor.ppo_mini_batch_size=1024 \
|
||||
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
|
||||
- actor_rollout_ref.actor.use_kl_loss=True \
|
||||
+ actor_rollout_ref.actor.use_kl_loss=False \
|
||||
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
||||
actor_rollout_ref.actor.kl_loss_type=mse \
|
||||
actor_rollout_ref.actor.entropy_coeff=0 \
|
||||
@@ -37,7 +37,7 @@ python3 -m verl.trainer.main_ppo \
|
||||
actor_rollout_ref.rollout.n=8 \
|
||||
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
|
||||
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
||||
- algorithm.use_kl_in_reward=False \
|
||||
+ algorithm.use_kl_in_reward=True \
|
||||
trainer.critic_warmup=0 \
|
||||
trainer.logger='["console","wandb"]' \
|
||||
trainer.project_name='verl_grpo_example_gsm8k' \
|
||||
|
@@ -23,7 +23,7 @@ python3 -m verl.trainer.main_ppo \
|
||||
actor_rollout_ref.model.use_remove_padding=True \
|
||||
actor_rollout_ref.actor.ppo_mini_batch_size=1024 \
|
||||
actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
|
||||
- actor_rollout_ref.actor.use_kl_loss=True \
|
||||
+ actor_rollout_ref.actor.use_kl_loss=False \
|
||||
actor_rollout_ref.actor.kl_loss_coef=0.001 \
|
||||
actor_rollout_ref.actor.kl_loss_type=mse \
|
||||
actor_rollout_ref.actor.entropy_coeff=0 \
|
||||
@@ -37,7 +37,7 @@ python3 -m verl.trainer.main_ppo \
|
||||
actor_rollout_ref.rollout.n=8 \
|
||||
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
|
||||
actor_rollout_ref.ref.fsdp_config.param_offload=True \
|
||||
- algorithm.use_kl_in_reward=False \
|
||||
+ algorithm.use_kl_in_reward=True \
|
||||
trainer.critic_warmup=0 \
|
||||
trainer.logger='["console","wandb"]' \
|
||||
trainer.project_name='verl_grpo_example_gsm8k' \
|
||||
|
Reference in New Issue
Block a user