set use_kl_in_reward=True in reinforce_plus_plus (#2580)

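Set algorithm.use_kl_in_reward=True and actor_rollout_ref.actor.use_kl_loss=False in the reinforce_plus_plus example scripts (run_qwen2-7b_math_rf.sh and run_qwen2-7b_math_rf_baseline.sh).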
2025-10-20 13:43:50 +08:00 · 2025-07-17 12:10:54 +08:00
parent 503ea75f53
commit 64601e418c
2 changed files with 4 additions and 4 deletions
--- a/examples/reinforce_plus_plus_trainer/run_qwen2-7b_math_rf.sh
+++ b/examples/reinforce_plus_plus_trainer/run_qwen2-7b_math_rf.sh
@@ -23,7 +23,7 @@ python3 -m verl.trainer.main_ppo \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=1024 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
-    actor_rollout_ref.actor.use_kl_loss=True \
+    actor_rollout_ref.actor.use_kl_loss=False \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=mse \
    actor_rollout_ref.actor.entropy_coeff=0 \
@@ -37,7 +37,7 @@ python3 -m verl.trainer.main_ppo \
    actor_rollout_ref.rollout.n=8 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
-    algorithm.use_kl_in_reward=False \
+    algorithm.use_kl_in_reward=True \
    trainer.critic_warmup=0 \
    trainer.logger='["console","wandb"]' \
    trainer.project_name='verl_grpo_example_gsm8k' \
--- a/examples/reinforce_plus_plus_trainer/run_qwen2-7b_math_rf_baseline.sh
+++ b/examples/reinforce_plus_plus_trainer/run_qwen2-7b_math_rf_baseline.sh
@@ -23,7 +23,7 @@ python3 -m verl.trainer.main_ppo \
    actor_rollout_ref.model.use_remove_padding=True \
    actor_rollout_ref.actor.ppo_mini_batch_size=1024 \
    actor_rollout_ref.actor.ppo_micro_batch_size_per_gpu=16 \
-    actor_rollout_ref.actor.use_kl_loss=True \
+    actor_rollout_ref.actor.use_kl_loss=False \
    actor_rollout_ref.actor.kl_loss_coef=0.001 \
    actor_rollout_ref.actor.kl_loss_type=mse \
    actor_rollout_ref.actor.entropy_coeff=0 \
@@ -37,7 +37,7 @@ python3 -m verl.trainer.main_ppo \
    actor_rollout_ref.rollout.n=8 \
    actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=16 \
    actor_rollout_ref.ref.fsdp_config.param_offload=True \
-    algorithm.use_kl_in_reward=False \
+    algorithm.use_kl_in_reward=True \
    trainer.critic_warmup=0 \
    trainer.logger='["console","wandb"]' \
    trainer.project_name='verl_grpo_example_gsm8k' \