# verl/verl/trainer/config/critic/critic.yaml

# Dataclass that verl.utils.omega_conf_to_dataclass instantiates from this config
_target_: verl.workers.config.CriticConfig

# Number of rollouts per update.
# Read from actor_rollout_ref.rollout.n; falls back to 1 when that key is not set.
rollout_n: '${oc.select:actor_rollout_ref.rollout.n,1}'

# Training strategy for the critic model (fsdp or fsdp2).
# '???' is the OmegaConf mandatory-value marker: no default is provided here.
strategy: '???'

# Whether the critic worker is enabled.
# null keeps the default rule: enabled only if the advantage estimator is gae.
# Set it to True manually to always enable the critic worker.
enable: null

# Optimizer settings for the critic
optim:

  # Learning rate.
  # Written with a decimal point so that YAML 1.1 loaders (e.g. PyYAML) also
  # parse it as a float rather than the string "1e-5".
  lr: 1.0e-5

  # Warmup steps ratio; total steps will be injected at runtime
  lr_warmup_steps_ratio: 0.0

  # Total training steps (must be overridden at runtime)
  total_training_steps: -1

  # Weight decay
  weight_decay: 0.01

  # Absolute number of warmup steps; takes priority over lr_warmup_steps_ratio.
  # None, 0 or negative values delegate to lr_warmup_steps_ratio.
  lr_warmup_steps: -1


# Critic model settings
model:

  # Location of the pretrained model weights
  path: '~/models/deepseek-llm-7b-chat'

  # Tokenizer location; taken from actor_rollout_ref.model.path when that key
  # exists, otherwise the quoted fallback path is used
  tokenizer_path: '${oc.select:actor_rollout_ref.model.path,"~/models/deepseek-llm-7b-chat"}'

  # Overrides applied to the Hugging Face config (empty by default)
  override_config: {}

  # Optional external model implementation; follows the actor's setting,
  # null when the actor does not define one
  external_lib: '${oc.select:actor_rollout_ref.model.external_lib,null}'

  # Whether to trust remote code from Hugging Face models; follows the actor's
  # setting, false when the actor does not define one
  trust_remote_code: '${oc.select:actor_rollout_ref.model.trust_remote_code,false}'

# PPO mini-batch size per update; follows the actor's value, 256 if the actor does not set one
ppo_mini_batch_size: '${oc.select:actor_rollout_ref.actor.ppo_mini_batch_size,256}'

# [Deprecated] Global micro batch size
ppo_micro_batch_size: null

# Local per-GPU micro batch size; resolves to the sibling ppo_micro_batch_size
# key above, or null when that key is unavailable
ppo_micro_batch_size_per_gpu: '${oc.select:.ppo_micro_batch_size,null}'

# Whether to automatically adjust batch size at runtime; follows the actor's
# value, false if the actor does not set one
use_dynamic_bsz: '${oc.select:actor_rollout_ref.actor.use_dynamic_bsz,false}'

# Max tokens per GPU in one PPO batch (doubled for critic)
ppo_max_token_len_per_gpu: 32768

# Max token length per GPU in the forward pass; same value as ppo_max_token_len_per_gpu
forward_max_token_len_per_gpu: '${.ppo_max_token_len_per_gpu}'

# Number of PPO epochs per batch; follows the actor's value, 1 if the actor does not set one
ppo_epochs: '${oc.select:actor_rollout_ref.actor.ppo_epochs,1}'

# Shuffle training data across PPO epochs; follows the actor's value, false if
# the actor does not set one
shuffle: '${oc.select:actor_rollout_ref.actor.shuffle,false}'

# Clipping range of the PPO value function
cliprange_value: 0.5

# Loss aggregation mode: "token-mean", "seq-mean-token-sum", or "seq-mean-token-mean".
# Follows the actor's value, token-mean if the actor does not set one.
loss_agg_mode: '${oc.select:actor_rollout_ref.actor.loss_agg_mode,token-mean}'

# Checkpoint settings for the critic
checkpoint:

  # Dataclass instantiated from this section
  _target_: verl.trainer.config.CheckpointConfig

  # Items written into a saved checkpoint.
  # Adding 'hf_model' saves the whole model in hf format; for now only the
  # sharded model checkpoint is used, to save space.
  save_contents:
    - model
    - optimizer
    - extra

  # Items restored when loading a checkpoint; same list as save_contents
  load_contents: '${.save_contents}'

  # Whether to save checkpoints asynchronously. Only effective for Megatron as of now.
  async_save: false

# profile the critic model in `update_critic`
profiler:

  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.utils.profiler.ProfilerConfig

  # profiler tool, default same as profiler.tool in global config
  # (null when global_profiler.tool is not set)
  # choices: nsys, npu, torch
  tool: ${oc.select:global_profiler.tool,null}

  # whether to enable profiling on the critic
  enable: false

  # Whether to profile all ranks.
  all_ranks: false

  # The ranks that will be profiled. [] or [0,1,...]
  ranks: []

  # profile results saving path, default same as global_profiler.save_path
  # (null when that key is not set)
  save_path: ${oc.select:global_profiler.save_path,null}

  # specific tool config, shared with the actor's profiler section.
  # The key is looked up under actor.profiler (where tool_config lives, mirroring
  # this section's layout); oc.select silently yields null if the key is missing.
  # NOTE(review): confirm the actor config defines profiler.tool_config.
  tool_config: ${oc.select:actor_rollout_ref.actor.profiler.tool_config,null}