Mirror of https://github.com/volcengine/verl.git, synced 2025-10-20 13:43:50 +08:00
### What does this PR do?

> Rename `warmup_style` in `FSDPOptimizerConfig` to `lr_scheduler_type` to align with the Hugging Face Trainer API.

The following pull request refactors the optimizer, but the naming issue persists: https://github.com/volcengine/verl/pull/3656

### Checklist Before Starting

- [x] Search for similar PRs. Paste at least one query link here: ...
- [x] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI)
  - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`
  - If this PR involves multiple modules, separate them with `,`, like `[megatron, fsdp, doc]`
  - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test`
  - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title.
  - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching`

### Test

> For changes that cannot be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results like training curve plots, evaluation results, etc.

### API and Usage Example

> Demonstrate how the API changes if any, and provide usage example(s) if possible.

```python
# Add code snippet or script demonstrating how to use this
```

### Design & Code Changes

> Demonstrate the high-level design if this PR is complex, and list the specific changes.

### Checklist Before Submitting

> [!IMPORTANT]
> Please check all the following items before requesting a review, otherwise the reviewer might deprioritize this PR for review.

- [x] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md).
- [x] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always`
- [x] Add / Update [the documentation](https://github.com/volcengine/verl/tree/main/docs).
- [x] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: ...
- [ ] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ). (If not accessible, please try [the Feishu group](https://applink.larkoffice.com/client/chat/chatter/add_by_link?link_token=772jd4f1-cd91-441e-a820-498c6614126a).)

---------

Co-authored-by: weiqi.li <weiqi.li@bytedance.com>
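A minimal usage sketch of the rename (supplementing the empty "API and Usage Example" placeholder above; the Hydra override path follows the `ppo_trainer` config below, and the `FSDPOptimizerConfig` import path is an assumption based on this PR's description, not verbatim from the diff):

```python
# Hedged sketch of the rename, not verbatim from the PR.
# Before this PR (CLI override):
#   actor_rollout_ref.actor.optim.warmup_style=cosine
# After this PR, aligned with the Hugging Face Trainer API:
#   actor_rollout_ref.actor.optim.lr_scheduler_type=cosine

# Programmatic equivalent (import path assumed):
from verl.workers.config import FSDPOptimizerConfig

optim_cfg = FSDPOptimizerConfig(lr=1e-6, lr_scheduler_type="cosine")
```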
1109 lines · 35 KiB · YAML
# Format checks enforced on CI:
# 1. Comments must appear above each field.
# 2. There must be a blank line between each field.
# 3. Inline comments (after a field on the same line) are not allowed.
# 4. Indentation level is respected for nested fields.

# dataset config
data:

  # Tokenizer class or path. If null, it will be inferred from the model.
  tokenizer: null

  # Whether to use shared memory for data loading.
  use_shm: False

  # Training set parquet. Can be a list or a single file.
  # The program will read all files into memory, so it can't be too large (< 100GB).
  # The path can be either a local path or an HDFS path.
  # For HDFS path, we provide utils to download it to DRAM and convert it to a local path.
  train_files: ~/data/rlhf/gsm8k/train.parquet
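
  # Example (hypothetical paths): per the comment above, the list form is also accepted, e.g.
  # train_files: [~/data/rlhf/gsm8k/train.parquet, ~/data/rlhf/math/train.parquet]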

  # Validation parquet. Can be a list or a single file.
  val_files: ~/data/rlhf/gsm8k/test.parquet

  # The field in the dataset where the prompt is located. Default is 'prompt'.
  prompt_key: prompt

  # The field used to select the reward function (if using different ones per example).
  reward_fn_key: data_source

  # Maximum prompt length. All prompts will be left-padded to this length.
  # An error will be reported if the length is too long.
  max_prompt_length: 512

  # Maximum response length. Rollout in RL algorithms (e.g. PPO) generates up to this length.
  max_response_length: 512

  # Batch size sampled for one training iteration of different RL algorithms.
  train_batch_size: 1024

  # Batch size used during validation. Can be null.
  val_batch_size: null

  # Whether to return the original input_ids without adding chat template.
  # This is used when the reward model's chat template differs from the policy.
  # If using a model-based RM with different templates, this should be True.
  return_raw_input_ids: False

  # Whether to return the original chat (prompt) without applying chat template.
  return_raw_chat: False

  # Whether to return the full prompt with chat template.
  return_full_prompt: False

  # Whether to shuffle the data in the dataloader.
  shuffle: True

  # num dataloader workers
  dataloader_num_workers: 8

  # Whether to shuffle the validation set.
  validation_shuffle: False

  # Whether to filter overlong prompts.
  filter_overlong_prompts: False

  # Number of workers for filtering overlong prompts.
  # For large-scale datasets, filtering can be time-consuming.
  # Use multiprocessing to speed up. Default is 1.
  filter_overlong_prompts_workers: 1

  # Truncate the input_ids or prompt if they exceed max_prompt_length.
  # Options: 'error', 'left', or 'right'. Default is 'error'.
  truncation: error

  # The field in the multi-modal dataset where the image is located. Default is 'images'.
  image_key: images

  # The field in the multi-modal dataset where the video is located.
  video_key: videos

  # If the remote tokenizer has a Python file, this flag determines whether to allow using it.
  trust_remote_code: False

  # Optional: specify a custom dataset class path and name if overriding default loading behavior.
  custom_cls:

    # The path to the file containing your customized dataset class. If not specified, the pre-implemented dataset will be used.
    path: null

    # The name of the dataset class within the specified file.
    name: null
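
    # Example (hypothetical file and class name): to load a user-defined dataset class, one might set
    # path: ./my_datasets/my_rl_dataset.py
    # name: MyRLDataset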

  # Whether to return multi-modal inputs in the dataset. Set to False if rollout generates new multi-modal inputs.
  return_multi_modal_inputs: True

  # Data generation configuration for augmenting the dataset.
  datagen:

    # The path to the file containing your customized data generation class.
    # E.g. 'pkg://verl.experimental.dynamic_dataset.dynamicgen_dataset'
    path: null

    # The class name of the data generation class within the specified file.
    # E.g. 'MockDataGenerator'
    name: null

  # settings related to data sampler
  sampler:

    # the path to the module containing a curriculum class which implements the
    # AbstractSampler interface
    class_path: null

    # the name of the curriculum class like `MySampler`
    class_name: null

  # Additional kwargs when calling tokenizer.apply_chat_template
  apply_chat_template_kwargs: {}

# config for actor, rollout and reference model
actor_rollout_ref:

  # Whether it's a hybrid engine; currently only the hybrid engine is supported
  hybrid_engine: true

  # common configs for the model
  model:

    _target_: verl.workers.config.HFModelConfig

    # Huggingface model path. This can be either local path or HDFS path.
    path: ~/models/deepseek-llm-7b-chat

    # Custom chat template for the model.
    custom_chat_template: null

    # Whether to use shared memory (SHM) for accelerating the loading of model weights
    use_shm: false

    # Additional Python packages to register huggingface models/tokenizers.
    external_lib: null

    # Used to override model's original configurations, mainly dropout
    override_config: {}

    # Enable gradient checkpointing for actor
    enable_gradient_checkpointing: true

    # Enable activation offloading for actor
    enable_activation_offload: false

    # Whether to remove padding tokens in inputs during training
    use_remove_padding: false

    # Set to positive value to enable LoRA (e.g., 32)
    lora_rank: 0

    # LoRA scaling factor
    lora_alpha: 16

    # Target modules to apply LoRA. Options: "all-linear" (not recommended for VLMs) or
    # [q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj]
    target_modules: all-linear

    # Exclude modules from applying LoRA. Similar usage to target_modules and PEFT.
    # Example: '.*visual.*' for excluding the ViT in Qwen2.5-VL, as currently vllm does not support ViT LoRA.
    exclude_modules: null
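
    # Example (illustrative values only): a common LoRA setup, based on the options above, might be
    # lora_rank: 32
    # lora_alpha: 16
    # target_modules: [q_proj, k_proj, v_proj, o_proj, gate_proj, up_proj, down_proj]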

    # Whether to use Liger for linear layer fusion
    use_liger: false

    # Whether to use custom fused kernels (e.g., FlashAttention, fused MLP)
    use_fused_kernels: false

    # Options for fused kernels. If use_fused_kernels is true, this will be used.
    fused_kernel_options:

      # Implementation backend for fused kernels. Options: "triton" or "torch".
      impl_backend: torch

    # Whether to enable loading a remote code model
    trust_remote_code: false

  # actor configs
  actor:

    # fsdp, fsdp2 or megatron. The fsdp backend is used here.
    strategy: fsdp

    # Split each sample into sub-batches of this size for PPO
    ppo_mini_batch_size: 256

    # [Deprecated] Global micro batch size
    ppo_micro_batch_size: null

    # Local per-GPU micro batch size
    ppo_micro_batch_size_per_gpu: null

    # Whether to automatically adjust batch size at runtime
    use_dynamic_bsz: false

    # Max tokens per GPU in one PPO batch; affects gradient accumulation
    # Typically it should be: n * (${data.max_prompt_length} + ${data.max_response_length})
    ppo_max_token_len_per_gpu: 16384

    # Gradient clipping for actor updates
    grad_clip: 1.0

    # PPO clip ratio
    clip_ratio: 0.2

    # Lower bound for asymmetric clipping (used in dual-clip PPO)
    clip_ratio_low: 0.2

    # Upper bound for asymmetric clipping (used in dual-clip PPO)
    clip_ratio_high: 0.2

    # policy loss config
    policy_loss:

      # Loss function mode: vanilla / clip-cov / kl-cov / gpg, from https://arxiv.org/abs/2505.22617
      loss_mode: "vanilla"

      # Ratio of tokens to be clipped for clip-cov loss
      clip_cov_ratio: 0.0002

      # Lower bound for clip-cov loss
      clip_cov_lb: 1.0

      # Upper bound for clip-cov loss
      clip_cov_ub: 5.0

      # Ratio of tokens to which the KL penalty is applied for kl-cov loss
      kl_cov_ratio: 0.0002

      # KL divergence penalty coefficient
      ppo_kl_coef: 0.1

    # Constant C in dual-clip PPO; clips when advantage < 0 and ratio > C
    clip_ratio_c: 3.0

    # Loss aggregation mode: "token-mean", "seq-mean-token-sum", or "seq-mean-token-mean"
    loss_agg_mode: token-mean

    # Entropy regularization coefficient in PPO loss
    entropy_coeff: 0

    # Whether to use KL loss instead of KL reward penalty. True for GRPO
    use_kl_loss: false

    # Whether to use torch.compile()
    use_torch_compile: true

    # KL loss coefficient when use_kl_loss is enabled. For GRPO
    kl_loss_coef: 0.001

    # Type of KL divergence loss. Options: "kl"(k1), "abs", "mse"(k2), "low_var_kl"(k3), "full"
    kl_loss_type: low_var_kl

    # Number of PPO epochs per batch
    ppo_epochs: 1

    # Shuffle training data across PPO epochs
    shuffle: false

    # Sequence parallelism size for Ulysses-style model parallelism
    ulysses_sequence_parallel_size: 1

    # calculate entropy with chunking to reduce memory peak
    entropy_from_logits_with_chunking: False

    # recompute entropy
    entropy_checkpointing: False

    # checkpoint configs
    checkpoint:

      # What to include in saved checkpoints.
      # With 'hf_model' you can save the whole model in HF format; by default only sharded model checkpoints are saved to save space.
      save_contents: ['model', 'optimizer', 'extra']

      # For more flexibility, you can specify the contents to load from the checkpoint.
      load_contents: ${actor_rollout_ref.actor.checkpoint.save_contents}

    # optimizer configs
    optim:

      # Learning rate
      lr: 1e-6

      # Warmup steps; negative value delegates to lr_warmup_steps_ratio
      lr_warmup_steps: -1

      # Warmup steps ratio (used if lr_warmup_steps is negative)
      lr_warmup_steps_ratio: 0.0

      # Minimum LR ratio for cosine schedule
      min_lr_ratio: 0.0

      # Number of cosine cycles in LR schedule
      num_cycles: 0.5

      # LR scheduler type: "constant" or "cosine"
      lr_scheduler_type: constant

      # Total training steps (must be overridden at runtime)
      total_training_steps: -1

      # Weight decay
      weight_decay: 0.01
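
      # Example (illustrative values): a cosine schedule with warmup, using the field renamed in this PR:
      # lr_scheduler_type: cosine
      # lr_warmup_steps_ratio: 0.03
      # min_lr_ratio: 0.1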

    # configs for FSDP
    fsdp_config:

      # policy for wrapping the model
      wrap_policy:

        # Minimum number of parameters to trigger wrapping a layer with FSDP
        min_num_params: 0

      # Whether to offload model parameters to CPU (trades speed for memory)
      param_offload: false

      # Whether to offload optimizer state to CPU
      optimizer_offload: false

      # Only for FSDP2: offload param/grad/optimizer during train
      offload_policy: false

      # Only for FSDP2: reshard after forward pass to reduce memory footprint
      reshard_after_forward: true

      # Number of GPUs in each FSDP shard group; -1 means auto
      fsdp_size: -1

      # Only for FSDP1: prefetch the next forward-pass all-gather
      # before the current forward computation.
      forward_prefetch: False

  # Reference model config.
  # Reference model will be enabled when actor.use_kl_loss and/or algorithm.use_kl_in_reward is True.
  ref:

    # actor_rollout_ref.ref: FSDP config same as actor. For models larger than 7B, it's recommended to turn on offload for ref by default.
    strategy: ${actor_rollout_ref.actor.strategy}

    # config for FSDP strategy
    fsdp_config:

      # whether to offload parameters in FSDP
      param_offload: False

      # whether to perform reshard after model forward to save memory.
      # only for fsdp2, [True, False, int between 1 and fsdp_size]
      reshard_after_forward: True

      # Only for FSDP1: prefetch the next forward-pass all-gather
      # before the current forward computation.
      forward_prefetch: False

      # the wrap policy for FSDP model
      wrap_policy:

        # minimum number of params in a wrapped module
        min_num_params: 0

    # whether to enable torch.compile
    use_torch_compile: ${actor_rollout_ref.actor.use_torch_compile}

    # [Will be deprecated, use log_prob_micro_batch_size_per_gpu]
    # The batch size for one forward pass in the computation of log_prob. Global batch size.
    log_prob_micro_batch_size: null

    # The batch size for one forward pass in the computation of log_prob. Local batch size per GPU.
    log_prob_micro_batch_size_per_gpu: null

    # enable dynamic batch size (sequence packing) for log_prob computation
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}

    # the max token length per GPU
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}

    # sequence parallel size
    ulysses_sequence_parallel_size: ${actor_rollout_ref.actor.ulysses_sequence_parallel_size}

    # calculate entropy with chunking to reduce memory peak
    entropy_from_logits_with_chunking: False

    # recompute entropy
    entropy_checkpointing: False

  # Rollout model config.
  rollout:

    # actor_rollout_ref.rollout.name: hf/vllm/sglang.
    name: vllm

    # sync: LLM, async: AsyncLLM
    mode: sync

    # Sampling temperature for rollout.
    temperature: 1.0

    # Top-k sampling parameter. -1 for vLLM rollout, 0 for HF rollout.
    top_k: -1

    # Top-p sampling parameter. Default 1.0.
    top_p: 1

    # typically the same as data max prompt length
    prompt_length: ${data.max_prompt_length}

    # typically the same as data max response length
    response_length: ${data.max_response_length}

    # for vllm rollout
    # Rollout model parameters type. Align with actor model's FSDP/Megatron type.
    dtype: bfloat16

    # Fraction of GPU memory used by vLLM/SGLang for KV cache.
    gpu_memory_utilization: 0.5

    # Whether to ignore EOS and continue generating after EOS is hit.
    ignore_eos: False

    # Whether to disable CUDA graph. Default True to allow cache freeing.
    enforce_eager: False

    # Whether to free engine KVCache after generation. Set enforce_eager=True when enabled.
    free_cache_engine: True

    # Which loader to use for rollout model weights: dummy_dtensor, hf, megatron, etc.
    # safetensors (for huge models; also set use_shm=True); dummy_dtensor: randomly init model weights
    load_format: dummy

    # for huge models, layered summon can save memory (prevent OOM) but makes it slower
    layered_summon: False

    # TP size for rollout. Only effective for vLLM.
    tensor_model_parallel_size: 2

    # max number of tokens in a batch
    max_num_batched_tokens: 8192

    # max length for rollout
    max_model_len: null

    # max number of sequences
    max_num_seqs: 1024

    # [Will be deprecated, use log_prob_micro_batch_size_per_gpu] The batch size for one forward pass in the computation of log_prob. Global batch size.
    log_prob_micro_batch_size: null

    # The batch size for one forward pass in the computation of log_prob. Local batch size per GPU.
    log_prob_micro_batch_size_per_gpu: null

    # enable dynamic batch size (sequence packing) for log_prob computation
    log_prob_use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}

    # max token length for log_prob computation
    log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}

    # disable logging statistics
    disable_log_stats: True

    # May get higher throughput when set to True. When enabled, please increase max_num_batched_tokens or decrease max_model_len.
    enable_chunked_prefill: True

    # for hf rollout
    # Whether to sample during training rollout. False uses greedy sampling.
    do_sample: True

    # number of responses (i.e. num sample times). > 1 for GRPO
    n: 1

    # Whether to wake up the inference engine in multiple stages to reduce peak memory during the training-rollout transition.
    multi_stage_wake_up: false

    # Extra inference engine arguments; please refer to the vLLM/SGLang official docs for details
    engine_kwargs:

      # vllm engine config
      vllm: {}

      # sglang engine config
      sglang: {}
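
      # Example (illustrative; assumes keys are forwarded to the engine constructor, e.g. vLLM's swap_space):
      # vllm: {swap_space: 32}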

    # Sampling parameters used during validation.
    val_kwargs:

      # sampling parameters for validation
      # Top-k sampling parameter. -1 for vLLM rollout, 0 for HF rollout.
      top_k: -1

      # Top-p sampling parameter. Default 1.0.
      top_p: 1.0

      # Sampling temperature for rollout.
      temperature: 0

      # whether to repeat n times for validation
      n: 1

      # Whether to sample during validation rollout. False uses greedy sampling.
      do_sample: False

    # Multi-turn interaction config for tools or chat.
    multi_turn:

      # set to True for multi-turn tool interaction tasks; should set rollout.name to sglang as well
      enable: False

      # null for no limit (default max_length // 3)
      max_assistant_turns: null

      # null for no tool
      tool_config_path: null

      # null for no limit (default max_length // 3)
      max_user_turns: null

      # max parallel calls for tools in a single turn
      max_parallel_calls: 1

      # max length of tool response
      max_tool_response_length: 256

      # truncate side of tool response: left, middle, right
      tool_response_truncate_side: middle

      # null for no interaction
      interaction_config_path: null

      # - When set to True, the model's default chat template is used for multi-turn rollout, which typically matches production behavior.
      # - When set to False, the token ids recorded for training are used instead; unlike the default chat template, these always include the model's full output,
      #   which may contain additional content such as reasoning content. This maintains consistency between training and rollout, but it will lead to longer prompts.
      use_inference_chat_template: False

      # Tokenization is performed turn by turn and the resulting token ids are concatenated to form the full conversation.
      # To ensure this matches the result of tokenizing the entire conversation at once, a sanity check is run at the end of each multi-turn rollout to compare the two sets of token ids.
      # Some models are known to produce different tokenization results when tokenizing turn by turn vs. all at once. This behavior has already been validated for them.
      # To reduce excessive warnings, you can turn off the sanity check for these models if you are using their default chat template:
      # Qwen/QwQ-32B, Qwen/Qwen3-xxB
      # - disable: disable tokenization sanity check
      # - strict: enable strict tokenization sanity check (default)
      # - ignore_strippable: ignore strippable tokens when checking tokenization sanity
      tokenization_sanity_check_mode: strict

      # Format of the multi-turn interaction. Options: hermes, llama3_json, ...
      format: hermes

    # support logging rollout probs for debugging purposes
    calculate_log_probs: False

    # [Experimental] agent loop based rollout configs
    agent:

      # Number of agent loop workers
      num_workers: 8

      # custom async server configs
      custom_async_server:

        # Path to the custom async server implementation
        path: null

        # Class name of the custom async server class (e.g. AsyncvLLMServer)
        name: null

  # profiler configs
  profiler:

    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
    _target_: verl.utils.profiler.ProfilerConfig

    # True: each task has its own database; False: all tasks in one training step share one database.
    discrete: False

    # Whether to profile all ranks.
    all_ranks: False

    # The ranks that will be profiled. [] or [0,1,...]
    ranks: []

# configs for the critic
critic:

  # Number of rollouts per update (mirrors actor rollout_n)
  rollout_n: ${actor_rollout_ref.rollout.n}

  # fsdp or fsdp2 strategy used for critic model training
  strategy: ${actor_rollout_ref.actor.strategy}

  # optimizer configs
  optim:

    # Learning rate
    lr: 1e-5

    # Warmup steps ratio; total steps will be injected at runtime
    lr_warmup_steps_ratio: 0.

    # Minimum LR ratio for cosine schedule
    min_lr_ratio: 0.0

    # LR scheduler type: "constant" or "cosine"
    lr_scheduler_type: constant

    # Total training steps (must be overridden at runtime)
    total_training_steps: -1

    # Weight decay
    weight_decay: 0.01

  # model config for the critic
  model:

    # Path to pretrained model weights
    path: ~/models/deepseek-llm-7b-chat

    # Whether to use shared memory for loading the model
    use_shm: False

    # Tokenizer path (defaults to actor's model path)
    tokenizer_path: ${actor_rollout_ref.model.path}

    # Hugging Face config override
    override_config: { }

    # External model implementation (optional)
    external_lib: ${actor_rollout_ref.model.external_lib}

    # Enable gradient checkpointing to save memory
    enable_gradient_checkpointing: True

    # Offload activations to CPU to reduce GPU memory usage
    enable_activation_offload: False

    # Use remove padding optimization (saves compute)
    use_remove_padding: False

    # Whether to trust remote code from Hugging Face models
    trust_remote_code: ${actor_rollout_ref.model.trust_remote_code}

    # FSDP-specific config
    fsdp_config:

      # Whether to offload model parameters to CPU
      param_offload: False

      # Whether to offload optimizer state to CPU
      optimizer_offload: False

      # Only for FSDP2: offload param/grad/optimizer during train
      offload_policy: False

      # Only for FSDP2: Reshard after forward pass to reduce memory footprint
      reshard_after_forward: True

      # Policy for wrapping layers with FSDP
      wrap_policy:

        # Minimum number of parameters to trigger wrapping
        min_num_params: 0

      # Number of GPUs in each FSDP shard group; -1 means auto
      fsdp_size: -1

      # Only for FSDP1: prefetch the next forward-pass all-gather
      # before the current forward computation.
      forward_prefetch: False

    # Set to positive value to enable LoRA (e.g., 32)
    lora_rank: 0

    # LoRA scaling factor
    lora_alpha: 16

    # LoRA target modules: "all-linear" or list of linear projection layers
    target_modules: all-linear

  # PPO mini-batch size per update
  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}

  # [Deprecated] Global micro batch size
  ppo_micro_batch_size: null

  # Local per-GPU micro batch size
  ppo_micro_batch_size_per_gpu: null

  # Forward-only batch size (global)
  forward_micro_batch_size: ${critic.ppo_micro_batch_size}

  # Forward-only batch size (per GPU)
  forward_micro_batch_size_per_gpu: ${critic.ppo_micro_batch_size_per_gpu}

  # Whether to automatically adjust batch size at runtime
  use_dynamic_bsz: ${actor_rollout_ref.actor.use_dynamic_bsz}

  # Max tokens per GPU in one PPO batch (doubled for critic)
  ppo_max_token_len_per_gpu: 32768

  # Max token length per GPU in forward pass
  forward_max_token_len_per_gpu: ${critic.ppo_max_token_len_per_gpu}

  # Sequence parallelism size for Ulysses-style model parallelism
  ulysses_sequence_parallel_size: 1

  # Number of PPO epochs per batch
  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}

  # Shuffle training data across PPO epochs
  shuffle: ${actor_rollout_ref.actor.shuffle}

  # Gradient clipping for critic updates
  grad_clip: 1.0

  # PPO value function clipping range
  cliprange_value: 0.5

  # Loss aggregation mode: "token-mean", "seq-mean-token-sum", or "seq-mean-token-mean"
  loss_agg_mode: ${actor_rollout_ref.actor.loss_agg_mode}

  # checkpoint configs
  checkpoint:

    # What to include in saved checkpoints.
    # With 'hf_model' you can save the whole model in HF format; by default only sharded model checkpoints are saved to save space.
    save_contents: ['model', 'optimizer', 'extra']

    # What to include when loading checkpoints
    load_contents: ${critic.checkpoint.save_contents}

  # profiler configs
  # the corresponding dataclass is verl.utils.profiler.ProfilerConfig.
  profiler:

    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
    _target_: verl.utils.profiler.ProfilerConfig

    # True: each task has its own database; False: all tasks in one training step share one database.
    discrete: False

    # Whether to profile all ranks.
    all_ranks: False

    # The ranks that will be profiled. [] or [0,1,...]
    ranks: []

# configs for the reward model
reward_model:

  # Whether to enable reward model. If False, we compute the reward only with the user-defined reward functions.
  # In the GSM8K and Math examples, we disable the reward model.
  # For the RLHF alignment example using full_hh_rlhf, we utilize the reward model to assess the responses.
  # If False, the following parameters are not effective.
  enable: False

  # FSDP strategy: "fsdp" or "fsdp2"
  strategy: ${actor_rollout_ref.actor.strategy}

  # model config for reward scoring
  model:

    # Input tokenizer. If the reward model's chat template is inconsistent with the policy,
    # we need to first decode to plaintext, then apply the RM's chat_template,
    # and then score with the RM. If chat templates are consistent, it can be set to null.
    input_tokenizer: ${actor_rollout_ref.model.path}

    # RM's HDFS path or local path. Note that RM only supports AutoModelForSequenceClassification.
    # Other model types need to define their own RewardModelWorker and pass it from the code.
    path: ~/models/FsfairX-LLaMA3-RM-v0.1

    # Whether to use shared memory for loading the model
    use_shm: False

    # External model implementation (optional)
    external_lib: ${actor_rollout_ref.model.external_lib}

    # Use remove padding optimization (saves compute)
    use_remove_padding: False

    # Whether to use fused reward kernels for speedup
    use_fused_kernels: ${actor_rollout_ref.model.use_fused_kernels}

    # Whether to enable loading a remote code model, defaults to False
    trust_remote_code: False

    # FSDP-specific config
    fsdp_config:

      # Policy for wrapping layers with FSDP
      wrap_policy:

        # Minimum number of parameters to trigger wrapping
        min_num_params: 0

      # Whether to offload model parameters to CPU
      param_offload: False

      # Only for FSDP2: Reshard after forward pass to reduce memory footprint
      reshard_after_forward: True

      # Number of GPUs in each FSDP shard group; -1 means auto
      fsdp_size: -1

      # Only for FSDP1: prefetch the next forward-pass all-gather
      # before the current forward computation.
      forward_prefetch: False

  # [Deprecated] Global micro batch size
  micro_batch_size: null

  # Local per-GPU micro batch size
  micro_batch_size_per_gpu: null

  # Maximum sequence length to process for scoring
  max_length: null

  # Sequence parallelism size for Ulysses-style model parallelism
  ulysses_sequence_parallel_size: 1

  # Whether to dynamically adjust batch size at runtime
  use_dynamic_bsz: ${critic.use_dynamic_bsz}

  # Maximum number of tokens per GPU in one forward pass
  forward_max_token_len_per_gpu: ${critic.forward_max_token_len_per_gpu}

  # Reward Manager. This defines the mechanism of computing rule-based reward and handling different reward sources.
  # Default is naive. If all verification functions are multiprocessing-safe,
  # the reward manager can be set to prime for parallel verification.
  reward_manager: naive

  # Whether to launch custom reward function asynchronously during log_prob
  launch_reward_fn_async: False

  # Cloud/local sandbox fusion configuration for custom reward logic
  sandbox_fusion:

    # Cloud/local function URL for sandbox execution
    url: null

    # Max concurrent requests allowed to sandbox
    max_concurrent: 64

    # Max memory limit for each sandbox process in MB
    memory_limit_mb: 1024

  # profiler configs
  profiler:

    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
    _target_: verl.utils.profiler.ProfilerConfig

    # True: each task has its own database; False: all tasks in one training step share one database.
    discrete: False

    # Whether to profile all ranks.
    all_ranks: False

    # The ranks that will be profiled. [] or [0,1,...]
    ranks: []

# custom reward function definition
custom_reward_function:

  # The path to the file containing your customized reward function.
  # If not specified, pre-implemented reward functions will be used.
  path: null

  # The name of the reward function within the specified file. Default is 'compute_score'.
  name: compute_score
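
  # Example (sketch only; the exact signature is an assumption based on the built-in scorers):
  # def compute_score(data_source, solution_str, ground_truth, extra_info=None):
  #     return 1.0 if solution_str.strip() == ground_truth else 0.0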

# config for the algorithm
algorithm:

  # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
  _target_: verl.trainer.config.AlgoConfig

  # Discount factor for future rewards
  gamma: 1.0

  # Trade-off between bias and variance in the GAE estimator
  lam: 1.0

  # Advantage estimator type: "gae", "grpo", "reinforce_plus_plus", etc.
  adv_estimator: gae

  # Whether to normalize advantages by std (specific to GRPO)
  norm_adv_by_std_in_grpo: True

  # Whether to enable in-reward KL penalty
  use_kl_in_reward: False

  # How to estimate KL divergence: "kl", "abs", "mse", "low_var_kl", or "full"
  kl_penalty: kl

  # KL control configuration
  kl_ctrl:

    # Required when using verl.utils.omega_conf_to_dataclass to instantiate dataclass configs
    _target_: verl.trainer.config.KLControlConfig

    # KL control type: "fixed" or "adaptive"
    type: fixed

    # Initial coefficient for KL penalty
    kl_coef: 0.001

    # Horizon value for adaptive controller (if enabled)
    horizon: 10000

    # Target KL divergence (used for adaptive controller)
    target_kl: 0.1

  # Whether to enable preference feedback PPO
  use_pf_ppo: False

  # Preference feedback PPO settings
  pf_ppo:

    # Method for reweighting samples: "pow", "max_min", or "max_random"
    reweight_method: pow

    # Power used for weight scaling in "pow" method
    weight_pow: 2.0
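
  # Example (illustrative, based on the comments above): a GRPO-style setup would typically use
  # adv_estimator: grpo
  # use_kl_in_reward: False
  # together with actor_rollout_ref.actor.use_kl_loss: True above.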

# config for the trainer
trainer:

  # Whether to balance batch sizes across distributed workers
  balance_batch: True

  # Number of epochs in training
  total_epochs: 30

  # Total training steps (can be set explicitly or derived from epochs)
  total_training_steps: null

  # The steps that will be profiled. null means no profiling. null or [1,2,5,...]
  profile_steps: null

  # controller Nvidia Nsight Systems options. Must be set when profile_steps is not None.
  ## reference https://docs.nvidia.com/nsight-systems/UserGuide/index.html
  ## reference https://docs.ray.io/en/latest/ray-observability/user-guides/profiling.html
  controller_nsight_options:

    # Select the API(s) to be traced.
    trace: "cuda,nvtx,cublas,ucx"

    # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
    cuda-memory-usage: "true"

    # CUDA graphs will be traced as a whole
    cuda-graph-trace: "graph"

  # worker Nvidia Nsight Systems options. Must be set when profile_steps is not None.
  worker_nsight_options:

    # Select the API(s) to be traced.
    trace: "cuda,nvtx,cublas,ucx"

    # Track the GPU memory usage by CUDA kernels. Must be string type "true" or "false".
    cuda-memory-usage: "true"

    # CUDA graphs will be traced as a whole
    cuda-graph-trace: "graph"

    # Profiling only in a range of torch.cuda.profiler.start and stop. Do not change this config.
    capture-range: "cudaProfilerApi"

    # Specify the desired behavior when a capture range ends.
    # In verl we need the torch.cuda.profiler.start/stop pair to repeat n times.
    # Valid values are "repeat-shutdown:n" or null.
    # For normal whole-step profiling, n = len(profile_steps);
    # but for discrete profiling, n = len(profile_steps) * Number(subtasks).
    # Or you can just leave it null and the program will use n = len(profile_steps) * 6;
    capture-range-end: null

    # Send signal to the target application's process group. We let the program exit by itself.
    kill: none

  # Config for npu profiler. Must be set when profile_steps is not None and torch_npu is available.
  npu_profile:

    # Options for the npu profiler
    options:

      # Storage path of collected data.
      save_path: ./profiler_data

      # The roles that will be profiled. Only takes effect in discrete mode.
      # optional values: all, rollout_generate, actor_compute_log_prob, actor_update and ref_compute_log_prob.
      # "all" means all roles will be profiled.
      roles: ["all"]

      # Collection level, optional values: level_none, level0, level1, level2.
      level: level1

      # Whether to enable memory analysis.
      with_memory: False

      # Whether to record tensor shape.
      record_shapes: False

      # Whether to record Device-side performance data.
      with_npu: True

      # Whether to record Host-side performance data.
      with_cpu: True

      # Whether to record Python call stack information.
      with_module: False

      # Whether to record operator call stack information.
      with_stack: False

      # Whether to automatically parse the data.
      analysis: True

  # Project name for experiment tracking (e.g., wandb)
  project_name: verl_examples

  # Experiment name for run identification in tracking tools
  experiment_name: gsm8k

  # Logging backends to use: "console", "wandb", etc.
  logger: [ 'console', 'wandb' ]

  # Number of generations to log during validation
  log_val_generations: 0

  # Directory for logging rollout data; no dump if null
  rollout_data_dir: null

  # Directory for logging validation data; no dump if null
  validation_data_dir: null

  # Number of nodes used in the training
  nnodes: 1

  # Number of GPUs per node
  n_gpus_per_node: 8

  # Save frequency (by iteration) for model checkpoints
  save_freq: -1

  # ESI refers to the elastic server instance used during training, similar to a training plan. For example,
  # if you purchase 10 hours of computing power, the ESI will automatically shut down after 10 hours of training.
  # To ensure a checkpoint is saved before the ESI shuts down, the system will start saving a checkpoint in advance.
  # The advance time is calculated as: Advance Time = Longest historical step duration + Checkpoint save duration + esi_redundant_time.
  # Here, esi_redundant_time is a user-defined value that further extends the advance time for added safety.
  esi_redundant_time: 0

  # Resume mode: "auto", "disable", or "resume_path"
  # "auto": resume from last checkpoint if available
  # "disable": start from scratch
  # "resume_path": resume from a user-defined path
  resume_mode: auto

  # Path to resume training from (only used when resume_mode is "resume_path")
  resume_from_path: null

  # Whether to run validation before training begins
  val_before_train: True

  # Whether to run validation only
  val_only: False

  # Validation frequency (in training iterations)
  test_freq: -1

  # Number of iterations to warm up the critic before updating policy
  critic_warmup: 0

  # Default path to distributed filesystem for saving checkpoints
  default_hdfs_dir: null

  # Whether to delete local checkpoints after loading
  del_local_ckpt_after_load: False

  # Default local directory for saving checkpoints
  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}

  # Maximum number of actor checkpoints to keep
  max_actor_ckpt_to_keep: null

  # Maximum number of critic checkpoints to keep
  max_critic_ckpt_to_keep: null

  # Timeout (in seconds) for Ray worker to wait for registration
  ray_wait_register_center_timeout: 300

  # Device to run training on (e.g., "cuda", "cpu")
  device: cuda

# configs related to ray
ray_kwargs:

  # configs related to ray initialization
  ray_init:

    # Number of CPUs for Ray. Use a fixed number instead of null when using SLURM.
    num_cpus: null

    # Path to save Ray timeline JSON for performance profiling
    timeline_json_file: null