[fsdp,doc] refactor: rename warmup_style@FSDPOptimizerConfig -> lr_scheduler_type (#3739)
### What does this PR do?

> Rename `warmup_style` in `FSDPOptimizerConfig` to `lr_scheduler_type` to align with the Hugging Face Trainer API.

The following pull request refactored the optimizer, but the naming issue persists: https://github.com/volcengine/verl/pull/3656

### Checklist Before Starting

- [x] Search for similar PRs. Paste at least one query link here: ...
- [x] Format the PR title as `[{modules}] {type}: {description}` (this will be checked by the CI)
  - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`
  - If this PR involves multiple modules, separate them with `,`, like `[megatron, fsdp, doc]`
  - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test`
  - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title.
  - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching`

### Test

> For changes that cannot be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results like training curve plots, evaluation results, etc.

### API and Usage Example

> Demonstrate how the API changes if any, and provide usage example(s) if possible.

```python
# Add code snippet or script demonstrating how to use this
```

### Design & Code Changes

> Demonstrate the high-level design if this PR is complex, and list the specific changes.

### Checklist Before Submitting

> [!IMPORTANT]
> Please check all the following items before requesting a review, otherwise the reviewer might deprioritize this PR for review.

- [x] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md).
- [x] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always`
- [x] Add / Update [the documentation](https://github.com/volcengine/verl/tree/main/docs).
- [x] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: ...
- [ ] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ). (If not accessible, please try [the Feishu group (飞书群)](https://applink.larkoffice.com/client/chat/chatter/add_by_link?link_token=772jd4f1-cd91-441e-a820-498c6614126a).)

---------

Co-authored-by: weiqi.li <weiqi.li@bytedance.com>
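For reference, a minimal sketch of the renamed API in use. The import path `verl.workers.config.FSDPOptimizerConfig` is inferred from the diff below and may differ across versions; the forwarding behavior is the one added in `__post_init__`:

```python
from verl.workers.config import FSDPOptimizerConfig

# New canonical spelling, aligned with the Hugging Face Trainer API:
cfg = FSDPOptimizerConfig(lr=1e-6, lr_scheduler_type="cosine")
assert cfg.lr_scheduler_type == "cosine"

# The old spelling still works but is deprecated: it emits a
# DeprecationWarning and forwards its value to lr_scheduler_type.
legacy = FSDPOptimizerConfig(lr=1e-6, warmup_style="cosine")
assert legacy.lr_scheduler_type == "cosine"
```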
```diff
@@ -132,7 +132,7 @@ Actor/Rollout/Reference Policy
       lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
       min_lr_ratio: 0.0  # only used with cosine lr scheduler, default to 0.0
       num_cycles: 0.5  # only used with cosine lr scheduler, default to 0.5
-      warmup_style: constant  # select from constant/cosine
+      lr_scheduler_type: constant  # select from constant/cosine
       total_training_steps: -1  # must be override by program
     fsdp_config:
       wrap_policy:
```
```diff
@@ -415,7 +415,7 @@ ____________________________________________________

 Notice that there are some differences in APIs between Megatron optimizer and FSDP optimizer.

-- Megatron optimizer scheduler names the period after lr_warmup as lr_decay_steps, so the ``warmup_style`` actually means the style of lr decay after warmup.
+- Megatron optimizer scheduler names the period after lr_warmup as lr_decay_steps, so the ``lr_scheduler_type`` actually means the style of lr decay after warmup.
 - Megatron optimizer also support weight decay decay mechanism
 - ``use_checkpoint_opt_param_scheduler`` determines whether to use the checkpoint optimizer parameter scheduler. If set to True, the optimizer parameter scheduler will be saved in the checkpoint and loaded from the checkpoint during resuming training.
```
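To make the naming contrast concrete, here is a hedged illustration of the Megatron-side settings the notes above describe (a plain dict with illustrative key names only, not a verified Megatron config schema):

```python
# Illustrative only: Megatron names the period after warmup lr_decay_steps,
# so lr_scheduler_type there describes the decay style applied *after* warmup.
megatron_optim = {
    "lr": 1e-6,
    "lr_warmup_steps": 100,
    "lr_decay_steps": 900,  # assumed key name, per the note above
    "lr_scheduler_type": "cosine",
    # Save the optimizer parameter scheduler into checkpoints and restore
    # it when resuming training:
    "use_checkpoint_opt_param_scheduler": True,
}
```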
```diff
@@ -51,7 +51,7 @@ actor_rollout_ref:
       lr_warmup_steps: -1  # Prioritized. Negative values mean delegating to lr_warmup_steps_ratio.
       lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
       min_lr_ratio: null  # only useful for warmup with cosine
-      warmup_style: constant  # select from constant/cosine
+      lr_scheduler_type: constant  # select from constant/cosine
       total_training_steps: -1  # must be override by program
     fsdp_config:
       wrap_policy:
@@ -105,7 +105,7 @@ critic:
     lr: 1e-5
     lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
     min_lr_ratio: null  # only useful for warmup with cosine
-    warmup_style: constant  # select from constant/cosine
+    lr_scheduler_type: constant  # select from constant/cosine
     total_training_steps: -1  # must be override by program
   model:
     path: ~/models/deepseek-llm-7b-chat
```
```diff
@@ -103,7 +103,7 @@ HYDRA_FULL_ERROR=1 python -m recipe.entropy.main_entropy \
     actor_rollout_ref.model.enable_gradient_checkpointing=True \
     actor_rollout_ref.actor.optim.lr=1e-6 \
     actor_rollout_ref.actor.optim.weight_decay=0 \
-    actor_rollout_ref.actor.optim.warmup_style=constant \
+    actor_rollout_ref.actor.optim.lr_scheduler_type=constant \
     actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
     actor_rollout_ref.actor.ppo_micro_batch_size=${train_micro_batch_size} \
     actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
```
```diff
@@ -100,7 +100,7 @@ HYDRA_FULL_ERROR=1 python -m recipe.entropy.main_entropy \
     actor_rollout_ref.model.enable_gradient_checkpointing=True \
     actor_rollout_ref.actor.optim.lr=1e-6 \
     actor_rollout_ref.actor.optim.weight_decay=0 \
-    actor_rollout_ref.actor.optim.warmup_style=constant \
+    actor_rollout_ref.actor.optim.lr_scheduler_type=constant \
     actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
     actor_rollout_ref.actor.ppo_micro_batch_size=${train_micro_batch_size} \
     actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
```
```diff
@@ -99,7 +99,7 @@ HYDRA_FULL_ERROR=1 python -m recipe.entropy.main_entropy \
     actor_rollout_ref.model.enable_gradient_checkpointing=True \
     actor_rollout_ref.actor.optim.lr=1e-6 \
     actor_rollout_ref.actor.optim.weight_decay=0 \
-    actor_rollout_ref.actor.optim.warmup_style=constant \
+    actor_rollout_ref.actor.optim.lr_scheduler_type=constant \
     actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
     actor_rollout_ref.actor.ppo_micro_batch_size=${train_micro_batch_size} \
     actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
```
```diff
@@ -103,7 +103,7 @@ HYDRA_FULL_ERROR=1 python -m recipe.entropy.main_entropy \
     actor_rollout_ref.model.enable_gradient_checkpointing=True \
     actor_rollout_ref.actor.optim.lr=1e-6 \
     actor_rollout_ref.actor.optim.weight_decay=0 \
-    actor_rollout_ref.actor.optim.warmup_style=constant \
+    actor_rollout_ref.actor.optim.lr_scheduler_type=constant \
     actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
     actor_rollout_ref.actor.ppo_micro_batch_size=${train_micro_batch_size} \
     actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
```
```diff
@@ -99,7 +99,7 @@ HYDRA_FULL_ERROR=1 python -m recipe.entropy.main_entropy \
     actor_rollout_ref.model.enable_gradient_checkpointing=True \
     actor_rollout_ref.actor.optim.lr=1e-6 \
     actor_rollout_ref.actor.optim.weight_decay=0 \
-    actor_rollout_ref.actor.optim.warmup_style=constant \
+    actor_rollout_ref.actor.optim.lr_scheduler_type=constant \
     actor_rollout_ref.actor.ppo_mini_batch_size=${train_prompt_mini_bsz} \
     actor_rollout_ref.actor.ppo_micro_batch_size=${train_micro_batch_size} \
     actor_rollout_ref.actor.fsdp_config.param_offload=${offload} \
```
```diff
@@ -48,7 +48,8 @@ reward_model:
     lr_warmup_steps: -1  # Prioritized. Negative values mean delegating to lr_warmup_steps_ratio.
     lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
     min_lr_ratio: null
-    warmup_style: constant
+    warmup_style: null  # deprecated
+    lr_scheduler_type: constant
     total_training_steps: -1  # must be overridden by program
     weight_decay: 0.
     grad_clip: 10.0
```
```diff
@@ -42,7 +42,7 @@ FSDP_ENGINE_CONFIG="\
     optim.betas="[0.9,0.95]" \
     optim.clip_grad=1.0 \
     optim.min_lr_ratio=0.1 \
-    optim.warmup_style=cosine \
+    optim.lr_scheduler_type=cosine \
     engine.ulysses_sequence_parallel_size=${SP_SIZE} \
     engine.strategy=${FSDP_STRATEGY} \
     engine.fsdp_size=${FSDP_SIZE}"
```
```diff
@@ -301,8 +301,8 @@ actor_rollout_ref:
       # Number of cosine cycles in LR schedule
       num_cycles: 0.5

-      # LR warmup style: "constant" or "cosine"
-      warmup_style: constant
+      # LR scheduler type: "constant" or "cosine"
+      lr_scheduler_type: constant

       # Total training steps (must be overridden at runtime)
       total_training_steps: -1
@@ -605,8 +605,8 @@ critic:
     # Minimum LR ratio for cosine schedule
     min_lr_ratio: 0.0

-    # LR warmup style: "constant" or "cosine"
-    warmup_style: constant
+    # LR scheduler type: "constant" or "cosine"
+    lr_scheduler_type: constant

     # Total training steps (must be overridden at runtime)
     total_training_steps: -1
```
```diff
@@ -21,15 +21,24 @@ class TestFSDPOptimizerConfigCPU:
     def test_default_configuration(self):
         config = FSDPOptimizerConfig(lr=0.1)
         assert config.min_lr_ratio is None
-        assert config.warmup_style == "constant"
+        assert config.lr_scheduler_type == "constant"
         assert config.num_cycles == 0.5

-    @pytest.mark.parametrize("warmup_style", ["constant", "cosine"])
-    def test_valid_warmup_styles(self, warmup_style):
-        config = FSDPOptimizerConfig(warmup_style=warmup_style, lr=0.1)
-        assert config.warmup_style == warmup_style
+    @pytest.mark.parametrize("lr_scheduler_type", ["constant", "cosine"])
+    def test_valid_lr_scheduler_types(self, lr_scheduler_type):
+        config = FSDPOptimizerConfig(lr_scheduler_type=lr_scheduler_type, lr=0.1)
+        assert config.lr_scheduler_type == lr_scheduler_type

-    def test_invalid_warmup_style(self):
+    @pytest.mark.parametrize("warmup_style", ["constant", "cosine"])
+    def test_valid_warmup_style_types(self, warmup_style):
+        config = FSDPOptimizerConfig(warmup_style=warmup_style, lr=0.1)
+        assert config.lr_scheduler_type == warmup_style
+
+    def test_invalid_lr_scheduler_type(self):
         with pytest.raises((ValueError, AssertionError)):
-            FSDPOptimizerConfig(warmup_style="invalid_style", lr=0.1)
+            FSDPOptimizerConfig(lr_scheduler_type="invalid_style", lr=0.1)
+
+    def test_invalid_warmup_style_type(self):
+        with pytest.raises((ValueError, AssertionError)):
+            FSDPOptimizerConfig(warmup_style="invalid_style", lr=0.1)
```
```diff
@@ -18,7 +18,8 @@ actor_rollout_ref:
       clip_grad: 1.0
       min_lr_ratio: 0.0
       num_cycles: 0.5
-      warmup_style: constant
+      lr_scheduler_type: constant
+      warmup_style: null
     fsdp_config:
       _target_: verl.workers.config.FSDPEngineConfig
       wrap_policy:
@@ -315,7 +316,8 @@ critic:
     clip_grad: 1.0
     min_lr_ratio: 0.0
     num_cycles: 0.5
-    warmup_style: constant
+    lr_scheduler_type: constant
+    warmup_style: null
   model:
     fsdp_config:
       _target_: verl.workers.config.FSDPEngineConfig
```
```diff
@@ -28,6 +28,8 @@ min_lr_ratio: 0.0
 # Number of cosine cycles in LR schedule
 num_cycles: 0.5

-# LR warmup style: "constant" or "cosine"
-warmup_style: constant
+# LR scheduler type: "constant" or "cosine"
+lr_scheduler_type: constant
+
+# deprecated
+warmup_style: null
```
```diff
@@ -60,16 +60,27 @@ class FSDPOptimizerConfig(OptimizerConfig):
     Args:
         lr (float): Learning rate.
         min_lr_ratio (Optional[float]): Minimum LR ratio for cosine schedule.
-        warmup_style (str): LR warmup style: "constant" or "cosine".
+        lr_scheduler_type (str): LR scheduler type: "constant" or "cosine".
         num_cycles (float): Number of cosine cycles in LR schedule.
     """

+    _mutable_fields = OptimizerConfig._mutable_fields.copy()
+    _mutable_fields.add("lr_scheduler_type")
+
     min_lr_ratio: Optional[float] = None
-    warmup_style: str = "constant"
+    # deprecate warmup_style
+    warmup_style: Optional[str] = None
+    lr_scheduler_type: str = "constant"
     num_cycles: float = 0.5

     def __post_init__(self):
-        assert self.warmup_style in ["constant", "cosine"]
+        if self.warmup_style is not None:
+            assert self.warmup_style in ["constant", "cosine"]
+            warnings.warn(
+                "`warmup_style` is deprecated, use `lr_scheduler_type` instead.", DeprecationWarning, stacklevel=2
+            )
+            self.lr_scheduler_type = self.warmup_style
+        assert self.lr_scheduler_type in ["constant", "cosine"]
         return super().__post_init__()
```
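The deprecation path above can be exercised directly; a minimal sketch (same assumed import path as earlier):

```python
import warnings

from verl.workers.config import FSDPOptimizerConfig

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    cfg = FSDPOptimizerConfig(lr=0.1, warmup_style="cosine")

# The legacy value is forwarded to the new field...
assert cfg.lr_scheduler_type == "cosine"
# ...and the caller is warned to migrate.
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```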
```diff
@@ -370,7 +370,7 @@ class FSDPEngine(BaseEngine):

         total_steps = optim_config.total_training_steps
         num_warmup_steps = optim_config.lr_warmup_steps
-        warmup_style = optim_config.warmup_style
+        lr_scheduler_type = optim_config.lr_scheduler_type
         min_lr_ratio = optim_config.min_lr_ratio
         num_cycles = optim_config.num_cycles
         if num_warmup_steps <= 0:
@@ -380,9 +380,9 @@ class FSDPEngine(BaseEngine):
         if self.rank == 0:
             print(f"Total steps: {total_steps}, num_warmup_steps: {num_warmup_steps}")

-        if warmup_style == "constant":
+        if lr_scheduler_type == "constant":
             lr_scheduler = get_constant_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=num_warmup_steps)
-        elif warmup_style == "cosine":
+        elif lr_scheduler_type == "cosine":
             lr_scheduler = get_cosine_schedule_with_warmup(
                 optimizer=optimizer,
                 num_warmup_steps=num_warmup_steps,
@@ -391,7 +391,7 @@ class FSDPEngine(BaseEngine):
                 num_cycles=num_cycles,
             )
         else:
-            raise NotImplementedError(f"Warmup style {warmup_style} is not supported")
+            raise NotImplementedError(f"LR scheduler type {lr_scheduler_type} is not supported")
         return lr_scheduler

     def _build_model_optimizer(self):
```
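Pulled out of the engine for clarity, the dispatch now keys on `lr_scheduler_type`. A standalone sketch of the same branch; the helper names come from the diff, while the cosine keyword arguments hidden between hunks are assumptions:

```python
from verl.utils.torch_functional import (
    get_constant_schedule_with_warmup,
    get_cosine_schedule_with_warmup,
)


def build_lr_scheduler(optimizer, optim_config, total_steps, num_warmup_steps):
    """Mirrors the branch above, keyed on the renamed field."""
    lr_scheduler_type = optim_config.lr_scheduler_type
    if lr_scheduler_type == "constant":
        return get_constant_schedule_with_warmup(
            optimizer=optimizer, num_warmup_steps=num_warmup_steps
        )
    if lr_scheduler_type == "cosine":
        return get_cosine_schedule_with_warmup(
            optimizer=optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=total_steps,  # assumed kwarg, not shown in the hunk
            min_lr_ratio=optim_config.min_lr_ratio,
            num_cycles=optim_config.num_cycles,
        )
    raise NotImplementedError(f"LR scheduler type {lr_scheduler_type} is not supported")
```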
```diff
@@ -529,7 +529,7 @@ class ActorRolloutRefWorker(Worker, DistProfilerExtension):

             total_steps = optim_config.get("total_training_steps", 0)
             num_warmup_steps = int(optim_config.get("lr_warmup_steps", -1))
-            warmup_style = optim_config.get("warmup_style", "constant")
+            lr_scheduler_type = optim_config.get("lr_scheduler_type", "constant")
             min_lr_ratio = optim_config.get("min_lr_ratio", 0.0)
             num_cycles = optim_config.get("num_cycles", 0.5)
             if num_warmup_steps < 0:
@@ -539,11 +539,11 @@ class ActorRolloutRefWorker(Worker, DistProfilerExtension):
             if self.rank == 0:
                 print(f"Total steps: {total_steps}, num_warmup_steps: {num_warmup_steps}")

-            if warmup_style == "constant":
+            if lr_scheduler_type == "constant":
                 actor_lr_scheduler = get_constant_schedule_with_warmup(
                     optimizer=actor_optimizer, num_warmup_steps=num_warmup_steps
                 )
-            elif warmup_style == "cosine":
+            elif lr_scheduler_type == "cosine":
                 actor_lr_scheduler = get_cosine_schedule_with_warmup(
                     optimizer=actor_optimizer,
                     num_warmup_steps=num_warmup_steps,
@@ -552,7 +552,7 @@ class ActorRolloutRefWorker(Worker, DistProfilerExtension):
                     num_cycles=num_cycles,
                 )
             else:
-                raise NotImplementedError(f"Warmup style {warmup_style} is not supported")
+                raise NotImplementedError(f"LR scheduler type {lr_scheduler_type} is not supported")

             log_gpu_memory_usage(f"After {role} optimizer init", logger=logger)
         else:
```
```diff
@@ -1386,7 +1386,8 @@ class CriticWorker(Worker, DistProfilerExtension):

         total_steps = config.optim.get("total_training_steps", 0)
         num_warmup_steps = int(config.optim.get("lr_warmup_steps", -1))
-        warmup_style = config.optim.get("warmup_style", "constant")
+
+        lr_scheduler_type = config.optim.get("lr_scheduler_type", "constant")
         if num_warmup_steps < 0:
             num_warmup_steps_ratio = config.optim.get("lr_warmup_steps_ratio", 0.0)
             num_warmup_steps = int(num_warmup_steps_ratio * total_steps)
@@ -1396,11 +1397,11 @@ class CriticWorker(Worker, DistProfilerExtension):

         from verl.utils.torch_functional import get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup

-        if warmup_style == "constant":
+        if lr_scheduler_type == "constant":
             critic_lr_scheduler = get_constant_schedule_with_warmup(
                 optimizer=critic_optimizer, num_warmup_steps=num_warmup_steps
             )
-        elif warmup_style == "cosine":
+        elif lr_scheduler_type == "cosine":
             min_lr_ratio = config.optim.get("min_lr_ratio", 0.0)
             num_cycles = config.optim.get("num_cycles", 0.5)
             critic_lr_scheduler = get_cosine_schedule_with_warmup(
@@ -1411,7 +1412,7 @@ class CriticWorker(Worker, DistProfilerExtension):
                 num_cycles=num_cycles,
             )
         else:
-            raise NotImplementedError(f"Warmup style {warmup_style} is not supported")
+            raise NotImplementedError(f"LR scheduler type {lr_scheduler_type} is not supported")

         return critic_module, critic_optimizer, critic_lr_scheduler
```
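Note that the worker paths read the config dict-style with a default, so existing configs that simply omit the new key still resolve to `"constant"`. A minimal sketch with OmegaConf (the config library used throughout verl):

```python
from omegaconf import OmegaConf

optim_config = OmegaConf.create({"lr": 1e-6})  # neither key present
lr_scheduler_type = optim_config.get("lr_scheduler_type", "constant")
assert lr_scheduler_type == "constant"
```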