diff --git a/docs/examples/config.rst b/docs/examples/config.rst
index a59da7e5f..3f3a9b917 100644
--- a/docs/examples/config.rst
+++ b/docs/examples/config.rst
@@ -180,12 +180,9 @@ Actor/Rollout/Reference Policy
     log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu} # for hf rollout
     do_sample: True
-    engine_kwargs: # inference engine parameters
-      vllm:
-        swap_space: null # null means "use the engine default value" (usually 4 GB), setting it to, e.g., 32 means 32 GB
-        disable_mm_preprocessor_cache: False # disable preprocessor cache for multimodel models
-      sglang:
-        attention_backend: null # null means use the engine default value, available options: flashinfer, triton, flashmla
+    engine_kwargs: # inference engine parameters; please refer to the vLLM/SGLang official docs for details
+      vllm: {}
+      sglang: {}
     n: 1 # for each prompt, sample n responses (i.e. num sample times). set it to values > 1 for grpo, rloo
     val_kwargs:
@@ -353,19 +350,9 @@ Reference model will be enabled when ``actor.use_kl_loss`` or/and ``algorithm.us
   deterministic outputs. When set to True, the rollout will use the ``actor_rollout_ref.rollout.val_kwargs``
   parameters (top_k, top_p, temperature) to control the sampling behavior.
 
-- ``actor_rollout_ref.rollout.engine_kwargs.vllm``: extra vllm engine args
+- ``actor_rollout_ref.rollout.engine_kwargs.vllm``: extra vllm engine args; please refer to the vLLM official documentation for details
-
-  - ``swap_space``: swap space in GB used by the inference engine. Positive integer, e.g., ``32`` means 32 GB. ``null``: means not setting and using the engine default value (usually, e.g., 4 GB for vLLM)
-  - ``disable_mm_preprocessor_cache``: Whether to disable preprocessor cache for multimodel models.
-
-- ``actor_rollout_ref.rollout.engine_kwargs.sglang``: extra sglang engine args
-
-  - ``attention_backend``: The attention backend to use for the inference engine.
-
-    - ``null``: means not setting and using the engine default value (usually, e.g., ``fa3`` for SGLang)
-    - ``flashinfer``: Use flashinfer attention backend.
-    - ``triton``: Use triton attention backend.
-    - ``flashmla``: Use flashmla attention backend.
+- ``actor_rollout_ref.rollout.engine_kwargs.sglang``: extra sglang engine args; please refer to the SGLang official documentation for details
 
 - ``actor_rollout_ref.rollout.ignore_eos``: Whether to ignore the EOS token and continue generating tokens after the EOS token is generated.
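With the per-key defaults removed from the schema, anything placed under ``engine_kwargs.vllm`` or ``engine_kwargs.sglang`` is forwarded to the engine as-is, and keys that are no longer declared in the default config must be added with Hydra's ``+`` prefix when overriding from the command line, as the example scripts below do. A minimal sketch (the model setup and the ``swap_space=32`` value are illustrative, not taken from this patch):

    python3 -m verl.trainer.main_ppo \
        actor_rollout_ref.rollout.name=vllm \
        +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
        +actor_rollout_ref.rollout.engine_kwargs.vllm.swap_space=32 \
        ...

Keys whose value is ``null`` are filtered out before they reach the engine, so anything left unset falls back to the engine's own default.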
diff --git a/examples/grpo_trainer/run_deepseek671b_math_megatron_96gb.sh b/examples/grpo_trainer/run_deepseek671b_math_megatron_96gb.sh
index 985624e57..ede8eeda7 100644
--- a/examples/grpo_trainer/run_deepseek671b_math_megatron_96gb.sh
+++ b/examples/grpo_trainer/run_deepseek671b_math_megatron_96gb.sh
@@ -68,7 +68,7 @@ EP=${EP:-8}
 ETP=1
 CP=1
 optimizer_offload_fraction=${OFFLOAD_FRACTION:-1.}
-last_layer=${LAST_LAYER:-6}
+LAST_LAYER=${LAST_LAYER:-6}
 
 project_name='verl-deepseek-v3'
diff --git a/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh b/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh
index d0de1aac5..632bdc8fa 100644
--- a/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh
+++ b/examples/grpo_trainer/run_qwen2_5_vl-7b-megatron.sh
@@ -62,7 +62,7 @@ python3 -m verl.trainer.main_ppo --config-path=config \
     actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
     actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=20480 \
     actor_rollout_ref.rollout.name=$ENGINE \
-    actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+    +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
     actor_rollout_ref.rollout.n=5 \
     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \
diff --git a/examples/grpo_trainer/run_qwen2_5_vl-7b.sh b/examples/grpo_trainer/run_qwen2_5_vl-7b.sh
index 450390e25..371ed228f 100644
--- a/examples/grpo_trainer/run_qwen2_5_vl-7b.sh
+++ b/examples/grpo_trainer/run_qwen2_5_vl-7b.sh
@@ -26,7 +26,7 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
     actor_rollout_ref.rollout.name=$ENGINE \
-    actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+    +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
     actor_rollout_ref.rollout.enable_chunked_prefill=False \
     actor_rollout_ref.rollout.enforce_eager=False \
diff --git a/examples/grpo_trainer/run_qwen2_5_vl-7b_lora.sh b/examples/grpo_trainer/run_qwen2_5_vl-7b_lora.sh
index b00ad8087..cb1af5b08 100644
--- a/examples/grpo_trainer/run_qwen2_5_vl-7b_lora.sh
+++ b/examples/grpo_trainer/run_qwen2_5_vl-7b_lora.sh
@@ -32,7 +32,7 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
     actor_rollout_ref.rollout.name=$ENGINE \
-    actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+    +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
     actor_rollout_ref.rollout.enable_chunked_prefill=False \
     actor_rollout_ref.rollout.enforce_eager=False \
diff --git a/examples/grpo_trainer/run_qwen2_5_vl_32b_npu.sh b/examples/grpo_trainer/run_qwen2_5_vl_32b_npu.sh
index ef1301126..c29838a33 100644
--- a/examples/grpo_trainer/run_qwen2_5_vl_32b_npu.sh
+++ b/examples/grpo_trainer/run_qwen2_5_vl_32b_npu.sh
@@ -31,7 +31,7 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=8 \
     actor_rollout_ref.rollout.name=$ENGINE \
-    actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+    +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
     actor_rollout_ref.rollout.enable_chunked_prefill=False \
     actor_rollout_ref.rollout.enforce_eager=True \
diff --git a/examples/grpo_trainer/run_qwen2_5_vl_3b_npu.sh b/examples/grpo_trainer/run_qwen2_5_vl_3b_npu.sh
index b319dee99..07ab65ee2 100644
--- a/examples/grpo_trainer/run_qwen2_5_vl_3b_npu.sh
+++ b/examples/grpo_trainer/run_qwen2_5_vl_3b_npu.sh
@@ -31,7 +31,7 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
     actor_rollout_ref.rollout.name=$ENGINE \
-    actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+    +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
     actor_rollout_ref.rollout.enable_chunked_prefill=False \
     actor_rollout_ref.rollout.enforce_eager=True \
diff --git a/examples/grpo_trainer/run_qwen2_5_vl_7b_npu.sh b/examples/grpo_trainer/run_qwen2_5_vl_7b_npu.sh
index 913da5424..6d8f95981 100644
--- a/examples/grpo_trainer/run_qwen2_5_vl_7b_npu.sh
+++ b/examples/grpo_trainer/run_qwen2_5_vl_7b_npu.sh
@@ -31,7 +31,7 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
     actor_rollout_ref.rollout.name=$ENGINE \
-    actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+    +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
     actor_rollout_ref.rollout.enable_chunked_prefill=False \
     actor_rollout_ref.rollout.enforce_eager=True \
diff --git a/examples/grpo_trainer/run_qwen3-235b_megatron_96gb.sh b/examples/grpo_trainer/run_qwen3-235b_megatron_96gb.sh
index 7b3b49915..854beae46 100644
--- a/examples/grpo_trainer/run_qwen3-235b_megatron_96gb.sh
+++ b/examples/grpo_trainer/run_qwen3-235b_megatron_96gb.sh
@@ -57,7 +57,7 @@ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 10 / 10))
 infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 1))
 offload=True
 OPTIM_OFFLOAD=${OPTIM_OFFLOAD:-True}
-gen_tp=16
+gen_tp=8
 train_tp=${TP:-4}
 train_pp=${PP:-8}
 
@@ -128,6 +128,7 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.actor.optim.clip_grad=1.0 \
     actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.85 \
+    +actor_rollout_ref.rollout.engine_kwargs.vllm.enable_expert_parallel=True \
     actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
     actor_rollout_ref.rollout.enable_chunked_prefill=True \
     actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
diff --git a/tests/models/test_transformers_ulysses.py b/tests/models/test_transformers_ulysses.py
index 1f900dfe9..15a2f1cbe 100644
--- a/tests/models/test_transformers_ulysses.py
+++ b/tests/models/test_transformers_ulysses.py
@@ -255,7 +255,8 @@ def _hf_casual_fwd_bwd(config, sp_size, dp_size):
     grad = model.model.layers[0].self_attn.q_proj.weight.grad
     grad_full = model_no_sp.model.layers[0].self_attn.q_proj.weight.grad
     torch.testing.assert_close(mean_local, mean_full, rtol=1e-2, atol=3e-5)
-    torch.testing.assert_close(grad, grad_full, atol=1e-2, rtol=3e-5)
+    # The check should be less strict because the gradient is not an averaged value.
+    torch.testing.assert_close(grad, grad_full, rtol=1e-2, atol=1e-3)
 
 
 if __name__ == "__main__":
diff --git a/tests/trainer/config/legacy_ppo_megatron_trainer.yaml b/tests/trainer/config/legacy_ppo_megatron_trainer.yaml
index 66c82b89c..14070305f 100644
--- a/tests/trainer/config/legacy_ppo_megatron_trainer.yaml
+++ b/tests/trainer/config/legacy_ppo_megatron_trainer.yaml
@@ -183,12 +183,9 @@ actor_rollout_ref:
       gate_proj_layer_name: gate_up
       # number of responses (i.e. num sample times)
       n: 1
-      engine_kwargs: # inference engine parameters
-        vllm:
-          swap_space: null # null means "use the engine default value" (usually 4 GB), setting it to, e.g., 32 means 32 GB
-          disable_mm_preprocessor_cache: False # whether to disable the preprocessor cache for multimodel models.
-        sglang:
-          attention_backend: null # null means use the engine default value, available options: flashinfer, triton, flashmla
+      engine_kwargs: # inference engine parameters; please refer to the vLLM/SGLang official docs for details
+        vllm: {}
+        sglang: {}
       val_kwargs:
         # sampling parameters for validation
         top_k: -1 # 0 for hf rollout, -1 for vllm rollout
diff --git a/tests/trainer/config/legacy_ppo_trainer.yaml b/tests/trainer/config/legacy_ppo_trainer.yaml
index bfd74bca4..454efe932 100644
--- a/tests/trainer/config/legacy_ppo_trainer.yaml
+++ b/tests/trainer/config/legacy_ppo_trainer.yaml
@@ -476,23 +476,14 @@ actor_rollout_ref:
     # Whether to wake up inference engine in multi-stage to reduce peak memory during training-rollout transition.
     multi_stage_wake_up: false
 
-    # Extra inference engine arguments (vllm, sglang).
+    # Extra inference engine arguments; please refer to the vLLM/SGLang official docs for details.
     engine_kwargs:
 
-      # for vllm
-      vllm:
+      # vllm engine config
+      vllm: {}
 
-        # Swap space (in GB) used by inference engine. null uses default (e.g., 4 GB).
-        swap_space: null
-
-        # Whether to disable the preprocessor cache for multimodel models.
-        disable_mm_preprocessor_cache: False
-
-      # for sglang
-      sglang:
-
-        # The attention backend for sglang engine. Options: flashinfer, triton, flashmla, null for default.
-        attention_backend: null
+      # sglang engine config
+      sglang: {}
 
     # Sampling parameters used during validation.
     val_kwargs:
diff --git a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml
index b030fddac..13aee363b 100644
--- a/verl/trainer/config/_generated_ppo_megatron_trainer.yaml
+++ b/verl/trainer/config/_generated_ppo_megatron_trainer.yaml
@@ -166,11 +166,8 @@ actor_rollout_ref:
     over_sample_rate: 0
     multi_stage_wake_up: false
     engine_kwargs:
-      vllm:
-        swap_space: null
-        disable_mm_preprocessor_cache: false
-      sglang:
-        attention_backend: null
+      vllm: {}
+      sglang: {}
     val_kwargs:
       _target_: verl.workers.config.SamplingConfig
       top_k: -1
diff --git a/verl/trainer/config/_generated_ppo_trainer.yaml b/verl/trainer/config/_generated_ppo_trainer.yaml
index f90422bb5..de3442413 100644
--- a/verl/trainer/config/_generated_ppo_trainer.yaml
+++ b/verl/trainer/config/_generated_ppo_trainer.yaml
@@ -141,11 +141,8 @@ actor_rollout_ref:
     over_sample_rate: 0
     multi_stage_wake_up: false
     engine_kwargs:
-      vllm:
-        swap_space: null
-        disable_mm_preprocessor_cache: false
-      sglang:
-        attention_backend: null
+      vllm: {}
+      sglang: {}
     val_kwargs:
       _target_: verl.workers.config.SamplingConfig
       top_k: -1
diff --git a/verl/trainer/config/rollout/rollout.yaml b/verl/trainer/config/rollout/rollout.yaml
index acb88728b..ba6261ed6 100644
--- a/verl/trainer/config/rollout/rollout.yaml
+++ b/verl/trainer/config/rollout/rollout.yaml
@@ -91,23 +91,14 @@ over_sample_rate: 0
 
 # This is only effective for SGLang rollout.
 multi_stage_wake_up: false
 
-# Extra inference engine arguments (vllm, sglang).
+# Extra inference engine arguments (vllm, sglang); please refer to the vLLM/SGLang official docs for details.
 engine_kwargs:
 
-  # for vllm
-  vllm:
+  # vllm engine config
+  vllm: {}
 
-    # Swap space (in GB) used by inference engine. null uses default (e.g., 4 GB).
-    swap_space: null
-
-    # Whether to disable the preprocessor cache for multimodel models.
-    disable_mm_preprocessor_cache: False
-
-  # for sglang
-  sglang:
-
-    # The attention backend for sglang engine. Options: flashinfer, triton, flashmla, null for default.
-    attention_backend: null
+  # sglang engine config
+  sglang: {}
 
 # Sampling parameters used during validation.
 val_kwargs:
diff --git a/verl/workers/rollout/sglang_rollout/sglang_rollout.py b/verl/workers/rollout/sglang_rollout/sglang_rollout.py
index 4bb9e4674..188ceb3cf 100644
--- a/verl/workers/rollout/sglang_rollout/sglang_rollout.py
+++ b/verl/workers/rollout/sglang_rollout/sglang_rollout.py
@@ -437,8 +437,11 @@ class SGLangRollout(BaseRollout):
         tp_size_per_node = self._tp_size // nnodes
         node_rank = self._tp_rank // tp_size_per_node
         first_rank_in_node = self._tp_rank % tp_size_per_node == 0
-        engine_kwargs = self.config.get("engine_kwargs", {}).get("sglang", {})
-        attention_backend = engine_kwargs.get("attention_backend", None)
+        engine_kwargs = self.config.get("engine_kwargs", {}).get("sglang", {}) or {}
+        engine_kwargs = {key: val for key, val in engine_kwargs.items() if val is not None}
+
+        # the attention backend defaults to fa3 if not specified
+        attention_backend = engine_kwargs.pop("attention_backend", None)
 
         if first_rank_in_node:
             rank = dist.get_rank()
@@ -471,6 +474,7 @@ class SGLangRollout(BaseRollout):
                 # In async mode for AgentLoop, SGLang support token in token out to avoid the tokenizer
                 # inconsistency issue.
                 skip_tokenizer_init=self.config.mode == "async",
+                **engine_kwargs,
             )
         else:
             self._engine = None
diff --git a/verl/workers/rollout/vllm_rollout/vllm_async_server.py b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
index b12670b5d..7b3aa4312 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_async_server.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_async_server.py
@@ -256,6 +256,12 @@ class AsyncvLLMServer(AsyncServerBase):
         else:
             logger.warning(f"cudagraph_capture_sizes must be a list, but got {cudagraph_capture_sizes}")
 
+        engine_kwargs = config.get("engine_kwargs", {}).get("vllm", {}) or {}
+
+        engine_kwargs = {key: val for key, val in engine_kwargs.items() if val is not None}
+        if config.get("limit_images", None):  # support for multi-image data
+            engine_kwargs["limit_mm_per_prompt"] = {"image": config.get("limit_images")}
+
         engine_args = AsyncEngineArgs(
             model=local_path,
             enable_sleep_mode=config.free_cache_engine,
@@ -277,6 +283,7 @@ class AsyncvLLMServer(AsyncServerBase):
             trust_remote_code=trust_remote_code,
             seed=config.get("seed", 0),
             **compilation_config,
+            **engine_kwargs,
         )
 
         # init async llm engine
diff --git a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
index 2de8024a3..7e85b4fa0 100644
--- a/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
+++ b/verl/workers/rollout/vllm_rollout/vllm_rollout_spmd.py
@@ -153,7 +153,7 @@ class vLLMRollout(BaseRollout):
         lora_kwargs = kwargs.pop("lora_kwargs", {})
         self.lora_kwargs = lora_kwargs
         # copy it to avoid secretly modifying the engine config
-        engine_kwargs = config.get("engine_kwargs", {}).get("vllm", {})
+        engine_kwargs = config.get("engine_kwargs", {}).get("vllm", {}) or {}
         # For each vLLM engine parameter,
         # - `None` means not setting it, so we pop it, and leave it to vLLM default value
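Taken together, the rollout backends now treat engine_kwargs as an opaque pass-through: read the per-engine section, drop None entries, and splat the rest into the engine constructor. A minimal self-contained sketch of that pattern (FakeEngine and the sample config are hypothetical stand-ins, not verl or vLLM APIs):

    # Sketch only: mirrors the filtering pattern added in this diff, not the real classes.
    def build_engine_kwargs(config: dict, backend: str) -> dict:
        # An empty or None section yields {}, matching `... .get(backend, {}) or {}`.
        kwargs = (config.get("engine_kwargs") or {}).get(backend) or {}
        # Drop None values so unset keys fall back to the engine defaults.
        return {key: val for key, val in kwargs.items() if val is not None}


    class FakeEngine:
        """Hypothetical stand-in for an inference engine constructor."""

        def __init__(self, model: str, **extra):
            self.model = model
            self.extra = extra  # whatever survived the filter is forwarded verbatim


    if __name__ == "__main__":
        cfg = {
            "engine_kwargs": {
                "vllm": {
                    "swap_space": 32,
                    "disable_mm_preprocessor_cache": True,
                    "enable_expert_parallel": None,  # dropped: engine default applies
                }
            }
        }
        engine = FakeEngine(model="Qwen/Qwen2.5-7B", **build_engine_kwargs(cfg, "vllm"))
        print(engine.extra)  # {'swap_space': 32, 'disable_mm_preprocessor_cache': True}

Filtering None before the splat is what lets the YAML ship bare `vllm: {}` / `sglang: {}` defaults while a `null` override still means "use the engine default".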