[BREAKING][rollout] feat: allow users to pass all vllm/sglang engine args (#3037)

This PR allows users to pass all vllm/sglang engine args and optimizes
Qwen3 rollout speed via vLLM engine arguments.

1. Deprecate the default values of the previous engine_kwargs.
2. Pass all engine_kwargs through to the vllm/sglang engine (see the sketch below).
3. Optimize Qwen3-235B rollout speed by setting TP=8 and enabling expert
parallelism.
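
A minimal sketch of the new passthrough behavior, using plain dicts and hypothetical key values (`swap_space`, `enable_expert_parallel`, and `attention_backend` are only examples of what a user might set; the actual code lives in the rollout workers shown in the diff below): every key under `engine_kwargs.vllm` / `engine_kwargs.sglang` is forwarded verbatim to the engine constructor, with `None` entries dropped so the engine keeps its own defaults.

```python
# Hypothetical illustration of the passthrough: user-provided engine kwargs are
# forwarded as-is; None values are filtered out so the engine's defaults apply.
rollout_config = {
    "engine_kwargs": {
        "vllm": {"swap_space": 32, "enable_expert_parallel": True, "seed": None},
        "sglang": {"attention_backend": "flashinfer"},
    }
}

vllm_kwargs = rollout_config.get("engine_kwargs", {}).get("vllm", {}) or {}
vllm_kwargs = {k: v for k, v in vllm_kwargs.items() if v is not None}

# The surviving kwargs would then be splatted into the engine constructor,
# e.g. LLM(model=..., **vllm_kwargs) for vLLM.
print(vllm_kwargs)  # {'swap_space': 32, 'enable_expert_parallel': True}
```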

From top to bottom: TP=16 without EP, TP=8 without EP, and TP=8 with EP.
<img width="1000" height="808" alt="Qwen3-235B rollout speed comparison" src="https://github.com/user-attachments/assets/6b096be4-3896-4e96-8916-d8d6e13a58cc" />

PS: DeepSeek-V3's rollout slows down after enabling expert parallelism.
Authored by kang sheng on 2025-08-14 19:12:26 +08:00, committed by GitHub
parent bd3b735514, commit bd756c15c8
18 changed files with 47 additions and 74 deletions


@ -180,12 +180,9 @@ Actor/Rollout/Reference Policy
log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
# for hf rollout
do_sample: True
engine_kwargs: # inference engine parameters
vllm:
swap_space: null # null means "use the engine default value" (usually 4 GB), setting it to, e.g., 32 means 32 GB
disable_mm_preprocessor_cache: False # disable preprocessor cache for multimodel models
sglang:
attention_backend: null # null means use the engine default value, available options: flashinfer, triton, flashmla
engine_kwargs: # inference engine parameters; please refer to the vllm/sglang official docs for details
vllm: {}
sglang: {}
n: 1 # for each prompt, sample n responses (i.e. num sample times). set it to values > 1 for grpo, rloo
val_kwargs:
@ -353,19 +350,9 @@ Reference model will be enabled when ``actor.use_kl_loss`` or/and ``algorithm.us
deterministic outputs. When set to True, the rollout will use the ``actor_rollout_ref.rollout.val_kwargs`` parameters
(top_k, top_p, temperature) to control the sampling behavior.
- ``actor_rollout_ref.rollout.engine_kwargs.vllm``: extra vllm engine args
- ``actor_rollout_ref.rollout.engine_kwargs.vllm``: extra vllm engine args; please refer to the vllm official docs for details
- ``swap_space``: swap space in GB used by the inference engine. Positive integer, e.g., ``32`` means 32 GB. ``null``: means not setting and using the engine default value (usually, e.g., 4 GB for vLLM)
- ``disable_mm_preprocessor_cache``: Whether to disable preprocessor cache for multimodel models.
- ``actor_rollout_ref.rollout.engine_kwargs.sglang``: extra sglang engine args
- ``attention_backend``: The attention backend to use for the inference engine.
- ``null``: means not setting and using the engine default value (usually, e.g., ``fa3`` for SGLang)
- ``flashinfer``: Use flashinfer attention backend.
- ``triton``: Use triton attention backend.
- ``flashmla``: Use flashmla attention backend.
- ``actor_rollout_ref.rollout.engine_kwargs.sglang``: extra sglang engine args; please refer to the sglang official docs for details
- ``actor_rollout_ref.rollout.ignore_eos``: Whether to ignore the EOS
token and continue generating tokens after the EOS token is generated.


@ -68,7 +68,7 @@ EP=${EP:-8}
ETP=1
CP=1
optimizer_offload_fraction=${OFFLOAD_FRACTION:-1.}
last_layer=${LAST_LAYER:-6}
LAST_LAYER=${LAST_LAYER:-6}
project_name='verl-deepseek-v3'


@ -62,7 +62,7 @@ python3 -m verl.trainer.main_ppo --config-path=config \
actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=20480 \
actor_rollout_ref.rollout.name=$ENGINE \
actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
actor_rollout_ref.rollout.n=5 \
actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \


@ -26,7 +26,7 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=$ENGINE \
actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.rollout.enable_chunked_prefill=False \
actor_rollout_ref.rollout.enforce_eager=False \


@ -32,7 +32,7 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=$ENGINE \
actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.rollout.enable_chunked_prefill=False \
actor_rollout_ref.rollout.enforce_eager=False \


@ -31,7 +31,7 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
actor_rollout_ref.rollout.tensor_model_parallel_size=8 \
actor_rollout_ref.rollout.name=$ENGINE \
actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
actor_rollout_ref.rollout.enable_chunked_prefill=False \
actor_rollout_ref.rollout.enforce_eager=True \


@ -31,7 +31,7 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
actor_rollout_ref.rollout.name=$ENGINE \
actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
actor_rollout_ref.rollout.enable_chunked_prefill=False \
actor_rollout_ref.rollout.enforce_eager=True \


@ -31,7 +31,7 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
actor_rollout_ref.rollout.name=$ENGINE \
actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
actor_rollout_ref.rollout.enable_chunked_prefill=False \
actor_rollout_ref.rollout.enforce_eager=True \


@ -57,7 +57,7 @@ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 10 / 10))
infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 1))
offload=True
OPTIM_OFFLOAD=${OPTIM_OFFLOAD:-True}
gen_tp=16
gen_tp=8
train_tp=${TP:-4}
train_pp=${PP:-8}
@ -128,6 +128,7 @@ python3 -m verl.trainer.main_ppo \
actor_rollout_ref.actor.optim.clip_grad=1.0 \
actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
actor_rollout_ref.rollout.gpu_memory_utilization=0.85 \
+actor_rollout_ref.rollout.engine_kwargs.vllm.enable_expert_parallel=True \
actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
actor_rollout_ref.rollout.enable_chunked_prefill=True \
actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \


@ -255,7 +255,8 @@ def _hf_casual_fwd_bwd(config, sp_size, dp_size):
grad = model.model.layers[0].self_attn.q_proj.weight.grad
grad_full = model_no_sp.model.layers[0].self_attn.q_proj.weight.grad
torch.testing.assert_close(mean_local, mean_full, rtol=1e-2, atol=3e-5)
torch.testing.assert_close(grad, grad_full, atol=1e-2, rtol=3e-5)
# The check should be less strict because the gradient is not an averaged value.
torch.testing.assert_close(grad, grad_full, rtol=1e-2, atol=1e-3)
if __name__ == "__main__":


@ -183,12 +183,9 @@ actor_rollout_ref:
gate_proj_layer_name: gate_up
# number of responses (i.e. num sample times)
n: 1
engine_kwargs: # inference engine parameters
vllm:
swap_space: null # null means "use the engine default value" (usually 4 GB), setting it to, e.g., 32 means 32 GB
disable_mm_preprocessor_cache: False # whether to disable the preprocessor cache for multimodel models.
sglang:
attention_backend: null # null means use the engine default value, available options: flashinfer, triton, flashmla
engine_kwargs: # inference engine parameters; please refer to the vllm/sglang official docs for details
vllm: {}
sglang: {}
val_kwargs:
# sampling parameters for validation
top_k: -1 # 0 for hf rollout, -1 for vllm rollout


@ -476,23 +476,14 @@ actor_rollout_ref:
# Whether to wake up inference engine in multi-stage to reduce peak memory during training-rollout transition.
multi_stage_wake_up: false
# Extra inference engine arguments (vllm, sglang).
# Extra inference engine arguments; please refer to the vllm/sglang official docs for details
engine_kwargs:
# for vllm
vllm:
# vllm engine config
vllm: {}
# Swap space (in GB) used by inference engine. null uses default (e.g., 4 GB).
swap_space: null
# Whether to disable the preprocessor cache for multimodel models.
disable_mm_preprocessor_cache: False
# for sglang
sglang:
# The attention backend for sglang engine. Options: flashinfer, triton, flashmla, null for default.
attention_backend: null
# sglang engine config
sglang: {}
# Sampling parameters used during validation.
val_kwargs:


@ -166,11 +166,8 @@ actor_rollout_ref:
over_sample_rate: 0
multi_stage_wake_up: false
engine_kwargs:
vllm:
swap_space: null
disable_mm_preprocessor_cache: false
sglang:
attention_backend: null
vllm: {}
sglang: {}
val_kwargs:
_target_: verl.workers.config.SamplingConfig
top_k: -1


@ -141,11 +141,8 @@ actor_rollout_ref:
over_sample_rate: 0
multi_stage_wake_up: false
engine_kwargs:
vllm:
swap_space: null
disable_mm_preprocessor_cache: false
sglang:
attention_backend: null
vllm: {}
sglang: {}
val_kwargs:
_target_: verl.workers.config.SamplingConfig
top_k: -1


@ -91,23 +91,14 @@ over_sample_rate: 0
# This is only effective for SGLang rollout.
multi_stage_wake_up: false
# Extra inference engine arguments (vllm, sglang).
# Extra inference engine arguments (vllm, sglang); please refer to the vllm/sglang official docs for details
engine_kwargs:
# for vllm
vllm:
# vllm engine config
vllm: {}
# Swap space (in GB) used by inference engine. null uses default (e.g., 4 GB).
swap_space: null
# Whether to disable the preprocessor cache for multimodel models.
disable_mm_preprocessor_cache: False
# for sglang
sglang:
# The attention backend for sglang engine. Options: flashinfer, triton, flashmla, null for default.
attention_backend: null
# sglang engine config
sglang: {}
# Sampling parameters used during validation.
val_kwargs:


@ -437,8 +437,11 @@ class SGLangRollout(BaseRollout):
tp_size_per_node = self._tp_size // nnodes
node_rank = self._tp_rank // tp_size_per_node
first_rank_in_node = self._tp_rank % tp_size_per_node == 0
engine_kwargs = self.config.get("engine_kwargs", {}).get("sglang", {})
attention_backend = engine_kwargs.get("attention_backend", None)
engine_kwargs = self.config.get("engine_kwargs", {}).get("sglang", {}) or {}
engine_kwargs = {key: val for key, val in engine_kwargs.items() if val is not None}
# attention backend will be changed to fa3 if not specified
attention_backend = engine_kwargs.pop("attention_backend", None)
if first_rank_in_node:
rank = dist.get_rank()
@ -471,6 +474,7 @@ class SGLangRollout(BaseRollout):
# In async mode for AgentLoop, SGLang support token in token out to avoid the tokenizer
# inconsistency issue.
skip_tokenizer_init=self.config.mode == "async",
**engine_kwargs,
)
else:
self._engine = None
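
For reference, a standalone sketch of the SGLang branch above, assuming a plain dict config (the real code reads `self.config` and hands the result to SGLang's engine constructor):

```python
def prepare_sglang_kwargs(config):
    """Filter user-supplied sglang engine kwargs, mirroring the diff above.

    None values are dropped so sglang keeps its own defaults, and
    attention_backend is popped out because it is handled separately
    (the backend falls back to fa3 when it is not specified).
    """
    engine_kwargs = config.get("engine_kwargs", {}).get("sglang", {}) or {}
    engine_kwargs = {key: val for key, val in engine_kwargs.items() if val is not None}
    attention_backend = engine_kwargs.pop("attention_backend", None)
    return engine_kwargs, attention_backend


# Example with illustrative keys: None entries vanish, attention_backend is split off.
kwargs, backend = prepare_sglang_kwargs(
    {"engine_kwargs": {"sglang": {"attention_backend": "triton", "mem_fraction_static": None}}}
)
print(kwargs, backend)  # {} triton
```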


@ -256,6 +256,12 @@ class AsyncvLLMServer(AsyncServerBase):
else:
logger.warning(f"cudagraph_capture_sizes must be a list, but got {cudagraph_capture_sizes}")
engine_kwargs = config.get("engine_kwargs", {}).get("vllm", {}) or {}
engine_kwargs = {key: val for key, val in engine_kwargs.items() if val is not None}
if config.get("limit_images", None): # support for multi-image data
engine_kwargs["limit_mm_per_prompt"] = {"image": config.get("limit_images")}
engine_args = AsyncEngineArgs(
model=local_path,
enable_sleep_mode=config.free_cache_engine,
@ -277,6 +283,7 @@ class AsyncvLLMServer(AsyncServerBase):
trust_remote_code=trust_remote_code,
seed=config.get("seed", 0),
**compilation_config,
**engine_kwargs,
)
# init async llm engine
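
A rough sketch of how the async vLLM server assembles its extra constructor kwargs after this change (dict-based and simplified; `AsyncEngineArgs` is only referenced in a comment, and the config keys are illustrative):

```python
def build_vllm_engine_kwargs(config):
    """Collect extra kwargs for the vLLM engine, mirroring the diff above.

    User-provided engine_kwargs.vllm entries are forwarded minus None values,
    and limit_images is translated into vLLM's limit_mm_per_prompt format.
    """
    engine_kwargs = config.get("engine_kwargs", {}).get("vllm", {}) or {}
    engine_kwargs = {key: val for key, val in engine_kwargs.items() if val is not None}
    if config.get("limit_images", None):  # support for multi-image data
        engine_kwargs["limit_mm_per_prompt"] = {"image": config.get("limit_images")}
    return engine_kwargs


# The result is splatted into AsyncEngineArgs(..., **engine_kwargs) alongside the
# fixed arguments shown in the diff.
print(build_vllm_engine_kwargs({"engine_kwargs": {"vllm": {"swap_space": 16}}, "limit_images": 2}))
# {'swap_space': 16, 'limit_mm_per_prompt': {'image': 2}}
```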


@ -153,7 +153,7 @@ class vLLMRollout(BaseRollout):
lora_kwargs = kwargs.pop("lora_kwargs", {})
self.lora_kwargs = lora_kwargs
# copy it to avoid secretly modifying the engine config
engine_kwargs = config.get("engine_kwargs", {}).get("vllm", {})
engine_kwargs = config.get("engine_kwargs", {}).get("vllm", {}) or {}
# For each vLLM engine parameter,
# - `None` means not setting it, so we pop it, and leave it to vLLM default value