Mirror of https://github.com/volcengine/verl.git (synced 2025-10-20 13:43:50 +08:00)
[BREAKING][rollout] feat: allow users pass all vllm/sglang engine args (#3037)
This PR allows users to pass arbitrary vLLM/SGLang engine arguments and speeds up Qwen3 rollout via vLLM engine arguments.

1. Deprecate the default values of the previous `engine_kwargs`.
2. Pass all `engine_kwargs` through to the vLLM/SGLang engine.
3. Optimize Qwen3-235B rollout speed by setting TP=8 and enabling expert parallelism.

From top to bottom: tp=16 without EP, tp=8 without EP, and tp=8 with EP.

![Qwen3-235B rollout speed comparison](https://github.com/user-attachments/assets/6b096be4-3896-4e96-8916-d8d6e13a58cc)

PS: DeepSeek-V3's rollout slows down after enabling expert parallelism.
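In rough terms, the new behavior works like the sketch below: whatever the user places under `engine_kwargs.vllm` (or `engine_kwargs.sglang`) is forwarded verbatim to the engine constructor, with `None` values dropped so the engine's own defaults apply. This is a minimal illustration of the idea, not the actual verl code; `build_engine`, `build_fn`, and the sample kwargs are made up for the example.

```python
# Minimal sketch of the engine_kwargs pass-through introduced by this PR (illustrative only).
def build_engine(config: dict, build_fn):
    engine_kwargs = (config.get("engine_kwargs", {}) or {}).get("vllm", {}) or {}
    # None means "not set": drop it and let the engine fall back to its own default.
    engine_kwargs = {k: v for k, v in engine_kwargs.items() if v is not None}
    return build_fn(model=config["model_path"], **engine_kwargs)

cfg = {
    "model_path": "Qwen/Qwen3-235B-A22B",
    "engine_kwargs": {"vllm": {"enable_expert_parallel": True, "swap_space": 32, "seed": None}},
}
# Using a stand-in build_fn that just echoes the kwargs it would receive:
print(build_engine(cfg, build_fn=lambda **kw: kw))
# -> {'model': 'Qwen/Qwen3-235B-A22B', 'enable_expert_parallel': True, 'swap_space': 32}
```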
@@ -180,12 +180,9 @@ Actor/Rollout/Reference Policy
     log_prob_max_token_len_per_gpu: ${actor_rollout_ref.actor.ppo_max_token_len_per_gpu}
     # for hf rollout
     do_sample: True
-    engine_kwargs: # inference engine parameters
-      vllm:
-        swap_space: null # null means "use the engine default value" (usually 4 GB), setting it to, e.g., 32 means 32 GB
-        disable_mm_preprocessor_cache: False # disable preprocessor cache for multimodel models
-      sglang:
-        attention_backend: null # null means use the engine default value, available options: flashinfer, triton, flashmla
+    engine_kwargs: # inference engine parameters, please refer vllm/sglang official doc for detail
+      vllm: {}
+      sglang: {}

     n: 1 # for each prompt, sample n responses (i.e. num sample times). set it to values > 1 for grpo, rloo
     val_kwargs:
@@ -353,19 +350,9 @@ Reference model will be enabled when ``actor.use_kl_loss`` or/and ``algorithm.us
   deterministic outputs. When set to True, the rollout will use the ``actor_rollout_ref.rollout.val_kwargs`` parameters
   (top_k, top_p, temperature) to control the sampling behavior.

-- ``actor_rollout_ref.rollout.engine_kwargs.vllm``: extra vllm engine args
+- ``actor_rollout_ref.rollout.engine_kwargs.vllm``: extra vllm engine args, please refer vllm official doc for detail

-  - ``swap_space``: swap space in GB used by the inference engine. Positive integer, e.g., ``32`` means 32 GB. ``null``: means not setting and using the engine default value (usually, e.g., 4 GB for vLLM)
-  - ``disable_mm_preprocessor_cache``: Whether to disable preprocessor cache for multimodel models.
-
-- ``actor_rollout_ref.rollout.engine_kwargs.sglang``: extra sglang engine args
-
-  - ``attention_backend``: The attention backend to use for the inference engine.
-
-    - ``null``: means not setting and using the engine default value (usually, e.g., ``fa3`` for SGLang)
-    - ``flashinfer``: Use flashinfer attention backend.
-    - ``triton``: Use triton attention backend.
-    - ``flashmla``: Use flashmla attention backend.
+- ``actor_rollout_ref.rollout.engine_kwargs.sglang``: extra sglang engine args, please refer sglang official doc for detail

 - ``actor_rollout_ref.rollout.ignore_eos``: Whether to ignore the EOS
   token and continue generating tokens after the EOS token is generated.
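With the new schema both sub-dicts default to empty, and the options that used to have dedicated defaults (`swap_space`, `disable_mm_preprocessor_cache`, `attention_backend`) can still be supplied there, alongside any other argument the backend accepts, since everything is now forwarded as-is. A hedged sketch of what a user override could look like, expressed with OmegaConf (the values are illustrative, not recommended settings):

```python
from omegaconf import OmegaConf

# Default rollout config after this PR: engine_kwargs carries no opinionated defaults.
default_rollout = OmegaConf.create({"engine_kwargs": {"vllm": {}, "sglang": {}}})

# Example user override: formerly built-in options and any other engine arg go in the same place.
user_override = OmegaConf.create(
    {"engine_kwargs": {"vllm": {"swap_space": 32}, "sglang": {"attention_backend": "flashinfer"}}}
)

merged = OmegaConf.merge(default_rollout, user_override)
print(OmegaConf.to_container(merged))
# -> {'engine_kwargs': {'vllm': {'swap_space': 32}, 'sglang': {'attention_backend': 'flashinfer'}}}
```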
@@ -68,7 +68,7 @@ EP=${EP:-8}
 ETP=1
 CP=1
 optimizer_offload_fraction=${OFFLOAD_FRACTION:-1.}
-last_layer=${LAST_LAYER:-6}
+LAST_LAYER=${LAST_LAYER:-6}


 project_name='verl-deepseek-v3'
@@ -62,7 +62,7 @@ python3 -m verl.trainer.main_ppo --config-path=config \
     actor_rollout_ref.rollout.log_prob_use_dynamic_bsz=True \
     actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=20480 \
     actor_rollout_ref.rollout.name=$ENGINE \
-    actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+    +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \
     actor_rollout_ref.rollout.n=5 \
     actor_rollout_ref.ref.log_prob_micro_batch_size_per_gpu=20 \
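Note the `+` prefix in the new override line: Hydra only allows plain `key=value` overrides for keys that already exist in the config, and `engine_kwargs.vllm` is now an empty dict, so the key has to be appended with `+`. A sketch of launching the trainer with such an override from Python (the override list is heavily abbreviated and purely illustrative):

```python
import subprocess

overrides = [
    "actor_rollout_ref.rollout.name=vllm",
    # "+" appends a key that is absent from the default config; without it Hydra would
    # reject the override because engine_kwargs.vllm defaults to {} after this PR.
    "+actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True",
]
subprocess.run(["python3", "-m", "verl.trainer.main_ppo", *overrides], check=True)
```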
@@ -26,7 +26,7 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
     actor_rollout_ref.rollout.name=$ENGINE \
-    actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+    +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
     actor_rollout_ref.rollout.enable_chunked_prefill=False \
     actor_rollout_ref.rollout.enforce_eager=False \
@@ -32,7 +32,7 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=20 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
     actor_rollout_ref.rollout.name=$ENGINE \
-    actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+    +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
     actor_rollout_ref.rollout.enable_chunked_prefill=False \
     actor_rollout_ref.rollout.enforce_eager=False \
@@ -31,7 +31,7 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=1 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=8 \
     actor_rollout_ref.rollout.name=$ENGINE \
-    actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+    +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.3 \
     actor_rollout_ref.rollout.enable_chunked_prefill=False \
     actor_rollout_ref.rollout.enforce_eager=True \
@@ -31,7 +31,7 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=2 \
     actor_rollout_ref.rollout.name=$ENGINE \
-    actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+    +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.6 \
     actor_rollout_ref.rollout.enable_chunked_prefill=False \
     actor_rollout_ref.rollout.enforce_eager=True \
@@ -31,7 +31,7 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.rollout.log_prob_micro_batch_size_per_gpu=4 \
     actor_rollout_ref.rollout.tensor_model_parallel_size=4 \
     actor_rollout_ref.rollout.name=$ENGINE \
-    actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
+    +actor_rollout_ref.rollout.engine_kwargs.vllm.disable_mm_preprocessor_cache=True \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.5 \
     actor_rollout_ref.rollout.enable_chunked_prefill=False \
     actor_rollout_ref.rollout.enforce_eager=True \
@@ -57,7 +57,7 @@ actor_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 10 / 10))
 infer_ppo_max_token_len=$(((max_prompt_length + max_response_length) * 1))
 offload=True
 OPTIM_OFFLOAD=${OPTIM_OFFLOAD:-True}
-gen_tp=16
+gen_tp=8
 train_tp=${TP:-4}
 train_pp=${PP:-8}

@@ -128,6 +128,7 @@ python3 -m verl.trainer.main_ppo \
     actor_rollout_ref.actor.optim.clip_grad=1.0 \
     actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
     actor_rollout_ref.rollout.gpu_memory_utilization=0.85 \
+    +actor_rollout_ref.rollout.engine_kwargs.vllm.enable_expert_parallel=True \
     actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
     actor_rollout_ref.rollout.enable_chunked_prefill=True \
     actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
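The two hunks above implement the Qwen3-235B speedup described in the PR: generation TP drops from 16 to 8 and expert parallelism is turned on for the MoE layers. Roughly speaking, the rollout worker ends up constructing a vLLM engine like the sketch below; this assumes a vLLM version that exposes `enable_expert_parallel`, and the model id and memory fraction are illustrative.

```python
from vllm import LLM

# Illustrative only: approximate engine construction implied by gen_tp=8 + expert parallel.
llm = LLM(
    model="Qwen/Qwen3-235B-A22B",
    tensor_parallel_size=8,       # gen_tp lowered from 16 to 8
    enable_expert_parallel=True,  # distribute whole experts across ranks instead of tensor-slicing each expert
    gpu_memory_utilization=0.85,
)
```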
@@ -255,7 +255,8 @@ def _hf_casual_fwd_bwd(config, sp_size, dp_size):
     grad = model.model.layers[0].self_attn.q_proj.weight.grad
     grad_full = model_no_sp.model.layers[0].self_attn.q_proj.weight.grad
     torch.testing.assert_close(mean_local, mean_full, rtol=1e-2, atol=3e-5)
-    torch.testing.assert_close(grad, grad_full, atol=1e-2, rtol=3e-5)
+    # The check should be less strict because the gradient is not an averaged value.
+    torch.testing.assert_close(grad, grad_full, rtol=1e-2, atol=1e-3)


 if __name__ == "__main__":
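The old assertion had `rtol` and `atol` swapped relative to the line above it. For `torch.testing.assert_close`, a pair of values passes when `|actual - expected| <= atol + rtol * |expected|`, so the corrected call keeps the 1% relative tolerance and loosens the absolute tolerance to 1e-3, matching the comment that the un-averaged gradient needs a less strict check. A quick self-contained illustration (the tensors are made up):

```python
import torch

expected = torch.tensor([1.0, 10.0])
actual = expected + torch.tensor([5e-4, 5e-2])  # small absolute and relative deviations

# Passes because each |actual - expected| <= atol + rtol * |expected|
# (0.0005 <= 1e-3 + 1e-2 * 1.0 and 0.05 <= 1e-3 + 1e-2 * 10.0).
torch.testing.assert_close(actual, expected, rtol=1e-2, atol=1e-3)
print("tolerance check passed")
```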
@@ -183,12 +183,9 @@ actor_rollout_ref:
     gate_proj_layer_name: gate_up
     # number of responses (i.e. num sample times)
     n: 1
-    engine_kwargs: # inference engine parameters
-      vllm:
-        swap_space: null # null means "use the engine default value" (usually 4 GB), setting it to, e.g., 32 means 32 GB
-        disable_mm_preprocessor_cache: False # whether to disable the preprocessor cache for multimodel models.
-      sglang:
-        attention_backend: null # null means use the engine default value, available options: flashinfer, triton, flashmla
+    engine_kwargs: # inference engine parameters, please refer vllm/sglang official doc for detail
+      vllm: {}
+      sglang: {}
     val_kwargs:
       # sampling parameters for validation
       top_k: -1 # 0 for hf rollout, -1 for vllm rollout
@@ -476,23 +476,14 @@ actor_rollout_ref:
     # Whether to wake up inference engine in multi-stage to reduce peak memory during training-rollout transition.
     multi_stage_wake_up: false

-    # Extra inference engine arguments (vllm, sglang).
+    # Extra inference engine arguments, please refer vllm/sglang official doc for detail
     engine_kwargs:

-      # for vllm
-      vllm:
+      # vllm engine config
+      vllm: {}

-        # Swap space (in GB) used by inference engine. null uses default (e.g., 4 GB).
-        swap_space: null
-
-        # Whether to disable the preprocessor cache for multimodel models.
-        disable_mm_preprocessor_cache: False
-
-      # for sglang
-      sglang:
-
-        # The attention backend for sglang engine. Options: flashinfer, triton, flashmla, null for default.
-        attention_backend: null
+      # sglang engine config
+      sglang: {}

     # Sampling parameters used during validation.
     val_kwargs:
@@ -166,11 +166,8 @@ actor_rollout_ref:
     over_sample_rate: 0
     multi_stage_wake_up: false
     engine_kwargs:
-      vllm:
-        swap_space: null
-        disable_mm_preprocessor_cache: false
-      sglang:
-        attention_backend: null
+      vllm: {}
+      sglang: {}
     val_kwargs:
       _target_: verl.workers.config.SamplingConfig
       top_k: -1
@@ -141,11 +141,8 @@ actor_rollout_ref:
     over_sample_rate: 0
     multi_stage_wake_up: false
     engine_kwargs:
-      vllm:
-        swap_space: null
-        disable_mm_preprocessor_cache: false
-      sglang:
-        attention_backend: null
+      vllm: {}
+      sglang: {}
     val_kwargs:
       _target_: verl.workers.config.SamplingConfig
       top_k: -1
@@ -91,23 +91,14 @@ over_sample_rate: 0
 # This is only effective for SGLang rollout.
 multi_stage_wake_up: false

-# Extra inference engine arguments (vllm, sglang).
+# Extra inference engine arguments (vllm, sglang), please refer vllm/sglang official doc for detail
 engine_kwargs:

-  # for vllm
-  vllm:
+  # vllm engine config
+  vllm: {}

-    # Swap space (in GB) used by inference engine. null uses default (e.g., 4 GB).
-    swap_space: null
-
-    # Whether to disable the preprocessor cache for multimodel models.
-    disable_mm_preprocessor_cache: False
-
-  # for sglang
-  sglang:
-
-    # The attention backend for sglang engine. Options: flashinfer, triton, flashmla, null for default.
-    attention_backend: null
+  # sglang engine config
+  sglang: {}

 # Sampling parameters used during validation.
 val_kwargs:
@@ -437,8 +437,11 @@ class SGLangRollout(BaseRollout):
         tp_size_per_node = self._tp_size // nnodes
         node_rank = self._tp_rank // tp_size_per_node
         first_rank_in_node = self._tp_rank % tp_size_per_node == 0
-        engine_kwargs = self.config.get("engine_kwargs", {}).get("sglang", {})
-        attention_backend = engine_kwargs.get("attention_backend", None)
+        engine_kwargs = self.config.get("engine_kwargs", {}).get("sglang", {}) or {}
+        engine_kwargs = {key: val for key, val in engine_kwargs.items() if val is not None}
+
+        # attention backend will be changed to fa3 if not specified
+        attention_backend = engine_kwargs.pop("attention_backend", None)

         if first_rank_in_node:
             rank = dist.get_rank()
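The added lines establish a simple convention for SGLang kwargs: `None`-valued entries count as "unset" and are dropped, and `attention_backend` is popped out because it is handled separately before the remaining kwargs are forwarded to the engine (see the next hunk). A toy illustration of that filtering, with made-up values:

```python
# Hypothetical engine_kwargs as it might come out of the rollout config.
raw = {"attention_backend": None, "mem_fraction_static": 0.7, "context_length": None}

engine_kwargs = {key: val for key, val in raw.items() if val is not None}
attention_backend = engine_kwargs.pop("attention_backend", None)

print(engine_kwargs)      # {'mem_fraction_static': 0.7}
print(attention_backend)  # None -> per the comment above, SGLang will default to fa3
```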
@@ -471,6 +474,7 @@ class SGLangRollout(BaseRollout):
                 # In async mode for AgentLoop, SGLang support token in token out to avoid the tokenizer
                 # inconsistency issue.
                 skip_tokenizer_init=self.config.mode == "async",
+                **engine_kwargs,
             )
         else:
             self._engine = None
@@ -256,6 +256,12 @@ class AsyncvLLMServer(AsyncServerBase):
         else:
             logger.warning(f"cudagraph_capture_sizes must be a list, but got {cudagraph_capture_sizes}")

+        engine_kwargs = config.get("engine_kwargs", {}).get("vllm", {}) or {}
+
+        engine_kwargs = {key: val for key, val in engine_kwargs.items() if val is not None}
+        if config.get("limit_images", None): # support for multi-image data
+            engine_kwargs["limit_mm_per_prompt"] = {"image": config.get("limit_images")}
+
         engine_args = AsyncEngineArgs(
             model=local_path,
             enable_sleep_mode=config.free_cache_engine,
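One consequence of the ordering above: `limit_images` is applied after the user-supplied kwargs, so it overwrites any `limit_mm_per_prompt` the user may have put under `engine_kwargs.vllm`. A toy check of that precedence (the dict contents are illustrative):

```python
engine_kwargs = {"limit_mm_per_prompt": {"image": 1}, "swap_space": 16}  # from engine_kwargs.vllm
limit_images = 4  # rollout.limit_images

if limit_images:
    engine_kwargs["limit_mm_per_prompt"] = {"image": limit_images}

print(engine_kwargs["limit_mm_per_prompt"])  # {'image': 4} -- limit_images takes precedence
```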
@@ -277,6 +283,7 @@ class AsyncvLLMServer(AsyncServerBase):
             trust_remote_code=trust_remote_code,
             seed=config.get("seed", 0),
             **compilation_config,
+            **engine_kwargs,
         )

         # init async llm engine
@@ -153,7 +153,7 @@ class vLLMRollout(BaseRollout):
         lora_kwargs = kwargs.pop("lora_kwargs", {})
         self.lora_kwargs = lora_kwargs
         # copy it to avoid secretly modifying the engine config
-        engine_kwargs = config.get("engine_kwargs", {}).get("vllm", {})
+        engine_kwargs = config.get("engine_kwargs", {}).get("vllm", {}) or {}

         # For each vLLM engine parameter,
         # - `None` means not setting it, so we pop it, and leave it to vLLM default value
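The `or {}` guard covers the case where a user writes `engine_kwargs.vllm: null` in YAML: the lookup then returns `None` instead of a dict, and the later iteration or unpacking would fail. A tiny illustration:

```python
config = {"engine_kwargs": {"vllm": None}}  # e.g. `vllm: null` in a YAML override

engine_kwargs = config.get("engine_kwargs", {}).get("vllm", {}) or {}
assert engine_kwargs == {}  # without `or {}` this would be None and **engine_kwargs would raise
print("null engine_kwargs handled safely")
```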