From 321938e9ac4000e0cb37e328359a7fd3026bc672 Mon Sep 17 00:00:00 2001
From: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Date: Wed, 27 Aug 2025 17:52:24 -0400
Subject: [PATCH] [Feature] Add `VLLM_DISABLE_PAD_FOR_CUDAGRAPH` to Avoid Hang
 Issue (#23595)

Signed-off-by: yewentao256
Signed-off-by: Wentao Ye <44945378+yewentao256@users.noreply.github.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
---
 vllm/envs.py                       | 7 +++++++
 vllm/v1/worker/gpu_model_runner.py | 1 +
 2 files changed, 8 insertions(+)

diff --git a/vllm/envs.py b/vllm/envs.py
index 35735b5525..a6a795dcfc 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -166,6 +166,7 @@ if TYPE_CHECKING:
     VLLM_USE_FLASHINFER_MOE_MXFP4_BF16: bool = False
     VLLM_ALLREDUCE_USE_SYMM_MEM: bool = False
     VLLM_TUNED_CONFIG_FOLDER: Optional[str] = None
+    VLLM_DISABLE_PAD_FOR_CUDAGRAPH: bool = False
 
 
 def get_default_cache_root():
@@ -1144,6 +1145,12 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_ENABLE_CUDAGRAPH_GC":
     lambda: bool(int(os.getenv("VLLM_ENABLE_CUDAGRAPH_GC", "0"))),
 
+    # Disable padding to CUDA graph capture batch sizes.
+    # TODO(wentao): https://github.com/vllm-project/vllm/issues/23378
+    # After the issue is fixed, we can remove this flag.
+    "VLLM_DISABLE_PAD_FOR_CUDAGRAPH":
+    lambda: bool(int(os.getenv("VLLM_DISABLE_PAD_FOR_CUDAGRAPH", "0"))),
+
     # Used to force set up loopback IP
     "VLLM_LOOPBACK_IP":
     lambda: os.getenv("VLLM_LOOPBACK_IP", ""),
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 01c90b2ea3..a194808e51 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1491,6 +1491,7 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
         num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens
         if (self.compilation_config.cudagraph_mode != CUDAGraphMode.NONE
+                and not envs.VLLM_DISABLE_PAD_FOR_CUDAGRAPH
                 and num_scheduled_tokens <= self.cudagraph_batch_sizes[-1]):
             # Use CUDA graphs.
             # Add padding to the batch size.
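
Usage sketch (not part of the patch): since envs.py parses the variable with
bool(int(os.getenv(...))), setting it to "1" before vLLM reads it opts out of
CUDA graph padding. A minimal example, assuming the standard vllm.LLM entry
point; the model name is illustrative only.

    import os

    # Opt out of padding to CUDA graph capture batch sizes; this works around
    # the hang tracked in https://github.com/vllm-project/vllm/issues/23378.
    # The value is parsed as bool(int(...)), so use "1"/"0".
    os.environ["VLLM_DISABLE_PAD_FOR_CUDAGRAPH"] = "1"

    # Import after setting the variable so envs.py picks it up.
    from vllm import LLM

    llm = LLM(model="facebook/opt-125m")  # illustrative model choice
    for out in llm.generate(["Hello, world"]):
        print(out.outputs[0].text)

With the flag set, batches whose token count falls below the largest captured
batch size are no longer padded up to a capture size, at the cost of running
those batches in eager mode.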