Compare commits

...

2 Commits

Author SHA1 Message Date

2f86f710dd  Fix precommit
            Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
            2025-07-18 18:48:12 +00:00

feeb17303d  Add VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS
            Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
            2025-07-18 14:39:11 -04:00
2 changed files with 14 additions and 3 deletions

View File

@@ -222,10 +222,15 @@ class GroupCoordinator:
         for ranks in group_ranks:
             device_group = torch.distributed.new_group(
-                ranks, backend=torch_distributed_backend)
+                ranks,
+                backend=torch_distributed_backend,
+                timeout=envs.VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS)
             # a group with `gloo` backend, to allow direct coordination between
             # processes through the CPU.
-            cpu_group = torch.distributed.new_group(ranks, backend="gloo")
+            cpu_group = torch.distributed.new_group(
+                ranks,
+                backend="gloo",
+                timeout=envs.VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS)
             if self.rank in ranks:
                 self.ranks = ranks
                 self.world_size = len(ranks)
@@ -965,7 +970,8 @@ def init_distributed_environment(
             backend=backend,
             init_method=distributed_init_method,
             world_size=world_size,
-            rank=rank)
+            rank=rank,
+            timeout=envs.VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS)
     # set the local rank
     # local_rank is not available in torch ProcessGroup,
     # see https://github.com/pytorch/pytorch/issues/122816
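
Note on the two hunks above: they pass the environment value straight through as timeout=. For reference only (this is not part of the commits shown), torch.distributed.init_process_group and torch.distributed.new_group expect their timeout as a datetime.timedelta, with None falling back to the backend default, so an integer seconds value read from the environment could be adapted along the following lines. The helper name here is hypothetical.

# Hypothetical adapter, for illustration only; it is not part of this diff.
# torch.distributed's process-group APIs take `timeout` as a datetime.timedelta,
# so an integer number of seconds (or None, meaning "use the PyTorch default")
# can be converted before the call.
from datetime import timedelta
from typing import Optional


def distributed_init_timeout(seconds: Optional[int]) -> Optional[timedelta]:
    if seconds is None:
        return None  # let PyTorch pick its default process-group timeout
    return timedelta(seconds=seconds)


# e.g. timeout=distributed_init_timeout(envs.VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS)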

View File

@@ -140,6 +140,7 @@ if TYPE_CHECKING:
     VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
     VLLM_USE_CUDNN_PREFILL: bool = False
     VLLM_LOOPBACK_IP: str = ""
+    VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS: Optional[int] = None


 def get_default_cache_root():
@@ -505,6 +506,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_IMAGE_FETCH_TIMEOUT":
     lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),

+    # Timeout in seconds for torch distributed calls
+    "VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS":
+    lambda: maybe_convert_int(
+        os.getenv("VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS", None)),
+
     # Timeout for fetching videos when serving multimodal models
     # Default is 30 seconds
     "VLLM_VIDEO_FETCH_TIMEOUT":