Compare commits

...

2 Commits

Author SHA1 Message Date

2f86f710dd  Fix precommit
            Signed-off-by: Tyler Michael Smith <tysmith@redhat.com>
            2025-07-18 18:48:12 +00:00

feeb17303d  Add VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS
            Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
            2025-07-18 14:39:11 -04:00
2 changed files with 14 additions and 3 deletions

View File

@@ -222,10 +222,15 @@ class GroupCoordinator:
         for ranks in group_ranks:
             device_group = torch.distributed.new_group(
-                ranks, backend=torch_distributed_backend)
+                ranks,
+                backend=torch_distributed_backend,
+                timeout=envs.VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS)
             # a group with `gloo` backend, to allow direct coordination between
             # processes through the CPU.
-            cpu_group = torch.distributed.new_group(ranks, backend="gloo")
+            cpu_group = torch.distributed.new_group(
+                ranks,
+                backend="gloo",
+                timeout=envs.VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS)
             if self.rank in ranks:
                 self.ranks = ranks
                 self.world_size = len(ranks)
@@ -965,7 +970,8 @@ def init_distributed_environment(
             backend=backend,
             init_method=distributed_init_method,
             world_size=world_size,
-            rank=rank)
+            rank=rank,
+            timeout=envs.VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS)
     # set the local rank
     # local_rank is not available in torch ProcessGroup,
     # see https://github.com/pytorch/pytorch/issues/122816
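
Note on the two hunks above: they pass the environment value straight through as timeout=. For reference only (this is not part of the commits shown), torch.distributed.init_process_group and torch.distributed.new_group expect their timeout as a datetime.timedelta, with None falling back to the backend default, so an integer seconds value read from the environment could be adapted along the following lines. The helper name here is hypothetical.

# Hypothetical adapter, for illustration only; it is not part of this diff.
# torch.distributed's process-group APIs take `timeout` as a datetime.timedelta,
# so an integer number of seconds (or None, meaning "use the PyTorch default")
# can be converted before the call.
from datetime import timedelta
from typing import Optional


def distributed_init_timeout(seconds: Optional[int]) -> Optional[timedelta]:
    if seconds is None:
        return None  # let PyTorch pick its default process-group timeout
    return timedelta(seconds=seconds)


# e.g. timeout=distributed_init_timeout(envs.VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS)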

View File

@@ -140,6 +140,7 @@ if TYPE_CHECKING:
     VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
     VLLM_USE_CUDNN_PREFILL: bool = False
     VLLM_LOOPBACK_IP: str = ""
+    VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS: Optional[int] = None


 def get_default_cache_root():
@@ -505,6 +506,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_IMAGE_FETCH_TIMEOUT":
     lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),

+    # Timeout in seconds for torch distributed calls
+    "VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS":
+    lambda: maybe_convert_int(
+        os.getenv("VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS", None)),
+
     # Timeout for fetching videos when serving multimodal models
     # Default is 30 seconds
     "VLLM_VIDEO_FETCH_TIMEOUT":