mirror of
https://github.com/vllm-project/vllm.git
synced 2025-10-20 23:03:52 +08:00
Compare commits
2 Commits
v0.11.0rc5
...
tms/distri
Author | SHA1 | Date | |
---|---|---|---|
2f86f710dd | |||
feeb17303d |
@ -222,10 +222,15 @@ class GroupCoordinator:
|
||||
|
||||
for ranks in group_ranks:
|
||||
device_group = torch.distributed.new_group(
|
||||
ranks, backend=torch_distributed_backend)
|
||||
ranks,
|
||||
backend=torch_distributed_backend,
|
||||
timeout=envs.VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS)
|
||||
# a group with `gloo` backend, to allow direct coordination between
|
||||
# processes through the CPU.
|
||||
cpu_group = torch.distributed.new_group(ranks, backend="gloo")
|
||||
cpu_group = torch.distributed.new_group(
|
||||
ranks,
|
||||
backend="gloo",
|
||||
timeout=envs.VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS)
|
||||
if self.rank in ranks:
|
||||
self.ranks = ranks
|
||||
self.world_size = len(ranks)
|
||||
@ -965,7 +970,8 @@ def init_distributed_environment(
|
||||
backend=backend,
|
||||
init_method=distributed_init_method,
|
||||
world_size=world_size,
|
||||
rank=rank)
|
||||
rank=rank,
|
||||
timeout=envs.VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS)
|
||||
# set the local rank
|
||||
# local_rank is not available in torch ProcessGroup,
|
||||
# see https://github.com/pytorch/pytorch/issues/122816
|
||||
|
@ -140,6 +140,7 @@ if TYPE_CHECKING:
|
||||
VLLM_NIXL_ABORT_REQUEST_TIMEOUT: int = 120
|
||||
VLLM_USE_CUDNN_PREFILL: bool = False
|
||||
VLLM_LOOPBACK_IP: str = ""
|
||||
VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS: Optional[int] = None
|
||||
|
||||
|
||||
def get_default_cache_root():
|
||||
@ -505,6 +506,10 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_IMAGE_FETCH_TIMEOUT":
|
||||
lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),
|
||||
|
||||
# Timeout for torch distributed calls
|
||||
"VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS":
|
||||
lambda: maybe_convert_int(os.getenv("VLLM_DISTRIBUTED_INIT_TIMEOUT_SECONDS", None)),
|
||||
|
||||
# Timeout for fetching videos when serving multimodal models
|
||||
# Default is 30 seconds
|
||||
"VLLM_VIDEO_FETCH_TIMEOUT":
|
||||
|
Reference in New Issue
Block a user