From 08879a391648dcb3752b24292a8b7afdea58ec56 Mon Sep 17 00:00:00 2001
From: kaixuanliu
Date: Fri, 5 Sep 2025 14:06:26 +0800
Subject: [PATCH] avoid setting `device_id` in `init_process_group` (#7542)

In some use cases, such as vLLM, we need to create new distributed groups
not only on GPU but also on CPU. Setting `device_id` here prevents us from
creating a new distributed group on CPU: [L230](https://github.com/vllm-project/vllm/blob/main/vllm/distributed/parallel_state.py#L230).
This PR fixes that bug.

---------

Signed-off-by: Liu, Kaixuan
Co-authored-by: Olatunji Ruwase
Co-authored-by: Stas Bekman
---
 deepspeed/comm/torch.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/deepspeed/comm/torch.py b/deepspeed/comm/torch.py
index e92b104d8..19f2ba2d4 100755
--- a/deepspeed/comm/torch.py
+++ b/deepspeed/comm/torch.py
@@ -147,17 +147,14 @@ class TorchBackend(Backend):
     def init_process_group(self, backend, timeout, init_method, rank, world_size):
         if not torch.distributed.is_initialized():
-            kwargs = dict(
-                timeout=timeout,
-                init_method=init_method,
-                rank=rank,
-                world_size=world_size,
-            )
+            kwargs = dict(timeout=timeout, init_method=init_method, rank=rank, world_size=world_size)
             # 1. device_id arg was added in torch==2.3
             # 2. setting device_id leads to hanging in 2.6.0
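
For context (not part of the patch), here is a minimal sketch of the CPU-group use case the commit message describes: the default process group is initialized without `device_id` (here assumed to run under `torchrun`, which supplies the rank/world-size environment variables), so an additional gloo group for CPU collectives can still be created afterwards, mirroring the vLLM pattern referenced above. The `nccl`/`gloo` backend choice and the toy all-reduce are illustrative assumptions, not code from DeepSpeed or vLLM.

```python
import torch
import torch.distributed as dist

# Default group on GPU (NCCL), initialized WITHOUT device_id so that
# additional groups on other backends/devices can still be created later.
# Assumes launch via torchrun, which sets MASTER_ADDR/PORT, RANK, WORLD_SIZE.
dist.init_process_group(backend="nccl", init_method="env://")

# Separate CPU group using gloo, as in the vLLM pattern referenced above.
cpu_group = dist.new_group(backend="gloo")

# Toy collective on the CPU group: sum all ranks using CPU tensors.
t = torch.tensor([dist.get_rank()], dtype=torch.int64)
dist.all_reduce(t, group=cpu_group)
print(f"rank {dist.get_rank()}: sum of ranks = {t.item()}")

dist.destroy_process_group()
```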