fix Multi-node CUDA error: invalid device ordinal #3775 (#3779)

This commit is contained in:
Ricardo Dominguez-Olmedo
2025-09-13 15:32:47 +02:00
committed by GitHub
parent dfdc219018
commit 0cb1a33475

View File

@@ -400,7 +400,7 @@ class PartialState:
DistributedType.DEEPSPEED,
DistributedType.FSDP,
):
-torch.distributed.barrier(device_ids=[self.process_index])
+torch.distributed.barrier(device_ids=[self.local_process_index])
elif self.distributed_type == DistributedType.XLA:
xm.rendezvous("accelerate.utils.wait_for_everyone")