Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 21:14:14 +08:00
[c10d] Remove deprecated multi-gpu-per-thread APIs (#114156)
As of today, PyTorch Distributed's preferred programming model is one device per thread, as exemplified by the APIs in its documentation. The multi-GPU functions (which drive multiple GPUs from a single CPU thread) have been deprecated for three releases; this removes them ahead of the 2.2 release.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/114156
Approved by: https://github.com/albanD, https://github.com/fduwjj, https://github.com/H-Huang
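For readers migrating off the removed *_multigpu variants, below is a minimal sketch of the preferred one-device-per-process style that the commit message describes, contrasted in the comments with the removed list-of-tensors style. The launcher, the LOCAL_RANK environment variable, and the tensor shape are illustrative assumptions for this sketch, not details taken from the PR.

    import os

    import torch
    import torch.distributed as dist


    def allreduce_one_device_per_process() -> torch.Tensor:
        # One process drives exactly one GPU. LOCAL_RANK being provided by the
        # launcher (e.g. torchrun) is an assumption of this sketch.
        local_rank = int(os.environ.get("LOCAL_RANK", "0"))
        torch.cuda.set_device(local_rank)
        dist.init_process_group(backend="nccl")

        tensor = torch.ones(10, device=f"cuda:{local_rank}")
        # Preferred API: a single tensor per call, reduced in place across ranks.
        dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
        # The removed dist.all_reduce_multigpu instead took a list of tensors,
        # one per local GPU driven by a single thread; that style is gone here.
        return tensor

Launched under a multi-process launcher (for example, torchrun --nproc_per_node equal to the GPU count), each rank owns one GPU; code that previously built a tensor list for a *_multigpu call now runs one process per GPU instead.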
@@ -4162,233 +4162,6 @@ class DistributedTest:
             group, group_id, rank = self._init_full_group_test()
             self._test_barrier_helper(group, group_id, rank)
 
-        def _test_broadcast_multigpu_helper(self, group, group_id, rank, rank_to_GPU):
-            for src in group:
-                expected_tensor = _build_tensor(src + 1)
-                tensors = [
-                    _build_tensor(src + 1, -1).cuda(device=i) for i in rank_to_GPU[rank]
-                ]
-                if rank == src:
-                    tensors[0] = expected_tensor.cuda(device=rank_to_GPU[rank][0])
-
-                dist.broadcast_multigpu(tensors, src, group_id)
-                for tensor in tensors:
-                    self.assertEqual(tensor, expected_tensor)
-            self._barrier()
-
-        @skip_but_pass_in_sandcastle_if(
-            BACKEND == "mpi", "MPI doesn't support broadcast multigpu"
-        )
-        @skip_but_pass_in_sandcastle_if(
-            BACKEND == "nccl", "NCCL broadcast multigpu skipped"
-        )
-        @skip_if_no_gpu
-        def test_broadcast_multigpu(self):
-            group, group_id, rank = self._init_global_test()
-            rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
-            self._test_broadcast_multigpu_helper(group, group_id, rank, rank_to_GPU)
-
-        def _test_all_reduce_multigpu_helper(
-            self,
-            group,
-            group_id,
-            rank,
-            rank_to_GPU,
-            op,
-            master_value,
-            worker_value,
-            expected_value,
-            dtype=torch.float,
-        ):
-            for src in group:
-                curr_value = master_value if rank == src else worker_value
-                tensors = [
-                    _build_tensor(src + 1, curr_value, dtype=dtype).cuda(device=i)
-                    for i in rank_to_GPU[rank]
-                ]
-                self.call_dist_op(
-                    ":all_reduce",
-                    False,
-                    dist.all_reduce_multigpu,
-                    tensors,
-                    op,
-                    group_id,
-                )
-                expected_tensor = _build_tensor(src + 1, expected_value, dtype=dtype)
-                for tensor in tensors:
-                    self.assertEqual(tensor, expected_tensor)
-
-            self._barrier()
-
-        @skip_but_pass_in_sandcastle_if(
-            BACKEND == "mpi", "MPI doesn't support broadcast multigpu"
-        )
-        @skip_but_pass_in_sandcastle_if(
-            BACKEND == "nccl", "CUDA all_reduce multigpu skipped for NCCL"
-        )
-        @skip_but_pass_in_sandcastle_if(
-            BACKEND == "ucc" and IS_SANDCASTLE, "Skipped internally"
-        )
-        @skip_if_no_gpu
-        def test_all_reduce_multigpu(self):
-            group, group_id, rank = self._init_global_test()
-            rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
-            self._test_all_reduce_multigpu_helper(
-                group,
-                group_id,
-                rank,
-                rank_to_GPU,
-                dist.ReduceOp.SUM,
-                2,
-                10,
-                (2 + 10 * (len(group) - 1)) * len(rank_to_GPU[0]),
-            )
-
-        @skip_but_pass_in_sandcastle_if(
-            BACKEND == "mpi", "MPI doesn't support broadcast multigpu"
-        )
-        @skip_but_pass_in_sandcastle_if(
-            BACKEND == "nccl", "CUDA all_reduce multigpu skipped for NCCL"
-        )
-        @skip_but_pass_in_sandcastle_if(
-            BACKEND == "ucc" and IS_SANDCASTLE, "Skipped internally"
-        )
-        @skip_if_no_gpu
-        def test_all_reduce_multigpu_complex(self):
-            group, group_id, rank = self._init_global_test()
-            rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
-            self._test_all_reduce_multigpu_helper(
-                group,
-                group_id,
-                rank,
-                rank_to_GPU,
-                dist.ReduceOp.SUM,
-                complex(2, 3),
-                complex(10, 11),
-                (complex(2, 3) + complex(10, 11) * (len(group) - 1))
-                * len(rank_to_GPU[0]),
-                dtype=torch.cfloat,
-            )
-
-        def _test_reduce_multigpu_helper(
-            self,
-            group,
-            group_id,
-            rank,
-            rank_to_GPU,
-            op,
-            master_value,
-            worker_value,
-            expected_value,
-        ):
-            for src in group:
-                tensor_value = master_value if rank == src else worker_value
-                tensors = [
-                    _build_tensor(src + 1, tensor_value).cuda(device=i)
-                    for i in rank_to_GPU[rank]
-                ]
-                self.call_dist_op(
-                    ":reduce",
-                    False,
-                    dist.reduce_multigpu,
-                    tensors,
-                    src,
-                    op,
-                    group_id,
-                    expect_event=len(tensors) == 1,
-                    tensor_shapes=[tensors[0].shape],
-                )
-                if rank == src:
-                    expected_tensor = _build_tensor(src + 1, expected_value)
-                    self.assertEqual(tensors[0], expected_tensor)
-
-            self._barrier()
-
-        @skip_but_pass_in_sandcastle_if(
-            BACKEND != "nccl", "Only Nccl backend supports reduce multigpu"
-        )
-        @skip_if_no_gpu
-        def test_reduce_multigpu(self):
-            group, group_id, rank = self._init_global_test()
-            rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
-            device_id = rank_to_GPU[rank][0]
-            torch.cuda.set_device(device_id)
-            self._test_reduce_multigpu_helper(
-                group,
-                group_id,
-                rank,
-                rank_to_GPU,
-                dist.ReduceOp.SUM,
-                2,
-                10,
-                (2 + 10 * (len(group) - 1)) * len(rank_to_GPU[0]),
-            )
-
-        def _test_all_gather_multigpu_helper(
-            self, group, group_id, rank, rank_to_GPU, dtype=torch.float
-        ):
-            for dest in group:
-                tensors = [
-                    _build_tensor(dest + 1, dtype=dtype).cuda(device=i)
-                    for i in rank_to_GPU[rank]
-                ]
-
-                # construct expected output along with
-                # a place holder to receive all gather results
-                output_tensors = []
-                expected_output = []
-                output_per_gpu = (
-                    [_build_tensor(dest + 1, -1, dtype=dtype)]
-                    * len(rank_to_GPU[0])
-                    * len(group)
-                )
-                expected_per_gpu = (
-                    [_build_tensor(dest + 1, dtype=dtype)]
-                    * len(rank_to_GPU[0])
-                    * len(group)
-                )
-                for gpu in rank_to_GPU[rank]:
-                    output_tensors.append([t.cuda(device=gpu) for t in output_per_gpu])
-                    expected_output.append(
-                        [t.cuda(device=gpu) for t in expected_per_gpu]
-                    )
-                self.call_dist_op(
-                    ":all_gather",
-                    False,
-                    dist.all_gather_multigpu,
-                    output_tensors,
-                    tensors,
-                    group_id,
-                    expect_event=len(expected_output) == 1,
-                )
-                self.assertEqual(output_tensors, expected_output)
-
-            self._barrier()
-
-        @skip_but_pass_in_sandcastle_if(
-            BACKEND != "nccl", "Only Nccl backend supports allgather multigpu"
-        )
-        @skip_if_no_gpu
-        def test_all_gather_multigpu(self):
-            group, group_id, rank = self._init_global_test()
-            rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
-            device_id = rank_to_GPU[rank][0]
-            torch.cuda.set_device(device_id)
-            self._test_all_gather_multigpu_helper(group, group_id, rank, rank_to_GPU)
-
-        @skip_but_pass_in_sandcastle_if(
-            BACKEND != "nccl", "Only Nccl backend supports allgather multigpu"
-        )
-        @skip_if_no_gpu
-        def test_all_gather_multigpu_complex(self):
-            group, group_id, rank = self._init_global_test()
-            rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
-            device_id = rank_to_GPU[rank][0]
-            torch.cuda.set_device(device_id)
-            self._test_all_gather_multigpu_helper(
-                group, group_id, rank, rank_to_GPU, dtype=torch.cfloat
-            )
-
         def _model_step(self, model):
             for param in model.parameters():
                 if param.grad is not None: