[c10d] Remove deprecated multi-gpu-per-thread APIs (#114156)

As of today, PyTorch Distributed's preferred programming model is one device per thread, as exemplified by the APIs in its documentation. The multi-GPU functions (which operate on multiple GPUs per CPU thread) have been deprecated for three releases. This removes them ahead of the 2.2 release.
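For context, a minimal sketch of the one-device-per-thread pattern that replaces a call such as `all_reduce_multigpu` (the `torchrun` launcher and the NCCL backend are assumptions here, not part of this PR):

```python
# Minimal sketch of the one-device-per-thread model. Assumed setup: launched
# with torchrun, NCCL backend, RANK/WORLD_SIZE/LOCAL_RANK set by the launcher.
import os

import torch
import torch.distributed as dist

dist.init_process_group(backend="nccl")
local_rank = int(os.environ.get("LOCAL_RANK", dist.get_rank() % torch.cuda.device_count()))
torch.cuda.set_device(local_rank)

# Each rank owns exactly one GPU and contributes a single tensor, instead of
# passing a per-GPU tensor list to the removed all_reduce_multigpu.
t = torch.full((10,), float(dist.get_rank()), device="cuda")
dist.all_reduce(t, op=dist.ReduceOp.SUM)

dist.destroy_process_group()
```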

Pull Request resolved: https://github.com/pytorch/pytorch/pull/114156
Approved by: https://github.com/albanD, https://github.com/fduwjj, https://github.com/H-Huang
Ke Wen
2023-11-21 03:50:19 +00:00
committed by PyTorch MergeBot
parent f67696f45e
commit dc65f6c601
7 changed files with 12 additions and 657 deletions


@@ -4162,233 +4162,6 @@ class DistributedTest:
group, group_id, rank = self._init_full_group_test()
self._test_barrier_helper(group, group_id, rank)
def _test_broadcast_multigpu_helper(self, group, group_id, rank, rank_to_GPU):
for src in group:
expected_tensor = _build_tensor(src + 1)
tensors = [
_build_tensor(src + 1, -1).cuda(device=i) for i in rank_to_GPU[rank]
]
if rank == src:
tensors[0] = expected_tensor.cuda(device=rank_to_GPU[rank][0])
dist.broadcast_multigpu(tensors, src, group_id)
for tensor in tensors:
self.assertEqual(tensor, expected_tensor)
self._barrier()
@skip_but_pass_in_sandcastle_if(
BACKEND == "mpi", "MPI doesn't support broadcast multigpu"
)
@skip_but_pass_in_sandcastle_if(
BACKEND == "nccl", "NCCL broadcast multigpu skipped"
)
@skip_if_no_gpu
def test_broadcast_multigpu(self):
group, group_id, rank = self._init_global_test()
rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
self._test_broadcast_multigpu_helper(group, group_id, rank, rank_to_GPU)
def _test_all_reduce_multigpu_helper(
self,
group,
group_id,
rank,
rank_to_GPU,
op,
master_value,
worker_value,
expected_value,
dtype=torch.float,
):
for src in group:
curr_value = master_value if rank == src else worker_value
tensors = [
_build_tensor(src + 1, curr_value, dtype=dtype).cuda(device=i)
for i in rank_to_GPU[rank]
]
self.call_dist_op(
":all_reduce",
False,
dist.all_reduce_multigpu,
tensors,
op,
group_id,
)
expected_tensor = _build_tensor(src + 1, expected_value, dtype=dtype)
for tensor in tensors:
self.assertEqual(tensor, expected_tensor)
self._barrier()
@skip_but_pass_in_sandcastle_if(
BACKEND == "mpi", "MPI doesn't support broadcast multigpu"
)
@skip_but_pass_in_sandcastle_if(
BACKEND == "nccl", "CUDA all_reduce multigpu skipped for NCCL"
)
@skip_but_pass_in_sandcastle_if(
BACKEND == "ucc" and IS_SANDCASTLE, "Skipped internally"
)
@skip_if_no_gpu
def test_all_reduce_multigpu(self):
group, group_id, rank = self._init_global_test()
rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
self._test_all_reduce_multigpu_helper(
group,
group_id,
rank,
rank_to_GPU,
dist.ReduceOp.SUM,
2,
10,
(2 + 10 * (len(group) - 1)) * len(rank_to_GPU[0]),
)
@skip_but_pass_in_sandcastle_if(
BACKEND == "mpi", "MPI doesn't support broadcast multigpu"
)
@skip_but_pass_in_sandcastle_if(
BACKEND == "nccl", "CUDA all_reduce multigpu skipped for NCCL"
)
@skip_but_pass_in_sandcastle_if(
BACKEND == "ucc" and IS_SANDCASTLE, "Skipped internally"
)
@skip_if_no_gpu
def test_all_reduce_multigpu_complex(self):
group, group_id, rank = self._init_global_test()
rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
self._test_all_reduce_multigpu_helper(
group,
group_id,
rank,
rank_to_GPU,
dist.ReduceOp.SUM,
complex(2, 3),
complex(10, 11),
(complex(2, 3) + complex(10, 11) * (len(group) - 1))
* len(rank_to_GPU[0]),
dtype=torch.cfloat,
)
def _test_reduce_multigpu_helper(
self,
group,
group_id,
rank,
rank_to_GPU,
op,
master_value,
worker_value,
expected_value,
):
for src in group:
tensor_value = master_value if rank == src else worker_value
tensors = [
_build_tensor(src + 1, tensor_value).cuda(device=i)
for i in rank_to_GPU[rank]
]
self.call_dist_op(
":reduce",
False,
dist.reduce_multigpu,
tensors,
src,
op,
group_id,
expect_event=len(tensors) == 1,
tensor_shapes=[tensors[0].shape],
)
if rank == src:
expected_tensor = _build_tensor(src + 1, expected_value)
self.assertEqual(tensors[0], expected_tensor)
self._barrier()
@skip_but_pass_in_sandcastle_if(
BACKEND != "nccl", "Only Nccl backend supports reduce multigpu"
)
@skip_if_no_gpu
def test_reduce_multigpu(self):
group, group_id, rank = self._init_global_test()
rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
device_id = rank_to_GPU[rank][0]
torch.cuda.set_device(device_id)
self._test_reduce_multigpu_helper(
group,
group_id,
rank,
rank_to_GPU,
dist.ReduceOp.SUM,
2,
10,
(2 + 10 * (len(group) - 1)) * len(rank_to_GPU[0]),
)
def _test_all_gather_multigpu_helper(
self, group, group_id, rank, rank_to_GPU, dtype=torch.float
):
for dest in group:
tensors = [
_build_tensor(dest + 1, dtype=dtype).cuda(device=i)
for i in rank_to_GPU[rank]
]
# construct expected output along with
# a place holder to receive all gather results
output_tensors = []
expected_output = []
output_per_gpu = (
[_build_tensor(dest + 1, -1, dtype=dtype)]
* len(rank_to_GPU[0])
* len(group)
)
expected_per_gpu = (
[_build_tensor(dest + 1, dtype=dtype)]
* len(rank_to_GPU[0])
* len(group)
)
for gpu in rank_to_GPU[rank]:
output_tensors.append([t.cuda(device=gpu) for t in output_per_gpu])
expected_output.append(
[t.cuda(device=gpu) for t in expected_per_gpu]
)
self.call_dist_op(
":all_gather",
False,
dist.all_gather_multigpu,
output_tensors,
tensors,
group_id,
expect_event=len(expected_output) == 1,
)
self.assertEqual(output_tensors, expected_output)
self._barrier()
@skip_but_pass_in_sandcastle_if(
BACKEND != "nccl", "Only Nccl backend supports allgather multigpu"
)
@skip_if_no_gpu
def test_all_gather_multigpu(self):
group, group_id, rank = self._init_global_test()
rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
device_id = rank_to_GPU[rank][0]
torch.cuda.set_device(device_id)
self._test_all_gather_multigpu_helper(group, group_id, rank, rank_to_GPU)
@skip_but_pass_in_sandcastle_if(
BACKEND != "nccl", "Only Nccl backend supports allgather multigpu"
)
@skip_if_no_gpu
def test_all_gather_multigpu_complex(self):
group, group_id, rank = self._init_global_test()
rank_to_GPU = init_multigpu_helper(dist.get_world_size(), BACKEND)
device_id = rank_to_GPU[rank][0]
torch.cuda.set_device(device_id)
self._test_all_gather_multigpu_helper(
group, group_id, rank, rank_to_GPU, dtype=torch.cfloat
)
def _model_step(self, model):
for param in model.parameters():
if param.grad is not None: