All these UTs are working as is, just removing the skip:
- test_p2p_ipc
- test_repros.py: working, added fp8 support
- test_activation_checkpointing.py
- test_content_store.py
- test_cuda_multigpu.py
- test_compute_comm_reordering.py
- test_segment_reductions.py
- test_dataloader.py
- test_math_ops.py
- test_loop_ordering.py
- test_control_flow.py
- distributed_test.py
- test_mem_tracker.py
- test_fsdp_optim_state.py
- test_fully_shard_mixed_precision.py: skipped for < ROCm 7.0
- test_aot_inductor_custom_ops.py
- test_c10d_ops_nccl.py
- test_eager_transforms.py
- test_sparse_csr.py
- test_inductor_collectives.py
- test_fake_tensor.py
- test_cupy_as_tensor.py
- test_cuda.py: enable UTs that are working
- test_matmul_cuda.py: enable UTs that are working

Fixes #ISSUE_NUMBER

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161715
Approved by: https://github.com/msaroufim
Co-authored-by: Mark Saroufim <marksaroufim@fb.com>
74 lines
2.0 KiB
Python
# Owner(s): ["oncall: distributed"]

# To run:
# python test/distributed/test_p2p_ipc.py
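
# Note: MultiProcContinuousTest runs the tests in this class in a persistent set
# of worker processes (one per rank), and requires_cuda_p2p_access() is expected
# to skip the class on machines without at least two GPUs that have peer-to-peer
# access to each other.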


import torch
from torch.multiprocessing.reductions import reduce_tensor
from torch.testing._internal.common_distributed import MultiProcContinuousTest
from torch.testing._internal.common_utils import requires_cuda_p2p_access, run_tests


# So that tests are written in a device-agnostic way
device_type = "cuda"
device_module = torch.get_device_module(device_type)
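# Note: torch.get_device_module("cuda") resolves to the torch.cuda module, so in
# principle only device_type needs to change to retarget this file, assuming the
# other backend exposes the same set_device()/synchronize() API.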


@requires_cuda_p2p_access()
class P2PIpcTest(MultiProcContinuousTest):
    @classmethod
    def backend_str(cls):
        return "gloo"

    def _init_device(self) -> None:
        # init and pin the process to the device
        device_module.set_device(self.device)
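        # A throwaway one-element allocation, presumably to eagerly initialize
        # the device context for this rank; its value is never used.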
        torch.empty(1, device=self.device)

    @property
    def device(self) -> torch.device:
        return torch.device(device_type, self.rank)

    def test_p2p_ipc(self) -> None:
        """
        Test that cross-process P2P access works: rank 0 reduces a tensor and
        broadcasts the resulting metadata, and every other rank constructs a
        new tensor from that metadata while modifying the 6th argument.

        This test is here to help stabilize the P2P share mechanism,
        preventing BC (backward compatibility) breakage.
        """
        self._init_device()

        tensor: torch.Tensor

        if self.rank == 0:
            tensor = torch.randn(2333, device=self.device)
            tensor_meta = reduce_tensor(tensor)
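            # tensor_meta is a picklable (rebuild_fn, args) pair; for CUDA
            # tensors the args are expected to include a CUDA IPC handle to the
            # underlying allocation, so a peer process can map the same memory.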
            torch.distributed.broadcast_object_list([tensor_meta], src=0)
        else:
            recv_list = [None]
            torch.distributed.broadcast_object_list(recv_list, src=0)
            tensor_meta = recv_list[0]
            func, args = tensor_meta
            args = list(args)
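            # Rewrite the device index so the rebuilt tensor is associated with
            # this rank's device. Index 6 is assumed to match the current layout
            # of the rebuild args produced by torch.multiprocessing.reductions;
            # a layout change here is exactly the BC break this test guards.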
            args[6] = self.rank
            tensor = func(*args)

        torch.distributed.barrier()
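        # The barrier above is presumably there so that every rank has finished
        # rebuilding its view of the shared tensor before rank 0 mutates it.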

        if self.rank == 0:
            tensor.fill_(1)

        device_module.synchronize()
        torch.distributed.barrier()
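        # After the synchronize and barrier above, rank 0's fill_ should be
        # visible to every rank through its own mapping of the allocation.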

        assert tensor.allclose(torch.ones_like(tensor))
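        # The trailing barrier presumably keeps rank 0 (and its IPC handle)
        # from tearing the tensor down before every rank finishes the check.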

        torch.distributed.barrier()


if __name__ == "__main__":
    run_tests()