[CI][CUDA] Move cu118 distributed pull jobs to cu126, move cu124-sm75 to cu126-sm75 (#151594)
This PR moves the distributed CUDA CI jobs from CUDA 11.8 to CUDA 12.6. Doing so exposed a few unit test failures, some if not all of which would take a while to root-cause and fix, so they are temporarily skipped after filing the following issues:

https://github.com/pytorch/pytorch/issues/153479 - test_nan_assert shows tricky, OS-dependent behavior (skipped via skip_but_pass_in_sandcastle; fails on Ubuntu 20.04, passes on Ubuntu 22.04, skipped on Amazon Linux 2023 - what OS does Sandcastle run?)
https://github.com/pytorch/pytorch/issues/153122 - CUDA context related
https://github.com/pytorch/pytorch/issues/153517 - NCCL regression; a future NCCL release may fix it

See: https://github.com/pytorch/pytorch/issues/147383

Pull Request resolved: https://github.com/pytorch/pytorch/pull/151594
Approved by: https://github.com/eqy, https://github.com/atalman, https://github.com/cyyever
Committed by: PyTorch MergeBot
Commit: 8cabd23b3d
Parent: 2b43d635d3
@@ -481,7 +481,8 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
 
     @requires_nccl()
     @skip_but_pass_in_sandcastle_if(
-        not (TEST_MULTIGPU and CUDA_12_AND_ABOVE),
+        # skip for cu126 as well due to https://github.com/pytorch/pytorch/issues/153479
+        not (TEST_MULTIGPU and CUDA_12_AND_ABOVE and False),
         "NCCL test requires 2+ GPUs and Device side assert could cause unexpected errors in lower versions of CUDA",
     )
     @parametrize(
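A note on the pattern above: appending "and False" forces the whole conjunction to be falsy, so the "not (...)" skip condition is always true and the decorator disables the test unconditionally, while the original condition stays in place for an easy revert once issue 153479 is resolved. A minimal illustration in plain Python (the flag values are assumed for the example):

# Minimal illustration of the skip-condition trick; flag values are assumed.
TEST_MULTIGPU = True
CUDA_12_AND_ABOVE = True

# Original condition: skip only when the prerequisites are missing.
skip_original = not (TEST_MULTIGPU and CUDA_12_AND_ABOVE)            # False here
# Patched condition: "and False" makes the conjunction always falsy,
# so the skip condition is always True and the test is disabled for now.
skip_patched = not (TEST_MULTIGPU and CUDA_12_AND_ABOVE and False)   # always True
print(skip_original, skip_patched)  # False True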
@@ -657,9 +658,11 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
         # fail because one context takes about 1 GB -- much more than the
         # tensor size created in this test.
         self.assertTrue(
-            used_after < used_before * 1.5,
+            # Bump the heuristic from 1.5 to 1.7 due to
+            # https://github.com/pytorch/pytorch/issues/153122
+            used_after < used_before * 1.7,
             f"{device} used {used_after} bytes after collective, "
-            f"50% more than the status before ({used_before} bytes). "
+            f"70% more than the status before ({used_before} bytes). "
             f"Extra CUDA context may have been created.",
         )
 
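For context, here is a rough, self-contained sketch of the kind of device-memory growth check the bumped 1.7x tolerance guards. The helper name and the measurement via torch.cuda.mem_get_info are assumptions for illustration, not the test's actual implementation:

import torch

def device_memory_used(device: torch.device) -> int:
    # Global device memory in use (bytes); this includes memory held by extra
    # CUDA contexts, which is what the heuristic is trying to detect.
    free, total = torch.cuda.mem_get_info(device)
    return total - free

device = torch.device("cuda:0")
used_before = device_memory_used(device)
# ... run the collective under test here ...
used_after = device_memory_used(device)
# Tolerance bumped from 1.5x to 1.7x (issue 153122): an extra CUDA context
# costs roughly 1 GB, far more than the tensors created by the test.
assert used_after < used_before * 1.7, (
    f"{device} used {used_after} bytes after collective, "
    f"70% more than the status before ({used_before} bytes). "
    f"Extra CUDA context may have been created."
)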
@@ -1049,6 +1052,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase):
     def test_non_blocking_with_eager_init(self):
         # Test creating a pg eagerly with nonblocking mode when
         # we've passed a specific device_id to init_process_group.
+        raise SkipTest("Skip due to https://github.com/pytorch/pytorch/issues/153517")
         os.environ["TORCH_NCCL_USE_COMM_NONBLOCKING"] = "1"
         os.environ["TORCH_NCCL_NONBLOCKING_TIMEOUT"] = "100"
         store = c10d.FileStore(self.file_name, self.world_size)
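The skipped test exercises eager NCCL communicator creation in nonblocking mode. Below is a standalone sketch of that setup, intended to be launched with torchrun; the tensor, the all_reduce call, and the use of the RANK environment variable are illustrative assumptions, not the test body:

import os
import torch
import torch.distributed as dist

# Nonblocking NCCL mode, mirroring the env vars set in the test.
os.environ["TORCH_NCCL_USE_COMM_NONBLOCKING"] = "1"
os.environ["TORCH_NCCL_NONBLOCKING_TIMEOUT"] = "100"

rank = int(os.environ["RANK"])  # provided by torchrun
device = torch.device(f"cuda:{rank}")
torch.cuda.set_device(device)
# Passing device_id requests eager (non-lazy) communicator initialization.
dist.init_process_group("nccl", device_id=device)
dist.all_reduce(torch.ones(1, device=device))
dist.destroy_process_group()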
@@ -3676,6 +3680,9 @@ class NcclProcessGroupWithDispatchedCollectivesTests(
     @skip_if_lt_x_gpu(1)
     @parametrize("float8_dtype", [torch.float8_e4m3fn, torch.float8_e5m2])
     def test_allgather_float8(self, float8_dtype):
+        device = torch.device(f"cuda:{self.rank:d}")
+        if not sm_is_or_higher_than(device, 9, 0):
+            self.skipTest("FP8 reduction support begins with sm90 capable devices")
         store = dist.FileStore(self.file_name, self.world_size)
         dist.init_process_group(
             "nccl",
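sm_is_or_higher_than is a test helper; a plausible sketch of such a compute-capability guard, using torch.cuda.get_device_capability, is shown below. This is an assumption for illustration, not the helper's actual source:

import torch

def sm_is_or_higher_than(device: torch.device, major: int, minor: int) -> bool:
    # Compare the device's compute capability (sm90 == (9, 0)) against the
    # required minimum; tuple comparison handles the major/minor ordering.
    return torch.cuda.get_device_capability(device) >= (major, minor)

device = torch.device("cuda:0")
if not sm_is_or_higher_than(device, 9, 0):
    # Mirrors the skip in the diff; sm90 corresponds to Hopper-class GPUs.
    print("FP8 reduction support begins with sm90 capable devices")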