diff --git a/test/distributed/test_c10d_nccl.py b/test/distributed/test_c10d_nccl.py index f1691eeb877b..5849190e619c 100644 --- a/test/distributed/test_c10d_nccl.py +++ b/test/distributed/test_c10d_nccl.py @@ -285,10 +285,9 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase): def setUp(self): super().setUp() - # These tests are expected to throw SIGABRT(6); adding the negative sign - # bc the test return code is actually -6 + # These tests are expected to throw SIGABRT(6); # But if we are in Sandcastle, `skip_but_pass_in_sandcastle` would return 0. - TEST_NAN_ASSERT_RETURN = 0 if IS_SANDCASTLE else -signal.SIGABRT + TEST_NAN_ASSERT_RETURN = 0 if IS_SANDCASTLE else signal.SIGABRT self.special_return_code_checks = { self.test_nan_assert_float16.__wrapped__: TEST_NAN_ASSERT_RETURN, self.test_nan_assert_float32.__wrapped__: TEST_NAN_ASSERT_RETURN, @@ -485,7 +484,7 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase): @requires_nccl() @skip_but_pass_in_sandcastle_if( # skip for cu126 as well due to https://github.com/pytorch/pytorch/issues/153479 - not (TEST_MULTIGPU and CUDA_12_AND_ABOVE and False), + not (TEST_MULTIGPU and CUDA_12_AND_ABOVE), "NCCL test requires 2+ GPUs and Device side assert could cause unexpected errors in lower versions of CUDA", ) @parametrize( @@ -539,10 +538,15 @@ class ProcessGroupNCCLGroupTest(MultiProcessTestCase): backend._set_enable_nan_check(False) # Note: using all-gather here bc some NCCL/SM version does not support # FP8 reduction - pg._allgather_base(output, nan_tensor) + # temporarily skip due to https://github.com/pytorch/pytorch/issues/153479 + # pg._allgather_base(output, nan_tensor) backend._set_enable_nan_check(True) - pg._allgather_base(output, nan_tensor) + try: + pg._allgather_base(output, nan_tensor) + except Exception: + sys.exit(signal.SIGABRT) + dist.destroy_process_group() # reset env