Use is_available instead of device_count to check for CUDA availability (#97043)

Some tests incorrectly use the number of GPU devices (`torch.cuda.device_count() > 0`) to check for CUDA availability instead of the standard `torch.cuda.is_available()` call.  This makes these tests more brittle when they hit infra flakiness on G5 runners using A10G GPUs, for example [test_pytorch_np](https://hud.pytorch.org/failure/FAILED%20test_tensorboard.py%3A%3ATestTensorBoardPyTorchNumpy%3A%3Atest_pytorch_np%20-%20RuntimeError%3A%20No%20CUDA%20GPUs%20are%20available).

The underlying problem is that GPU devices can crash on these runners.  The root cause is still unclear, and we will try upgrading to a newer NVIDIA driver (https://github.com/pytorch/pytorch/pull/96904) to see if that helps, but we can also make these tests more resilient by using the correct check so they are skipped cleanly when the GPU crashes.
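As a minimal sketch of the pattern applied in the diff below (the test class, method, and tensor here are illustrative only, not taken from the PR):

```python
import unittest

import torch


class TestCudaAvailabilityGuard(unittest.TestCase):
    # Hypothetical test case, for illustration only; not part of this PR.
    def test_roundtrip_shape(self):
        tensor = torch.rand(3, 3)
        # Guard CUDA-only work with is_available() rather than
        # device_count() > 0, matching the default check used elsewhere in
        # the test suite, so the CUDA branch is skipped instead of raising
        # "No CUDA GPUs are available" when the GPU is in a bad state.
        if torch.cuda.is_available():
            self.assertEqual(tensor.cuda().cpu().shape, tensor.shape)


if __name__ == "__main__":
    unittest.main()
```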
Pull Request resolved: https://github.com/pytorch/pytorch/pull/97043
Approved by: https://github.com/clee2000
Author: Huy Do
Committed by: PyTorch MergeBot
Date: 2023-03-18 00:39:42 +00:00
Commit: 679dec847e (parent c62fc81cc5)
2 changed files with 3 additions and 3 deletions

test/test_sparse.py

@@ -2941,7 +2941,7 @@ class TestSparse(TestSparseBase):
             self.skipTest(f'requires_grad==True requires float or complex dtype, got {dtype}')
         self._test_empty_full(device, dtype, requires_grad)
-        if torch.cuda.device_count() > 0:
+        if torch.cuda.is_available():
             self._test_empty_full(None, dtype, requires_grad)
             self._test_empty_full(torch.device('cuda:0'), dtype, requires_grad)

test/test_tensorboard.py

@@ -94,14 +94,14 @@ class TestTensorBoardPyTorchNumpy(BaseTestCase):
             self.assertIsInstance(make_np(tensor), np.ndarray)
             # CUDA tensor
-            if torch.cuda.device_count() > 0:
+            if torch.cuda.is_available():
                 self.assertIsInstance(make_np(tensor.cuda()), np.ndarray)
             # regular variable
             self.assertIsInstance(make_np(torch.autograd.Variable(tensor)), np.ndarray)
             # CUDA variable
-            if torch.cuda.device_count() > 0:
+            if torch.cuda.is_available():
                 self.assertIsInstance(make_np(torch.autograd.Variable(tensor).cuda()), np.ndarray)
             # python primitive type