Fix NVML visible device parsing (#92315)

`CUDA_VISIBLE_DEVICES` can contain either ordinals or UUIDs. Extend the logic to be able to parse it by UUID. Added a unit test to validate that the parser and matcher behavior matches that of the 525.60.13 driver. Skip MIG- device parsing. Fixes https://github.com/pytorch/pytorch/issues/90543. Pull Request resolved: https://github.com/pytorch/pytorch/pull/92315. Approved by: https://github.com/ngimel
2025-10-20 21:14:14 +08:00 · 2023-02-13 04:25:04 +00:00
parent 6fadd5e94a
commit e7e51b3a5c
2 changed files with 173 additions and 14 deletions
--- a/test/test_cuda_nvml_based_avail.py
+++ b/test/test_cuda_nvml_based_avail.py
@ -63,6 +63,67 @@ class TestExtendedCUDAIsAvail(TestCase):
                assert in_bad_fork


+class TestVisibleDeviceParses(TestCase):
+
+    def test_env_var_parsing(self):
+        def _parse_visible_devices(val):
+            from torch.cuda import _parse_visible_devices as _pvd
+            with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": val}, clear=True):
+                return _pvd()
+
+        # Anything after the leading number in an entry is ignored
+        self.assertEqual(_parse_visible_devices("1gpu2,2ampere"), [1, 2])
+        # A negative ordinal aborts parsing; the ordinals before it are kept
+        self.assertEqual(_parse_visible_devices("0, 1, 2, -1, 3"), [0, 1, 2])
+        # A repeated ordinal results in an empty list
+        self.assertEqual(_parse_visible_devices("0, 1, 2, 1"), [])
+        # A leading plus sign and negative zero are accepted
+        self.assertEqual(_parse_visible_devices("2, +3, -0, 5"), [2, 3, 0, 5])
+        # A non-numeric first entry results in an empty list
+        self.assertEqual(_parse_visible_devices("one,two,3,4"), [])
+        # A non-numeric entry aborts parsing; the ordinals before it are kept
+        self.assertEqual(_parse_visible_devices("4,3,two,one"), [4, 3])
+        # GPU ids are returned as strings
+        self.assertEqual(_parse_visible_devices("GPU-9e8d35e3"), ["GPU-9e8d35e3"])
+        # Ordinals listed after a GPU id are not included in the result
+        self.assertEqual(_parse_visible_devices("GPU-123, 2"), ["GPU-123"])
+        # MIG ids are returned as strings
+        self.assertEqual(_parse_visible_devices("MIG-89c850dc"), ["MIG-89c850dc"])
+
+    def test_partial_uuid_resolver(self):
+        from torch.cuda import _transform_uuid_to_ordinals
+        uuids = ['GPU-9942190a-aa31-4ff1-4aa9-c388d80f85f1',
+                 'GPU-9e8d35e3-a134-0fdd-0e01-23811fdbd293',
+                 'GPU-e429a63e-c61c-4795-b757-5132caeb8e70',
+                 'GPU-eee1dfbc-0a0f-6ad8-5ff6-dc942a8b9d98',
+                 'GPU-bbcd6503-5150-4e92-c266-97cc4390d04e',
+                 'GPU-472ea263-58d7-410d-cc82-f7fdece5bd28',
+                 'GPU-e56257c4-947f-6a5b-7ec9-0f45567ccf4e',
+                 'GPU-1c20e77d-1c1a-d9ed-fe37-18b8466a78ad']
+        self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3"], uuids), [1])
+        self.assertEqual(_transform_uuid_to_ordinals(["GPU-e4", "GPU-9e8d35e3"], uuids), [2, 1])
+        self.assertEqual(_transform_uuid_to_ordinals("GPU-9e8d35e3,GPU-1,GPU-47".split(","), uuids), [1, 7, 5])
+        # First invalid UUID aborts parsing; the ordinals resolved before it are kept
+        self.assertEqual(_transform_uuid_to_ordinals(["GPU-123", "GPU-9e8d35e3"], uuids), [])
+        self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-123", "GPU-47"], uuids), [1])
+        # First ambiguous UUID prefix (matches more than one device) aborts parsing
+        self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-e", "GPU-47"], uuids), [1])
+        # Two prefixes that resolve to the same device result in an empty list
+        self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-47", "GPU-9e8"], uuids), [])
+
+    def test_ordinal_parse_visible_devices(self):
+        def _device_count_nvml(val):
+            from torch.cuda import _device_count_nvml as _dc
+            with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": val}, clear=True):
+                return _dc()
+
+        with patch.object(torch.cuda, '_raw_device_count_nvml', return_value=2):
+            self.assertEqual(_device_count_nvml("1, 0"), 2)
+            # Ordinal out of bounds aborts parsing; only the ordinals before it are counted
+            self.assertEqual(_device_count_nvml("1, 5, 0"), 1)
+
+
+
 instantiate_parametrized_tests(TestExtendedCUDAIsAvail)

 if __name__ == '__main__':