Fix NVML visible device parsing (#92315)

`CUDA_VISIBLE_DEVICES` can contain either ordinals or UUIDs Extend the logic to be able to parse it by UUID

Added unit test to validate that parser and matcher behavior matches that of 525.60.13  driver

Skip MIG- device parsing

Fixes https://github.com/pytorch/pytorch/issues/90543

Pull Request resolved: https://github.com/pytorch/pytorch/pull/92315
Approved by: https://github.com/ngimel
This commit is contained in:
Nikita Shulga
2023-02-13 04:25:04 +00:00
committed by PyTorch MergeBot
parent 6fadd5e94a
commit e7e51b3a5c
2 changed files with 173 additions and 14 deletions

View File

@ -63,6 +63,67 @@ class TestExtendedCUDAIsAvail(TestCase):
assert in_bad_fork
class TestVisibleDeviceParses(TestCase):
def test_env_var_parsing(self):
def _parse_visible_devices(val):
from torch.cuda import _parse_visible_devices as _pvd
with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": val}, clear=True):
return _pvd()
# rest of the string is ignored
self.assertEqual(_parse_visible_devices("1gpu2,2ampere"), [1, 2])
# Negatives abort parsing
self.assertEqual(_parse_visible_devices("0, 1, 2, -1, 3"), [0, 1, 2])
# Double mention of ordinal returns empty set
self.assertEqual(_parse_visible_devices("0, 1, 2, 1"), [])
# Unary pluses and minuses
self.assertEqual(_parse_visible_devices("2, +3, -0, 5"), [2, 3, 0, 5])
# Random string is used as empty set
self.assertEqual(_parse_visible_devices("one,two,3,4"), [])
# Random string is used as separator
self.assertEqual(_parse_visible_devices("4,3,two,one"), [4, 3])
# GPU ids are parsed
self.assertEqual(_parse_visible_devices("GPU-9e8d35e3"), ["GPU-9e8d35e3"])
# Ordinals are not included in GPUid set
self.assertEqual(_parse_visible_devices("GPU-123, 2"), ["GPU-123"])
# MIG ids are parsed
self.assertEqual(_parse_visible_devices("MIG-89c850dc"), ["MIG-89c850dc"])
def test_partial_uuid_resolver(self):
from torch.cuda import _transform_uuid_to_ordinals
uuids = ['GPU-9942190a-aa31-4ff1-4aa9-c388d80f85f1',
'GPU-9e8d35e3-a134-0fdd-0e01-23811fdbd293',
'GPU-e429a63e-c61c-4795-b757-5132caeb8e70',
'GPU-eee1dfbc-0a0f-6ad8-5ff6-dc942a8b9d98',
'GPU-bbcd6503-5150-4e92-c266-97cc4390d04e',
'GPU-472ea263-58d7-410d-cc82-f7fdece5bd28',
'GPU-e56257c4-947f-6a5b-7ec9-0f45567ccf4e',
'GPU-1c20e77d-1c1a-d9ed-fe37-18b8466a78ad']
self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3"], uuids), [1])
self.assertEqual(_transform_uuid_to_ordinals(["GPU-e4", "GPU-9e8d35e3"], uuids), [2, 1])
self.assertEqual(_transform_uuid_to_ordinals("GPU-9e8d35e3,GPU-1,GPU-47".split(","), uuids), [1, 7, 5])
# First invalid UUID aborts parsing
self.assertEqual(_transform_uuid_to_ordinals(["GPU-123", "GPU-9e8d35e3"], uuids), [])
self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-123", "GPU-47"], uuids), [1])
# First ambigous UUID aborts parsing
self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-e", "GPU-47"], uuids), [1])
# Duplicate UUIDs result in empty set
self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-47", "GPU-9e8"], uuids), [])
def test_ordinal_parse_visible_devices(self):
def _device_count_nvml(val):
from torch.cuda import _device_count_nvml as _dc
with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": val}, clear=True):
return _dc()
with patch.object(torch.cuda, '_raw_device_count_nvml', return_value=2):
self.assertEqual(_device_count_nvml("1, 0"), 2)
# Ordinal out of bounds aborts parsing
self.assertEqual(_device_count_nvml("1, 5, 0"), 1)
instantiate_parametrized_tests(TestExtendedCUDAIsAvail)
if __name__ == '__main__':