Enable UFMT on test/test_cuda*.py (#124352)

Part of: #123062

Ran lintrunner on:

- test/test_cuda.py
- test/test_cuda_expandable_segments.py
- test/test_cuda_multigpu.py
- test/test_cuda_nvml_based_avail.py
- test/test_cuda_primary_ctx.py
- test/test_cuda_sanitizer.py
- test/test_cuda_trace.py

Details:

```bash
$ lintrunner -a --take UFMT --all-files
ok No lint issues.
Successfully applied all patches.
```
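For orientation before reading the hunks: the UFMT linter in PyTorch's lintrunner wraps ufmt, which combines usort (import sorting) with Black (quote normalization and line wrapping), so every change below is mechanical reformatting with no behavioral effect. The snippet below is a hand-written sketch of those three rewrites on a representative fragment; it mirrors the kind of code in these tests but is illustrative, not an excerpt of the actual diff:

```python
# Hand-written sketch of the mechanical rewrites seen throughout the hunks
# below; names mirror the tests, but this is illustrative, not the real diff.

# --- before UFMT ---
import sys
import io
import gc

import torch

TEST_CUDA = torch.cuda.is_available()
TEST_CUDAMALLOCASYNC = TEST_CUDA and (torch.cuda.get_allocator_backend() == 'cudaMallocAsync')
if not TEST_CUDA:
    print('CUDA not available, skipping tests', file=sys.stderr)

# --- after UFMT: usort sorts imports; Black normalizes quotes and wraps long lines ---
import gc
import io
import sys

import torch

TEST_CUDA = torch.cuda.is_available()
TEST_CUDAMALLOCASYNC = TEST_CUDA and (
    torch.cuda.get_allocator_backend() == "cudaMallocAsync"
)
if not TEST_CUDA:
    print("CUDA not available, skipping tests", file=sys.stderr)
```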

Pull Request resolved: https://github.com/pytorch/pytorch/pull/124352
Approved by: https://github.com/ezyang
Author: Yuanhao Ji
Date: 2024-04-25 18:31:03 +00:00
Committed by: PyTorch MergeBot
Parent: 977dc5593a
Commit: d5182bb75b
8 changed files with 1612 additions and 786 deletions

File: .lintrunner.toml

@ -1051,13 +1051,6 @@ exclude_patterns = [
'test/quantization/fx/test_numeric_suite_fx.py',
'test/quantization/fx/test_quantize_fx.py',
'test/quantization/fx/test_subgraph_rewriter.py',
'test/test_cuda.py',
'test/test_cuda_expandable_segments.py',
'test/test_cuda_multigpu.py',
'test/test_cuda_nvml_based_avail.py',
'test/test_cuda_primary_ctx.py',
'test/test_cuda_sanitizer.py',
'test/test_cuda_trace.py',
'test/test_custom_op_testing.py',
'test/test_dataloader.py',
'test/test_datapipe.py',

File: test/test_cuda.py (diff suppressed because it is too large)
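The test_cuda.py hunk is elided above because of its size, but the commit is UFMT-only, so it contains the same class of mechanical reformatting as the files shown below. As a sketch (assuming a checkout where `lintrunner init` has already been run), a single file can be re-checked against just the UFMT linter, adding `-a` to apply fixes:

```bash
# Verify one of the newly un-excluded files against the UFMT linter only.
lintrunner --take UFMT test/test_cuda.py
# Apply any formatting it reports.
lintrunner -a --take UFMT test/test_cuda.py
```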

File: test/test_cuda_expandable_segments.py

@ -2,13 +2,14 @@
# run time cuda tests, but with the allocator using expandable segments
import os
import torch
from torch.testing._internal.common_cuda import IS_JETSON
if torch.cuda.is_available() and not IS_JETSON:
torch.cuda.memory._set_allocator_settings('expandable_segments:True')
torch.cuda.memory._set_allocator_settings("expandable_segments:True")
current_dir = os.path.dirname(os.path.abspath(__file__))
filepath = os.path.join(current_dir, 'test_cuda.py')
exec(compile(open(filepath).read(), filepath, mode='exec'))
filepath = os.path.join(current_dir, "test_cuda.py")
exec(compile(open(filepath).read(), filepath, mode="exec"))

File: test/test_cuda_multigpu.py

@ -3,38 +3,45 @@
import collections
import contextlib
import ctypes
import io
import gc
import io
import queue
import sys
import tempfile
import threading
import torch
import torch.cuda.comm as comm
import unittest
from itertools import repeat, chain
from itertools import chain, repeat
from typing import NamedTuple
import torch
import torch.cuda.comm as comm
from torch.nn.parallel import scatter_gather
from torch.testing._internal.common_cuda import (
_create_scaling_case,
_create_scaling_models_optimizers,
TEST_MULTIGPU,
)
from torch.testing._internal.common_utils import (
get_cycles_per_ms,
instantiate_parametrized_tests,
IS_JETSON,
IS_REMOTE_GPU,
IS_SANDCASTLE,
NoTest,
TEST_CUDA,
TestCase,
get_cycles_per_ms,
instantiate_parametrized_tests,
run_tests,
skipCUDANonDefaultStreamIf,
skipIfRocm,
TEST_CUDA,
TestCase,
)
from torch.testing._internal.common_cuda import TEST_MULTIGPU, _create_scaling_case, _create_scaling_models_optimizers
TEST_CUDAMALLOCASYNC = TEST_CUDA and (torch.cuda.get_allocator_backend() == "cudaMallocAsync")
TEST_CUDAMALLOCASYNC = TEST_CUDA and (
torch.cuda.get_allocator_backend() == "cudaMallocAsync"
)
if not TEST_CUDA:
print('CUDA not available, skipping tests', file=sys.stderr)
print("CUDA not available, skipping tests", file=sys.stderr)
TestCase = NoTest # noqa: F811
@ -44,7 +51,9 @@ class TestCudaMultiGPU(TestCase):
def _check_memory_stat_consistency(self):
snapshot = torch.cuda.memory_snapshot()
expected_each_device = collections.defaultdict(lambda: collections.defaultdict(int))
expected_each_device = collections.defaultdict(
lambda: collections.defaultdict(int)
)
for segment in snapshot:
expandable = segment["is_expandable"]
@ -56,7 +65,9 @@ class TestCudaMultiGPU(TestCase):
expected["segment." + pool_str + ".current"] += 1
expected["allocated_bytes.all.current"] += segment["allocated_size"]
expected["allocated_bytes." + pool_str + ".current"] += segment["allocated_size"]
expected["allocated_bytes." + pool_str + ".current"] += segment[
"allocated_size"
]
expected["reserved_bytes.all.current"] += segment["total_size"]
expected["reserved_bytes." + pool_str + ".current"] += segment["total_size"]
@ -65,7 +76,9 @@ class TestCudaMultiGPU(TestCase):
expected["active_bytes." + pool_str + ".current"] += segment["active_size"]
expected["requested_bytes.all.current"] += segment["requested_size"]
expected["requested_bytes." + pool_str + ".current"] += segment["requested_size"]
expected["requested_bytes." + pool_str + ".current"] += segment[
"requested_size"
]
sum_requested = 0
is_split = len(segment["blocks"]) > 1
@ -83,7 +96,9 @@ class TestCudaMultiGPU(TestCase):
expected["inactive_split.all.current"] += 1
expected["inactive_split." + pool_str + ".current"] += 1
expected["inactive_split_bytes.all.current"] += block["size"]
expected["inactive_split_bytes." + pool_str + ".current"] += block["size"]
expected["inactive_split_bytes." + pool_str + ".current"] += block[
"size"
]
self.assertEqual(sum_requested, segment["requested_size"])
@ -94,15 +109,15 @@ class TestCudaMultiGPU(TestCase):
def test_cuda_synchronize(self):
torch.cuda.synchronize()
torch.cuda.synchronize('cuda')
torch.cuda.synchronize('cuda:0')
torch.cuda.synchronize("cuda")
torch.cuda.synchronize("cuda:0")
torch.cuda.synchronize(0)
torch.cuda.synchronize(torch.device('cuda:0'))
torch.cuda.synchronize(torch.device("cuda:0"))
if TEST_MULTIGPU:
torch.cuda.synchronize('cuda:1')
torch.cuda.synchronize("cuda:1")
torch.cuda.synchronize(1)
torch.cuda.synchronize(torch.device('cuda:1'))
torch.cuda.synchronize(torch.device("cuda:1"))
with self.assertRaisesRegex(ValueError, "Expected a cuda device, but"):
torch.cuda.synchronize(torch.device("cpu"))
@ -285,8 +300,10 @@ class TestCudaMultiGPU(TestCase):
# interlace
torch.cuda.empty_cache()
gen0 = self._test_memory_stats_generator(self, device='cuda:0', N=35)
gen1 = self._test_memory_stats_generator(self, device=torch.device('cuda:1'), N=35)
gen0 = self._test_memory_stats_generator(self, device="cuda:0", N=35)
gen1 = self._test_memory_stats_generator(
self, device=torch.device("cuda:1"), N=35
)
end0 = end1 = False
while not (end0 and end1):
end0 = advance(gen0, end0)
@ -295,7 +312,9 @@ class TestCudaMultiGPU(TestCase):
# semi-random order
torch.cuda.empty_cache()
gen0 = self._test_memory_stats_generator(self, device=0, N=35)
gen1 = self._test_memory_stats_generator(self, device=torch.device('cuda:1'), N=35)
gen1 = self._test_memory_stats_generator(
self, device=torch.device("cuda:1"), N=35
)
end0 = end1 = False
while not (end0 and end1):
@ -396,10 +415,10 @@ class TestCudaMultiGPU(TestCase):
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_copy_streams(self):
d0 = torch.device('cuda:0')
d0 = torch.device("cuda:0")
x0 = torch.zeros(5, 5, device=d0)
d1 = torch.device('cuda:1')
d1 = torch.device("cuda:1")
x1 = torch.zeros(5, 5, device=d1)
self._test_copy_sync_current_stream(x0, x1)
@ -416,13 +435,13 @@ class TestCudaMultiGPU(TestCase):
@unittest.skipIf(torch.cuda.device_count() >= 10, "Loading a cuda:9 tensor")
def test_load_nonexistent_device(self):
# Setup: create a serialized file object with a 'cuda:9' restore location
tensor = torch.randn(2, device='cuda')
tensor = torch.randn(2, device="cuda")
buf = io.BytesIO()
torch.save(tensor, buf)
# NB: this might not work in the future if serialization changes
buf = io.BytesIO(buf.getvalue().replace(b'cuda:0', b'cuda:9'))
buf = io.BytesIO(buf.getvalue().replace(b"cuda:0", b"cuda:9"))
msg = r'Attempting to deserialize object on CUDA device 9'
msg = r"Attempting to deserialize object on CUDA device 9"
with self.assertRaisesRegex(RuntimeError, msg):
_ = torch.load(buf)
@ -431,7 +450,7 @@ class TestCudaMultiGPU(TestCase):
x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)]
def gpu_remap(storage, location):
if location == 'cuda:1':
if location == "cuda:1":
return storage.cuda(0)
with tempfile.NamedTemporaryFile() as f:
@ -450,7 +469,7 @@ class TestCudaMultiGPU(TestCase):
with tempfile.NamedTemporaryFile() as f:
torch.save(x, f)
f.seek(0)
x_copy = torch.load(f, map_location={'cuda:1': 'cuda:0'})
x_copy = torch.load(f, map_location={"cuda:1": "cuda:0"})
for original, copy in zip(x, x_copy):
self.assertEqual(copy, original)
self.assertIs(type(copy), type(original))
@ -458,10 +477,10 @@ class TestCudaMultiGPU(TestCase):
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_multigpu_storage_clone(self):
x = torch.randn(4, 4, device='cuda:1').storage()
x = torch.randn(4, 4, device="cuda:1").storage()
y = x.clone()
self.assertEqual(x.get_device(), y.get_device())
for t in ['byte', 'char', 'short', 'int', 'long', 'half', 'double']:
for t in ["byte", "char", "short", "int", "long", "half", "double"]:
self.assertEqual(getattr(x, t)().get_device(), x.get_device())
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
@ -479,8 +498,8 @@ class TestCudaMultiGPU(TestCase):
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_current_stream(self):
d0 = torch.device('cuda:0')
d1 = torch.device('cuda:1')
d0 = torch.device("cuda:0")
d1 = torch.device("cuda:1")
s0 = torch.cuda.current_stream()
s1 = torch.cuda.current_stream(device=1)
@ -501,15 +520,14 @@ class TestCudaMultiGPU(TestCase):
self.assertEqual(d0, s2.device)
self.assertEqual(s0, s1)
with self.assertRaisesRegex(ValueError,
"Expected a cuda device, but got: cpu"):
torch.cuda.current_stream(torch.device('cpu'))
with self.assertRaisesRegex(ValueError, "Expected a cuda device, but got: cpu"):
torch.cuda.current_stream(torch.device("cpu"))
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
@skipCUDANonDefaultStreamIf(True)
def test_default_stream(self):
d0 = torch.device('cuda:0')
d1 = torch.device('cuda:1')
d0 = torch.device("cuda:0")
d1 = torch.device("cuda:1")
with torch.cuda.device(d0):
s0 = torch.cuda.default_stream()
@ -533,14 +551,13 @@ class TestCudaMultiGPU(TestCase):
with torch.cuda.device(d1):
self.assertEqual(torch.cuda.current_stream(), s1)
with self.assertRaisesRegex(ValueError,
"Expected a cuda device, but got: cpu"):
torch.cuda.default_stream(torch.device('cpu'))
with self.assertRaisesRegex(ValueError, "Expected a cuda device, but got: cpu"):
torch.cuda.default_stream(torch.device("cpu"))
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_stream_event_device(self):
d0 = torch.device('cuda:0')
d1 = torch.device('cuda:1')
d0 = torch.device("cuda:0")
d1 = torch.device("cuda:1")
e0 = torch.cuda.Event()
self.assertEqual(None, e0.device)
@ -553,10 +570,10 @@ class TestCudaMultiGPU(TestCase):
s1 = torch.cuda.Stream()
e1 = s1.record_event()
self.assertEqual(s0.device, torch.device('cuda:0'))
self.assertEqual(e0.device, torch.device('cuda:0'))
self.assertEqual(s1.device, torch.device('cuda:1'))
self.assertEqual(e1.device, torch.device('cuda:1'))
self.assertEqual(s0.device, torch.device("cuda:0"))
self.assertEqual(e0.device, torch.device("cuda:0"))
self.assertEqual(s1.device, torch.device("cuda:1"))
self.assertEqual(e1.device, torch.device("cuda:1"))
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_stream_context(self):
@ -592,18 +609,17 @@ class TestCudaMultiGPU(TestCase):
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_streams_multi_gpu(self):
default_stream = torch.cuda.current_stream()
self.assertEqual(default_stream.device, torch.device('cuda:0'))
self.assertEqual(default_stream.device, torch.device("cuda:0"))
stream = torch.cuda.Stream(device=1)
self.assertEqual(stream.device, torch.device('cuda:1'))
self.assertEqual(stream.device, torch.device("cuda:1"))
with torch.cuda.device(1):
self.assertEqual(
torch.cuda.current_stream().device, torch.device('cuda:1'))
self.assertEqual(torch.cuda.current_stream().device, torch.device("cuda:1"))
self.assertNotEqual(torch.cuda.current_stream(), default_stream)
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_streams_multi_gpu_query(self):
d0 = torch.device('cuda:0')
d1 = torch.device('cuda:1')
d0 = torch.device("cuda:0")
d1 = torch.device("cuda:1")
torch.cuda.synchronize(d0)
torch.cuda.synchronize(d1)
@ -642,8 +658,8 @@ class TestCudaMultiGPU(TestCase):
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_streams_multi_gpu_eq(self):
d0 = torch.device('cuda:0')
d1 = torch.device('cuda:1')
d0 = torch.device("cuda:0")
d1 = torch.device("cuda:1")
with torch.cuda.device(d0):
s0 = torch.cuda.current_stream()
@ -676,12 +692,12 @@ class TestCudaMultiGPU(TestCase):
s0 = torch.cuda.Stream(device=0, priority=low)
self.assertEqual(low, s0.priority)
self.assertEqual(torch.device('cuda:0'), s0.device)
self.assertEqual(torch.device("cuda:0"), s0.device)
s1 = torch.cuda.Stream(device=1, priority=high)
self.assertEqual(high, s1.priority)
self.assertEqual(torch.device('cuda:1'), s1.device)
self.assertEqual(torch.device("cuda:1"), s1.device)
@unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
def test_tensor_device(self):
@ -754,7 +770,7 @@ class TestCudaMultiGPU(TestCase):
@staticmethod
def _test_stream_event_nogil(self, sync_func, p2c, c2p):
with torch.cuda.device('cuda:1'):
with torch.cuda.device("cuda:1"):
c2p.put(0)
p2c.get()
c2p.put(sync_func(self, TestCudaMultiGPU.FIFTY_MIL_CYCLES))
@ -763,9 +779,11 @@ class TestCudaMultiGPU(TestCase):
@skipIfRocm
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_stream_event_nogil(self):
for sync_func in [TestCudaMultiGPU._stream_synchronize,
TestCudaMultiGPU._event_synchronize,
TestCudaMultiGPU._event_wait]:
for sync_func in [
TestCudaMultiGPU._stream_synchronize,
TestCudaMultiGPU._event_synchronize,
TestCudaMultiGPU._event_wait,
]:
p2c = queue.Queue()
c2p = queue.Queue()
e_tik = torch.cuda.Event(enable_timing=True)
@ -773,12 +791,13 @@ class TestCudaMultiGPU(TestCase):
t = threading.Thread(
target=TestCudaMultiGPU._test_stream_event_nogil,
args=(self, sync_func, p2c, c2p))
args=(self, sync_func, p2c, c2p),
)
t.daemon = True
t.start()
c2p.get()
with torch.cuda.device('cuda:0'):
with torch.cuda.device("cuda:0"):
e_tik.record()
p2c.put(0)
parent_time = sync_func(self, TestCudaMultiGPU.FIFTY_MIL_CYCLES)
@ -801,8 +820,8 @@ class TestCudaMultiGPU(TestCase):
@skipIfRocm
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_events_wait(self):
d0 = torch.device('cuda:0')
d1 = torch.device('cuda:1')
d0 = torch.device("cuda:0")
d1 = torch.device("cuda:1")
torch.cuda.synchronize(d0)
torch.cuda.synchronize(d1)
@ -827,8 +846,8 @@ class TestCudaMultiGPU(TestCase):
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_events_multi_gpu_query(self):
d0 = torch.device('cuda:0')
d1 = torch.device('cuda:1')
d0 = torch.device("cuda:0")
d1 = torch.device("cuda:1")
with torch.cuda.device(d0):
s0 = torch.cuda.current_stream()
@ -869,8 +888,8 @@ class TestCudaMultiGPU(TestCase):
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
@skipIfRocm
def test_events_multi_gpu_elapsed_time(self):
d0 = torch.device('cuda:0')
d1 = torch.device('cuda:1')
d0 = torch.device("cuda:0")
d1 = torch.device("cuda:1")
with torch.cuda.device(d0):
s0 = torch.cuda.current_stream()
@ -934,8 +953,7 @@ class TestCudaMultiGPU(TestCase):
def test_external_streams_multi_device(self):
device = torch.cuda.device(1)
with self._get_external_stream(device) as stream_v:
ext_stream = torch.cuda.ExternalStream(
stream_v, device=device)
ext_stream = torch.cuda.ExternalStream(stream_v, device=device)
self.assertEqual(stream_v, ext_stream.cuda_stream)
self.assertEqual(ext_stream.device.index, device.idx)
@ -956,7 +974,7 @@ class TestCudaMultiGPU(TestCase):
del t
t = torch.FloatTensor([2]).pin_memory()
self.assertNotEqual(t.data_ptr(), ptr, msg='allocation re-used too soon')
self.assertNotEqual(t.data_ptr(), ptr, msg="allocation re-used too soon")
with torch.cuda.device(0):
gpu_tensor0.copy_(t, non_blocking=True)
@ -988,7 +1006,7 @@ class TestCudaMultiGPU(TestCase):
def _test(idx):
before_free_bytes, before_available_bytes = torch.cuda.mem_get_info(idx)
# increasing to 8MB to force acquiring a new block and overcome blocksize differences across platforms
t = torch.randn(1024 * 1024 * 8, device='cuda:' + str(idx))
t = torch.randn(1024 * 1024 * 8, device="cuda:" + str(idx))
if IS_JETSON:
# w/o syncing, mem_get_info will run before memory allocated has actually increased.
# This race condition causes consistent failure
@ -1022,6 +1040,7 @@ class TestCudaMultiGPU(TestCase):
leak_gpu0()
except RuntimeError as e:
import re
assert re.match(regex, str(e)), str(e) + "\n does not match: \n" + regex
else:
# assertRaisesRegex does not pass with Python for Jetson,
@ -1030,12 +1049,15 @@ class TestCudaMultiGPU(TestCase):
leak_gpu0()
if TEST_MULTIGPU:
@self.wrap_with_cuda_memory_check
def leak_gpu1():
# increasing to 8MB to force acquiring a new block and overcome blocksize differences across platforms
l.append(torch.randn(1024 * 1024 * 8, device=torch.device("cuda:1")))
with self.assertRaisesRegex(RuntimeError, r"CUDA driver API confirmed .+ on device 1.+"):
with self.assertRaisesRegex(
RuntimeError, r"CUDA driver API confirmed .+ on device 1.+"
):
leak_gpu1()
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
@ -1071,8 +1093,8 @@ class TestCudaMultiGPU(TestCase):
# Multiply by 2 here so to's backward creates gradient values that are different from the case above,
# to mitigate weirdness if the caching allocator happens to reuse memory regions that were populated
# with 1s by the case above
s0 = to_backward_recipient.to(device="cuda:0").sum() * 2.
s1 = to_backward_recipient.to(device="cuda:0").sum() * 2.
s0 = to_backward_recipient.to(device="cuda:0").sum() * 2.0
s1 = to_backward_recipient.to(device="cuda:0").sum() * 2.0
torch.cuda.synchronize(device=dev0)
torch.cuda.synchronize(device=dev1)
s0.backward(retain_graph=True)
@ -1085,7 +1107,12 @@ class TestCudaMultiGPU(TestCase):
def test_cuda_init_race(self):
# See https://github.com/pytorch/pytorch/issues/16559
import subprocess
subprocess.check_call([sys.executable, '-c', """\
subprocess.check_call(
[
sys.executable,
"-c",
"""\
import torch
import threading
@ -1096,7 +1123,9 @@ t1 = threading.Thread(target=worker, args=(0,))
t2 = threading.Thread(target=worker, args=(1,))
t1.start()
t2.start()
"""])
""",
]
)
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_grad_scaling_device_as_key(self):
@ -1128,14 +1157,24 @@ t2.start()
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_grad_scaling_scale(self):
scaler = torch.cuda.amp.GradScaler(init_scale=2.)
scaler = torch.cuda.amp.GradScaler(init_scale=2.0)
t0 = torch.full((1,), 4.0, dtype=torch.float32, device="cuda:0")
t1 = torch.full((1,), 4.0, dtype=torch.float32, device="cuda:1")
# Create some nested iterables of tensors on different devices.
outputs = (t1.clone(), (t0.clone(), t1.clone()), [t0.clone(), (t1.clone(), t0.clone())])
outputs = (
t1.clone(),
(t0.clone(), t1.clone()),
[t0.clone(), (t1.clone(), t0.clone())],
)
outputs = scaler.scale(outputs)
self.assertTrue(outputs[0] == 8.0 and outputs[1][0] == 8.0 and outputs[1][1] == 8.0 and
outputs[2][0] == 8.0 and outputs[2][1][0] == 8.0 and outputs[2][1][1] == 8.0)
self.assertTrue(
outputs[0] == 8.0
and outputs[1][0] == 8.0
and outputs[1][1] == 8.0
and outputs[2][0] == 8.0
and outputs[2][1][0] == 8.0
and outputs[2][1][1] == 8.0
)
self.assertTrue(scaler._scale.device == t1.device)
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
@ -1148,12 +1187,25 @@ t2.start()
dev1 = torch.device("cuda:1")
for enabled in True, False:
mod_control0, mod_scaling0, opt_control0, opt_scaling0, data, loss_fn, skip_iter = \
_create_scaling_case()
mod_control1, mod_scaling1, opt_control1, opt_scaling1 = \
_create_scaling_models_optimizers(device=dev1)
(
mod_control0,
mod_scaling0,
opt_control0,
opt_scaling0,
data,
loss_fn,
skip_iter,
) = _create_scaling_case()
(
mod_control1,
mod_scaling1,
opt_control1,
opt_scaling1,
) = _create_scaling_models_optimizers(device=dev1)
scaler = torch.cuda.amp.GradScaler(init_scale=128., growth_factor=2.0, enabled=enabled, growth_interval=1)
scaler = torch.cuda.amp.GradScaler(
init_scale=128.0, growth_factor=2.0, enabled=enabled, growth_interval=1
)
def run(model0, model1, optimizer0, optimizer1, try_scaling_api):
for i, (input, target) in enumerate(data):
@ -1162,13 +1214,15 @@ t2.start()
output0 = model0(input)
output1 = model1(input.to(dev1))
loss0 = loss_fn(0.3 * output0 + 0.7 * output1.to(dev0), target)
loss1 = loss_fn(0.6 * output0.to(dev1) - 0.4 * output1, target.to(dev1))
loss1 = loss_fn(
0.6 * output0.to(dev1) - 0.4 * output1, target.to(dev1)
)
if try_scaling_api:
scaler.scale(loss0).backward(retain_graph=True)
scaler.scale(loss1).backward()
if i == skip_iter and scaler.is_enabled():
model1[1].weight.grad.data.fill_(float('inf'))
model1[1].weight.grad.data.fill_(float("inf"))
# As an additional stress test, separately unscale for one of the optimizers.
scaler.unscale_(optimizer0)
@ -1178,11 +1232,20 @@ t2.start()
# Make sure the found_infs were collected properly across optimizers and devices.
if scaler.is_enabled():
self.assertTrue(len(scaler._found_inf_per_device(optimizer0)) == 1)
self.assertTrue(len(scaler._found_inf_per_device(optimizer1)) == 1)
self.assertTrue(scaler._found_inf_per_device(optimizer0)[dev0].item() == 0.)
self.assertTrue(scaler._found_inf_per_device(optimizer1)[dev1].item() ==
float(i == skip_iter))
self.assertTrue(
len(scaler._found_inf_per_device(optimizer0)) == 1
)
self.assertTrue(
len(scaler._found_inf_per_device(optimizer1)) == 1
)
self.assertTrue(
scaler._found_inf_per_device(optimizer0)[dev0].item()
== 0.0
)
self.assertTrue(
scaler._found_inf_per_device(optimizer1)[dev1].item()
== float(i == skip_iter)
)
scaler.update()
else:
@ -1196,25 +1259,41 @@ t2.start()
run(mod_scaling0, mod_scaling1, opt_scaling0, opt_scaling1, True)
# The loss scale should have been multiplied by the growth factor 3 times and the backoff factor once.
self.assertTrue(scaler.get_scale() == (128. * scaler.get_growth_factor()**3 *
scaler.get_backoff_factor()**1) if enabled else 1.0)
self.assertTrue(
scaler.get_scale()
== (
128.0
* scaler.get_growth_factor() ** 3
* scaler.get_backoff_factor() ** 1
)
if enabled
else 1.0
)
# Copy mod_control1 and mod_scaling1 back the device 0 for comparison
mod_control1.to(dev0)
mod_scaling1.to(dev0)
for c, s in zip(chain(mod_control0.parameters(), mod_control1.parameters()),
chain(mod_scaling0.parameters(), mod_scaling1.parameters())):
for c, s in zip(
chain(mod_control0.parameters(), mod_control1.parameters()),
chain(mod_scaling0.parameters(), mod_scaling1.parameters()),
):
self.assertEqual(c, s, rtol=1e-5, atol=1e-7)
@unittest.skipIf(not TEST_MULTIGPU, "Test needs multiple GPUs")
def test_cuda_device_memory_allocated(self):
from torch.cuda import memory_allocated
device_count = torch.cuda.device_count()
current_alloc = [memory_allocated(idx) for idx in range(device_count)]
x = torch.ones(10, device="cuda:0")
self.assertGreater(memory_allocated(0), current_alloc[0])
self.assertTrue(all(memory_allocated(torch.cuda.device(idx)) == current_alloc[idx] for idx in range(1, device_count)))
self.assertTrue(
all(
memory_allocated(torch.cuda.device(idx)) == current_alloc[idx]
for idx in range(1, device_count)
)
)
class TestCudaComm(TestCase):
@ -1226,12 +1305,17 @@ class TestCudaComm(TestCase):
for i, t in enumerate(results):
self.assertEqual(t.get_device(), i)
self.assertEqual(t, input)
if input.is_cuda and input.get_device() == i: # test not copying on same device
if (
input.is_cuda and input.get_device() == i
): # test not copying on same device
self.assertEqual(t.data_ptr(), input.data_ptr())
# test out=
for inplace in [True, False]:
if inplace:
outputs = [torch.empty_like(input, device=0), torch.empty_like(input, device=1)]
outputs = [
torch.empty_like(input, device=0),
torch.empty_like(input, device=1),
]
else:
outputs = [input.cuda(0), torch.empty_like(input, device=1)]
results = comm.broadcast(input, out=outputs)
@ -1241,13 +1325,19 @@ class TestCudaComm(TestCase):
self.assertEqual(t.get_device(), i)
self.assertEqual(t, input)
# test error msg
with self.assertRaisesRegex(RuntimeError, r"Exactly one of 'devices' and 'out'"):
with self.assertRaisesRegex(
RuntimeError, r"Exactly one of 'devices' and 'out'"
):
comm.broadcast(input, (0, 1), out=outputs)
with self.assertRaisesRegex(RuntimeError,
r"Expected all output tensors to be CUDA tensors, but output tensor at index 1"):
with self.assertRaisesRegex(
RuntimeError,
r"Expected all output tensors to be CUDA tensors, but output tensor at index 1",
):
comm.broadcast(input, out=[input.cuda(0), input.cpu()])
with self.assertRaisesRegex(RuntimeError,
r"Expected all output tensors to have same shape as the source .+ at index 1"):
with self.assertRaisesRegex(
RuntimeError,
r"Expected all output tensors to have same shape as the source .+ at index 1",
):
comm.broadcast(input, out=[input.cuda(0), input.cuda(1).unsqueeze(0)])
def test_broadcast_cpu(self):
@ -1289,16 +1379,16 @@ class TestCudaComm(TestCase):
numel = 5
num_bytes = numel * 8
tensors = [
self.genSparseTensor((2, 3), 2, 1, False, 'cuda', torch.float64)[0],
self.genSparseTensor((2, 3), 2, 1, False, "cuda", torch.float64)[0],
torch.randn(numel).long().cuda(),
torch.randn(numel).cuda(),
self.genSparseTensor((2, 3), 2, 10, False, 'cuda', torch.float64)[0],
self.genSparseTensor((2, 3), 2, 5, False, 'cuda', torch.float64)[0],
self.genSparseTensor((3, 3), 2, 7, False, 'cuda', torch.int64)[0],
self.genSparseTensor((2, 3), 2, 2, False, 'cuda', torch.float32)[0],
self.genSparseTensor((2, 3), 2, 10, False, "cuda", torch.float64)[0],
self.genSparseTensor((2, 3), 2, 5, False, "cuda", torch.float64)[0],
self.genSparseTensor((3, 3), 2, 7, False, "cuda", torch.int64)[0],
self.genSparseTensor((2, 3), 2, 2, False, "cuda", torch.float32)[0],
torch.randn(numel).long().cuda(),
torch.randn(numel).long().cuda(),
self.genSparseTensor((2, 7), 2, 3, False, 'cuda', torch.int64)[0],
self.genSparseTensor((2, 7), 2, 3, False, "cuda", torch.int64)[0],
torch.randn(numel * 2).int().cuda(), # int is 2x shorter
torch.randn(numel).cuda(),
]
@ -1323,7 +1413,7 @@ class TestCudaComm(TestCase):
tensors = [
torch.tensor([]).byte().cuda(),
torch.randn(5).cuda(),
torch.randn(5).double().cuda()
torch.randn(5).double().cuda(),
]
self._test_broadcast_coalesced(tensors, 256)
@ -1364,16 +1454,16 @@ class TestCudaComm(TestCase):
numel = 5
num_bytes = numel * 8
tensors = [
self.genSparseTensor((2, 3), 2, 1, False, 'cuda', torch.float64)[0],
self.genSparseTensor((2, 3), 2, 1, False, "cuda", torch.float64)[0],
torch.randn(numel).long().cuda(),
torch.randn(numel).cuda(),
self.genSparseTensor((2, 3), 2, 10, False, 'cuda', torch.float64)[0],
self.genSparseTensor((2, 3), 2, 5, False, 'cuda', torch.float64)[0],
self.genSparseTensor((3, 3), 2, 7, False, 'cuda', torch.int64)[0],
self.genSparseTensor((2, 3), 2, 2, False, 'cuda', torch.float32)[0],
self.genSparseTensor((2, 3), 2, 10, False, "cuda", torch.float64)[0],
self.genSparseTensor((2, 3), 2, 5, False, "cuda", torch.float64)[0],
self.genSparseTensor((3, 3), 2, 7, False, "cuda", torch.int64)[0],
self.genSparseTensor((2, 3), 2, 2, False, "cuda", torch.float32)[0],
torch.randn(numel).long().cuda(),
torch.randn(numel).long().cuda(),
self.genSparseTensor((2, 7), 2, 3, False, 'cuda', torch.int64)[0],
self.genSparseTensor((2, 7), 2, 3, False, "cuda", torch.int64)[0],
torch.randn(numel * 2).int().cuda(), # int is 2x shorter
torch.randn(numel).cuda(),
]
@ -1412,7 +1502,9 @@ class TestCudaComm(TestCase):
self.assertEqual(r, input[tuple(index)], atol=0, rtol=0)
chunk_start = chunk_end
if r.device == input.device:
self.assertEqual(r.data_ptr(), input.data_ptr()) # for target @ same device, a view should be returned
self.assertEqual(
r.data_ptr(), input.data_ptr()
) # for target @ same device, a view should be returned
# test out
out = [torch.empty_like(t) for t in result]
@ -1429,20 +1521,38 @@ class TestCudaComm(TestCase):
# test error msg
if chunk_sizes is not None:
with self.assertRaisesRegex(RuntimeError, r"Expected devices and chunk_sizes to be of same length"):
comm.scatter(input, [0 for _ in range(len(chunk_sizes) + 1)], dim=dim, chunk_sizes=chunk_sizes)
with self.assertRaisesRegex(
RuntimeError, r"Expected devices and chunk_sizes to be of same length"
):
comm.scatter(
input,
[0 for _ in range(len(chunk_sizes) + 1)],
dim=dim,
chunk_sizes=chunk_sizes,
)
with self.assertRaisesRegex(RuntimeError, r"'devices' must not be specified"):
comm.scatter(input, (0, 1), dim=dim, out=out)
with self.assertRaisesRegex(RuntimeError, r"Expected at least one device to scatter to"):
with self.assertRaisesRegex(
RuntimeError, r"Expected at least one device to scatter to"
):
comm.scatter(input, (), dim=dim)
with self.assertRaisesRegex(RuntimeError, r"Expected at least one output tensor to scatter to"):
with self.assertRaisesRegex(
RuntimeError, r"Expected at least one output tensor to scatter to"
):
comm.scatter(input, dim=dim, out=[])
with self.assertRaisesRegex(RuntimeError,
r"Expected all output tensors to be CUDA tensors, but output tensor at index 0"):
with self.assertRaisesRegex(
RuntimeError,
r"Expected all output tensors to be CUDA tensors, but output tensor at index 0",
):
comm.scatter(input, dim=dim, out=([out[0].cpu()] + out[1:]))
with self.assertRaisesRegex(RuntimeError, r"Output tensor at index 0 has incorrect shape"):
with self.assertRaisesRegex(
RuntimeError, r"Output tensor at index 0 has incorrect shape"
):
comm.scatter(input, dim=dim, out=([out[0].unsqueeze(0)] + out[1:]))
with self.assertRaisesRegex(RuntimeError, r"Total size for output tensors along scatter dim \d+ does not match"):
with self.assertRaisesRegex(
RuntimeError,
r"Total size for output tensors along scatter dim \d+ does not match",
):
index = [slice(None, None) for _ in range(input.dim())]
index[dim] = slice(1, None)
comm.scatter(input, dim=dim, out=([out[0][tuple(index)]] + out[1:]))
@ -1480,13 +1590,13 @@ class TestCudaComm(TestCase):
expected_size[dim] += y.size(dim)
expected_size = torch.Size(expected_size)
destinations = [None, torch.device('cuda:0'), torch.device('cpu')]
destinations = [None, torch.device("cuda:0"), torch.device("cpu")]
if torch.cuda.device_count() > 2:
destinations.append(torch.device('cuda:2'))
destinations.append(torch.device("cuda:2"))
with torch.cuda.device(1):
for destination in destinations:
if destination is None:
expected_device = torch.device('cuda', torch.cuda.current_device())
expected_device = torch.device("cuda", torch.cuda.current_device())
else:
expected_device = destination
for use_out in [True, False]:
@ -1507,15 +1617,31 @@ class TestCudaComm(TestCase):
self.assertEqual(result[tuple(index)], y)
# test error msg
with self.assertRaisesRegex(RuntimeError, r"'destination' must not be specified"):
comm.gather((x, y), dim, destination='cpu', out=torch.empty(expected_size, device='cpu'))
with self.assertRaisesRegex(RuntimeError, r"Expected at least one tensor to gather from"):
with self.assertRaisesRegex(
RuntimeError, r"'destination' must not be specified"
):
comm.gather(
(x, y),
dim,
destination="cpu",
out=torch.empty(expected_size, device="cpu"),
)
with self.assertRaisesRegex(
RuntimeError, r"Expected at least one tensor to gather from"
):
comm.gather(())
with self.assertRaisesRegex(RuntimeError, r"Expected all input tensors to be CUDA tensors, "):
with self.assertRaisesRegex(
RuntimeError, r"Expected all input tensors to be CUDA tensors, "
):
comm.gather((x.cpu(), y))
with self.assertRaisesRegex(RuntimeError, r"Expected all input tensors to have the same number of dimensions"):
with self.assertRaisesRegex(
RuntimeError,
r"Expected all input tensors to have the same number of dimensions",
):
comm.gather((x, y.unsqueeze(0)))
with self.assertRaisesRegex(RuntimeError, r"Input tensor at index 1 has invalid shape"):
with self.assertRaisesRegex(
RuntimeError, r"Input tensor at index 1 has invalid shape"
):
if dim in [0, -2]:
comm.gather((x, y[:, 1:]), dim=dim)
elif dim in [1, -1]:
@ -1532,7 +1658,9 @@ class TestCudaComm(TestCase):
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_memory_format_scatter_gather(self):
nhwc = torch.randn((10, 3, 32, 32), device='cpu').contiguous(memory_format=torch.channels_last)
nhwc = torch.randn((10, 3, 32, 32), device="cpu").contiguous(
memory_format=torch.channels_last
)
results = torch.cuda.comm.scatter(nhwc, (0, 1), None, 0)
for result in results:
self.assertFalse(result.is_contiguous())
@ -1541,7 +1669,6 @@ class TestCudaComm(TestCase):
gathered = torch.cuda.comm.gather(results)
self.assertTrue(gathered.is_contiguous(memory_format=torch.channels_last))
@unittest.skipIf(not TEST_MULTIGPU, "Test needs multiple GPUs")
def test_scatter_namedtuple(self):
# tests ability to scatter namedtuples and retrieve a list where each
@ -1589,8 +1716,8 @@ class TestCudaComm(TestCase):
def test_gather_namedtuple(self):
# tests ability to gather a list of namedtuples and return a namedtuple where each
# element is of the expected tensor type.
fields = ['a', 'b']
TestNamedTupleInput_0 = collections.namedtuple('NamedTuple', fields)
fields = ["a", "b"]
TestNamedTupleInput_0 = collections.namedtuple("NamedTuple", fields)
num_gpus = torch.cuda.device_count()
a = torch.rand(num_gpus * 2, device=0)
@ -1603,10 +1730,10 @@ class TestCudaComm(TestCase):
outputs = [out1, out2]
out = scatter_gather.gather(outputs, 'cpu') # test on CPU
out = scatter_gather.gather(outputs, "cpu") # test on CPU
for i, x in enumerate(out):
self.assertTrue(isinstance(x, type(out2[-1]))) # x must be a tensor
cat = torch.cat((outputs[0][i].to('cpu'), outputs[1][i].to('cpu')))
cat = torch.cat((outputs[0][i].to("cpu"), outputs[1][i].to("cpu")))
self.assertTrue(torch.equal(x, cat))
out = scatter_gather.gather(outputs, 0) # test on GPU
@ -1635,15 +1762,15 @@ class TestCudaComm(TestCase):
cat = torch.cat((outputs[0][i].to(0), outputs[1][i].to(0)))
self.assertTrue(torch.equal(x, cat))
out = scatter_gather.gather(outputs, 'cpu') # test on CPU
out = scatter_gather.gather(outputs, "cpu") # test on CPU
for i, x in enumerate(out):
self.assertTrue(isinstance(x, type(out2[-1])))
cat = torch.cat((outputs[0][i].to('cpu'), outputs[1][i].to('cpu')))
cat = torch.cat((outputs[0][i].to("cpu"), outputs[1][i].to("cpu")))
self.assertTrue(torch.equal(x, cat))
instantiate_parametrized_tests(TestCudaMultiGPU)
if __name__ == '__main__':
if __name__ == "__main__":
run_tests()

File: test/test_cuda_nvml_based_avail.py

@ -1,19 +1,28 @@
# Owner(s): ["module: cuda"]
import multiprocessing
import os
import sys
import multiprocessing
import torch
import unittest
from unittest.mock import patch
import torch
# NOTE: Each of the tests in this module need to be run in a brand new process to ensure CUDA is uninitialized
# prior to test initiation.
with patch.dict(os.environ, {"PYTORCH_NVML_BASED_CUDA_CHECK": "1"}):
# Before executing the desired tests, we need to disable CUDA initialization and fork_handler additions that would
# otherwise be triggered by the `torch.testing._internal.common_utils` module import
from torch.testing._internal.common_utils import (parametrize, instantiate_parametrized_tests, run_tests, TestCase,
IS_WINDOWS, IS_JETSON, NoTest)
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
IS_JETSON,
IS_WINDOWS,
NoTest,
parametrize,
run_tests,
TestCase,
)
# NOTE: Because `remove_device_and_dtype_suffixes` initializes CUDA context (triggered via the import of
# `torch.testing._internal.common_device_type` which imports `torch.testing._internal.common_cuda`) we need
# to bypass that method here which should be irrelevant to the parameterized tests in this module.
@ -21,7 +30,7 @@ with patch.dict(os.environ, {"PYTORCH_NVML_BASED_CUDA_CHECK": "1"}):
TEST_CUDA = torch.cuda.is_available()
if not TEST_CUDA:
print('CUDA not available, skipping tests', file=sys.stderr)
print("CUDA not available, skipping tests", file=sys.stderr)
TestCase = NoTest # type: ignore[misc, assignment] # noqa: F811
@ -30,11 +39,14 @@ class TestExtendedCUDAIsAvail(TestCase):
SUBPROCESS_REMINDER_MSG = (
"\n REMINDER: Tests defined in test_cuda_nvml_based_avail.py must be run in a process "
"where there CUDA Driver API has not been initialized. Before further debugging, ensure you are either using "
"run_test.py or have added --subprocess to run each test in a different subprocess.")
"run_test.py or have added --subprocess to run each test in a different subprocess."
)
def setUp(self):
super().setUp()
torch.cuda._cached_device_count = None # clear the lru_cache on this method before our test
torch.cuda._cached_device_count = (
None # clear the lru_cache on this method before our test
)
@staticmethod
def in_bad_fork_test() -> bool:
@ -47,31 +59,33 @@ class TestExtendedCUDAIsAvail(TestCase):
# If the NVML-based assessment is attempted but fails, the CUDA Runtime API check should be executed
@unittest.skipIf(IS_WINDOWS, "Needs fork")
@parametrize("nvml_avail", [True, False])
@parametrize("avoid_init", ['1', '0', None])
@parametrize("avoid_init", ["1", "0", None])
def test_cuda_is_available(self, avoid_init, nvml_avail):
if IS_JETSON and nvml_avail and avoid_init == '1':
self.skipTest('Not working for Jetson')
if IS_JETSON and nvml_avail and avoid_init == "1":
self.skipTest("Not working for Jetson")
patch_env = {"PYTORCH_NVML_BASED_CUDA_CHECK": avoid_init} if avoid_init else {}
with patch.dict(os.environ, **patch_env):
if nvml_avail:
_ = torch.cuda.is_available()
else:
with patch.object(torch.cuda, '_device_count_nvml', return_value=-1):
with patch.object(torch.cuda, "_device_count_nvml", return_value=-1):
_ = torch.cuda.is_available()
with multiprocessing.get_context("fork").Pool(1) as pool:
in_bad_fork = pool.apply(TestExtendedCUDAIsAvail.in_bad_fork_test)
if os.getenv('PYTORCH_NVML_BASED_CUDA_CHECK') == '1' and nvml_avail:
self.assertFalse(in_bad_fork, TestExtendedCUDAIsAvail.SUBPROCESS_REMINDER_MSG)
if os.getenv("PYTORCH_NVML_BASED_CUDA_CHECK") == "1" and nvml_avail:
self.assertFalse(
in_bad_fork, TestExtendedCUDAIsAvail.SUBPROCESS_REMINDER_MSG
)
else:
assert in_bad_fork
@torch.testing._internal.common_utils.markDynamoStrictTest
class TestVisibleDeviceParses(TestCase):
def test_env_var_parsing(self):
def _parse_visible_devices(val):
from torch.cuda import _parse_visible_devices as _pvd
with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": val}, clear=True):
return _pvd()
@ -96,39 +110,57 @@ class TestVisibleDeviceParses(TestCase):
def test_partial_uuid_resolver(self):
from torch.cuda import _transform_uuid_to_ordinals
uuids = ['GPU-9942190a-aa31-4ff1-4aa9-c388d80f85f1',
'GPU-9e8d35e3-a134-0fdd-0e01-23811fdbd293',
'GPU-e429a63e-c61c-4795-b757-5132caeb8e70',
'GPU-eee1dfbc-0a0f-6ad8-5ff6-dc942a8b9d98',
'GPU-bbcd6503-5150-4e92-c266-97cc4390d04e',
'GPU-472ea263-58d7-410d-cc82-f7fdece5bd28',
'GPU-e56257c4-947f-6a5b-7ec9-0f45567ccf4e',
'GPU-1c20e77d-1c1a-d9ed-fe37-18b8466a78ad']
uuids = [
"GPU-9942190a-aa31-4ff1-4aa9-c388d80f85f1",
"GPU-9e8d35e3-a134-0fdd-0e01-23811fdbd293",
"GPU-e429a63e-c61c-4795-b757-5132caeb8e70",
"GPU-eee1dfbc-0a0f-6ad8-5ff6-dc942a8b9d98",
"GPU-bbcd6503-5150-4e92-c266-97cc4390d04e",
"GPU-472ea263-58d7-410d-cc82-f7fdece5bd28",
"GPU-e56257c4-947f-6a5b-7ec9-0f45567ccf4e",
"GPU-1c20e77d-1c1a-d9ed-fe37-18b8466a78ad",
]
self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3"], uuids), [1])
self.assertEqual(_transform_uuid_to_ordinals(["GPU-e4", "GPU-9e8d35e3"], uuids), [2, 1])
self.assertEqual(_transform_uuid_to_ordinals("GPU-9e8d35e3,GPU-1,GPU-47".split(","), uuids), [1, 7, 5])
self.assertEqual(
_transform_uuid_to_ordinals(["GPU-e4", "GPU-9e8d35e3"], uuids), [2, 1]
)
self.assertEqual(
_transform_uuid_to_ordinals("GPU-9e8d35e3,GPU-1,GPU-47".split(","), uuids),
[1, 7, 5],
)
# First invalid UUID aborts parsing
self.assertEqual(_transform_uuid_to_ordinals(["GPU-123", "GPU-9e8d35e3"], uuids), [])
self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-123", "GPU-47"], uuids), [1])
self.assertEqual(
_transform_uuid_to_ordinals(["GPU-123", "GPU-9e8d35e3"], uuids), []
)
self.assertEqual(
_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-123", "GPU-47"], uuids),
[1],
)
# First ambigous UUID aborts parsing
self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-e", "GPU-47"], uuids), [1])
self.assertEqual(
_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-e", "GPU-47"], uuids), [1]
)
# Duplicate UUIDs result in empty set
self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-47", "GPU-9e8"], uuids), [])
self.assertEqual(
_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-47", "GPU-9e8"], uuids),
[],
)
def test_ordinal_parse_visible_devices(self):
def _device_count_nvml(val):
from torch.cuda import _device_count_nvml as _dc
with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": val}, clear=True):
return _dc()
with patch.object(torch.cuda, '_raw_device_count_nvml', return_value=2):
with patch.object(torch.cuda, "_raw_device_count_nvml", return_value=2):
self.assertEqual(_device_count_nvml("1, 0"), 2)
# Ordinal out of bounds aborts parsing
self.assertEqual(_device_count_nvml("1, 5, 0"), 1)
instantiate_parametrized_tests(TestExtendedCUDAIsAvail)
if __name__ == '__main__':
if __name__ == "__main__":
run_tests()

File: test/test_cuda_primary_ctx.py

@ -1,15 +1,21 @@
# Owner(s): ["module: cuda"]
import torch
from torch.testing._internal.common_utils import TestCase, run_tests, skipIfRocmVersionLessThan, NoTest
from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU
import sys
import unittest
import torch
from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU
from torch.testing._internal.common_utils import (
NoTest,
run_tests,
skipIfRocmVersionLessThan,
TestCase,
)
# NOTE: this needs to be run in a brand new process
if not TEST_CUDA:
print('CUDA not available, skipping tests', file=sys.stderr)
print("CUDA not available, skipping tests", file=sys.stderr)
TestCase = NoTest # noqa: F811
@ -18,17 +24,21 @@ class TestCudaPrimaryCtx(TestCase):
CTX_ALREADY_CREATED_ERR_MSG = (
"Tests defined in test_cuda_primary_ctx.py must be run in a process "
"where CUDA contexts are never created. Use either run_test.py or add "
"--subprocess to run each test in a different subprocess.")
"--subprocess to run each test in a different subprocess."
)
@skipIfRocmVersionLessThan((4, 4, 21504))
def setUp(self):
for device in range(torch.cuda.device_count()):
# Ensure context has not been created beforehand
self.assertFalse(torch._C._cuda_hasPrimaryContext(device), TestCudaPrimaryCtx.CTX_ALREADY_CREATED_ERR_MSG)
self.assertFalse(
torch._C._cuda_hasPrimaryContext(device),
TestCudaPrimaryCtx.CTX_ALREADY_CREATED_ERR_MSG,
)
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_str_repr(self):
x = torch.randn(1, device='cuda:1')
x = torch.randn(1, device="cuda:1")
# We should have only created context on 'cuda:1'
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
@ -43,13 +53,13 @@ class TestCudaPrimaryCtx(TestCase):
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_copy(self):
x = torch.randn(1, device='cuda:1')
x = torch.randn(1, device="cuda:1")
# We should have only created context on 'cuda:1'
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
self.assertTrue(torch._C._cuda_hasPrimaryContext(1))
y = torch.randn(1, device='cpu')
y = torch.randn(1, device="cpu")
y.copy_(x)
# We should still have only created context on 'cuda:1'
@ -58,7 +68,7 @@ class TestCudaPrimaryCtx(TestCase):
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_pin_memory(self):
x = torch.randn(1, device='cuda:1')
x = torch.randn(1, device="cuda:1")
# We should have only created context on 'cuda:1'
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
@ -70,7 +80,7 @@ class TestCudaPrimaryCtx(TestCase):
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
self.assertTrue(torch._C._cuda_hasPrimaryContext(1))
x = torch.randn(3, device='cpu').pin_memory()
x = torch.randn(3, device="cpu").pin_memory()
# We should still have only created context on 'cuda:1'
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
@ -82,19 +92,19 @@ class TestCudaPrimaryCtx(TestCase):
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
self.assertTrue(torch._C._cuda_hasPrimaryContext(1))
x = torch.randn(3, device='cpu', pin_memory=True)
x = torch.randn(3, device="cpu", pin_memory=True)
# We should still have only created context on 'cuda:1'
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
self.assertTrue(torch._C._cuda_hasPrimaryContext(1))
x = torch.zeros(3, device='cpu', pin_memory=True)
x = torch.zeros(3, device="cpu", pin_memory=True)
# We should still have only created context on 'cuda:1'
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
self.assertTrue(torch._C._cuda_hasPrimaryContext(1))
x = torch.empty(3, device='cpu', pin_memory=True)
x = torch.empty(3, device="cpu", pin_memory=True)
# We should still have only created context on 'cuda:1'
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
@ -106,5 +116,6 @@ class TestCudaPrimaryCtx(TestCase):
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
self.assertTrue(torch._C._cuda_hasPrimaryContext(1))
if __name__ == '__main__':
if __name__ == "__main__":
run_tests()

File: test/test_cuda_sanitizer.py

@ -7,8 +7,8 @@ from typing import List
import torch
import torch.cuda._sanitizer as csan
from torch.cuda._sanitizer import StreamId, DataPtr, EventId
from torch.testing._internal.common_utils import TestCase, run_tests, NoTest, TEST_CUDA
from torch.cuda._sanitizer import DataPtr, EventId, StreamId
from torch.testing._internal.common_utils import NoTest, run_tests, TEST_CUDA, TestCase
if not TEST_CUDA:

File: test/test_cuda_trace.py

@ -6,7 +6,7 @@ import unittest.mock
import torch
import torch.cuda._gpu_trace as gpu_trace
from torch.testing._internal.common_utils import TestCase, run_tests, NoTest, TEST_CUDA
from torch.testing._internal.common_utils import NoTest, run_tests, TEST_CUDA, TestCase
# NOTE: Each test needs to be run in a brand new process, to reset the registered hooks
# and make sure the CUDA streams are initialized for each test that uses them.