Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-20 21:14:14 +08:00)
Enable UFMT on test/test_cuda*.py (#124352)
Part of: #123062

Ran lintrunner on:
- test/test_cuda.py
- test/test_cuda_expandable_segments.py
- test/test_cuda_multigpu.py
- test/test_cuda_nvml_based_avail.py
- test/test_cuda_primary_ctx.py
- test/test_cuda_sanitizer.py
- test/test_cuda_trace.py

Detail:
```bash
$ lintrunner -a --take UFMT --all-files
ok No lint issues.
Successfully applied all patches.
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/124352
Approved by: https://github.com/ezyang
Committed by: PyTorch MergeBot
Parent: 977dc5593a
Commit: d5182bb75b
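The diff that follows is purely mechanical: UFMT runs usort (import sorting) and black (code formatting) over the listed test files, so the changes are limited to quote normalization, call wrapping with trailing commas, and import reordering. Below is a minimal, hedged sketch of the rewrite style; the `make_generator` helper and the CPU device are invented for the example, and only the commented before/after pair is taken from the diff itself.

```python
# Illustrative sketch only: the helper below is hypothetical and not part of
# the PyTorch test suite. The commented pair shows the dominant rewrite in
# this diff. Before black:
#
#   gen1 = self._test_memory_stats_generator(self, device=torch.device('cuda:1'), N=35)
#
# after black (double quotes, call wrapped to fit the line-length limit):
#
#   gen1 = self._test_memory_stats_generator(
#       self, device=torch.device("cuda:1"), N=35
#   )
#
# usort additionally groups and alphabetizes the imports at the top of each file.

import torch


def make_generator(owner, device, N):
    """Hypothetical helper, used only to demonstrate the wrapped call style."""
    return (torch.zeros(1, device=device) for _ in range(N))


gen = make_generator(
    None,
    device=torch.device("cpu"),  # the real tests target "cuda:0" / "cuda:1"
    N=35,
)
print(sum(t.item() for t in gen))  # prints 0.0: the generator yields N zero tensors
```

Nothing in the diff changes behavior; the repo-wide run used `lintrunner -a --take UFMT --all-files`, as recorded in the commit message above.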
@@ -1051,13 +1051,6 @@ exclude_patterns = [
     'test/quantization/fx/test_numeric_suite_fx.py',
     'test/quantization/fx/test_quantize_fx.py',
     'test/quantization/fx/test_subgraph_rewriter.py',
-    'test/test_cuda.py',
-    'test/test_cuda_expandable_segments.py',
-    'test/test_cuda_multigpu.py',
-    'test/test_cuda_nvml_based_avail.py',
-    'test/test_cuda_primary_ctx.py',
-    'test/test_cuda_sanitizer.py',
-    'test/test_cuda_trace.py',
     'test/test_custom_op_testing.py',
     'test/test_dataloader.py',
     'test/test_datapipe.py',
test/test_cuda.py: 1822 changed lines (file diff suppressed because it is too large)
@@ -2,13 +2,14 @@
 # run time cuda tests, but with the allocator using expandable segments
 
 import os
 
 import torch
+
 from torch.testing._internal.common_cuda import IS_JETSON
 
 if torch.cuda.is_available() and not IS_JETSON:
-    torch.cuda.memory._set_allocator_settings('expandable_segments:True')
+    torch.cuda.memory._set_allocator_settings("expandable_segments:True")
 
     current_dir = os.path.dirname(os.path.abspath(__file__))
-    filepath = os.path.join(current_dir, 'test_cuda.py')
-    exec(compile(open(filepath).read(), filepath, mode='exec'))
+    filepath = os.path.join(current_dir, "test_cuda.py")
+    exec(compile(open(filepath).read(), filepath, mode="exec"))
@ -3,38 +3,45 @@
|
||||
import collections
|
||||
import contextlib
|
||||
import ctypes
|
||||
import io
|
||||
import gc
|
||||
import io
|
||||
import queue
|
||||
import sys
|
||||
import tempfile
|
||||
import threading
|
||||
import torch
|
||||
import torch.cuda.comm as comm
|
||||
import unittest
|
||||
|
||||
from itertools import repeat, chain
|
||||
from itertools import chain, repeat
|
||||
from typing import NamedTuple
|
||||
|
||||
import torch
|
||||
import torch.cuda.comm as comm
|
||||
from torch.nn.parallel import scatter_gather
|
||||
from torch.testing._internal.common_cuda import (
|
||||
_create_scaling_case,
|
||||
_create_scaling_models_optimizers,
|
||||
TEST_MULTIGPU,
|
||||
)
|
||||
from torch.testing._internal.common_utils import (
|
||||
get_cycles_per_ms,
|
||||
instantiate_parametrized_tests,
|
||||
IS_JETSON,
|
||||
IS_REMOTE_GPU,
|
||||
IS_SANDCASTLE,
|
||||
NoTest,
|
||||
TEST_CUDA,
|
||||
TestCase,
|
||||
get_cycles_per_ms,
|
||||
instantiate_parametrized_tests,
|
||||
run_tests,
|
||||
skipCUDANonDefaultStreamIf,
|
||||
skipIfRocm,
|
||||
TEST_CUDA,
|
||||
TestCase,
|
||||
)
|
||||
from torch.testing._internal.common_cuda import TEST_MULTIGPU, _create_scaling_case, _create_scaling_models_optimizers
|
||||
|
||||
TEST_CUDAMALLOCASYNC = TEST_CUDA and (torch.cuda.get_allocator_backend() == "cudaMallocAsync")
|
||||
TEST_CUDAMALLOCASYNC = TEST_CUDA and (
|
||||
torch.cuda.get_allocator_backend() == "cudaMallocAsync"
|
||||
)
|
||||
|
||||
if not TEST_CUDA:
|
||||
print('CUDA not available, skipping tests', file=sys.stderr)
|
||||
print("CUDA not available, skipping tests", file=sys.stderr)
|
||||
TestCase = NoTest # noqa: F811
|
||||
|
||||
|
||||
@ -44,7 +51,9 @@ class TestCudaMultiGPU(TestCase):
|
||||
def _check_memory_stat_consistency(self):
|
||||
snapshot = torch.cuda.memory_snapshot()
|
||||
|
||||
expected_each_device = collections.defaultdict(lambda: collections.defaultdict(int))
|
||||
expected_each_device = collections.defaultdict(
|
||||
lambda: collections.defaultdict(int)
|
||||
)
|
||||
|
||||
for segment in snapshot:
|
||||
expandable = segment["is_expandable"]
|
||||
@ -56,7 +65,9 @@ class TestCudaMultiGPU(TestCase):
|
||||
expected["segment." + pool_str + ".current"] += 1
|
||||
|
||||
expected["allocated_bytes.all.current"] += segment["allocated_size"]
|
||||
expected["allocated_bytes." + pool_str + ".current"] += segment["allocated_size"]
|
||||
expected["allocated_bytes." + pool_str + ".current"] += segment[
|
||||
"allocated_size"
|
||||
]
|
||||
|
||||
expected["reserved_bytes.all.current"] += segment["total_size"]
|
||||
expected["reserved_bytes." + pool_str + ".current"] += segment["total_size"]
|
||||
@ -65,7 +76,9 @@ class TestCudaMultiGPU(TestCase):
|
||||
expected["active_bytes." + pool_str + ".current"] += segment["active_size"]
|
||||
|
||||
expected["requested_bytes.all.current"] += segment["requested_size"]
|
||||
expected["requested_bytes." + pool_str + ".current"] += segment["requested_size"]
|
||||
expected["requested_bytes." + pool_str + ".current"] += segment[
|
||||
"requested_size"
|
||||
]
|
||||
|
||||
sum_requested = 0
|
||||
is_split = len(segment["blocks"]) > 1
|
||||
@ -83,7 +96,9 @@ class TestCudaMultiGPU(TestCase):
|
||||
expected["inactive_split.all.current"] += 1
|
||||
expected["inactive_split." + pool_str + ".current"] += 1
|
||||
expected["inactive_split_bytes.all.current"] += block["size"]
|
||||
expected["inactive_split_bytes." + pool_str + ".current"] += block["size"]
|
||||
expected["inactive_split_bytes." + pool_str + ".current"] += block[
|
||||
"size"
|
||||
]
|
||||
|
||||
self.assertEqual(sum_requested, segment["requested_size"])
|
||||
|
||||
@ -94,15 +109,15 @@ class TestCudaMultiGPU(TestCase):
|
||||
|
||||
def test_cuda_synchronize(self):
|
||||
torch.cuda.synchronize()
|
||||
torch.cuda.synchronize('cuda')
|
||||
torch.cuda.synchronize('cuda:0')
|
||||
torch.cuda.synchronize("cuda")
|
||||
torch.cuda.synchronize("cuda:0")
|
||||
torch.cuda.synchronize(0)
|
||||
torch.cuda.synchronize(torch.device('cuda:0'))
|
||||
torch.cuda.synchronize(torch.device("cuda:0"))
|
||||
|
||||
if TEST_MULTIGPU:
|
||||
torch.cuda.synchronize('cuda:1')
|
||||
torch.cuda.synchronize("cuda:1")
|
||||
torch.cuda.synchronize(1)
|
||||
torch.cuda.synchronize(torch.device('cuda:1'))
|
||||
torch.cuda.synchronize(torch.device("cuda:1"))
|
||||
|
||||
with self.assertRaisesRegex(ValueError, "Expected a cuda device, but"):
|
||||
torch.cuda.synchronize(torch.device("cpu"))
|
||||
@ -285,8 +300,10 @@ class TestCudaMultiGPU(TestCase):
|
||||
|
||||
# interlace
|
||||
torch.cuda.empty_cache()
|
||||
gen0 = self._test_memory_stats_generator(self, device='cuda:0', N=35)
|
||||
gen1 = self._test_memory_stats_generator(self, device=torch.device('cuda:1'), N=35)
|
||||
gen0 = self._test_memory_stats_generator(self, device="cuda:0", N=35)
|
||||
gen1 = self._test_memory_stats_generator(
|
||||
self, device=torch.device("cuda:1"), N=35
|
||||
)
|
||||
end0 = end1 = False
|
||||
while not (end0 and end1):
|
||||
end0 = advance(gen0, end0)
|
||||
@ -295,7 +312,9 @@ class TestCudaMultiGPU(TestCase):
|
||||
# semi-random order
|
||||
torch.cuda.empty_cache()
|
||||
gen0 = self._test_memory_stats_generator(self, device=0, N=35)
|
||||
gen1 = self._test_memory_stats_generator(self, device=torch.device('cuda:1'), N=35)
|
||||
gen1 = self._test_memory_stats_generator(
|
||||
self, device=torch.device("cuda:1"), N=35
|
||||
)
|
||||
end0 = end1 = False
|
||||
|
||||
while not (end0 and end1):
|
||||
@ -396,10 +415,10 @@ class TestCudaMultiGPU(TestCase):
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
|
||||
def test_copy_streams(self):
|
||||
d0 = torch.device('cuda:0')
|
||||
d0 = torch.device("cuda:0")
|
||||
x0 = torch.zeros(5, 5, device=d0)
|
||||
|
||||
d1 = torch.device('cuda:1')
|
||||
d1 = torch.device("cuda:1")
|
||||
x1 = torch.zeros(5, 5, device=d1)
|
||||
self._test_copy_sync_current_stream(x0, x1)
|
||||
|
||||
@ -416,13 +435,13 @@ class TestCudaMultiGPU(TestCase):
|
||||
@unittest.skipIf(torch.cuda.device_count() >= 10, "Loading a cuda:9 tensor")
|
||||
def test_load_nonexistent_device(self):
|
||||
# Setup: create a serialized file object with a 'cuda:9' restore location
|
||||
tensor = torch.randn(2, device='cuda')
|
||||
tensor = torch.randn(2, device="cuda")
|
||||
buf = io.BytesIO()
|
||||
torch.save(tensor, buf)
|
||||
# NB: this might not work in the future if serialization changes
|
||||
buf = io.BytesIO(buf.getvalue().replace(b'cuda:0', b'cuda:9'))
|
||||
buf = io.BytesIO(buf.getvalue().replace(b"cuda:0", b"cuda:9"))
|
||||
|
||||
msg = r'Attempting to deserialize object on CUDA device 9'
|
||||
msg = r"Attempting to deserialize object on CUDA device 9"
|
||||
with self.assertRaisesRegex(RuntimeError, msg):
|
||||
_ = torch.load(buf)
|
||||
|
||||
@ -431,7 +450,7 @@ class TestCudaMultiGPU(TestCase):
|
||||
x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)]
|
||||
|
||||
def gpu_remap(storage, location):
|
||||
if location == 'cuda:1':
|
||||
if location == "cuda:1":
|
||||
return storage.cuda(0)
|
||||
|
||||
with tempfile.NamedTemporaryFile() as f:
|
||||
@ -450,7 +469,7 @@ class TestCudaMultiGPU(TestCase):
|
||||
with tempfile.NamedTemporaryFile() as f:
|
||||
torch.save(x, f)
|
||||
f.seek(0)
|
||||
x_copy = torch.load(f, map_location={'cuda:1': 'cuda:0'})
|
||||
x_copy = torch.load(f, map_location={"cuda:1": "cuda:0"})
|
||||
for original, copy in zip(x, x_copy):
|
||||
self.assertEqual(copy, original)
|
||||
self.assertIs(type(copy), type(original))
|
||||
@ -458,10 +477,10 @@ class TestCudaMultiGPU(TestCase):
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
|
||||
def test_multigpu_storage_clone(self):
|
||||
x = torch.randn(4, 4, device='cuda:1').storage()
|
||||
x = torch.randn(4, 4, device="cuda:1").storage()
|
||||
y = x.clone()
|
||||
self.assertEqual(x.get_device(), y.get_device())
|
||||
for t in ['byte', 'char', 'short', 'int', 'long', 'half', 'double']:
|
||||
for t in ["byte", "char", "short", "int", "long", "half", "double"]:
|
||||
self.assertEqual(getattr(x, t)().get_device(), x.get_device())
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
|
||||
@ -479,8 +498,8 @@ class TestCudaMultiGPU(TestCase):
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
|
||||
def test_current_stream(self):
|
||||
d0 = torch.device('cuda:0')
|
||||
d1 = torch.device('cuda:1')
|
||||
d0 = torch.device("cuda:0")
|
||||
d1 = torch.device("cuda:1")
|
||||
|
||||
s0 = torch.cuda.current_stream()
|
||||
s1 = torch.cuda.current_stream(device=1)
|
||||
@ -501,15 +520,14 @@ class TestCudaMultiGPU(TestCase):
|
||||
self.assertEqual(d0, s2.device)
|
||||
self.assertEqual(s0, s1)
|
||||
|
||||
with self.assertRaisesRegex(ValueError,
|
||||
"Expected a cuda device, but got: cpu"):
|
||||
torch.cuda.current_stream(torch.device('cpu'))
|
||||
with self.assertRaisesRegex(ValueError, "Expected a cuda device, but got: cpu"):
|
||||
torch.cuda.current_stream(torch.device("cpu"))
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
|
||||
@skipCUDANonDefaultStreamIf(True)
|
||||
def test_default_stream(self):
|
||||
d0 = torch.device('cuda:0')
|
||||
d1 = torch.device('cuda:1')
|
||||
d0 = torch.device("cuda:0")
|
||||
d1 = torch.device("cuda:1")
|
||||
|
||||
with torch.cuda.device(d0):
|
||||
s0 = torch.cuda.default_stream()
|
||||
@ -533,14 +551,13 @@ class TestCudaMultiGPU(TestCase):
|
||||
with torch.cuda.device(d1):
|
||||
self.assertEqual(torch.cuda.current_stream(), s1)
|
||||
|
||||
with self.assertRaisesRegex(ValueError,
|
||||
"Expected a cuda device, but got: cpu"):
|
||||
torch.cuda.default_stream(torch.device('cpu'))
|
||||
with self.assertRaisesRegex(ValueError, "Expected a cuda device, but got: cpu"):
|
||||
torch.cuda.default_stream(torch.device("cpu"))
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
|
||||
def test_stream_event_device(self):
|
||||
d0 = torch.device('cuda:0')
|
||||
d1 = torch.device('cuda:1')
|
||||
d0 = torch.device("cuda:0")
|
||||
d1 = torch.device("cuda:1")
|
||||
e0 = torch.cuda.Event()
|
||||
|
||||
self.assertEqual(None, e0.device)
|
||||
@ -553,10 +570,10 @@ class TestCudaMultiGPU(TestCase):
|
||||
s1 = torch.cuda.Stream()
|
||||
e1 = s1.record_event()
|
||||
|
||||
self.assertEqual(s0.device, torch.device('cuda:0'))
|
||||
self.assertEqual(e0.device, torch.device('cuda:0'))
|
||||
self.assertEqual(s1.device, torch.device('cuda:1'))
|
||||
self.assertEqual(e1.device, torch.device('cuda:1'))
|
||||
self.assertEqual(s0.device, torch.device("cuda:0"))
|
||||
self.assertEqual(e0.device, torch.device("cuda:0"))
|
||||
self.assertEqual(s1.device, torch.device("cuda:1"))
|
||||
self.assertEqual(e1.device, torch.device("cuda:1"))
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
|
||||
def test_stream_context(self):
|
||||
@ -592,18 +609,17 @@ class TestCudaMultiGPU(TestCase):
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
|
||||
def test_streams_multi_gpu(self):
|
||||
default_stream = torch.cuda.current_stream()
|
||||
self.assertEqual(default_stream.device, torch.device('cuda:0'))
|
||||
self.assertEqual(default_stream.device, torch.device("cuda:0"))
|
||||
stream = torch.cuda.Stream(device=1)
|
||||
self.assertEqual(stream.device, torch.device('cuda:1'))
|
||||
self.assertEqual(stream.device, torch.device("cuda:1"))
|
||||
with torch.cuda.device(1):
|
||||
self.assertEqual(
|
||||
torch.cuda.current_stream().device, torch.device('cuda:1'))
|
||||
self.assertEqual(torch.cuda.current_stream().device, torch.device("cuda:1"))
|
||||
self.assertNotEqual(torch.cuda.current_stream(), default_stream)
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
|
||||
def test_streams_multi_gpu_query(self):
|
||||
d0 = torch.device('cuda:0')
|
||||
d1 = torch.device('cuda:1')
|
||||
d0 = torch.device("cuda:0")
|
||||
d1 = torch.device("cuda:1")
|
||||
torch.cuda.synchronize(d0)
|
||||
torch.cuda.synchronize(d1)
|
||||
|
||||
@ -642,8 +658,8 @@ class TestCudaMultiGPU(TestCase):
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
|
||||
def test_streams_multi_gpu_eq(self):
|
||||
d0 = torch.device('cuda:0')
|
||||
d1 = torch.device('cuda:1')
|
||||
d0 = torch.device("cuda:0")
|
||||
d1 = torch.device("cuda:1")
|
||||
|
||||
with torch.cuda.device(d0):
|
||||
s0 = torch.cuda.current_stream()
|
||||
@ -676,12 +692,12 @@ class TestCudaMultiGPU(TestCase):
|
||||
s0 = torch.cuda.Stream(device=0, priority=low)
|
||||
|
||||
self.assertEqual(low, s0.priority)
|
||||
self.assertEqual(torch.device('cuda:0'), s0.device)
|
||||
self.assertEqual(torch.device("cuda:0"), s0.device)
|
||||
|
||||
s1 = torch.cuda.Stream(device=1, priority=high)
|
||||
|
||||
self.assertEqual(high, s1.priority)
|
||||
self.assertEqual(torch.device('cuda:1'), s1.device)
|
||||
self.assertEqual(torch.device("cuda:1"), s1.device)
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
|
||||
def test_tensor_device(self):
|
||||
@ -754,7 +770,7 @@ class TestCudaMultiGPU(TestCase):
|
||||
|
||||
@staticmethod
|
||||
def _test_stream_event_nogil(self, sync_func, p2c, c2p):
|
||||
with torch.cuda.device('cuda:1'):
|
||||
with torch.cuda.device("cuda:1"):
|
||||
c2p.put(0)
|
||||
p2c.get()
|
||||
c2p.put(sync_func(self, TestCudaMultiGPU.FIFTY_MIL_CYCLES))
|
||||
@ -763,9 +779,11 @@ class TestCudaMultiGPU(TestCase):
|
||||
@skipIfRocm
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
|
||||
def test_stream_event_nogil(self):
|
||||
for sync_func in [TestCudaMultiGPU._stream_synchronize,
|
||||
TestCudaMultiGPU._event_synchronize,
|
||||
TestCudaMultiGPU._event_wait]:
|
||||
for sync_func in [
|
||||
TestCudaMultiGPU._stream_synchronize,
|
||||
TestCudaMultiGPU._event_synchronize,
|
||||
TestCudaMultiGPU._event_wait,
|
||||
]:
|
||||
p2c = queue.Queue()
|
||||
c2p = queue.Queue()
|
||||
e_tik = torch.cuda.Event(enable_timing=True)
|
||||
@ -773,12 +791,13 @@ class TestCudaMultiGPU(TestCase):
|
||||
|
||||
t = threading.Thread(
|
||||
target=TestCudaMultiGPU._test_stream_event_nogil,
|
||||
args=(self, sync_func, p2c, c2p))
|
||||
args=(self, sync_func, p2c, c2p),
|
||||
)
|
||||
t.daemon = True
|
||||
t.start()
|
||||
|
||||
c2p.get()
|
||||
with torch.cuda.device('cuda:0'):
|
||||
with torch.cuda.device("cuda:0"):
|
||||
e_tik.record()
|
||||
p2c.put(0)
|
||||
parent_time = sync_func(self, TestCudaMultiGPU.FIFTY_MIL_CYCLES)
|
||||
@ -801,8 +820,8 @@ class TestCudaMultiGPU(TestCase):
|
||||
@skipIfRocm
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
|
||||
def test_events_wait(self):
|
||||
d0 = torch.device('cuda:0')
|
||||
d1 = torch.device('cuda:1')
|
||||
d0 = torch.device("cuda:0")
|
||||
d1 = torch.device("cuda:1")
|
||||
torch.cuda.synchronize(d0)
|
||||
torch.cuda.synchronize(d1)
|
||||
|
||||
@ -827,8 +846,8 @@ class TestCudaMultiGPU(TestCase):
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
|
||||
def test_events_multi_gpu_query(self):
|
||||
d0 = torch.device('cuda:0')
|
||||
d1 = torch.device('cuda:1')
|
||||
d0 = torch.device("cuda:0")
|
||||
d1 = torch.device("cuda:1")
|
||||
|
||||
with torch.cuda.device(d0):
|
||||
s0 = torch.cuda.current_stream()
|
||||
@ -869,8 +888,8 @@ class TestCudaMultiGPU(TestCase):
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
|
||||
@skipIfRocm
|
||||
def test_events_multi_gpu_elapsed_time(self):
|
||||
d0 = torch.device('cuda:0')
|
||||
d1 = torch.device('cuda:1')
|
||||
d0 = torch.device("cuda:0")
|
||||
d1 = torch.device("cuda:1")
|
||||
|
||||
with torch.cuda.device(d0):
|
||||
s0 = torch.cuda.current_stream()
|
||||
@ -934,8 +953,7 @@ class TestCudaMultiGPU(TestCase):
|
||||
def test_external_streams_multi_device(self):
|
||||
device = torch.cuda.device(1)
|
||||
with self._get_external_stream(device) as stream_v:
|
||||
ext_stream = torch.cuda.ExternalStream(
|
||||
stream_v, device=device)
|
||||
ext_stream = torch.cuda.ExternalStream(stream_v, device=device)
|
||||
self.assertEqual(stream_v, ext_stream.cuda_stream)
|
||||
self.assertEqual(ext_stream.device.index, device.idx)
|
||||
|
||||
@ -956,7 +974,7 @@ class TestCudaMultiGPU(TestCase):
|
||||
|
||||
del t
|
||||
t = torch.FloatTensor([2]).pin_memory()
|
||||
self.assertNotEqual(t.data_ptr(), ptr, msg='allocation re-used too soon')
|
||||
self.assertNotEqual(t.data_ptr(), ptr, msg="allocation re-used too soon")
|
||||
|
||||
with torch.cuda.device(0):
|
||||
gpu_tensor0.copy_(t, non_blocking=True)
|
||||
@ -988,7 +1006,7 @@ class TestCudaMultiGPU(TestCase):
|
||||
def _test(idx):
|
||||
before_free_bytes, before_available_bytes = torch.cuda.mem_get_info(idx)
|
||||
# increasing to 8MB to force acquiring a new block and overcome blocksize differences across platforms
|
||||
t = torch.randn(1024 * 1024 * 8, device='cuda:' + str(idx))
|
||||
t = torch.randn(1024 * 1024 * 8, device="cuda:" + str(idx))
|
||||
if IS_JETSON:
|
||||
# w/o syncing, mem_get_info will run before memory allocated has actually increased.
|
||||
# This race condition causes consistent failure
|
||||
@ -1022,6 +1040,7 @@ class TestCudaMultiGPU(TestCase):
|
||||
leak_gpu0()
|
||||
except RuntimeError as e:
|
||||
import re
|
||||
|
||||
assert re.match(regex, str(e)), str(e) + "\n does not match: \n" + regex
|
||||
else:
|
||||
# assertRaisesRegex does not pass with Python for Jetson,
|
||||
@ -1030,12 +1049,15 @@ class TestCudaMultiGPU(TestCase):
|
||||
leak_gpu0()
|
||||
|
||||
if TEST_MULTIGPU:
|
||||
|
||||
@self.wrap_with_cuda_memory_check
|
||||
def leak_gpu1():
|
||||
# increasing to 8MB to force acquiring a new block and overcome blocksize differences across platforms
|
||||
l.append(torch.randn(1024 * 1024 * 8, device=torch.device("cuda:1")))
|
||||
|
||||
with self.assertRaisesRegex(RuntimeError, r"CUDA driver API confirmed .+ on device 1.+"):
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError, r"CUDA driver API confirmed .+ on device 1.+"
|
||||
):
|
||||
leak_gpu1()
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
|
||||
@ -1071,8 +1093,8 @@ class TestCudaMultiGPU(TestCase):
|
||||
# Multiply by 2 here so to's backward creates gradient values that are different from the case above,
|
||||
# to mitigate weirdness if the caching allocator happens to reuse memory regions that were populated
|
||||
# with 1s by the case above
|
||||
s0 = to_backward_recipient.to(device="cuda:0").sum() * 2.
|
||||
s1 = to_backward_recipient.to(device="cuda:0").sum() * 2.
|
||||
s0 = to_backward_recipient.to(device="cuda:0").sum() * 2.0
|
||||
s1 = to_backward_recipient.to(device="cuda:0").sum() * 2.0
|
||||
torch.cuda.synchronize(device=dev0)
|
||||
torch.cuda.synchronize(device=dev1)
|
||||
s0.backward(retain_graph=True)
|
||||
@ -1085,7 +1107,12 @@ class TestCudaMultiGPU(TestCase):
|
||||
def test_cuda_init_race(self):
|
||||
# See https://github.com/pytorch/pytorch/issues/16559
|
||||
import subprocess
|
||||
subprocess.check_call([sys.executable, '-c', """\
|
||||
|
||||
subprocess.check_call(
|
||||
[
|
||||
sys.executable,
|
||||
"-c",
|
||||
"""\
|
||||
import torch
|
||||
import threading
|
||||
|
||||
@ -1096,7 +1123,9 @@ t1 = threading.Thread(target=worker, args=(0,))
|
||||
t2 = threading.Thread(target=worker, args=(1,))
|
||||
t1.start()
|
||||
t2.start()
|
||||
"""])
|
||||
""",
|
||||
]
|
||||
)
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
|
||||
def test_grad_scaling_device_as_key(self):
|
||||
@ -1128,14 +1157,24 @@ t2.start()
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
|
||||
def test_grad_scaling_scale(self):
|
||||
scaler = torch.cuda.amp.GradScaler(init_scale=2.)
|
||||
scaler = torch.cuda.amp.GradScaler(init_scale=2.0)
|
||||
t0 = torch.full((1,), 4.0, dtype=torch.float32, device="cuda:0")
|
||||
t1 = torch.full((1,), 4.0, dtype=torch.float32, device="cuda:1")
|
||||
# Create some nested iterables of tensors on different devices.
|
||||
outputs = (t1.clone(), (t0.clone(), t1.clone()), [t0.clone(), (t1.clone(), t0.clone())])
|
||||
outputs = (
|
||||
t1.clone(),
|
||||
(t0.clone(), t1.clone()),
|
||||
[t0.clone(), (t1.clone(), t0.clone())],
|
||||
)
|
||||
outputs = scaler.scale(outputs)
|
||||
self.assertTrue(outputs[0] == 8.0 and outputs[1][0] == 8.0 and outputs[1][1] == 8.0 and
|
||||
outputs[2][0] == 8.0 and outputs[2][1][0] == 8.0 and outputs[2][1][1] == 8.0)
|
||||
self.assertTrue(
|
||||
outputs[0] == 8.0
|
||||
and outputs[1][0] == 8.0
|
||||
and outputs[1][1] == 8.0
|
||||
and outputs[2][0] == 8.0
|
||||
and outputs[2][1][0] == 8.0
|
||||
and outputs[2][1][1] == 8.0
|
||||
)
|
||||
self.assertTrue(scaler._scale.device == t1.device)
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
|
||||
@ -1148,12 +1187,25 @@ t2.start()
|
||||
dev1 = torch.device("cuda:1")
|
||||
|
||||
for enabled in True, False:
|
||||
mod_control0, mod_scaling0, opt_control0, opt_scaling0, data, loss_fn, skip_iter = \
|
||||
_create_scaling_case()
|
||||
mod_control1, mod_scaling1, opt_control1, opt_scaling1 = \
|
||||
_create_scaling_models_optimizers(device=dev1)
|
||||
(
|
||||
mod_control0,
|
||||
mod_scaling0,
|
||||
opt_control0,
|
||||
opt_scaling0,
|
||||
data,
|
||||
loss_fn,
|
||||
skip_iter,
|
||||
) = _create_scaling_case()
|
||||
(
|
||||
mod_control1,
|
||||
mod_scaling1,
|
||||
opt_control1,
|
||||
opt_scaling1,
|
||||
) = _create_scaling_models_optimizers(device=dev1)
|
||||
|
||||
scaler = torch.cuda.amp.GradScaler(init_scale=128., growth_factor=2.0, enabled=enabled, growth_interval=1)
|
||||
scaler = torch.cuda.amp.GradScaler(
|
||||
init_scale=128.0, growth_factor=2.0, enabled=enabled, growth_interval=1
|
||||
)
|
||||
|
||||
def run(model0, model1, optimizer0, optimizer1, try_scaling_api):
|
||||
for i, (input, target) in enumerate(data):
|
||||
@ -1162,13 +1214,15 @@ t2.start()
|
||||
output0 = model0(input)
|
||||
output1 = model1(input.to(dev1))
|
||||
loss0 = loss_fn(0.3 * output0 + 0.7 * output1.to(dev0), target)
|
||||
loss1 = loss_fn(0.6 * output0.to(dev1) - 0.4 * output1, target.to(dev1))
|
||||
loss1 = loss_fn(
|
||||
0.6 * output0.to(dev1) - 0.4 * output1, target.to(dev1)
|
||||
)
|
||||
|
||||
if try_scaling_api:
|
||||
scaler.scale(loss0).backward(retain_graph=True)
|
||||
scaler.scale(loss1).backward()
|
||||
if i == skip_iter and scaler.is_enabled():
|
||||
model1[1].weight.grad.data.fill_(float('inf'))
|
||||
model1[1].weight.grad.data.fill_(float("inf"))
|
||||
|
||||
# As an additional stress test, separately unscale for one of the optimizers.
|
||||
scaler.unscale_(optimizer0)
|
||||
@ -1178,11 +1232,20 @@ t2.start()
|
||||
|
||||
# Make sure the found_infs were collected properly across optimizers and devices.
|
||||
if scaler.is_enabled():
|
||||
self.assertTrue(len(scaler._found_inf_per_device(optimizer0)) == 1)
|
||||
self.assertTrue(len(scaler._found_inf_per_device(optimizer1)) == 1)
|
||||
self.assertTrue(scaler._found_inf_per_device(optimizer0)[dev0].item() == 0.)
|
||||
self.assertTrue(scaler._found_inf_per_device(optimizer1)[dev1].item() ==
|
||||
float(i == skip_iter))
|
||||
self.assertTrue(
|
||||
len(scaler._found_inf_per_device(optimizer0)) == 1
|
||||
)
|
||||
self.assertTrue(
|
||||
len(scaler._found_inf_per_device(optimizer1)) == 1
|
||||
)
|
||||
self.assertTrue(
|
||||
scaler._found_inf_per_device(optimizer0)[dev0].item()
|
||||
== 0.0
|
||||
)
|
||||
self.assertTrue(
|
||||
scaler._found_inf_per_device(optimizer1)[dev1].item()
|
||||
== float(i == skip_iter)
|
||||
)
|
||||
|
||||
scaler.update()
|
||||
else:
|
||||
@ -1196,25 +1259,41 @@ t2.start()
|
||||
run(mod_scaling0, mod_scaling1, opt_scaling0, opt_scaling1, True)
|
||||
|
||||
# The loss scale should have been multiplied by the growth factor 3 times and the backoff factor once.
|
||||
self.assertTrue(scaler.get_scale() == (128. * scaler.get_growth_factor()**3 *
|
||||
scaler.get_backoff_factor()**1) if enabled else 1.0)
|
||||
self.assertTrue(
|
||||
scaler.get_scale()
|
||||
== (
|
||||
128.0
|
||||
* scaler.get_growth_factor() ** 3
|
||||
* scaler.get_backoff_factor() ** 1
|
||||
)
|
||||
if enabled
|
||||
else 1.0
|
||||
)
|
||||
|
||||
# Copy mod_control1 and mod_scaling1 back the device 0 for comparison
|
||||
mod_control1.to(dev0)
|
||||
mod_scaling1.to(dev0)
|
||||
|
||||
for c, s in zip(chain(mod_control0.parameters(), mod_control1.parameters()),
|
||||
chain(mod_scaling0.parameters(), mod_scaling1.parameters())):
|
||||
for c, s in zip(
|
||||
chain(mod_control0.parameters(), mod_control1.parameters()),
|
||||
chain(mod_scaling0.parameters(), mod_scaling1.parameters()),
|
||||
):
|
||||
self.assertEqual(c, s, rtol=1e-5, atol=1e-7)
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "Test needs multiple GPUs")
|
||||
def test_cuda_device_memory_allocated(self):
|
||||
from torch.cuda import memory_allocated
|
||||
|
||||
device_count = torch.cuda.device_count()
|
||||
current_alloc = [memory_allocated(idx) for idx in range(device_count)]
|
||||
x = torch.ones(10, device="cuda:0")
|
||||
self.assertGreater(memory_allocated(0), current_alloc[0])
|
||||
self.assertTrue(all(memory_allocated(torch.cuda.device(idx)) == current_alloc[idx] for idx in range(1, device_count)))
|
||||
self.assertTrue(
|
||||
all(
|
||||
memory_allocated(torch.cuda.device(idx)) == current_alloc[idx]
|
||||
for idx in range(1, device_count)
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
class TestCudaComm(TestCase):
|
||||
@ -1226,12 +1305,17 @@ class TestCudaComm(TestCase):
|
||||
for i, t in enumerate(results):
|
||||
self.assertEqual(t.get_device(), i)
|
||||
self.assertEqual(t, input)
|
||||
if input.is_cuda and input.get_device() == i: # test not copying on same device
|
||||
if (
|
||||
input.is_cuda and input.get_device() == i
|
||||
): # test not copying on same device
|
||||
self.assertEqual(t.data_ptr(), input.data_ptr())
|
||||
# test out=
|
||||
for inplace in [True, False]:
|
||||
if inplace:
|
||||
outputs = [torch.empty_like(input, device=0), torch.empty_like(input, device=1)]
|
||||
outputs = [
|
||||
torch.empty_like(input, device=0),
|
||||
torch.empty_like(input, device=1),
|
||||
]
|
||||
else:
|
||||
outputs = [input.cuda(0), torch.empty_like(input, device=1)]
|
||||
results = comm.broadcast(input, out=outputs)
|
||||
@ -1241,13 +1325,19 @@ class TestCudaComm(TestCase):
|
||||
self.assertEqual(t.get_device(), i)
|
||||
self.assertEqual(t, input)
|
||||
# test error msg
|
||||
with self.assertRaisesRegex(RuntimeError, r"Exactly one of 'devices' and 'out'"):
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError, r"Exactly one of 'devices' and 'out'"
|
||||
):
|
||||
comm.broadcast(input, (0, 1), out=outputs)
|
||||
with self.assertRaisesRegex(RuntimeError,
|
||||
r"Expected all output tensors to be CUDA tensors, but output tensor at index 1"):
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError,
|
||||
r"Expected all output tensors to be CUDA tensors, but output tensor at index 1",
|
||||
):
|
||||
comm.broadcast(input, out=[input.cuda(0), input.cpu()])
|
||||
with self.assertRaisesRegex(RuntimeError,
|
||||
r"Expected all output tensors to have same shape as the source .+ at index 1"):
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError,
|
||||
r"Expected all output tensors to have same shape as the source .+ at index 1",
|
||||
):
|
||||
comm.broadcast(input, out=[input.cuda(0), input.cuda(1).unsqueeze(0)])
|
||||
|
||||
def test_broadcast_cpu(self):
|
||||
@ -1289,16 +1379,16 @@ class TestCudaComm(TestCase):
|
||||
numel = 5
|
||||
num_bytes = numel * 8
|
||||
tensors = [
|
||||
self.genSparseTensor((2, 3), 2, 1, False, 'cuda', torch.float64)[0],
|
||||
self.genSparseTensor((2, 3), 2, 1, False, "cuda", torch.float64)[0],
|
||||
torch.randn(numel).long().cuda(),
|
||||
torch.randn(numel).cuda(),
|
||||
self.genSparseTensor((2, 3), 2, 10, False, 'cuda', torch.float64)[0],
|
||||
self.genSparseTensor((2, 3), 2, 5, False, 'cuda', torch.float64)[0],
|
||||
self.genSparseTensor((3, 3), 2, 7, False, 'cuda', torch.int64)[0],
|
||||
self.genSparseTensor((2, 3), 2, 2, False, 'cuda', torch.float32)[0],
|
||||
self.genSparseTensor((2, 3), 2, 10, False, "cuda", torch.float64)[0],
|
||||
self.genSparseTensor((2, 3), 2, 5, False, "cuda", torch.float64)[0],
|
||||
self.genSparseTensor((3, 3), 2, 7, False, "cuda", torch.int64)[0],
|
||||
self.genSparseTensor((2, 3), 2, 2, False, "cuda", torch.float32)[0],
|
||||
torch.randn(numel).long().cuda(),
|
||||
torch.randn(numel).long().cuda(),
|
||||
self.genSparseTensor((2, 7), 2, 3, False, 'cuda', torch.int64)[0],
|
||||
self.genSparseTensor((2, 7), 2, 3, False, "cuda", torch.int64)[0],
|
||||
torch.randn(numel * 2).int().cuda(), # int is 2x shorter
|
||||
torch.randn(numel).cuda(),
|
||||
]
|
||||
@ -1323,7 +1413,7 @@ class TestCudaComm(TestCase):
|
||||
tensors = [
|
||||
torch.tensor([]).byte().cuda(),
|
||||
torch.randn(5).cuda(),
|
||||
torch.randn(5).double().cuda()
|
||||
torch.randn(5).double().cuda(),
|
||||
]
|
||||
self._test_broadcast_coalesced(tensors, 256)
|
||||
|
||||
@ -1364,16 +1454,16 @@ class TestCudaComm(TestCase):
|
||||
numel = 5
|
||||
num_bytes = numel * 8
|
||||
tensors = [
|
||||
self.genSparseTensor((2, 3), 2, 1, False, 'cuda', torch.float64)[0],
|
||||
self.genSparseTensor((2, 3), 2, 1, False, "cuda", torch.float64)[0],
|
||||
torch.randn(numel).long().cuda(),
|
||||
torch.randn(numel).cuda(),
|
||||
self.genSparseTensor((2, 3), 2, 10, False, 'cuda', torch.float64)[0],
|
||||
self.genSparseTensor((2, 3), 2, 5, False, 'cuda', torch.float64)[0],
|
||||
self.genSparseTensor((3, 3), 2, 7, False, 'cuda', torch.int64)[0],
|
||||
self.genSparseTensor((2, 3), 2, 2, False, 'cuda', torch.float32)[0],
|
||||
self.genSparseTensor((2, 3), 2, 10, False, "cuda", torch.float64)[0],
|
||||
self.genSparseTensor((2, 3), 2, 5, False, "cuda", torch.float64)[0],
|
||||
self.genSparseTensor((3, 3), 2, 7, False, "cuda", torch.int64)[0],
|
||||
self.genSparseTensor((2, 3), 2, 2, False, "cuda", torch.float32)[0],
|
||||
torch.randn(numel).long().cuda(),
|
||||
torch.randn(numel).long().cuda(),
|
||||
self.genSparseTensor((2, 7), 2, 3, False, 'cuda', torch.int64)[0],
|
||||
self.genSparseTensor((2, 7), 2, 3, False, "cuda", torch.int64)[0],
|
||||
torch.randn(numel * 2).int().cuda(), # int is 2x shorter
|
||||
torch.randn(numel).cuda(),
|
||||
]
|
||||
@ -1412,7 +1502,9 @@ class TestCudaComm(TestCase):
|
||||
self.assertEqual(r, input[tuple(index)], atol=0, rtol=0)
|
||||
chunk_start = chunk_end
|
||||
if r.device == input.device:
|
||||
self.assertEqual(r.data_ptr(), input.data_ptr()) # for target @ same device, a view should be returned
|
||||
self.assertEqual(
|
||||
r.data_ptr(), input.data_ptr()
|
||||
) # for target @ same device, a view should be returned
|
||||
|
||||
# test out
|
||||
out = [torch.empty_like(t) for t in result]
|
||||
@ -1429,20 +1521,38 @@ class TestCudaComm(TestCase):
|
||||
|
||||
# test error msg
|
||||
if chunk_sizes is not None:
|
||||
with self.assertRaisesRegex(RuntimeError, r"Expected devices and chunk_sizes to be of same length"):
|
||||
comm.scatter(input, [0 for _ in range(len(chunk_sizes) + 1)], dim=dim, chunk_sizes=chunk_sizes)
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError, r"Expected devices and chunk_sizes to be of same length"
|
||||
):
|
||||
comm.scatter(
|
||||
input,
|
||||
[0 for _ in range(len(chunk_sizes) + 1)],
|
||||
dim=dim,
|
||||
chunk_sizes=chunk_sizes,
|
||||
)
|
||||
with self.assertRaisesRegex(RuntimeError, r"'devices' must not be specified"):
|
||||
comm.scatter(input, (0, 1), dim=dim, out=out)
|
||||
with self.assertRaisesRegex(RuntimeError, r"Expected at least one device to scatter to"):
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError, r"Expected at least one device to scatter to"
|
||||
):
|
||||
comm.scatter(input, (), dim=dim)
|
||||
with self.assertRaisesRegex(RuntimeError, r"Expected at least one output tensor to scatter to"):
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError, r"Expected at least one output tensor to scatter to"
|
||||
):
|
||||
comm.scatter(input, dim=dim, out=[])
|
||||
with self.assertRaisesRegex(RuntimeError,
|
||||
r"Expected all output tensors to be CUDA tensors, but output tensor at index 0"):
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError,
|
||||
r"Expected all output tensors to be CUDA tensors, but output tensor at index 0",
|
||||
):
|
||||
comm.scatter(input, dim=dim, out=([out[0].cpu()] + out[1:]))
|
||||
with self.assertRaisesRegex(RuntimeError, r"Output tensor at index 0 has incorrect shape"):
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError, r"Output tensor at index 0 has incorrect shape"
|
||||
):
|
||||
comm.scatter(input, dim=dim, out=([out[0].unsqueeze(0)] + out[1:]))
|
||||
with self.assertRaisesRegex(RuntimeError, r"Total size for output tensors along scatter dim \d+ does not match"):
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError,
|
||||
r"Total size for output tensors along scatter dim \d+ does not match",
|
||||
):
|
||||
index = [slice(None, None) for _ in range(input.dim())]
|
||||
index[dim] = slice(1, None)
|
||||
comm.scatter(input, dim=dim, out=([out[0][tuple(index)]] + out[1:]))
|
||||
@ -1480,13 +1590,13 @@ class TestCudaComm(TestCase):
|
||||
expected_size[dim] += y.size(dim)
|
||||
expected_size = torch.Size(expected_size)
|
||||
|
||||
destinations = [None, torch.device('cuda:0'), torch.device('cpu')]
|
||||
destinations = [None, torch.device("cuda:0"), torch.device("cpu")]
|
||||
if torch.cuda.device_count() > 2:
|
||||
destinations.append(torch.device('cuda:2'))
|
||||
destinations.append(torch.device("cuda:2"))
|
||||
with torch.cuda.device(1):
|
||||
for destination in destinations:
|
||||
if destination is None:
|
||||
expected_device = torch.device('cuda', torch.cuda.current_device())
|
||||
expected_device = torch.device("cuda", torch.cuda.current_device())
|
||||
else:
|
||||
expected_device = destination
|
||||
for use_out in [True, False]:
|
||||
@ -1507,15 +1617,31 @@ class TestCudaComm(TestCase):
|
||||
self.assertEqual(result[tuple(index)], y)
|
||||
|
||||
# test error msg
|
||||
with self.assertRaisesRegex(RuntimeError, r"'destination' must not be specified"):
|
||||
comm.gather((x, y), dim, destination='cpu', out=torch.empty(expected_size, device='cpu'))
|
||||
with self.assertRaisesRegex(RuntimeError, r"Expected at least one tensor to gather from"):
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError, r"'destination' must not be specified"
|
||||
):
|
||||
comm.gather(
|
||||
(x, y),
|
||||
dim,
|
||||
destination="cpu",
|
||||
out=torch.empty(expected_size, device="cpu"),
|
||||
)
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError, r"Expected at least one tensor to gather from"
|
||||
):
|
||||
comm.gather(())
|
||||
with self.assertRaisesRegex(RuntimeError, r"Expected all input tensors to be CUDA tensors, "):
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError, r"Expected all input tensors to be CUDA tensors, "
|
||||
):
|
||||
comm.gather((x.cpu(), y))
|
||||
with self.assertRaisesRegex(RuntimeError, r"Expected all input tensors to have the same number of dimensions"):
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError,
|
||||
r"Expected all input tensors to have the same number of dimensions",
|
||||
):
|
||||
comm.gather((x, y.unsqueeze(0)))
|
||||
with self.assertRaisesRegex(RuntimeError, r"Input tensor at index 1 has invalid shape"):
|
||||
with self.assertRaisesRegex(
|
||||
RuntimeError, r"Input tensor at index 1 has invalid shape"
|
||||
):
|
||||
if dim in [0, -2]:
|
||||
comm.gather((x, y[:, 1:]), dim=dim)
|
||||
elif dim in [1, -1]:
|
||||
@ -1532,7 +1658,9 @@ class TestCudaComm(TestCase):
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
|
||||
def test_memory_format_scatter_gather(self):
|
||||
nhwc = torch.randn((10, 3, 32, 32), device='cpu').contiguous(memory_format=torch.channels_last)
|
||||
nhwc = torch.randn((10, 3, 32, 32), device="cpu").contiguous(
|
||||
memory_format=torch.channels_last
|
||||
)
|
||||
results = torch.cuda.comm.scatter(nhwc, (0, 1), None, 0)
|
||||
for result in results:
|
||||
self.assertFalse(result.is_contiguous())
|
||||
@ -1541,7 +1669,6 @@ class TestCudaComm(TestCase):
|
||||
gathered = torch.cuda.comm.gather(results)
|
||||
self.assertTrue(gathered.is_contiguous(memory_format=torch.channels_last))
|
||||
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "Test needs multiple GPUs")
|
||||
def test_scatter_namedtuple(self):
|
||||
# tests ability to scatter namedtuples and retrieve a list where each
|
||||
@ -1589,8 +1716,8 @@ class TestCudaComm(TestCase):
|
||||
def test_gather_namedtuple(self):
|
||||
# tests ability to gather a list of namedtuples and return a namedtuple where each
|
||||
# element is of the expected tensor type.
|
||||
fields = ['a', 'b']
|
||||
TestNamedTupleInput_0 = collections.namedtuple('NamedTuple', fields)
|
||||
fields = ["a", "b"]
|
||||
TestNamedTupleInput_0 = collections.namedtuple("NamedTuple", fields)
|
||||
|
||||
num_gpus = torch.cuda.device_count()
|
||||
a = torch.rand(num_gpus * 2, device=0)
|
||||
@ -1603,10 +1730,10 @@ class TestCudaComm(TestCase):
|
||||
|
||||
outputs = [out1, out2]
|
||||
|
||||
out = scatter_gather.gather(outputs, 'cpu') # test on CPU
|
||||
out = scatter_gather.gather(outputs, "cpu") # test on CPU
|
||||
for i, x in enumerate(out):
|
||||
self.assertTrue(isinstance(x, type(out2[-1]))) # x must be a tensor
|
||||
cat = torch.cat((outputs[0][i].to('cpu'), outputs[1][i].to('cpu')))
|
||||
cat = torch.cat((outputs[0][i].to("cpu"), outputs[1][i].to("cpu")))
|
||||
self.assertTrue(torch.equal(x, cat))
|
||||
|
||||
out = scatter_gather.gather(outputs, 0) # test on GPU
|
||||
@ -1635,15 +1762,15 @@ class TestCudaComm(TestCase):
|
||||
cat = torch.cat((outputs[0][i].to(0), outputs[1][i].to(0)))
|
||||
self.assertTrue(torch.equal(x, cat))
|
||||
|
||||
out = scatter_gather.gather(outputs, 'cpu') # test on CPU
|
||||
out = scatter_gather.gather(outputs, "cpu") # test on CPU
|
||||
for i, x in enumerate(out):
|
||||
self.assertTrue(isinstance(x, type(out2[-1])))
|
||||
cat = torch.cat((outputs[0][i].to('cpu'), outputs[1][i].to('cpu')))
|
||||
cat = torch.cat((outputs[0][i].to("cpu"), outputs[1][i].to("cpu")))
|
||||
self.assertTrue(torch.equal(x, cat))
|
||||
|
||||
|
||||
instantiate_parametrized_tests(TestCudaMultiGPU)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
@ -1,19 +1,28 @@
|
||||
# Owner(s): ["module: cuda"]
|
||||
|
||||
import multiprocessing
|
||||
import os
|
||||
import sys
|
||||
import multiprocessing
|
||||
import torch
|
||||
import unittest
|
||||
from unittest.mock import patch
|
||||
|
||||
import torch
|
||||
|
||||
# NOTE: Each of the tests in this module need to be run in a brand new process to ensure CUDA is uninitialized
|
||||
# prior to test initiation.
|
||||
with patch.dict(os.environ, {"PYTORCH_NVML_BASED_CUDA_CHECK": "1"}):
|
||||
# Before executing the desired tests, we need to disable CUDA initialization and fork_handler additions that would
|
||||
# otherwise be triggered by the `torch.testing._internal.common_utils` module import
|
||||
from torch.testing._internal.common_utils import (parametrize, instantiate_parametrized_tests, run_tests, TestCase,
|
||||
IS_WINDOWS, IS_JETSON, NoTest)
|
||||
from torch.testing._internal.common_utils import (
|
||||
instantiate_parametrized_tests,
|
||||
IS_JETSON,
|
||||
IS_WINDOWS,
|
||||
NoTest,
|
||||
parametrize,
|
||||
run_tests,
|
||||
TestCase,
|
||||
)
|
||||
|
||||
# NOTE: Because `remove_device_and_dtype_suffixes` initializes CUDA context (triggered via the import of
|
||||
# `torch.testing._internal.common_device_type` which imports `torch.testing._internal.common_cuda`) we need
|
||||
# to bypass that method here which should be irrelevant to the parameterized tests in this module.
|
||||
@ -21,7 +30,7 @@ with patch.dict(os.environ, {"PYTORCH_NVML_BASED_CUDA_CHECK": "1"}):
|
||||
|
||||
TEST_CUDA = torch.cuda.is_available()
|
||||
if not TEST_CUDA:
|
||||
print('CUDA not available, skipping tests', file=sys.stderr)
|
||||
print("CUDA not available, skipping tests", file=sys.stderr)
|
||||
TestCase = NoTest # type: ignore[misc, assignment] # noqa: F811
|
||||
|
||||
|
||||
@ -30,11 +39,14 @@ class TestExtendedCUDAIsAvail(TestCase):
|
||||
SUBPROCESS_REMINDER_MSG = (
|
||||
"\n REMINDER: Tests defined in test_cuda_nvml_based_avail.py must be run in a process "
|
||||
"where there CUDA Driver API has not been initialized. Before further debugging, ensure you are either using "
|
||||
"run_test.py or have added --subprocess to run each test in a different subprocess.")
|
||||
"run_test.py or have added --subprocess to run each test in a different subprocess."
|
||||
)
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
torch.cuda._cached_device_count = None # clear the lru_cache on this method before our test
|
||||
torch.cuda._cached_device_count = (
|
||||
None # clear the lru_cache on this method before our test
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def in_bad_fork_test() -> bool:
|
||||
@ -47,31 +59,33 @@ class TestExtendedCUDAIsAvail(TestCase):
|
||||
# If the NVML-based assessment is attempted but fails, the CUDA Runtime API check should be executed
|
||||
@unittest.skipIf(IS_WINDOWS, "Needs fork")
|
||||
@parametrize("nvml_avail", [True, False])
|
||||
@parametrize("avoid_init", ['1', '0', None])
|
||||
@parametrize("avoid_init", ["1", "0", None])
|
||||
def test_cuda_is_available(self, avoid_init, nvml_avail):
|
||||
if IS_JETSON and nvml_avail and avoid_init == '1':
|
||||
self.skipTest('Not working for Jetson')
|
||||
if IS_JETSON and nvml_avail and avoid_init == "1":
|
||||
self.skipTest("Not working for Jetson")
|
||||
patch_env = {"PYTORCH_NVML_BASED_CUDA_CHECK": avoid_init} if avoid_init else {}
|
||||
with patch.dict(os.environ, **patch_env):
|
||||
if nvml_avail:
|
||||
_ = torch.cuda.is_available()
|
||||
else:
|
||||
with patch.object(torch.cuda, '_device_count_nvml', return_value=-1):
|
||||
with patch.object(torch.cuda, "_device_count_nvml", return_value=-1):
|
||||
_ = torch.cuda.is_available()
|
||||
with multiprocessing.get_context("fork").Pool(1) as pool:
|
||||
in_bad_fork = pool.apply(TestExtendedCUDAIsAvail.in_bad_fork_test)
|
||||
if os.getenv('PYTORCH_NVML_BASED_CUDA_CHECK') == '1' and nvml_avail:
|
||||
self.assertFalse(in_bad_fork, TestExtendedCUDAIsAvail.SUBPROCESS_REMINDER_MSG)
|
||||
if os.getenv("PYTORCH_NVML_BASED_CUDA_CHECK") == "1" and nvml_avail:
|
||||
self.assertFalse(
|
||||
in_bad_fork, TestExtendedCUDAIsAvail.SUBPROCESS_REMINDER_MSG
|
||||
)
|
||||
else:
|
||||
assert in_bad_fork
|
||||
|
||||
|
||||
@torch.testing._internal.common_utils.markDynamoStrictTest
|
||||
class TestVisibleDeviceParses(TestCase):
|
||||
|
||||
def test_env_var_parsing(self):
|
||||
def _parse_visible_devices(val):
|
||||
from torch.cuda import _parse_visible_devices as _pvd
|
||||
|
||||
with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": val}, clear=True):
|
||||
return _pvd()
|
||||
|
||||
@ -96,39 +110,57 @@ class TestVisibleDeviceParses(TestCase):
|
||||
|
||||
def test_partial_uuid_resolver(self):
|
||||
from torch.cuda import _transform_uuid_to_ordinals
|
||||
uuids = ['GPU-9942190a-aa31-4ff1-4aa9-c388d80f85f1',
|
||||
'GPU-9e8d35e3-a134-0fdd-0e01-23811fdbd293',
|
||||
'GPU-e429a63e-c61c-4795-b757-5132caeb8e70',
|
||||
'GPU-eee1dfbc-0a0f-6ad8-5ff6-dc942a8b9d98',
|
||||
'GPU-bbcd6503-5150-4e92-c266-97cc4390d04e',
|
||||
'GPU-472ea263-58d7-410d-cc82-f7fdece5bd28',
|
||||
'GPU-e56257c4-947f-6a5b-7ec9-0f45567ccf4e',
|
||||
'GPU-1c20e77d-1c1a-d9ed-fe37-18b8466a78ad']
|
||||
|
||||
uuids = [
|
||||
"GPU-9942190a-aa31-4ff1-4aa9-c388d80f85f1",
|
||||
"GPU-9e8d35e3-a134-0fdd-0e01-23811fdbd293",
|
||||
"GPU-e429a63e-c61c-4795-b757-5132caeb8e70",
|
||||
"GPU-eee1dfbc-0a0f-6ad8-5ff6-dc942a8b9d98",
|
||||
"GPU-bbcd6503-5150-4e92-c266-97cc4390d04e",
|
||||
"GPU-472ea263-58d7-410d-cc82-f7fdece5bd28",
|
||||
"GPU-e56257c4-947f-6a5b-7ec9-0f45567ccf4e",
|
||||
"GPU-1c20e77d-1c1a-d9ed-fe37-18b8466a78ad",
|
||||
]
|
||||
self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3"], uuids), [1])
|
||||
self.assertEqual(_transform_uuid_to_ordinals(["GPU-e4", "GPU-9e8d35e3"], uuids), [2, 1])
|
||||
self.assertEqual(_transform_uuid_to_ordinals("GPU-9e8d35e3,GPU-1,GPU-47".split(","), uuids), [1, 7, 5])
|
||||
self.assertEqual(
|
||||
_transform_uuid_to_ordinals(["GPU-e4", "GPU-9e8d35e3"], uuids), [2, 1]
|
||||
)
|
||||
self.assertEqual(
|
||||
_transform_uuid_to_ordinals("GPU-9e8d35e3,GPU-1,GPU-47".split(","), uuids),
|
||||
[1, 7, 5],
|
||||
)
|
||||
# First invalid UUID aborts parsing
|
||||
self.assertEqual(_transform_uuid_to_ordinals(["GPU-123", "GPU-9e8d35e3"], uuids), [])
|
||||
self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-123", "GPU-47"], uuids), [1])
|
||||
self.assertEqual(
|
||||
_transform_uuid_to_ordinals(["GPU-123", "GPU-9e8d35e3"], uuids), []
|
||||
)
|
||||
self.assertEqual(
|
||||
_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-123", "GPU-47"], uuids),
|
||||
[1],
|
||||
)
|
||||
# First ambigous UUID aborts parsing
|
||||
self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-e", "GPU-47"], uuids), [1])
|
||||
self.assertEqual(
|
||||
_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-e", "GPU-47"], uuids), [1]
|
||||
)
|
||||
# Duplicate UUIDs result in empty set
|
||||
self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-47", "GPU-9e8"], uuids), [])
|
||||
self.assertEqual(
|
||||
_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-47", "GPU-9e8"], uuids),
|
||||
[],
|
||||
)
|
||||
|
||||
def test_ordinal_parse_visible_devices(self):
|
||||
def _device_count_nvml(val):
|
||||
from torch.cuda import _device_count_nvml as _dc
|
||||
|
||||
with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": val}, clear=True):
|
||||
return _dc()
|
||||
|
||||
with patch.object(torch.cuda, '_raw_device_count_nvml', return_value=2):
|
||||
with patch.object(torch.cuda, "_raw_device_count_nvml", return_value=2):
|
||||
self.assertEqual(_device_count_nvml("1, 0"), 2)
|
||||
# Ordinal out of bounds aborts parsing
|
||||
self.assertEqual(_device_count_nvml("1, 5, 0"), 1)
|
||||
|
||||
|
||||
|
||||
instantiate_parametrized_tests(TestExtendedCUDAIsAvail)
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
@ -1,15 +1,21 @@
|
||||
# Owner(s): ["module: cuda"]
|
||||
|
||||
import torch
|
||||
from torch.testing._internal.common_utils import TestCase, run_tests, skipIfRocmVersionLessThan, NoTest
|
||||
from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU
|
||||
import sys
|
||||
import unittest
|
||||
|
||||
import torch
|
||||
from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU
|
||||
from torch.testing._internal.common_utils import (
|
||||
NoTest,
|
||||
run_tests,
|
||||
skipIfRocmVersionLessThan,
|
||||
TestCase,
|
||||
)
|
||||
|
||||
# NOTE: this needs to be run in a brand new process
|
||||
|
||||
if not TEST_CUDA:
|
||||
print('CUDA not available, skipping tests', file=sys.stderr)
|
||||
print("CUDA not available, skipping tests", file=sys.stderr)
|
||||
TestCase = NoTest # noqa: F811
|
||||
|
||||
|
||||
@ -18,17 +24,21 @@ class TestCudaPrimaryCtx(TestCase):
|
||||
CTX_ALREADY_CREATED_ERR_MSG = (
|
||||
"Tests defined in test_cuda_primary_ctx.py must be run in a process "
|
||||
"where CUDA contexts are never created. Use either run_test.py or add "
|
||||
"--subprocess to run each test in a different subprocess.")
|
||||
"--subprocess to run each test in a different subprocess."
|
||||
)
|
||||
|
||||
@skipIfRocmVersionLessThan((4, 4, 21504))
|
||||
def setUp(self):
|
||||
for device in range(torch.cuda.device_count()):
|
||||
# Ensure context has not been created beforehand
|
||||
self.assertFalse(torch._C._cuda_hasPrimaryContext(device), TestCudaPrimaryCtx.CTX_ALREADY_CREATED_ERR_MSG)
|
||||
self.assertFalse(
|
||||
torch._C._cuda_hasPrimaryContext(device),
|
||||
TestCudaPrimaryCtx.CTX_ALREADY_CREATED_ERR_MSG,
|
||||
)
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
|
||||
def test_str_repr(self):
|
||||
x = torch.randn(1, device='cuda:1')
|
||||
x = torch.randn(1, device="cuda:1")
|
||||
|
||||
# We should have only created context on 'cuda:1'
|
||||
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
|
||||
@ -43,13 +53,13 @@ class TestCudaPrimaryCtx(TestCase):
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
|
||||
def test_copy(self):
|
||||
x = torch.randn(1, device='cuda:1')
|
||||
x = torch.randn(1, device="cuda:1")
|
||||
|
||||
# We should have only created context on 'cuda:1'
|
||||
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
|
||||
self.assertTrue(torch._C._cuda_hasPrimaryContext(1))
|
||||
|
||||
y = torch.randn(1, device='cpu')
|
||||
y = torch.randn(1, device="cpu")
|
||||
y.copy_(x)
|
||||
|
||||
# We should still have only created context on 'cuda:1'
|
||||
@ -58,7 +68,7 @@ class TestCudaPrimaryCtx(TestCase):
|
||||
|
||||
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
|
||||
def test_pin_memory(self):
|
||||
x = torch.randn(1, device='cuda:1')
|
||||
x = torch.randn(1, device="cuda:1")
|
||||
|
||||
# We should have only created context on 'cuda:1'
|
||||
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
|
||||
@ -70,7 +80,7 @@ class TestCudaPrimaryCtx(TestCase):
|
||||
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
|
||||
self.assertTrue(torch._C._cuda_hasPrimaryContext(1))
|
||||
|
||||
x = torch.randn(3, device='cpu').pin_memory()
|
||||
x = torch.randn(3, device="cpu").pin_memory()
|
||||
|
||||
# We should still have only created context on 'cuda:1'
|
||||
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
|
||||
@ -82,19 +92,19 @@ class TestCudaPrimaryCtx(TestCase):
|
||||
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
|
||||
self.assertTrue(torch._C._cuda_hasPrimaryContext(1))
|
||||
|
||||
x = torch.randn(3, device='cpu', pin_memory=True)
|
||||
x = torch.randn(3, device="cpu", pin_memory=True)
|
||||
|
||||
# We should still have only created context on 'cuda:1'
|
||||
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
|
||||
self.assertTrue(torch._C._cuda_hasPrimaryContext(1))
|
||||
|
||||
x = torch.zeros(3, device='cpu', pin_memory=True)
|
||||
x = torch.zeros(3, device="cpu", pin_memory=True)
|
||||
|
||||
# We should still have only created context on 'cuda:1'
|
||||
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
|
||||
self.assertTrue(torch._C._cuda_hasPrimaryContext(1))
|
||||
|
||||
x = torch.empty(3, device='cpu', pin_memory=True)
|
||||
x = torch.empty(3, device="cpu", pin_memory=True)
|
||||
|
||||
# We should still have only created context on 'cuda:1'
|
||||
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
|
||||
@ -106,5 +116,6 @@ class TestCudaPrimaryCtx(TestCase):
|
||||
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
|
||||
self.assertTrue(torch._C._cuda_hasPrimaryContext(1))
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
@@ -7,8 +7,8 @@ from typing import List
 
 import torch
 import torch.cuda._sanitizer as csan
-from torch.cuda._sanitizer import StreamId, DataPtr, EventId
-from torch.testing._internal.common_utils import TestCase, run_tests, NoTest, TEST_CUDA
+from torch.cuda._sanitizer import DataPtr, EventId, StreamId
+from torch.testing._internal.common_utils import NoTest, run_tests, TEST_CUDA, TestCase
 
 
 if not TEST_CUDA:
@@ -6,7 +6,7 @@ import unittest.mock
 
 import torch
 import torch.cuda._gpu_trace as gpu_trace
-from torch.testing._internal.common_utils import TestCase, run_tests, NoTest, TEST_CUDA
+from torch.testing._internal.common_utils import NoTest, run_tests, TEST_CUDA, TestCase
 
 # NOTE: Each test needs to be run in a brand new process, to reset the registered hooks
 # and make sure the CUDA streams are initialized for each test that uses them.