Enable UFMT on test/test_cuda*.py (#124352)

Part of: #123062

Ran lintrunner on:

- test/test_cuda.py
- test/test_cuda_expandable_segments.py
- test/test_cuda_multigpu.py
- test/test_cuda_nvml_based_avail.py
- test/test_cuda_primary_ctx.py
- test/test_cuda_sanitizer.py
- test/test_cuda_trace.py

Details:

```bash
$ lintrunner -a --take UFMT --all-files
ok No lint issues.
Successfully applied all patches.
```
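For orientation before reading the hunks: the UFMT linter in PyTorch's lintrunner wraps ufmt, which combines usort (import sorting) with Black (quote normalization and line wrapping), so every change below is mechanical reformatting with no behavioral effect. The snippet below is a hand-written sketch of those three rewrites on a representative fragment; it mirrors the kind of code in these tests but is illustrative, not an excerpt of the actual diff:

```python
# Hand-written sketch of the mechanical rewrites seen throughout the hunks
# below; names mirror the tests, but this is illustrative, not the real diff.

# --- before UFMT ---
import sys
import io
import gc

import torch

TEST_CUDA = torch.cuda.is_available()
TEST_CUDAMALLOCASYNC = TEST_CUDA and (torch.cuda.get_allocator_backend() == 'cudaMallocAsync')
if not TEST_CUDA:
    print('CUDA not available, skipping tests', file=sys.stderr)

# --- after UFMT: usort sorts imports; Black normalizes quotes and wraps long lines ---
import gc
import io
import sys

import torch

TEST_CUDA = torch.cuda.is_available()
TEST_CUDAMALLOCASYNC = TEST_CUDA and (
    torch.cuda.get_allocator_backend() == "cudaMallocAsync"
)
if not TEST_CUDA:
    print("CUDA not available, skipping tests", file=sys.stderr)
```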

Pull Request resolved: https://github.com/pytorch/pytorch/pull/124352
Approved by: https://github.com/ezyang
Author: Yuanhao Ji
Date: 2024-04-25 18:31:03 +00:00
Committed by: PyTorch MergeBot
Parent: 977dc5593a
Commit: d5182bb75b
8 changed files with 1612 additions and 786 deletions

File: .lintrunner.toml

@ -1051,13 +1051,6 @@ exclude_patterns = [
'test/quantization/fx/test_numeric_suite_fx.py',
'test/quantization/fx/test_quantize_fx.py',
'test/quantization/fx/test_subgraph_rewriter.py',
'test/test_cuda.py',
'test/test_cuda_expandable_segments.py',
'test/test_cuda_multigpu.py',
'test/test_cuda_nvml_based_avail.py',
'test/test_cuda_primary_ctx.py',
'test/test_cuda_sanitizer.py',
'test/test_cuda_trace.py',
'test/test_custom_op_testing.py',
'test/test_dataloader.py',
'test/test_datapipe.py',

File: test/test_cuda.py (diff suppressed because it is too large)
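The test_cuda.py hunk is elided above because of its size, but the commit is UFMT-only, so it contains the same class of mechanical reformatting as the files shown below. As a sketch (assuming a checkout where `lintrunner init` has already been run), a single file can be re-checked against just the UFMT linter, adding `-a` to apply fixes:

```bash
# Verify one of the newly un-excluded files against the UFMT linter only.
lintrunner --take UFMT test/test_cuda.py
# Apply any formatting it reports.
lintrunner -a --take UFMT test/test_cuda.py
```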

File: test/test_cuda_expandable_segments.py

@ -2,13 +2,14 @@
# run time cuda tests, but with the allocator using expandable segments
import os
import torch
from torch.testing._internal.common_cuda import IS_JETSON
if torch.cuda.is_available() and not IS_JETSON:
torch.cuda.memory._set_allocator_settings('expandable_segments:True')
torch.cuda.memory._set_allocator_settings("expandable_segments:True")
current_dir = os.path.dirname(os.path.abspath(__file__))
filepath = os.path.join(current_dir, 'test_cuda.py')
exec(compile(open(filepath).read(), filepath, mode='exec'))
filepath = os.path.join(current_dir, "test_cuda.py")
exec(compile(open(filepath).read(), filepath, mode="exec"))

File: test/test_cuda_multigpu.py

@ -3,38 +3,45 @@
import collections
import contextlib
import ctypes
import io
import gc
import io
import queue
import sys
import tempfile
import threading
import torch
import torch.cuda.comm as comm
import unittest
from itertools import repeat, chain
from itertools import chain, repeat
from typing import NamedTuple
import torch
import torch.cuda.comm as comm
from torch.nn.parallel import scatter_gather
from torch.testing._internal.common_cuda import (
_create_scaling_case,
_create_scaling_models_optimizers,
TEST_MULTIGPU,
)
from torch.testing._internal.common_utils import (
get_cycles_per_ms,
instantiate_parametrized_tests,
IS_JETSON,
IS_REMOTE_GPU,
IS_SANDCASTLE,
NoTest,
TEST_CUDA,
TestCase,
get_cycles_per_ms,
instantiate_parametrized_tests,
run_tests,
skipCUDANonDefaultStreamIf,
skipIfRocm,
TEST_CUDA,
TestCase,
)
from torch.testing._internal.common_cuda import TEST_MULTIGPU, _create_scaling_case, _create_scaling_models_optimizers
TEST_CUDAMALLOCASYNC = TEST_CUDA and (torch.cuda.get_allocator_backend() == "cudaMallocAsync")
TEST_CUDAMALLOCASYNC = TEST_CUDA and (
torch.cuda.get_allocator_backend() == "cudaMallocAsync"
)
if not TEST_CUDA:
print('CUDA not available, skipping tests', file=sys.stderr)
print("CUDA not available, skipping tests", file=sys.stderr)
TestCase = NoTest # noqa: F811
@ -44,7 +51,9 @@ class TestCudaMultiGPU(TestCase):
def _check_memory_stat_consistency(self):
snapshot = torch.cuda.memory_snapshot()
expected_each_device = collections.defaultdict(lambda: collections.defaultdict(int))
expected_each_device = collections.defaultdict(
lambda: collections.defaultdict(int)
)
for segment in snapshot:
expandable = segment["is_expandable"]
@ -56,7 +65,9 @@ class TestCudaMultiGPU(TestCase):
expected["segment." + pool_str + ".current"] += 1
expected["allocated_bytes.all.current"] += segment["allocated_size"]
expected["allocated_bytes." + pool_str + ".current"] += segment["allocated_size"]
expected["allocated_bytes." + pool_str + ".current"] += segment[
"allocated_size"
]
expected["reserved_bytes.all.current"] += segment["total_size"]
expected["reserved_bytes." + pool_str + ".current"] += segment["total_size"]
@ -65,7 +76,9 @@ class TestCudaMultiGPU(TestCase):
expected["active_bytes." + pool_str + ".current"] += segment["active_size"]
expected["requested_bytes.all.current"] += segment["requested_size"]
expected["requested_bytes." + pool_str + ".current"] += segment["requested_size"]
expected["requested_bytes." + pool_str + ".current"] += segment[
"requested_size"
]
sum_requested = 0
is_split = len(segment["blocks"]) > 1
@ -83,7 +96,9 @@ class TestCudaMultiGPU(TestCase):
expected["inactive_split.all.current"] += 1
expected["inactive_split." + pool_str + ".current"] += 1
expected["inactive_split_bytes.all.current"] += block["size"]
expected["inactive_split_bytes." + pool_str + ".current"] += block["size"]
expected["inactive_split_bytes." + pool_str + ".current"] += block[
"size"
]
self.assertEqual(sum_requested, segment["requested_size"])
@ -94,15 +109,15 @@ class TestCudaMultiGPU(TestCase):
def test_cuda_synchronize(self):
torch.cuda.synchronize()
torch.cuda.synchronize('cuda')
torch.cuda.synchronize('cuda:0')
torch.cuda.synchronize("cuda")
torch.cuda.synchronize("cuda:0")
torch.cuda.synchronize(0)
torch.cuda.synchronize(torch.device('cuda:0'))
torch.cuda.synchronize(torch.device("cuda:0"))
if TEST_MULTIGPU:
torch.cuda.synchronize('cuda:1')
torch.cuda.synchronize("cuda:1")
torch.cuda.synchronize(1)
torch.cuda.synchronize(torch.device('cuda:1'))
torch.cuda.synchronize(torch.device("cuda:1"))
with self.assertRaisesRegex(ValueError, "Expected a cuda device, but"):
torch.cuda.synchronize(torch.device("cpu"))
@ -285,8 +300,10 @@ class TestCudaMultiGPU(TestCase):
# interlace
torch.cuda.empty_cache()
gen0 = self._test_memory_stats_generator(self, device='cuda:0', N=35)
gen1 = self._test_memory_stats_generator(self, device=torch.device('cuda:1'), N=35)
gen0 = self._test_memory_stats_generator(self, device="cuda:0", N=35)
gen1 = self._test_memory_stats_generator(
self, device=torch.device("cuda:1"), N=35
)
end0 = end1 = False
while not (end0 and end1):
end0 = advance(gen0, end0)
@ -295,7 +312,9 @@ class TestCudaMultiGPU(TestCase):
# semi-random order
torch.cuda.empty_cache()
gen0 = self._test_memory_stats_generator(self, device=0, N=35)
gen1 = self._test_memory_stats_generator(self, device=torch.device('cuda:1'), N=35)
gen1 = self._test_memory_stats_generator(
self, device=torch.device("cuda:1"), N=35
)
end0 = end1 = False
while not (end0 and end1):
@ -396,10 +415,10 @@ class TestCudaMultiGPU(TestCase):
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_copy_streams(self):
d0 = torch.device('cuda:0')
d0 = torch.device("cuda:0")
x0 = torch.zeros(5, 5, device=d0)
d1 = torch.device('cuda:1')
d1 = torch.device("cuda:1")
x1 = torch.zeros(5, 5, device=d1)
self._test_copy_sync_current_stream(x0, x1)
@ -416,13 +435,13 @@ class TestCudaMultiGPU(TestCase):
@unittest.skipIf(torch.cuda.device_count() >= 10, "Loading a cuda:9 tensor")
def test_load_nonexistent_device(self):
# Setup: create a serialized file object with a 'cuda:9' restore location
tensor = torch.randn(2, device='cuda')
tensor = torch.randn(2, device="cuda")
buf = io.BytesIO()
torch.save(tensor, buf)
# NB: this might not work in the future if serialization changes
buf = io.BytesIO(buf.getvalue().replace(b'cuda:0', b'cuda:9'))
buf = io.BytesIO(buf.getvalue().replace(b"cuda:0", b"cuda:9"))
msg = r'Attempting to deserialize object on CUDA device 9'
msg = r"Attempting to deserialize object on CUDA device 9"
with self.assertRaisesRegex(RuntimeError, msg):
_ = torch.load(buf)
@ -431,7 +450,7 @@ class TestCudaMultiGPU(TestCase):
x = [torch.randn(4, 4).cuda(0), torch.randn(4, 4).cuda(1)]
def gpu_remap(storage, location):
if location == 'cuda:1':
if location == "cuda:1":
return storage.cuda(0)
with tempfile.NamedTemporaryFile() as f:
@ -450,7 +469,7 @@ class TestCudaMultiGPU(TestCase):
with tempfile.NamedTemporaryFile() as f:
torch.save(x, f)
f.seek(0)
x_copy = torch.load(f, map_location={'cuda:1': 'cuda:0'})
x_copy = torch.load(f, map_location={"cuda:1": "cuda:0"})
for original, copy in zip(x, x_copy):
self.assertEqual(copy, original)
self.assertIs(type(copy), type(original))
@ -458,10 +477,10 @@ class TestCudaMultiGPU(TestCase):
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_multigpu_storage_clone(self):
x = torch.randn(4, 4, device='cuda:1').storage()
x = torch.randn(4, 4, device="cuda:1").storage()
y = x.clone()
self.assertEqual(x.get_device(), y.get_device())
for t in ['byte', 'char', 'short', 'int', 'long', 'half', 'double']:
for t in ["byte", "char", "short", "int", "long", "half", "double"]:
self.assertEqual(getattr(x, t)().get_device(), x.get_device())
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
@ -479,8 +498,8 @@ class TestCudaMultiGPU(TestCase):
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_current_stream(self):
d0 = torch.device('cuda:0')
d1 = torch.device('cuda:1')
d0 = torch.device("cuda:0")
d1 = torch.device("cuda:1")
s0 = torch.cuda.current_stream()
s1 = torch.cuda.current_stream(device=1)
@ -501,15 +520,14 @@ class TestCudaMultiGPU(TestCase):
self.assertEqual(d0, s2.device)
self.assertEqual(s0, s1)
with self.assertRaisesRegex(ValueError,
"Expected a cuda device, but got: cpu"):
torch.cuda.current_stream(torch.device('cpu'))
with self.assertRaisesRegex(ValueError, "Expected a cuda device, but got: cpu"):
torch.cuda.current_stream(torch.device("cpu"))
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
@skipCUDANonDefaultStreamIf(True)
def test_default_stream(self):
d0 = torch.device('cuda:0')
d1 = torch.device('cuda:1')
d0 = torch.device("cuda:0")
d1 = torch.device("cuda:1")
with torch.cuda.device(d0):
s0 = torch.cuda.default_stream()
@ -533,14 +551,13 @@ class TestCudaMultiGPU(TestCase):
with torch.cuda.device(d1):
self.assertEqual(torch.cuda.current_stream(), s1)
with self.assertRaisesRegex(ValueError,
"Expected a cuda device, but got: cpu"):
torch.cuda.default_stream(torch.device('cpu'))
with self.assertRaisesRegex(ValueError, "Expected a cuda device, but got: cpu"):
torch.cuda.default_stream(torch.device("cpu"))
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_stream_event_device(self):
d0 = torch.device('cuda:0')
d1 = torch.device('cuda:1')
d0 = torch.device("cuda:0")
d1 = torch.device("cuda:1")
e0 = torch.cuda.Event()
self.assertEqual(None, e0.device)
@ -553,10 +570,10 @@ class TestCudaMultiGPU(TestCase):
s1 = torch.cuda.Stream()
e1 = s1.record_event()
self.assertEqual(s0.device, torch.device('cuda:0'))
self.assertEqual(e0.device, torch.device('cuda:0'))
self.assertEqual(s1.device, torch.device('cuda:1'))
self.assertEqual(e1.device, torch.device('cuda:1'))
self.assertEqual(s0.device, torch.device("cuda:0"))
self.assertEqual(e0.device, torch.device("cuda:0"))
self.assertEqual(s1.device, torch.device("cuda:1"))
self.assertEqual(e1.device, torch.device("cuda:1"))
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_stream_context(self):
@ -592,18 +609,17 @@ class TestCudaMultiGPU(TestCase):
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_streams_multi_gpu(self):
default_stream = torch.cuda.current_stream()
self.assertEqual(default_stream.device, torch.device('cuda:0'))
self.assertEqual(default_stream.device, torch.device("cuda:0"))
stream = torch.cuda.Stream(device=1)
self.assertEqual(stream.device, torch.device('cuda:1'))
self.assertEqual(stream.device, torch.device("cuda:1"))
with torch.cuda.device(1):
self.assertEqual(
torch.cuda.current_stream().device, torch.device('cuda:1'))
self.assertEqual(torch.cuda.current_stream().device, torch.device("cuda:1"))
self.assertNotEqual(torch.cuda.current_stream(), default_stream)
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_streams_multi_gpu_query(self):
d0 = torch.device('cuda:0')
d1 = torch.device('cuda:1')
d0 = torch.device("cuda:0")
d1 = torch.device("cuda:1")
torch.cuda.synchronize(d0)
torch.cuda.synchronize(d1)
@ -642,8 +658,8 @@ class TestCudaMultiGPU(TestCase):
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_streams_multi_gpu_eq(self):
d0 = torch.device('cuda:0')
d1 = torch.device('cuda:1')
d0 = torch.device("cuda:0")
d1 = torch.device("cuda:1")
with torch.cuda.device(d0):
s0 = torch.cuda.current_stream()
@ -676,12 +692,12 @@ class TestCudaMultiGPU(TestCase):
s0 = torch.cuda.Stream(device=0, priority=low)
self.assertEqual(low, s0.priority)
self.assertEqual(torch.device('cuda:0'), s0.device)
self.assertEqual(torch.device("cuda:0"), s0.device)
s1 = torch.cuda.Stream(device=1, priority=high)
self.assertEqual(high, s1.priority)
self.assertEqual(torch.device('cuda:1'), s1.device)
self.assertEqual(torch.device("cuda:1"), s1.device)
@unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
def test_tensor_device(self):
@ -754,7 +770,7 @@ class TestCudaMultiGPU(TestCase):
@staticmethod
def _test_stream_event_nogil(self, sync_func, p2c, c2p):
with torch.cuda.device('cuda:1'):
with torch.cuda.device("cuda:1"):
c2p.put(0)
p2c.get()
c2p.put(sync_func(self, TestCudaMultiGPU.FIFTY_MIL_CYCLES))
@ -763,9 +779,11 @@ class TestCudaMultiGPU(TestCase):
@skipIfRocm
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_stream_event_nogil(self):
for sync_func in [TestCudaMultiGPU._stream_synchronize,
TestCudaMultiGPU._event_synchronize,
TestCudaMultiGPU._event_wait]:
for sync_func in [
TestCudaMultiGPU._stream_synchronize,
TestCudaMultiGPU._event_synchronize,
TestCudaMultiGPU._event_wait,
]:
p2c = queue.Queue()
c2p = queue.Queue()
e_tik = torch.cuda.Event(enable_timing=True)
@ -773,12 +791,13 @@ class TestCudaMultiGPU(TestCase):
t = threading.Thread(
target=TestCudaMultiGPU._test_stream_event_nogil,
args=(self, sync_func, p2c, c2p))
args=(self, sync_func, p2c, c2p),
)
t.daemon = True
t.start()
c2p.get()
with torch.cuda.device('cuda:0'):
with torch.cuda.device("cuda:0"):
e_tik.record()
p2c.put(0)
parent_time = sync_func(self, TestCudaMultiGPU.FIFTY_MIL_CYCLES)
@ -801,8 +820,8 @@ class TestCudaMultiGPU(TestCase):
@skipIfRocm
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_events_wait(self):
d0 = torch.device('cuda:0')
d1 = torch.device('cuda:1')
d0 = torch.device("cuda:0")
d1 = torch.device("cuda:1")
torch.cuda.synchronize(d0)
torch.cuda.synchronize(d1)
@ -827,8 +846,8 @@ class TestCudaMultiGPU(TestCase):
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
def test_events_multi_gpu_query(self):
d0 = torch.device('cuda:0')
d1 = torch.device('cuda:1')
d0 = torch.device("cuda:0")
d1 = torch.device("cuda:1")
with torch.cuda.device(d0):
s0 = torch.cuda.current_stream()
@ -869,8 +888,8 @@ class TestCudaMultiGPU(TestCase):
@unittest.skipIf(not TEST_MULTIGPU, "detected only one GPU")
@skipIfRocm
def test_events_multi_gpu_elapsed_time(self):
d0 = torch.device('cuda:0')
d1 = torch.device('cuda:1')
d0 = torch.device("cuda:0")
d1 = torch.device("cuda:1")
with torch.cuda.device(d0):
s0 = torch.cuda.current_stream()
@ -934,8 +953,7 @@ class TestCudaMultiGPU(TestCase):
def test_external_streams_multi_device(self):
device = torch.cuda.device(1)
with self._get_external_stream(device) as stream_v:
ext_stream = torch.cuda.ExternalStream(
stream_v, device=device)
ext_stream = torch.cuda.ExternalStream(stream_v, device=device)
self.assertEqual(stream_v, ext_stream.cuda_stream)
self.assertEqual(ext_stream.device.index, device.idx)
@ -956,7 +974,7 @@ class TestCudaMultiGPU(TestCase):
del t
t = torch.FloatTensor([2]).pin_memory()
self.assertNotEqual(t.data_ptr(), ptr, msg='allocation re-used too soon')
self.assertNotEqual(t.data_ptr(), ptr, msg="allocation re-used too soon")
with torch.cuda.device(0):
gpu_tensor0.copy_(t, non_blocking=True)
@ -988,7 +1006,7 @@ class TestCudaMultiGPU(TestCase):
def _test(idx):
before_free_bytes, before_available_bytes = torch.cuda.mem_get_info(idx)
# increasing to 8MB to force acquiring a new block and overcome blocksize differences across platforms
t = torch.randn(1024 * 1024 * 8, device='cuda:' + str(idx))
t = torch.randn(1024 * 1024 * 8, device="cuda:" + str(idx))
if IS_JETSON:
# w/o syncing, mem_get_info will run before memory allocated has actually increased.
# This race condition causes consistent failure
@ -1022,6 +1040,7 @@ class TestCudaMultiGPU(TestCase):
leak_gpu0()
except RuntimeError as e:
import re
assert re.match(regex, str(e)), str(e) + "\n does not match: \n" + regex
else:
# assertRaisesRegex does not pass with Python for Jetson,
@ -1030,12 +1049,15 @@ class TestCudaMultiGPU(TestCase):
leak_gpu0()
if TEST_MULTIGPU:
@self.wrap_with_cuda_memory_check
def leak_gpu1():
# increasing to 8MB to force acquiring a new block and overcome blocksize differences across platforms
l.append(torch.randn(1024 * 1024 * 8, device=torch.device("cuda:1")))
with self.assertRaisesRegex(RuntimeError, r"CUDA driver API confirmed .+ on device 1.+"):
with self.assertRaisesRegex(
RuntimeError, r"CUDA driver API confirmed .+ on device 1.+"
):
leak_gpu1()
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
@ -1071,8 +1093,8 @@ class TestCudaMultiGPU(TestCase):
# Multiply by 2 here so to's backward creates gradient values that are different from the case above,
# to mitigate weirdness if the caching allocator happens to reuse memory regions that were populated
# with 1s by the case above
s0 = to_backward_recipient.to(device="cuda:0").sum() * 2.
s1 = to_backward_recipient.to(device="cuda:0").sum() * 2.
s0 = to_backward_recipient.to(device="cuda:0").sum() * 2.0
s1 = to_backward_recipient.to(device="cuda:0").sum() * 2.0
torch.cuda.synchronize(device=dev0)
torch.cuda.synchronize(device=dev1)
s0.backward(retain_graph=True)
@ -1085,7 +1107,12 @@ class TestCudaMultiGPU(TestCase):
def test_cuda_init_race(self):
# See https://github.com/pytorch/pytorch/issues/16559
import subprocess
subprocess.check_call([sys.executable, '-c', """\
subprocess.check_call(
[
sys.executable,
"-c",
"""\
import torch
import threading
@ -1096,7 +1123,9 @@ t1 = threading.Thread(target=worker, args=(0,))
t2 = threading.Thread(target=worker, args=(1,))
t1.start()
t2.start()
"""])
""",
]
)
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_grad_scaling_device_as_key(self):
@ -1128,14 +1157,24 @@ t2.start()
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_grad_scaling_scale(self):
scaler = torch.cuda.amp.GradScaler(init_scale=2.)
scaler = torch.cuda.amp.GradScaler(init_scale=2.0)
t0 = torch.full((1,), 4.0, dtype=torch.float32, device="cuda:0")
t1 = torch.full((1,), 4.0, dtype=torch.float32, device="cuda:1")
# Create some nested iterables of tensors on different devices.
outputs = (t1.clone(), (t0.clone(), t1.clone()), [t0.clone(), (t1.clone(), t0.clone())])
outputs = (
t1.clone(),
(t0.clone(), t1.clone()),
[t0.clone(), (t1.clone(), t0.clone())],
)
outputs = scaler.scale(outputs)
self.assertTrue(outputs[0] == 8.0 and outputs[1][0] == 8.0 and outputs[1][1] == 8.0 and
outputs[2][0] == 8.0 and outputs[2][1][0] == 8.0 and outputs[2][1][1] == 8.0)
self.assertTrue(
outputs[0] == 8.0
and outputs[1][0] == 8.0
and outputs[1][1] == 8.0
and outputs[2][0] == 8.0
and outputs[2][1][0] == 8.0
and outputs[2][1][1] == 8.0
)
self.assertTrue(scaler._scale.device == t1.device)
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
@ -1148,12 +1187,25 @@ t2.start()
dev1 = torch.device("cuda:1")
for enabled in True, False:
mod_control0, mod_scaling0, opt_control0, opt_scaling0, data, loss_fn, skip_iter = \
_create_scaling_case()
mod_control1, mod_scaling1, opt_control1, opt_scaling1 = \
_create_scaling_models_optimizers(device=dev1)
(
mod_control0,
mod_scaling0,
opt_control0,
opt_scaling0,
data,
loss_fn,
skip_iter,
) = _create_scaling_case()
(
mod_control1,
mod_scaling1,
opt_control1,
opt_scaling1,
) = _create_scaling_models_optimizers(device=dev1)
scaler = torch.cuda.amp.GradScaler(init_scale=128., growth_factor=2.0, enabled=enabled, growth_interval=1)
scaler = torch.cuda.amp.GradScaler(
init_scale=128.0, growth_factor=2.0, enabled=enabled, growth_interval=1
)
def run(model0, model1, optimizer0, optimizer1, try_scaling_api):
for i, (input, target) in enumerate(data):
@ -1162,13 +1214,15 @@ t2.start()
output0 = model0(input)
output1 = model1(input.to(dev1))
loss0 = loss_fn(0.3 * output0 + 0.7 * output1.to(dev0), target)
loss1 = loss_fn(0.6 * output0.to(dev1) - 0.4 * output1, target.to(dev1))
loss1 = loss_fn(
0.6 * output0.to(dev1) - 0.4 * output1, target.to(dev1)
)
if try_scaling_api:
scaler.scale(loss0).backward(retain_graph=True)
scaler.scale(loss1).backward()
if i == skip_iter and scaler.is_enabled():
model1[1].weight.grad.data.fill_(float('inf'))
model1[1].weight.grad.data.fill_(float("inf"))
# As an additional stress test, separately unscale for one of the optimizers.
scaler.unscale_(optimizer0)
@ -1178,11 +1232,20 @@ t2.start()
# Make sure the found_infs were collected properly across optimizers and devices.
if scaler.is_enabled():
self.assertTrue(len(scaler._found_inf_per_device(optimizer0)) == 1)
self.assertTrue(len(scaler._found_inf_per_device(optimizer1)) == 1)
self.assertTrue(scaler._found_inf_per_device(optimizer0)[dev0].item() == 0.)
self.assertTrue(scaler._found_inf_per_device(optimizer1)[dev1].item() ==
float(i == skip_iter))
self.assertTrue(
len(scaler._found_inf_per_device(optimizer0)) == 1
)
self.assertTrue(
len(scaler._found_inf_per_device(optimizer1)) == 1
)
self.assertTrue(
scaler._found_inf_per_device(optimizer0)[dev0].item()
== 0.0
)
self.assertTrue(
scaler._found_inf_per_device(optimizer1)[dev1].item()
== float(i == skip_iter)
)
scaler.update()
else:
@ -1196,25 +1259,41 @@ t2.start()
run(mod_scaling0, mod_scaling1, opt_scaling0, opt_scaling1, True)
# The loss scale should have been multiplied by the growth factor 3 times and the backoff factor once.
self.assertTrue(scaler.get_scale() == (128. * scaler.get_growth_factor()**3 *
scaler.get_backoff_factor()**1) if enabled else 1.0)
self.assertTrue(
scaler.get_scale()
== (
128.0
* scaler.get_growth_factor() ** 3
* scaler.get_backoff_factor() ** 1
)
if enabled
else 1.0
)
# Copy mod_control1 and mod_scaling1 back the device 0 for comparison
mod_control1.to(dev0)
mod_scaling1.to(dev0)
for c, s in zip(chain(mod_control0.parameters(), mod_control1.parameters()),
chain(mod_scaling0.parameters(), mod_scaling1.parameters())):
for c, s in zip(
chain(mod_control0.parameters(), mod_control1.parameters()),
chain(mod_scaling0.parameters(), mod_scaling1.parameters()),
):
self.assertEqual(c, s, rtol=1e-5, atol=1e-7)
@unittest.skipIf(not TEST_MULTIGPU, "Test needs multiple GPUs")
def test_cuda_device_memory_allocated(self):
from torch.cuda import memory_allocated
device_count = torch.cuda.device_count()
current_alloc = [memory_allocated(idx) for idx in range(device_count)]
x = torch.ones(10, device="cuda:0")
self.assertGreater(memory_allocated(0), current_alloc[0])
self.assertTrue(all(memory_allocated(torch.cuda.device(idx)) == current_alloc[idx] for idx in range(1, device_count)))
self.assertTrue(
all(
memory_allocated(torch.cuda.device(idx)) == current_alloc[idx]
for idx in range(1, device_count)
)
)
class TestCudaComm(TestCase):
@ -1226,12 +1305,17 @@ class TestCudaComm(TestCase):
for i, t in enumerate(results):
self.assertEqual(t.get_device(), i)
self.assertEqual(t, input)
if input.is_cuda and input.get_device() == i: # test not copying on same device
if (
input.is_cuda and input.get_device() == i
): # test not copying on same device
self.assertEqual(t.data_ptr(), input.data_ptr())
# test out=
for inplace in [True, False]:
if inplace:
outputs = [torch.empty_like(input, device=0), torch.empty_like(input, device=1)]
outputs = [
torch.empty_like(input, device=0),
torch.empty_like(input, device=1),
]
else:
outputs = [input.cuda(0), torch.empty_like(input, device=1)]
results = comm.broadcast(input, out=outputs)
@ -1241,13 +1325,19 @@ class TestCudaComm(TestCase):
self.assertEqual(t.get_device(), i)
self.assertEqual(t, input)
# test error msg
with self.assertRaisesRegex(RuntimeError, r"Exactly one of 'devices' and 'out'"):
with self.assertRaisesRegex(
RuntimeError, r"Exactly one of 'devices' and 'out'"
):
comm.broadcast(input, (0, 1), out=outputs)
with self.assertRaisesRegex(RuntimeError,
r"Expected all output tensors to be CUDA tensors, but output tensor at index 1"):
with self.assertRaisesRegex(
RuntimeError,
r"Expected all output tensors to be CUDA tensors, but output tensor at index 1",
):
comm.broadcast(input, out=[input.cuda(0), input.cpu()])
with self.assertRaisesRegex(RuntimeError,
r"Expected all output tensors to have same shape as the source .+ at index 1"):
with self.assertRaisesRegex(
RuntimeError,
r"Expected all output tensors to have same shape as the source .+ at index 1",
):
comm.broadcast(input, out=[input.cuda(0), input.cuda(1).unsqueeze(0)])
def test_broadcast_cpu(self):
@ -1289,16 +1379,16 @@ class TestCudaComm(TestCase):
numel = 5
num_bytes = numel * 8
tensors = [
self.genSparseTensor((2, 3), 2, 1, False, 'cuda', torch.float64)[0],
self.genSparseTensor((2, 3), 2, 1, False, "cuda", torch.float64)[0],
torch.randn(numel).long().cuda(),
torch.randn(numel).cuda(),
self.genSparseTensor((2, 3), 2, 10, False, 'cuda', torch.float64)[0],
self.genSparseTensor((2, 3), 2, 5, False, 'cuda', torch.float64)[0],
self.genSparseTensor((3, 3), 2, 7, False, 'cuda', torch.int64)[0],
self.genSparseTensor((2, 3), 2, 2, False, 'cuda', torch.float32)[0],
self.genSparseTensor((2, 3), 2, 10, False, "cuda", torch.float64)[0],
self.genSparseTensor((2, 3), 2, 5, False, "cuda", torch.float64)[0],
self.genSparseTensor((3, 3), 2, 7, False, "cuda", torch.int64)[0],
self.genSparseTensor((2, 3), 2, 2, False, "cuda", torch.float32)[0],
torch.randn(numel).long().cuda(),
torch.randn(numel).long().cuda(),
self.genSparseTensor((2, 7), 2, 3, False, 'cuda', torch.int64)[0],
self.genSparseTensor((2, 7), 2, 3, False, "cuda", torch.int64)[0],
torch.randn(numel * 2).int().cuda(), # int is 2x shorter
torch.randn(numel).cuda(),
]
@ -1323,7 +1413,7 @@ class TestCudaComm(TestCase):
tensors = [
torch.tensor([]).byte().cuda(),
torch.randn(5).cuda(),
torch.randn(5).double().cuda()
torch.randn(5).double().cuda(),
]
self._test_broadcast_coalesced(tensors, 256)
@ -1364,16 +1454,16 @@ class TestCudaComm(TestCase):
numel = 5
num_bytes = numel * 8
tensors = [
self.genSparseTensor((2, 3), 2, 1, False, 'cuda', torch.float64)[0],
self.genSparseTensor((2, 3), 2, 1, False, "cuda", torch.float64)[0],
torch.randn(numel).long().cuda(),
torch.randn(numel).cuda(),
self.genSparseTensor((2, 3), 2, 10, False, 'cuda', torch.float64)[0],
self.genSparseTensor((2, 3), 2, 5, False, 'cuda', torch.float64)[0],
self.genSparseTensor((3, 3), 2, 7, False, 'cuda', torch.int64)[0],
self.genSparseTensor((2, 3), 2, 2, False, 'cuda', torch.float32)[0],
self.genSparseTensor((2, 3), 2, 10, False, "cuda", torch.float64)[0],
self.genSparseTensor((2, 3), 2, 5, False, "cuda", torch.float64)[0],
self.genSparseTensor((3, 3), 2, 7, False, "cuda", torch.int64)[0],
self.genSparseTensor((2, 3), 2, 2, False, "cuda", torch.float32)[0],
torch.randn(numel).long().cuda(),
torch.randn(numel).long().cuda(),
self.genSparseTensor((2, 7), 2, 3, False, 'cuda', torch.int64)[0],
self.genSparseTensor((2, 7), 2, 3, False, "cuda", torch.int64)[0],
torch.randn(numel * 2).int().cuda(), # int is 2x shorter
torch.randn(numel).cuda(),
]
@ -1412,7 +1502,9 @@ class TestCudaComm(TestCase):
self.assertEqual(r, input[tuple(index)], atol=0, rtol=0)
chunk_start = chunk_end
if r.device == input.device:
self.assertEqual(r.data_ptr(), input.data_ptr()) # for target @ same device, a view should be returned
self.assertEqual(
r.data_ptr(), input.data_ptr()
) # for target @ same device, a view should be returned
# test out
out = [torch.empty_like(t) for t in result]
@ -1429,20 +1521,38 @@ class TestCudaComm(TestCase):
# test error msg
if chunk_sizes is not None:
with self.assertRaisesRegex(RuntimeError, r"Expected devices and chunk_sizes to be of same length"):
comm.scatter(input, [0 for _ in range(len(chunk_sizes) + 1)], dim=dim, chunk_sizes=chunk_sizes)
with self.assertRaisesRegex(
RuntimeError, r"Expected devices and chunk_sizes to be of same length"
):
comm.scatter(
input,
[0 for _ in range(len(chunk_sizes) + 1)],
dim=dim,
chunk_sizes=chunk_sizes,
)
with self.assertRaisesRegex(RuntimeError, r"'devices' must not be specified"):
comm.scatter(input, (0, 1), dim=dim, out=out)
with self.assertRaisesRegex(RuntimeError, r"Expected at least one device to scatter to"):
with self.assertRaisesRegex(
RuntimeError, r"Expected at least one device to scatter to"
):
comm.scatter(input, (), dim=dim)
with self.assertRaisesRegex(RuntimeError, r"Expected at least one output tensor to scatter to"):
with self.assertRaisesRegex(
RuntimeError, r"Expected at least one output tensor to scatter to"
):
comm.scatter(input, dim=dim, out=[])
with self.assertRaisesRegex(RuntimeError,
r"Expected all output tensors to be CUDA tensors, but output tensor at index 0"):
with self.assertRaisesRegex(
RuntimeError,
r"Expected all output tensors to be CUDA tensors, but output tensor at index 0",
):
comm.scatter(input, dim=dim, out=([out[0].cpu()] + out[1:]))
with self.assertRaisesRegex(RuntimeError, r"Output tensor at index 0 has incorrect shape"):
with self.assertRaisesRegex(
RuntimeError, r"Output tensor at index 0 has incorrect shape"
):
comm.scatter(input, dim=dim, out=([out[0].unsqueeze(0)] + out[1:]))
with self.assertRaisesRegex(RuntimeError, r"Total size for output tensors along scatter dim \d+ does not match"):
with self.assertRaisesRegex(
RuntimeError,
r"Total size for output tensors along scatter dim \d+ does not match",
):
index = [slice(None, None) for _ in range(input.dim())]
index[dim] = slice(1, None)
comm.scatter(input, dim=dim, out=([out[0][tuple(index)]] + out[1:]))
@ -1480,13 +1590,13 @@ class TestCudaComm(TestCase):
expected_size[dim] += y.size(dim)
expected_size = torch.Size(expected_size)
destinations = [None, torch.device('cuda:0'), torch.device('cpu')]
destinations = [None, torch.device("cuda:0"), torch.device("cpu")]
if torch.cuda.device_count() > 2:
destinations.append(torch.device('cuda:2'))
destinations.append(torch.device("cuda:2"))
with torch.cuda.device(1):
for destination in destinations:
if destination is None:
expected_device = torch.device('cuda', torch.cuda.current_device())
expected_device = torch.device("cuda", torch.cuda.current_device())
else:
expected_device = destination
for use_out in [True, False]:
@ -1507,15 +1617,31 @@ class TestCudaComm(TestCase):
self.assertEqual(result[tuple(index)], y)
# test error msg
with self.assertRaisesRegex(RuntimeError, r"'destination' must not be specified"):
comm.gather((x, y), dim, destination='cpu', out=torch.empty(expected_size, device='cpu'))
with self.assertRaisesRegex(RuntimeError, r"Expected at least one tensor to gather from"):
with self.assertRaisesRegex(
RuntimeError, r"'destination' must not be specified"
):
comm.gather(
(x, y),
dim,
destination="cpu",
out=torch.empty(expected_size, device="cpu"),
)
with self.assertRaisesRegex(
RuntimeError, r"Expected at least one tensor to gather from"
):
comm.gather(())
with self.assertRaisesRegex(RuntimeError, r"Expected all input tensors to be CUDA tensors, "):
with self.assertRaisesRegex(
RuntimeError, r"Expected all input tensors to be CUDA tensors, "
):
comm.gather((x.cpu(), y))
with self.assertRaisesRegex(RuntimeError, r"Expected all input tensors to have the same number of dimensions"):
with self.assertRaisesRegex(
RuntimeError,
r"Expected all input tensors to have the same number of dimensions",
):
comm.gather((x, y.unsqueeze(0)))
with self.assertRaisesRegex(RuntimeError, r"Input tensor at index 1 has invalid shape"):
with self.assertRaisesRegex(
RuntimeError, r"Input tensor at index 1 has invalid shape"
):
if dim in [0, -2]:
comm.gather((x, y[:, 1:]), dim=dim)
elif dim in [1, -1]:
@ -1532,7 +1658,9 @@ class TestCudaComm(TestCase):
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_memory_format_scatter_gather(self):
nhwc = torch.randn((10, 3, 32, 32), device='cpu').contiguous(memory_format=torch.channels_last)
nhwc = torch.randn((10, 3, 32, 32), device="cpu").contiguous(
memory_format=torch.channels_last
)
results = torch.cuda.comm.scatter(nhwc, (0, 1), None, 0)
for result in results:
self.assertFalse(result.is_contiguous())
@ -1541,7 +1669,6 @@ class TestCudaComm(TestCase):
gathered = torch.cuda.comm.gather(results)
self.assertTrue(gathered.is_contiguous(memory_format=torch.channels_last))
@unittest.skipIf(not TEST_MULTIGPU, "Test needs multiple GPUs")
def test_scatter_namedtuple(self):
# tests ability to scatter namedtuples and retrieve a list where each
@ -1589,8 +1716,8 @@ class TestCudaComm(TestCase):
def test_gather_namedtuple(self):
# tests ability to gather a list of namedtuples and return a namedtuple where each
# element is of the expected tensor type.
fields = ['a', 'b']
TestNamedTupleInput_0 = collections.namedtuple('NamedTuple', fields)
fields = ["a", "b"]
TestNamedTupleInput_0 = collections.namedtuple("NamedTuple", fields)
num_gpus = torch.cuda.device_count()
a = torch.rand(num_gpus * 2, device=0)
@ -1603,10 +1730,10 @@ class TestCudaComm(TestCase):
outputs = [out1, out2]
out = scatter_gather.gather(outputs, 'cpu') # test on CPU
out = scatter_gather.gather(outputs, "cpu") # test on CPU
for i, x in enumerate(out):
self.assertTrue(isinstance(x, type(out2[-1]))) # x must be a tensor
cat = torch.cat((outputs[0][i].to('cpu'), outputs[1][i].to('cpu')))
cat = torch.cat((outputs[0][i].to("cpu"), outputs[1][i].to("cpu")))
self.assertTrue(torch.equal(x, cat))
out = scatter_gather.gather(outputs, 0) # test on GPU
@ -1635,15 +1762,15 @@ class TestCudaComm(TestCase):
cat = torch.cat((outputs[0][i].to(0), outputs[1][i].to(0)))
self.assertTrue(torch.equal(x, cat))
out = scatter_gather.gather(outputs, 'cpu') # test on CPU
out = scatter_gather.gather(outputs, "cpu") # test on CPU
for i, x in enumerate(out):
self.assertTrue(isinstance(x, type(out2[-1])))
cat = torch.cat((outputs[0][i].to('cpu'), outputs[1][i].to('cpu')))
cat = torch.cat((outputs[0][i].to("cpu"), outputs[1][i].to("cpu")))
self.assertTrue(torch.equal(x, cat))
instantiate_parametrized_tests(TestCudaMultiGPU)
if __name__ == '__main__':
if __name__ == "__main__":
run_tests()

File: test/test_cuda_nvml_based_avail.py

@ -1,19 +1,28 @@
# Owner(s): ["module: cuda"]
import multiprocessing
import os
import sys
import multiprocessing
import torch
import unittest
from unittest.mock import patch
import torch
# NOTE: Each of the tests in this module need to be run in a brand new process to ensure CUDA is uninitialized
# prior to test initiation.
with patch.dict(os.environ, {"PYTORCH_NVML_BASED_CUDA_CHECK": "1"}):
# Before executing the desired tests, we need to disable CUDA initialization and fork_handler additions that would
# otherwise be triggered by the `torch.testing._internal.common_utils` module import
from torch.testing._internal.common_utils import (parametrize, instantiate_parametrized_tests, run_tests, TestCase,
IS_WINDOWS, IS_JETSON, NoTest)
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
IS_JETSON,
IS_WINDOWS,
NoTest,
parametrize,
run_tests,
TestCase,
)
# NOTE: Because `remove_device_and_dtype_suffixes` initializes CUDA context (triggered via the import of
# `torch.testing._internal.common_device_type` which imports `torch.testing._internal.common_cuda`) we need
# to bypass that method here which should be irrelevant to the parameterized tests in this module.
@ -21,7 +30,7 @@ with patch.dict(os.environ, {"PYTORCH_NVML_BASED_CUDA_CHECK": "1"}):
TEST_CUDA = torch.cuda.is_available()
if not TEST_CUDA:
print('CUDA not available, skipping tests', file=sys.stderr)
print("CUDA not available, skipping tests", file=sys.stderr)
TestCase = NoTest # type: ignore[misc, assignment] # noqa: F811
@ -30,11 +39,14 @@ class TestExtendedCUDAIsAvail(TestCase):
SUBPROCESS_REMINDER_MSG = (
"\n REMINDER: Tests defined in test_cuda_nvml_based_avail.py must be run in a process "
"where there CUDA Driver API has not been initialized. Before further debugging, ensure you are either using "
"run_test.py or have added --subprocess to run each test in a different subprocess.")
"run_test.py or have added --subprocess to run each test in a different subprocess."
)
def setUp(self):
super().setUp()
torch.cuda._cached_device_count = None # clear the lru_cache on this method before our test
torch.cuda._cached_device_count = (
None # clear the lru_cache on this method before our test
)
@staticmethod
def in_bad_fork_test() -> bool:
@ -47,31 +59,33 @@ class TestExtendedCUDAIsAvail(TestCase):
# If the NVML-based assessment is attempted but fails, the CUDA Runtime API check should be executed
@unittest.skipIf(IS_WINDOWS, "Needs fork")
@parametrize("nvml_avail", [True, False])
@parametrize("avoid_init", ['1', '0', None])
@parametrize("avoid_init", ["1", "0", None])
def test_cuda_is_available(self, avoid_init, nvml_avail):
if IS_JETSON and nvml_avail and avoid_init == '1':
self.skipTest('Not working for Jetson')
if IS_JETSON and nvml_avail and avoid_init == "1":
self.skipTest("Not working for Jetson")
patch_env = {"PYTORCH_NVML_BASED_CUDA_CHECK": avoid_init} if avoid_init else {}
with patch.dict(os.environ, **patch_env):
if nvml_avail:
_ = torch.cuda.is_available()
else:
with patch.object(torch.cuda, '_device_count_nvml', return_value=-1):
with patch.object(torch.cuda, "_device_count_nvml", return_value=-1):
_ = torch.cuda.is_available()
with multiprocessing.get_context("fork").Pool(1) as pool:
in_bad_fork = pool.apply(TestExtendedCUDAIsAvail.in_bad_fork_test)
if os.getenv('PYTORCH_NVML_BASED_CUDA_CHECK') == '1' and nvml_avail:
self.assertFalse(in_bad_fork, TestExtendedCUDAIsAvail.SUBPROCESS_REMINDER_MSG)
if os.getenv("PYTORCH_NVML_BASED_CUDA_CHECK") == "1" and nvml_avail:
self.assertFalse(
in_bad_fork, TestExtendedCUDAIsAvail.SUBPROCESS_REMINDER_MSG
)
else:
assert in_bad_fork
@torch.testing._internal.common_utils.markDynamoStrictTest
class TestVisibleDeviceParses(TestCase):
def test_env_var_parsing(self):
def _parse_visible_devices(val):
from torch.cuda import _parse_visible_devices as _pvd
with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": val}, clear=True):
return _pvd()
@ -96,39 +110,57 @@ class TestVisibleDeviceParses(TestCase):
def test_partial_uuid_resolver(self):
from torch.cuda import _transform_uuid_to_ordinals
uuids = ['GPU-9942190a-aa31-4ff1-4aa9-c388d80f85f1',
'GPU-9e8d35e3-a134-0fdd-0e01-23811fdbd293',
'GPU-e429a63e-c61c-4795-b757-5132caeb8e70',
'GPU-eee1dfbc-0a0f-6ad8-5ff6-dc942a8b9d98',
'GPU-bbcd6503-5150-4e92-c266-97cc4390d04e',
'GPU-472ea263-58d7-410d-cc82-f7fdece5bd28',
'GPU-e56257c4-947f-6a5b-7ec9-0f45567ccf4e',
'GPU-1c20e77d-1c1a-d9ed-fe37-18b8466a78ad']
uuids = [
"GPU-9942190a-aa31-4ff1-4aa9-c388d80f85f1",
"GPU-9e8d35e3-a134-0fdd-0e01-23811fdbd293",
"GPU-e429a63e-c61c-4795-b757-5132caeb8e70",
"GPU-eee1dfbc-0a0f-6ad8-5ff6-dc942a8b9d98",
"GPU-bbcd6503-5150-4e92-c266-97cc4390d04e",
"GPU-472ea263-58d7-410d-cc82-f7fdece5bd28",
"GPU-e56257c4-947f-6a5b-7ec9-0f45567ccf4e",
"GPU-1c20e77d-1c1a-d9ed-fe37-18b8466a78ad",
]
self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3"], uuids), [1])
self.assertEqual(_transform_uuid_to_ordinals(["GPU-e4", "GPU-9e8d35e3"], uuids), [2, 1])
self.assertEqual(_transform_uuid_to_ordinals("GPU-9e8d35e3,GPU-1,GPU-47".split(","), uuids), [1, 7, 5])
self.assertEqual(
_transform_uuid_to_ordinals(["GPU-e4", "GPU-9e8d35e3"], uuids), [2, 1]
)
self.assertEqual(
_transform_uuid_to_ordinals("GPU-9e8d35e3,GPU-1,GPU-47".split(","), uuids),
[1, 7, 5],
)
# First invalid UUID aborts parsing
self.assertEqual(_transform_uuid_to_ordinals(["GPU-123", "GPU-9e8d35e3"], uuids), [])
self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-123", "GPU-47"], uuids), [1])
self.assertEqual(
_transform_uuid_to_ordinals(["GPU-123", "GPU-9e8d35e3"], uuids), []
)
self.assertEqual(
_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-123", "GPU-47"], uuids),
[1],
)
# First ambigous UUID aborts parsing
self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-e", "GPU-47"], uuids), [1])
self.assertEqual(
_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-e", "GPU-47"], uuids), [1]
)
# Duplicate UUIDs result in empty set
self.assertEqual(_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-47", "GPU-9e8"], uuids), [])
self.assertEqual(
_transform_uuid_to_ordinals(["GPU-9e8d35e3", "GPU-47", "GPU-9e8"], uuids),
[],
)
def test_ordinal_parse_visible_devices(self):
def _device_count_nvml(val):
from torch.cuda import _device_count_nvml as _dc
with patch.dict(os.environ, {"CUDA_VISIBLE_DEVICES": val}, clear=True):
return _dc()
with patch.object(torch.cuda, '_raw_device_count_nvml', return_value=2):
with patch.object(torch.cuda, "_raw_device_count_nvml", return_value=2):
self.assertEqual(_device_count_nvml("1, 0"), 2)
# Ordinal out of bounds aborts parsing
self.assertEqual(_device_count_nvml("1, 5, 0"), 1)
instantiate_parametrized_tests(TestExtendedCUDAIsAvail)
if __name__ == '__main__':
if __name__ == "__main__":
run_tests()

File: test/test_cuda_primary_ctx.py

@ -1,15 +1,21 @@
# Owner(s): ["module: cuda"]
import torch
from torch.testing._internal.common_utils import TestCase, run_tests, skipIfRocmVersionLessThan, NoTest
from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU
import sys
import unittest
import torch
from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU
from torch.testing._internal.common_utils import (
NoTest,
run_tests,
skipIfRocmVersionLessThan,
TestCase,
)
# NOTE: this needs to be run in a brand new process
if not TEST_CUDA:
print('CUDA not available, skipping tests', file=sys.stderr)
print("CUDA not available, skipping tests", file=sys.stderr)
TestCase = NoTest # noqa: F811
@ -18,17 +24,21 @@ class TestCudaPrimaryCtx(TestCase):
CTX_ALREADY_CREATED_ERR_MSG = (
"Tests defined in test_cuda_primary_ctx.py must be run in a process "
"where CUDA contexts are never created. Use either run_test.py or add "
"--subprocess to run each test in a different subprocess.")
"--subprocess to run each test in a different subprocess."
)
@skipIfRocmVersionLessThan((4, 4, 21504))
def setUp(self):
for device in range(torch.cuda.device_count()):
# Ensure context has not been created beforehand
self.assertFalse(torch._C._cuda_hasPrimaryContext(device), TestCudaPrimaryCtx.CTX_ALREADY_CREATED_ERR_MSG)
self.assertFalse(
torch._C._cuda_hasPrimaryContext(device),
TestCudaPrimaryCtx.CTX_ALREADY_CREATED_ERR_MSG,
)
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_str_repr(self):
x = torch.randn(1, device='cuda:1')
x = torch.randn(1, device="cuda:1")
# We should have only created context on 'cuda:1'
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
@ -43,13 +53,13 @@ class TestCudaPrimaryCtx(TestCase):
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_copy(self):
x = torch.randn(1, device='cuda:1')
x = torch.randn(1, device="cuda:1")
# We should have only created context on 'cuda:1'
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
self.assertTrue(torch._C._cuda_hasPrimaryContext(1))
y = torch.randn(1, device='cpu')
y = torch.randn(1, device="cpu")
y.copy_(x)
# We should still have only created context on 'cuda:1'
@ -58,7 +68,7 @@ class TestCudaPrimaryCtx(TestCase):
@unittest.skipIf(not TEST_MULTIGPU, "only one GPU detected")
def test_pin_memory(self):
x = torch.randn(1, device='cuda:1')
x = torch.randn(1, device="cuda:1")
# We should have only created context on 'cuda:1'
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
@ -70,7 +80,7 @@ class TestCudaPrimaryCtx(TestCase):
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
self.assertTrue(torch._C._cuda_hasPrimaryContext(1))
x = torch.randn(3, device='cpu').pin_memory()
x = torch.randn(3, device="cpu").pin_memory()
# We should still have only created context on 'cuda:1'
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
@ -82,19 +92,19 @@ class TestCudaPrimaryCtx(TestCase):
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
self.assertTrue(torch._C._cuda_hasPrimaryContext(1))
x = torch.randn(3, device='cpu', pin_memory=True)
x = torch.randn(3, device="cpu", pin_memory=True)
# We should still have only created context on 'cuda:1'
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
self.assertTrue(torch._C._cuda_hasPrimaryContext(1))
x = torch.zeros(3, device='cpu', pin_memory=True)
x = torch.zeros(3, device="cpu", pin_memory=True)
# We should still have only created context on 'cuda:1'
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
self.assertTrue(torch._C._cuda_hasPrimaryContext(1))
x = torch.empty(3, device='cpu', pin_memory=True)
x = torch.empty(3, device="cpu", pin_memory=True)
# We should still have only created context on 'cuda:1'
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
@ -106,5 +116,6 @@ class TestCudaPrimaryCtx(TestCase):
self.assertFalse(torch._C._cuda_hasPrimaryContext(0))
self.assertTrue(torch._C._cuda_hasPrimaryContext(1))
if __name__ == '__main__':
if __name__ == "__main__":
run_tests()

File: test/test_cuda_sanitizer.py

@ -7,8 +7,8 @@ from typing import List
import torch
import torch.cuda._sanitizer as csan
from torch.cuda._sanitizer import StreamId, DataPtr, EventId
from torch.testing._internal.common_utils import TestCase, run_tests, NoTest, TEST_CUDA
from torch.cuda._sanitizer import DataPtr, EventId, StreamId
from torch.testing._internal.common_utils import NoTest, run_tests, TEST_CUDA, TestCase
if not TEST_CUDA:

File: test/test_cuda_trace.py

@ -6,7 +6,7 @@ import unittest.mock
import torch
import torch.cuda._gpu_trace as gpu_trace
from torch.testing._internal.common_utils import TestCase, run_tests, NoTest, TEST_CUDA
from torch.testing._internal.common_utils import NoTest, run_tests, TEST_CUDA, TestCase
# NOTE: Each test needs to be run in a brand new process, to reset the registered hooks
# and make sure the CUDA streams are initialized for each test that uses them.