Refactor multigpu tests to test_cuda_multigpu (#104059)

Mostly a refactor: moves all the tests from `test_cuda` that benefit from a multi-GPU environment into their own file.

- Add a `TestCudaMallocAsync` class for the async tests (to separate them from `TestCudaComm`)
- Move individual tests from `TestCuda` to `TestCudaMultiGPU` (a minimal sketch of the test shape follows this list)
- Move `_create_scaling_models_optimizers` and `_create_scaling_case` to `torch.testing._internal.common_cuda`
- Add the newly created `test_cuda_multigpu` to the multigpu periodic test
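
For orientation, here is a hypothetical minimal sketch of the shape a test takes in the new `test_cuda_multigpu.py`; the test method name, tensor sizes, and assertion are illustrative and not taken from this PR, while `TEST_MULTIGPU`, `TestCase`, and `run_tests` are existing PyTorch testing helpers:

```python
import unittest

import torch
from torch.testing._internal.common_cuda import TEST_MULTIGPU
from torch.testing._internal.common_utils import TestCase, run_tests


class TestCudaMultiGPU(TestCase):
    @unittest.skipIf(not TEST_MULTIGPU, "requires at least two GPUs")
    def test_cross_device_copy(self):
        # Round-trip a tensor between two devices and check that the data survives.
        x = torch.randn(8, 8, device="cuda:0")
        y = x.to("cuda:1")
        self.assertEqual(x.cpu(), y.cpu())


if __name__ == "__main__":
    run_tests()
```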

<!--
copilot:summary
-->
### <samp>🤖 Generated by Copilot at f4d46fa</samp>

This pull request fixes a flaky test and improves the testing of gradient scaling on multiple GPUs. It adds verbose output for two CUDA tests, and refactors some common code into helper functions in `torch/testing/_internal/common_cuda.py`.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/104059
Approved by: https://github.com/huydhn
Author: Nikita Shulga
Date: 2023-06-27 05:32:05 +00:00
Committed by: PyTorch MergeBot
Parent: 572ff2779b
Commit: c3e4a67905

4 changed files with 1773 additions and 1725 deletions


@@ -8,6 +8,7 @@
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"
echo "Testing pytorch"
time python test/run_test.py --include test_cuda_multigpu test_cuda_primary_ctx --verbose
# Disabling tests to see if they solve timeout issues; see https://github.com/pytorch/pytorch/issues/70015
# python tools/download_mnist.py --quiet -d test/cpp/api/mnist

test/test_cuda.py — file diff suppressed because it is too large

test/test_cuda_multigpu.py — new file, 1721 lines (file diff suppressed because it is too large)

torch/testing/_internal/common_cuda.py

@@ -199,6 +199,40 @@ def _check_hipsparse_generic_available():
TEST_CUSPARSE_GENERIC = _check_cusparse_generic_available()
TEST_HIPSPARSE_GENERIC = _check_hipsparse_generic_available()

# Shared by test_cuda.py and test_multigpu.py
def _create_scaling_models_optimizers(device="cuda", optimizer_ctor=torch.optim.SGD, optimizer_kwargs=None):
    # Create a module+optimizer that will use scaling, and a control module+optimizer
    # that will not use scaling, against which the scaling-enabled module+optimizer can be compared.
    mod_control = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 8)).to(device=device)
    mod_scaling = torch.nn.Sequential(torch.nn.Linear(8, 8), torch.nn.Linear(8, 8)).to(device=device)
    with torch.no_grad():
        for c, s in zip(mod_control.parameters(), mod_scaling.parameters()):
            s.copy_(c)

    kwargs = {"lr": 1.0}
    if optimizer_kwargs is not None:
        kwargs.update(optimizer_kwargs)
    opt_control = optimizer_ctor(mod_control.parameters(), **kwargs)
    opt_scaling = optimizer_ctor(mod_scaling.parameters(), **kwargs)

    return mod_control, mod_scaling, opt_control, opt_scaling


def _create_scaling_case(device="cuda", dtype=torch.float, optimizer_ctor=torch.optim.SGD, optimizer_kwargs=None):
    data = [(torch.randn((8, 8), dtype=dtype, device=device), torch.randn((8, 8), dtype=dtype, device=device)),
            (torch.randn((8, 8), dtype=dtype, device=device), torch.randn((8, 8), dtype=dtype, device=device)),
            (torch.randn((8, 8), dtype=dtype, device=device), torch.randn((8, 8), dtype=dtype, device=device)),
            (torch.randn((8, 8), dtype=dtype, device=device), torch.randn((8, 8), dtype=dtype, device=device))]

    loss_fn = torch.nn.MSELoss().cuda()
    skip_iter = 2

    return _create_scaling_models_optimizers(
        device=device, optimizer_ctor=optimizer_ctor, optimizer_kwargs=optimizer_kwargs,
    ) + (data, loss_fn, skip_iter)


# Importing this module should NOT eagerly initialize CUDA
if not CUDA_ALREADY_INITIALIZED_ON_IMPORT:
    assert not torch.cuda.is_initialized()
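
To make the intent of these shared helpers concrete, here is a minimal consumption sketch; the loop structure, the `GradScaler` settings, and the function name are illustrative assumptions, not code from this PR:

```python
import torch
from torch.testing._internal.common_cuda import _create_scaling_case


def _run_scaling_example():
    # Build matched scaled/unscaled models, their optimizers, and synthetic data.
    (mod_control, mod_scaling, opt_control, opt_scaling,
     data, loss_fn, skip_iter) = _create_scaling_case(device="cuda")
    # skip_iter marks where the real tests inject an inf to exercise step skipping; unused here.
    scaler = torch.cuda.amp.GradScaler(init_scale=128.0)
    for input, target in data:
        # Scaled side: scale the loss and step through the GradScaler.
        opt_scaling.zero_grad()
        scaler.scale(loss_fn(mod_scaling(input), target)).backward()
        scaler.step(opt_scaling)
        scaler.update()
        # Control side: a plain fp32 step used as the comparison baseline.
        opt_control.zero_grad()
        loss_fn(mod_control(input), target).backward()
        opt_control.step()
    # A real test would now compare mod_control and mod_scaling parameters.
```

Keeping these helpers in `common_cuda.py` lets both `test_cuda.py` and the new `test_cuda_multigpu.py` build the same control-vs-scaled setup without duplicating it.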