Compare commits


1 commit

SHA1 Message Date
df99d6c7a6 Enable some tests of TEST_NN class on Intel GPU 2025-10-29 01:29:56 +00:00
3 changed files with 136 additions and 118 deletions

View File

@@ -2,7 +2,7 @@
# ruff: noqa: F841
import unittest
from torch.testing._internal.common_utils import TestCase, run_tests, TEST_NUMPY
from torch.testing._internal.common_utils import skipIfTorchDynamo
from torch.testing._internal.common_utils import skipIfTorchDynamo, skipIfXpu
from torch.testing._internal.common_cuda import TEST_CUDA
from torch.testing._internal.common_device_type import get_all_device_types
from collections import namedtuple, OrderedDict
@@ -285,6 +285,7 @@ class TestNamedTensor(TestCase):
empty_named_tensor = torch.tensor([[1, 2, 3, 4], [4, 3, 2, 1]], names=[])
self.assertEqual(ref_tensor, empty_named_tensor)
@skipIfXpu # https://github.com/intel/torch-xpu-ops/issues/2023
def test_max_pooling(self):
def check_tuple_return(op, inputs, expected_names):
values, indices = op(*inputs)
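
The @skipIfXpu decorator added above is how this PR enables a whole test file on Intel GPU while quarantining individual failures behind torch-xpu-ops tracker issues. A minimal usage sketch (the test class and method are illustrative, not part of this PR; per common_utils, the bare decorator form skips whenever an XPU stack is detected):

    from torch.testing._internal.common_utils import TestCase, run_tests, skipIfXpu

    class ExampleTests(TestCase):
        @skipIfXpu  # quarantined on XPU, still runs on CPU/CUDA
        def test_known_xpu_gap(self):
            pass

    if __name__ == "__main__":
        run_tests()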

View File

@@ -32,13 +32,13 @@ from torch.nn import Buffer, Parameter
from torch.nn.parallel._functions import Broadcast
from torch.testing._internal.common_dtype import integral_types, get_all_math_dtypes, floating_types
from torch.testing._internal.common_utils import dtype_name, freeze_rng_state, run_tests, TestCase, \
skipIfNoLapack, skipIfRocm, \
skipIfNoLapack, skipIfRocm, skipIfXpu, \
TEST_NUMPY, TEST_SCIPY, TEST_WITH_CROSSREF, TEST_WITH_ROCM, \
download_file, get_function_arglist, load_tests, skipIfMPS, \
IS_PPC, \
parametrize as parametrize_test, subtest, instantiate_parametrized_tests, \
skipIfTorchDynamo, gcIfJetson, set_default_dtype
from torch.testing._internal.common_cuda import TEST_CUDA, TEST_MULTIGPU, TEST_CUDNN, \
from torch.testing._internal.common_cuda import TEST_CUDA, TEST_CUDNN, \
_get_torch_rocm_version
from torch.testing._internal.common_nn import NNTestCase, NewModuleTest, CriterionTest, \
module_tests, criterion_tests, loss_reference_fns, _create_basic_net, \
@@ -47,7 +47,7 @@ from torch.testing._internal.common_device_type import dtypesIfMPS, instantiate_
dtypesIfCUDA, precisionOverride, onlyCUDA, onlyCPU, \
skipCUDAIfRocm, skipCUDAIf, skipCUDAIfNotRocm, \
onlyNativeDeviceTypes, deviceCountAtLeast, largeTensorTest, expectedFailureMeta, expectedFailureMPS, \
skipMeta, get_all_device_types
skipMeta, get_all_device_types, TEST_XPU
from hypothesis import given
import torch.testing._internal.hypothesis_utils as hu
@@ -75,14 +75,18 @@ if TEST_SCIPY:
if TEST_NUMPY:
import numpy as np
TEST_GPU = torch.cuda.is_available() or torch.xpu.is_available()
TEST_MULTIGPU = TEST_GPU and torch.accelerator.device_count() >= 2
device_type = (acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu")
# WARNING: If you add a new top-level test case to this file, you MUST
# update test/run_test.py to list it, otherwise it will NOT be run in
# CI.
class TestNN(NNTestCase):
_do_cuda_memory_leak_check = True
_do_cuda_non_default_stream = True
if TEST_CUDA:
_do_cuda_memory_leak_check = True
_do_cuda_non_default_stream = True
def _forward(self, module, input: _TensorOrTensors):
with freeze_rng_state():
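
The three new module-level lines above (TEST_GPU, TEST_MULTIGPU, device_type) are the backbone of the port: every hard-coded 'cuda' below becomes a device string derived from torch.accelerator, so the same test body runs on CUDA or XPU. A standalone sketch of the pattern, assuming a PyTorch build recent enough to ship the torch.accelerator API:

    import torch

    # Resolve the active accelerator once; fall back to CPU when none exists.
    acc = torch.accelerator.current_accelerator()
    device_type = acc.type if acc else "cpu"   # "cuda", "xpu", or "cpu"

    has_gpu = torch.cuda.is_available() or torch.xpu.is_available()
    multi_gpu = has_gpu and torch.accelerator.device_count() >= 2

    x = torch.randn(4, 4, device=device_type)  # replaces device="cuda"
    print(device_type, multi_gpu, x.device)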
@@ -227,12 +231,13 @@ class TestNN(NNTestCase):
self.assertEqual(m.double(), m.to(torch.float64))
self.assertRaises(RuntimeError, lambda: m.to('cpu', copy=True))
if torch.cuda.is_available():
for cuda in ['cuda', 'cuda:0' if torch.cuda.device_count() == 1 else 'cuda:1']:
m2 = m.cuda(device=cuda)
self.assertIs(m2, m2.to(cuda))
if torch.cuda.is_available() or torch.xpu.is_available():
for gpu in [device_type,
str(torch.device(0)) if torch.accelerator.device_count() == 1 else str(torch.device(1))]:
m2 = m.to(gpu)
self.assertIs(m2, m2.to(gpu))
self.assertEqual(m, m2.to('cpu'))
self.assertEqual(m2, m.to(cuda))
self.assertEqual(m2, m.to(gpu))
self.assertIs(m2, m2.to(dtype=torch.float32))
self.assertEqual(m2.double(), m2.to(dtype=torch.float64))
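
The assertIs checks in this hunk are backend-independent because of .to() identity semantics: nn.Module.to() converts in place and returns the module itself, whereas Tensor.to() returns the same object only for no-op conversions. A small CPU-only illustration:

    import torch
    import torch.nn as nn

    m = nn.Linear(3, 3)
    assert m.to(torch.float64) is m        # Module.to() is in-place and returns self

    t = torch.zeros(2, dtype=torch.float32)
    assert t.to(torch.float32) is t        # no-op conversion: same tensor object
    assert t.to(torch.float64) is not t    # real conversion: new tensor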
@@ -1923,6 +1928,7 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
m = pickle.loads(pickle.dumps(m))
self.assertIsInstance(m, nn.Linear)
@skipIfXpu # https://github.com/intel/torch-xpu-ops/issues/2228
@skipIfTorchDynamo("TorchDynamo fails here for unknown reasons")
@set_default_dtype(torch.double)
def test_spectral_norm(self):
@@ -2290,34 +2296,37 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
inputs = torch.randn((), requires_grad=True)
self.assertTrue(gradcheck(lambda x: F.normalize(x, p=1, dim=-1), (inputs,)))
@skipIfXpu # https://github.com/intel/torch-xpu-ops/issues/2228
@unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
# Skip the test for ROCm as per https://github.com/pytorch/pytorch/issues/53190
@skipIfRocm
def test_broadcast_double_backwards_gpu(self):
tensors = (torch.randn(4, 4, device='cuda', requires_grad=True, dtype=torch.double),
torch.randn(4, 4, device='cuda', requires_grad=True, dtype=torch.double),
torch.randn(4, 4, device='cuda', requires_grad=True, dtype=torch.double))
tensors = (torch.randn(4, 4, device=device_type, requires_grad=True, dtype=torch.double),
torch.randn(4, 4, device=device_type, requires_grad=True, dtype=torch.double),
torch.randn(4, 4, device=device_type, requires_grad=True, dtype=torch.double))
# TODO(#50743): the following segfaults with check_batched_grad=True
_assertGradAndGradgradChecks(self, lambda *i: Broadcast.apply((0, 1), *i), tensors,
check_batched_grad=False)
@skipIfXpu # https://github.com/intel/torch-xpu-ops/issues/2228
@unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
def test_broadcast_not_requiring_grad(self):
variables = [
torch.randn(1, 2, device='cuda', requires_grad=True),
torch.randn(1, 2, device='cuda', requires_grad=False),
torch.randn(1, 2, device='cuda', requires_grad=False),
torch.randn(1, 2, device='cuda', requires_grad=True),
torch.randn(1, 2, device='cuda', requires_grad=True),
torch.randn(1, 2, device=device_type, requires_grad=True),
torch.randn(1, 2, device=device_type, requires_grad=False),
torch.randn(1, 2, device=device_type, requires_grad=False),
torch.randn(1, 2, device=device_type, requires_grad=True),
torch.randn(1, 2, device=device_type, requires_grad=True),
]
broadcasted_variables = Broadcast.apply((0, 1), *variables)
for output_idx, broadcasted_var in enumerate(broadcasted_variables):
input_var = variables[output_idx % len(variables)]
self.assertEqual(input_var.requires_grad, broadcasted_var.requires_grad)
@skipIfXpu # https://github.com/intel/torch-xpu-ops/issues/2228
@unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
def test_broadcast_no_grad(self):
x = torch.randn(1, 2, dtype=torch.float32, requires_grad=True, device='cuda')
x = torch.randn(1, 2, dtype=torch.float32, requires_grad=True, device=device_type)
with torch.no_grad():
broadcasted = Broadcast.apply((0, 1), x)
self.assertTrue(x.requires_grad)
@@ -2733,12 +2742,12 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
target_lengths = target_lengths.to(dtype=torch.float)
torch.nn.functional.ctc_loss(log_probs, targets, input_lengths, target_lengths)
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
@unittest.skipIf(not TEST_CUDA and not TEST_XPU, 'CUDA and XPU not available')
def test_CTCLoss_lengthchecks_cuda(self):
for target_lengths in [[30, 25, 20], [-1, -1, -1]]:
for input_lengths in [[50, 50, 50], [-1, -1, -1]]:
targets = torch.randint(1, 15, (3, 29), dtype=torch.long, device='cuda')
log_probs = torch.randn(50, 3, 15, dtype=torch.float, device='cuda').log_softmax(2)
targets = torch.randint(1, 15, (3, 29), dtype=torch.long, device=device_type)
log_probs = torch.randn(50, 3, 15, dtype=torch.float, device=device_type).log_softmax(2)
with self.assertRaises(RuntimeError):
torch.nn.functional.ctc_loss(log_probs, targets, input_lengths, target_lengths)
@@ -2750,7 +2759,7 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
with self.assertRaises(RuntimeError):
torch.nn.functional.ctc_loss(log_probs, targets, input_lengths, target_lengths)
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
@unittest.skipIf(not TEST_CUDA and not TEST_XPU, 'CUDA and XPU not available')
def test_CTCLoss_long_targets(self):
input_length = 4000
vocab_size = 3
@@ -2768,13 +2777,17 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
grad_cpu, = torch.autograd.grad(res_cpu, log_probs, grad_out)
with torch.backends.cudnn.flags(enabled=False):
res_gpu = torch.nn.functional.ctc_loss(log_probs.cuda(), targets.cuda(), input_lengths, target_lengths,
reduction='sum', zero_infinity=True)
grad_gpu, = torch.autograd.grad(res_gpu, log_probs, grad_out.cuda())
res_gpu = torch.nn.functional.ctc_loss(log_probs.to(device_type),
targets.to(device_type),
input_lengths,
target_lengths,
reduction='sum',
zero_infinity=True)
grad_gpu, = torch.autograd.grad(res_gpu, log_probs, grad_out.to(device_type))
self.assertEqual(res_cpu, res_gpu, atol=1e-4, rtol=0)
self.assertEqual(grad_cpu, grad_gpu, atol=1e-4, rtol=0)
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
@unittest.skipIf(not TEST_CUDA and not TEST_XPU, 'CUDA and XPU not available')
def test_CTCLoss_critical_target_len(self):
# cudnn has an unexpected problem with target length 256, see issue #53505
N = 1
@@ -2784,7 +2797,7 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
target = torch.randint(low=1, high=C, size=(S,), dtype=torch.int)
input_lengths = torch.full(size=(N,), fill_value=T, dtype=torch.int)
target_lengths = torch.tensor(S, dtype=torch.int)
inp = torch.randn(T, N, C, dtype=torch.float, device='cuda').log_softmax(2).requires_grad_()
inp = torch.randn(T, N, C, dtype=torch.float, device=device_type).log_softmax(2).requires_grad_()
with cudnn.flags(enabled=True):
res_gpu = torch.nn.functional.ctc_loss(inp, target, input_lengths, target_lengths, reduction='none')
res_cpu = torch.nn.functional.ctc_loss(inp.cpu(), target, input_lengths, target_lengths, reduction='none')
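
The CTCLoss hunks above all follow one shape: build inputs via device_type, run the loss on the accelerator (with cuDNN disabled where it would interfere), and compare against a CPU reference under a loose tolerance. A condensed sketch of that comparison, assuming device_type is resolved as earlier in the file; shapes and tolerances mirror the tests but are otherwise illustrative:

    import torch
    import torch.nn.functional as F

    device_type = (acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu")
    T, N, C = 50, 3, 15
    log_probs = torch.randn(T, N, C).log_softmax(2)
    targets = torch.randint(1, C, (N, 29), dtype=torch.long)
    input_lengths = torch.full((N,), T, dtype=torch.long)
    target_lengths = torch.full((N,), 29, dtype=torch.long)

    res_cpu = F.ctc_loss(log_probs, targets, input_lengths, target_lengths, reduction='sum')
    res_dev = F.ctc_loss(log_probs.to(device_type), targets.to(device_type),
                         input_lengths, target_lengths, reduction='sum')
    torch.testing.assert_close(res_dev.cpu(), res_cpu, atol=1e-4, rtol=0)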
@@ -2792,7 +2805,7 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
def test_CTCLoss_zero_lengths(self):
devices = ['cpu']
devices += ['cuda'] if TEST_CUDA else []
devices += [device_type] if TEST_CUDA or TEST_XPU else []
N = 3
S = 2
C = 200
@@ -2814,16 +2827,16 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
res.sum().backward()
self.assertTrue((inp.grad == 0).all().item())
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
@unittest.skipIf(not TEST_CUDA and not TEST_XPU, 'CUDA and XPU not available')
def test_CTCLoss_zero_infinity(self):
target_lengths = [60, 25, 20]
input_lengths = [50, 50, 50]
targets = torch.randint(1, 15, (sum(target_lengths),), dtype=torch.int, device='cuda')
log_probs = torch.randn(50, 3, 15, dtype=torch.float, device='cuda').log_softmax(2).requires_grad_()
targets = torch.randint(1, 15, (sum(target_lengths),), dtype=torch.int, device=device_type)
log_probs = torch.randn(50, 3, 15, dtype=torch.float, device=device_type).log_softmax(2).requires_grad_()
res = torch.nn.functional.ctc_loss(log_probs, targets, input_lengths, target_lengths,
reduction='sum', zero_infinity=True)
with torch.backends.cudnn.flags(enabled=False):
res2 = torch.nn.functional.ctc_loss(log_probs, targets.cuda().long(), input_lengths, target_lengths,
res2 = torch.nn.functional.ctc_loss(log_probs, targets.to(device_type).long(), input_lengths, target_lengths,
reduction='sum', zero_infinity=True)
res_cpu = torch.nn.functional.ctc_loss(log_probs.cpu(), targets.cpu(), input_lengths, target_lengths,
reduction='sum', zero_infinity=True)
@@ -2894,12 +2907,12 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
self.assertRaises(Exception, lambda: lstm(input, (cx, hx)))
@unittest.skipIf(not TEST_CUDA, 'CUDA not available')
@unittest.skipIf(not TEST_CUDA and not TEST_XPU, 'CUDA and XPU not available')
def test_pack_sequence_batch_sizes_throw(self):
with self.assertRaisesRegex(ValueError, r"batch_sizes should always be on CPU"):
m = nn.LSTM(3, 4, bidirectional=True, num_layers=2).to('cuda')
a = torch.rand(5, 3, device='cuda')
b = torch.tensor([1, 1, 1, 1, 1], device='cuda')
m = nn.LSTM(3, 4, bidirectional=True, num_layers=2).to(device_type)
a = torch.rand(5, 3, device=device_type)
b = torch.tensor([1, 1, 1, 1, 1], device=device_type)
input = nn.utils.rnn.PackedSequence(a, b)
def test_Transformer_cell(self):
@@ -3173,7 +3186,7 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
nhead = 2
dim_feedforward = 16
dropout = 0.0
device = torch.device("cuda" if use_cuda else "cpu")
device = torch.device(device_type if use_cuda else "cpu")
layer = nn.TransformerDecoderLayer(
d_model,
@@ -3199,8 +3212,8 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
def perm_fn(x):
return x.transpose(1, 0) if batch_first else x
activation = F.relu
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
use_cuda = torch.cuda.is_available() or torch.xpu.is_available()
device = torch.device(device_type if use_cuda else "cpu")
decoder_layer = get_a_test_layer(use_cuda=use_cuda, activation=activation,
batch_first=batch_first)
@@ -3424,8 +3437,8 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
# gelu activation test cases
activation = "gelu"
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
use_cuda = torch.cuda.is_available() or torch.xpu.is_available()
device = torch.device(device_type if use_cuda else "cpu")
decoder_layer = get_a_test_layer(use_cuda=use_cuda, activation=activation,
batch_first=batch_first)
@@ -3495,9 +3508,9 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
def test_cudnn_rnn_dropout_states_device(self):
rnn = nn.RNN(10, 20, num_layers=2, dropout=.5)
device = 1
input = torch.randn(5, 4, 10).cuda(device)
rnn.cuda(device)
hx = torch.randn(2, 4, 20).cuda(device)
input = torch.randn(5, 4, 10).to(device)
rnn.to(device)
hx = torch.randn(2, 4, 20).to(device)
output = rnn(input, hx)
def test_cudnn_forward_exception(self):
@@ -3524,7 +3537,7 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
# ROCm RNN does not issue warning about single contig chunk of memory, so don't assert it
first_warn = not torch.version.hip
for rnn in rnns:
rnn.cuda()
rnn.to(device_type)
input = torch.randn(5, 4, 10, requires_grad=True, device="cuda")
hx = torch.randn(1, 5, 20, requires_grad=True, device="cuda")
all_vars = [input, hx] + list(rnn.parameters())
@@ -3579,7 +3592,7 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
]
for rnn in rnns:
rnn.bias_ih_l0_reverse = rnn.bias_ih_l0
rnn.cuda()
rnn.to(device_type)
input = torch.randn(5, 4, 10, requires_grad=True, device="cuda")
hx = torch.randn(2, 5, 20, requires_grad=True, device="cuda")
all_vars = [input, hx] + list(rnn.parameters())
@@ -3870,7 +3883,7 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
def test(input_shape, hidden_h_shape, hidden_c_shape):
for input, hidden in get_inputs(input_shape, hidden_h_shape, hidden_c_shape):
model = nn.LSTM(input_size, hidden_size, num_layers, proj_size=proj_size)
model = nn.LSTM(input_size, hidden_size, num_layers, proj_size=proj_size, device=device_type)
self.assertRaises(RuntimeError, lambda: model(input, hidden))
correct_input_shape = (seq_len, batch_size, input_size)
@@ -3950,14 +3963,14 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
for mode in rnn_modes:
model = getattr(nn, mode)(input_size, hidden_size, num_layers)
model_cuda = copy.deepcopy(model).to('cuda:0')
model_cuda = copy.deepcopy(model).to(0)
input = torch.randn(correct_input_shape)
hidden = torch.randn(correct_hidden_shape)
# input and weights are not at the same device
with self.assertRaisesRegex(RuntimeError,
"Input and parameter tensors are not at the same device"):
model(input.to('cuda:0'))
model(input.to(0))
with self.assertRaisesRegex(RuntimeError,
"Input and parameter tensors are not at the same device"):
model_cuda(input)
@@ -3966,21 +3979,21 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
with self.assertRaisesRegex(RuntimeError,
r"Input and hidden tensors are not at the same device"):
if mode == 'LSTM':
model(input, (hidden.to('cuda:0'), hidden.to('cuda:0')))
model(input, (hidden.to(0), hidden.to(0)))
else:
model(input, (hidden.to('cuda:0')))
model(input, (hidden.to(0)))
with self.assertRaisesRegex(RuntimeError,
r"Input and hidden tensors are not at the same device"):
if mode == 'LSTM':
model_cuda(input.to('cuda:0'), (hidden, hidden))
model_cuda(input.to(0), (hidden, hidden))
else:
model_cuda(input.to('cuda:0'), (hidden))
model_cuda(input.to(0), (hidden))
# hidden tensors are not at the same CUDA device
if mode == 'LSTM':
with self.assertRaisesRegex(RuntimeError,
"Input and hidden tensors are not at the same device"):
model(input.to('cuda:0'), (hidden.to('cuda:0'), hidden.to('cuda:1')))
model(input.to(0), (hidden.to(0), hidden.to(1)))
@unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported")
def test_projections_lstm_check_device(self):
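
The switch from 'cuda:0'/'cuda:1' literals to bare indices 0 and 1 in these hunks leans on integer device arguments resolving against the current accelerator, so the same assertion exercises cuda:1 or xpu:1 depending on the build (recent PyTorch behavior; older releases treated bare integers as CUDA indices). Sketch, assuming an accelerator is present:

    import torch

    if torch.accelerator.is_available():
        t = torch.zeros(2).to(0)        # index 0 on the current accelerator
        print(t.device)                 # e.g. cuda:0 on NVIDIA, xpu:0 on Intel
        print(torch.device(0))          # torch.device follows the same resolution rule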
@@ -4004,17 +4017,17 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
# input and weights are not at the same device
with self.assertRaisesRegex(RuntimeError,
"Input and parameter tensors are not at the same device"):
model(input.to('cuda:0'))
model(input.to(0))
# input and hiddens are not at the same device
with self.assertRaisesRegex(RuntimeError,
r"Input and hidden tensors are not at the same device"):
model(input, (hidden_h.to('cuda:0'), hidden_c.to('cuda:0')))
model(input, (hidden_h.to(0), hidden_c.to(0)))
# hidden tensors are not at the same CUDA device
with self.assertRaisesRegex(RuntimeError,
"Input and hidden tensors are not at the same device"):
model(input.to('cuda:0'), (hidden_h.to('cuda:0'), hidden_c.to('cuda:1')))
model(input.to(0), (hidden_h.to(0), hidden_c.to(1)))
def test_rnn_initial_hidden_state(self):
rnn_modes = ['RNN', 'GRU', 'LSTM']
@@ -4077,17 +4090,17 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
hx = hx_val.clone().requires_grad_(True)
if cuda:
rnn.cuda()
input_var.data = input_var.data.cuda()
rnn.to(device_type)
input_var.data = input_var.data.to(device_type)
if is_lstm:
hx[0].data = hx[0].data.cuda()
hx[1].data = hx[1].data.cuda()
hx[0].data = hx[0].data.to(device_type)
hx[1].data = hx[1].data.to(device_type)
else:
hx.data = hx.data.cuda()
grad_hy = grad_hy.cuda()
hx.data = hx.data.to(device_type)
grad_hy = grad_hy.to(device_type)
if grad_cy is not None:
grad_cy = grad_cy.cuda()
grad_output = grad_output.cuda()
grad_cy = grad_cy.to(device_type)
grad_output = grad_output.to(device_type)
output, hy = rnn(input, hx)
@@ -4286,8 +4299,8 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
m = torch.nn.utils.weight_norm(m, name=name)
# moves to CUDA
m = m.cuda()
input = input.cuda()
m = m.to(device_type)
input = input.to(device_type)
# otherwise, subsequent warnings will be hidden, and further tests rely on them
warnings.simplefilter("always")
@@ -4315,10 +4328,10 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
self.assertFalse(hasattr(m, "weight_hh_l0"))
# verifies that moving to CUDA with only some attributes defined
# does not throw an error
m.cuda()
m.to(device_type)
# recompute the weight and make sure that module can be used
m.weight_hh_l0 = weight_orig.cuda()
inp = inp.cuda()
m.weight_hh_l0 = weight_orig.to(device_type)
inp = inp.to(device_type)
# otherwise, subsequent warnings will be hidden, and further tests rely on them
warnings.simplefilter("always")
self.assertEqual(m(inp)[0].cpu(), out_expected[0])
@@ -4333,7 +4346,7 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
for cuda in (True, False):
rnn = nn.RNN(10, 1000, 2, bias=False, dropout=p, nonlinearity='relu')
if cuda:
rnn.cuda()
rnn.to(device_type)
if train:
rnn.train()
@@ -4346,8 +4359,8 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
input = torch.ones(1, 1, 10)
hx = torch.zeros(2, 1, 1000)
if cuda:
input = input.cuda()
hx = hx.cuda()
input = input.to(device_type)
hx = hx.to(device_type)
output, hy = rnn(input, hx)
self.assertEqual(output.data.min(), output.data.max())
@@ -4367,7 +4380,7 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
self.assertEqual(hy.data[0][0][0], 10)
self.assertEqual(hy.data[1][0][0], output_val)
@unittest.skipIf(not TEST_CUDNN, "needs cudnn")
@unittest.skipIf(not TEST_CUDNN and not TEST_XPU, "needs cudnn or xpu")
@set_default_dtype(torch.double)
def test_error_RNN_seq_len_zero(self):
# checking error message when RNN has seq_len = 0
@@ -4376,9 +4389,9 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
for device in get_all_device_types():
input = torch.ones(0, 10, 5)
rnn = module(5, 6, bidirectional=bidirectional)
if device == 'cuda':
rnn.cuda()
input = input.cuda()
if device == device_type:
rnn.to(device_type)
input = input.to(device_type)
with self.assertRaisesRegex(RuntimeError, "Expected sequence length to be larger than 0 in RNN"):
rnn(input)
@@ -4388,9 +4401,9 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
for device in get_all_device_types():
input = torch.zeros((5, 0, 3))
rnn = module(input_size=3, hidden_size=4)
if device == 'cuda':
rnn.cuda()
input = input.cuda()
if device == 'cuda' or device == 'xpu':
rnn.to(device_type)
input = input.to(device_type)
outs = rnn(input)
self.assertEqual(outs[0].shape, torch.Size([5, 0, 4]))
# Check that backward does not cause a hard error
@@ -4403,7 +4416,7 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
for cuda in (True, False):
rnn = nn.RNN(100, 100, 2, bias=False, dropout=p, nonlinearity='relu')
if cuda:
rnn.cuda()
rnn.to(device_type)
if train:
rnn.train()
@@ -4412,8 +4425,8 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
input = torch.rand(1, 1, 100)
hx = torch.rand(2, 1, 100)
if cuda:
input = input.cuda()
hx = hx.cuda()
input = input.to(device_type)
hx = hx.to(device_type)
output1, hy1 = rnn(input, hx)
output2, hy2 = rnn(input, hx)
@@ -4444,8 +4457,8 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
rnn = nn.RNN(100, 100, 2, dropout=0, nonlinearity='relu')
input = torch.rand(3, 2, 100)
if cuda:
input.data = input.data.cuda()
rnn.cuda()
input.data = input.data.to(device_type)
rnn.to(device_type)
if train:
rnn.train()
@@ -4654,7 +4667,7 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
def test_PReLU_backward_requires_grad_false(self):
devices = ['cpu']
devices += ['cuda'] if TEST_CUDA else []
devices += [device_type] if TEST_CUDA or TEST_XPU else []
for d in devices:
m = nn.PReLU().to(d)
x = torch.randn(2, 3, 4, 5, device=d, requires_grad=False)
@@ -5008,17 +5021,17 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
helper(self, torch.bfloat16)
helper(self, torch.float16)
@unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
@unittest.skipIf(not TEST_CUDNN, "needs cudnn")
@unittest.skipIf(not TEST_CUDA and not TEST_XPU, "CUDA and XPU unavailable")
@unittest.skipIf(not TEST_CUDNN and not TEST_XPU, "needs cudnn or xpu")
def test_batchnorm_cudnn_nhwc(self):
def run_test(input, grad_output):
c = input.size(1)
mod = nn.BatchNorm2d(c).cuda().float()
mod = nn.BatchNorm2d(c).to(device_type).float()
mod.weight.data.uniform_()
mod.bias.data.uniform_()
ref_input = input.detach().clone().contiguous().requires_grad_(True)
ref_grad = grad.detach().clone().contiguous()
ref_mod = nn.BatchNorm2d(c).cuda().float()
ref_mod = nn.BatchNorm2d(c).to(device_type).float()
ref_mod.load_state_dict(mod.state_dict())
out = mod(input)
out.backward(grad_output)
@@ -5031,25 +5044,25 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
self.assertEqual(mod.bias.grad, ref_mod.bias.grad)
self.assertEqual(input.grad, ref_input.grad)
input = torch.randint(1, 10, (4, 8, 2, 2), dtype=torch.float32, device="cuda")
input = torch.randint(1, 10, (4, 8, 2, 2), dtype=torch.float32, device=device_type)
input = input.contiguous(memory_format=torch.channels_last).detach().requires_grad_()
grad = torch.randint(1, 10, (4, 8, 2, 2), dtype=torch.float32, device="cuda")
grad = torch.randint(1, 10, (4, 8, 2, 2), dtype=torch.float32, device=device_type)
grad = grad.contiguous(memory_format=torch.channels_last)
run_test(input, grad)
# see #42588, grad is channels_last contiguous, but grad.suggest_memory_format (rightly) return "contiguous"
# not channels_last
input = torch.randint(1, 10, (2, 8, 8, 1), dtype=torch.float32, device="cuda")
input = torch.randint(1, 10, (2, 8, 8, 1), dtype=torch.float32, device=device_type)
input = input.contiguous(memory_format=torch.channels_last).detach().requires_grad_()
grad = torch.randint(1, 10, (2, 8, 8, 1), dtype=torch.float32, device="cuda")
grad = torch.randint(1, 10, (2, 8, 8, 1), dtype=torch.float32, device=device_type)
grad = grad.permute(0, 2, 1, 3)
run_test(input, grad)
@unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
@unittest.skipIf(not TEST_CUDA and not TEST_XPU, "CUDA and XPU unavailable")
def test_batchnorm_cudnn_half(self):
# THNN
input = torch.randint(1, 10, (2, 3, 2, 2), dtype=torch.half, device="cuda", requires_grad=True)
m = nn.BatchNorm2d(3).half().cuda()
input = torch.randint(1, 10, (2, 3, 2, 2), dtype=torch.half, device=device_type, requires_grad=True)
m = nn.BatchNorm2d(3).half().to(device_type)
thnn_output = m(input)
thnn_output.sum().backward()
thnn_input_grad = input.grad.data.clone()
@@ -5065,10 +5078,10 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
self.assertEqual(cudnn_output, thnn_output)
self.assertEqual(cudnn_input_grad, thnn_input_grad, atol=1e-3, rtol=0)
@unittest.skipIf(not TEST_CUDA, "CUDA unavailable")
@unittest.skipIf(not TEST_CUDA and not TEST_XPU, "CUDA and XPU unavailable")
def test_batchnorm_nonaffine_cuda_half_input(self):
input = torch.randn(16, 3, 24, 24, dtype=torch.half, device="cuda")
m = nn.BatchNorm2d(3, affine=False).cuda().float() # keep running stats in FP32
input = torch.randn(16, 3, 24, 24, dtype=torch.half, device=device_type)
m = nn.BatchNorm2d(3, affine=False).to(device_type).float() # keep running stats in FP32
output = m(input)
self.assertEqualTypeString(output, input)
m.eval()
@@ -5155,7 +5168,7 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
self.assertTrue(torch.equal(running_var, bn.running_var))
@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
@unittest.skipIf(not torch.cuda.is_available() and not torch.xpu.is_available(), "CUDA and XPU not available")
@parametrize_test("dims", [2, 3], name_fn=lambda x: f"{x}D")
@parametrize_test("mode", ["train", "inference"], name_fn=lambda x: x)
@parametrize_test(
@@ -5193,13 +5206,13 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
name_fn=lambda f, b, m, t: f"{f}_vs_{b}{'_mixed' if m else ''}_{dtype_name(t)}"
)
def test_batchnorm(self, dims, mode, memory_format, ref_backend, mixed, dtype):
if torch.version.cuda:
if torch.version.cuda or torch.version.xpu:
if self._testMethodName in ("test_batchnorm_2D_train_NCHW_vs_cpu_mixed_bfloat16",
"test_batchnorm_3D_train_NCHW_vs_cpu_mixed_bfloat16",
"test_batchnorm_2D_train_NHWC_vs_NCHW_mixed_bfloat16",
"test_batchnorm_3D_train_NHWC_vs_NCHW_mixed_bfloat16",
"test_batchnorm_3D_train_NCHW_vs_native_mixed_float16"):
self.skipTest("Failed on CUDA")
self.skipTest(f"Failed on {device_type.upper()}")
if torch.version.hip:
if self._testMethodName in ("test_batchnorm_2D_train_NCHW_vs_cpu_mixed_bfloat16",
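
The skip above keys off build-variant markers: at most one of torch.version.cuda, torch.version.hip, and torch.version.xpu is populated for a given wheel, which is how the test tells CUDA, ROCm, and XPU builds apart. For example:

    import torch

    # Which attribute is set depends on how the wheel was built.
    print(torch.version.cuda)  # e.g. "12.4" on CUDA builds, else None
    print(torch.version.hip)   # ROCm version string on ROCm builds, else None
    print(torch.version.xpu)   # XPU toolchain version on XPU builds, else None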
@@ -5234,7 +5247,7 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
if backend in ("NHWC", "NHWC3D", "NCHW", "NCHW3D"):
return device
if backend == "native":
return "cuda"
return device_type
if backend == "cpu":
return "cpu"
else:
@@ -5304,11 +5317,11 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
memory_format = _get_memory_format_from_name(memory_format_name)
ref_memory_format = _get_backend_memory_format(ref_backend, memory_format)
ref_device = _get_ref_device(ref_backend, device="cuda")
ref_device = _get_ref_device(ref_backend, device=device_type)
size = (4, 8, 2, 2, 2) if memory_format_name in ("NCHW3D", "NHWC3D") else (4, 8, 2, 2)
inp = _create_tensor(size, memory_format, dtype, device="cuda").detach().requires_grad_()
grad = _create_tensor(size, memory_format, dtype, device="cuda")
inp = _create_tensor(size, memory_format, dtype, device=device_type).detach().requires_grad_()
grad = _create_tensor(size, memory_format, dtype, device=device_type)
ref_inp = inp.detach().clone(memory_format=ref_memory_format).to(device=ref_device).requires_grad_()
ref_grad = grad.detach().clone(memory_format=ref_memory_format).to(device=ref_device)
@@ -5318,10 +5331,10 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
def _inference(memory_format_name, ref_backend, mixed, dtype):
memory_format = _get_memory_format_from_name(memory_format_name)
ref_memory_format = _get_backend_memory_format(ref_backend, memory_format)
ref_device = _get_ref_device(ref_backend, device="cuda")
ref_device = _get_ref_device(ref_backend, device=device_type)
size = (2, 64, 50, 50, 50) if memory_format_name in ("NCHW3D", "NHWC3D") else (2, 64, 50, 50)
inp = _create_tensor(size, memory_format, dtype, device="cuda")
inp = _create_tensor(size, memory_format, dtype, device=device_type)
ref_inp = inp.detach().clone(memory_format=ref_memory_format).to(device=ref_device)
mod = _create_backend(inp, mixed).eval()
ref_mod = _create_backend(ref_inp, mixed).eval()
@@ -5336,13 +5349,13 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
else:
_inference(memory_format, ref_backend, mixed, dtype)
@unittest.skipIf(not torch.cuda.is_available(), "CUDA not available")
@unittest.skipIf(not torch.cuda.is_available() and not torch.xpu.is_available(), "CUDA and XPU not available")
def test_batchnorm_nhwc_cuda(self):
for dtype in (torch.half, torch.float):
(N, C, H, W) = 2, 64, 50, 50
model = torch.nn.BatchNorm2d(C, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
model = model.eval().cuda().to(dtype)
inp1 = torch.randn(N, C, H, W, device=torch.device('cuda'), dtype=dtype)
model = model.eval().to(device_type).to(dtype)
inp1 = torch.randn(N, C, H, W, device=device_type, dtype=dtype)
inp2 = inp1.contiguous(memory_format=torch.channels_last)
out1 = model(inp1)
out2 = model(inp2)
@@ -5413,7 +5426,7 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
@unittest.expectedFailure
def test_pdist_cuda_gradgrad_unimplemented(self):
inp = torch.randn(4, 5, device='cuda', requires_grad=True)
inp = torch.randn(4, 5, device=device_type, requires_grad=True)
gradgradcheck(F.pdist, (inp,))
# Merge into OpInfo?
@@ -5745,9 +5758,9 @@ tensor(..., device='meta', size=(1,), requires_grad=True)""")
with self.assertRaisesRegex(RuntimeError, "bicubic interpolation only supports 4D input"):
F.grid_sample(torch.empty(1, 1, 2, 2, 2), torch.empty(1, 1, 1, 1, 3), mode='bicubic')
if TEST_CUDA:
if TEST_GPU:
with self.assertRaisesRegex(RuntimeError, "Expected all tensors to be on the same device"):
F.grid_sample(input.cuda(), grid, align_corners=False)
F.grid_sample(input.to(device_type), grid, align_corners=False)
def test_affine_grid_error_checking(self):
# 2D affine

View File

@@ -2002,7 +2002,11 @@ def skipPRIVATEUSE1(fn):
# TODO: the "all" in the name hasn't been true for quite some time, as we also have, for example, XLA and MPS now.
# This should probably enumerate all available device type test base classes.
def get_all_device_types() -> list[str]:
return ["cpu"] if not torch.cuda.is_available() else ["cpu", "cuda"]
return (
["cpu"]
if not torch.accelerator.is_available()
else ["cpu", torch.accelerator.current_accelerator().type]
)
# skip since currently flex attention requires at least `avx2` support on CPU.
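
With the rewrite above, get_all_device_types reports whichever accelerator the build actually exposes instead of hard-coding CUDA. Illustrative outcomes and a usage sketch (the exact list depends on the build and hardware):

    import torch
    from torch.testing._internal.common_device_type import get_all_device_types

    # CPU-only build           -> ["cpu"]
    # CUDA build with a GPU    -> ["cpu", "cuda"]
    # XPU build with Intel GPU -> ["cpu", "xpu"]
    for device in get_all_device_types():
        print(device, torch.ones(1, device=device).device)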