Compare commits

...

6 Commits

SHA1 Message Date
fd73dfb96b Add more UTs 2025-10-23 03:30:56 -07:00
ecea269748 Port test_op_dtype_prop.py to Intel GPU 2025-10-23 06:59:27 +00:00
1db0f612b1 Extend 8 runners to 12 runners 2025-10-22 22:49:02 -07:00
6a6d4fc0e9 skip failed cases. 2025-10-22 03:30:52 -07:00
003e55b2b8 Add more 2025-10-21 20:24:03 -07:00
e9aff568a7 [Inductor UT] Enable more UTs for Intel GPU. 2025-10-21 17:59:40 -07:00
14 changed files with 333 additions and 324 deletions

View File

@ -59,14 +59,18 @@ jobs:
runner: linux.c7i.12xlarge
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 6, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 1, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 2, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 3, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 4, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 5, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 6, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 7, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 8, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 9, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 10, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 11, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 12, num_shards: 12, runner: "linux.idc.xpu" },
]}
secrets: inherit
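
The matrix above expands the default Inductor XPU test job from 8 to 12 shards, all on the linux.idc.xpu runner pool. A minimal sketch of generating that include list programmatically instead of writing it out by hand (the helper below is illustrative, not part of this PR):

import json

def build_shard_matrix(num_shards: int, runner: str = "linux.idc.xpu") -> str:
    # One entry per shard, mirroring the inline JSON the workflow embeds above.
    entries = [
        {"config": "default", "shard": i, "num_shards": num_shards, "runner": runner}
        for i in range(1, num_shards + 1)
    ]
    return json.dumps({"include": entries}, indent=2)

print(build_shard_matrix(12))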

View File

@ -79,7 +79,12 @@ from torch.testing._internal.common_utils import (
TEST_WITH_ROCM,
)
from torch.testing._internal.custom_tensor import CustomTensorPlainOut
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU, IS_BIG_GPU
from torch.testing._internal.inductor_utils import (
GPU_TYPE,
HAS_GPU,
HAS_XPU_AND_TRITON,
IS_BIG_GPU,
)
from torch.testing._internal.logging_utils import LoggingTestCase, make_logging_test
from torch.testing._internal.triton_utils import requires_gpu
from torch.utils import _pytree as pytree
@ -1543,7 +1548,9 @@ class AOTInductorTestsTemplate:
)
# scaled_dot_product_flash_attention
@unittest.skipIf(not SM80OrLater, "bfloat16 only supported in sm80+")
@unittest.skipIf(
not HAS_XPU_AND_TRITON and not SM80OrLater, "bfloat16 only supported in sm80+"
)
def test_sdpa(self):
class Model(torch.nn.Module):
def __init__(self) -> None:
@ -5574,8 +5581,8 @@ class AOTInductorTestsTemplate:
).run(code)
def test_aoti_debug_printing_model_inputs_codegen(self):
if self.device != "cuda":
raise unittest.SkipTest("requires CUDA")
if self.device != GPU_TYPE:
raise unittest.SkipTest("requires GPU")
class Model(torch.nn.Module):
def __init__(self):
@ -5911,11 +5918,12 @@ class AOTInductorTestsTemplate:
example_inputs = (torch.randn(2, 128, 4096, device=self.device),)
self.check_model(Model(), example_inputs, dynamic_shapes={"x": {0: bs}})
@skipIfXpu(msg="Currently Profiling not enabled on XPU CI builds")
@requires_gpu
def test_d2h_copy(self):
# device to host copy should always have the same stride
if "cuda" not in self.device:
raise unittest.SkipTest("This test is only for CUDA")
if GPU_TYPE not in self.device:
raise unittest.SkipTest("This test is only for GPU")
class ToCpuModel(nn.Module):
def forward(self, x):
@ -5939,7 +5947,7 @@ class AOTInductorTestsTemplate:
with torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
getattr(torch.profiler.ProfilerActivity, GPU_TYPE.upper()),
],
) as prof:
true_res = aoti_model(input_tensor)
@ -6367,8 +6375,8 @@ class AOTInductorTestsTemplate:
runner.free_inactive_constant_buffer()
def test_update_user_managed_buffer(self):
if self.device != "cuda":
raise unittest.SkipTest("requires CUDA")
if self.device != GPU_TYPE:
raise unittest.SkipTest("requires GPU")
class Model(torch.nn.Module):
def __init__(self, n, k, device):
@ -6412,10 +6420,10 @@ class AOTInductorTestsTemplate:
"L__self___weight": torch.randn(N, K, device=self.device),
"L__self___bias": torch.randn(N, device=self.device),
}
mem_before, _ = torch.cuda.mem_get_info(self.device)
mem_before, _ = getattr(torch, GPU_TYPE).mem_get_info(self.device)
# Do not use user managed_buffer, should have less free memory.
runner.update_constant_buffer(new_weights, True, False, False)
mem_after, _ = torch.cuda.mem_get_info(self.device)
mem_after, _ = getattr(torch, GPU_TYPE).mem_get_info(self.device)
self.assertGreater(mem_before, mem_after)
runner.swap_constant_buffer()
@ -6447,10 +6455,10 @@ class AOTInductorTestsTemplate:
"L__self___weight": torch.randn(N, K, device=self.device),
"L__self___bias": torch.randn(N, device=self.device),
}
mem_before, _ = torch.cuda.mem_get_info(self.device)
mem_before, _ = getattr(torch, GPU_TYPE).mem_get_info(self.device)
# Try user managed_buffer, should have same free memory.
runner.update_constant_buffer(new_weights, True, False, True)
mem_after, _ = torch.cuda.mem_get_info(self.device)
mem_after, _ = getattr(torch, GPU_TYPE).mem_get_info(self.device)
self.assertEqual(mem_before, mem_after, atol=1e-3, rtol=1e-3)
runner.swap_constant_buffer()
@ -6522,8 +6530,8 @@ class AOTInductorTestsTemplate:
"To enable after the C shim FC window ends",
)
def test_misaligned_input_1(self):
if self.device != "cuda":
raise unittest.SkipTest("CUDA test only")
if self.device != GPU_TYPE:
raise unittest.SkipTest("GPU test only")
class Model(torch.nn.Module):
def forward(self, x):
@ -6549,8 +6557,8 @@ class AOTInductorTestsTemplate:
torch.testing.assert_close(actual, expected)
def test_misaligned_input_2(self):
if self.device != "cuda":
raise unittest.SkipTest("CUDA test only")
if self.device != GPU_TYPE:
raise unittest.SkipTest("GPU test only")
class Model(torch.nn.Module):
def forward(self, x):
@ -7098,8 +7106,8 @@ class AOTInductorTestsTemplate:
self.check_model(Model(), example_inputs, dynamic_shapes=dynamic_shapes)
def test_sym_expr_indexing(self):
if self.device != "cuda":
raise unittest.SkipTest("requires CUDA")
if self.device != GPU_TYPE:
raise unittest.SkipTest("requires GPU")
class Repro(torch.nn.Module):
def __init__(self) -> None:
@ -7117,7 +7125,7 @@ class AOTInductorTestsTemplate:
arange_1 = torch.ops.aten.arange.start(
180,
181,
device=torch.device(type="cuda", index=0),
device=torch.device(type=GPU_TYPE, index=0),
pin_memory=False,
)
add_14 = torch.ops.aten.add.Tensor(arange_1, 198)
@ -7636,8 +7644,6 @@ GPU_TEST_FAILURES = {
"test_quantized_linear_bias_none": fail_gpu(("cuda", "xpu")),
# No scaled_dot_product_efficient_attention implementation for XPU yet.
"test_scaled_dot_product_efficient_attention": fail_gpu(("xpu",)),
# No fft implementation for XPU yet.
"test_fft_c2c": fail_gpu(("xpu",), is_skip=True),
}
MPS_TEST_FAILURES = {
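
The rewrites in this file replace hard-coded CUDA calls with lookups keyed on GPU_TYPE: getattr(torch, GPU_TYPE) reaches torch.cuda or torch.xpu (both expose mem_get_info), and GPU_TYPE.upper() selects the matching ProfilerActivity member. A hedged sketch of the pattern, assuming a machine where the chosen backend is actually available:

import torch
from torch.testing._internal.inductor_utils import GPU_TYPE

device_mod = getattr(torch, GPU_TYPE)                 # torch.cuda or torch.xpu
free_bytes, total_bytes = device_mod.mem_get_info()   # same call on either backend

gpu_activity = getattr(torch.profiler.ProfilerActivity, GPU_TYPE.upper())
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, gpu_activity]
):
    torch.randn(1024, device=GPU_TYPE).sum()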

View File

@ -28,12 +28,7 @@ from torch.export.pt2_archive._package import (
load_weights_to_pt2_contents,
)
from torch.testing._internal.common_cuda import _get_torch_cuda_version
from torch.testing._internal.common_utils import (
IS_FBCODE,
skipIfRocm,
skipIfXpu,
TEST_CUDA,
)
from torch.testing._internal.common_utils import IS_FBCODE, skipIfRocm, skipIfXpu
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
@ -688,13 +683,13 @@ class TestAOTInductorPackage(TestCase):
self.assertEqual(loaded1(*example_inputs1), ep1.module()(*example_inputs1))
self.assertEqual(loaded2(*example_inputs2), ep2.module()(*example_inputs2))
@unittest.skipIf(not TEST_CUDA, "requires cuda")
@unittest.skipIf(not HAS_GPU, "requires gpu")
def test_duplicate_calls(self):
options = {
"aot_inductor.package": True,
}
device = "cuda"
device = GPU_TYPE
class Model1(torch.nn.Module):
def __init__(self) -> None:

View File

@ -9,7 +9,7 @@ import unittest
import torch
from torch._inductor import config
from torch.testing._internal.common_utils import IS_LINUX, skipIfXpu
from torch.testing._internal.common_utils import IS_LINUX
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
@ -48,7 +48,6 @@ class TestKernelBestConfig(TestCase):
config.max_autotune = cls.original_max_autotune
super().tearDownClass()
@skipIfXpu
def test_best_config_has_triton_cache_key(self):
with tempfile.TemporaryDirectory() as tmpdir:
os.environ["TORCHINDUCTOR_CACHE_DIR"] = tmpdir

View File

@ -68,6 +68,7 @@ from torch.testing._internal.inductor_utils import (
HAS_GPU,
HAS_MULTIGPU,
HAS_TRITON,
HAS_XPU_AND_TRITON,
patch_inductor_backend,
requires_gpu,
requires_triton,
@ -1215,7 +1216,7 @@ class TestFxGraphCache(TestCase):
self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 1)
self.assertEqual(counters["inductor"]["fxgraph_lookup_write_file"], 1)
@requires_cuda_and_triton
@requires_gpu()
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
@with_tf32_off
@ -1238,7 +1239,7 @@ class TestFxGraphCache(TestCase):
def fn2(q, k, v):
return flex_attention(q, k, v, score_mod=score_mod2, block_mask=block_mask)
a, b, c = (torch.randn(1, 4, 512, 64).cuda() for _ in range(3))
a, b, c = (torch.randn(1, 4, 512, 64).to(GPU_TYPE) for _ in range(3))
compiled_fn = torch.compile(fn)
compiled_fn2 = torch.compile(fn2)
@ -2923,8 +2924,8 @@ class TestAutotuneCache(TestCase):
for k in global_stats.triton.cache.keys():
self.assertRegex(k, r"triton:[0-9a-f]{64}::[0-9a-f]{64}:c[0-9]+")
@requires_cuda_and_triton
@unittest.skipIf(not SM80OrLater, "Requires SM80+")
@requires_gpu()
@unittest.skipIf(not HAS_XPU_AND_TRITON and not SM80OrLater, "Requires SM80+")
@config.patch({"fx_graph_cache": False})
@config.patch({"fx_graph_remote_cache": False})
@config.patch({"autotune_local_cache": False})
@ -2942,10 +2943,10 @@ class TestAutotuneCache(TestCase):
def f(x, y, a, b):
return Model()(x, y, a, b)
x = torch.randn(100, 100).cuda()
y = torch.randn(100, 100).cuda()
a = torch.randn(1000, 100).cuda()
b = torch.randn(1000, 100).cuda()
x = torch.randn(100, 100).to(GPU_TYPE)
y = torch.randn(100, 100).to(GPU_TYPE)
a = torch.randn(1000, 100).to(GPU_TYPE)
b = torch.randn(1000, 100).to(GPU_TYPE)
f_compiled = torch.compile(f, fullgraph=True)
with PatchCaches():
@ -2964,8 +2965,8 @@ class TestAutotuneCache(TestCase):
for k in global_stats.triton.cache.keys():
self.assertRegex(k, r"triton:[0-9a-f]{64}::[0-9a-f]{64}:c[0-9]+")
@requires_cuda_and_triton
@unittest.skipIf(not SM80OrLater, "Requires SM80+")
@requires_gpu()
@unittest.skipIf(not HAS_XPU_AND_TRITON and not SM80OrLater, "Requires SM80+")
@config.patch({"fx_graph_cache": False})
@config.patch({"fx_graph_remote_cache": False})
@config.patch({"autotune_local_cache": True})
@ -2983,12 +2984,12 @@ class TestAutotuneCache(TestCase):
f_compiled = torch.compile(f, fullgraph=True)
a = torch.randn(101, 100).cuda()
b = torch.randn(101, 100).cuda()
c = torch.randn(102, 100).cuda()
d = torch.randn(102, 100).cuda()
e = torch.randn(103, 100).cuda()
f = torch.randn(103, 100).cuda()
a = torch.randn(101, 100).to(GPU_TYPE)
b = torch.randn(101, 100).to(GPU_TYPE)
c = torch.randn(102, 100).to(GPU_TYPE)
d = torch.randn(102, 100).to(GPU_TYPE)
e = torch.randn(103, 100).to(GPU_TYPE)
f = torch.randn(103, 100).to(GPU_TYPE)
with PatchCaches():
f_compiled(a, b, c, d, e, f)
@ -3025,8 +3026,8 @@ class TestAutotuneCache(TestCase):
self.assertRegex(k, r"triton:[0-9a-f]{64}::[0-9a-f]{64}:c[0-9]+")
@requires_triton()
@requires_cuda_and_triton
@unittest.skipIf(not SM80OrLater, "Requires SM80+")
@requires_gpu()
@unittest.skipIf(not HAS_XPU_AND_TRITON and not SM80OrLater, "Requires SM80+")
@config.patch({"fx_graph_cache": False})
@config.patch({"fx_graph_remote_cache": False})
@config.patch({"bundled_autotune_remote_cache": False})
@ -3089,8 +3090,8 @@ class TestAutotuneCache(TestCase):
class TestRemoteAOTAutogradCache(TestCase):
@requires_cuda_and_triton
@unittest.skipIf(not SM80OrLater, "Requires SM80+")
@requires_gpu()
@unittest.skipIf(not HAS_XPU_AND_TRITON and not SM80OrLater, "Requires SM80+")
@config.patch({"fx_graph_cache": False})
@config.patch({"fx_graph_remote_cache": True})
@torch._functorch.config.patch({"enable_autograd_cache": False})
@ -3100,8 +3101,8 @@ class TestRemoteAOTAutogradCache(TestCase):
return a + b
f_compiled = torch.compile(f)
a = torch.randn(101, 100, device="cuda", requires_grad=False)
b = torch.randn(101, 100, device="cuda", requires_grad=False)
a = torch.randn(101, 100, device=GPU_TYPE, requires_grad=False)
b = torch.randn(101, 100, device=GPU_TYPE, requires_grad=False)
with PatchCaches():
f_compiled(a, b)
@ -3128,8 +3129,8 @@ class TestRemoteAOTAutogradCache(TestCase):
for k in global_stats.fx_graph.cache.keys():
self.assertRegex(k, r"pt2:fx-graph-v1::[0-9a-z]{52}:c[0-9]+")
@requires_cuda_and_triton
@unittest.skipIf(not SM80OrLater, "Requires SM80+")
@requires_gpu()
@unittest.skipIf(not HAS_XPU_AND_TRITON and not SM80OrLater, "Requires SM80+")
@config.patch({"fx_graph_cache": False})
@config.patch({"fx_graph_remote_cache": True})
@torch._functorch.config.patch({"enable_autograd_cache": False})
@ -3203,7 +3204,7 @@ class TestUtils(TestCase):
# This combination of settings exposed a bug where we cleared the
# PyCodeCache disk artifacts while they were still needed:
@requires_cuda_and_triton
@requires_gpu()
@config.patch(
{
"coordinate_descent_tuning": True,
@ -3212,9 +3213,9 @@ class TestUtils(TestCase):
)
def test_force_disable_coordinate_descent(self):
def fn():
inp = torch.randn(32, 50, 768, device="cuda")
weight = torch.randn(768, 768, device="cuda")
layer = torch.nn.LayerNorm(768, device="cuda")
inp = torch.randn(32, 50, 768, device=GPU_TYPE)
weight = torch.randn(768, 768, device=GPU_TYPE)
layer = torch.nn.LayerNorm(768, device=GPU_TYPE)
return layer(inp @ weight)
torch.compile(fn)()
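
The skip conditions rewritten above keep the SM80 requirement for CUDA while letting XPU-with-Triton runs through: `not HAS_XPU_AND_TRITON and not SM80OrLater` only fires on pre-SM80 CUDA machines. A small sketch of wrapping that condition in a reusable decorator (the decorator name is hypothetical; the flags are the same ones imported in this file):

import unittest
import torch
from torch.testing._internal.common_cuda import SM80OrLater
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_XPU_AND_TRITON

def requires_bf16_capable_gpu(fn):
    # Skip only on CUDA older than SM80; XPU + Triton is accepted as-is.
    return unittest.skipIf(
        not HAS_XPU_AND_TRITON and not SM80OrLater,
        "Requires SM80+ (CUDA) or an XPU with Triton",
    )(fn)

class Example(unittest.TestCase):
    @requires_bf16_capable_gpu
    def test_bf16_add(self):
        x = torch.randn(8, 8, device=GPU_TYPE, dtype=torch.bfloat16)
        self.assertEqual((x + x).dtype, torch.bfloat16)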

View File

@ -201,14 +201,17 @@ KERNEL_COUNT_OVERRIDES = {
"test_adamw_amsgrad_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adamw_amsgrad_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adamw_tensor_lr_tensor_betas_amsgrad_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adamw_tensor_lr_tensor_betas_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adamw_tensor_lr_tensor_betas_amsgrad_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adamw_tensor_lr_tensor_betas_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adamw_tensor_lr_tensor_betas_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adamw_tensor_lr_amsgrad_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adamw_tensor_lr_amsgrad_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adam_tensor_lr_amsgrad_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adam_tensor_lr_amsgrad_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adam_tensor_lr_tensor_betas_amsgrad_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adam_tensor_lr_tensor_betas_amsgrad_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adam_tensor_lr_tensor_betas_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adam_tensor_lr_tensor_betas_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adam_amsgrad_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adam_amsgrad_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adadelta_tensor_lr_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
@ -247,9 +250,9 @@ KERNEL_COUNT_OVERRIDES = {
"test_adamax_tensor_lr_weight_decay_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adamax_tensor_lr_weight_decay_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_asgd_tensor_lr_weight_decay_maximize_capturable_cuda": lambda x: assert_expected_inline(x, """5"""),
"test_asgd_tensor_lr_weight_decay_maximize_capturable_xpu": lambda x: assert_expected_inline(x, """8"""),
"test_asgd_tensor_lr_weight_decay_maximize_capturable_xpu": lambda x: assert_expected_inline(x, """5"""),
"test_nadam_tensor_lr_weight_decay_momentum_decay_decoupled_weight_decay_capturable_cuda": lambda x: assert_expected_inline(x, """6"""), # noqa: B950
"test_nadam_tensor_lr_weight_decay_momentum_decay_decoupled_weight_decay_capturable_xpu": lambda x: assert_expected_inline(x, """9"""), # noqa: B950
"test_nadam_tensor_lr_weight_decay_momentum_decay_decoupled_weight_decay_capturable_xpu": lambda x: assert_expected_inline(x, """6"""), # noqa: B950
"test_radam_tensor_lr_capturable_weight_decay_decoupled_weight_decay_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_radam_tensor_lr_capturable_weight_decay_decoupled_weight_decay_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_sgd_tensor_lr_cpu": lambda x: assert_expected_inline(x, """2"""),
@ -436,7 +439,7 @@ def make_test(
closure=None,
scheduler_cls=None,
kernel_count=2,
device="cuda",
device=GPU_TYPE,
**kwargs,
):
@config.patch("score_fusion_memory_threshold", 1)

View File

@ -18,7 +18,7 @@ from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
)
from torch.testing._internal.inductor_utils import HAS_CUDA_AND_TRITON
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
class TestingHeuristics(InductorChoices):
@ -176,7 +176,7 @@ class CooperativeReductionTests(TestCase):
return reduction_fn(x + y, dim=-1)
reduction_fn = getattr(torch, name)
args = [torch.randn(1, 1024**2, device="cuda", dtype=dtype) for _ in range(2)]
args = [torch.randn(1, 1024**2, device=GPU_TYPE, dtype=dtype) for _ in range(2)]
self.run_and_check(fn, args, dtype)
def test_bool_reduction_fns(self):
@ -190,7 +190,7 @@ class CooperativeReductionTests(TestCase):
torch.all(x > y),
]
args = [torch.randn(1024, device="cuda") for _ in range(2)]
args = [torch.randn(1024, device=GPU_TYPE) for _ in range(2)]
source_code = self.run_and_check(fn, args)
if "async_compile.multi_kernel" in source_code:
return
@ -204,7 +204,7 @@ class CooperativeReductionTests(TestCase):
def fn(x):
return x.mean(), x.std() + x.min()
args = [torch.randn([bs, count], device="cuda")]
args = [torch.randn([bs, count], device=GPU_TYPE)]
self.run_and_check(fn, args)
def test_chained_reductions(self):
@ -213,18 +213,19 @@ class CooperativeReductionTests(TestCase):
x = x + torch.softmax(x, 1)
return x
args = [torch.randn(4, 100000, device="cuda")]
args = [torch.randn(4, 100000, device=GPU_TYPE)]
source_code = self.run_and_check(fn, args)
if "async_compile.multi_kernel" in source_code:
return
# With online softmax, the computation of max and sum are done
# jointly and they share a single barrier call.
expected_num_barrier = 8 if config.online_softmax else 16
# XPU doesn't support online softmax yet.
expected_num_barrier = 8 if config.online_softmax and GPU_TYPE != "xpu" else 16
self.assertEqual(
source_code.count("triton_helpers.x_grid_barrier"), expected_num_barrier
)
self.assertEqual(source_code.count("empty_strided_cuda"), 5)
self.assertEqual(source_code.count(f"empty_strided_{GPU_TYPE}"), 5)
def test_reduce_split(self):
def fn(a, b):
@ -233,8 +234,8 @@ class CooperativeReductionTests(TestCase):
return a1, b1
inps = [
torch.rand(2048, 512, device="cuda"),
torch.rand(20, 20, device="cuda"),
torch.rand(2048, 512, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
]
self.run_and_check(fn, inps, expect_kernel_count=2)
@ -290,7 +291,7 @@ class TestFixedConfigs(TestCase):
def fn(x):
return torch.softmax(x + 1, dim=-1) + x
args = [torch.randn(8, 8000, device="cuda")]
args = [torch.randn(8, 8000, device=GPU_TYPE)]
self._check(fn, args, persistent=persistent, cooperative=cooperative, cfg=cfg)
@parametrize(
@ -315,7 +316,7 @@ class TestFixedConfigs(TestCase):
cfg = {"XBLOCK": 64, "RSPLIT": rsplit, "num_warps": 8}
if not persistent:
cfg["R0_BLOCK"] = 64
args = [torch.randn(x, r, device="cuda")]
args = [torch.randn(x, r, device=GPU_TYPE)]
self._check(fn, args, persistent=persistent, cfg=cfg)
@parametrize("persistent", [True, False])
@ -335,8 +336,8 @@ class TestFixedConfigs(TestCase):
args = [
torch.stack(
[
torch.arange(10, 4096, device="cuda"),
-torch.arange(10, 4096, device="cuda"),
torch.arange(10, 4096, device=GPU_TYPE),
-torch.arange(10, 4096, device=GPU_TYPE),
]
)
]
@ -346,12 +347,12 @@ class TestFixedConfigs(TestCase):
[
torch.tensor(
[0.0] * 150 + [float("inf")] * 150,
device="cuda",
device=GPU_TYPE,
dtype=torch.float32,
),
torch.tensor(
[0.0] * 150 + [-float("inf")] * 150,
device="cuda",
device=GPU_TYPE,
dtype=torch.float32,
),
]
@ -374,12 +375,12 @@ class TestFixedConfigs(TestCase):
cfg = {"XBLOCK": 128, "RSPLIT": rsplit, "num_warps": 16, "num_stages": 1}
if not persistent:
cfg["R0_BLOCK"] = 64
args = [torch.randn(1024, device="cuda") for _ in range(2)]
args = [torch.randn(1024, device=GPU_TYPE) for _ in range(2)]
self._check(fn, args, persistent=persistent, cfg=cfg)
if __name__ == "__main__":
from torch._dynamo.test_case import run_tests
if HAS_CUDA_AND_TRITON:
if HAS_GPU:
run_tests(needs="filelock")
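
The assertions in this file now look for f"empty_strided_{GPU_TYPE}" rather than empty_strided_cuda, so the same source inspection works for XPU wrapper code. A hedged sketch of checking compiled output for that allocation helper with run_and_get_code (generated names can vary across Inductor versions):

import torch
from torch._inductor.utils import run_and_get_code
from torch.testing._internal.inductor_utils import GPU_TYPE

def fn(x):
    return torch.softmax(x + 1, dim=-1) + x

x = torch.randn(8, 8000, device=GPU_TYPE)
result, source_codes = run_and_get_code(torch.compile(fn), x)
# The wrapper allocates outputs via a backend-specific helper, e.g.
# empty_strided_cuda(...) on CUDA or empty_strided_xpu(...) on XPU.
print(source_codes[0].count(f"empty_strided_{GPU_TYPE}"))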

View File

@ -14,8 +14,8 @@ from torch.testing._internal.common_utils import (
IS_FBCODE,
parametrize,
)
from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA_AND_TRITON
from torch.testing._internal.triton_utils import requires_cuda_and_triton
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, HAS_GPU
from torch.testing._internal.triton_utils import requires_gpu
from torch.utils._pytree import tree_flatten
@ -23,11 +23,11 @@ aten = torch.ops.aten
try:
try:
from .test_torchinductor import check_model, check_model_cuda
from .test_torchinductor import check_model, check_model_gpu
except ImportError:
from test_torchinductor import ( # @manual=fbcode//caffe2/test/inductor:test_inductor-library
check_model,
check_model_cuda,
check_model_gpu,
)
except (unittest.SkipTest, ImportError) as e:
sys.stderr.write(f"{type(e)}: {e}\n")
@ -188,30 +188,30 @@ decomp_ops = parametrize("op", compose_ops, name_fn=lambda f: f.__name__)
def gen_args(op):
if op in un_ops_under_test:
return (
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
)
elif op in bin_ops_under_test:
return (
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
)
else:
return (
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
)
@instantiate_parametrized_tests
class ForeachTests(TestCase):
check_model_cuda = check_model_cuda
check_model_gpu = check_model_gpu
check_model_cpu = check_model
check_kernel_count = True
@ -239,7 +239,7 @@ class ForeachTests(TestCase):
def fn(a0, a1, b0, b1, c0, c1):
return op([a0, a1], [b0, b1], [c0, c1])
self.check_model_cuda(
self.check_model_gpu(
fn,
gen_args(op),
)
@ -248,50 +248,50 @@ class ForeachTests(TestCase):
def fn(a0, a1):
return op([a0, a1], 3.3)
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
),
)
def _test_single_scalar_tensor(self, op):
def fn(a0, a1):
return op([a0, a1], torch.tensor(3.3, device="cuda:0"))
return op([a0, a1], torch.tensor(3.3, device=GPU_TYPE))
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
),
)
# called in test_cuda_cpp_wrapper.py
@requires_cuda_and_triton
@requires_gpu
def test_foreach_cpp_wrapper_cuda(self):
self._test_single_list(op=torch._foreach_add)
@requires_cuda_and_triton
@requires_gpu
@all_ops
def test_single_list(self, op):
self._test_single_list(op)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@scalar_bin_ops
def test_single_scalar(self, op):
self._test_single_scalar(op)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@scalar_tensor_bin_ops
def test_single_scalar_tensor(self, op):
self._test_single_scalar_tensor(op)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@all_ops
def test_scheduler_fusion_list(self, op):
if op in un_ops_under_test:
@ -312,31 +312,31 @@ class ForeachTests(TestCase):
c = op([a0, a1], [b0, b1], [c0, c1])
return c, torch._foreach_add([a0, a1], c)
self.check_model_cuda(
self.check_model_gpu(
fn,
gen_args(op),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@scalar_bin_ops
def test_scheduler_fusion_scalar(self, op):
def fn(a0, a1):
c = op([a0, a1], 3.4)
return c, torch._foreach_add([a0, a1], c)
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@scalar_bin_ops
def test_broadcasting(self, op):
def fn(a0, a1, b0, b1):
@ -345,17 +345,17 @@ class ForeachTests(TestCase):
fn_opt = torch.compile(fn)
inputs = (
torch.rand(10, 1, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(1, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 1, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(1, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
)
actual = fn_opt(*inputs)
expected = fn(*inputs)
self.assertEqual(actual, expected)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@all_ops
def test_singleton_lists(self, op):
if op in un_ops_under_test:
@ -363,15 +363,15 @@ class ForeachTests(TestCase):
def fn(a0):
return op([a0])
args = (torch.rand(10, 10, device="cuda:0"),)
args = (torch.rand(10, 10, device=GPU_TYPE),)
elif op in bin_ops_under_test:
def fn(a0, b0):
return op([a0], [b0])
args = (
torch.rand(10, 10, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
)
else:
@ -380,19 +380,19 @@ class ForeachTests(TestCase):
return op([a0], [b0], [c0])
args = (
torch.rand(10, 10, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
)
self.check_model_cuda(
self.check_model_gpu(
fn,
args,
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@bin_ops
def test_type_promotion(self, op):
def fn(a0, a1, b0, b1):
@ -403,17 +403,17 @@ class ForeachTests(TestCase):
max32 = torch.iinfo(torch.int32).max
max64 = torch.iinfo(torch.int64).max
inputs = (
torch.randint(max32, (10, 10), device="cuda:0", dtype=torch.int32),
torch.randint(max32, (20, 20), device="cuda:0", dtype=torch.int32),
torch.randint(max32, (10, 10), device="cuda:0", dtype=torch.int32),
torch.randint(max64, (20, 20), device="cuda:0", dtype=torch.int64),
torch.randint(max32, (10, 10), device=GPU_TYPE, dtype=torch.int32),
torch.randint(max32, (20, 20), device=GPU_TYPE, dtype=torch.int32),
torch.randint(max32, (10, 10), device=GPU_TYPE, dtype=torch.int32),
torch.randint(max64, (20, 20), device=GPU_TYPE, dtype=torch.int64),
)
actual = fn_opt(*inputs)
expected = fn(*inputs)
self.assertEqual(actual, expected)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@scalar_bin_ops
def test_kernel_split_arg_limit_list(self, op):
# NB: foreach_copy won't pass this test because it will dce one set of buffers
@ -426,8 +426,8 @@ class ForeachTests(TestCase):
max_args = 370
max_list_len = (max_args // 3) + 1
inputs = (
[torch.rand(10, 10, device="cuda:0") for _ in range(max_list_len)],
[torch.rand(10, 10, device="cuda:0") for _ in range(max_list_len)],
[torch.rand(10, 10, device=GPU_TYPE) for _ in range(max_list_len)],
[torch.rand(10, 10, device=GPU_TYPE) for _ in range(max_list_len)],
)
actual = fn_opt(*inputs)
@ -435,7 +435,7 @@ class ForeachTests(TestCase):
self.assertEqual(actual, expected)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
@requires_cuda_and_triton
@requires_gpu
@scalar_bin_ops
@unittest.skip(
"Triton recursion depth exceeded: https://github.com/triton-lang/triton/issues/1763"
@ -448,27 +448,27 @@ class ForeachTests(TestCase):
max_args = 370
max_list_len = (max_args // 2) + 1
inputs = ([torch.rand(10, 10, device="cuda:0") for _ in range(max_list_len)],)
inputs = ([torch.rand(10, 10, device=GPU_TYPE) for _ in range(max_list_len)],)
actual = fn_opt(*inputs)
expected = fn(*inputs)
self.assertEqual(actual, expected)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
@requires_cuda_and_triton
@requires_gpu
@bin_ops
def test_fusion_duplicate_buffer_list(self, op):
def fn(a0, a1, b0, b1):
c = op([a0, a1], [b0, b1])
return op([a0, b0], [c[0], c[0]])
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
),
reference_in_float=False,
check_lowp=False,
@ -479,7 +479,7 @@ class ForeachTests(TestCase):
kernel_count = 2
self.assertEqual(torch._inductor.metrics.generated_kernel_count, kernel_count)
@requires_cuda_and_triton
@requires_gpu
@all_ops
def test_non_foreach_consumer_list(self, op):
if op in un_ops_under_test:
@ -500,31 +500,31 @@ class ForeachTests(TestCase):
c = op([a0, a1], [b0, b1], [c0, c1])
return torch.mul(c[0], a0)
self.check_model_cuda(
self.check_model_gpu(
fn,
gen_args(op),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@scalar_bin_ops
def test_non_foreach_consumer_scalar(self, op):
def fn(a0, a1):
c = op([a0, a1], 4.7)
return torch.mul(c[0], a0)
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@all_ops
def test_non_foreach_producer_list(self, op):
if op in un_ops_under_test:
@ -548,13 +548,13 @@ class ForeachTests(TestCase):
c1 = torch.add(a1, b1)
return op([a0, a1], [b0, b1], [c0, c1])
self.check_model_cuda(
self.check_model_gpu(
fn, gen_args(op), reference_in_float=False, check_lowp=False
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@scalar_bin_ops
def test_non_foreach_producer_scalar(self, op):
def fn(a0, a1, b0, b1):
@ -562,19 +562,19 @@ class ForeachTests(TestCase):
c1 = torch.mul(a1, b1)
return op([c0, c1], 5.6)
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@all_ops
def test_non_foreach_consumer_producer_list(self, op):
if op in un_ops_under_test:
@ -607,7 +607,7 @@ class ForeachTests(TestCase):
e1 = torch.mul(d[1], a1)
return [e0, e1]
self.check_model_cuda(
self.check_model_gpu(
fn,
gen_args(op),
reference_in_float=False,
@ -616,7 +616,7 @@ class ForeachTests(TestCase):
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@scalar_bin_ops
def test_non_foreach_consumer_producer_scalar(self, op):
def fn(a0, a1, b0, b1):
@ -627,13 +627,13 @@ class ForeachTests(TestCase):
e1 = torch.mul(d[1], a1)
return [e0, e1]
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
),
reference_in_float=False,
check_lowp=False,
@ -641,7 +641,7 @@ class ForeachTests(TestCase):
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@bin_ops
@torch._dynamo.config.patch("automatic_dynamic_shapes", False)
@torch._dynamo.config.patch("assume_static_by_default", False)
@ -651,17 +651,17 @@ class ForeachTests(TestCase):
return op([a0, a1], [b0, b1])
inputs = (
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
)
self.check_model_cuda(fn, inputs)
self.check_model_gpu(fn, inputs)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
@requires_cuda_and_triton
@requires_gpu
@torch._dynamo.config.patch("automatic_dynamic_shapes", False)
@torch._dynamo.config.patch("assume_static_by_default", False)
@torch._inductor.config.patch("combo_kernel_foreach_dynamic_shapes", True)
@ -670,17 +670,17 @@ class ForeachTests(TestCase):
return op([a0, a1], [b0, b1])
inputs = (
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
)
self.check_model_cuda(fn, inputs)
self.check_model_gpu(fn, inputs)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@torch._dynamo.config.patch("automatic_dynamic_shapes", False)
@torch._dynamo.config.patch("assume_static_by_default", False)
@torch._inductor.config.patch("combo_kernel_foreach_dynamic_shapes", True)
@ -690,13 +690,13 @@ class ForeachTests(TestCase):
return op([a0, a1], [b0, b1])
inputs = (
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
)
self.check_model_cuda(fn, inputs)
self.check_model_gpu(fn, inputs)
@unittest.skipIf(IS_FBCODE, "cpp compile not supported in fbcode")
@bin_ops
@ -715,27 +715,27 @@ class ForeachTests(TestCase):
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
@requires_cuda_and_triton
@requires_gpu
@decomp_ops
def test_decomp(self, op):
def fn(a0, a1, b0, b1, c0, c1):
return op([a0, a1], [b0, b1], [c0, c1], value=0.5)
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
def test_fuse_concat(self):
def fn(x1, x2, x3, w1, w2, w3):
x = torch.stack([x1, x2, x3])
@ -745,73 +745,73 @@ class ForeachTests(TestCase):
return y
x1 = torch.randn(5, 4).cuda()
x1 = torch.randn(5, 4).to(GPU_TYPE)
x2 = x1 + 1
x3 = x1 + 2
w1 = torch.randn(4, 3).cuda()
w1 = torch.randn(4, 3).to(GPU_TYPE)
w2 = w1 + 1
w3 = w1 + 2
args = (x1, x2, x3, w1, w2, w3)
self.check_model_cuda(fn, args)
self.check_model_gpu(fn, args)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
@requires_cuda_and_triton
@requires_gpu
def test_zero_elems(self):
def fn(a0, a1, b0, b1):
return torch._foreach_add([a0, a1], [b0, b1])
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(0, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(0, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(0, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(0, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@bin_ops
def test_2d_blocking(self, op):
def fn(a0, a1, b0, b1):
return op([a0, a1], [b0, b1])
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 40, device="cuda:0"),
torch.rand(10, 30, device="cuda:0"),
torch.rand(40, 10, device="cuda:0").t(),
torch.rand(30, 10, device="cuda:0").t(),
torch.rand(10, 40, device=GPU_TYPE),
torch.rand(10, 30, device=GPU_TYPE),
torch.rand(40, 10, device=GPU_TYPE).t(),
torch.rand(30, 10, device=GPU_TYPE).t(),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@bin_ops
def test_2d_blocking_partitioning(self, op):
def fn(a0, a1, b0, b1):
return op([a0, a1], [b0, b1])
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(30, 20, device="cuda:0"),
torch.rand(40, 30, device="cuda:0"),
torch.rand(30, 20, device="cuda:0"),
torch.rand(30, 40, device="cuda:0").t(),
torch.rand(30, 20, device=GPU_TYPE),
torch.rand(40, 30, device=GPU_TYPE),
torch.rand(30, 20, device=GPU_TYPE),
torch.rand(30, 40, device=GPU_TYPE).t(),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
@requires_cuda_and_triton
@requires_gpu
@bin_ops
def test_2d_blocking_partitioning_elems(self, op):
"""2D blocking should be grouped by number of yelems"""
@ -819,21 +819,21 @@ class ForeachTests(TestCase):
def fn(a0, a1, a2, b0, b1, b2):
return op([a0, a1, a2], [b0, b1, b2])
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 20, device="cuda:0"),
torch.rand(30, 20, device="cuda:0"),
torch.rand(10, 30, device="cuda:0"),
torch.rand(20, 10, device="cuda:0").t(),
torch.rand(20, 30, device="cuda:0").t(),
torch.rand(30, 10, device="cuda:0").t(),
torch.rand(10, 20, device=GPU_TYPE),
torch.rand(30, 20, device=GPU_TYPE),
torch.rand(10, 30, device=GPU_TYPE),
torch.rand(20, 10, device=GPU_TYPE).t(),
torch.rand(20, 30, device=GPU_TYPE).t(),
torch.rand(30, 10, device=GPU_TYPE).t(),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
@requires_cuda_and_triton
@requires_gpu
@bin_ops
@torch._inductor.config.patch("combo_kernel_allow_mixed_sizes", 2)
def test_2d_blocking_partitioning_mixed_sizes(self, op):
@ -842,21 +842,21 @@ class ForeachTests(TestCase):
def fn(a0, a1, a2, b0, b1, b2):
return op([a0, a1, a2], [b0, b1, b2])
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 20, device="cuda:0"),
torch.rand(30, 20, device="cuda:0"),
torch.rand(10, 30, device="cuda:0"),
torch.rand(20, 10, device="cuda:0").t(),
torch.rand(20, 30, device="cuda:0").t(),
torch.rand(30, 10, device="cuda:0").t(),
torch.rand(10, 20, device=GPU_TYPE),
torch.rand(30, 20, device=GPU_TYPE),
torch.rand(10, 30, device=GPU_TYPE),
torch.rand(20, 10, device=GPU_TYPE).t(),
torch.rand(20, 30, device=GPU_TYPE).t(),
torch.rand(30, 10, device=GPU_TYPE).t(),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@inplace_bin_ops
def test_reinplacing(self, op):
def fn(a0, a1, b0, b1):
@ -864,63 +864,63 @@ class ForeachTests(TestCase):
return [a0, a1]
inputs = (
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
)
self.check_model_cuda(fn, inputs, check_lowp=False)
self.check_model_gpu(fn, inputs, check_lowp=False)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@inplace_bin_ops
def test_reinplacing_mut_before(self, op):
def fn(a0, a1, b0, b1):
a0.add_(torch.ones(10, 10, device="cuda:0"))
a0.add_(torch.ones(10, 10, device=GPU_TYPE))
op([a0, a1], [b0, b1])
return [a0, a1]
inputs = (
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
)
self.check_model_cuda(fn, inputs, check_lowp=False)
self.check_model_gpu(fn, inputs, check_lowp=False)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@inplace_bin_ops
def test_reinplacing_mut_after(self, op):
def fn(a0, a1, b0, b1):
op([a0, a1], [b0, b1])
a0.add_(torch.ones(10, 10, device="cuda:0"))
a0.add_(torch.ones(10, 10, device=GPU_TYPE))
return [a0, a1]
inputs = (
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
)
self.check_model_cuda(fn, inputs, check_lowp=False)
self.check_model_gpu(fn, inputs, check_lowp=False)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
def test_multi_device(self):
def test_foreach_add(a0, a1, b0, b1):
return torch._foreach_add([a0, a1], [b0, b1])
inps = [
torch.ones(10, 10, device="cuda"),
torch.ones(10, 10, device=GPU_TYPE),
torch.ones(20, 20, device="cpu"),
torch.zeros(10, 10, device="cuda"),
torch.zeros(10, 10, device=GPU_TYPE),
torch.zeros(20, 20, device="cpu"),
]
@ -930,13 +930,13 @@ class ForeachTests(TestCase):
self.assertEqual(out_eager, out_compiled)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
@requires_cuda_and_triton
@requires_gpu
def test_aliasing(self):
def test_foreach_add(a0, a1, a2, b0, b1, b2):
return torch._foreach_add_([a0, a1, a2], [b0, b1, b2])
input = torch.ones(10, 10, device="cuda")
input2 = torch.ones(10, 10, device="cuda")
input = torch.ones(10, 10, device=GPU_TYPE)
input2 = torch.ones(10, 10, device=GPU_TYPE)
inps = [
input,
input.view(10, 10),
@ -952,7 +952,7 @@ class ForeachTests(TestCase):
self.assertEqual(out_eager, out_compiled)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 4)
@requires_cuda_and_triton
@requires_gpu
@torch._inductor.config.patch("combo_kernel_allow_mixed_sizes", 1)
def test_2d_block_no_mixed_sizes_no_mask(self):
"""2D blocking with no mixed sizes constant mask"""
@ -960,21 +960,21 @@ class ForeachTests(TestCase):
def fn(a0, a1, a2, b0, b1, b2):
return torch._foreach_add([a0, a1, a2], [b0, b1, b2])
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(1024, 2048, device="cuda:0"),
torch.rand(2048, 2048, device="cuda:0"),
torch.rand(1024, 2048, device="cuda:0"),
torch.rand(2048, 1024, device="cuda:0").t(),
torch.rand(2048, 2048, device="cuda:0").t(),
torch.rand(2048, 1024, device="cuda:0").t(),
torch.rand(1024, 2048, device=GPU_TYPE),
torch.rand(2048, 2048, device=GPU_TYPE),
torch.rand(1024, 2048, device=GPU_TYPE),
torch.rand(2048, 1024, device=GPU_TYPE).t(),
torch.rand(2048, 2048, device=GPU_TYPE).t(),
torch.rand(2048, 1024, device=GPU_TYPE).t(),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
@requires_cuda_and_triton
@requires_gpu
@torch._inductor.config.patch("combo_kernel_allow_mixed_sizes", 2)
def test_2d_block_mixed_sizes_with_mask(self):
"""2D blocking with mixed sizes should have mask"""
@ -982,21 +982,21 @@ class ForeachTests(TestCase):
def fn(a0, a1, a2, b0, b1, b2):
return torch._foreach_add([a0, a1, a2], [b0, b1, b2])
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(1024, 2048, device="cuda:0"),
torch.rand(2048, 2048, device="cuda:0"),
torch.rand(1024, 2048, device="cuda:0"),
torch.rand(2048, 1024, device="cuda:0").t(),
torch.rand(2048, 2048, device="cuda:0").t(),
torch.rand(2048, 1024, device="cuda:0").t(),
torch.rand(1024, 2048, device=GPU_TYPE),
torch.rand(2048, 2048, device=GPU_TYPE),
torch.rand(1024, 2048, device=GPU_TYPE),
torch.rand(2048, 1024, device=GPU_TYPE).t(),
torch.rand(2048, 2048, device=GPU_TYPE).t(),
torch.rand(2048, 1024, device=GPU_TYPE).t(),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@foreach_map_bin_ops
def test_foreach_map_backward_binary(self, op):
from torch._dynamo.polyfills import foreach_map_fn
@ -1011,14 +1011,14 @@ class ForeachTests(TestCase):
ref_inps = (
[
torch.rand(10, 20, device="cuda:0", requires_grad=True),
torch.rand(10, 30, device="cuda:0", requires_grad=True),
torch.rand(30, 30, device="cuda:0", requires_grad=True),
torch.rand(10, 20, device=GPU_TYPE, requires_grad=True),
torch.rand(10, 30, device=GPU_TYPE, requires_grad=True),
torch.rand(30, 30, device=GPU_TYPE, requires_grad=True),
],
[
torch.rand(10, 20, device="cuda:0", requires_grad=True),
torch.rand(10, 30, device="cuda:0", requires_grad=True),
torch.rand(30, 30, device="cuda:0", requires_grad=True),
torch.rand(10, 20, device=GPU_TYPE, requires_grad=True),
torch.rand(10, 30, device=GPU_TYPE, requires_grad=True),
torch.rand(30, 30, device=GPU_TYPE, requires_grad=True),
],
)
inps = (
@ -1037,7 +1037,7 @@ class ForeachTests(TestCase):
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 5)
@requires_cuda_and_triton
@requires_gpu
def test_foreach_map_input_mutation(self):
def fn(xs, ys):
outs = foreach_map_add_inplace(xs, ys)
@ -1045,14 +1045,14 @@ class ForeachTests(TestCase):
ref_inps = (
[
torch.rand(10, 20, device="cuda:0", requires_grad=True),
torch.rand(10, 30, device="cuda:0", requires_grad=True),
torch.rand(30, 30, device="cuda:0", requires_grad=True),
torch.rand(10, 20, device=GPU_TYPE, requires_grad=True),
torch.rand(10, 30, device=GPU_TYPE, requires_grad=True),
torch.rand(30, 30, device=GPU_TYPE, requires_grad=True),
],
[
torch.rand(10, 20, device="cuda:0", requires_grad=True),
torch.rand(10, 30, device="cuda:0", requires_grad=True),
torch.rand(30, 30, device="cuda:0", requires_grad=True),
torch.rand(10, 20, device=GPU_TYPE, requires_grad=True),
torch.rand(10, 30, device=GPU_TYPE, requires_grad=True),
torch.rand(30, 30, device=GPU_TYPE, requires_grad=True),
],
)
# Set requires_grad to be False to avoid mutating a leaf variable
@ -1073,7 +1073,7 @@ class ForeachTests(TestCase):
):
_ = run_fw_bw_and_get_code(lambda: torch.compile(fn)(*inps))
@requires_cuda_and_triton
@requires_gpu
@foreach_map_un_ops
def test_foreach_map_backward_unary(self, op):
from torch._dynamo.polyfills import foreach_map_fn
@ -1087,9 +1087,9 @@ class ForeachTests(TestCase):
return outs[0].sum() + outs[1].sum() + outs[2].sum()
ref_inp = [
torch.rand(10, 20, device="cuda:0", requires_grad=True),
torch.rand(10, 30, device="cuda:0", requires_grad=True),
torch.rand(30, 30, device="cuda:0", requires_grad=True),
torch.rand(10, 20, device=GPU_TYPE, requires_grad=True),
torch.rand(10, 30, device=GPU_TYPE, requires_grad=True),
torch.rand(30, 30, device=GPU_TYPE, requires_grad=True),
]
inp = [x.clone().detach().requires_grad_(True) for x in ref_inp]
@ -1109,5 +1109,5 @@ class ForeachTests(TestCase):
if __name__ == "__main__":
from torch._inductor.test_case import run_tests
if HAS_CPU or HAS_CUDA_AND_TRITON:
if HAS_CPU or HAS_GPU:
run_tests(needs="filelock")

View File

@ -111,6 +111,7 @@ class MultiKernelTest(TestCase):
@requires_triton()
# TODO: bobrenjc93 to fix multi-kernel for ROCM
@skipIfRocm
@skipIfXpu
@unittest.skipIf(not IS_BIG_GPU, "templates require big gpu")
def test_triton_gemm(self):
def fn(x, y):
@ -139,6 +140,7 @@ class MultiKernelTest(TestCase):
@requires_triton()
# TODO: bobrenjc93 to fix multi-kernel for ROCM
@skipIfRocm
@skipIfXpu
@unittest.skipIf(not IS_BIG_GPU, "templates require big gpu")
def test_triton_relu_fused_gemm(self):
def fn(x, y):

View File

@ -74,7 +74,7 @@ class TestCase(InductorTestCase):
def run(op, args, kwargs):
return op(*args, **kwargs)
sample_inputs_itr = op.sample_inputs("cuda", dtype, requires_grad=False)
sample_inputs_itr = op.sample_inputs(GPU_TYPE, dtype, requires_grad=False)
for sample_input in sample_inputs_itr:
args = (sample_input.input,) + sample_input.args
kwargs = sample_input.kwargs
@ -307,7 +307,9 @@ class TestCase(InductorTestCase):
)
instantiate_device_type_tests(TestCase, globals(), only_for=("cuda",))
instantiate_device_type_tests(
TestCase, globals(), only_for=("cuda", "xpu"), allow_xpu=True
)
if __name__ == "__main__":
from torch._inductor.test_case import run_tests
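
instantiate_device_type_tests above now emits the test class for both backends; passing allow_xpu=True is what lets the "xpu" entry in only_for take effect. A minimal usage sketch (class and test names are illustrative):

import torch
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import TestCase, run_tests

class DeviceSmokeTest(TestCase):
    def test_add(self, device):
        x = torch.ones(4, device=device)
        self.assertEqual((x + x).sum().item(), 8.0)

# Generates DeviceSmokeTestCUDA and DeviceSmokeTestXPU, each bound to its
# device string; without allow_xpu=True the XPU variant would be skipped.
instantiate_device_type_tests(
    DeviceSmokeTest, globals(), only_for=("cuda", "xpu"), allow_xpu=True
)

if __name__ == "__main__":
    run_tests()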

View File

@ -35,7 +35,7 @@ from torch._inductor.virtualized import V
from torch.fx.experimental.proxy_tensor import make_fx
from torch.testing import FileCheck
from torch.testing._internal.common_cuda import SM80OrLater, xfailIfSM89
from torch.testing._internal.common_device_type import expectedFailureXPU, skipCUDAIf
from torch.testing._internal.common_device_type import skipCUDAIf
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
IS_LINUX,
@ -139,7 +139,7 @@ class TestPatternMatcher(TestCase):
ref[indices], test[indices]
) # also checks that dtype is correct
@skipIfXpu
# @skipIfXpu
@skipCUDAIf(not SM80OrLater, "need sm_80")
@inductor_config.patch(
{
@ -239,7 +239,6 @@ class TestPatternMatcher(TestCase):
self.assertEqual(f(inp), f_replaced(inp))
self.assertEqual(count, 2)
@skipIfXpu
@skipCUDAIf(not SM80OrLater, "need sm_80")
@inductor_config.patch(
{
@ -288,7 +287,6 @@ class TestPatternMatcher(TestCase):
self._test_fused_int_mm_mul_impl(fn2, args, True)
@skipIfRocm
@skipIfXpu
@skipCUDAIf(not SM80OrLater, "need sm_80")
@inductor_config.patch(
{
@ -336,7 +334,6 @@ class TestPatternMatcher(TestCase):
"triton_tem" if not extern_mm else "extern_kernels.mm"
).run(code)
@expectedFailureXPU
@skipCUDAIf(not SM80OrLater, "need sm_80")
@inductor_config.patch(
{
@ -372,7 +369,6 @@ class TestPatternMatcher(TestCase):
for args in args_list:
self._test_mixed_impl(fn, args, True, False)
@expectedFailureXPU
@skipCUDAIf(not SM80OrLater, "need sm_80")
@inductor_config.patch(
{
@ -399,7 +395,6 @@ class TestPatternMatcher(TestCase):
)
self._test_mixed_impl(fn, args, True, False, rtol=0.16, atol=1e-4)
@expectedFailureXPU
@skipCUDAIf(not SM80OrLater, "need sm_80")
@inductor_config.patch(
{
@ -431,7 +426,6 @@ class TestPatternMatcher(TestCase):
for args in args_list:
self._test_mixed_impl(fn, args, True, False)
@expectedFailureXPU
@skipCUDAIf(not SM80OrLater, "need sm_80")
@inductor_config.patch(
{
@ -469,7 +463,6 @@ class TestPatternMatcher(TestCase):
for args in args_list:
self._test_mixed_impl(fn, args, True, False)
@expectedFailureXPU
@skipCUDAIf(not SM80OrLater, "need sm_80")
@unittest.skipIf(not IS_BIG_GPU, "templates require big gpu")
def test_mixed_mm_gating(self):

View File

@ -208,9 +208,9 @@ def associative_scan(
raise ValueError(
f"Combine_mode must either 'pointwise' or 'generic', but got {cm}"
)
if cm == "pointwise" and not all(l.device.type == "cuda" for l in lxs):
if cm == "pointwise" and not all(l.device.type in ["cuda", "xpu"] for l in lxs):
raise ValueError(
"For combine_mode='pointwise', all input tensors need to be on CUDA"
"For combine_mode='pointwise', all input tensors need to be on CUDA or XPU"
)
# Checks for xs
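
The relaxed guard above accepts XPU as well as CUDA tensors for combine_mode="pointwise". A standalone sketch of the same validation, kept separate from the real associative_scan entry point (the function name here is illustrative):

import torch

_POINTWISE_COMBINE_DEVICES = ("cuda", "xpu")

def check_pointwise_inputs(combine_mode: str, leaves: list) -> None:
    # Mirrors the guard above: the pointwise lowering only handles tensors
    # that live on a Triton-capable GPU backend.
    if combine_mode == "pointwise" and not all(
        t.device.type in _POINTWISE_COMBINE_DEVICES for t in leaves
    ):
        raise ValueError(
            "For combine_mode='pointwise', all input tensors need to be on CUDA or XPU"
        )

check_pointwise_inputs("generic", [torch.ones(3)])        # fine on CPU
# check_pointwise_inputs("pointwise", [torch.ones(3)])    # would raise on CPU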

View File

@ -46,6 +46,7 @@ from torch.testing._internal.common_utils import (
skipIfXpu,
TEST_WITH_TORCHDYNAMO,
)
from torch.testing._internal.inductor_utils import GPU_TYPE
from torch.utils._foreach_utils import _get_foreach_kernels_supported_devices
@ -369,7 +370,7 @@ def optim_inputs_func_adadelta(device, dtype=None):
OptimizerInput(
params=None, kwargs={"rho": 0.95, "weight_decay": 0.9}, desc="rho"
),
] + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
] + (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])
def optim_error_inputs_func_adadelta(device, dtype):
@ -569,7 +570,7 @@ def optim_inputs_func_adam(device, dtype=None):
desc="amsgrad",
),
]
+ (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
+ (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])
+ (mps_supported_configs if _get_device_type(device) == "mps" else [])
)
if dtype == torch.float16:
@ -650,7 +651,7 @@ def optim_error_inputs_func_adam(device, dtype):
error_regex=r"betas\[0\] as a Tensor is not supported for capturable=False and foreach=True",
),
]
if _get_device_type(device) == "cuda":
if _get_device_type(device) == GPU_TYPE:
sample_tensor = torch.empty((), device=device, dtype=dtype)
error_inputs += [
ErrorOptimizerInput(
@ -721,7 +722,7 @@ def optim_inputs_func_adamax(device, dtype=None):
kwargs={"weight_decay": 0.1, "maximize": True},
desc="maximize, weight_decay",
),
] + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
] + (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])
def optim_error_inputs_func_adamax(device, dtype):
@ -792,7 +793,7 @@ def optim_inputs_func_asgd(device, dtype=None):
kwargs={"weight_decay": 0.1, "maximize": True},
desc="maximize, nonzero weight_decay",
),
] + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
] + (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])
def optim_error_inputs_func_asgd(device, dtype):
@ -974,7 +975,7 @@ def optim_inputs_func_nadam(device, dtype=None):
kwargs={"weight_decay": 0.1, "maximize": True},
desc="maximize",
),
] + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
] + (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])
def optim_error_inputs_func_nadam(device, dtype):
@ -1052,7 +1053,7 @@ def optim_inputs_func_radam(device=None, dtype=None):
kwargs={"weight_decay": 0.1, "maximize": True},
desc="maximize",
),
] + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
] + (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])
def optim_error_inputs_func_radam(device, dtype):
@ -1137,7 +1138,7 @@ def optim_inputs_func_rmsprop(device, dtype=None):
},
desc="maximize, centered, weight_decay, w/ momentum",
),
] + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
] + (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])
def optim_error_inputs_func_rmsprop(device, dtype):
@ -1179,7 +1180,7 @@ def optim_inputs_func_rprop(device, dtype=None):
desc="non-default step_sizes",
),
OptimizerInput(params=None, kwargs={"maximize": True}, desc="maximize"),
] + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
] + (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])
def optim_error_inputs_func_rprop(device, dtype):
@ -1671,7 +1672,7 @@ optim_db: list[OptimizerInfo] = [
"maximize",
"capturable",
),
supports_fused_on=("cpu", "cuda", "mps"),
supports_fused_on=("cpu", "cuda", "xpu", "mps"),
decorators=(
# Expected floating point error between fused and compiled forloop
DecorateInfo(
@ -2161,6 +2162,7 @@ optim_db: list[OptimizerInfo] = [
supports_fused_on=(
"cpu",
"cuda",
"xpu",
"mps",
),
skips=(

View File

@ -77,6 +77,7 @@ HAS_XPU_AND_TRITON = torch.xpu.is_available() and HAS_TRITON
HAS_MPS = torch.mps.is_available()
HAS_GPU = HAS_CUDA_AND_TRITON or HAS_XPU_AND_TRITON
HAS_GPU_AND_TRITON = HAS_GPU
GPU_TYPE = get_gpu_type()
@ -173,7 +174,7 @@ IS_H100 = LazyVal(
and get_gpu_shared_memory() == 232448
)
IS_BIG_GPU = LazyVal(lambda: HAS_CUDA_AND_TRITON and is_big_gpu())
IS_BIG_GPU = LazyVal(lambda: HAS_GPU_AND_TRITON and is_big_gpu())
def dummy_graph() -> GraphLowering:
"""