Compare commits

...

6 Commits

SHA1 Message Date
fd73dfb96b Add more UTs 2025-10-23 03:30:56 -07:00
ecea269748 Port test_op_dtype_prop.py to Intel GPU 2025-10-23 06:59:27 +00:00
1db0f612b1 Extend 8 runners to 12 runners 2025-10-22 22:49:02 -07:00
6a6d4fc0e9 skip failed cases. 2025-10-22 03:30:52 -07:00
003e55b2b8 Add more 2025-10-21 20:24:03 -07:00
e9aff568a7 [Inductor UT] Enable more UTs for Intel GPU. 2025-10-21 17:59:40 -07:00
14 changed files with 333 additions and 324 deletions

View File

@ -59,14 +59,18 @@ jobs:
runner: linux.c7i.12xlarge
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 6, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" },
{ config: "default", shard: 1, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 2, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 3, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 4, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 5, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 6, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 7, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 8, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 9, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 10, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 11, num_shards: 12, runner: "linux.idc.xpu" },
{ config: "default", shard: 12, num_shards: 12, runner: "linux.idc.xpu" },
]}
secrets: inherit
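
The matrix above expands the default Inductor XPU test job from 8 to 12 shards, all on the linux.idc.xpu runner pool. A minimal sketch of generating that include list programmatically instead of writing it out by hand (the helper below is illustrative, not part of this PR):

import json

def build_shard_matrix(num_shards: int, runner: str = "linux.idc.xpu") -> str:
    # One entry per shard, mirroring the inline JSON the workflow embeds above.
    entries = [
        {"config": "default", "shard": i, "num_shards": num_shards, "runner": runner}
        for i in range(1, num_shards + 1)
    ]
    return json.dumps({"include": entries}, indent=2)

print(build_shard_matrix(12))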

View File

@ -79,7 +79,12 @@ from torch.testing._internal.common_utils import (
TEST_WITH_ROCM,
)
from torch.testing._internal.custom_tensor import CustomTensorPlainOut
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU, IS_BIG_GPU
from torch.testing._internal.inductor_utils import (
GPU_TYPE,
HAS_GPU,
HAS_XPU_AND_TRITON,
IS_BIG_GPU,
)
from torch.testing._internal.logging_utils import LoggingTestCase, make_logging_test
from torch.testing._internal.triton_utils import requires_gpu
from torch.utils import _pytree as pytree
@ -1543,7 +1548,9 @@ class AOTInductorTestsTemplate:
)
# scaled_dot_product_flash_attention
@unittest.skipIf(not SM80OrLater, "bfloat16 only supported in sm80+")
@unittest.skipIf(
not HAS_XPU_AND_TRITON and not SM80OrLater, "bfloat16 only supported in sm80+"
)
def test_sdpa(self):
class Model(torch.nn.Module):
def __init__(self) -> None:
@ -5574,8 +5581,8 @@ class AOTInductorTestsTemplate:
).run(code)
def test_aoti_debug_printing_model_inputs_codegen(self):
if self.device != "cuda":
raise unittest.SkipTest("requires CUDA")
if self.device != GPU_TYPE:
raise unittest.SkipTest("requires GPU")
class Model(torch.nn.Module):
def __init__(self):
@ -5911,11 +5918,12 @@ class AOTInductorTestsTemplate:
example_inputs = (torch.randn(2, 128, 4096, device=self.device),)
self.check_model(Model(), example_inputs, dynamic_shapes={"x": {0: bs}})
@skipIfXpu(msg="Currently Profiling not enabled on XPU CI builds")
@requires_gpu
def test_d2h_copy(self):
# device to host copy should always have the same stride
if "cuda" not in self.device:
raise unittest.SkipTest("This test is only for CUDA")
if GPU_TYPE not in self.device:
raise unittest.SkipTest("This test is only for GPU")
class ToCpuModel(nn.Module):
def forward(self, x):
@ -5939,7 +5947,7 @@ class AOTInductorTestsTemplate:
with torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
getattr(torch.profiler.ProfilerActivity, GPU_TYPE.upper()),
],
) as prof:
true_res = aoti_model(input_tensor)
@ -6367,8 +6375,8 @@ class AOTInductorTestsTemplate:
runner.free_inactive_constant_buffer()
def test_update_user_managed_buffer(self):
if self.device != "cuda":
raise unittest.SkipTest("requires CUDA")
if self.device != GPU_TYPE:
raise unittest.SkipTest("requires GPU")
class Model(torch.nn.Module):
def __init__(self, n, k, device):
@ -6412,10 +6420,10 @@ class AOTInductorTestsTemplate:
"L__self___weight": torch.randn(N, K, device=self.device),
"L__self___bias": torch.randn(N, device=self.device),
}
mem_before, _ = torch.cuda.mem_get_info(self.device)
mem_before, _ = getattr(torch, GPU_TYPE).mem_get_info(self.device)
# Do not use user managed_buffer, should have less free memory.
runner.update_constant_buffer(new_weights, True, False, False)
mem_after, _ = torch.cuda.mem_get_info(self.device)
mem_after, _ = getattr(torch, GPU_TYPE).mem_get_info(self.device)
self.assertGreater(mem_before, mem_after)
runner.swap_constant_buffer()
@ -6447,10 +6455,10 @@ class AOTInductorTestsTemplate:
"L__self___weight": torch.randn(N, K, device=self.device),
"L__self___bias": torch.randn(N, device=self.device),
}
mem_before, _ = torch.cuda.mem_get_info(self.device)
mem_before, _ = getattr(torch, GPU_TYPE).mem_get_info(self.device)
# Try user managed_buffer, should have same free memory.
runner.update_constant_buffer(new_weights, True, False, True)
mem_after, _ = torch.cuda.mem_get_info(self.device)
mem_after, _ = getattr(torch, GPU_TYPE).mem_get_info(self.device)
self.assertEqual(mem_before, mem_after, atol=1e-3, rtol=1e-3)
runner.swap_constant_buffer()
@ -6522,8 +6530,8 @@ class AOTInductorTestsTemplate:
"To enable after the C shim FC window ends",
)
def test_misaligned_input_1(self):
if self.device != "cuda":
raise unittest.SkipTest("CUDA test only")
if self.device != GPU_TYPE:
raise unittest.SkipTest("GPU test only")
class Model(torch.nn.Module):
def forward(self, x):
@ -6549,8 +6557,8 @@ class AOTInductorTestsTemplate:
torch.testing.assert_close(actual, expected)
def test_misaligned_input_2(self):
if self.device != "cuda":
raise unittest.SkipTest("CUDA test only")
if self.device != GPU_TYPE:
raise unittest.SkipTest("GPU test only")
class Model(torch.nn.Module):
def forward(self, x):
@ -7098,8 +7106,8 @@ class AOTInductorTestsTemplate:
self.check_model(Model(), example_inputs, dynamic_shapes=dynamic_shapes)
def test_sym_expr_indexing(self):
if self.device != "cuda":
raise unittest.SkipTest("requires CUDA")
if self.device != GPU_TYPE:
raise unittest.SkipTest("requires GPU")
class Repro(torch.nn.Module):
def __init__(self) -> None:
@ -7117,7 +7125,7 @@ class AOTInductorTestsTemplate:
arange_1 = torch.ops.aten.arange.start(
180,
181,
device=torch.device(type="cuda", index=0),
device=torch.device(type=GPU_TYPE, index=0),
pin_memory=False,
)
add_14 = torch.ops.aten.add.Tensor(arange_1, 198)
@ -7636,8 +7644,6 @@ GPU_TEST_FAILURES = {
"test_quantized_linear_bias_none": fail_gpu(("cuda", "xpu")),
# No scaled_dot_product_efficient_attention implementation for XPU yet.
"test_scaled_dot_product_efficient_attention": fail_gpu(("xpu",)),
# No fft implementation for XPU yet.
"test_fft_c2c": fail_gpu(("xpu",), is_skip=True),
}
MPS_TEST_FAILURES = {
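
The rewrites in this file replace hard-coded CUDA calls with lookups keyed on GPU_TYPE: getattr(torch, GPU_TYPE) reaches torch.cuda or torch.xpu (both expose mem_get_info), and GPU_TYPE.upper() selects the matching ProfilerActivity member. A hedged sketch of the pattern, assuming a machine where the chosen backend is actually available:

import torch
from torch.testing._internal.inductor_utils import GPU_TYPE

device_mod = getattr(torch, GPU_TYPE)                 # torch.cuda or torch.xpu
free_bytes, total_bytes = device_mod.mem_get_info()   # same call on either backend

gpu_activity = getattr(torch.profiler.ProfilerActivity, GPU_TYPE.upper())
with torch.profiler.profile(
    activities=[torch.profiler.ProfilerActivity.CPU, gpu_activity]
):
    torch.randn(1024, device=GPU_TYPE).sum()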

View File

@ -28,12 +28,7 @@ from torch.export.pt2_archive._package import (
load_weights_to_pt2_contents,
)
from torch.testing._internal.common_cuda import _get_torch_cuda_version
from torch.testing._internal.common_utils import (
IS_FBCODE,
skipIfRocm,
skipIfXpu,
TEST_CUDA,
)
from torch.testing._internal.common_utils import IS_FBCODE, skipIfRocm, skipIfXpu
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
@ -688,13 +683,13 @@ class TestAOTInductorPackage(TestCase):
self.assertEqual(loaded1(*example_inputs1), ep1.module()(*example_inputs1))
self.assertEqual(loaded2(*example_inputs2), ep2.module()(*example_inputs2))
@unittest.skipIf(not TEST_CUDA, "requires cuda")
@unittest.skipIf(not HAS_GPU, "requires gpu")
def test_duplicate_calls(self):
options = {
"aot_inductor.package": True,
}
device = "cuda"
device = GPU_TYPE
class Model1(torch.nn.Module):
def __init__(self) -> None:

View File

@ -9,7 +9,7 @@ import unittest
import torch
from torch._inductor import config
from torch.testing._internal.common_utils import IS_LINUX, skipIfXpu
from torch.testing._internal.common_utils import IS_LINUX
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
@ -48,7 +48,6 @@ class TestKernelBestConfig(TestCase):
config.max_autotune = cls.original_max_autotune
super().tearDownClass()
@skipIfXpu
def test_best_config_has_triton_cache_key(self):
with tempfile.TemporaryDirectory() as tmpdir:
os.environ["TORCHINDUCTOR_CACHE_DIR"] = tmpdir

View File

@ -68,6 +68,7 @@ from torch.testing._internal.inductor_utils import (
HAS_GPU,
HAS_MULTIGPU,
HAS_TRITON,
HAS_XPU_AND_TRITON,
patch_inductor_backend,
requires_gpu,
requires_triton,
@ -1215,7 +1216,7 @@ class TestFxGraphCache(TestCase):
self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 1)
self.assertEqual(counters["inductor"]["fxgraph_lookup_write_file"], 1)
@requires_cuda_and_triton
@requires_gpu()
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
@with_tf32_off
@ -1238,7 +1239,7 @@ class TestFxGraphCache(TestCase):
def fn2(q, k, v):
return flex_attention(q, k, v, score_mod=score_mod2, block_mask=block_mask)
a, b, c = (torch.randn(1, 4, 512, 64).cuda() for _ in range(3))
a, b, c = (torch.randn(1, 4, 512, 64).to(GPU_TYPE) for _ in range(3))
compiled_fn = torch.compile(fn)
compiled_fn2 = torch.compile(fn2)
@ -2923,8 +2924,8 @@ class TestAutotuneCache(TestCase):
for k in global_stats.triton.cache.keys():
self.assertRegex(k, r"triton:[0-9a-f]{64}::[0-9a-f]{64}:c[0-9]+")
@requires_cuda_and_triton
@unittest.skipIf(not SM80OrLater, "Requires SM80+")
@requires_gpu()
@unittest.skipIf(not HAS_XPU_AND_TRITON and not SM80OrLater, "Requires SM80+")
@config.patch({"fx_graph_cache": False})
@config.patch({"fx_graph_remote_cache": False})
@config.patch({"autotune_local_cache": False})
@ -2942,10 +2943,10 @@ class TestAutotuneCache(TestCase):
def f(x, y, a, b):
return Model()(x, y, a, b)
x = torch.randn(100, 100).cuda()
y = torch.randn(100, 100).cuda()
a = torch.randn(1000, 100).cuda()
b = torch.randn(1000, 100).cuda()
x = torch.randn(100, 100).to(GPU_TYPE)
y = torch.randn(100, 100).to(GPU_TYPE)
a = torch.randn(1000, 100).to(GPU_TYPE)
b = torch.randn(1000, 100).to(GPU_TYPE)
f_compiled = torch.compile(f, fullgraph=True)
with PatchCaches():
@ -2964,8 +2965,8 @@ class TestAutotuneCache(TestCase):
for k in global_stats.triton.cache.keys():
self.assertRegex(k, r"triton:[0-9a-f]{64}::[0-9a-f]{64}:c[0-9]+")
@requires_cuda_and_triton
@unittest.skipIf(not SM80OrLater, "Requires SM80+")
@requires_gpu()
@unittest.skipIf(not HAS_XPU_AND_TRITON and not SM80OrLater, "Requires SM80+")
@config.patch({"fx_graph_cache": False})
@config.patch({"fx_graph_remote_cache": False})
@config.patch({"autotune_local_cache": True})
@ -2983,12 +2984,12 @@ class TestAutotuneCache(TestCase):
f_compiled = torch.compile(f, fullgraph=True)
a = torch.randn(101, 100).cuda()
b = torch.randn(101, 100).cuda()
c = torch.randn(102, 100).cuda()
d = torch.randn(102, 100).cuda()
e = torch.randn(103, 100).cuda()
f = torch.randn(103, 100).cuda()
a = torch.randn(101, 100).to(GPU_TYPE)
b = torch.randn(101, 100).to(GPU_TYPE)
c = torch.randn(102, 100).to(GPU_TYPE)
d = torch.randn(102, 100).to(GPU_TYPE)
e = torch.randn(103, 100).to(GPU_TYPE)
f = torch.randn(103, 100).to(GPU_TYPE)
with PatchCaches():
f_compiled(a, b, c, d, e, f)
@ -3025,8 +3026,8 @@ class TestAutotuneCache(TestCase):
self.assertRegex(k, r"triton:[0-9a-f]{64}::[0-9a-f]{64}:c[0-9]+")
@requires_triton()
@requires_cuda_and_triton
@unittest.skipIf(not SM80OrLater, "Requires SM80+")
@requires_gpu()
@unittest.skipIf(not HAS_XPU_AND_TRITON and not SM80OrLater, "Requires SM80+")
@config.patch({"fx_graph_cache": False})
@config.patch({"fx_graph_remote_cache": False})
@config.patch({"bundled_autotune_remote_cache": False})
@ -3089,8 +3090,8 @@ class TestAutotuneCache(TestCase):
class TestRemoteAOTAutogradCache(TestCase):
@requires_cuda_and_triton
@unittest.skipIf(not SM80OrLater, "Requires SM80+")
@requires_gpu()
@unittest.skipIf(not HAS_XPU_AND_TRITON and not SM80OrLater, "Requires SM80+")
@config.patch({"fx_graph_cache": False})
@config.patch({"fx_graph_remote_cache": True})
@torch._functorch.config.patch({"enable_autograd_cache": False})
@ -3100,8 +3101,8 @@ class TestRemoteAOTAutogradCache(TestCase):
return a + b
f_compiled = torch.compile(f)
a = torch.randn(101, 100, device="cuda", requires_grad=False)
b = torch.randn(101, 100, device="cuda", requires_grad=False)
a = torch.randn(101, 100, device=GPU_TYPE, requires_grad=False)
b = torch.randn(101, 100, device=GPU_TYPE, requires_grad=False)
with PatchCaches():
f_compiled(a, b)
@ -3128,8 +3129,8 @@ class TestRemoteAOTAutogradCache(TestCase):
for k in global_stats.fx_graph.cache.keys():
self.assertRegex(k, r"pt2:fx-graph-v1::[0-9a-z]{52}:c[0-9]+")
@requires_cuda_and_triton
@unittest.skipIf(not SM80OrLater, "Requires SM80+")
@requires_gpu()
@unittest.skipIf(not HAS_XPU_AND_TRITON and not SM80OrLater, "Requires SM80+")
@config.patch({"fx_graph_cache": False})
@config.patch({"fx_graph_remote_cache": True})
@torch._functorch.config.patch({"enable_autograd_cache": False})
@ -3203,7 +3204,7 @@ class TestUtils(TestCase):
# This combination of settings exposed a bug where we cleared the
# PyCodeCache disk artifacts while they were still needed:
@requires_cuda_and_triton
@requires_gpu()
@config.patch(
{
"coordinate_descent_tuning": True,
@ -3212,9 +3213,9 @@ class TestUtils(TestCase):
)
def test_force_disable_coordinate_descent(self):
def fn():
inp = torch.randn(32, 50, 768, device="cuda")
weight = torch.randn(768, 768, device="cuda")
layer = torch.nn.LayerNorm(768, device="cuda")
inp = torch.randn(32, 50, 768, device=GPU_TYPE)
weight = torch.randn(768, 768, device=GPU_TYPE)
layer = torch.nn.LayerNorm(768, device=GPU_TYPE)
return layer(inp @ weight)
torch.compile(fn)()
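
The skip conditions rewritten above keep the SM80 requirement for CUDA while letting XPU-with-Triton runs through: `not HAS_XPU_AND_TRITON and not SM80OrLater` only fires on pre-SM80 CUDA machines. A small sketch of wrapping that condition in a reusable decorator (the decorator name is hypothetical; the flags are the same ones imported in this file):

import unittest
import torch
from torch.testing._internal.common_cuda import SM80OrLater
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_XPU_AND_TRITON

def requires_bf16_capable_gpu(fn):
    # Skip only on CUDA older than SM80; XPU + Triton is accepted as-is.
    return unittest.skipIf(
        not HAS_XPU_AND_TRITON and not SM80OrLater,
        "Requires SM80+ (CUDA) or an XPU with Triton",
    )(fn)

class Example(unittest.TestCase):
    @requires_bf16_capable_gpu
    def test_bf16_add(self):
        x = torch.randn(8, 8, device=GPU_TYPE, dtype=torch.bfloat16)
        self.assertEqual((x + x).dtype, torch.bfloat16)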

View File

@ -201,14 +201,17 @@ KERNEL_COUNT_OVERRIDES = {
"test_adamw_amsgrad_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adamw_amsgrad_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adamw_tensor_lr_tensor_betas_amsgrad_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adamw_tensor_lr_tensor_betas_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adamw_tensor_lr_tensor_betas_amsgrad_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adamw_tensor_lr_tensor_betas_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adamw_tensor_lr_tensor_betas_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adamw_tensor_lr_amsgrad_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adamw_tensor_lr_amsgrad_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adam_tensor_lr_amsgrad_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adam_tensor_lr_amsgrad_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adam_tensor_lr_tensor_betas_amsgrad_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adam_tensor_lr_tensor_betas_amsgrad_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adam_tensor_lr_tensor_betas_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adam_tensor_lr_tensor_betas_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adam_amsgrad_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adam_amsgrad_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adadelta_tensor_lr_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
@ -247,9 +250,9 @@ KERNEL_COUNT_OVERRIDES = {
"test_adamax_tensor_lr_weight_decay_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adamax_tensor_lr_weight_decay_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_asgd_tensor_lr_weight_decay_maximize_capturable_cuda": lambda x: assert_expected_inline(x, """5"""),
"test_asgd_tensor_lr_weight_decay_maximize_capturable_xpu": lambda x: assert_expected_inline(x, """8"""),
"test_asgd_tensor_lr_weight_decay_maximize_capturable_xpu": lambda x: assert_expected_inline(x, """5"""),
"test_nadam_tensor_lr_weight_decay_momentum_decay_decoupled_weight_decay_capturable_cuda": lambda x: assert_expected_inline(x, """6"""), # noqa: B950
"test_nadam_tensor_lr_weight_decay_momentum_decay_decoupled_weight_decay_capturable_xpu": lambda x: assert_expected_inline(x, """9"""), # noqa: B950
"test_nadam_tensor_lr_weight_decay_momentum_decay_decoupled_weight_decay_capturable_xpu": lambda x: assert_expected_inline(x, """6"""), # noqa: B950
"test_radam_tensor_lr_capturable_weight_decay_decoupled_weight_decay_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_radam_tensor_lr_capturable_weight_decay_decoupled_weight_decay_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_sgd_tensor_lr_cpu": lambda x: assert_expected_inline(x, """2"""),
@ -436,7 +439,7 @@ def make_test(
closure=None,
scheduler_cls=None,
kernel_count=2,
device="cuda",
device=GPU_TYPE,
**kwargs,
):
@config.patch("score_fusion_memory_threshold", 1)

View File

@ -18,7 +18,7 @@ from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
)
from torch.testing._internal.inductor_utils import HAS_CUDA_AND_TRITON
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
class TestingHeuristics(InductorChoices):
@ -176,7 +176,7 @@ class CooperativeReductionTests(TestCase):
return reduction_fn(x + y, dim=-1)
reduction_fn = getattr(torch, name)
args = [torch.randn(1, 1024**2, device="cuda", dtype=dtype) for _ in range(2)]
args = [torch.randn(1, 1024**2, device=GPU_TYPE, dtype=dtype) for _ in range(2)]
self.run_and_check(fn, args, dtype)
def test_bool_reduction_fns(self):
@ -190,7 +190,7 @@ class CooperativeReductionTests(TestCase):
torch.all(x > y),
]
args = [torch.randn(1024, device="cuda") for _ in range(2)]
args = [torch.randn(1024, device=GPU_TYPE) for _ in range(2)]
source_code = self.run_and_check(fn, args)
if "async_compile.multi_kernel" in source_code:
return
@ -204,7 +204,7 @@ class CooperativeReductionTests(TestCase):
def fn(x):
return x.mean(), x.std() + x.min()
args = [torch.randn([bs, count], device="cuda")]
args = [torch.randn([bs, count], device=GPU_TYPE)]
self.run_and_check(fn, args)
def test_chained_reductions(self):
@ -213,18 +213,19 @@ class CooperativeReductionTests(TestCase):
x = x + torch.softmax(x, 1)
return x
args = [torch.randn(4, 100000, device="cuda")]
args = [torch.randn(4, 100000, device=GPU_TYPE)]
source_code = self.run_and_check(fn, args)
if "async_compile.multi_kernel" in source_code:
return
# With online softmax, the computation of max and sum are done
# jointly and they share a single barrier call.
expected_num_barrier = 8 if config.online_softmax else 16
# XPU doesn't support online softmax yet.
expected_num_barrier = 8 if config.online_softmax and GPU_TYPE != "xpu" else 16
self.assertEqual(
source_code.count("triton_helpers.x_grid_barrier"), expected_num_barrier
)
self.assertEqual(source_code.count("empty_strided_cuda"), 5)
self.assertEqual(source_code.count(f"empty_strided_{GPU_TYPE}"), 5)
def test_reduce_split(self):
def fn(a, b):
@ -233,8 +234,8 @@ class CooperativeReductionTests(TestCase):
return a1, b1
inps = [
torch.rand(2048, 512, device="cuda"),
torch.rand(20, 20, device="cuda"),
torch.rand(2048, 512, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
]
self.run_and_check(fn, inps, expect_kernel_count=2)
@ -290,7 +291,7 @@ class TestFixedConfigs(TestCase):
def fn(x):
return torch.softmax(x + 1, dim=-1) + x
args = [torch.randn(8, 8000, device="cuda")]
args = [torch.randn(8, 8000, device=GPU_TYPE)]
self._check(fn, args, persistent=persistent, cooperative=cooperative, cfg=cfg)
@parametrize(
@ -315,7 +316,7 @@ class TestFixedConfigs(TestCase):
cfg = {"XBLOCK": 64, "RSPLIT": rsplit, "num_warps": 8}
if not persistent:
cfg["R0_BLOCK"] = 64
args = [torch.randn(x, r, device="cuda")]
args = [torch.randn(x, r, device=GPU_TYPE)]
self._check(fn, args, persistent=persistent, cfg=cfg)
@parametrize("persistent", [True, False])
@ -335,8 +336,8 @@ class TestFixedConfigs(TestCase):
args = [
torch.stack(
[
torch.arange(10, 4096, device="cuda"),
-torch.arange(10, 4096, device="cuda"),
torch.arange(10, 4096, device=GPU_TYPE),
-torch.arange(10, 4096, device=GPU_TYPE),
]
)
]
@ -346,12 +347,12 @@ class TestFixedConfigs(TestCase):
[
torch.tensor(
[0.0] * 150 + [float("inf")] * 150,
device="cuda",
device=GPU_TYPE,
dtype=torch.float32,
),
torch.tensor(
[0.0] * 150 + [-float("inf")] * 150,
device="cuda",
device=GPU_TYPE,
dtype=torch.float32,
),
]
@ -374,12 +375,12 @@ class TestFixedConfigs(TestCase):
cfg = {"XBLOCK": 128, "RSPLIT": rsplit, "num_warps": 16, "num_stages": 1}
if not persistent:
cfg["R0_BLOCK"] = 64
args = [torch.randn(1024, device="cuda") for _ in range(2)]
args = [torch.randn(1024, device=GPU_TYPE) for _ in range(2)]
self._check(fn, args, persistent=persistent, cfg=cfg)
if __name__ == "__main__":
from torch._dynamo.test_case import run_tests
if HAS_CUDA_AND_TRITON:
if HAS_GPU:
run_tests(needs="filelock")
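
The assertions in this file now look for f"empty_strided_{GPU_TYPE}" rather than empty_strided_cuda, so the same source inspection works for XPU wrapper code. A hedged sketch of checking compiled output for that allocation helper with run_and_get_code (generated names can vary across Inductor versions):

import torch
from torch._inductor.utils import run_and_get_code
from torch.testing._internal.inductor_utils import GPU_TYPE

def fn(x):
    return torch.softmax(x + 1, dim=-1) + x

x = torch.randn(8, 8000, device=GPU_TYPE)
result, source_codes = run_and_get_code(torch.compile(fn), x)
# The wrapper allocates outputs via a backend-specific helper, e.g.
# empty_strided_cuda(...) on CUDA or empty_strided_xpu(...) on XPU.
print(source_codes[0].count(f"empty_strided_{GPU_TYPE}"))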

View File

@ -14,8 +14,8 @@ from torch.testing._internal.common_utils import (
IS_FBCODE,
parametrize,
)
from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA_AND_TRITON
from torch.testing._internal.triton_utils import requires_cuda_and_triton
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, HAS_GPU
from torch.testing._internal.triton_utils import requires_gpu
from torch.utils._pytree import tree_flatten
@ -23,11 +23,11 @@ aten = torch.ops.aten
try:
try:
from .test_torchinductor import check_model, check_model_cuda
from .test_torchinductor import check_model, check_model_gpu
except ImportError:
from test_torchinductor import ( # @manual=fbcode//caffe2/test/inductor:test_inductor-library
check_model,
check_model_cuda,
check_model_gpu,
)
except (unittest.SkipTest, ImportError) as e:
sys.stderr.write(f"{type(e)}: {e}\n")
@ -188,30 +188,30 @@ decomp_ops = parametrize("op", compose_ops, name_fn=lambda f: f.__name__)
def gen_args(op):
if op in un_ops_under_test:
return (
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
)
elif op in bin_ops_under_test:
return (
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
)
else:
return (
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
)
@instantiate_parametrized_tests
class ForeachTests(TestCase):
check_model_cuda = check_model_cuda
check_model_gpu = check_model_gpu
check_model_cpu = check_model
check_kernel_count = True
@ -239,7 +239,7 @@ class ForeachTests(TestCase):
def fn(a0, a1, b0, b1, c0, c1):
return op([a0, a1], [b0, b1], [c0, c1])
self.check_model_cuda(
self.check_model_gpu(
fn,
gen_args(op),
)
@ -248,50 +248,50 @@ class ForeachTests(TestCase):
def fn(a0, a1):
return op([a0, a1], 3.3)
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
),
)
def _test_single_scalar_tensor(self, op):
def fn(a0, a1):
return op([a0, a1], torch.tensor(3.3, device="cuda:0"))
return op([a0, a1], torch.tensor(3.3, device=GPU_TYPE))
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
),
)
# called in test_cuda_cpp_wrapper.py
@requires_cuda_and_triton
@requires_gpu
def test_foreach_cpp_wrapper_cuda(self):
self._test_single_list(op=torch._foreach_add)
@requires_cuda_and_triton
@requires_gpu
@all_ops
def test_single_list(self, op):
self._test_single_list(op)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@scalar_bin_ops
def test_single_scalar(self, op):
self._test_single_scalar(op)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@scalar_tensor_bin_ops
def test_single_scalar_tensor(self, op):
self._test_single_scalar_tensor(op)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@all_ops
def test_scheduler_fusion_list(self, op):
if op in un_ops_under_test:
@ -312,31 +312,31 @@ class ForeachTests(TestCase):
c = op([a0, a1], [b0, b1], [c0, c1])
return c, torch._foreach_add([a0, a1], c)
self.check_model_cuda(
self.check_model_gpu(
fn,
gen_args(op),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@scalar_bin_ops
def test_scheduler_fusion_scalar(self, op):
def fn(a0, a1):
c = op([a0, a1], 3.4)
return c, torch._foreach_add([a0, a1], c)
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@scalar_bin_ops
def test_broadcasting(self, op):
def fn(a0, a1, b0, b1):
@ -345,17 +345,17 @@ class ForeachTests(TestCase):
fn_opt = torch.compile(fn)
inputs = (
torch.rand(10, 1, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(1, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 1, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(1, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
)
actual = fn_opt(*inputs)
expected = fn(*inputs)
self.assertEqual(actual, expected)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@all_ops
def test_singleton_lists(self, op):
if op in un_ops_under_test:
@ -363,15 +363,15 @@ class ForeachTests(TestCase):
def fn(a0):
return op([a0])
args = (torch.rand(10, 10, device="cuda:0"),)
args = (torch.rand(10, 10, device=GPU_TYPE),)
elif op in bin_ops_under_test:
def fn(a0, b0):
return op([a0], [b0])
args = (
torch.rand(10, 10, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
)
else:
@ -380,19 +380,19 @@ class ForeachTests(TestCase):
return op([a0], [b0], [c0])
args = (
torch.rand(10, 10, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
)
self.check_model_cuda(
self.check_model_gpu(
fn,
args,
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@bin_ops
def test_type_promotion(self, op):
def fn(a0, a1, b0, b1):
@ -403,17 +403,17 @@ class ForeachTests(TestCase):
max32 = torch.iinfo(torch.int32).max
max64 = torch.iinfo(torch.int64).max
inputs = (
torch.randint(max32, (10, 10), device="cuda:0", dtype=torch.int32),
torch.randint(max32, (20, 20), device="cuda:0", dtype=torch.int32),
torch.randint(max32, (10, 10), device="cuda:0", dtype=torch.int32),
torch.randint(max64, (20, 20), device="cuda:0", dtype=torch.int64),
torch.randint(max32, (10, 10), device=GPU_TYPE, dtype=torch.int32),
torch.randint(max32, (20, 20), device=GPU_TYPE, dtype=torch.int32),
torch.randint(max32, (10, 10), device=GPU_TYPE, dtype=torch.int32),
torch.randint(max64, (20, 20), device=GPU_TYPE, dtype=torch.int64),
)
actual = fn_opt(*inputs)
expected = fn(*inputs)
self.assertEqual(actual, expected)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@scalar_bin_ops
def test_kernel_split_arg_limit_list(self, op):
# NB: foreach_copy won't pass this test because it will dce one set of buffers
@ -426,8 +426,8 @@ class ForeachTests(TestCase):
max_args = 370
max_list_len = (max_args // 3) + 1
inputs = (
[torch.rand(10, 10, device="cuda:0") for _ in range(max_list_len)],
[torch.rand(10, 10, device="cuda:0") for _ in range(max_list_len)],
[torch.rand(10, 10, device=GPU_TYPE) for _ in range(max_list_len)],
[torch.rand(10, 10, device=GPU_TYPE) for _ in range(max_list_len)],
)
actual = fn_opt(*inputs)
@ -435,7 +435,7 @@ class ForeachTests(TestCase):
self.assertEqual(actual, expected)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
@requires_cuda_and_triton
@requires_gpu
@scalar_bin_ops
@unittest.skip(
"Triton recursion depth exceeded: https://github.com/triton-lang/triton/issues/1763"
@ -448,27 +448,27 @@ class ForeachTests(TestCase):
max_args = 370
max_list_len = (max_args // 2) + 1
inputs = ([torch.rand(10, 10, device="cuda:0") for _ in range(max_list_len)],)
inputs = ([torch.rand(10, 10, device=GPU_TYPE) for _ in range(max_list_len)],)
actual = fn_opt(*inputs)
expected = fn(*inputs)
self.assertEqual(actual, expected)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
@requires_cuda_and_triton
@requires_gpu
@bin_ops
def test_fusion_duplicate_buffer_list(self, op):
def fn(a0, a1, b0, b1):
c = op([a0, a1], [b0, b1])
return op([a0, b0], [c[0], c[0]])
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
),
reference_in_float=False,
check_lowp=False,
@ -479,7 +479,7 @@ class ForeachTests(TestCase):
kernel_count = 2
self.assertEqual(torch._inductor.metrics.generated_kernel_count, kernel_count)
@requires_cuda_and_triton
@requires_gpu
@all_ops
def test_non_foreach_consumer_list(self, op):
if op in un_ops_under_test:
@ -500,31 +500,31 @@ class ForeachTests(TestCase):
c = op([a0, a1], [b0, b1], [c0, c1])
return torch.mul(c[0], a0)
self.check_model_cuda(
self.check_model_gpu(
fn,
gen_args(op),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@scalar_bin_ops
def test_non_foreach_consumer_scalar(self, op):
def fn(a0, a1):
c = op([a0, a1], 4.7)
return torch.mul(c[0], a0)
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@all_ops
def test_non_foreach_producer_list(self, op):
if op in un_ops_under_test:
@ -548,13 +548,13 @@ class ForeachTests(TestCase):
c1 = torch.add(a1, b1)
return op([a0, a1], [b0, b1], [c0, c1])
self.check_model_cuda(
self.check_model_gpu(
fn, gen_args(op), reference_in_float=False, check_lowp=False
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@scalar_bin_ops
def test_non_foreach_producer_scalar(self, op):
def fn(a0, a1, b0, b1):
@ -562,19 +562,19 @@ class ForeachTests(TestCase):
c1 = torch.mul(a1, b1)
return op([c0, c1], 5.6)
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@all_ops
def test_non_foreach_consumer_producer_list(self, op):
if op in un_ops_under_test:
@ -607,7 +607,7 @@ class ForeachTests(TestCase):
e1 = torch.mul(d[1], a1)
return [e0, e1]
self.check_model_cuda(
self.check_model_gpu(
fn,
gen_args(op),
reference_in_float=False,
@ -616,7 +616,7 @@ class ForeachTests(TestCase):
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@scalar_bin_ops
def test_non_foreach_consumer_producer_scalar(self, op):
def fn(a0, a1, b0, b1):
@ -627,13 +627,13 @@ class ForeachTests(TestCase):
e1 = torch.mul(d[1], a1)
return [e0, e1]
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
),
reference_in_float=False,
check_lowp=False,
@ -641,7 +641,7 @@ class ForeachTests(TestCase):
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@bin_ops
@torch._dynamo.config.patch("automatic_dynamic_shapes", False)
@torch._dynamo.config.patch("assume_static_by_default", False)
@ -651,17 +651,17 @@ class ForeachTests(TestCase):
return op([a0, a1], [b0, b1])
inputs = (
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
)
self.check_model_cuda(fn, inputs)
self.check_model_gpu(fn, inputs)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
@requires_cuda_and_triton
@requires_gpu
@torch._dynamo.config.patch("automatic_dynamic_shapes", False)
@torch._dynamo.config.patch("assume_static_by_default", False)
@torch._inductor.config.patch("combo_kernel_foreach_dynamic_shapes", True)
@ -670,17 +670,17 @@ class ForeachTests(TestCase):
return op([a0, a1], [b0, b1])
inputs = (
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
)
self.check_model_cuda(fn, inputs)
self.check_model_gpu(fn, inputs)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@torch._dynamo.config.patch("automatic_dynamic_shapes", False)
@torch._dynamo.config.patch("assume_static_by_default", False)
@torch._inductor.config.patch("combo_kernel_foreach_dynamic_shapes", True)
@ -690,13 +690,13 @@ class ForeachTests(TestCase):
return op([a0, a1], [b0, b1])
inputs = (
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
)
self.check_model_cuda(fn, inputs)
self.check_model_gpu(fn, inputs)
@unittest.skipIf(IS_FBCODE, "cpp compile not supported in fbcode")
@bin_ops
@ -715,27 +715,27 @@ class ForeachTests(TestCase):
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
@requires_cuda_and_triton
@requires_gpu
@decomp_ops
def test_decomp(self, op):
def fn(a0, a1, b0, b1, c0, c1):
return op([a0, a1], [b0, b1], [c0, c1], value=0.5)
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
def test_fuse_concat(self):
def fn(x1, x2, x3, w1, w2, w3):
x = torch.stack([x1, x2, x3])
@ -745,73 +745,73 @@ class ForeachTests(TestCase):
return y
x1 = torch.randn(5, 4).cuda()
x1 = torch.randn(5, 4).to(GPU_TYPE)
x2 = x1 + 1
x3 = x1 + 2
w1 = torch.randn(4, 3).cuda()
w1 = torch.randn(4, 3).to(GPU_TYPE)
w2 = w1 + 1
w3 = w1 + 2
args = (x1, x2, x3, w1, w2, w3)
self.check_model_cuda(fn, args)
self.check_model_gpu(fn, args)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
@requires_cuda_and_triton
@requires_gpu
def test_zero_elems(self):
def fn(a0, a1, b0, b1):
return torch._foreach_add([a0, a1], [b0, b1])
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(0, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(0, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(0, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(0, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@bin_ops
def test_2d_blocking(self, op):
def fn(a0, a1, b0, b1):
return op([a0, a1], [b0, b1])
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 40, device="cuda:0"),
torch.rand(10, 30, device="cuda:0"),
torch.rand(40, 10, device="cuda:0").t(),
torch.rand(30, 10, device="cuda:0").t(),
torch.rand(10, 40, device=GPU_TYPE),
torch.rand(10, 30, device=GPU_TYPE),
torch.rand(40, 10, device=GPU_TYPE).t(),
torch.rand(30, 10, device=GPU_TYPE).t(),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@bin_ops
def test_2d_blocking_partitioning(self, op):
def fn(a0, a1, b0, b1):
return op([a0, a1], [b0, b1])
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(30, 20, device="cuda:0"),
torch.rand(40, 30, device="cuda:0"),
torch.rand(30, 20, device="cuda:0"),
torch.rand(30, 40, device="cuda:0").t(),
torch.rand(30, 20, device=GPU_TYPE),
torch.rand(40, 30, device=GPU_TYPE),
torch.rand(30, 20, device=GPU_TYPE),
torch.rand(30, 40, device=GPU_TYPE).t(),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
@requires_cuda_and_triton
@requires_gpu
@bin_ops
def test_2d_blocking_partitioning_elems(self, op):
"""2D blocking should be grouped by number of yelems"""
@ -819,21 +819,21 @@ class ForeachTests(TestCase):
def fn(a0, a1, a2, b0, b1, b2):
return op([a0, a1, a2], [b0, b1, b2])
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 20, device="cuda:0"),
torch.rand(30, 20, device="cuda:0"),
torch.rand(10, 30, device="cuda:0"),
torch.rand(20, 10, device="cuda:0").t(),
torch.rand(20, 30, device="cuda:0").t(),
torch.rand(30, 10, device="cuda:0").t(),
torch.rand(10, 20, device=GPU_TYPE),
torch.rand(30, 20, device=GPU_TYPE),
torch.rand(10, 30, device=GPU_TYPE),
torch.rand(20, 10, device=GPU_TYPE).t(),
torch.rand(20, 30, device=GPU_TYPE).t(),
torch.rand(30, 10, device=GPU_TYPE).t(),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
@requires_cuda_and_triton
@requires_gpu
@bin_ops
@torch._inductor.config.patch("combo_kernel_allow_mixed_sizes", 2)
def test_2d_blocking_partitioning_mixed_sizes(self, op):
@ -842,21 +842,21 @@ class ForeachTests(TestCase):
def fn(a0, a1, a2, b0, b1, b2):
return op([a0, a1, a2], [b0, b1, b2])
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(10, 20, device="cuda:0"),
torch.rand(30, 20, device="cuda:0"),
torch.rand(10, 30, device="cuda:0"),
torch.rand(20, 10, device="cuda:0").t(),
torch.rand(20, 30, device="cuda:0").t(),
torch.rand(30, 10, device="cuda:0").t(),
torch.rand(10, 20, device=GPU_TYPE),
torch.rand(30, 20, device=GPU_TYPE),
torch.rand(10, 30, device=GPU_TYPE),
torch.rand(20, 10, device=GPU_TYPE).t(),
torch.rand(20, 30, device=GPU_TYPE).t(),
torch.rand(30, 10, device=GPU_TYPE).t(),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@inplace_bin_ops
def test_reinplacing(self, op):
def fn(a0, a1, b0, b1):
@ -864,63 +864,63 @@ class ForeachTests(TestCase):
return [a0, a1]
inputs = (
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
)
self.check_model_cuda(fn, inputs, check_lowp=False)
self.check_model_gpu(fn, inputs, check_lowp=False)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@inplace_bin_ops
def test_reinplacing_mut_before(self, op):
def fn(a0, a1, b0, b1):
a0.add_(torch.ones(10, 10, device="cuda:0"))
a0.add_(torch.ones(10, 10, device=GPU_TYPE))
op([a0, a1], [b0, b1])
return [a0, a1]
inputs = (
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
)
self.check_model_cuda(fn, inputs, check_lowp=False)
self.check_model_gpu(fn, inputs, check_lowp=False)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@inplace_bin_ops
def test_reinplacing_mut_after(self, op):
def fn(a0, a1, b0, b1):
op([a0, a1], [b0, b1])
a0.add_(torch.ones(10, 10, device="cuda:0"))
a0.add_(torch.ones(10, 10, device=GPU_TYPE))
return [a0, a1]
inputs = (
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device="cuda:0"),
torch.rand(20, 20, device="cuda:0"),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
torch.rand(10, 10, device=GPU_TYPE),
torch.rand(20, 20, device=GPU_TYPE),
)
self.check_model_cuda(fn, inputs, check_lowp=False)
self.check_model_gpu(fn, inputs, check_lowp=False)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
def test_multi_device(self):
def test_foreach_add(a0, a1, b0, b1):
return torch._foreach_add([a0, a1], [b0, b1])
inps = [
torch.ones(10, 10, device="cuda"),
torch.ones(10, 10, device=GPU_TYPE),
torch.ones(20, 20, device="cpu"),
torch.zeros(10, 10, device="cuda"),
torch.zeros(10, 10, device=GPU_TYPE),
torch.zeros(20, 20, device="cpu"),
]
@ -930,13 +930,13 @@ class ForeachTests(TestCase):
self.assertEqual(out_eager, out_compiled)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
@requires_cuda_and_triton
@requires_gpu
def test_aliasing(self):
def test_foreach_add(a0, a1, a2, b0, b1, b2):
return torch._foreach_add_([a0, a1, a2], [b0, b1, b2])
input = torch.ones(10, 10, device="cuda")
input2 = torch.ones(10, 10, device="cuda")
input = torch.ones(10, 10, device=GPU_TYPE)
input2 = torch.ones(10, 10, device=GPU_TYPE)
inps = [
input,
input.view(10, 10),
@ -952,7 +952,7 @@ class ForeachTests(TestCase):
self.assertEqual(out_eager, out_compiled)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 4)
@requires_cuda_and_triton
@requires_gpu
@torch._inductor.config.patch("combo_kernel_allow_mixed_sizes", 1)
def test_2d_block_no_mixed_sizes_no_mask(self):
"""2D blocking with no mixed sizes constant mask"""
@ -960,21 +960,21 @@ class ForeachTests(TestCase):
def fn(a0, a1, a2, b0, b1, b2):
return torch._foreach_add([a0, a1, a2], [b0, b1, b2])
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(1024, 2048, device="cuda:0"),
torch.rand(2048, 2048, device="cuda:0"),
torch.rand(1024, 2048, device="cuda:0"),
torch.rand(2048, 1024, device="cuda:0").t(),
torch.rand(2048, 2048, device="cuda:0").t(),
torch.rand(2048, 1024, device="cuda:0").t(),
torch.rand(1024, 2048, device=GPU_TYPE),
torch.rand(2048, 2048, device=GPU_TYPE),
torch.rand(1024, 2048, device=GPU_TYPE),
torch.rand(2048, 1024, device=GPU_TYPE).t(),
torch.rand(2048, 2048, device=GPU_TYPE).t(),
torch.rand(2048, 1024, device=GPU_TYPE).t(),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
@requires_cuda_and_triton
@requires_gpu
@torch._inductor.config.patch("combo_kernel_allow_mixed_sizes", 2)
def test_2d_block_mixed_sizes_with_mask(self):
"""2D blocking with mixed sizes should have mask"""
@ -982,21 +982,21 @@ class ForeachTests(TestCase):
def fn(a0, a1, a2, b0, b1, b2):
return torch._foreach_add([a0, a1, a2], [b0, b1, b2])
self.check_model_cuda(
self.check_model_gpu(
fn,
(
torch.rand(1024, 2048, device="cuda:0"),
torch.rand(2048, 2048, device="cuda:0"),
torch.rand(1024, 2048, device="cuda:0"),
torch.rand(2048, 1024, device="cuda:0").t(),
torch.rand(2048, 2048, device="cuda:0").t(),
torch.rand(2048, 1024, device="cuda:0").t(),
torch.rand(1024, 2048, device=GPU_TYPE),
torch.rand(2048, 2048, device=GPU_TYPE),
torch.rand(1024, 2048, device=GPU_TYPE),
torch.rand(2048, 1024, device=GPU_TYPE).t(),
torch.rand(2048, 2048, device=GPU_TYPE).t(),
torch.rand(2048, 1024, device=GPU_TYPE).t(),
),
)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
@requires_cuda_and_triton
@requires_gpu
@foreach_map_bin_ops
def test_foreach_map_backward_binary(self, op):
from torch._dynamo.polyfills import foreach_map_fn
@ -1011,14 +1011,14 @@ class ForeachTests(TestCase):
ref_inps = (
[
torch.rand(10, 20, device="cuda:0", requires_grad=True),
torch.rand(10, 30, device="cuda:0", requires_grad=True),
torch.rand(30, 30, device="cuda:0", requires_grad=True),
torch.rand(10, 20, device=GPU_TYPE, requires_grad=True),
torch.rand(10, 30, device=GPU_TYPE, requires_grad=True),
torch.rand(30, 30, device=GPU_TYPE, requires_grad=True),
],
[
torch.rand(10, 20, device="cuda:0", requires_grad=True),
torch.rand(10, 30, device="cuda:0", requires_grad=True),
torch.rand(30, 30, device="cuda:0", requires_grad=True),
torch.rand(10, 20, device=GPU_TYPE, requires_grad=True),
torch.rand(10, 30, device=GPU_TYPE, requires_grad=True),
torch.rand(30, 30, device=GPU_TYPE, requires_grad=True),
],
)
inps = (
@ -1037,7 +1037,7 @@ class ForeachTests(TestCase):
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 5)
@requires_cuda_and_triton
@requires_gpu
def test_foreach_map_input_mutation(self):
def fn(xs, ys):
outs = foreach_map_add_inplace(xs, ys)
@ -1045,14 +1045,14 @@ class ForeachTests(TestCase):
ref_inps = (
[
torch.rand(10, 20, device="cuda:0", requires_grad=True),
torch.rand(10, 30, device="cuda:0", requires_grad=True),
torch.rand(30, 30, device="cuda:0", requires_grad=True),
torch.rand(10, 20, device=GPU_TYPE, requires_grad=True),
torch.rand(10, 30, device=GPU_TYPE, requires_grad=True),
torch.rand(30, 30, device=GPU_TYPE, requires_grad=True),
],
[
torch.rand(10, 20, device="cuda:0", requires_grad=True),
torch.rand(10, 30, device="cuda:0", requires_grad=True),
torch.rand(30, 30, device="cuda:0", requires_grad=True),
torch.rand(10, 20, device=GPU_TYPE, requires_grad=True),
torch.rand(10, 30, device=GPU_TYPE, requires_grad=True),
torch.rand(30, 30, device=GPU_TYPE, requires_grad=True),
],
)
# Set requires_grad to be False to avoid mutating a leaf variable
@ -1073,7 +1073,7 @@ class ForeachTests(TestCase):
):
_ = run_fw_bw_and_get_code(lambda: torch.compile(fn)(*inps))
@requires_cuda_and_triton
@requires_gpu
@foreach_map_un_ops
def test_foreach_map_backward_unary(self, op):
from torch._dynamo.polyfills import foreach_map_fn
@ -1087,9 +1087,9 @@ class ForeachTests(TestCase):
return outs[0].sum() + outs[1].sum() + outs[2].sum()
ref_inp = [
torch.rand(10, 20, device="cuda:0", requires_grad=True),
torch.rand(10, 30, device="cuda:0", requires_grad=True),
torch.rand(30, 30, device="cuda:0", requires_grad=True),
torch.rand(10, 20, device=GPU_TYPE, requires_grad=True),
torch.rand(10, 30, device=GPU_TYPE, requires_grad=True),
torch.rand(30, 30, device=GPU_TYPE, requires_grad=True),
]
inp = [x.clone().detach().requires_grad_(True) for x in ref_inp]
@ -1109,5 +1109,5 @@ class ForeachTests(TestCase):
if __name__ == "__main__":
from torch._inductor.test_case import run_tests
if HAS_CPU or HAS_CUDA_AND_TRITON:
if HAS_CPU or HAS_GPU:
run_tests(needs="filelock")

View File

@ -111,6 +111,7 @@ class MultiKernelTest(TestCase):
@requires_triton()
# TODO: bobrenjc93 to fix multi-kernel for ROCM
@skipIfRocm
@skipIfXpu
@unittest.skipIf(not IS_BIG_GPU, "templates require big gpu")
def test_triton_gemm(self):
def fn(x, y):
@ -139,6 +140,7 @@ class MultiKernelTest(TestCase):
@requires_triton()
# TODO: bobrenjc93 to fix multi-kernel for ROCM
@skipIfRocm
@skipIfXpu
@unittest.skipIf(not IS_BIG_GPU, "templates require big gpu")
def test_triton_relu_fused_gemm(self):
def fn(x, y):

View File

@ -74,7 +74,7 @@ class TestCase(InductorTestCase):
def run(op, args, kwargs):
return op(*args, **kwargs)
sample_inputs_itr = op.sample_inputs("cuda", dtype, requires_grad=False)
sample_inputs_itr = op.sample_inputs(GPU_TYPE, dtype, requires_grad=False)
for sample_input in sample_inputs_itr:
args = (sample_input.input,) + sample_input.args
kwargs = sample_input.kwargs
@ -307,7 +307,9 @@ class TestCase(InductorTestCase):
)
instantiate_device_type_tests(TestCase, globals(), only_for=("cuda",))
instantiate_device_type_tests(
TestCase, globals(), only_for=("cuda", "xpu"), allow_xpu=True
)
if __name__ == "__main__":
from torch._inductor.test_case import run_tests
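
instantiate_device_type_tests above now emits the test class for both backends; passing allow_xpu=True is what lets the "xpu" entry in only_for take effect. A minimal usage sketch (class and test names are illustrative):

import torch
from torch.testing._internal.common_device_type import instantiate_device_type_tests
from torch.testing._internal.common_utils import TestCase, run_tests

class DeviceSmokeTest(TestCase):
    def test_add(self, device):
        x = torch.ones(4, device=device)
        self.assertEqual((x + x).sum().item(), 8.0)

# Generates DeviceSmokeTestCUDA and DeviceSmokeTestXPU, each bound to its
# device string; without allow_xpu=True the XPU variant would be skipped.
instantiate_device_type_tests(
    DeviceSmokeTest, globals(), only_for=("cuda", "xpu"), allow_xpu=True
)

if __name__ == "__main__":
    run_tests()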

View File

@ -35,7 +35,7 @@ from torch._inductor.virtualized import V
from torch.fx.experimental.proxy_tensor import make_fx
from torch.testing import FileCheck
from torch.testing._internal.common_cuda import SM80OrLater, xfailIfSM89
from torch.testing._internal.common_device_type import expectedFailureXPU, skipCUDAIf
from torch.testing._internal.common_device_type import skipCUDAIf
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
IS_LINUX,
@ -139,7 +139,7 @@ class TestPatternMatcher(TestCase):
ref[indices], test[indices]
) # also checks that dtype is correct
@skipIfXpu
# @skipIfXpu
@skipCUDAIf(not SM80OrLater, "need sm_80")
@inductor_config.patch(
{
@ -239,7 +239,6 @@ class TestPatternMatcher(TestCase):
self.assertEqual(f(inp), f_replaced(inp))
self.assertEqual(count, 2)
@skipIfXpu
@skipCUDAIf(not SM80OrLater, "need sm_80")
@inductor_config.patch(
{
@ -288,7 +287,6 @@ class TestPatternMatcher(TestCase):
self._test_fused_int_mm_mul_impl(fn2, args, True)
@skipIfRocm
@skipIfXpu
@skipCUDAIf(not SM80OrLater, "need sm_80")
@inductor_config.patch(
{
@ -336,7 +334,6 @@ class TestPatternMatcher(TestCase):
"triton_tem" if not extern_mm else "extern_kernels.mm"
).run(code)
@expectedFailureXPU
@skipCUDAIf(not SM80OrLater, "need sm_80")
@inductor_config.patch(
{
@ -372,7 +369,6 @@ class TestPatternMatcher(TestCase):
for args in args_list:
self._test_mixed_impl(fn, args, True, False)
@expectedFailureXPU
@skipCUDAIf(not SM80OrLater, "need sm_80")
@inductor_config.patch(
{
@ -399,7 +395,6 @@ class TestPatternMatcher(TestCase):
)
self._test_mixed_impl(fn, args, True, False, rtol=0.16, atol=1e-4)
@expectedFailureXPU
@skipCUDAIf(not SM80OrLater, "need sm_80")
@inductor_config.patch(
{
@ -431,7 +426,6 @@ class TestPatternMatcher(TestCase):
for args in args_list:
self._test_mixed_impl(fn, args, True, False)
@expectedFailureXPU
@skipCUDAIf(not SM80OrLater, "need sm_80")
@inductor_config.patch(
{
@ -469,7 +463,6 @@ class TestPatternMatcher(TestCase):
for args in args_list:
self._test_mixed_impl(fn, args, True, False)
@expectedFailureXPU
@skipCUDAIf(not SM80OrLater, "need sm_80")
@unittest.skipIf(not IS_BIG_GPU, "templates require big gpu")
def test_mixed_mm_gating(self):

View File

@ -208,9 +208,9 @@ def associative_scan(
raise ValueError(
f"Combine_mode must either 'pointwise' or 'generic', but got {cm}"
)
if cm == "pointwise" and not all(l.device.type == "cuda" for l in lxs):
if cm == "pointwise" and not all(l.device.type in ["cuda", "xpu"] for l in lxs):
raise ValueError(
"For combine_mode='pointwise', all input tensors need to be on CUDA"
"For combine_mode='pointwise', all input tensors need to be on CUDA or XPU"
)
# Checks for xs
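
The relaxed guard above accepts XPU as well as CUDA tensors for combine_mode="pointwise". A standalone sketch of the same validation, kept separate from the real associative_scan entry point (the function name here is illustrative):

import torch

_POINTWISE_COMBINE_DEVICES = ("cuda", "xpu")

def check_pointwise_inputs(combine_mode: str, leaves: list) -> None:
    # Mirrors the guard above: the pointwise lowering only handles tensors
    # that live on a Triton-capable GPU backend.
    if combine_mode == "pointwise" and not all(
        t.device.type in _POINTWISE_COMBINE_DEVICES for t in leaves
    ):
        raise ValueError(
            "For combine_mode='pointwise', all input tensors need to be on CUDA or XPU"
        )

check_pointwise_inputs("generic", [torch.ones(3)])        # fine on CPU
# check_pointwise_inputs("pointwise", [torch.ones(3)])    # would raise on CPU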

View File

@ -46,6 +46,7 @@ from torch.testing._internal.common_utils import (
skipIfXpu,
TEST_WITH_TORCHDYNAMO,
)
from torch.testing._internal.inductor_utils import GPU_TYPE
from torch.utils._foreach_utils import _get_foreach_kernels_supported_devices
@ -369,7 +370,7 @@ def optim_inputs_func_adadelta(device, dtype=None):
OptimizerInput(
params=None, kwargs={"rho": 0.95, "weight_decay": 0.9}, desc="rho"
),
] + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
] + (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])
def optim_error_inputs_func_adadelta(device, dtype):
@ -569,7 +570,7 @@ def optim_inputs_func_adam(device, dtype=None):
desc="amsgrad",
),
]
+ (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
+ (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])
+ (mps_supported_configs if _get_device_type(device) == "mps" else [])
)
if dtype == torch.float16:
@ -650,7 +651,7 @@ def optim_error_inputs_func_adam(device, dtype):
error_regex=r"betas\[0\] as a Tensor is not supported for capturable=False and foreach=True",
),
]
if _get_device_type(device) == "cuda":
if _get_device_type(device) == GPU_TYPE:
sample_tensor = torch.empty((), device=device, dtype=dtype)
error_inputs += [
ErrorOptimizerInput(
@ -721,7 +722,7 @@ def optim_inputs_func_adamax(device, dtype=None):
kwargs={"weight_decay": 0.1, "maximize": True},
desc="maximize, weight_decay",
),
] + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
] + (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])
def optim_error_inputs_func_adamax(device, dtype):
@ -792,7 +793,7 @@ def optim_inputs_func_asgd(device, dtype=None):
kwargs={"weight_decay": 0.1, "maximize": True},
desc="maximize, nonzero weight_decay",
),
] + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
] + (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])
def optim_error_inputs_func_asgd(device, dtype):
@ -974,7 +975,7 @@ def optim_inputs_func_nadam(device, dtype=None):
kwargs={"weight_decay": 0.1, "maximize": True},
desc="maximize",
),
] + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
] + (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])
def optim_error_inputs_func_nadam(device, dtype):
@ -1052,7 +1053,7 @@ def optim_inputs_func_radam(device=None, dtype=None):
kwargs={"weight_decay": 0.1, "maximize": True},
desc="maximize",
),
] + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
] + (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])
def optim_error_inputs_func_radam(device, dtype):
@ -1137,7 +1138,7 @@ def optim_inputs_func_rmsprop(device, dtype=None):
},
desc="maximize, centered, weight_decay, w/ momentum",
),
] + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
] + (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])
def optim_error_inputs_func_rmsprop(device, dtype):
@ -1179,7 +1180,7 @@ def optim_inputs_func_rprop(device, dtype=None):
desc="non-default step_sizes",
),
OptimizerInput(params=None, kwargs={"maximize": True}, desc="maximize"),
] + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
] + (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])
def optim_error_inputs_func_rprop(device, dtype):
@ -1671,7 +1672,7 @@ optim_db: list[OptimizerInfo] = [
"maximize",
"capturable",
),
supports_fused_on=("cpu", "cuda", "mps"),
supports_fused_on=("cpu", "cuda", "xpu", "mps"),
decorators=(
# Expected floating point error between fused and compiled forloop
DecorateInfo(
@ -2161,6 +2162,7 @@ optim_db: list[OptimizerInfo] = [
supports_fused_on=(
"cpu",
"cuda",
"xpu",
"mps",
),
skips=(

View File

@ -77,6 +77,7 @@ HAS_XPU_AND_TRITON = torch.xpu.is_available() and HAS_TRITON
HAS_MPS = torch.mps.is_available()
HAS_GPU = HAS_CUDA_AND_TRITON or HAS_XPU_AND_TRITON
HAS_GPU_AND_TRITON = HAS_GPU
GPU_TYPE = get_gpu_type()
@ -173,7 +174,7 @@ IS_H100 = LazyVal(
and get_gpu_shared_memory() == 232448
)
IS_BIG_GPU = LazyVal(lambda: HAS_CUDA_AND_TRITON and is_big_gpu())
IS_BIG_GPU = LazyVal(lambda: HAS_GPU_AND_TRITON and is_big_gpu())
def dummy_graph() -> GraphLowering:
"""