Mirror of https://github.com/pytorch/pytorch.git (synced 2025-10-24 15:44:58 +08:00)

Compare commits: codex/add-... → ciflow/xpu (6 commits)

| Author | SHA1 | Date |
|---|---|---|
|  | fd73dfb96b |  |
|  | ecea269748 |  |
|  | 1db0f612b1 |  |
|  | 6a6d4fc0e9 |  |
|  | 003e55b2b8 |  |
|  | e9aff568a7 |  |

.github/workflows/xpu.yml (vendored, 20 changed lines)
@@ -59,14 +59,18 @@ jobs:
runner: linux.c7i.12xlarge
test-matrix: |
{ include: [
- { config: "default", shard: 1, num_shards: 8, runner: "linux.idc.xpu" },
- { config: "default", shard: 2, num_shards: 8, runner: "linux.idc.xpu" },
- { config: "default", shard: 3, num_shards: 8, runner: "linux.idc.xpu" },
- { config: "default", shard: 4, num_shards: 8, runner: "linux.idc.xpu" },
- { config: "default", shard: 5, num_shards: 8, runner: "linux.idc.xpu" },
- { config: "default", shard: 6, num_shards: 8, runner: "linux.idc.xpu" },
- { config: "default", shard: 7, num_shards: 8, runner: "linux.idc.xpu" },
- { config: "default", shard: 8, num_shards: 8, runner: "linux.idc.xpu" },
+ { config: "default", shard: 1, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 2, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 3, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 4, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 5, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 6, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 7, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 8, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 9, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 10, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 11, num_shards: 12, runner: "linux.idc.xpu" },
+ { config: "default", shard: 12, num_shards: 12, runner: "linux.idc.xpu" },
]}
secrets: inherit
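The workflow hunk above regrows the XPU test matrix from 8 to 12 shards; each `shard` / `num_shards` entry tells one runner which slice of the suite it owns. As a rough, hypothetical illustration of how such a pair is usually consumed (not the workflow's actual sharding code):

```python
# Hypothetical sketch: turning a (shard, num_shards) pair from a CI test
# matrix into the slice of tests one runner should execute.
def select_shard(test_files: list[str], shard: int, num_shards: int) -> list[str]:
    """Return the tests owned by 1-based shard `shard` out of `num_shards`."""
    assert 1 <= shard <= num_shards
    # Round-robin assignment keeps the shards roughly balanced without
    # needing per-test timing data.
    return sorted(test_files)[shard - 1 :: num_shards]


# Example: going from 8 to 12 shards shrinks each runner's share.
tests = [f"test_{i}.py" for i in range(96)]
print(len(select_shard(tests, shard=3, num_shards=8)))   # 12 tests per shard
print(len(select_shard(tests, shard=3, num_shards=12)))  # 8 tests per shard
```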
@@ -79,7 +79,12 @@ from torch.testing._internal.common_utils import (
TEST_WITH_ROCM,
)
from torch.testing._internal.custom_tensor import CustomTensorPlainOut
- from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU, IS_BIG_GPU
+ from torch.testing._internal.inductor_utils import (
+ GPU_TYPE,
+ HAS_GPU,
+ HAS_XPU_AND_TRITON,
+ IS_BIG_GPU,
+ )
from torch.testing._internal.logging_utils import LoggingTestCase, make_logging_test
from torch.testing._internal.triton_utils import requires_gpu
from torch.utils import _pytree as pytree

@@ -1543,7 +1548,9 @@ class AOTInductorTestsTemplate:
)

# scaled_dot_product_flash_attention
- @unittest.skipIf(not SM80OrLater, "bfloat16 only supported in sm80+")
+ @unittest.skipIf(
+ not HAS_XPU_AND_TRITON and not SM80OrLater, "bfloat16 only supported in sm80+"
+ )
def test_sdpa(self):
class Model(torch.nn.Module):
def __init__(self) -> None:
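The two hunks above show the pattern this PR applies across the Inductor test suite: import the device-generic helpers (GPU_TYPE, HAS_GPU, HAS_XPU_AND_TRITON) instead of hard-coding CUDA, and widen SM80-style skip conditions so XPU-with-Triton machines are not skipped. A minimal sketch of that device-agnostic pattern, assuming the internal inductor_utils helpers are importable (they are part of PyTorch's test internals, not a public API):

```python
# Minimal sketch of the device-agnostic test pattern used throughout this diff.
# GPU_TYPE resolves to "cuda" or "xpu" on the current machine; HAS_GPU gates
# tests that need any accelerator at all.
import unittest

import torch
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU


class DeviceAgnosticExample(unittest.TestCase):
    @unittest.skipIf(not HAS_GPU, "requires a GPU (CUDA or XPU)")
    def test_add_on_gpu(self):
        # torch.randn(..., device=GPU_TYPE) replaces torch.randn(...).cuda()
        x = torch.randn(8, device=GPU_TYPE)
        y = torch.randn(8, device=GPU_TYPE)
        torch.testing.assert_close((x + y).cpu(), x.cpu() + y.cpu())


if __name__ == "__main__":
    unittest.main()
```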
@@ -5574,8 +5581,8 @@ class AOTInductorTestsTemplate:
).run(code)

def test_aoti_debug_printing_model_inputs_codegen(self):
- if self.device != "cuda":
- raise unittest.SkipTest("requires CUDA")
+ if self.device != GPU_TYPE:
+ raise unittest.SkipTest("requires GPU")

class Model(torch.nn.Module):
def __init__(self):

@@ -5911,11 +5918,12 @@ class AOTInductorTestsTemplate:
example_inputs = (torch.randn(2, 128, 4096, device=self.device),)
self.check_model(Model(), example_inputs, dynamic_shapes={"x": {0: bs}})

+ @skipIfXpu(msg="Currently Profiling not enabled on XPU CI builds")
@requires_gpu
def test_d2h_copy(self):
# device to copy host should always have the same stride
- if "cuda" not in self.device:
- raise unittest.SkipTest("This test is only for CUDA")
+ if GPU_TYPE not in self.device:
+ raise unittest.SkipTest("This test is only for GPU")

class ToCpuModel(nn.Module):
def forward(self, x):

@@ -5939,7 +5947,7 @@ class AOTInductorTestsTemplate:
with torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU,
- torch.profiler.ProfilerActivity.CUDA,
+ getattr(torch.profiler.ProfilerActivity, GPU_TYPE.upper()),
],
) as prof:
true_res = aoti_model(input_tensor)

@@ -6367,8 +6375,8 @@ class AOTInductorTestsTemplate:
runner.free_inactive_constant_buffer()

def test_update_user_managed_buffer(self):
- if self.device != "cuda":
- raise unittest.SkipTest("requires CUDA")
+ if self.device != GPU_TYPE:
+ raise unittest.SkipTest("requires GPU")

class Model(torch.nn.Module):
def __init__(self, n, k, device):

@@ -6412,10 +6420,10 @@ class AOTInductorTestsTemplate:
"L__self___weight": torch.randn(N, K, device=self.device),
"L__self___bias": torch.randn(N, device=self.device),
}
- mem_before, _ = torch.cuda.mem_get_info(self.device)
+ mem_before, _ = getattr(torch, GPU_TYPE).mem_get_info(self.device)
# Do not use user managed_buffer, should have less free memory.
runner.update_constant_buffer(new_weights, True, False, False)
- mem_after, _ = torch.cuda.mem_get_info(self.device)
+ mem_after, _ = getattr(torch, GPU_TYPE).mem_get_info(self.device)
self.assertGreater(mem_before, mem_after)

runner.swap_constant_buffer()
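The buffer-update hunks above, like the profiler hunk earlier, swap hard-coded torch.cuda / ProfilerActivity.CUDA references for getattr lookups keyed on GPU_TYPE, so the same test body talks to whichever device module is active. A small sketch of that dispatch idea; it assumes a build where the matching device module exposes mem_get_info and the profiler defines the matching activity (true for recent CUDA builds and, for XPU, recent XPU-enabled builds):

```python
# Sketch of the getattr-based device dispatch used above. Assumes a PyTorch
# build where the corresponding device module (torch.cuda or torch.xpu) and
# profiler activity are available.
import torch
from torch.testing._internal.inductor_utils import GPU_TYPE

device_mod = getattr(torch, GPU_TYPE)           # torch.cuda or torch.xpu
free_before, total = device_mod.mem_get_info()  # same (free, total) API on both

activities = [
    torch.profiler.ProfilerActivity.CPU,
    # ProfilerActivity.CUDA or ProfilerActivity.XPU, picked at runtime.
    getattr(torch.profiler.ProfilerActivity, GPU_TYPE.upper()),
]
with torch.profiler.profile(activities=activities) as prof:
    torch.randn(1024, device=GPU_TYPE).sum()
free_after, _ = device_mod.mem_get_info()
```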
@@ -6447,10 +6455,10 @@ class AOTInductorTestsTemplate:
"L__self___weight": torch.randn(N, K, device=self.device),
"L__self___bias": torch.randn(N, device=self.device),
}
- mem_before, _ = torch.cuda.mem_get_info(self.device)
+ mem_before, _ = getattr(torch, GPU_TYPE).mem_get_info(self.device)
# Try user managed_buffer, should have same free memory.
runner.update_constant_buffer(new_weights, True, False, True)
- mem_after, _ = torch.cuda.mem_get_info(self.device)
+ mem_after, _ = getattr(torch, GPU_TYPE).mem_get_info(self.device)
self.assertEqual(mem_before, mem_after, atol=1e-3, rtol=1e-3)

runner.swap_constant_buffer()

@@ -6522,8 +6530,8 @@ class AOTInductorTestsTemplate:
"To enable after the C shim FC window ends",
)
def test_misaligned_input_1(self):
- if self.device != "cuda":
- raise unittest.SkipTest("CUDA test only")
+ if self.device != GPU_TYPE:
+ raise unittest.SkipTest("GPU test only")

class Model(torch.nn.Module):
def forward(self, x):

@@ -6549,8 +6557,8 @@ class AOTInductorTestsTemplate:
torch.testing.assert_close(actual, expected)

def test_misaligned_input_2(self):
- if self.device != "cuda":
- raise unittest.SkipTest("CUDA test only")
+ if self.device != GPU_TYPE:
+ raise unittest.SkipTest("GPU test only")

class Model(torch.nn.Module):
def forward(self, x):

@@ -7098,8 +7106,8 @@ class AOTInductorTestsTemplate:
self.check_model(Model(), example_inputs, dynamic_shapes=dynamic_shapes)

def test_sym_expr_indexing(self):
- if self.device != "cuda":
- raise unittest.SkipTest("requires CUDA")
+ if self.device != GPU_TYPE:
+ raise unittest.SkipTest("requires GPU")

class Repro(torch.nn.Module):
def __init__(self) -> None:

@@ -7117,7 +7125,7 @@ class AOTInductorTestsTemplate:
arange_1 = torch.ops.aten.arange.start(
180,
181,
- device=torch.device(type="cuda", index=0),
+ device=torch.device(type=GPU_TYPE, index=0),
pin_memory=False,
)
add_14 = torch.ops.aten.add.Tensor(arange_1, 198)

@@ -7636,8 +7644,6 @@ GPU_TEST_FAILURES = {
"test_quantized_linear_bias_none": fail_gpu(("cuda", "xpu")),
- # No scaled_dot_product_efficient_attention implementation for XPU yet.
- "test_scaled_dot_product_efficient_attention": fail_gpu(("xpu",)),
# No fft implementation for XPU yet.
"test_fft_c2c": fail_gpu(("xpu",), is_skip=True),
}

MPS_TEST_FAILURES = {
@@ -28,12 +28,7 @@ from torch.export.pt2_archive._package import (
load_weights_to_pt2_contents,
)
from torch.testing._internal.common_cuda import _get_torch_cuda_version
- from torch.testing._internal.common_utils import (
- IS_FBCODE,
- skipIfRocm,
- skipIfXpu,
- TEST_CUDA,
- )
+ from torch.testing._internal.common_utils import IS_FBCODE, skipIfRocm, skipIfXpu
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU


@@ -688,13 +683,13 @@ class TestAOTInductorPackage(TestCase):
self.assertEqual(loaded1(*example_inputs1), ep1.module()(*example_inputs1))
self.assertEqual(loaded2(*example_inputs2), ep2.module()(*example_inputs2))

- @unittest.skipIf(not TEST_CUDA, "requires cuda")
+ @unittest.skipIf(not HAS_GPU, "requires gpu")
def test_duplicate_calls(self):
options = {
"aot_inductor.package": True,
}

- device = "cuda"
+ device = GPU_TYPE

class Model1(torch.nn.Module):
def __init__(self) -> None:
@@ -9,7 +9,7 @@ import unittest

import torch
from torch._inductor import config
- from torch.testing._internal.common_utils import IS_LINUX, skipIfXpu
+ from torch.testing._internal.common_utils import IS_LINUX
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU


@@ -48,7 +48,6 @@ class TestKernelBestConfig(TestCase):
config.max_autotune = cls.original_max_autotune
super().tearDownClass()

- @skipIfXpu
def test_best_config_has_triton_cache_key(self):
with tempfile.TemporaryDirectory() as tmpdir:
os.environ["TORCHINDUCTOR_CACHE_DIR"] = tmpdir
@@ -68,6 +68,7 @@ from torch.testing._internal.inductor_utils import (
HAS_GPU,
HAS_MULTIGPU,
HAS_TRITON,
+ HAS_XPU_AND_TRITON,
patch_inductor_backend,
requires_gpu,
requires_triton,

@@ -1215,7 +1216,7 @@ class TestFxGraphCache(TestCase):
self.assertEqual(counters["inductor"]["fxgraph_cache_hit"], 1)
self.assertEqual(counters["inductor"]["fxgraph_lookup_write_file"], 1)

- @requires_cuda_and_triton
+ @requires_gpu()
@config.patch({"fx_graph_cache": True})
@config.patch({"fx_graph_remote_cache": False})
@with_tf32_off

@@ -1238,7 +1239,7 @@ class TestFxGraphCache(TestCase):
def fn2(q, k, v):
return flex_attention(q, k, v, score_mod=score_mod2, block_mask=block_mask)

- a, b, c = (torch.randn(1, 4, 512, 64).cuda() for _ in range(3))
+ a, b, c = (torch.randn(1, 4, 512, 64).to(GPU_TYPE) for _ in range(3))
compiled_fn = torch.compile(fn)
compiled_fn2 = torch.compile(fn2)
@@ -2923,8 +2924,8 @@ class TestAutotuneCache(TestCase):
for k in global_stats.triton.cache.keys():
self.assertRegex(k, r"triton:[0-9a-f]{64}::[0-9a-f]{64}:c[0-9]+")

- @requires_cuda_and_triton
- @unittest.skipIf(not SM80OrLater, "Requires SM80+")
+ @requires_gpu()
+ @unittest.skipIf(not HAS_XPU_AND_TRITON and not SM80OrLater, "Requires SM80+")
@config.patch({"fx_graph_cache": False})
@config.patch({"fx_graph_remote_cache": False})
@config.patch({"autotune_local_cache": False})

@@ -2942,10 +2943,10 @@ class TestAutotuneCache(TestCase):
def f(x, y, a, b):
return Model()(x, y, a, b)

- x = torch.randn(100, 100).cuda()
- y = torch.randn(100, 100).cuda()
- a = torch.randn(1000, 100).cuda()
- b = torch.randn(1000, 100).cuda()
+ x = torch.randn(100, 100).to(GPU_TYPE)
+ y = torch.randn(100, 100).to(GPU_TYPE)
+ a = torch.randn(1000, 100).to(GPU_TYPE)
+ b = torch.randn(1000, 100).to(GPU_TYPE)
f_compiled = torch.compile(f, fullgraph=True)

with PatchCaches():

@@ -2964,8 +2965,8 @@ class TestAutotuneCache(TestCase):
for k in global_stats.triton.cache.keys():
self.assertRegex(k, r"triton:[0-9a-f]{64}::[0-9a-f]{64}:c[0-9]+")

- @requires_cuda_and_triton
- @unittest.skipIf(not SM80OrLater, "Requires SM80+")
+ @requires_gpu()
+ @unittest.skipIf(not HAS_XPU_AND_TRITON and not SM80OrLater, "Requires SM80+")
@config.patch({"fx_graph_cache": False})
@config.patch({"fx_graph_remote_cache": False})
@config.patch({"autotune_local_cache": True})

@@ -2983,12 +2984,12 @@ class TestAutotuneCache(TestCase):

f_compiled = torch.compile(f, fullgraph=True)

- a = torch.randn(101, 100).cuda()
- b = torch.randn(101, 100).cuda()
- c = torch.randn(102, 100).cuda()
- d = torch.randn(102, 100).cuda()
- e = torch.randn(103, 100).cuda()
- f = torch.randn(103, 100).cuda()
+ a = torch.randn(101, 100).to(GPU_TYPE)
+ b = torch.randn(101, 100).to(GPU_TYPE)
+ c = torch.randn(102, 100).to(GPU_TYPE)
+ d = torch.randn(102, 100).to(GPU_TYPE)
+ e = torch.randn(103, 100).to(GPU_TYPE)
+ f = torch.randn(103, 100).to(GPU_TYPE)

with PatchCaches():
f_compiled(a, b, c, d, e, f)

@@ -3025,8 +3026,8 @@ class TestAutotuneCache(TestCase):
self.assertRegex(k, r"triton:[0-9a-f]{64}::[0-9a-f]{64}:c[0-9]+")

@requires_triton()
- @requires_cuda_and_triton
- @unittest.skipIf(not SM80OrLater, "Requires SM80+")
+ @requires_gpu()
+ @unittest.skipIf(not HAS_XPU_AND_TRITON and not SM80OrLater, "Requires SM80+")
@config.patch({"fx_graph_cache": False})
@config.patch({"fx_graph_remote_cache": False})
@config.patch({"bundled_autotune_remote_cache": False})

@@ -3089,8 +3090,8 @@ class TestAutotuneCache(TestCase):


class TestRemoteAOTAutogradCache(TestCase):
- @requires_cuda_and_triton
- @unittest.skipIf(not SM80OrLater, "Requires SM80+")
+ @requires_gpu()
+ @unittest.skipIf(not HAS_XPU_AND_TRITON and not SM80OrLater, "Requires SM80+")
@config.patch({"fx_graph_cache": False})
@config.patch({"fx_graph_remote_cache": True})
@torch._functorch.config.patch({"enable_autograd_cache": False})

@@ -3100,8 +3101,8 @@ class TestRemoteAOTAutogradCache(TestCase):
return a + b

f_compiled = torch.compile(f)
- a = torch.randn(101, 100, device="cuda", requires_grad=False)
- b = torch.randn(101, 100, device="cuda", requires_grad=False)
+ a = torch.randn(101, 100, device=GPU_TYPE, requires_grad=False)
+ b = torch.randn(101, 100, device=GPU_TYPE, requires_grad=False)
with PatchCaches():
f_compiled(a, b)

@@ -3128,8 +3129,8 @@ class TestRemoteAOTAutogradCache(TestCase):
for k in global_stats.fx_graph.cache.keys():
self.assertRegex(k, r"pt2:fx-graph-v1::[0-9a-z]{52}:c[0-9]+")

- @requires_cuda_and_triton
- @unittest.skipIf(not SM80OrLater, "Requires SM80+")
+ @requires_gpu()
+ @unittest.skipIf(not HAS_XPU_AND_TRITON and not SM80OrLater, "Requires SM80+")
@config.patch({"fx_graph_cache": False})
@config.patch({"fx_graph_remote_cache": True})
@torch._functorch.config.patch({"enable_autograd_cache": False})

@@ -3203,7 +3204,7 @@ class TestUtils(TestCase):

# This combination of settings exposed a bug where we cleared the
# PyCodeCache disk artifacts while they were still needed:
- @requires_cuda_and_triton
+ @requires_gpu()
@config.patch(
{
"coordinate_descent_tuning": True,

@@ -3212,9 +3213,9 @@
)
def test_force_disable_coordinate_descent(self):
def fn():
- inp = torch.randn(32, 50, 768, device="cuda")
- weight = torch.randn(768, 768, device="cuda")
- layer = torch.nn.LayerNorm(768, device="cuda")
+ inp = torch.randn(32, 50, 768, device=GPU_TYPE)
+ weight = torch.randn(768, 768, device=GPU_TYPE)
+ layer = torch.nn.LayerNorm(768, device=GPU_TYPE)
return layer(inp @ weight)

torch.compile(fn)()
@@ -201,14 +201,17 @@ KERNEL_COUNT_OVERRIDES = {
"test_adamw_amsgrad_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
+ "test_adamw_amsgrad_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adamw_tensor_lr_tensor_betas_amsgrad_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
- "test_adamw_tensor_lr_tensor_betas_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
+ "test_adamw_tensor_lr_tensor_betas_amsgrad_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
+ "test_adamw_tensor_lr_tensor_betas_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
+ "test_adamw_tensor_lr_tensor_betas_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adamw_tensor_lr_amsgrad_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adamw_tensor_lr_amsgrad_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adam_tensor_lr_amsgrad_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adam_tensor_lr_amsgrad_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adam_tensor_lr_tensor_betas_amsgrad_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adam_tensor_lr_tensor_betas_amsgrad_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adam_tensor_lr_tensor_betas_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adam_tensor_lr_tensor_betas_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adam_amsgrad_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adam_amsgrad_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_adadelta_tensor_lr_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),

@@ -247,9 +250,9 @@ KERNEL_COUNT_OVERRIDES = {
"test_adamax_tensor_lr_weight_decay_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_adamax_tensor_lr_weight_decay_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_asgd_tensor_lr_weight_decay_maximize_capturable_cuda": lambda x: assert_expected_inline(x, """5"""),
- "test_asgd_tensor_lr_weight_decay_maximize_capturable_xpu": lambda x: assert_expected_inline(x, """8"""),
+ "test_asgd_tensor_lr_weight_decay_maximize_capturable_xpu": lambda x: assert_expected_inline(x, """5"""),
"test_nadam_tensor_lr_weight_decay_momentum_decay_decoupled_weight_decay_capturable_cuda": lambda x: assert_expected_inline(x, """6"""),  # noqa: B950
- "test_nadam_tensor_lr_weight_decay_momentum_decay_decoupled_weight_decay_capturable_xpu": lambda x: assert_expected_inline(x, """9"""),  # noqa: B950
+ "test_nadam_tensor_lr_weight_decay_momentum_decay_decoupled_weight_decay_capturable_xpu": lambda x: assert_expected_inline(x, """6"""),  # noqa: B950
"test_radam_tensor_lr_capturable_weight_decay_decoupled_weight_decay_cuda": lambda x: assert_expected_inline(x, """6"""),
"test_radam_tensor_lr_capturable_weight_decay_decoupled_weight_decay_xpu": lambda x: assert_expected_inline(x, """6"""),
"test_sgd_tensor_lr_cpu": lambda x: assert_expected_inline(x, """2"""),

@@ -436,7 +439,7 @@ def make_test(
closure=None,
scheduler_cls=None,
kernel_count=2,
- device="cuda",
+ device=GPU_TYPE,
**kwargs,
):
@config.patch("score_fusion_memory_threshold", 1)
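The compiled-optimizer hunks above add XPU variants to KERNEL_COUNT_OVERRIDES and align two XPU counts with their CUDA counterparts; each entry maps a generated test name to a check on how many Inductor kernels that optimizer step is expected to compile into. A hypothetical sketch of how such an override table can be consumed (illustrative only, not the repository's actual harness):

```python
# Hypothetical consumer of a KERNEL_COUNT_OVERRIDES-style table: look up the
# expected kernel count for a generated test name, falling back to a default.
from typing import Callable, Dict


def check_expected(actual: int, expected: int) -> None:
    assert actual == expected, f"expected {expected} kernels, got {actual}"


KERNEL_COUNT_OVERRIDES: Dict[str, Callable[[int], None]] = {
    # XPU variant pinned to the same count as its CUDA counterpart.
    "test_adamw_amsgrad_capturable_xpu": lambda n: check_expected(n, 6),
}


def verify_kernel_count(test_name: str, generated_kernels: int, default: int = 2) -> None:
    check = KERNEL_COUNT_OVERRIDES.get(test_name, lambda n: check_expected(n, default))
    check(generated_kernels)


# Example: an XPU optimizer test that compiled into 6 kernels passes the check.
verify_kernel_count("test_adamw_amsgrad_capturable_xpu", 6)
```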
@@ -18,7 +18,7 @@ from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
parametrize,
)
- from torch.testing._internal.inductor_utils import HAS_CUDA_AND_TRITON
+ from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU


class TestingHeuristics(InductorChoices):

@@ -176,7 +176,7 @@ class CooperativeReductionTests(TestCase):
return reduction_fn(x + y, dim=-1)

reduction_fn = getattr(torch, name)
- args = [torch.randn(1, 1024**2, device="cuda", dtype=dtype) for _ in range(2)]
+ args = [torch.randn(1, 1024**2, device=GPU_TYPE, dtype=dtype) for _ in range(2)]
self.run_and_check(fn, args, dtype)

def test_bool_reduction_fns(self):

@@ -190,7 +190,7 @@ class CooperativeReductionTests(TestCase):
torch.all(x > y),
]

- args = [torch.randn(1024, device="cuda") for _ in range(2)]
+ args = [torch.randn(1024, device=GPU_TYPE) for _ in range(2)]
source_code = self.run_and_check(fn, args)
if "async_compile.multi_kernel" in source_code:
return

@@ -204,7 +204,7 @@ class CooperativeReductionTests(TestCase):
def fn(x):
return x.mean(), x.std() + x.min()

- args = [torch.randn([bs, count], device="cuda")]
+ args = [torch.randn([bs, count], device=GPU_TYPE)]
self.run_and_check(fn, args)

def test_chained_reductions(self):

@@ -213,18 +213,19 @@ class CooperativeReductionTests(TestCase):
x = x + torch.softmax(x, 1)
return x

- args = [torch.randn(4, 100000, device="cuda")]
+ args = [torch.randn(4, 100000, device=GPU_TYPE)]
source_code = self.run_and_check(fn, args)
if "async_compile.multi_kernel" in source_code:
return

# With online softmax, the computation of max and sum are done
# jointly and they share a single barrier call.
- expected_num_barrier = 8 if config.online_softmax else 16
+ # XPU doesn't support online softmax yet.
+ expected_num_barrier = 8 if config.online_softmax and GPU_TYPE != "xpu" else 16
self.assertEqual(
source_code.count("triton_helpers.x_grid_barrier"), expected_num_barrier
)
- self.assertEqual(source_code.count("empty_strided_cuda"), 5)
+ self.assertEqual(source_code.count(f"empty_strided_{GPU_TYPE}"), 5)

def test_reduce_split(self):
def fn(a, b):

@@ -233,8 +234,8 @@ class CooperativeReductionTests(TestCase):
return a1, b1

inps = [
- torch.rand(2048, 512, device="cuda"),
- torch.rand(20, 20, device="cuda"),
+ torch.rand(2048, 512, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
]
self.run_and_check(fn, inps, expect_kernel_count=2)

@@ -290,7 +291,7 @@ class TestFixedConfigs(TestCase):
def fn(x):
return torch.softmax(x + 1, dim=-1) + x

- args = [torch.randn(8, 8000, device="cuda")]
+ args = [torch.randn(8, 8000, device=GPU_TYPE)]
self._check(fn, args, persistent=persistent, cooperative=cooperative, cfg=cfg)

@parametrize(

@@ -315,7 +316,7 @@ class TestFixedConfigs(TestCase):
cfg = {"XBLOCK": 64, "RSPLIT": rsplit, "num_warps": 8}
if not persistent:
cfg["R0_BLOCK"] = 64
- args = [torch.randn(x, r, device="cuda")]
+ args = [torch.randn(x, r, device=GPU_TYPE)]
self._check(fn, args, persistent=persistent, cfg=cfg)

@parametrize("persistent", [True, False])

@@ -335,8 +336,8 @@ class TestFixedConfigs(TestCase):
args = [
torch.stack(
[
- torch.arange(10, 4096, device="cuda"),
- -torch.arange(10, 4096, device="cuda"),
+ torch.arange(10, 4096, device=GPU_TYPE),
+ -torch.arange(10, 4096, device=GPU_TYPE),
]
)
]

@@ -346,12 +347,12 @@ class TestFixedConfigs(TestCase):
[
torch.tensor(
[0.0] * 150 + [float("inf")] * 150,
- device="cuda",
+ device=GPU_TYPE,
dtype=torch.float32,
),
torch.tensor(
[0.0] * 150 + [-float("inf")] * 150,
- device="cuda",
+ device=GPU_TYPE,
dtype=torch.float32,
),
]

@@ -374,12 +375,12 @@ class TestFixedConfigs(TestCase):
cfg = {"XBLOCK": 128, "RSPLIT": rsplit, "num_warps": 16, "num_stages": 1}
if not persistent:
cfg["R0_BLOCK"] = 64
- args = [torch.randn(1024, device="cuda") for _ in range(2)]
+ args = [torch.randn(1024, device=GPU_TYPE) for _ in range(2)]
self._check(fn, args, persistent=persistent, cfg=cfg)


if __name__ == "__main__":
from torch._dynamo.test_case import run_tests

- if HAS_CUDA_AND_TRITON:
+ if HAS_GPU:
run_tests(needs="filelock")
@@ -14,8 +14,8 @@ from torch.testing._internal.common_utils import (
IS_FBCODE,
parametrize,
)
- from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA_AND_TRITON
- from torch.testing._internal.triton_utils import requires_cuda_and_triton
+ from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, HAS_GPU
+ from torch.testing._internal.triton_utils import requires_gpu
from torch.utils._pytree import tree_flatten


@@ -23,11 +23,11 @@ aten = torch.ops.aten

try:
try:
- from .test_torchinductor import check_model, check_model_cuda
+ from .test_torchinductor import check_model, check_model_gpu
except ImportError:
from test_torchinductor import (  # @manual=fbcode//caffe2/test/inductor:test_inductor-library
check_model,
- check_model_cuda,
+ check_model_gpu,
)
except (unittest.SkipTest, ImportError) as e:
sys.stderr.write(f"{type(e)}: {e}\n")

@@ -188,30 +188,30 @@ decomp_ops = parametrize("op", compose_ops, name_fn=lambda f: f.__name__)
def gen_args(op):
if op in un_ops_under_test:
return (
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
)
elif op in bin_ops_under_test:
return (
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
)
else:
return (
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
)


@instantiate_parametrized_tests
class ForeachTests(TestCase):
- check_model_cuda = check_model_cuda
+ check_model_gpu = check_model_gpu
check_model_cpu = check_model
check_kernel_count = True

@@ -239,7 +239,7 @@ class ForeachTests(TestCase):
def fn(a0, a1, b0, b1, c0, c1):
return op([a0, a1], [b0, b1], [c0, c1])

- self.check_model_cuda(
+ self.check_model_gpu(
fn,
gen_args(op),
)

@@ -248,50 +248,50 @@ class ForeachTests(TestCase):
def fn(a0, a1):
return op([a0, a1], 3.3)

- self.check_model_cuda(
+ self.check_model_gpu(
fn,
(
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
),
)

def _test_single_scalar_tensor(self, op):
def fn(a0, a1):
- return op([a0, a1], torch.tensor(3.3, device="cuda:0"))
+ return op([a0, a1], torch.tensor(3.3, device=GPU_TYPE))

- self.check_model_cuda(
+ self.check_model_gpu(
fn,
(
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
),
)

# called in test_cuda_cpp_wrapper.py
- @requires_cuda_and_triton
+ @requires_gpu
def test_foreach_cpp_wrapper_cuda(self):
self._test_single_list(op=torch._foreach_add)

- @requires_cuda_and_triton
+ @requires_gpu
@all_ops
def test_single_list(self, op):
self._test_single_list(op)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

- @requires_cuda_and_triton
+ @requires_gpu
@scalar_bin_ops
def test_single_scalar(self, op):
self._test_single_scalar(op)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

- @requires_cuda_and_triton
+ @requires_gpu
@scalar_tensor_bin_ops
def test_single_scalar_tensor(self, op):
self._test_single_scalar_tensor(op)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

- @requires_cuda_and_triton
+ @requires_gpu
@all_ops
def test_scheduler_fusion_list(self, op):
if op in un_ops_under_test:
@@ -312,31 +312,31 @@ class ForeachTests(TestCase):
c = op([a0, a1], [b0, b1], [c0, c1])
return c, torch._foreach_add([a0, a1], c)

- self.check_model_cuda(
+ self.check_model_gpu(
fn,
gen_args(op),
)

self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

- @requires_cuda_and_triton
+ @requires_gpu
@scalar_bin_ops
def test_scheduler_fusion_scalar(self, op):
def fn(a0, a1):
c = op([a0, a1], 3.4)
return c, torch._foreach_add([a0, a1], c)

- self.check_model_cuda(
+ self.check_model_gpu(
fn,
(
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
),
)

self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

- @requires_cuda_and_triton
+ @requires_gpu
@scalar_bin_ops
def test_broadcasting(self, op):
def fn(a0, a1, b0, b1):

@@ -345,17 +345,17 @@ class ForeachTests(TestCase):
fn_opt = torch.compile(fn)

inputs = (
- torch.rand(10, 1, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
- torch.rand(1, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
+ torch.rand(10, 1, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
+ torch.rand(1, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
)
actual = fn_opt(*inputs)
expected = fn(*inputs)
self.assertEqual(actual, expected)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

- @requires_cuda_and_triton
+ @requires_gpu
@all_ops
def test_singleton_lists(self, op):
if op in un_ops_under_test:

@@ -363,15 +363,15 @@ class ForeachTests(TestCase):
def fn(a0):
return op([a0])

- args = (torch.rand(10, 10, device="cuda:0"),)
+ args = (torch.rand(10, 10, device=GPU_TYPE),)
elif op in bin_ops_under_test:

def fn(a0, b0):
return op([a0], [b0])

args = (
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(10, 10, device="cuda:0"),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(10, 10, device=GPU_TYPE),
)

else:

@@ -380,19 +380,19 @@ class ForeachTests(TestCase):
return op([a0], [b0], [c0])

args = (
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(10, 10, device="cuda:0"),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(10, 10, device=GPU_TYPE),
)

- self.check_model_cuda(
+ self.check_model_gpu(
fn,
args,
)

self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

- @requires_cuda_and_triton
+ @requires_gpu
@bin_ops
def test_type_promotion(self, op):
def fn(a0, a1, b0, b1):

@@ -403,17 +403,17 @@ class ForeachTests(TestCase):
max32 = torch.iinfo(torch.int32).max
max64 = torch.iinfo(torch.int64).max
inputs = (
- torch.randint(max32, (10, 10), device="cuda:0", dtype=torch.int32),
- torch.randint(max32, (20, 20), device="cuda:0", dtype=torch.int32),
- torch.randint(max32, (10, 10), device="cuda:0", dtype=torch.int32),
- torch.randint(max64, (20, 20), device="cuda:0", dtype=torch.int64),
+ torch.randint(max32, (10, 10), device=GPU_TYPE, dtype=torch.int32),
+ torch.randint(max32, (20, 20), device=GPU_TYPE, dtype=torch.int32),
+ torch.randint(max32, (10, 10), device=GPU_TYPE, dtype=torch.int32),
+ torch.randint(max64, (20, 20), device=GPU_TYPE, dtype=torch.int64),
)
actual = fn_opt(*inputs)
expected = fn(*inputs)
self.assertEqual(actual, expected)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

- @requires_cuda_and_triton
+ @requires_gpu
@scalar_bin_ops
def test_kernel_split_arg_limit_list(self, op):
# NB: foeach_copy won't pass this test because it will dce one set of buffers

@@ -426,8 +426,8 @@ class ForeachTests(TestCase):
max_args = 370
max_list_len = (max_args // 3) + 1
inputs = (
- [torch.rand(10, 10, device="cuda:0") for _ in range(max_list_len)],
- [torch.rand(10, 10, device="cuda:0") for _ in range(max_list_len)],
+ [torch.rand(10, 10, device=GPU_TYPE) for _ in range(max_list_len)],
+ [torch.rand(10, 10, device=GPU_TYPE) for _ in range(max_list_len)],
)

actual = fn_opt(*inputs)

@@ -435,7 +435,7 @@ class ForeachTests(TestCase):
self.assertEqual(actual, expected)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)

- @requires_cuda_and_triton
+ @requires_gpu
@scalar_bin_ops
@unittest.skip(
"Triton recursion depth exceeded: https://github.com/triton-lang/triton/issues/1763"

@@ -448,27 +448,27 @@ class ForeachTests(TestCase):

max_args = 370
max_list_len = (max_args // 2) + 1
- inputs = ([torch.rand(10, 10, device="cuda:0") for _ in range(max_list_len)],)
+ inputs = ([torch.rand(10, 10, device=GPU_TYPE) for _ in range(max_list_len)],)

actual = fn_opt(*inputs)
expected = fn(*inputs)
self.assertEqual(actual, expected)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)

- @requires_cuda_and_triton
+ @requires_gpu
@bin_ops
def test_fusion_duplicate_buffer_list(self, op):
def fn(a0, a1, b0, b1):
c = op([a0, a1], [b0, b1])
return op([a0, b0], [c[0], c[0]])

- self.check_model_cuda(
+ self.check_model_gpu(
fn,
(
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
),
reference_in_float=False,
check_lowp=False,
@@ -479,7 +479,7 @@ class ForeachTests(TestCase):
kernel_count = 2
self.assertEqual(torch._inductor.metrics.generated_kernel_count, kernel_count)

- @requires_cuda_and_triton
+ @requires_gpu
@all_ops
def test_non_foreach_consumer_list(self, op):
if op in un_ops_under_test:

@@ -500,31 +500,31 @@ class ForeachTests(TestCase):
c = op([a0, a1], [b0, b1], [c0, c1])
return torch.mul(c[0], a0)

- self.check_model_cuda(
+ self.check_model_gpu(
fn,
gen_args(op),
)

self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

- @requires_cuda_and_triton
+ @requires_gpu
@scalar_bin_ops
def test_non_foreach_consumer_scalar(self, op):
def fn(a0, a1):
c = op([a0, a1], 4.7)
return torch.mul(c[0], a0)

- self.check_model_cuda(
+ self.check_model_gpu(
fn,
(
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
),
)

self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

- @requires_cuda_and_triton
+ @requires_gpu
@all_ops
def test_non_foreach_producer_list(self, op):
if op in un_ops_under_test:

@@ -548,13 +548,13 @@ class ForeachTests(TestCase):
c1 = torch.add(a1, b1)
return op([a0, a1], [b0, b1], [c0, c1])

- self.check_model_cuda(
+ self.check_model_gpu(
fn, gen_args(op), reference_in_float=False, check_lowp=False
)

self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

- @requires_cuda_and_triton
+ @requires_gpu
@scalar_bin_ops
def test_non_foreach_producer_scalar(self, op):
def fn(a0, a1, b0, b1):

@@ -562,19 +562,19 @@ class ForeachTests(TestCase):
c1 = torch.mul(a1, b1)
return op([c0, c1], 5.6)

- self.check_model_cuda(
+ self.check_model_gpu(
fn,
(
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
),
)

self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

- @requires_cuda_and_triton
+ @requires_gpu
@all_ops
def test_non_foreach_consumer_producer_list(self, op):
if op in un_ops_under_test:

@@ -607,7 +607,7 @@ class ForeachTests(TestCase):
e1 = torch.mul(d[1], a1)
return [e0, e1]

- self.check_model_cuda(
+ self.check_model_gpu(
fn,
gen_args(op),
reference_in_float=False,

@@ -616,7 +616,7 @@ class ForeachTests(TestCase):

self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

- @requires_cuda_and_triton
+ @requires_gpu
@scalar_bin_ops
def test_non_foreach_consumer_producer_scalar(self, op):
def fn(a0, a1, b0, b1):

@@ -627,13 +627,13 @@ class ForeachTests(TestCase):
e1 = torch.mul(d[1], a1)
return [e0, e1]

- self.check_model_cuda(
+ self.check_model_gpu(
fn,
(
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
),
reference_in_float=False,
check_lowp=False,

@@ -641,7 +641,7 @@ class ForeachTests(TestCase):

self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

- @requires_cuda_and_triton
+ @requires_gpu
@bin_ops
@torch._dynamo.config.patch("automatic_dynamic_shapes", False)
@torch._dynamo.config.patch("assume_static_by_default", False)

@@ -651,17 +651,17 @@ class ForeachTests(TestCase):
return op([a0, a1], [b0, b1])

inputs = (
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
)

- self.check_model_cuda(fn, inputs)
+ self.check_model_gpu(fn, inputs)

self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)

- @requires_cuda_and_triton
+ @requires_gpu
@torch._dynamo.config.patch("automatic_dynamic_shapes", False)
@torch._dynamo.config.patch("assume_static_by_default", False)
@torch._inductor.config.patch("combo_kernel_foreach_dynamic_shapes", True)

@@ -670,17 +670,17 @@ class ForeachTests(TestCase):
return op([a0, a1], [b0, b1])

inputs = (
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
)

- self.check_model_cuda(fn, inputs)
+ self.check_model_gpu(fn, inputs)

self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

- @requires_cuda_and_triton
+ @requires_gpu
@torch._dynamo.config.patch("automatic_dynamic_shapes", False)
@torch._dynamo.config.patch("assume_static_by_default", False)
@torch._inductor.config.patch("combo_kernel_foreach_dynamic_shapes", True)

@@ -690,13 +690,13 @@ class ForeachTests(TestCase):
return op([a0, a1], [b0, b1])

inputs = (
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
)

- self.check_model_cuda(fn, inputs)
+ self.check_model_gpu(fn, inputs)

@unittest.skipIf(IS_FBCODE, "cpp compile not supported in fbcode")
@bin_ops

@@ -715,27 +715,27 @@ class ForeachTests(TestCase):

self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)

- @requires_cuda_and_triton
+ @requires_gpu
@decomp_ops
def test_decomp(self, op):
def fn(a0, a1, b0, b1, c0, c1):
return op([a0, a1], [b0, b1], [c0, c1], value=0.5)

- self.check_model_cuda(
+ self.check_model_gpu(
fn,
(
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(20, 20, device="cuda:0"),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(20, 20, device=GPU_TYPE),
),
)

self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

- @requires_cuda_and_triton
+ @requires_gpu
def test_fuse_concat(self):
def fn(x1, x2, x3, w1, w2, w3):
x = torch.stack([x1, x2, x3])
@@ -745,73 +745,73 @@ class ForeachTests(TestCase):

return y

- x1 = torch.randn(5, 4).cuda()
+ x1 = torch.randn(5, 4).to(GPU_TYPE)
x2 = x1 + 1
x3 = x1 + 2
- w1 = torch.randn(4, 3).cuda()
+ w1 = torch.randn(4, 3).to(GPU_TYPE)
w2 = w1 + 1
w3 = w1 + 2

args = (x1, x2, x3, w1, w2, w3)

- self.check_model_cuda(fn, args)
+ self.check_model_gpu(fn, args)

self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)

- @requires_cuda_and_triton
+ @requires_gpu
def test_zero_elems(self):
def fn(a0, a1, b0, b1):
return torch._foreach_add([a0, a1], [b0, b1])

- self.check_model_cuda(
+ self.check_model_gpu(
fn,
(
- torch.rand(0, device="cuda:0"),
- torch.rand(10, 10, device="cuda:0"),
- torch.rand(0, device="cuda:0"),
- torch.rand(10, 10, device="cuda:0"),
+ torch.rand(0, device=GPU_TYPE),
+ torch.rand(10, 10, device=GPU_TYPE),
+ torch.rand(0, device=GPU_TYPE),
+ torch.rand(10, 10, device=GPU_TYPE),
),
)

self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

- @requires_cuda_and_triton
+ @requires_gpu
@bin_ops
def test_2d_blocking(self, op):
def fn(a0, a1, b0, b1):
return op([a0, a1], [b0, b1])

- self.check_model_cuda(
+ self.check_model_gpu(
fn,
(
- torch.rand(10, 40, device="cuda:0"),
- torch.rand(10, 30, device="cuda:0"),
- torch.rand(40, 10, device="cuda:0").t(),
- torch.rand(30, 10, device="cuda:0").t(),
+ torch.rand(10, 40, device=GPU_TYPE),
+ torch.rand(10, 30, device=GPU_TYPE),
+ torch.rand(40, 10, device=GPU_TYPE).t(),
+ torch.rand(30, 10, device=GPU_TYPE).t(),
),
)

self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

- @requires_cuda_and_triton
+ @requires_gpu
@bin_ops
def test_2d_blocking_partitioning(self, op):
def fn(a0, a1, b0, b1):
return op([a0, a1], [b0, b1])

- self.check_model_cuda(
+ self.check_model_gpu(
fn,
(
- torch.rand(30, 20, device="cuda:0"),
- torch.rand(40, 30, device="cuda:0"),
- torch.rand(30, 20, device="cuda:0"),
- torch.rand(30, 40, device="cuda:0").t(),
+ torch.rand(30, 20, device=GPU_TYPE),
+ torch.rand(40, 30, device=GPU_TYPE),
+ torch.rand(30, 20, device=GPU_TYPE),
+ torch.rand(30, 40, device=GPU_TYPE).t(),
),
)

self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)

- @requires_cuda_and_triton
+ @requires_gpu
@bin_ops
def test_2d_blocking_partitioning_elems(self, op):
"""2D blocking should be grouped by number of yelems"""

@@ -819,21 +819,21 @@ class ForeachTests(TestCase):
def fn(a0, a1, a2, b0, b1, b2):
return op([a0, a1, a2], [b0, b1, b2])

- self.check_model_cuda(
+ self.check_model_gpu(
fn,
(
- torch.rand(10, 20, device="cuda:0"),
- torch.rand(30, 20, device="cuda:0"),
- torch.rand(10, 30, device="cuda:0"),
- torch.rand(20, 10, device="cuda:0").t(),
- torch.rand(20, 30, device="cuda:0").t(),
- torch.rand(30, 10, device="cuda:0").t(),
+ torch.rand(10, 20, device=GPU_TYPE),
+ torch.rand(30, 20, device=GPU_TYPE),
+ torch.rand(10, 30, device=GPU_TYPE),
+ torch.rand(20, 10, device=GPU_TYPE).t(),
+ torch.rand(20, 30, device=GPU_TYPE).t(),
+ torch.rand(30, 10, device=GPU_TYPE).t(),
),
)

self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)

- @requires_cuda_and_triton
+ @requires_gpu
@bin_ops
@torch._inductor.config.patch("combo_kernel_allow_mixed_sizes", 2)
def test_2d_blocking_partitioning_mixed_sizes(self, op):

@@ -842,21 +842,21 @@ class ForeachTests(TestCase):
def fn(a0, a1, a2, b0, b1, b2):
return op([a0, a1, a2], [b0, b1, b2])

- self.check_model_cuda(
+ self.check_model_gpu(
fn,
(
- torch.rand(10, 20, device="cuda:0"),
- torch.rand(30, 20, device="cuda:0"),
- torch.rand(10, 30, device="cuda:0"),
- torch.rand(20, 10, device="cuda:0").t(),
- torch.rand(20, 30, device="cuda:0").t(),
- torch.rand(30, 10, device="cuda:0").t(),
+ torch.rand(10, 20, device=GPU_TYPE),
+ torch.rand(30, 20, device=GPU_TYPE),
+ torch.rand(10, 30, device=GPU_TYPE),
+ torch.rand(20, 10, device=GPU_TYPE).t(),
+ torch.rand(20, 30, device=GPU_TYPE).t(),
+ torch.rand(30, 10, device=GPU_TYPE).t(),
),
)

self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

- @requires_cuda_and_triton
+ @requires_gpu
@inplace_bin_ops
def test_reinplacing(self, op):
def fn(a0, a1, b0, b1):
@ -864,63 +864,63 @@ class ForeachTests(TestCase):
|
||||
return [a0, a1]
|
||||
|
||||
inputs = (
|
||||
torch.rand(10, 10, device="cuda:0"),
|
||||
torch.rand(20, 20, device="cuda:0"),
|
||||
torch.rand(10, 10, device="cuda:0"),
|
||||
torch.rand(20, 20, device="cuda:0"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
)
|
||||
|
||||
self.check_model_cuda(fn, inputs, check_lowp=False)
|
||||
self.check_model_gpu(fn, inputs, check_lowp=False)
|
||||
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu
|
||||
@inplace_bin_ops
|
||||
def test_reinplacing_mut_before(self, op):
|
||||
def fn(a0, a1, b0, b1):
|
||||
a0.add_(torch.ones(10, 10, device="cuda:0"))
|
||||
a0.add_(torch.ones(10, 10, device=GPU_TYPE))
|
||||
op([a0, a1], [b0, b1])
|
||||
return [a0, a1]
|
||||
|
||||
inputs = (
|
||||
torch.rand(10, 10, device="cuda:0"),
|
||||
torch.rand(20, 20, device="cuda:0"),
|
||||
torch.rand(10, 10, device="cuda:0"),
|
||||
torch.rand(20, 20, device="cuda:0"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
)
|
||||
|
||||
self.check_model_cuda(fn, inputs, check_lowp=False)
|
||||
self.check_model_gpu(fn, inputs, check_lowp=False)
|
||||
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu
|
||||
@inplace_bin_ops
|
||||
def test_reinplacing_mut_after(self, op):
|
||||
def fn(a0, a1, b0, b1):
|
||||
op([a0, a1], [b0, b1])
|
||||
a0.add_(torch.ones(10, 10, device="cuda:0"))
|
||||
a0.add_(torch.ones(10, 10, device=GPU_TYPE))
|
||||
return [a0, a1]
|
||||
|
||||
inputs = (
|
||||
torch.rand(10, 10, device="cuda:0"),
|
||||
torch.rand(20, 20, device="cuda:0"),
|
||||
torch.rand(10, 10, device="cuda:0"),
|
||||
torch.rand(20, 20, device="cuda:0"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
)
|
||||
|
||||
self.check_model_cuda(fn, inputs, check_lowp=False)
|
||||
self.check_model_gpu(fn, inputs, check_lowp=False)
|
||||
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu
|
||||
def test_multi_device(self):
|
||||
def test_foreach_add(a0, a1, b0, b1):
|
||||
return torch._foreach_add([a0, a1], [b0, b1])
|
||||
|
||||
inps = [
|
||||
torch.ones(10, 10, device="cuda"),
|
||||
torch.ones(10, 10, device=GPU_TYPE),
|
||||
torch.ones(20, 20, device="cpu"),
|
||||
torch.zeros(10, 10, device="cuda"),
|
||||
torch.zeros(10, 10, device=GPU_TYPE),
|
||||
torch.zeros(20, 20, device="cpu"),
|
||||
]
|
||||
|
||||
@ -930,13 +930,13 @@ class ForeachTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu
|
||||
def test_aliasing(self):
|
||||
def test_foreach_add(a0, a1, a2, b0, b1, b2):
|
||||
return torch._foreach_add_([a0, a1, a2], [b0, b1, b2])
|
||||
|
||||
input = torch.ones(10, 10, device="cuda")
|
||||
input2 = torch.ones(10, 10, device="cuda")
|
||||
input = torch.ones(10, 10, device=GPU_TYPE)
|
||||
input2 = torch.ones(10, 10, device=GPU_TYPE)
|
||||
inps = [
|
||||
input,
|
||||
input.view(10, 10),
|
||||
@ -952,7 +952,7 @@ class ForeachTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 4)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu
|
||||
@torch._inductor.config.patch("combo_kernel_allow_mixed_sizes", 1)
|
||||
def test_2d_block_no_mixed_sizes_no_mask(self):
|
||||
"""2D blocking with no mixed sizes constant mask"""
|
||||
@ -960,21 +960,21 @@ class ForeachTests(TestCase):
|
||||
def fn(a0, a1, a2, b0, b1, b2):
|
||||
return torch._foreach_add([a0, a1, a2], [b0, b1, b2])
|
||||
|
||||
self.check_model_cuda(
|
||||
self.check_model_gpu(
|
||||
fn,
|
||||
(
|
||||
torch.rand(1024, 2048, device="cuda:0"),
|
||||
torch.rand(2048, 2048, device="cuda:0"),
|
||||
torch.rand(1024, 2048, device="cuda:0"),
|
||||
torch.rand(2048, 1024, device="cuda:0").t(),
|
||||
torch.rand(2048, 2048, device="cuda:0").t(),
|
||||
torch.rand(2048, 1024, device="cuda:0").t(),
|
||||
torch.rand(1024, 2048, device=GPU_TYPE),
|
||||
torch.rand(2048, 2048, device=GPU_TYPE),
|
||||
torch.rand(1024, 2048, device=GPU_TYPE),
|
||||
torch.rand(2048, 1024, device=GPU_TYPE).t(),
|
||||
torch.rand(2048, 2048, device=GPU_TYPE).t(),
|
||||
torch.rand(2048, 1024, device=GPU_TYPE).t(),
|
||||
),
|
||||
)
|
||||
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 2)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu
|
||||
@torch._inductor.config.patch("combo_kernel_allow_mixed_sizes", 2)
|
||||
def test_2d_block_mixed_sizes_with_mask(self):
|
||||
"""2D blocking with mixed sizes should have mask"""
|
||||
@ -982,21 +982,21 @@ class ForeachTests(TestCase):
|
||||
def fn(a0, a1, a2, b0, b1, b2):
|
||||
return torch._foreach_add([a0, a1, a2], [b0, b1, b2])
|
||||
|
||||
self.check_model_cuda(
|
||||
self.check_model_gpu(
|
||||
fn,
|
||||
(
|
||||
torch.rand(1024, 2048, device="cuda:0"),
|
||||
torch.rand(2048, 2048, device="cuda:0"),
|
||||
torch.rand(1024, 2048, device="cuda:0"),
|
||||
torch.rand(2048, 1024, device="cuda:0").t(),
|
||||
torch.rand(2048, 2048, device="cuda:0").t(),
|
||||
torch.rand(2048, 1024, device="cuda:0").t(),
|
||||
torch.rand(1024, 2048, device=GPU_TYPE),
|
||||
torch.rand(2048, 2048, device=GPU_TYPE),
|
||||
torch.rand(1024, 2048, device=GPU_TYPE),
|
||||
torch.rand(2048, 1024, device=GPU_TYPE).t(),
|
||||
torch.rand(2048, 2048, device=GPU_TYPE).t(),
|
||||
torch.rand(2048, 1024, device=GPU_TYPE).t(),
|
||||
),
|
||||
)
|
||||
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu
|
||||
@foreach_map_bin_ops
|
||||
def test_foreach_map_backward_binary(self, op):
|
||||
from torch._dynamo.polyfills import foreach_map_fn
|
||||
@ -1011,14 +1011,14 @@ class ForeachTests(TestCase):
|
||||
|
||||
ref_inps = (
|
||||
[
|
||||
torch.rand(10, 20, device="cuda:0", requires_grad=True),
|
||||
torch.rand(10, 30, device="cuda:0", requires_grad=True),
|
||||
torch.rand(30, 30, device="cuda:0", requires_grad=True),
|
||||
torch.rand(10, 20, device=GPU_TYPE, requires_grad=True),
|
||||
torch.rand(10, 30, device=GPU_TYPE, requires_grad=True),
|
||||
torch.rand(30, 30, device=GPU_TYPE, requires_grad=True),
|
||||
],
|
||||
[
|
||||
torch.rand(10, 20, device="cuda:0", requires_grad=True),
|
||||
torch.rand(10, 30, device="cuda:0", requires_grad=True),
|
||||
torch.rand(30, 30, device="cuda:0", requires_grad=True),
|
||||
torch.rand(10, 20, device=GPU_TYPE, requires_grad=True),
|
||||
torch.rand(10, 30, device=GPU_TYPE, requires_grad=True),
|
||||
torch.rand(30, 30, device=GPU_TYPE, requires_grad=True),
|
||||
],
|
||||
)
|
||||
inps = (
|
||||
@ -1037,7 +1037,7 @@ class ForeachTests(TestCase):
|
||||
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 5)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu
|
||||
def test_foreach_map_input_mutation(self):
|
||||
def fn(xs, ys):
|
||||
outs = foreach_map_add_inplace(xs, ys)
|
||||
@ -1045,14 +1045,14 @@ class ForeachTests(TestCase):
|
||||
|
||||
ref_inps = (
|
||||
[
|
||||
torch.rand(10, 20, device="cuda:0", requires_grad=True),
|
||||
torch.rand(10, 30, device="cuda:0", requires_grad=True),
|
||||
torch.rand(30, 30, device="cuda:0", requires_grad=True),
|
||||
torch.rand(10, 20, device=GPU_TYPE, requires_grad=True),
|
||||
torch.rand(10, 30, device=GPU_TYPE, requires_grad=True),
|
||||
torch.rand(30, 30, device=GPU_TYPE, requires_grad=True),
|
||||
],
|
||||
[
|
||||
torch.rand(10, 20, device="cuda:0", requires_grad=True),
|
||||
torch.rand(10, 30, device="cuda:0", requires_grad=True),
|
||||
torch.rand(30, 30, device="cuda:0", requires_grad=True),
|
||||
torch.rand(10, 20, device=GPU_TYPE, requires_grad=True),
|
||||
torch.rand(10, 30, device=GPU_TYPE, requires_grad=True),
|
||||
torch.rand(30, 30, device=GPU_TYPE, requires_grad=True),
|
||||
],
|
||||
)
|
||||
# Set requires_grad to be False to avoid mutating a leaf variable
|
||||
@ -1073,7 +1073,7 @@ class ForeachTests(TestCase):
|
||||
):
|
||||
_ = run_fw_bw_and_get_code(lambda: torch.compile(fn)(*inps))
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu
|
||||
@foreach_map_un_ops
|
||||
def test_foreach_map_backward_unary(self, op):
|
||||
from torch._dynamo.polyfills import foreach_map_fn
|
||||
@ -1087,9 +1087,9 @@ class ForeachTests(TestCase):
|
||||
return outs[0].sum() + outs[1].sum() + outs[2].sum()
|
||||
|
||||
ref_inp = [
|
||||
torch.rand(10, 20, device="cuda:0", requires_grad=True),
|
||||
torch.rand(10, 30, device="cuda:0", requires_grad=True),
|
||||
torch.rand(30, 30, device="cuda:0", requires_grad=True),
|
||||
torch.rand(10, 20, device=GPU_TYPE, requires_grad=True),
|
||||
torch.rand(10, 30, device=GPU_TYPE, requires_grad=True),
|
||||
torch.rand(30, 30, device=GPU_TYPE, requires_grad=True),
|
||||
]
|
||||
|
||||
inp = [x.clone().detach().requires_grad_(True) for x in ref_inp]
|
||||
@ -1109,5 +1109,5 @@ class ForeachTests(TestCase):
|
||||
if __name__ == "__main__":
|
||||
from torch._inductor.test_case import run_tests
|
||||
|
||||
if HAS_CPU or HAS_CUDA_AND_TRITON:
|
||||
if HAS_CPU or HAS_GPU:
|
||||
run_tests(needs="filelock")
|
||||
|
||||
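The hunks above all replace hard-coded device="cuda:0" tensors and CUDA-only decorators with the backend-agnostic GPU_TYPE / @requires_gpu helpers. A minimal sketch of that pattern, assuming only GPU_TYPE and HAS_GPU from torch.testing._internal.inductor_utils (the helper name make_gpu_inputs is illustrative, not part of the test suite):

import torch
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU

def make_gpu_inputs():
    # Illustrative helper: build test tensors on whichever Triton-backed GPU
    # is available ("cuda" or "xpu") instead of hard-coding device="cuda:0".
    assert HAS_GPU, "requires CUDA or XPU with Triton"
    return (
        torch.rand(1024, 2048, device=GPU_TYPE),
        torch.rand(2048, 1024, device=GPU_TYPE).t(),  # transposed operand, as in the tests
    )
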
@ -111,6 +111,7 @@ class MultiKernelTest(TestCase):
@requires_triton()
# TODO: bobrenjc93 to fix multi-kernel for ROCM
@skipIfRocm
@skipIfXpu
@unittest.skipIf(not IS_BIG_GPU, "templates require big gpu")
def test_triton_gemm(self):
def fn(x, y):
@ -139,6 +140,7 @@ class MultiKernelTest(TestCase):
@requires_triton()
# TODO: bobrenjc93 to fix multi-kernel for ROCM
@skipIfRocm
@skipIfXpu
@unittest.skipIf(not IS_BIG_GPU, "templates require big gpu")
def test_triton_relu_fused_gemm(self):
def fn(x, y):

@ -74,7 +74,7 @@ class TestCase(InductorTestCase):
def run(op, args, kwargs):
return op(*args, **kwargs)

sample_inputs_itr = op.sample_inputs("cuda", dtype, requires_grad=False)
sample_inputs_itr = op.sample_inputs(GPU_TYPE, dtype, requires_grad=False)
for sample_input in sample_inputs_itr:
args = (sample_input.input,) + sample_input.args
kwargs = sample_input.kwargs
@ -307,7 +307,9 @@ class TestCase(InductorTestCase):
)


instantiate_device_type_tests(TestCase, globals(), only_for=("cuda",))
instantiate_device_type_tests(
TestCase, globals(), only_for=("cuda", "xpu"), allow_xpu=True
)

if __name__ == "__main__":
from torch._inductor.test_case import run_tests

@ -35,7 +35,7 @@ from torch._inductor.virtualized import V
from torch.fx.experimental.proxy_tensor import make_fx
from torch.testing import FileCheck
from torch.testing._internal.common_cuda import SM80OrLater, xfailIfSM89
from torch.testing._internal.common_device_type import expectedFailureXPU, skipCUDAIf
from torch.testing._internal.common_device_type import skipCUDAIf
from torch.testing._internal.common_utils import (
instantiate_parametrized_tests,
IS_LINUX,
@ -139,7 +139,7 @@ class TestPatternMatcher(TestCase):
ref[indices], test[indices]
) # also checks that dtype is correct

@skipIfXpu
# @skipIfXpu
@skipCUDAIf(not SM80OrLater, "need sm_80")
@inductor_config.patch(
{
@ -239,7 +239,6 @@ class TestPatternMatcher(TestCase):
self.assertEqual(f(inp), f_replaced(inp))
self.assertEqual(count, 2)

@skipIfXpu
@skipCUDAIf(not SM80OrLater, "need sm_80")
@inductor_config.patch(
{
@ -288,7 +287,6 @@ class TestPatternMatcher(TestCase):
self._test_fused_int_mm_mul_impl(fn2, args, True)

@skipIfRocm
@skipIfXpu
@skipCUDAIf(not SM80OrLater, "need sm_80")
@inductor_config.patch(
{
@ -336,7 +334,6 @@ class TestPatternMatcher(TestCase):
"triton_tem" if not extern_mm else "extern_kernels.mm"
).run(code)

@expectedFailureXPU
@skipCUDAIf(not SM80OrLater, "need sm_80")
@inductor_config.patch(
{
@ -372,7 +369,6 @@ class TestPatternMatcher(TestCase):
for args in args_list:
self._test_mixed_impl(fn, args, True, False)

@expectedFailureXPU
@skipCUDAIf(not SM80OrLater, "need sm_80")
@inductor_config.patch(
{
@ -399,7 +395,6 @@ class TestPatternMatcher(TestCase):
)
self._test_mixed_impl(fn, args, True, False, rtol=0.16, atol=1e-4)

@expectedFailureXPU
@skipCUDAIf(not SM80OrLater, "need sm_80")
@inductor_config.patch(
{
@ -431,7 +426,6 @@ class TestPatternMatcher(TestCase):
for args in args_list:
self._test_mixed_impl(fn, args, True, False)

@expectedFailureXPU
@skipCUDAIf(not SM80OrLater, "need sm_80")
@inductor_config.patch(
{
@ -469,7 +463,6 @@ class TestPatternMatcher(TestCase):
for args in args_list:
self._test_mixed_impl(fn, args, True, False)

@expectedFailureXPU
@skipCUDAIf(not SM80OrLater, "need sm_80")
@unittest.skipIf(not IS_BIG_GPU, "templates require big gpu")
def test_mixed_mm_gating(self):

@ -208,9 +208,9 @@ def associative_scan(
raise ValueError(
f"Combine_mode must either 'pointwise' or 'generic', but got {cm}"
)
if cm == "pointwise" and not all(l.device.type == "cuda" for l in lxs):
if cm == "pointwise" and not all(l.device.type in ["cuda", "xpu"] for l in lxs):
raise ValueError(
"For combine_mode='pointwise', all input tensors need to be on CUDA"
"For combine_mode='pointwise', all input tensors need to be on CUDA or XPU"
)

# Checks for xs

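Read on its own, the relaxed device check above amounts to something like the following sketch (the standalone function name _check_pointwise_devices is illustrative; in the source this logic lives inline in associative_scan):

import torch

def _check_pointwise_devices(combine_mode, leaves):
    # Mirrors the hunk above: combine_mode="pointwise" now accepts tensors on
    # either CUDA or XPU rather than CUDA only.
    if combine_mode == "pointwise" and not all(
        t.device.type in ("cuda", "xpu") for t in leaves
    ):
        raise ValueError(
            "For combine_mode='pointwise', all input tensors need to be on CUDA or XPU"
        )

# Example: passes for CUDA or XPU tensors, raises for CPU tensors.
# _check_pointwise_devices("pointwise", [torch.ones(4, device="cuda")])
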
@ -46,6 +46,7 @@ from torch.testing._internal.common_utils import (
skipIfXpu,
TEST_WITH_TORCHDYNAMO,
)
from torch.testing._internal.inductor_utils import GPU_TYPE
from torch.utils._foreach_utils import _get_foreach_kernels_supported_devices


@ -369,7 +370,7 @@ def optim_inputs_func_adadelta(device, dtype=None):
OptimizerInput(
params=None, kwargs={"rho": 0.95, "weight_decay": 0.9}, desc="rho"
),
] + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
] + (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])


def optim_error_inputs_func_adadelta(device, dtype):
@ -569,7 +570,7 @@ def optim_inputs_func_adam(device, dtype=None):
desc="amsgrad",
),
]
+ (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
+ (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])
+ (mps_supported_configs if _get_device_type(device) == "mps" else [])
)
if dtype == torch.float16:
@ -650,7 +651,7 @@ def optim_error_inputs_func_adam(device, dtype):
error_regex=r"betas\[0\] as a Tensor is not supported for capturable=False and foreach=True",
),
]
if _get_device_type(device) == "cuda":
if _get_device_type(device) == GPU_TYPE:
sample_tensor = torch.empty((), device=device, dtype=dtype)
error_inputs += [
ErrorOptimizerInput(
@ -721,7 +722,7 @@ def optim_inputs_func_adamax(device, dtype=None):
kwargs={"weight_decay": 0.1, "maximize": True},
desc="maximize, weight_decay",
),
] + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
] + (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])


def optim_error_inputs_func_adamax(device, dtype):
@ -792,7 +793,7 @@ def optim_inputs_func_asgd(device, dtype=None):
kwargs={"weight_decay": 0.1, "maximize": True},
desc="maximize, nonzero weight_decay",
),
] + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
] + (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])


def optim_error_inputs_func_asgd(device, dtype):
@ -974,7 +975,7 @@ def optim_inputs_func_nadam(device, dtype=None):
kwargs={"weight_decay": 0.1, "maximize": True},
desc="maximize",
),
] + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
] + (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])


def optim_error_inputs_func_nadam(device, dtype):
@ -1052,7 +1053,7 @@ def optim_inputs_func_radam(device=None, dtype=None):
kwargs={"weight_decay": 0.1, "maximize": True},
desc="maximize",
),
] + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
] + (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])


def optim_error_inputs_func_radam(device, dtype):
@ -1137,7 +1138,7 @@ def optim_inputs_func_rmsprop(device, dtype=None):
},
desc="maximize, centered, weight_decay, w/ momentum",
),
] + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
] + (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])


def optim_error_inputs_func_rmsprop(device, dtype):
@ -1179,7 +1180,7 @@ def optim_inputs_func_rprop(device, dtype=None):
desc="non-default step_sizes",
),
OptimizerInput(params=None, kwargs={"maximize": True}, desc="maximize"),
] + (cuda_supported_configs if _get_device_type(device) == "cuda" else [])
] + (cuda_supported_configs if _get_device_type(device) == GPU_TYPE else [])


def optim_error_inputs_func_rprop(device, dtype):
@ -1671,7 +1672,7 @@ optim_db: list[OptimizerInfo] = [
"maximize",
"capturable",
),
supports_fused_on=("cpu", "cuda", "mps"),
supports_fused_on=("cpu", "cuda", "xpu", "mps"),
decorators=(
# Expected floating point error between fused and compiled forloop
DecorateInfo(
@ -2161,6 +2162,7 @@ optim_db: list[OptimizerInfo] = [
supports_fused_on=(
"cpu",
"cuda",
"xpu",
"mps",
),
skips=(

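The optim_inputs_func_* hunks above all follow one pattern: the GPU-only optimizer configs are appended when the test device matches GPU_TYPE rather than the literal "cuda". A reduced sketch of that selection logic, with base_configs, gpu_supported_configs, and device_type as illustrative stand-ins for the objects in common_optimizers.py:

from torch.testing._internal.inductor_utils import GPU_TYPE

def select_optim_configs(base_configs, gpu_supported_configs, device_type):
    # Same shape as the hunks above: GPU-only configs are included whenever the
    # device type equals GPU_TYPE ("cuda" or "xpu"), not just "cuda".
    return base_configs + (gpu_supported_configs if device_type == GPU_TYPE else [])

# e.g. select_optim_configs(base, cuda_supported_configs, GPU_TYPE) returns both
# lists on a CUDA or XPU machine, and only the base list for a CPU device type.
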
@ -77,6 +77,7 @@ HAS_XPU_AND_TRITON = torch.xpu.is_available() and HAS_TRITON
HAS_MPS = torch.mps.is_available()

HAS_GPU = HAS_CUDA_AND_TRITON or HAS_XPU_AND_TRITON
HAS_GPU_AND_TRITON = HAS_GPU

GPU_TYPE = get_gpu_type()

@ -173,7 +174,7 @@ IS_H100 = LazyVal(
and get_gpu_shared_memory() == 232448
)

IS_BIG_GPU = LazyVal(lambda: HAS_CUDA_AND_TRITON and is_big_gpu())
IS_BIG_GPU = LazyVal(lambda: HAS_GPU_AND_TRITON and is_big_gpu())

def dummy_graph() -> GraphLowering:
"""

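With HAS_GPU_AND_TRITON and the broadened IS_BIG_GPU above, downstream tests can gate on the generic flags instead of CUDA-specific ones. A hedged example of how a test might consume them (the test class and bodies are invented for illustration):

import unittest
import torch
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU, IS_BIG_GPU

class ExampleGpuTest(unittest.TestCase):
    @unittest.skipUnless(HAS_GPU, "requires CUDA or XPU with Triton")
    def test_add_on_gpu(self):
        # GPU_TYPE resolves to "cuda" or "xpu" depending on the available backend.
        x = torch.ones(8, device=GPU_TYPE)
        self.assertEqual((x + x).sum().item(), 16.0)

    @unittest.skipIf(not IS_BIG_GPU, "templates require big gpu")
    def test_template_path(self):
        # IS_BIG_GPU now keys off HAS_GPU_AND_TRITON, so this gate can also pass
        # on a sufficiently large XPU device, not only on CUDA.
        pass
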