mirror of
https://github.com/pytorch/pytorch.git
synced 2025-11-12 14:54:55 +08:00
Compare commits
8 Commits
documentat
...
ciflow/h10
| Author | SHA1 | Date | |
|---|---|---|---|
| 8530256a41 | |||
| 598483e82e | |||
| cb9d989787 | |||
| da4e8e0673 | |||
| b4f6a78145 | |||
| 07ea62f600 | |||
| 95d4abebb4 | |||
| 13e5dadef2 |
@ -39,7 +39,10 @@ from torch.testing._internal.common_utils import (
|
||||
)
|
||||
from torch.testing._internal.hop_db import hop_db
|
||||
from torch.testing._internal.logging_utils import LoggingTestCase, make_logging_test
|
||||
from torch.testing._internal.triton_utils import requires_cuda_and_triton
|
||||
from torch.testing._internal.triton_utils import (
|
||||
requires_cuda_and_triton,
|
||||
requires_gpu_and_triton,
|
||||
)
|
||||
|
||||
|
||||
def count_ops(gm, args, freq, op):
|
||||
@ -6889,7 +6892,7 @@ class ActivationCheckpointingTests(torch._dynamo.test_case.TestCase):
|
||||
fn, backend, x, y, skip_check=True
|
||||
) # dropout decomp is known to diverge with eager
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
@torch._functorch.config.patch(functionalize_rng_ops=True)
|
||||
def test_fallback(self):
|
||||
def gn(x, y):
|
||||
|
||||
@ -1554,7 +1554,8 @@ class AOTInductorTestsTemplate:
|
||||
|
||||
# scaled_dot_product_flash_attention
|
||||
@unittest.skipIf(
|
||||
not HAS_XPU_AND_TRITON and not SM80OrLater, "bfloat16 only supported in sm80+"
|
||||
not SM80OrLater and not HAS_XPU_AND_TRITON,
|
||||
"bfloat16 only supported in sm80+ or XPU",
|
||||
)
|
||||
def test_sdpa(self):
|
||||
class Model(torch.nn.Module):
|
||||
@ -1571,7 +1572,10 @@ class AOTInductorTestsTemplate:
|
||||
)
|
||||
self.check_model(Model(), example_inputs)
|
||||
|
||||
@unittest.skipIf(not SM80OrLater, "bfloat16 only supported in sm80+")
|
||||
@unittest.skipIf(
|
||||
not SM80OrLater and not HAS_XPU_AND_TRITON,
|
||||
"bfloat16 only supported in sm80+ or XPU",
|
||||
)
|
||||
@unittest.skipIf(
|
||||
# for archs where this isn't lowered to flash attention, the math
|
||||
# backend will be used and it doesn't work for bfloat16
|
||||
@ -5926,8 +5930,8 @@ class AOTInductorTestsTemplate:
|
||||
@requires_gpu
|
||||
def test_d2h_copy(self):
|
||||
# device to copy host should always have the same stride
|
||||
if "cuda" not in self.device:
|
||||
raise unittest.SkipTest("This test is only for CUDA")
|
||||
if self.device not in ["cuda", "xpu"]:
|
||||
raise unittest.SkipTest("This test is only for CUDA or XPU")
|
||||
|
||||
class ToCpuModel(nn.Module):
|
||||
def forward(self, x):
|
||||
|
||||
@ -28,7 +28,7 @@ from torch.export.pt2_archive._package import (
|
||||
load_weights_to_pt2_contents,
|
||||
)
|
||||
from torch.testing._internal.common_cuda import _get_torch_cuda_version
|
||||
from torch.testing._internal.common_utils import IS_FBCODE, skipIfXpu
|
||||
from torch.testing._internal.common_utils import IS_FBCODE, skipIfXpu, TEST_CUDA
|
||||
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU
|
||||
|
||||
|
||||
@ -267,9 +267,9 @@ class TestAOTInductorPackage(TestCase):
|
||||
|
||||
@unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode")
|
||||
@unittest.skipIf(
|
||||
_get_torch_cuda_version() < (12, 6), "Test is only supported on CUDA 12.6+"
|
||||
TEST_CUDA and _get_torch_cuda_version() < (12, 6),
|
||||
"Test is only supported on CUDA 12.6+",
|
||||
)
|
||||
@skipIfXpu # build system may be different
|
||||
def test_compile_after_package(self):
|
||||
self.check_package_cpp_only()
|
||||
|
||||
|
||||
@ -11,19 +11,19 @@ from torch.testing._internal.common_utils import (
|
||||
instantiate_parametrized_tests,
|
||||
TestCase,
|
||||
)
|
||||
from torch.testing._internal.inductor_utils import HAS_CPU, HAS_CUDA_AND_TRITON
|
||||
from torch.testing._internal.triton_utils import requires_cuda_and_triton
|
||||
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, HAS_GPU_AND_TRITON
|
||||
from torch.testing._internal.triton_utils import requires_gpu_and_triton
|
||||
|
||||
|
||||
aten = torch.ops.aten
|
||||
|
||||
try:
|
||||
try:
|
||||
from .test_torchinductor import check_model, check_model_cuda
|
||||
from .test_torchinductor import check_model, check_model_gpu
|
||||
except ImportError:
|
||||
from test_torchinductor import ( # @manual=fbcode//caffe2/test/inductor:test_inductor-library
|
||||
check_model,
|
||||
check_model_cuda,
|
||||
check_model_gpu,
|
||||
)
|
||||
except (unittest.SkipTest, ImportError) as e:
|
||||
sys.stderr.write(f"{type(e)}: {e}\n")
|
||||
@ -34,7 +34,7 @@ except (unittest.SkipTest, ImportError) as e:
|
||||
|
||||
@instantiate_parametrized_tests
|
||||
class ComboKernelTests(TestCase):
|
||||
check_model_cuda = check_model_cuda
|
||||
check_model_gpu = check_model_gpu
|
||||
check_model_cpu = check_model
|
||||
check_kernel_count = True
|
||||
|
||||
@ -56,7 +56,7 @@ class ComboKernelTests(TestCase):
|
||||
torch._inductor.metrics.reset()
|
||||
super().tearDown()
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_activation_functions(self):
|
||||
def test_activations(a, b, c):
|
||||
a1 = torch.nn.functional.relu(a)
|
||||
@ -65,9 +65,9 @@ class ComboKernelTests(TestCase):
|
||||
return a1, b1, c1
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_activations(*inps)
|
||||
@ -76,7 +76,7 @@ class ComboKernelTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_reduce_functions(self):
|
||||
def test_reduce(a, b, c, d):
|
||||
a1 = torch.sum(a, dim=0)
|
||||
@ -87,10 +87,10 @@ class ComboKernelTests(TestCase):
|
||||
return a1, b1, c1, d1
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(30, 8, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(30, 8, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_reduce(*inps)
|
||||
@ -99,7 +99,7 @@ class ComboKernelTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertTrue(torch._inductor.metrics.generated_kernel_count <= 2)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_mutated_args(self):
|
||||
def test_mutated(a, b, c, d):
|
||||
a.add_(1)
|
||||
@ -110,10 +110,10 @@ class ComboKernelTests(TestCase):
|
||||
return a, b, c, d
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(30, 8, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(30, 8, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_mutated(*inps)
|
||||
@ -122,7 +122,7 @@ class ComboKernelTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_reduce_split(self):
|
||||
def fn(a, b):
|
||||
a1 = torch.linalg.vector_norm(a)
|
||||
@ -130,15 +130,15 @@ class ComboKernelTests(TestCase):
|
||||
return a1, b1
|
||||
|
||||
inps = [
|
||||
torch.rand(2048, 512, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(2048, 512, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
]
|
||||
out_eager = fn(*inps)
|
||||
out_compiled = torch.compile(fn)(*inps)
|
||||
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_2d_blocking_partitioning(self):
|
||||
def fn(a0, a1, a2, b0, b1, b2):
|
||||
c0 = torch.add(a0, b0)
|
||||
@ -146,15 +146,15 @@ class ComboKernelTests(TestCase):
|
||||
c2 = torch.add(a2, b2)
|
||||
return c0, c1, c2
|
||||
|
||||
self.check_model_cuda(
|
||||
self.check_model_gpu(
|
||||
fn,
|
||||
(
|
||||
torch.rand(30, 20, device="cuda"),
|
||||
torch.rand(40, 30, device="cuda"),
|
||||
torch.rand(36, 40, device="cuda"),
|
||||
torch.rand(30, 20, device="cuda"),
|
||||
torch.rand(30, 40, device="cuda").t(),
|
||||
torch.rand(40, 36, device="cuda").t(),
|
||||
torch.rand(30, 20, device=GPU_TYPE),
|
||||
torch.rand(40, 30, device=GPU_TYPE),
|
||||
torch.rand(36, 40, device=GPU_TYPE),
|
||||
torch.rand(30, 20, device=GPU_TYPE),
|
||||
torch.rand(30, 40, device=GPU_TYPE).t(),
|
||||
torch.rand(40, 36, device=GPU_TYPE).t(),
|
||||
),
|
||||
)
|
||||
|
||||
@ -163,7 +163,7 @@ class ComboKernelTests(TestCase):
|
||||
|
||||
@instantiate_parametrized_tests
|
||||
class ComboKernelBenchmarkTests(TestCase):
|
||||
check_model_cuda = check_model_cuda
|
||||
check_model_gpu = check_model_gpu
|
||||
check_model_cpu = check_model
|
||||
check_kernel_count = True
|
||||
|
||||
@ -185,7 +185,7 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
torch._inductor.metrics.reset()
|
||||
super().tearDown()
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_activation_benchmark(self):
|
||||
def test_activations(a, b, c):
|
||||
a1 = torch.nn.functional.relu(a)
|
||||
@ -194,9 +194,9 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
return a1, b1, c1
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_activations(*inps)
|
||||
@ -205,7 +205,7 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 5)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_reduce_benchmark(self):
|
||||
def test_reduce(a, b, c, d):
|
||||
a1 = torch.sum(a, dim=0)
|
||||
@ -216,10 +216,10 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
return a1, b1, c1, d1
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(30, 8, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(30, 8, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_reduce(*inps)
|
||||
@ -228,7 +228,7 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertTrue(4 < torch._inductor.metrics.generated_kernel_count <= 10)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_mutated_benchmark(self):
|
||||
def test_mutated(a, b, c, d):
|
||||
a.add_(1)
|
||||
@ -239,10 +239,10 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
return a, b, c, d
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(30, 8, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(30, 8, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_mutated(*inps)
|
||||
@ -251,7 +251,7 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertTrue(torch._inductor.metrics.generated_kernel_count in [6, 9])
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_round_robin_dispatch(self):
|
||||
# combo kernel dispatch strategy: round robin
|
||||
def test_mutated(a, b, c, d):
|
||||
@ -263,10 +263,10 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
return a, b, c, d
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 5, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(5, 18, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 5, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(5, 18, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_mutated(*inps)
|
||||
@ -275,7 +275,7 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 6)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_2d_blocking_benchmark(self):
|
||||
def fn(a0, a1, a2, b0, b1, b2):
|
||||
c0 = torch.add(a0, b0)
|
||||
@ -283,28 +283,28 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
c2 = torch.add(a2, b2)
|
||||
return c0, c1, c2
|
||||
|
||||
self.check_model_cuda(
|
||||
self.check_model_gpu(
|
||||
fn,
|
||||
(
|
||||
torch.rand(30, 20, device="cuda"),
|
||||
torch.rand(40, 30, device="cuda"),
|
||||
torch.rand(36, 40, device="cuda"),
|
||||
torch.rand(30, 20, device="cuda"),
|
||||
torch.rand(30, 40, device="cuda").t(),
|
||||
torch.rand(40, 36, device="cuda").t(),
|
||||
torch.rand(30, 20, device=GPU_TYPE),
|
||||
torch.rand(40, 30, device=GPU_TYPE),
|
||||
torch.rand(36, 40, device=GPU_TYPE),
|
||||
torch.rand(30, 20, device=GPU_TYPE),
|
||||
torch.rand(30, 40, device=GPU_TYPE).t(),
|
||||
torch.rand(40, 36, device=GPU_TYPE).t(),
|
||||
),
|
||||
)
|
||||
|
||||
self.assertTrue(7 <= torch._inductor.metrics.generated_kernel_count <= 8)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_persistent_reduction_no_x_dim(self):
|
||||
def fn(x, y):
|
||||
return x.sum(1), y.sum(1)
|
||||
|
||||
inps = (
|
||||
torch.rand(16, 256, device="cuda"),
|
||||
torch.rand(32, 256, device="cuda"),
|
||||
torch.rand(16, 256, device=GPU_TYPE),
|
||||
torch.rand(32, 256, device=GPU_TYPE),
|
||||
)
|
||||
torch._dynamo.mark_dynamic(inps[0], 0, min=1, max=256)
|
||||
torch._dynamo.mark_dynamic(inps[1], 0, min=1, max=256)
|
||||
@ -317,7 +317,7 @@ class ComboKernelBenchmarkTests(TestCase):
|
||||
|
||||
@instantiate_parametrized_tests
|
||||
class ComboKernelDynamicShapesTests(TestCase):
|
||||
check_model_cuda = check_model_cuda
|
||||
check_model_gpu = check_model_gpu
|
||||
check_model_cpu = check_model
|
||||
check_kernel_count = True
|
||||
|
||||
@ -347,7 +347,7 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
torch._inductor.metrics.reset()
|
||||
super().tearDown()
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_dynamic_shapes_activations(self):
|
||||
def test_activations(a, b, c):
|
||||
a1 = torch.nn.functional.relu(a)
|
||||
@ -356,9 +356,9 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return a1, b1, c1
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_activations(*inps)
|
||||
@ -367,7 +367,7 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 5)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_dynamic_shapes_2d_blocking(self):
|
||||
def fn(a0, a1, a2, b0, b1, b2):
|
||||
c0 = torch.add(a0, b0)
|
||||
@ -375,21 +375,21 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
c2 = torch.add(a2, b2)
|
||||
return c0, c1, c2
|
||||
|
||||
self.check_model_cuda(
|
||||
self.check_model_gpu(
|
||||
fn,
|
||||
(
|
||||
torch.rand(30, 20, device="cuda"),
|
||||
torch.rand(40, 30, device="cuda"),
|
||||
torch.rand(36, 40, device="cuda"),
|
||||
torch.rand(30, 20, device="cuda"),
|
||||
torch.rand(30, 40, device="cuda").t(),
|
||||
torch.rand(40, 36, device="cuda").t(),
|
||||
torch.rand(30, 20, device=GPU_TYPE),
|
||||
torch.rand(40, 30, device=GPU_TYPE),
|
||||
torch.rand(36, 40, device=GPU_TYPE),
|
||||
torch.rand(30, 20, device=GPU_TYPE),
|
||||
torch.rand(30, 40, device=GPU_TYPE).t(),
|
||||
torch.rand(40, 36, device=GPU_TYPE).t(),
|
||||
),
|
||||
)
|
||||
|
||||
self.assertTrue(7 <= torch._inductor.metrics.generated_kernel_count <= 8)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_dynamic_shapes_reduce(self):
|
||||
def test_reduce(a, b, c, d):
|
||||
a1 = torch.sum(a, dim=0)
|
||||
@ -400,10 +400,10 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return a1, b1, c1, d1
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(30, 8, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(30, 8, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_reduce(*inps)
|
||||
@ -412,7 +412,7 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertTrue(4 < torch._inductor.metrics.generated_kernel_count <= 10)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_dynamic_shapes_mutated(self):
|
||||
# combo kernel dispatch strategy: round robin
|
||||
def test_mutated(a, b, c, d):
|
||||
@ -424,10 +424,10 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return a, b, c, d
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 5, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(5, 18, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 5, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(5, 18, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_mutated(*inps)
|
||||
@ -436,7 +436,7 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 6)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
@torch._inductor.config.patch("combo_kernels_autotune", 0)
|
||||
def test_dynamic_shapes_activations_no_autotune(self):
|
||||
def test_activations(a, b, c):
|
||||
@ -446,9 +446,9 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return a1, b1, c1
|
||||
|
||||
inps = [
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(20, 20, device="cuda"),
|
||||
torch.rand(10, 10, device="cuda"),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
torch.rand(20, 20, device=GPU_TYPE),
|
||||
torch.rand(10, 10, device=GPU_TYPE),
|
||||
]
|
||||
|
||||
out_eager = test_activations(*inps)
|
||||
@ -457,7 +457,7 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 5)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
@torch._dynamo.config.patch("automatic_dynamic_shapes", True)
|
||||
@torch._dynamo.config.patch("assume_static_by_default", True)
|
||||
def test_dynamic_shapes_persistent_reduction_no_x_dim(self):
|
||||
@ -465,8 +465,8 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return x.sum(1), y.sum(1)
|
||||
|
||||
inps = (
|
||||
torch.rand(16, 256, device="cuda"),
|
||||
torch.rand(32, 256, device="cuda"),
|
||||
torch.rand(16, 256, device=GPU_TYPE),
|
||||
torch.rand(32, 256, device=GPU_TYPE),
|
||||
)
|
||||
torch._dynamo.mark_dynamic(inps[0], 0, min=1, max=256)
|
||||
torch._dynamo.mark_dynamic(inps[1], 0, min=1, max=256)
|
||||
@ -476,7 +476,7 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 4)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
@torch._dynamo.config.patch("automatic_dynamic_shapes", True)
|
||||
@torch._dynamo.config.patch("assume_static_by_default", True)
|
||||
def test_dynamic_shapes_persistent_reduction_no_x_dim_2(self):
|
||||
@ -484,8 +484,8 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return x.sum(2), y.sum(2)
|
||||
|
||||
inps = (
|
||||
torch.rand(8, 16, 256, device="cuda"),
|
||||
torch.rand(8, 32, 256, device="cuda"),
|
||||
torch.rand(8, 16, 256, device=GPU_TYPE),
|
||||
torch.rand(8, 32, 256, device=GPU_TYPE),
|
||||
)
|
||||
torch._dynamo.mark_dynamic(inps[0], (0, 1), min=1, max=256)
|
||||
torch._dynamo.mark_dynamic(inps[1], (0, 1), min=1, max=256)
|
||||
@ -495,7 +495,7 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 4)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
@torch._dynamo.config.patch("automatic_dynamic_shapes", True)
|
||||
@torch._dynamo.config.patch("assume_static_by_default", True)
|
||||
def test_dynamic_shapes_2d_blocking_round_robin(self):
|
||||
@ -506,12 +506,12 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return c0, c1, c2
|
||||
|
||||
inps = (
|
||||
torch.rand(20, 30, device="cuda"),
|
||||
torch.rand(30, 30, device="cuda"),
|
||||
torch.rand(40, 32, device="cuda"),
|
||||
torch.rand(30, 20, device="cuda").t(),
|
||||
torch.rand(30, 30, device="cuda").t(),
|
||||
torch.rand(32, 40, device="cuda").t(),
|
||||
torch.rand(20, 30, device=GPU_TYPE),
|
||||
torch.rand(30, 30, device=GPU_TYPE),
|
||||
torch.rand(40, 32, device=GPU_TYPE),
|
||||
torch.rand(30, 20, device=GPU_TYPE).t(),
|
||||
torch.rand(30, 30, device=GPU_TYPE).t(),
|
||||
torch.rand(32, 40, device=GPU_TYPE).t(),
|
||||
)
|
||||
|
||||
out_eager = fn(*inps)
|
||||
@ -522,19 +522,19 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
torch._inductor.metrics.reset()
|
||||
|
||||
inps = (
|
||||
torch.rand(24, 30, device="cuda"),
|
||||
torch.rand(32, 30, device="cuda"),
|
||||
torch.rand(48, 32, device="cuda"),
|
||||
torch.rand(30, 24, device="cuda").t(),
|
||||
torch.rand(30, 32, device="cuda").t(),
|
||||
torch.rand(32, 48, device="cuda").t(),
|
||||
torch.rand(24, 30, device=GPU_TYPE),
|
||||
torch.rand(32, 30, device=GPU_TYPE),
|
||||
torch.rand(48, 32, device=GPU_TYPE),
|
||||
torch.rand(30, 24, device=GPU_TYPE).t(),
|
||||
torch.rand(30, 32, device=GPU_TYPE).t(),
|
||||
torch.rand(32, 48, device=GPU_TYPE).t(),
|
||||
)
|
||||
out_compiled = compiled(*inps)
|
||||
out_eager = fn(*inps)
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
self.assertTrue(5 <= torch._inductor.metrics.generated_kernel_count <= 6)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
@torch._dynamo.config.patch("automatic_dynamic_shapes", True)
|
||||
@torch._dynamo.config.patch("assume_static_by_default", True)
|
||||
@torch._inductor.config.patch("triton.autotune_at_compile_time", True)
|
||||
@ -543,9 +543,9 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
return x.sum(1), y.mean(1), z.max(1)
|
||||
|
||||
inps = (
|
||||
torch.rand(16, 128, device="cuda"),
|
||||
torch.rand(32, 128, device="cuda"),
|
||||
torch.rand(32, 256, device="cuda"),
|
||||
torch.rand(16, 128, device=GPU_TYPE),
|
||||
torch.rand(32, 128, device=GPU_TYPE),
|
||||
torch.rand(32, 256, device=GPU_TYPE),
|
||||
)
|
||||
torch._dynamo.mark_dynamic(inps[0], 0, min=1, max=256)
|
||||
torch._dynamo.mark_dynamic(inps[1], 0, min=1, max=256)
|
||||
@ -555,15 +555,15 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
|
||||
self.assertEqual(out_eager, out_compiled)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_helper_fn_defined(self):
|
||||
def fn(x, y, z):
|
||||
return x.sum(1), y.mean(1), z.cumsum(1)
|
||||
|
||||
inps = (
|
||||
torch.rand(16, 128, device="cuda"),
|
||||
torch.rand(32, 128, device="cuda"),
|
||||
torch.rand(32, 256, device="cuda"),
|
||||
torch.rand(16, 128, device=GPU_TYPE),
|
||||
torch.rand(32, 128, device=GPU_TYPE),
|
||||
torch.rand(32, 256, device=GPU_TYPE),
|
||||
)
|
||||
|
||||
out_eager = fn(*inps)
|
||||
@ -577,5 +577,5 @@ class ComboKernelDynamicShapesTests(TestCase):
|
||||
if __name__ == "__main__":
|
||||
from torch._dynamo.test_case import run_tests
|
||||
|
||||
if HAS_CPU or HAS_CUDA_AND_TRITON:
|
||||
if HAS_CPU or HAS_GPU_AND_TRITON:
|
||||
run_tests(needs="filelock")
|
||||
|
||||
@ -45,6 +45,7 @@ from torch.testing._internal.common_utils import (
|
||||
parametrize,
|
||||
scoped_load_inline,
|
||||
skipIfWindows,
|
||||
skipIfXpu,
|
||||
)
|
||||
from torch.testing._internal.hop_db import hop_db
|
||||
from torch.testing._internal.inductor_utils import (
|
||||
@ -52,9 +53,13 @@ from torch.testing._internal.inductor_utils import (
|
||||
HAS_CPU,
|
||||
HAS_CUDA_AND_TRITON,
|
||||
HAS_GPU,
|
||||
HAS_XPU_AND_TRITON,
|
||||
)
|
||||
from torch.testing._internal.logging_utils import logs_to_string
|
||||
from torch.testing._internal.triton_utils import requires_cuda_and_triton
|
||||
from torch.testing._internal.triton_utils import (
|
||||
requires_cuda_and_triton,
|
||||
requires_gpu_and_triton,
|
||||
)
|
||||
from torch.utils._python_dispatch import TorchDispatchMode
|
||||
|
||||
|
||||
@ -3049,13 +3054,14 @@ main()
|
||||
|
||||
self.assertEqual(counters["inductor"]["cudagraph_skips"], 1)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@skipIfXpu(msg="cudagraphs not supported on xpu for now!")
|
||||
@requires_gpu_and_triton
|
||||
def test_cudagraphs_sdpa(self):
|
||||
query = torch.rand(
|
||||
32, 8, 128, 64, dtype=torch.float16, device="cuda", requires_grad=True
|
||||
32, 8, 128, 64, dtype=torch.float16, device=GPU_TYPE, requires_grad=True
|
||||
)
|
||||
key = torch.rand(32, 8, 128, 64, dtype=torch.float16, device="cuda")
|
||||
value = torch.rand(32, 8, 128, 64, dtype=torch.float16, device="cuda")
|
||||
key = torch.rand(32, 8, 128, 64, dtype=torch.float16, device=GPU_TYPE)
|
||||
value = torch.rand(32, 8, 128, 64, dtype=torch.float16, device=GPU_TYPE)
|
||||
out = torch.nn.functional.scaled_dot_product_attention(query, key, value)
|
||||
|
||||
with (
|
||||
@ -3747,7 +3753,7 @@ class CompiledAutograd0(torch.nn.Module):
|
||||
self.assertTrue(isinstance(view_nodes[0].args[1][0], torch.fx.Node))
|
||||
self.assertTrue(isinstance(view_nodes[1].args[1][0], torch.fx.Node))
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_flex_attention(self):
|
||||
def _squared(score, b, h, m, n):
|
||||
"""Joint graph needed for correctness"""
|
||||
@ -3765,7 +3771,7 @@ class CompiledAutograd0(torch.nn.Module):
|
||||
a * b,
|
||||
b,
|
||||
dtype=torch.bfloat16,
|
||||
device="cuda",
|
||||
device=GPU_TYPE,
|
||||
requires_grad=True,
|
||||
)
|
||||
fwd_bwd(v)
|
||||
@ -5333,12 +5339,13 @@ if IS_S390X:
|
||||
test_autograd = load_test_module("test_autograd")
|
||||
test_custom_ops = load_test_module("test_custom_ops")
|
||||
test_higher_order_ops = load_test_module("dynamo/test_higher_order_ops")
|
||||
|
||||
TestAutogradWithCompiledAutograd = wrap_test_class(test_autograd.TestAutograd)
|
||||
if not HAS_XPU_AND_TRITON:
|
||||
TestAutogradWithCompiledAutograd = wrap_test_class(test_autograd.TestAutograd)
|
||||
TestNestedCheckpointWithCompiledAutograd = wrap_test_class(
|
||||
test_autograd.TestNestedCheckpoint
|
||||
)
|
||||
TestCustomOpWithCompiledAutograd = wrap_test_class(test_custom_ops.TestCustomOp)
|
||||
if not HAS_XPU_AND_TRITON:
|
||||
TestCustomOpWithCompiledAutograd = wrap_test_class(test_custom_ops.TestCustomOp)
|
||||
HigherOrderOpTestsWithCompiledAutograd = wrap_test_class(
|
||||
test_higher_order_ops.HigherOrderOpTests
|
||||
)
|
||||
@ -5367,6 +5374,7 @@ class TestCompiledAutogradOpInfo(TestCase):
|
||||
super(TestCase, self).tearDown()
|
||||
reset()
|
||||
|
||||
@skipIfXpu(msg="NotImplementedError: The operator 'testlib::mutating_custom_op'")
|
||||
@ops(
|
||||
list(filter(lambda op: op.name not in xfail_hops, hop_db)),
|
||||
allowed_dtypes=(torch.float,),
|
||||
@ -5419,7 +5427,7 @@ class TestCompiledAutogradOpInfo(TestCase):
|
||||
self.assertEqual(expected, actual)
|
||||
|
||||
|
||||
instantiate_device_type_tests(TestCompiledAutogradOpInfo, globals())
|
||||
instantiate_device_type_tests(TestCompiledAutogradOpInfo, globals(), allow_xpu=True)
|
||||
instantiate_parametrized_tests(TestCompiledAutograd)
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@ -65,7 +65,11 @@ from torch.testing._internal.inductor_utils import (
|
||||
HAS_GPU,
|
||||
has_triton,
|
||||
)
|
||||
from torch.testing._internal.triton_utils import requires_cuda_and_triton, requires_gpu
|
||||
from torch.testing._internal.triton_utils import (
|
||||
requires_cuda_and_triton,
|
||||
requires_gpu,
|
||||
requires_gpu_and_triton,
|
||||
)
|
||||
|
||||
|
||||
def get_inputs(optim):
|
||||
@ -946,7 +950,7 @@ class CompiledOptimizerTests(TestCase):
|
||||
kwargs = aot_graph_input_parser(forward)
|
||||
torch.compile(forward)(**kwargs)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_foreach_map_adam(self):
|
||||
params = [
|
||||
torch.rand(
|
||||
|
||||
@ -208,7 +208,7 @@ class TestCustomLowering(InductorTestCase):
|
||||
|
||||
@requires_gpu()
|
||||
@skipIfRocm
|
||||
@skipIfXpu
|
||||
@skipIfXpu(msg="https://github.com/intel/torch-xpu-ops/issues/2328")
|
||||
@skipIf(GPU_TYPE == "mps", "Not applicable to MPS")
|
||||
def test_tanh_approx(self):
|
||||
def fn(inp):
|
||||
@ -223,7 +223,7 @@ class TestCustomLowering(InductorTestCase):
|
||||
|
||||
@requires_gpu()
|
||||
@skipIfRocm
|
||||
@skipIfXpu
|
||||
@skipIfXpu(msg="https://github.com/intel/torch-xpu-ops/issues/2328")
|
||||
@skipIf(GPU_TYPE == "mps", "Not applicable to MPS")
|
||||
def test_multi_inp_asm(self):
|
||||
def fn(a, b):
|
||||
|
||||
@ -10,10 +10,11 @@ from torch._inductor.utils import fresh_cache
|
||||
from torch.testing._internal.common_utils import (
|
||||
instantiate_parametrized_tests,
|
||||
parametrize,
|
||||
skipIfXpu,
|
||||
)
|
||||
from torch.testing._internal.inductor_utils import (
|
||||
GPU_TYPE,
|
||||
HAS_CUDA_AND_TRITON,
|
||||
HAS_GPU_AND_TRITON,
|
||||
IS_BIG_GPU,
|
||||
)
|
||||
|
||||
@ -38,6 +39,7 @@ class DeterministicTest(TestCase):
|
||||
finally:
|
||||
torch.use_deterministic_algorithms(old_val, warn_only=True)
|
||||
|
||||
@skipIfXpu(msg="pad_mm is not enabled for XPU.")
|
||||
@parametrize("deterministic", [False, True])
|
||||
def test_mm_padding(self, deterministic):
|
||||
with inductor_config.patch(deterministic=deterministic):
|
||||
@ -106,5 +108,5 @@ class DeterministicTest(TestCase):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if HAS_CUDA_AND_TRITON:
|
||||
if HAS_GPU_AND_TRITON:
|
||||
run_tests()
|
||||
|
||||
@ -10,7 +10,10 @@ from torch.testing._internal.common_utils import (
|
||||
parametrize,
|
||||
skipIfRocm,
|
||||
)
|
||||
from torch.testing._internal.triton_utils import requires_cuda_and_triton
|
||||
from torch.testing._internal.triton_utils import requires_gpu_and_triton
|
||||
|
||||
|
||||
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
|
||||
|
||||
|
||||
@instantiate_parametrized_tests
|
||||
@ -55,14 +58,14 @@ class TestTorchDeviceAssertTrigger(TestCase):
|
||||
f_c = torch.compile(func_inline, backend=backend)
|
||||
f_c()
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
@skipIfRocm
|
||||
@torch._inductor.config.patch(force_disable_caches=True)
|
||||
def test_assert_fusion(self):
|
||||
torch._logging.set_logs(inductor_metrics=True)
|
||||
|
||||
def func():
|
||||
a = torch.tensor([1.0, 2.0], device="cuda")
|
||||
a = torch.tensor([1.0, 2.0], device=device_type)
|
||||
result = torch.all(a > 0)
|
||||
assert result, "should throw"
|
||||
|
||||
@ -74,13 +77,13 @@ class TestTorchDeviceAssertTrigger(TestCase):
|
||||
self.assertEqual(metrics.generated_kernel_count, 1)
|
||||
torch._logging.set_logs()
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
@skipIfRocm
|
||||
@torch._inductor.config.patch(force_disable_caches=True)
|
||||
def test_run_assert_triton(self):
|
||||
@torch.compile(backend="inductor")
|
||||
def fn():
|
||||
a = torch.tensor([1.0, 2.0], device="cuda")
|
||||
a = torch.tensor([1.0, 2.0], device=device_type)
|
||||
result = torch.all(a > 0)
|
||||
assert result, "should throw"
|
||||
|
||||
|
||||
@ -7,7 +7,7 @@ from torch import nn
|
||||
from torch._dynamo import compiled_autograd
|
||||
from torch._dynamo.test_case import run_tests, TestCase
|
||||
from torch._dynamo.testing import CompileCounter
|
||||
from torch.testing._internal.common_utils import IS_MACOS, skipIfXpu
|
||||
from torch.testing._internal.common_utils import IS_MACOS
|
||||
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, requires_gpu
|
||||
|
||||
|
||||
@ -483,7 +483,6 @@ class DistributedPatternTests(TestCase):
|
||||
# Recompile on grad==None/grad!=None
|
||||
self.assertEqual(bw_cnt.frame_count, 2)
|
||||
|
||||
@skipIfXpu
|
||||
@requires_gpu()
|
||||
@torch._functorch.config.patch(recompute_views=True)
|
||||
def test_fake_distributed_inductor(self):
|
||||
|
||||
@ -5,6 +5,10 @@ import torch
|
||||
from torch._inductor import config
|
||||
from torch._inductor.test_case import run_tests, TestCase
|
||||
from torch.testing._internal.common_cuda import TEST_CUDA
|
||||
from torch.testing._internal.common_utils import TEST_XPU
|
||||
|
||||
|
||||
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
|
||||
|
||||
|
||||
class MatMulModule(torch.nn.Module):
|
||||
@ -68,13 +72,13 @@ class TestInductorExternalCallable(TestCase):
|
||||
msg=f"torch.compile(..., external_matmul = {matmul_dup}) failed",
|
||||
)
|
||||
|
||||
@unittest.skipIf(not TEST_CUDA, "CUDA not found")
|
||||
@unittest.skipIf(not TEST_CUDA and not TEST_XPU, "CUDA and XPU not found")
|
||||
@unittest.skipIf(
|
||||
torch.cuda.is_available() and torch.cuda.get_device_capability() < (7, 0),
|
||||
"Triton does not support device capability < 7.0",
|
||||
)
|
||||
def test_matmul_cuda(self):
|
||||
device = torch.device("cuda")
|
||||
device = torch.device(device_type)
|
||||
x = (torch.eye(128, 128) * 2).to(device=device)
|
||||
opt_fn = torch.compile(
|
||||
MatMulModule().to(device),
|
||||
|
||||
@ -11,6 +11,8 @@ from torch.testing._internal.common_utils import slowTest
|
||||
from torch.testing._internal.inductor_utils import GPU_TYPE, RUN_GPU
|
||||
|
||||
|
||||
device_type = acc.type if (acc := torch.accelerator.current_accelerator()) else "cpu"
|
||||
|
||||
try:
|
||||
try:
|
||||
from . import (
|
||||
@ -306,11 +308,11 @@ if RUN_GPU:
|
||||
|
||||
from torch._inductor.utils import is_big_gpu
|
||||
|
||||
if GPU_TYPE == "cuda" and is_big_gpu():
|
||||
if GPU_TYPE in ("cuda", "xpu") and is_big_gpu():
|
||||
skip_list = ["test_addmm", "test_linear_relu"]
|
||||
# need to skip instead of omit, otherwise fbcode ci can be flaky
|
||||
for test_name in skip_list:
|
||||
test_failures_gpu_wrapper[f"{test_name}_cuda"] = (
|
||||
test_failures_gpu_wrapper[f"{test_name}_{device_type}"] = (
|
||||
test_torchinductor.TestFailure(("gpu_wrapper",), is_skip=True)
|
||||
)
|
||||
test_failures_gpu_wrapper[f"{test_name}_gpu_dynamic_shapes"] = (
|
||||
|
||||
@ -16,7 +16,12 @@ from torch.testing._internal.common_device_type import (
|
||||
instantiate_device_type_tests,
|
||||
skipCUDAIf,
|
||||
)
|
||||
from torch.testing._internal.common_utils import parametrize, run_tests, TestCase
|
||||
from torch.testing._internal.common_utils import (
|
||||
parametrize,
|
||||
run_tests,
|
||||
skipIfXpu,
|
||||
TestCase,
|
||||
)
|
||||
from torch.testing._internal.inductor_utils import IS_BIG_GPU
|
||||
from torch.utils._ordered_set import OrderedSet
|
||||
|
||||
@ -91,6 +96,10 @@ class TestScheduler(TestCase):
|
||||
metrics.reset()
|
||||
torch._logging.set_logs()
|
||||
|
||||
@skipIfXpu(
|
||||
msg="InvalidModule: Invalid SPIR-V module, "
|
||||
"https://github.com/intel/torch-xpu-ops/issues/2329"
|
||||
)
|
||||
@dtypes(torch.float, torch.float16)
|
||||
@skipCUDAIf(not SM70OrLater, "GPU capability is < SM70")
|
||||
@parametrize(
|
||||
|
||||
@ -1416,11 +1416,6 @@ class TestMaxAutotune(TestCase):
|
||||
torch.compile(lambda a, b: a.matmul(b))(a, b)
|
||||
self.assertIn("NoValidChoicesError", str(context.exception))
|
||||
|
||||
@unittest.skipIf(
|
||||
not torch.cuda.is_available()
|
||||
or torch.cuda.get_device_properties().total_memory < 2e10,
|
||||
"Only if the GPU has at least 20GB memory to be safe",
|
||||
)
|
||||
@config.patch(force_shape_pad=True, max_autotune=True)
|
||||
def test_linear_and_cel(self):
|
||||
"""
|
||||
|
||||
@ -319,11 +319,6 @@ class TestOperatorReorderForPeakMemory(TestCase):
|
||||
# succ nodes should be forwarded to pre mutation buffer
|
||||
self.assertTrue(buffer_info[post][2] <= buffer_info[pre][2])
|
||||
|
||||
@unittest.skipIf(
|
||||
not torch.cuda.is_available()
|
||||
or torch.cuda.get_device_properties().total_memory < int(1e10),
|
||||
"Need 10GB memory to be safe to run the test",
|
||||
)
|
||||
def test_fusing_reductions_increase_peak_memory(self):
|
||||
@torch.compile
|
||||
def f(a, b, c):
|
||||
@ -332,9 +327,9 @@ class TestOperatorReorderForPeakMemory(TestCase):
|
||||
a = torch.randn(1024 * 32, 16, device=GPU_TYPE)
|
||||
b = torch.randn(1024 * 32, 16, device=GPU_TYPE)
|
||||
c = torch.randn(16, 1024 * 32, device=GPU_TYPE)
|
||||
torch.cuda.reset_peak_memory_stats()
|
||||
torch.get_device_module(GPU_TYPE).reset_peak_memory_stats()
|
||||
f(a, b, c)
|
||||
peak_mem = torch.cuda.max_memory_allocated()
|
||||
peak_mem = torch.get_device_module(GPU_TYPE).max_memory_allocated()
|
||||
|
||||
expected_bound = a.size(0) * c.size(1) * a.dtype.itemsize * 2
|
||||
self.assertLess(peak_mem, expected_bound)
|
||||
|
||||
@ -3,7 +3,7 @@
|
||||
import sys
|
||||
import unittest
|
||||
|
||||
from torch.testing._internal.common_utils import IS_CI, IS_WINDOWS, skipIfXpu
|
||||
from torch.testing._internal.common_utils import IS_CI, IS_WINDOWS
|
||||
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU, requires_gpu
|
||||
|
||||
|
||||
@ -82,7 +82,6 @@ class TestMemoryPlanning(TestCase):
|
||||
).run(code)
|
||||
self.assertTrue(same(f(*args), result))
|
||||
|
||||
@skipIfXpu(msg="aoti doesn't work on XPU")
|
||||
def test_aoti(self):
|
||||
f, args = self._generate(device=GPU_TYPE)
|
||||
dim0_x = Dim("dim0_x", min=1, max=2048)
|
||||
|
||||
@ -7,12 +7,7 @@ import torch._inductor.config as inductor_config
|
||||
from torch._dynamo.test_minifier_common import MinifierTestBase
|
||||
from torch._inductor import config
|
||||
from torch.export import load as export_load
|
||||
from torch.testing._internal.common_utils import (
|
||||
IS_JETSON,
|
||||
IS_MACOS,
|
||||
skipIfXpu,
|
||||
TEST_WITH_ASAN,
|
||||
)
|
||||
from torch.testing._internal.common_utils import IS_JETSON, IS_MACOS, TEST_WITH_ASAN
|
||||
from torch.testing._internal.inductor_utils import GPU_TYPE
|
||||
from torch.testing._internal.triton_utils import requires_gpu
|
||||
|
||||
@ -278,7 +273,6 @@ def forward(self, linear):
|
||||
self._aoti_check_relu_repro(res)
|
||||
|
||||
@requires_gpu
|
||||
@skipIfXpu(msg="AOTI for XPU not enabled yet")
|
||||
@inductor_config.patch(
|
||||
"triton.inject_relu_bug_TESTING_ONLY",
|
||||
"compile_error",
|
||||
@ -288,7 +282,6 @@ def forward(self, linear):
|
||||
self._aoti_check_relu_repro(res)
|
||||
|
||||
@requires_gpu
|
||||
@skipIfXpu(msg="AOTI for XPU not enabled yet")
|
||||
@inductor_config.patch(
|
||||
"triton.inject_relu_bug_TESTING_ONLY",
|
||||
"compile_error",
|
||||
@ -304,7 +297,6 @@ def forward(self, linear):
|
||||
self._aoti_check_relu_repro(res)
|
||||
|
||||
@requires_gpu
|
||||
@skipIfXpu(msg="AOTI for XPU not enabled yet")
|
||||
@inductor_config.patch("triton.inject_relu_bug_TESTING_ONLY", "accuracy")
|
||||
def test_aoti_gpu_accuracy_error(self):
|
||||
res = self._test_aoti(GPU_TYPE, "AccuracyError")
|
||||
|
||||
@ -7,19 +7,23 @@ import torch
|
||||
from torch._inductor.test_case import run_tests, TestCase
|
||||
from torch._inductor.utils import run_and_get_code
|
||||
from torch.testing import FileCheck
|
||||
from torch.testing._internal.common_cuda import TEST_MULTIGPU
|
||||
from torch.testing._internal.common_utils import IS_LINUX
|
||||
from torch.testing._internal.inductor_utils import HAS_CUDA_AND_TRITON
|
||||
from torch.testing._internal.inductor_utils import (
|
||||
GPU_TYPE,
|
||||
HAS_GPU_AND_TRITON,
|
||||
HAS_MULTIGPU,
|
||||
)
|
||||
|
||||
|
||||
requires_multigpu = functools.partial(
|
||||
unittest.skipIf, not TEST_MULTIGPU, "requires multiple cuda devices"
|
||||
unittest.skipIf, not HAS_MULTIGPU, f"requires multiple {GPU_TYPE} devices"
|
||||
)
|
||||
|
||||
|
||||
aten = torch.ops.aten
|
||||
|
||||
|
||||
class TestMoveConstructorsToCuda(TestCase):
|
||||
class TestMoveConstructorsToGpu(TestCase):
|
||||
def _check_fn(self, func, expect_cpu, *args):
|
||||
out_eager = func(*args)
|
||||
|
||||
@ -36,7 +40,7 @@ class TestMoveConstructorsToCuda(TestCase):
|
||||
def foo(x):
|
||||
return x[torch.arange(x.shape[0])]
|
||||
|
||||
inp = torch.rand(32, 77, 512, device="cuda")
|
||||
inp = torch.rand(32, 77, 512, device=GPU_TYPE)
|
||||
|
||||
self._check_fn(foo, False, inp)
|
||||
|
||||
@ -45,14 +49,14 @@ class TestMoveConstructorsToCuda(TestCase):
|
||||
tmp1 = torch.arange(x.shape[0])
|
||||
return tmp1, x[tmp1]
|
||||
|
||||
inp = torch.rand(32, 77, 512, device="cuda")
|
||||
inp = torch.rand(32, 77, 512, device=GPU_TYPE)
|
||||
|
||||
self._check_fn(foo, True, inp)
|
||||
|
||||
def test_non_convertable_op_failure(self):
|
||||
def foo(x):
|
||||
y = torch.arange(x.shape[0])
|
||||
return x + y, torch.ones([4], device="cuda")
|
||||
return x + y, torch.ones([4], device=GPU_TYPE)
|
||||
|
||||
inp = torch.rand([100])
|
||||
|
||||
@ -76,7 +80,7 @@ class TestMoveConstructorsToCuda(TestCase):
|
||||
c2 = torch.arange(-1, 3)
|
||||
return x[c1 + c2], c2 - 4 * 2
|
||||
|
||||
inp = torch.rand([4]).cuda()
|
||||
inp = torch.rand([4]).to(GPU_TYPE)
|
||||
_, code = run_and_get_code(foo, inp)
|
||||
FileCheck().check_not("triton.jit").run(code[0])
|
||||
|
||||
@ -95,12 +99,12 @@ class TestMoveConstructorsToCuda(TestCase):
|
||||
def foo(x):
|
||||
return (
|
||||
x[torch.arange(x.shape[0])],
|
||||
torch.ones([4], device="cuda:0"),
|
||||
torch.ones([4], device="cuda:1"),
|
||||
torch.ones([4], device=f"{GPU_TYPE}:0"),
|
||||
torch.ones([4], device=f"{GPU_TYPE}:1"),
|
||||
)
|
||||
|
||||
# nyi, multi-gpu
|
||||
inp = torch.rand([100], device="cuda")
|
||||
inp = torch.rand([100], device=GPU_TYPE)
|
||||
self._check_fn(foo, True, inp)
|
||||
|
||||
def test_no_gpu(self):
|
||||
@ -112,5 +116,5 @@ class TestMoveConstructorsToCuda(TestCase):
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
if IS_LINUX and HAS_CUDA_AND_TRITON:
|
||||
if IS_LINUX and HAS_GPU_AND_TRITON:
|
||||
run_tests()
|
||||
@ -17,7 +17,6 @@ from torch.testing._internal.common_utils import (
|
||||
instantiate_parametrized_tests,
|
||||
parametrize,
|
||||
skipIfRocm,
|
||||
skipIfXpu,
|
||||
)
|
||||
from torch.testing._internal.inductor_utils import (
|
||||
GPU_TYPE,
|
||||
@ -70,7 +69,6 @@ def make_cpp_wrapper_test(orig_test, **extra_args):
|
||||
"""
|
||||
|
||||
@config.patch("cpp_wrapper", True)
|
||||
@skipIfXpu(msg="cpp wrapper doesn't currently work on the XPU stack")
|
||||
def fn(self):
|
||||
# The same kernel may have been compiled by previous tests with
|
||||
# cpp_wrapper disabled. Clear the cache so we go ahead to re-compile
|
||||
@ -111,7 +109,6 @@ class MultiKernelTest(TestCase):
|
||||
@requires_triton()
|
||||
# TODO: bobrenjc93 to fix multi-kernel for ROCM
|
||||
@skipIfRocm
|
||||
@skipIfXpu
|
||||
@unittest.skipIf(not IS_BIG_GPU, "templates require big gpu")
|
||||
def test_triton_gemm(self):
|
||||
def fn(x, y):
|
||||
@ -140,7 +137,6 @@ class MultiKernelTest(TestCase):
|
||||
@requires_triton()
|
||||
# TODO: bobrenjc93 to fix multi-kernel for ROCM
|
||||
@skipIfRocm
|
||||
@skipIfXpu
|
||||
@unittest.skipIf(not IS_BIG_GPU, "templates require big gpu")
|
||||
def test_triton_relu_fused_gemm(self):
|
||||
def fn(x, y):
|
||||
|
||||
@ -41,7 +41,6 @@ from torch.testing._internal.common_utils import (
|
||||
IS_LINUX,
|
||||
parametrize,
|
||||
skipIfRocm,
|
||||
skipIfXpu,
|
||||
)
|
||||
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU, IS_BIG_GPU
|
||||
from torch.utils import _pytree as pytree
|
||||
@ -1298,7 +1297,6 @@ class TestPatternMatcher(TestCase):
|
||||
# of search_fn).
|
||||
self.assertTrue(pattern.pattern_eq(search_fn_pattern))
|
||||
|
||||
@skipIfXpu
|
||||
@xfailIfSM89
|
||||
@inductor_config.patch(
|
||||
{
|
||||
|
||||
@ -12,8 +12,12 @@ import torch._inductor.utils
|
||||
from torch import _dynamo as torchdynamo
|
||||
from torch._inductor import config
|
||||
from torch.profiler import ProfilerActivity
|
||||
from torch.testing._internal.common_utils import TemporaryFileName
|
||||
from torch.testing._internal.inductor_utils import HAS_CUDA_AND_TRITON, IS_BIG_GPU
|
||||
from torch.testing._internal.common_utils import skipIfXpu, TemporaryFileName
|
||||
from torch.testing._internal.inductor_utils import (
|
||||
GPU_TYPE,
|
||||
HAS_GPU_AND_TRITON,
|
||||
IS_BIG_GPU,
|
||||
)
|
||||
from torch.torch_version import TorchVersion
|
||||
from torch.utils._triton import has_triton
|
||||
|
||||
@ -22,6 +26,10 @@ HAS_TRITON = has_triton()
|
||||
|
||||
|
||||
class DynamoProfilerTests(torch._inductor.test_case.TestCase):
|
||||
@skipIfXpu(
|
||||
msg="AssertionError: False is not true, "
|
||||
"https://github.com/intel/torch-xpu-ops/issues/2335"
|
||||
)
|
||||
@unittest.skipIf(not HAS_TRITON, "requires cuda & triton")
|
||||
def test_inductor_profiling_triton_launch(self):
|
||||
# Verify that we get some sort of CPU-side indication of triton kernel launches
|
||||
@ -31,7 +39,7 @@ class DynamoProfilerTests(torch._inductor.test_case.TestCase):
|
||||
def fn(x, y):
|
||||
return (x + y).sin().cos()
|
||||
|
||||
x, y = (torch.rand((4, 4), device="cuda") for _ in range(2))
|
||||
x, y = (torch.rand((4, 4), device=GPU_TYPE) for _ in range(2))
|
||||
|
||||
with torch.profiler.profile() as prof:
|
||||
fn(x, y)
|
||||
@ -95,7 +103,7 @@ class DynamoProfilerTests(torch._inductor.test_case.TestCase):
|
||||
def fn(x, y):
|
||||
return (x + y).sin().cos()
|
||||
|
||||
args = [torch.rand((4, 4), device="cuda") for _ in range(2)]
|
||||
args = [torch.rand((4, 4), device=GPU_TYPE) for _ in range(2)]
|
||||
|
||||
events = self._test_profiling_kernel_names(fn, args, "sin")
|
||||
event_found = False
|
||||
@ -120,7 +128,7 @@ class DynamoProfilerTests(torch._inductor.test_case.TestCase):
|
||||
def fn(x, y):
|
||||
return x @ y
|
||||
|
||||
args = [torch.rand((4, 4), device="cuda") for _ in range(2)]
|
||||
args = [torch.rand((4, 4), device=GPU_TYPE) for _ in range(2)]
|
||||
|
||||
def check_fn():
|
||||
# test_profiling_kernel_names will check this before asserting mm is in the trace.
|
||||
@ -153,8 +161,8 @@ class DynamoProfilerTests(torch._inductor.test_case.TestCase):
|
||||
def fn(x, y):
|
||||
return torch._foreach_add(x, y)
|
||||
|
||||
x = [torch.rand((4, 4), device="cuda") for _ in range(3)]
|
||||
y = [torch.rand((4, 4), device="cuda") for _ in range(3)]
|
||||
x = [torch.rand((4, 4), device=GPU_TYPE) for _ in range(3)]
|
||||
y = [torch.rand((4, 4), device=GPU_TYPE) for _ in range(3)]
|
||||
|
||||
args = (x, y)
|
||||
|
||||
@ -206,8 +214,8 @@ class DynamoProfilerTests(torch._inductor.test_case.TestCase):
|
||||
def fn(x, y):
|
||||
return torch._foreach_add(x, y)
|
||||
|
||||
x = [torch.rand((4, 4), device="cuda") for _ in range(3)]
|
||||
y = [torch.rand((4, 4), device="cuda") for _ in range(3)]
|
||||
x = [torch.rand((4, 4), device=GPU_TYPE) for _ in range(3)]
|
||||
y = [torch.rand((4, 4), device=GPU_TYPE) for _ in range(3)]
|
||||
|
||||
args = (x, y)
|
||||
fn_opt = torch.compile(fn)
|
||||
@ -216,11 +224,14 @@ class DynamoProfilerTests(torch._inductor.test_case.TestCase):
|
||||
self.assertTrue(hooks_called["enter"])
|
||||
self.assertTrue(hooks_called["exit"])
|
||||
|
||||
@skipIfXpu(
|
||||
msg="TypeError: list indices must be integers or slices, not str, https://github.com/intel/torch-xpu-ops/issues/2335"
|
||||
)
|
||||
@unittest.skipIf(not HAS_TRITON, "requires cuda & triton")
|
||||
def test_pt2_triton_attributes(self):
|
||||
from torch._inductor.codecache import code_hash
|
||||
|
||||
device = "cuda"
|
||||
device = GPU_TYPE
|
||||
debug = False # set to True to get output file
|
||||
|
||||
@torchdynamo.optimize("inductor")
|
||||
@ -295,7 +306,7 @@ class DynamoProfilerTests(torch._inductor.test_case.TestCase):
|
||||
|
||||
@unittest.skipIf(not HAS_TRITON, "requires cuda & triton")
|
||||
def test_cupti_lazy_reinit(self):
|
||||
x, y = (torch.randn(4, 4, device="cuda") for _ in range(2))
|
||||
x, y = (torch.randn(4, 4, device=GPU_TYPE) for _ in range(2))
|
||||
|
||||
def fn(x, y):
|
||||
return (x + y).sin()
|
||||
@ -314,5 +325,5 @@ class DynamoProfilerTests(torch._inductor.test_case.TestCase):
|
||||
if __name__ == "__main__":
|
||||
from torch._inductor.test_case import run_tests
|
||||
|
||||
if HAS_CUDA_AND_TRITON:
|
||||
if HAS_GPU_AND_TRITON:
|
||||
run_tests()
|
||||
|
||||
@ -28,7 +28,11 @@ from torch._inductor.test_case import run_tests, TestCase
|
||||
from torch._inductor.utils import run_and_get_code, run_and_get_cpp_code
|
||||
from torch._inductor.virtualized import V
|
||||
from torch.testing._internal.common_utils import IS_MACOS
|
||||
from torch.testing._internal.triton_utils import requires_cuda_and_triton
|
||||
from torch.testing._internal.inductor_utils import GPU_TYPE
|
||||
from torch.testing._internal.triton_utils import (
|
||||
requires_cuda_and_triton,
|
||||
requires_gpu_and_triton,
|
||||
)
|
||||
|
||||
|
||||
try:
|
||||
@ -70,8 +74,8 @@ class Model2(torch.nn.Module):
|
||||
class Model3(torch.nn.Module):
|
||||
def __init__(self, n, k):
|
||||
super().__init__()
|
||||
self.weight = torch.randn(n, k, device="cuda")
|
||||
self.bias = torch.randn(n, device="cuda")
|
||||
self.weight = torch.randn(n, k, device=GPU_TYPE)
|
||||
self.bias = torch.randn(n, device=GPU_TYPE)
|
||||
|
||||
def forward(self, a):
|
||||
return torch.nn.functional.linear(a, self.weight, self.bias)
|
||||
@ -151,7 +155,7 @@ class TestProvenanceTracingArtifact(TestCase):
|
||||
m = re.match(r"WARNING.* debug trace: (.*)", cm.output[0])
|
||||
self.assertTrue(m)
|
||||
filepath = Path(m.group(1))
|
||||
if device == "cuda":
|
||||
if device == "cuda" or device == "xpu":
|
||||
expected_mapping = [
|
||||
(
|
||||
"cppCodeToPost",
|
||||
@ -201,13 +205,20 @@ class TestProvenanceTracingArtifact(TestCase):
|
||||
},
|
||||
),
|
||||
]
|
||||
if backend == "aot_inductor":
|
||||
if backend == "aot_inductor" and device == "cuda":
|
||||
expected_mapping[0][1]["aoti_torch_cuda_mm_out:2"] = [
|
||||
"mm_default"
|
||||
]
|
||||
expected_mapping[1][1]["mm_default"] = [
|
||||
"aoti_torch_cuda_mm_out:2"
|
||||
]
|
||||
elif backend == "aot_inductor" and device == "xpu":
|
||||
expected_mapping[0][1]["aoti_torch_xpu_mm_out:2"] = [
|
||||
"mm_default"
|
||||
]
|
||||
expected_mapping[1][1]["mm_default"] = [
|
||||
"aoti_torch_xpu_mm_out:2"
|
||||
]
|
||||
else:
|
||||
expected_mapping[0][1]["extern_kernels.mm:2"] = [
|
||||
"mm_default"
|
||||
@ -254,21 +265,21 @@ class TestProvenanceTracingArtifact(TestCase):
|
||||
if filepath:
|
||||
shutil.rmtree(filepath)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_triton_kernel_to_post_grad_tracing_cuda(self):
|
||||
self._test_triton_kernel_to_post_grad_tracing(device="cuda")
|
||||
self._test_triton_kernel_to_post_grad_tracing(device=GPU_TYPE)
|
||||
|
||||
def test_triton_kernel_to_post_grad_tracing_cpu(self):
|
||||
self._test_triton_kernel_to_post_grad_tracing(device="cpu")
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_triton_kernel_to_post_grad_tracing_extern_kernel(self):
|
||||
M = 8
|
||||
N = 6
|
||||
K = 16
|
||||
model = Model3(N, K)
|
||||
batch = 2
|
||||
a = torch.randn(batch, M, K, device="cuda")
|
||||
a = torch.randn(batch, M, K, device=GPU_TYPE)
|
||||
example_inputs = (a,)
|
||||
filepath = None
|
||||
|
||||
@ -302,9 +313,10 @@ class TestProvenanceTracingArtifact(TestCase):
|
||||
else:
|
||||
# backend = aot_inductor
|
||||
expected_data = {
|
||||
"aoti_torch_cuda_addmm_out:2": ["addmm"],
|
||||
f"aoti_torch_{GPU_TYPE}_addmm_out:2": ["addmm"],
|
||||
"triton_poi_fused_0:1": ["_tensor_constant1"],
|
||||
}
|
||||
|
||||
self._check_provenance_tracing_kernel_to_post_grad(
|
||||
filepath, expected_data
|
||||
)
|
||||
@ -312,12 +324,12 @@ class TestProvenanceTracingArtifact(TestCase):
|
||||
if filepath:
|
||||
shutil.rmtree(filepath)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def _test_pt_tracing_combo_kernel(self, backend):
|
||||
"""This test checks that generated provenance tracing artifact from triton combo kernel to post grad nodes"""
|
||||
a = torch.randn(10, 10, device="cuda")
|
||||
b = torch.randn(20, 20, device="cuda")
|
||||
c = torch.randn(10, 10, device="cuda")
|
||||
a = torch.randn(10, 10, device=GPU_TYPE)
|
||||
b = torch.randn(20, 20, device=GPU_TYPE)
|
||||
c = torch.randn(10, 10, device=GPU_TYPE)
|
||||
example_inputs = (a, b, c)
|
||||
|
||||
model = Model2()
|
||||
@ -348,7 +360,7 @@ class TestProvenanceTracingArtifact(TestCase):
|
||||
expected_data = {"triton_poi_fused_0:1": ["relu", "sigmoid", "tanh"]}
|
||||
self._check_provenance_tracing_kernel_to_post_grad(filepath, expected_data)
|
||||
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_triton_kernel_to_post_grad_tracing_combo_kernel(self):
|
||||
self._test_pt_tracing_combo_kernel(backend="inductor")
|
||||
self._test_pt_tracing_combo_kernel(backend="aot_inductor")
|
||||
@ -465,7 +477,7 @@ class TestProvenanceTracingNodeMeta(TestCase):
|
||||
"""
|
||||
return next(iter([node for node in gm.graph.nodes if node.target == target]))
|
||||
|
||||
@requires_cuda_and_triton # test only works for cuda pattern matcher
|
||||
@requires_gpu_and_triton # test only works for cuda pattern matcher
|
||||
def test_pattern_matcher_transfer_meta(self):
|
||||
"""
|
||||
Test that stack trace is transfered when node is decomposed in post_grad_passes
|
||||
@ -484,9 +496,9 @@ class TestProvenanceTracingNodeMeta(TestCase):
|
||||
x = self.sigmoid(x)
|
||||
return x * 3
|
||||
|
||||
x = torch.randn(8, 10).to("cuda")
|
||||
x = torch.randn(8, 10).to(GPU_TYPE)
|
||||
example_inputs = (x,)
|
||||
model = Model().to("cuda")
|
||||
model = Model().to(GPU_TYPE)
|
||||
|
||||
# mimic the before_post_grad graph
|
||||
ep = torch.export.export(model, example_inputs).run_decompositions()
|
||||
@ -546,9 +558,9 @@ class TestProvenanceTracingStackTraces(TestCase):
|
||||
return s.split("\n")[i].strip()
|
||||
|
||||
@torch._inductor.config.patch({"trace.provenance_tracking_level": 2})
|
||||
@requires_cuda_and_triton
|
||||
@requires_gpu_and_triton
|
||||
def test_tlparse_kernel_stack_traces(self):
|
||||
device = "cuda"
|
||||
device = GPU_TYPE
|
||||
model = Model4().to(device)
|
||||
x = torch.randn(8, 10).to(device)
|
||||
a = torch.randn(10, 20).to(device)
|
||||
@ -642,16 +654,16 @@ class TestProvenanceTracingStackTraces(TestCase):
for item in data[field]:
self.assertIsInstance(item, str)

@requires_cuda_and_triton
@requires_gpu_and_triton
@torch._inductor.config.patch("trace.provenance_tracking_level", 1)
def test_kernel_information_generation(self):
"""Test basic kernel information generation in AOTI packages."""

model = Model4().to("cuda")
x = torch.randn(8, 10, device="cuda")
a = torch.randn(10, 20, device="cuda")
b = torch.randn(20, 30, device="cuda")
c = torch.randn(10, 30, device="cuda")
model = Model4().to(GPU_TYPE)
x = torch.randn(8, 10, device=GPU_TYPE)
a = torch.randn(10, 20, device=GPU_TYPE)
b = torch.randn(20, 30, device=GPU_TYPE)
c = torch.randn(10, 30, device=GPU_TYPE)
inputs = (x, a, b, c)

with tempfile.TemporaryDirectory() as temp_dir:
@ -712,14 +724,14 @@ class TestProvenanceTracingStackTraces(TestCase):
],
"pre_grad_nodes": ["gelu", "addmm"],
},
"aoti_torch_cuda_mm_out:1": {
f"aoti_torch_{GPU_TYPE}_mm_out:1": {
"stack_traces": [
"x = self.fc1(x)",
],
"post_grad_nodes": ["mm_default_1"],
"pre_grad_nodes": ["linear"],
},
"aoti_torch_cuda_mm_out:4": {
f"aoti_torch_{GPU_TYPE}_mm_out:4": {
"stack_traces": [
"y = torch.addmm(c, d, b)",
],
@ -29,7 +29,7 @@ from torch._inductor.test_case import run_tests, TestCase
from torch._inductor.utils import is_big_gpu, run_and_get_kernels
from torch._inductor.virtualized import V
from torch._prims_common import ELEMENTWISE_TYPE_PROMOTION_KIND
from torch.testing._internal.common_utils import IS_LINUX, skipIfRocm, skipIfXpu
from torch.testing._internal.common_utils import IS_LINUX, skipIfRocm
from torch.testing._internal.inductor_utils import (
GPU_TYPE,
HAS_GPU,
@ -180,7 +180,6 @@ class TestSelectAlgorithm(TestCase):
self.assertEqual(counters["inductor"]["select_algorithm_autotune"], 1)

@patches
@skipIfXpu(msg="Double datatype matmul is not supported in oneDNN")
def test_mm_skip(self):
@torch.compile
def foo(a, b):
@ -249,7 +248,6 @@ class TestSelectAlgorithm(TestCase):

# TODO: fix accuracy failure of the triton template on XPU.
# and enable this test case.
@skipIfXpu
@patches
def test_mm_plus_mm2(self):
@torch.compile
@ -5,7 +5,7 @@ import torch._inductor
from torch._dynamo.utils import counters
from torch._inductor.test_case import run_tests, TestCase
from torch.testing._internal.inductor_utils import GPU_TYPE
from torch.testing._internal.triton_utils import requires_cuda_and_triton
from torch.testing._internal.triton_utils import requires_gpu_and_triton


try:
@ -248,7 +248,7 @@ class TestSplitCatAten(TestCase):
self.compare_dict_tensors(ref_grad, res_grad, rtol=rtol, atol=atol)
)

@requires_cuda_and_triton
@requires_gpu_and_triton
@torch._inductor.config.patch(
pre_grad_fusion_options={},
post_grad_fusion_options={
@ -291,7 +291,7 @@ class TestSplitCatAten(TestCase):
self.compare_parameters(module, traced, rtol=1e-8, atol=1e-8)
counters.clear()

@requires_cuda_and_triton
@requires_gpu_and_triton
@torch._inductor.config.patch(
pre_grad_fusion_options={},
post_grad_fusion_options={
@ -317,7 +317,7 @@ class TestSplitCatAten(TestCase):
self.compare_parameters(module, traced, rtol=1e-8, atol=1e-8)
counters.clear()

@requires_cuda_and_triton
@requires_gpu_and_triton
@torch._inductor.config.patch(
pre_grad_fusion_options={},
post_grad_fusion_options={
@ -342,7 +342,7 @@ class TestSplitCatAten(TestCase):
self.compare_parameters(module, traced, rtol=1e-8, atol=1e-8)
counters.clear()

@requires_cuda_and_triton
@requires_gpu_and_triton
@torch._inductor.config.patch(
pre_grad_fusion_options={},
post_grad_fusion_options={
@ -140,7 +140,10 @@ from torch.testing._internal.inductor_utils import (
skipCPUIf,
skipCUDAIf,
)
from torch.testing._internal.triton_utils import requires_cuda_and_triton
from torch.testing._internal.triton_utils import (
requires_cuda_and_triton,
requires_gpu_and_triton,
)


_T = TypeVar("_T")
@ -189,7 +192,7 @@ test_int_dtypes = [
torch.int64,
]

if SM80OrLater or MACOS_VERSION >= 14.0:
if SM80OrLater or MACOS_VERSION >= 14.0 or GPU_TYPE == "xpu":
test_dtypes.append(torch.bfloat16)
@ -2317,7 +2320,7 @@ class CommonTemplate:
{"dynamic_shapes": False, "assume_static_by_default": True}
)
def test_custom_scan_op(self):
if self.device != "cuda":
if self.device != GPU_TYPE:
raise unittest.SkipTest("associative_scan only supported on GPU")

def sum_combine(a, b):
@ -2346,7 +2349,7 @@ class CommonTemplate:
{"dynamic_shapes": False, "assume_static_by_default": True}
)
def test_custom_scan_op_compiled(self):
if self.device != "cuda":
if self.device != GPU_TYPE:
raise unittest.SkipTest("associative_scan only supported on GPU")

from torch._higher_order_ops.associative_scan import associative_scan
@ -2376,7 +2379,7 @@ class CommonTemplate:
{"dynamic_shapes": False, "assume_static_by_default": True}
)
def test_custom_scan_op_multi_input(self):
if self.device != "cuda":
if self.device != GPU_TYPE:
raise unittest.SkipTest("associative_scan only supported on GPU")

def argmax_combine(a, b):
@ -2403,7 +2406,7 @@ class CommonTemplate:
{"dynamic_shapes": False, "assume_static_by_default": True}
)
def test_custom_scan_would_split(self):
if self.device != "cuda":
if self.device != GPU_TYPE:
raise unittest.SkipTest("associative_scan only supported on GPU")

def combine_linear_recurrence(left, right):
@ -2454,7 +2457,6 @@ class CommonTemplate:
self.common(fn, [packed])

@xfail_if_mps_unimplemented
@skipIfXpu(msg="No _weight_int8pack_mm implementation on XPU")
def test_int8_weight_only_quant(self):
def convert_weight_to_int8pack(b):
b_int8pack, b_scales, _ = _dynamically_quantize_per_channel(
@ -3912,7 +3914,6 @@ class CommonTemplate:
check_lowp=True,
)

@skipIfXpu
def test_mm_mixed_dtype(self):
def fn(a, b):
return torch.mm(a, b)
@ -3928,7 +3929,6 @@ class CommonTemplate:
with self.assertRaisesRegex(RuntimeError, msg):
torch.compile(fn)(t1, t2)

@skipIfXpu
@xfail_if_mps_unimplemented  # linear for non-float inputs
def test_linear_mixed_dtype(self):
class Net(nn.Module):
@ -7502,8 +7502,8 @@ def forward(self, arg0_1: "Sym(s77)", arg1_1: "Sym(s27)", arg2_1: "Sym(s53)", ar

@requires_gpu()
def test_grid_sampler_expand_preserves_view(self):
if not self.device.startswith("cuda"):
self.skipTest("requires CUDA")
if not self.device.startswith("cuda") and not self.device.startswith("xpu"):
self.skipTest("requires CUDA or XPU")

torch.manual_seed(0)
torch._dynamo.reset()
@ -11906,7 +11906,6 @@ def forward(self, arg0_1: "Sym(s77)", arg1_1: "Sym(s27)", arg2_1: "Sym(s53)", ar
torch.preserve_format,
)

@skipIfXpu
def test_resize_as(self):
def fn(x, y, memory_format):
return torch.ops.aten.resize_as(x, y, memory_format=memory_format)
@ -13586,7 +13585,7 @@ def forward(self, arg0_1: "Sym(s77)", arg1_1: "Sym(s27)", arg2_1: "Sym(s53)", ar
"assert_size_stride(buf2, (16, 32), (32, 1)"
).run(code)
@requires_cuda_and_triton
@requires_gpu_and_triton
@config.patch(use_fast_math=True)
def test_prepare_softmax_with_fast_math(self):
"""
@ -14085,7 +14084,11 @@ def forward(self, arg0_1: "Sym(s77)", arg1_1: "Sym(s27)", arg2_1: "Sym(s53)", ar
inputs = (torch.randn(4, device=self.device),)
self.common(Model(), inputs)

@requires_cuda_and_triton
@skipIfXpu(
msg="Profile not enabled on XPU CI, "
"https://github.com/intel/torch-xpu-ops/issues/2334"
)
@requires_gpu_and_triton
@parametrize("use_cat", [True, False])
def test_copy_non_blocking_is_pinned(self, use_cat):
def f(a_list):
@ -14211,7 +14214,7 @@ def forward(self, arg0_1: "Sym(s77)", arg1_1: "Sym(s27)", arg2_1: "Sym(s53)", ar
inp = torch.randn(100, 100, device=self.device)
self.assertTrue(CommonTemplate._is_triggering_buffer_reuse(fn, m, inp))

@requires_cuda_and_triton
@requires_gpu_and_triton
def test_cpu_scalar_with_gpu_tensor(self):
def fn(a, b):
return a + b[0]
@ -14225,7 +14228,7 @@ def forward(self, arg0_1: "Sym(s77)", arg1_1: "Sym(s27)", arg2_1: "Sym(s53)", ar
self.assertEqual(eager, compiled)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

@requires_cuda_and_triton
@requires_gpu_and_triton
@torch._inductor.config.patch(cpp_wrapper=True)
def test_cpu_scalar_with_gpu_tensor_cpp(self):
def fn(a, b):
@ -14238,7 +14241,7 @@ def forward(self, arg0_1: "Sym(s77)", arg1_1: "Sym(s27)", arg2_1: "Sym(s53)", ar
compiled = torch.compile(fn, backend="inductor")(a, b)
self.assertEqual(eager, compiled)
@requires_cuda_and_triton
@requires_gpu_and_triton
def test_cpu_scalar_with_gpu_tensor_dynamic(self):
def fn(a, b):
return a + b[0]
@ -14263,7 +14266,7 @@ def forward(self, arg0_1: "Sym(s77)", arg1_1: "Sym(s27)", arg2_1: "Sym(s53)", ar
self.assertEqual(eager, compiled)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

@requires_cuda_and_triton
@requires_gpu_and_triton
def test_gpu_scalar_with_gpu_tensor(self):
def fn(a, b):
return a + b[0]
@ -14277,7 +14280,7 @@ def forward(self, arg0_1: "Sym(s77)", arg1_1: "Sym(s27)", arg2_1: "Sym(s53)", ar
self.assertEqual(eager, compiled)
self.assertEqual(torch._inductor.metrics.generated_kernel_count, 1)

@requires_cuda_and_triton
@requires_gpu_and_triton
def test_cpu_tensor_with_gpu_tensor(self):
def fn(a, b):
return a + b
@ -14310,7 +14313,7 @@ def forward(self, arg0_1: "Sym(s77)", arg1_1: "Sym(s27)", arg2_1: "Sym(s53)", ar
compiled = torch.compile(fn, backend="inductor")(a, b)
self.assertEqual(eager, compiled)

@requires_cuda_and_triton
@requires_gpu_and_triton
def test_gpu_scalar_with_cpu_tensor(self):
def fn(a, b):
return a[0] + b
@ -14321,7 +14324,7 @@ def forward(self, arg0_1: "Sym(s77)", arg1_1: "Sym(s27)", arg2_1: "Sym(s53)", ar
with self.assertRaises(RuntimeError):
compiled = torch.compile(fn, backend="inductor")(a, b)

@requires_cuda_and_triton
@requires_gpu_and_triton
@config.patch(emulate_precision_casts=True)
def test_emulate_precision_triton_fp_fusion(self):
def fn(a, b):
@ -14335,7 +14338,7 @@ def forward(self, arg0_1: "Sym(s77)", arg1_1: "Sym(s27)", arg2_1: "Sym(s53)", ar
self.assertTrue("'enable_fp_fusion': False" in code)
torch.testing.assert_close(out, fn(a, b), atol=0, rtol=0)

@requires_cuda_and_triton
@requires_gpu_and_triton
@config.patch(runtime_triton_nan_asserts=True)
def test_nan_assert_inside_triton_kernel(self):
def fn(x):
@ -14352,7 +14355,7 @@ def forward(self, arg0_1: "Sym(s77)", arg1_1: "Sym(s27)", arg2_1: "Sym(s53)", ar
torch.testing.assert_close(out, fn(x))

@skip_if_cpp_wrapper("skip cpp wrapper")
@requires_cuda_and_triton
@requires_gpu_and_triton
def test_repeat_interleave_decomposition_has_clamp(self):
repeat = torch.ones(2560, dtype=torch.int64, device=GPU_TYPE)
output_size = 505450
@ -657,7 +657,6 @@ class CommonTemplate:
(False, 0),  # We can't infer that the load is a power of 2.
],
)
@skipIfXpu(msg="Remove this after Intel triton issue #4000 resolved.")
def test_dynamic_shapes_reduction(self, with_tiling: bool, num_block_pointers: int):
"""
Test a reduction kernel with dynamic shapes.
@ -906,6 +905,10 @@ class CommonTemplate:
# Check for 2 reduction dimensions.
self._assert_reduction_ndims(code, 2)

@skipIfXpu(
msg="AssertionError: Scalars are not equal!, "
"https://github.com/intel/torch-xpu-ops/issues/2332"
)
@xfail_if_use_tensor_descriptor  # Cannot use TMA API for store with no x dimension.
@test_torchinductor.skip_if_triton_cpu  # Illegal instruction File; cannot xfail because it crashes process
def test_2d_reduction_multi_kernel(self):
@ -20,8 +20,8 @@ from torch.testing._internal.common_utils import (
)
from torch.testing._internal.inductor_utils import (
GPU_TYPE,
HAS_CUDA_AND_TRITON,
HAS_GPU,
HAS_GPU_AND_TRITON,
requires_cuda_with_enough_memory,
)

@ -128,11 +128,11 @@ class TestTritonHeuristics(TestCase):
]
self.assertEqual(forward(*args), foo_c(*args))

@skipIfXpu
# @skipIfXpu
def test_artificial_zgrid(self):
self._test_artificial_zgrid()

@skipIfXpu
# @skipIfXpu
@config.patch("cpp_wrapper", True)
def test_artificial_grid_cpp_wrapper(self):
self._test_artificial_zgrid()
@ -152,7 +152,7 @@ class TestTritonHeuristics(TestCase):

triton_meta = {
"signature": {"in_ptr0": "*fp32", "out_ptr0": "*fp32", "xnumel": "i32"},
"device": DeviceProperties.create(torch.device("cuda")),
"device": DeviceProperties.create(torch.device(GPU_TYPE)),
"constants": {},
"configs": [
AttrsDescriptorWrapper(divisible_by_16=(0, 1, 2), equal_to_1=())
@ -178,7 +178,7 @@ class TestTritonHeuristics(TestCase):
"inductor_meta": inductor_meta,
}
@skipIfXpu
# @skipIfXpu
def test_pre_hook_assert(self):
# assert if any of the configs passed to the CachingAutotuner have pre-hooks
args = self._get_cos_kernel_caching_autotuner_args()
@ -272,9 +272,9 @@ class TestTritonHeuristics(TestCase):
res = torch.compile(fn)(x)
self.assertEqual(ref, res)

@skipIfXpu
@skipIfXpu(msg="https://github.com/intel/torch-xpu-ops/issues/2331")
@skipIfRocm
@skipUnless(HAS_CUDA_AND_TRITON, "requires CUDA")
@skipUnless(HAS_GPU_AND_TRITON, "requires gpu and triton")
@parametrize("do_pruning", [False, True])
def test_prune_configs_over_shared_memory_limit(self, do_pruning):
from torch._inductor.template_heuristics.triton import (
@ -326,7 +326,7 @@ class TestArgumentCloneAndRestore(TestCase):
return out

def _do_test(self, gpu_tensor):
torch.cuda.reset_peak_memory_stats()
torch.get_device_module(GPU_TYPE).reset_peak_memory_stats()
autotuner = self._create_caching_autotuner()

old_storage_offset = gpu_tensor.storage_offset()
@ -348,7 +348,7 @@ class TestArgumentCloneAndRestore(TestCase):

# Note: torch.allclose somehow allocates large amount of extra memory.
# Record peak memory before that.
peak_mem_after = torch.cuda.max_memory_allocated()
peak_mem_after = torch.get_device_module(GPU_TYPE).max_memory_allocated()

self.assertTrue(torch.allclose(gpu_tensor, gpu_tensor_clone))
self.assertTrue(
@ -711,7 +711,6 @@ def forward(self, x_1, output_1):
self.assertEqual(int_result, resulti)

@requires_gpu
@skipIfXpu
def test_triton_kernel_constants(self):
@triton.jit
def mulC_kernel(
@ -723,6 +722,7 @@ def forward(self, x_1, output_1):
):
pid = tl.program_id(axis=0)
block_start = pid * BLOCK_SIZE

offsets = block_start + tl.arange(0, BLOCK_SIZE)
mask = offsets < n_elements
x = tl.load(in_ptr0 + offsets, mask=mask)
@ -2243,7 +2243,7 @@ def forward(self, arg0_1, arg1_1):
self.assertEqual(compiled_out, eager_out)

# TODO enable this test case on XPU.
@requires_cuda_and_triton
@requires_gpu_and_triton
@parametrize("cfg", ["normal", "cpp_wrapper"])
def test_triton_kernel_dtype_view(self, cfg):
# https://github.com/pytorch/pytorch/issues/136159
@ -2542,8 +2542,11 @@ def forward(self, arg0_1, arg1_1):
self.assertEqual(actual, expected)

@requires_gpu
@skipIfXpu(
msg="XPU Triton result in nan, "
"https://github.com/intel/torch-xpu-ops/issues/2330"
)
@skipIfRocm
@skipIfXpu
@inductor_config.patch({"triton.autotune_at_compile_time": True})
@parametrize("quotes", ["single", "double"])
def test_kernel_inline_asm(self, quotes):
@ -38,10 +38,6 @@ class TestUnbackedSymints(InductorTestCase):

torch.testing.assert_close(actual, expected)

@skipIfXpu(
msg="The OP aten.nonzero implemented by XPU has different memory layout with fake tensor."
" Remove this skip after #146883 fixed."
)
@skipGPUIf(not HAS_GPU, "requires gpu and triton")
@dynamo_config.patch({"capture_dynamic_output_shape_ops": True})
def test_expand_ok_with_runtime_assert(self, device):
@ -653,6 +649,9 @@ class TestUnbackedSymints(InductorTestCase):
expected = fn(*example_inputs)
torch.testing.assert_close(actual, expected)
@skipIfXpu(
msg="Invalid SPIR-V module, https://github.com/intel/torch-xpu-ops/issues/2329"
)
@skipGPUIf(not HAS_GPU, "requires gpu and triton")
@inductor_config.patch({"max_autotune": True})
@dynamo_config.patch({"capture_scalar_outputs": True})

@ -221,7 +221,7 @@ S390X_BLOCKLIST = [
"inductor/test_inplacing_pass",
"inductor/test_kernel_benchmark",
"inductor/test_max_autotune",
"inductor/test_move_constructors_to_cuda",
"inductor/test_move_constructors_to_gpu",
"inductor/test_multi_kernel",
"inductor/test_pattern_matcher",
"inductor/test_perf",