[ROCm] Enabling additional UTs on ROCm (#115738)

Unskips tests, mostly dynamo/inductor unit tests, so they run on ROCm.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/115738
Approved by: https://github.com/jithunnair-amd, https://github.com/malfet
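For context, skipIfRocm is a decorator in torch.testing._internal.common_utils that skips a test when PyTorch is running on a ROCm build; "unskipping" a test means dropping that decorator so the test also runs on AMD GPUs. A minimal, illustrative sketch of the pattern this PR adds in one place and removes everywhere else (the class and test names below are hypothetical, not taken from the diff):

import torch
from torch.testing._internal.common_utils import run_tests, skipIfRocm, TestCase


class ExampleRocmGatingTest(TestCase):
    # Still gated: the decorator turns the test into a skip on ROCm builds
    # (the same pattern this PR adds to test_autocast_flash_attention below).
    @skipIfRocm
    def test_still_gated_on_rocm(self):
        self.assertEqual(torch.ones(2).sum().item(), 2.0)

    # Unskipped: with the decorator removed, the test runs on ROCm as well
    # (the same pattern this PR applies to the dynamo/inductor tests below).
    def test_now_runs_on_rocm(self):
        self.assertEqual(torch.ones(2).sum().item(), 2.0)


if __name__ == "__main__":
    run_tests()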
Author: Jack Taylor
Date: 2024-01-09 08:36:07 +00:00
Committed by: PyTorch MergeBot
Parent: f0bbc2fcf5
Commit: db79ceb110
7 changed files with 4 additions and 22 deletions


@@ -13,7 +13,7 @@ from functorch.compile import min_cut_rematerialization_partition
from torch._dynamo.backends.common import aot_autograd
from torch._dynamo.testing import CompileCounterWithBackend
from torch._higher_order_ops.wrap import tag_activation_checkpoint
-from torch.testing._internal.common_utils import IS_WINDOWS
+from torch.testing._internal.common_utils import IS_WINDOWS, skipIfRocm
from torch.testing._internal.inductor_utils import HAS_CUDA
from torch.testing._internal.two_tensor import TwoTensor
from torch.utils.checkpoint import _pt2_selective_checkpoint_context_fn_gen, checkpoint
@@ -858,6 +858,7 @@ class ActivationCheckpointingViaTagsTests(torch._dynamo.test_case.TestCase):
self._validate(fn, backend, x, y)
@requires_cuda()
+@skipIfRocm
def test_autocast_flash_attention(self):
def fn(primals_1, primals_2, primals_3):
return torch.ops.aten._scaled_dot_product_efficient_attention.default(


@@ -10,7 +10,7 @@ import torch._dynamo.config
import torch._dynamo.test_case
import torch._dynamo.testing
from torch._dynamo.testing import same
-from torch.testing._internal.common_utils import skipIfRocm, TEST_CUDA_GRAPH
+from torch.testing._internal.common_utils import TEST_CUDA_GRAPH
def composed(*decs):
@@ -105,7 +105,6 @@ class TestAotCudagraphs(torch._dynamo.test_case.TestCase):
y = torch.randn((), device="cpu")
fn(x, y)
-@skipIfRocm
def test_mutate_input(self):
def model(x, y):
y.add_(3)


@@ -127,7 +127,6 @@ class TestSDPAPatternRewriterTemplate(TestCase):
when an intermediate result is being used / returned downstream
"""
-@skipIfRocm
@torch.compile(fullgraph=True)
def dot_prod_attention(
query: torch.Tensor, key: torch.Tensor, value: torch.Tensor


@@ -23,7 +23,6 @@ sys.path.append(pytorch_test_dir)
from torch.testing._internal.common_utils import (
IS_CI,
IS_WINDOWS,
-    skipIfRocm,
TEST_WITH_ASAN,
TestCase as TorchTestCase,
)
@@ -470,7 +469,6 @@ class OptimizeForInferenceTemplate(TestCase):
mod_eager = mod(x)
self.assertEqual(foo(mod, x), mod_eager)
-@skipIfRocm
def test_cpp_wrapper(self):
mod = ConvBN(3, 32, kernel_size=3, stride=2).eval().to(self.device)


@@ -8,11 +8,7 @@ import torch
import torch._inductor.config as config
from torch._inductor import metrics
from torch._inductor.compile_fx import compile_fx, count_bytes_inner
-from torch.testing._internal.common_utils import (
-    IS_WINDOWS,
-    skipIfRocm,
-    TestCase as TorchTestCase,
-)
+from torch.testing._internal.common_utils import IS_WINDOWS, TestCase as TorchTestCase
# Defines all the kernels for tests
from torch.testing._internal.triton_utils import HAS_CUDA, requires_cuda
@@ -704,7 +700,6 @@ class InplacingTests(TestCase):
self.assertExpectedInline(count_numel(f, *inp), """42""")
@requires_cuda()
-@skipIfRocm
def test_inplace_triton_kernel_v1(self):
def f(x: torch.Tensor, y: torch.Tensor):
output = torch.zeros_like(x)
@@ -717,7 +712,6 @@ class InplacingTests(TestCase):
self.assertExpectedInline(count_numel(f, *inp), """40""")
@requires_cuda()
-@skipIfRocm
def test_inplace_triton_kernel_v2(self):
def f(x: torch.Tensor, y: torch.Tensor):
output = torch.zeros_like(x)
@@ -731,7 +725,6 @@ class InplacingTests(TestCase):
self.assertExpectedInline(count_numel(f, *inp), """60""")
@requires_cuda()
-@skipIfRocm
def test_inplace_triton_kernel_v3(self):
def f(x: torch.Tensor, y: torch.Tensor):
output = torch.zeros_like(x)
@@ -745,7 +738,6 @@ class InplacingTests(TestCase):
self.assertExpectedInline(count_numel(f, *inp), """80""")
@requires_cuda()
-@skipIfRocm
def test_inplace_triton_kernel_v4(self):
def f(x: torch.Tensor, y: torch.Tensor):
x_view = x.view(-1)
@@ -760,7 +752,6 @@ class InplacingTests(TestCase):
self.assertExpectedInline(count_numel(f, *inp), """60""")
@requires_cuda()
-@skipIfRocm
def test_inplace_triton_kernel_v5(self):
def f(x: torch.Tensor, y: torch.Tensor):
x_view = x.view(-1)
@@ -775,7 +766,6 @@ class InplacingTests(TestCase):
self.assertExpectedInline(count_numel(f, *inp), """80""")
@requires_cuda()
-@skipIfRocm
def test_inplace_triton_kernel_v6(self):
def f(x: torch.Tensor, y: torch.Tensor):
output = torch.zeros_like(x)


@@ -300,7 +300,6 @@ ROCM_BLOCKLIST = [
"test_jit_legacy",
"test_cuda_nvml_based_avail",
"test_jit_cuda_fuser",
"dynamo/test_activation_checkpointing",
]
# The tests inside these files should never be run in parallel with each other
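For context, run_test.py keeps a ROCM_BLOCKLIST of test files that are excluded wholesale when testing on ROCm; removing dynamo/test_activation_checkpointing from that list is what lets the file run at all, with only test_autocast_flash_attention still gated by the @skipIfRocm added above. A rough, hypothetical sketch of how a blocklist filter of this kind can be applied (illustrative only, not the actual run_test.py logic):

# Hypothetical sketch: exclude blocklisted test files from the selected set.
ROCM_BLOCKLIST = [
    "test_jit_legacy",
    "test_cuda_nvml_based_avail",
    "test_jit_cuda_fuser",
]

def exclude_blocklisted(selected_tests, blocklist):
    # Keep only the test files that are not on the blocklist.
    return [test for test in selected_tests if test not in blocklist]

selected = ["dynamo/test_activation_checkpointing", "test_jit_legacy", "test_ops"]
print(exclude_blocklisted(selected, ROCM_BLOCKLIST))
# -> ['dynamo/test_activation_checkpointing', 'test_ops']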


@@ -3462,7 +3462,6 @@ class TestSparseCompressedTritonKernels(TestCase):
return d
@onlyCUDA
-@skipIfRocm
@dtypes(torch.half, torch.bfloat16, torch.float)
@dtypesIfCUDA(torch.half, *[torch.bfloat16] if SM80OrLater else [], torch.float)
@unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "Test requires Triton")
@@ -3498,7 +3497,6 @@ class TestSparseCompressedTritonKernels(TestCase):
@parametrize("block_size", [16, 32, 64])
@parametrize("index_dtype", [torch.int32, torch.int64])
@onlyCUDA
-@skipIfRocm
@dtypes(torch.half, torch.bfloat16, torch.float)
@dtypesIfCUDA(torch.half, *[torch.bfloat16] if SM80OrLater else [], torch.float)
@unittest.skipIf((not TEST_WITH_TORCHINDUCTOR) or (IS_FBCODE and IS_REMOTE_GPU) or torch._running_with_deploy(),
@@ -3577,7 +3575,6 @@ class TestSparseCompressedTritonKernels(TestCase):
self.assertEqual(res_tri, res_dense)
@onlyCUDA
-@skipIfRocm
@dtypes(torch.half)
@unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU or torch._running_with_deploy(),
"Skipped for deploy and internal with remote GPUs")
@@ -3787,7 +3784,6 @@ class TestSparseCompressedTritonKernels(TestCase):
@parametrize("blocksize", [2, '2x3', 16, '16x32', 32, 64])
@onlyCUDA
-@skipIfRocm
@dtypes(torch.half, torch.bfloat16, torch.float)
@dtypesIfCUDA(torch.half, *[torch.bfloat16] if SM80OrLater else [], torch.float)
@unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "Test requires Triton")