[ROCm] Enabling additional UTs on ROCm (#115738)

Unskips tests, mostly dynamo/inductor unit tests, so they run on ROCm.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/115738
Approved by: https://github.com/jithunnair-amd, https://github.com/malfet
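For context, skipIfRocm is a decorator in torch.testing._internal.common_utils that skips a test when PyTorch is running on a ROCm build; "unskipping" a test means dropping that decorator so the test also runs on AMD GPUs. A minimal, illustrative sketch of the pattern this PR adds in one place and removes everywhere else (the class and test names below are hypothetical, not taken from the diff):

import torch
from torch.testing._internal.common_utils import run_tests, skipIfRocm, TestCase


class ExampleRocmGatingTest(TestCase):
    # Still gated: the decorator turns the test into a skip on ROCm builds
    # (the same pattern this PR adds to test_autocast_flash_attention below).
    @skipIfRocm
    def test_still_gated_on_rocm(self):
        self.assertEqual(torch.ones(2).sum().item(), 2.0)

    # Unskipped: with the decorator removed, the test runs on ROCm as well
    # (the same pattern this PR applies to the dynamo/inductor tests below).
    def test_now_runs_on_rocm(self):
        self.assertEqual(torch.ones(2).sum().item(), 2.0)


if __name__ == "__main__":
    run_tests()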
Author: Jack Taylor
Date: 2024-01-09 08:36:07 +00:00
Committed by: PyTorch MergeBot
Parent: f0bbc2fcf5
Commit: db79ceb110
7 changed files with 4 additions and 22 deletions


@@ -13,7 +13,7 @@ from functorch.compile import min_cut_rematerialization_partition
from torch._dynamo.backends.common import aot_autograd
from torch._dynamo.testing import CompileCounterWithBackend
from torch._higher_order_ops.wrap import tag_activation_checkpoint
-from torch.testing._internal.common_utils import IS_WINDOWS
+from torch.testing._internal.common_utils import IS_WINDOWS, skipIfRocm
from torch.testing._internal.inductor_utils import HAS_CUDA
from torch.testing._internal.two_tensor import TwoTensor
from torch.utils.checkpoint import _pt2_selective_checkpoint_context_fn_gen, checkpoint
@@ -858,6 +858,7 @@ class ActivationCheckpointingViaTagsTests(torch._dynamo.test_case.TestCase):
self._validate(fn, backend, x, y)
@requires_cuda()
+@skipIfRocm
def test_autocast_flash_attention(self):
def fn(primals_1, primals_2, primals_3):
return torch.ops.aten._scaled_dot_product_efficient_attention.default(


@@ -10,7 +10,7 @@ import torch._dynamo.config
import torch._dynamo.test_case
import torch._dynamo.testing
from torch._dynamo.testing import same
-from torch.testing._internal.common_utils import skipIfRocm, TEST_CUDA_GRAPH
+from torch.testing._internal.common_utils import TEST_CUDA_GRAPH
def composed(*decs):
@@ -105,7 +105,6 @@ class TestAotCudagraphs(torch._dynamo.test_case.TestCase):
y = torch.randn((), device="cpu")
fn(x, y)
-@skipIfRocm
def test_mutate_input(self):
def model(x, y):
y.add_(3)


@@ -127,7 +127,6 @@ class TestSDPAPatternRewriterTemplate(TestCase):
when an intermediate result is being used / returned downstream
"""
-@skipIfRocm
@torch.compile(fullgraph=True)
def dot_prod_attention(
query: torch.Tensor, key: torch.Tensor, value: torch.Tensor


@@ -23,7 +23,6 @@ sys.path.append(pytorch_test_dir)
from torch.testing._internal.common_utils import (
IS_CI,
IS_WINDOWS,
-    skipIfRocm,
TEST_WITH_ASAN,
TestCase as TorchTestCase,
)
@@ -470,7 +469,6 @@ class OptimizeForInferenceTemplate(TestCase):
mod_eager = mod(x)
self.assertEqual(foo(mod, x), mod_eager)
-@skipIfRocm
def test_cpp_wrapper(self):
mod = ConvBN(3, 32, kernel_size=3, stride=2).eval().to(self.device)


@@ -8,11 +8,7 @@ import torch
import torch._inductor.config as config
from torch._inductor import metrics
from torch._inductor.compile_fx import compile_fx, count_bytes_inner
-from torch.testing._internal.common_utils import (
-    IS_WINDOWS,
-    skipIfRocm,
-    TestCase as TorchTestCase,
-)
+from torch.testing._internal.common_utils import IS_WINDOWS, TestCase as TorchTestCase
# Defines all the kernels for tests
from torch.testing._internal.triton_utils import HAS_CUDA, requires_cuda
@@ -704,7 +700,6 @@ class InplacingTests(TestCase):
self.assertExpectedInline(count_numel(f, *inp), """42""")
@requires_cuda()
-@skipIfRocm
def test_inplace_triton_kernel_v1(self):
def f(x: torch.Tensor, y: torch.Tensor):
output = torch.zeros_like(x)
@@ -717,7 +712,6 @@ class InplacingTests(TestCase):
self.assertExpectedInline(count_numel(f, *inp), """40""")
@requires_cuda()
-@skipIfRocm
def test_inplace_triton_kernel_v2(self):
def f(x: torch.Tensor, y: torch.Tensor):
output = torch.zeros_like(x)
@@ -731,7 +725,6 @@ class InplacingTests(TestCase):
self.assertExpectedInline(count_numel(f, *inp), """60""")
@requires_cuda()
-@skipIfRocm
def test_inplace_triton_kernel_v3(self):
def f(x: torch.Tensor, y: torch.Tensor):
output = torch.zeros_like(x)
@@ -745,7 +738,6 @@ class InplacingTests(TestCase):
self.assertExpectedInline(count_numel(f, *inp), """80""")
@requires_cuda()
-@skipIfRocm
def test_inplace_triton_kernel_v4(self):
def f(x: torch.Tensor, y: torch.Tensor):
x_view = x.view(-1)
@@ -760,7 +752,6 @@ class InplacingTests(TestCase):
self.assertExpectedInline(count_numel(f, *inp), """60""")
@requires_cuda()
-@skipIfRocm
def test_inplace_triton_kernel_v5(self):
def f(x: torch.Tensor, y: torch.Tensor):
x_view = x.view(-1)
@@ -775,7 +766,6 @@ class InplacingTests(TestCase):
self.assertExpectedInline(count_numel(f, *inp), """80""")
@requires_cuda()
-@skipIfRocm
def test_inplace_triton_kernel_v6(self):
def f(x: torch.Tensor, y: torch.Tensor):
output = torch.zeros_like(x)


@@ -300,7 +300,6 @@ ROCM_BLOCKLIST = [
"test_jit_legacy",
"test_cuda_nvml_based_avail",
"test_jit_cuda_fuser",
"dynamo/test_activation_checkpointing",
]
# The tests inside these files should never be run in parallel with each other
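For context, run_test.py keeps a ROCM_BLOCKLIST of test files that are excluded wholesale when testing on ROCm; removing dynamo/test_activation_checkpointing from that list is what lets the file run at all, with only test_autocast_flash_attention still gated by the @skipIfRocm added above. A rough, hypothetical sketch of how a blocklist filter of this kind can be applied (illustrative only, not the actual run_test.py logic):

# Hypothetical sketch: exclude blocklisted test files from the selected set.
ROCM_BLOCKLIST = [
    "test_jit_legacy",
    "test_cuda_nvml_based_avail",
    "test_jit_cuda_fuser",
]

def exclude_blocklisted(selected_tests, blocklist):
    # Keep only the test files that are not on the blocklist.
    return [test for test in selected_tests if test not in blocklist]

selected = ["dynamo/test_activation_checkpointing", "test_jit_legacy", "test_ops"]
print(exclude_blocklisted(selected, ROCM_BLOCKLIST))
# -> ['dynamo/test_activation_checkpointing', 'test_ops']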


@@ -3462,7 +3462,6 @@ class TestSparseCompressedTritonKernels(TestCase):
return d
@onlyCUDA
-@skipIfRocm
@dtypes(torch.half, torch.bfloat16, torch.float)
@dtypesIfCUDA(torch.half, *[torch.bfloat16] if SM80OrLater else [], torch.float)
@unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "Test requires Triton")
@@ -3498,7 +3497,6 @@ class TestSparseCompressedTritonKernels(TestCase):
@parametrize("block_size", [16, 32, 64])
@parametrize("index_dtype", [torch.int32, torch.int64])
@onlyCUDA
-@skipIfRocm
@dtypes(torch.half, torch.bfloat16, torch.float)
@dtypesIfCUDA(torch.half, *[torch.bfloat16] if SM80OrLater else [], torch.float)
@unittest.skipIf((not TEST_WITH_TORCHINDUCTOR) or (IS_FBCODE and IS_REMOTE_GPU) or torch._running_with_deploy(),
@@ -3577,7 +3575,6 @@ class TestSparseCompressedTritonKernels(TestCase):
self.assertEqual(res_tri, res_dense)
@onlyCUDA
-@skipIfRocm
@dtypes(torch.half)
@unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU or torch._running_with_deploy(),
"Skipped for deploy and internal with remote GPUs")
@@ -3787,7 +3784,6 @@ class TestSparseCompressedTritonKernels(TestCase):
@parametrize("blocksize", [2, '2x3', 16, '16x32', 32, 64])
@onlyCUDA
-@skipIfRocm
@dtypes(torch.half, torch.bfloat16, torch.float)
@dtypesIfCUDA(torch.half, *[torch.bfloat16] if SM80OrLater else [], torch.float)
@unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "Test requires Triton")