[ROCm] Enabling additional UTs on ROCm (#115738)
Unskips tests, mostly dynamo/inductor unit tests, so that they run on ROCm.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/115738
Approved by: https://github.com/jithunnair-amd, https://github.com/malfet
Committed by: PyTorch MergeBot
Parent: f0bbc2fcf5
Commit: db79ceb110
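Most of the hunks below follow a single pattern: a `@skipIfRocm` decorator (imported from `torch.testing._internal.common_utils`) is removed so the test also runs on ROCm builds, while a small number of tests that still fail there gain the decorator instead. A minimal sketch of the before/after shape of such a test; the test body is illustrative, not taken from this diff:

import torch
from torch.testing._internal.common_utils import run_tests, skipIfRocm, TestCase


class ExampleTests(TestCase):
    # Before this kind of change the method carried the skip:
    # @skipIfRocm
    def test_add(self):
        x = torch.ones(4)
        self.assertEqual((x + x).sum().item(), 8.0)


if __name__ == "__main__":
    run_tests()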
@@ -13,7 +13,7 @@ from functorch.compile import min_cut_rematerialization_partition
 from torch._dynamo.backends.common import aot_autograd
 from torch._dynamo.testing import CompileCounterWithBackend
 from torch._higher_order_ops.wrap import tag_activation_checkpoint
-from torch.testing._internal.common_utils import IS_WINDOWS
+from torch.testing._internal.common_utils import IS_WINDOWS, skipIfRocm
 from torch.testing._internal.inductor_utils import HAS_CUDA
 from torch.testing._internal.two_tensor import TwoTensor
 from torch.utils.checkpoint import _pt2_selective_checkpoint_context_fn_gen, checkpoint
@@ -858,6 +858,7 @@ class ActivationCheckpointingViaTagsTests(torch._dynamo.test_case.TestCase):
         self._validate(fn, backend, x, y)
 
     @requires_cuda()
+    @skipIfRocm
     def test_autocast_flash_attention(self):
         def fn(primals_1, primals_2, primals_3):
             return torch.ops.aten._scaled_dot_product_efficient_attention.default(
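Here `skipIfRocm` guards the newly enabled file: `test_autocast_flash_attention` exercises an SDPA kernel that is still not expected to pass on ROCm, so it keeps an explicit skip. Conceptually the decorator behaves like a ROCm-conditional `unittest.skipIf`; a simplified stand-in is sketched below (the real helper in `common_utils` has additional options, and the ROCm detection here via `torch.version.hip` is an assumption of this sketch):

import functools
import unittest

import torch

# Assumption for this sketch: a ROCm/HIP build is detected via torch.version.hip.
TEST_WITH_ROCM = torch.version.hip is not None


def skip_if_rocm(fn):
    """Illustrative stand-in for common_utils.skipIfRocm."""
    @functools.wraps(fn)
    def wrapper(*args, **kwargs):
        if TEST_WITH_ROCM:
            raise unittest.SkipTest("test doesn't currently work on the ROCm stack")
        return fn(*args, **kwargs)
    return wrapper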
@@ -10,7 +10,7 @@ import torch._dynamo.config
 import torch._dynamo.test_case
 import torch._dynamo.testing
 from torch._dynamo.testing import same
-from torch.testing._internal.common_utils import skipIfRocm, TEST_CUDA_GRAPH
+from torch.testing._internal.common_utils import TEST_CUDA_GRAPH
 
 
 def composed(*decs):
@@ -105,7 +105,6 @@ class TestAotCudagraphs(torch._dynamo.test_case.TestCase):
         y = torch.randn((), device="cpu")
         fn(x, y)
 
-    @skipIfRocm
     def test_mutate_input(self):
        def model(x, y):
             y.add_(3)
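`test_mutate_input` in the AOT cudagraphs suite is now expected to pass on ROCm. A hedged sketch of the kind of scenario it covers, compiling a function that mutates one of its inputs under the cudagraphs backend (the shapes and the exact backend wiring are illustrative, not the test's own code):

import torch


@torch.compile(backend="cudagraphs")
def model(x, y):
    y.add_(3)          # in-place mutation of an input
    return x * y


if torch.cuda.is_available():
    x = torch.randn(3, device="cuda")
    y = torch.randn(3, device="cuda")
    out = model(x, y)  # graph capture/replay must preserve the mutation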
@@ -127,7 +127,6 @@ class TestSDPAPatternRewriterTemplate(TestCase):
         when an intermediate result is being used / returned downstream
         """
 
-        @skipIfRocm
         @torch.compile(fullgraph=True)
         def dot_prod_attention(
             query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
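The hunk truncates `dot_prod_attention` at its signature; tests in this rewriter suite define the decomposed attention computation that Inductor's pattern matcher is expected to fuse into a single SDPA call. A sketch of what such a body typically looks like (the exact scaling factor used by the real test may differ):

import math

import torch


@torch.compile(fullgraph=True)
def dot_prod_attention(
    query: torch.Tensor, key: torch.Tensor, value: torch.Tensor
) -> torch.Tensor:
    # softmax(Q @ K^T / sqrt(d)) @ V, written out so the pattern matcher can rewrite it
    return (
        torch.matmul(query, key.transpose(-2, -1))
        .div(math.sqrt(key.shape[-1]))
        .softmax(dim=-1)
        .matmul(value)
    )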
@@ -23,7 +23,6 @@ sys.path.append(pytorch_test_dir)
 from torch.testing._internal.common_utils import (
     IS_CI,
     IS_WINDOWS,
-    skipIfRocm,
     TEST_WITH_ASAN,
     TestCase as TorchTestCase,
 )
@@ -470,7 +469,6 @@ class OptimizeForInferenceTemplate(TestCase):
         mod_eager = mod(x)
         self.assertEqual(foo(mod, x), mod_eager)
 
-    @skipIfRocm
     def test_cpp_wrapper(self):
         mod = ConvBN(3, 32, kernel_size=3, stride=2).eval().to(self.device)
 
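`test_cpp_wrapper` in the inference/freezing suite now runs on ROCm as well. A minimal sketch of exercising Inductor's C++ wrapper codegen on a small conv+BN module; the module, shapes, and config patching below are illustrative and do not reproduce the test's own `ConvBN` helper:

import torch
import torch._inductor.config as inductor_config


def make_mod():
    return torch.nn.Sequential(
        torch.nn.Conv2d(3, 32, kernel_size=3, stride=2),
        torch.nn.BatchNorm2d(32),
    ).eval()


# Enable the C++ wrapper for the duration of the compile/run.
with inductor_config.patch(cpp_wrapper=True):
    mod = make_mod()
    x = torch.randn(2, 3, 32, 32)
    with torch.no_grad():
        compiled = torch.compile(mod)
        out = compiled(x)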
@@ -8,11 +8,7 @@ import torch
 import torch._inductor.config as config
 from torch._inductor import metrics
 from torch._inductor.compile_fx import compile_fx, count_bytes_inner
-from torch.testing._internal.common_utils import (
-    IS_WINDOWS,
-    skipIfRocm,
-    TestCase as TorchTestCase,
-)
+from torch.testing._internal.common_utils import IS_WINDOWS, TestCase as TorchTestCase
 
 # Defines all the kernels for tests
 from torch.testing._internal.triton_utils import HAS_CUDA, requires_cuda
@@ -704,7 +700,6 @@ class InplacingTests(TestCase):
         self.assertExpectedInline(count_numel(f, *inp), """42""")
 
     @requires_cuda()
-    @skipIfRocm
     def test_inplace_triton_kernel_v1(self):
         def f(x: torch.Tensor, y: torch.Tensor):
             output = torch.zeros_like(x)
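The `InplacingTests` unskips all share the same shape: the function under test allocates `output`, fills it through a user-defined Triton kernel, and the test asserts on how many elements Inductor reads and writes. A minimal sketch of such a kernel and wrapper, assuming a plain vector add similar to the kernels provided by `torch.testing._internal.triton_utils`:

import torch
import triton
import triton.language as tl


@triton.jit
def add_kernel(x_ptr, y_ptr, out_ptr, n_elements, BLOCK_SIZE: tl.constexpr):
    pid = tl.program_id(axis=0)
    offsets = pid * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements
    x = tl.load(x_ptr + offsets, mask=mask)
    y = tl.load(y_ptr + offsets, mask=mask)
    tl.store(out_ptr + offsets, x + y, mask=mask)


def f(x: torch.Tensor, y: torch.Tensor):
    # Allocate the destination; the kernel writes into it in place.
    output = torch.zeros_like(x)
    n_elements = output.numel()
    grid = (triton.cdiv(n_elements, 1024),)
    add_kernel[grid](x, y, output, n_elements, BLOCK_SIZE=1024)
    return output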
@@ -717,7 +712,6 @@ class InplacingTests(TestCase):
         self.assertExpectedInline(count_numel(f, *inp), """40""")
 
     @requires_cuda()
-    @skipIfRocm
     def test_inplace_triton_kernel_v2(self):
         def f(x: torch.Tensor, y: torch.Tensor):
             output = torch.zeros_like(x)
@@ -731,7 +725,6 @@ class InplacingTests(TestCase):
         self.assertExpectedInline(count_numel(f, *inp), """60""")
 
     @requires_cuda()
-    @skipIfRocm
     def test_inplace_triton_kernel_v3(self):
         def f(x: torch.Tensor, y: torch.Tensor):
             output = torch.zeros_like(x)
@@ -745,7 +738,6 @@ class InplacingTests(TestCase):
         self.assertExpectedInline(count_numel(f, *inp), """80""")
 
     @requires_cuda()
-    @skipIfRocm
     def test_inplace_triton_kernel_v4(self):
         def f(x: torch.Tensor, y: torch.Tensor):
             x_view = x.view(-1)
@@ -760,7 +752,6 @@ class InplacingTests(TestCase):
         self.assertExpectedInline(count_numel(f, *inp), """60""")
 
     @requires_cuda()
-    @skipIfRocm
     def test_inplace_triton_kernel_v5(self):
         def f(x: torch.Tensor, y: torch.Tensor):
             x_view = x.view(-1)
@@ -775,7 +766,6 @@ class InplacingTests(TestCase):
         self.assertExpectedInline(count_numel(f, *inp), """80""")
 
     @requires_cuda()
-    @skipIfRocm
     def test_inplace_triton_kernel_v6(self):
         def f(x: torch.Tensor, y: torch.Tensor):
             output = torch.zeros_like(x)
@@ -300,7 +300,6 @@ ROCM_BLOCKLIST = [
     "test_jit_legacy",
     "test_cuda_nvml_based_avail",
     "test_jit_cuda_fuser",
-    "dynamo/test_activation_checkpointing",
 ]
 
 # The tests inside these files should never be run in parallel with each other
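With `dynamo/test_activation_checkpointing` dropped from `ROCM_BLOCKLIST`, ROCm CI will now schedule that file. A simplified sketch of how such a blocklist is typically applied when selecting tests; the `exclude` helper below is illustrative, not the actual `run_test.py` code:

from typing import List

ROCM_BLOCKLIST: List[str] = [
    "test_jit_legacy",
    "test_cuda_nvml_based_avail",
    "test_jit_cuda_fuser",
]


def exclude(selected_tests: List[str], blocklist: List[str]) -> List[str]:
    # Drop every selected test file that appears in the blocklist.
    return [t for t in selected_tests if t not in blocklist]


selected = ["test_jit_legacy", "dynamo/test_activation_checkpointing", "test_ops"]
print(exclude(selected, ROCM_BLOCKLIST))
# ['dynamo/test_activation_checkpointing', 'test_ops']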
@@ -3462,7 +3462,6 @@ class TestSparseCompressedTritonKernels(TestCase):
         return d
 
     @onlyCUDA
-    @skipIfRocm
     @dtypes(torch.half, torch.bfloat16, torch.float)
     @dtypesIfCUDA(torch.half, *[torch.bfloat16] if SM80OrLater else [], torch.float)
     @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "Test requires Triton")
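The remaining hunks strip `@skipIfRocm` from device/dtype-parameterized tests in the sparse compressed Triton-kernel suite. A hedged sketch of how that decorator stack is normally used with the device-type test framework; the test body is illustrative:

import torch
from torch.testing._internal.common_device_type import (
    dtypes,
    instantiate_device_type_tests,
    onlyCUDA,
)
from torch.testing._internal.common_utils import run_tests, TestCase


class ExampleTritonKernelTests(TestCase):
    @onlyCUDA
    @dtypes(torch.half, torch.bfloat16, torch.float)
    def test_roundtrip(self, device, dtype):
        x = torch.randn(8, 8, device=device, dtype=dtype)
        # Dense -> BSR -> dense roundtrip is exact, so assertEqual holds per dtype.
        self.assertEqual(x.to_sparse_bsr(4).to_dense(), x)


# Generates ExampleTritonKernelTestsCUDA etc., one test per device/dtype combination.
instantiate_device_type_tests(ExampleTritonKernelTests, globals(), only_for="cuda")


if __name__ == "__main__":
    run_tests()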
@@ -3498,7 +3497,6 @@ class TestSparseCompressedTritonKernels(TestCase):
     @parametrize("block_size", [16, 32, 64])
     @parametrize("index_dtype", [torch.int32, torch.int64])
     @onlyCUDA
-    @skipIfRocm
     @dtypes(torch.half, torch.bfloat16, torch.float)
     @dtypesIfCUDA(torch.half, *[torch.bfloat16] if SM80OrLater else [], torch.float)
     @unittest.skipIf((not TEST_WITH_TORCHINDUCTOR) or (IS_FBCODE and IS_REMOTE_GPU) or torch._running_with_deploy(),
@@ -3577,7 +3575,6 @@ class TestSparseCompressedTritonKernels(TestCase):
         self.assertEqual(res_tri, res_dense)
 
     @onlyCUDA
-    @skipIfRocm
     @dtypes(torch.half)
     @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU or torch._running_with_deploy(),
                      "Skipped for deploy and internal with remote GPUs")
@@ -3787,7 +3784,6 @@ class TestSparseCompressedTritonKernels(TestCase):
 
     @parametrize("blocksize", [2, '2x3', 16, '16x32', 32, 64])
     @onlyCUDA
-    @skipIfRocm
     @dtypes(torch.half, torch.bfloat16, torch.float)
     @dtypesIfCUDA(torch.half, *[torch.bfloat16] if SM80OrLater else [], torch.float)
     @unittest.skipIf(IS_FBCODE and IS_REMOTE_GPU, "Test requires Triton")