diff --git a/test/distributed/_composable/fsdp/test_fully_shard_compile.py b/test/distributed/_composable/fsdp/test_fully_shard_compile.py
index 630e20a2540f..57fff5fe8947 100644
--- a/test/distributed/_composable/fsdp/test_fully_shard_compile.py
+++ b/test/distributed/_composable/fsdp/test_fully_shard_compile.py
@@ -32,7 +32,7 @@ from torch.testing._internal.common_distributed import (
     sm_is_or_higher_than,
 )
 from torch.testing._internal.common_fsdp import FSDPTest, get_devtype, MLP
-from torch.testing._internal.common_utils import run_tests, skipIfRocm
+from torch.testing._internal.common_utils import run_tests
 from torch.testing._internal.distributed._tensor.common_dtensor import (
     ModelArgs,
     Transformer,
@@ -133,7 +133,11 @@ class TestFullyShardCompile(FSDPTest):
             device_type.type,
             self.rank % torch.get_device_module(device_type).device_count(),
         )
-        if device_type.type == "cuda" and not sm_is_or_higher_than(device, 8, 0):
+        if (
+            device_type.type == "cuda"
+            and not torch.version.hip
+            and not sm_is_or_higher_than(device, 8, 0)
+        ):
             self.skipTest("bf16 requires sm >= 8.0")
 
     def test_dynamo_trace_use_training_state(self):
@@ -478,7 +482,6 @@ val.shape: {[node.meta["val"].shape for node in aliased_graph_inputs]},
         file_check = file_check.check("torch.ops._c10d_functional.wait_tensor.")
         return file_check
 
-    @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     def test_compiled_autograd_ctx(self):
         self.skipTestForOldSm()
@@ -643,14 +646,12 @@ Unsupported Tensor.backward() call
 
         return model_init_fn, input_creation_fn
 
-    @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     def test_simple_mlp_fullgraph_backend_aot_eager(self):
         self._test_traceable_fsdp(
             *self._create_simple_mlp_factory_fns(), "aot_eager", fwd_fullgraph=True
         )
 
-    @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     def test_simple_mlp_fullgraph_backend_aot_eager_decomp_partition(self):
         self._test_traceable_fsdp(
@@ -659,7 +660,6 @@ Unsupported Tensor.backward() call
             fwd_fullgraph=True,
         )
 
-    @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     def test_simple_mlp_fullgraph_backend_inductor(self):
         self.skipTestForOldSm()
@@ -731,7 +731,6 @@ Unsupported Tensor.backward() call
 
         return model_init_fn, input_creation_fn
 
-    @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     def test_nested_fully_shard_backend_aot_eager(self):
         # TODO: fix fwd_fullgraph=False case
@@ -744,7 +743,6 @@ Unsupported Tensor.backward() call
                 fwd_fullgraph=fwd_fullgraph,
             )
 
-    @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     def test_nested_fully_shard_backend_aot_eager_decomp_partition(self):
         # TODO: fix fwd_fullgraph=False case
@@ -866,19 +864,16 @@ Unsupported Tensor.backward() call
                     pass
         file_check.run(bwd_code)
 
-    @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     def test_nested_fully_shard_backend_inductor_fullgraph_True(self):
         self._test_nested_fully_shard_backend_inductor_fullgraph_True()
 
-    @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     @torch._inductor.config.patch("graph_partition", True)
     def test_nested_fully_shard_backend_inductor_fullgraph_True_graph_partition(self):
         self._test_nested_fully_shard_backend_inductor_fullgraph_True()
 
     @unittest.skip("TODO: fix fwd_fullgraph=False case")
-    @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     def test_nested_fully_shard_backend_inductor_fullgraph_False(self):
         self.skipTestForOldSm()
@@ -956,7 +951,6 @@ Unsupported Tensor.backward() call
         else:
             return contextlib.nullcontext()
 
-    @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     def test_transformer_backend_aot_eager(self):
         # TODO: fix fwd_fullgraph=False case
@@ -975,7 +969,6 @@ Unsupported Tensor.backward() call
                 fwd_fullgraph=fwd_fullgraph,
             )
 
-    @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     # TODO: native_dropout has worse accuracy after decomp, need to figure out why
     @torch._inductor.config.patch(fallback_random=True)
@@ -1111,7 +1104,6 @@ Unsupported Tensor.backward() call
         file_check.run(bwd_code)
 
     @unittest.skip('"Traceable FSDP2" is not being maintained anymore.')
-    @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     # TODO: native_dropout causes CUDA IMA error, need to figure out why
     @torch._inductor.config.patch(fallback_random=True)
@@ -1119,7 +1111,6 @@ Unsupported Tensor.backward() call
         self._test_transformer_backend_inductor_fullgraph_True()
 
     @unittest.skip('"Traceable FSDP2" is not being maintained anymore.')
-    @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     # TODO: native_dropout causes CUDA IMA error, need to figure out why
     @torch._inductor.config.patch(fallback_random=True)
@@ -1128,7 +1119,6 @@ Unsupported Tensor.backward() call
         self._test_transformer_backend_inductor_fullgraph_True()
 
     @unittest.skip("TODO: fix fwd_fullgraph=False case")
-    @skipIfRocm
     @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch")
     # TODO: native_dropout causes CUDA IMA error, need to figure out why
     @torch._inductor.config.patch(fallback_random=True)
diff --git a/test/inductor/test_distributed_patterns.py b/test/inductor/test_distributed_patterns.py
index 780fac7db528..e067bdfedc09 100644
--- a/test/inductor/test_distributed_patterns.py
+++ b/test/inductor/test_distributed_patterns.py
@@ -7,7 +7,7 @@ from torch import nn
 from torch._dynamo import compiled_autograd
 from torch._dynamo.test_case import run_tests, TestCase
 from torch._dynamo.testing import CompileCounter
-from torch.testing._internal.common_utils import IS_MACOS, skipIfRocm, skipIfXpu
+from torch.testing._internal.common_utils import IS_MACOS, skipIfXpu
 from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_CPU, requires_gpu
 
 
@@ -205,7 +205,6 @@ class DistributedPatternTests(TestCase):
     def test_storage_resize_zero_cpu(self):
         self._test_storage_resize_zero("cpu")
 
-    @skipIfRocm
     @requires_gpu()
     def test_storage_resize_zero_gpu(self):
         self._test_storage_resize_zero(GPU_TYPE)
@@ -230,7 +229,6 @@ class DistributedPatternTests(TestCase):
     def test_storage_resize_nonzero_cpu(self):
         self._test_storage_resize_nonzero("cpu")
 
-    @skipIfRocm
     @requires_gpu()
     def test_storage_resize_nonzero_gpu(self):
         self._test_storage_resize_nonzero(GPU_TYPE)
@@ -485,7 +483,6 @@ class DistributedPatternTests(TestCase):
         # Recompile on grad==None/grad!=None
         self.assertEqual(bw_cnt.frame_count, 2)
 
-    @skipIfRocm
     @skipIfXpu
     @requires_gpu()
     @torch._functorch.config.patch(recompute_views=True)
diff --git a/torch/csrc/inductor/resize_storage_bytes.cpp b/torch/csrc/inductor/resize_storage_bytes.cpp
index 018acb1a0fc5..b41b99aca747 100644
--- a/torch/csrc/inductor/resize_storage_bytes.cpp
+++ b/torch/csrc/inductor/resize_storage_bytes.cpp
@@ -14,8 +14,7 @@ using namespace at;
 static void resize_storage_bytes_(const Tensor& variable, SymInt new_size) {
   // similar to THPStorage_resize_ in StorageMethods.cpp, but is traceable
   if (variable.storage().device_type() == at::kCUDA) {
-    // rocm build has undefined reference to resize_bytes_cuda
-#if defined(USE_CUDA) && !defined(USE_ROCM)
+#if defined(USE_CUDA)
     at::native::resize_bytes_cuda(
         variable.storage().unsafeGetStorageImpl(), new_size.expect_int());
 #else