From ee6a1ecb0a1035f068484c8fcfba44b2efc9e837 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Thu, 9 Oct 2025 18:03:08 +0000 Subject: [PATCH] [ROCm] Enable MI355 CI on PRs, and run full set of UTs on PRs (#160215) Useful to have PR testing for PRs such as https://github.com/pytorch/pytorch/pull/151360 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160215 Approved by: https://github.com/malfet, https://github.com/atalman Co-authored-by: Jeff Daily --- .github/pytorch-probot.yml | 1 + .github/workflows/rocm-mi355.yml | 7 ++++++- aten/src/ATen/native/cuda/int4mm.cu | 4 ++-- test/inductor/test_cuda_repro.py | 3 +++ test/inductor/test_mkldnn_pattern_matcher.py | 5 +++-- test/test_flop_counter.py | 3 ++- torch/testing/_internal/common_utils.py | 1 + 7 files changed, 18 insertions(+), 6 deletions(-) diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index 086b016b93d8..b682a0990b60 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -30,6 +30,7 @@ ciflow_push_tags: - ciflow/riscv64 - ciflow/rocm - ciflow/rocm-mi300 +- ciflow/rocm-mi355 - ciflow/s390 - ciflow/slow - ciflow/torchbench diff --git a/.github/workflows/rocm-mi355.yml b/.github/workflows/rocm-mi355.yml index 5403a7300615..bd791e61f443 100644 --- a/.github/workflows/rocm-mi355.yml +++ b/.github/workflows/rocm-mi355.yml @@ -1,6 +1,9 @@ name: rocm-mi355 on: + push: + tags: + - ciflow/rocm-mi355/* workflow_dispatch: schedule: - cron: 30 11,1 * * * # about 4:30am PDT and 6:30pm PDT @@ -64,5 +67,7 @@ jobs: build-environment: linux-noble-rocm-py3.12-mi355 docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }} test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }} - tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor" + tests-to-include: >- + ${{ github.event_name == 'schedule' && 'test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor test_matmul_cuda test_scaled_matmul_cuda' + || '' }} secrets: inherit diff --git a/aten/src/ATen/native/cuda/int4mm.cu b/aten/src/ATen/native/cuda/int4mm.cu index 5444bb57eba7..e16a5d1f4d96 100644 --- a/aten/src/ATen/native/cuda/int4mm.cu +++ b/aten/src/ATen/native/cuda/int4mm.cu @@ -127,7 +127,7 @@ inline __host__ __device__ uint32_t getAlignmentRoundUp(const void* p) { return diff == 0 ? 
0 : uint32_t(Align) - diff; } -#if defined (__gfx90a__) || defined(__gfx942__) +#if defined (__gfx90a__) || defined(__gfx942__) || defined(__gfx950__) #define CDNA2_OR_LATER 1 #else #define CDNA2_OR_LATER 0 @@ -143,7 +143,7 @@ template using VecT = T __attribute__((ext_vector_type(Rank))); static bool isCDNA2orLater(int index) { - return at::detail::getCUDAHooks().isGPUArch({"gfx90a", "gfx942"}, index); + return at::detail::getCUDAHooks().isGPUArch({"gfx90a", "gfx942", "gfx950"}, index); } #else diff --git a/test/inductor/test_cuda_repro.py b/test/inductor/test_cuda_repro.py index 911d588df536..2119a998f6d7 100644 --- a/test/inductor/test_cuda_repro.py +++ b/test/inductor/test_cuda_repro.py @@ -39,6 +39,8 @@ from torch.testing._internal.common_utils import ( DeterministicGuard, freeze_rng_state, IS_FBCODE, + MI350_ARCH, + skipIfRocmArch, TEST_WITH_ASAN, TEST_WITH_ROCM, xfailIfPy312Plus, @@ -218,6 +220,7 @@ class CudaReproTests(TestCase): # dont check rng state self.assertEqual(out[:2], fn(query, key, value, input_tensor2)[:2]) + @skipIfRocmArch(MI350_ARCH) def test_effn_attn_bias_padding_misaligned(self): seqlen_start = 1008 diff --git a/test/inductor/test_mkldnn_pattern_matcher.py b/test/inductor/test_mkldnn_pattern_matcher.py index 55e880ffad07..16f88b3c9419 100644 --- a/test/inductor/test_mkldnn_pattern_matcher.py +++ b/test/inductor/test_mkldnn_pattern_matcher.py @@ -31,6 +31,7 @@ from torch.testing._internal.common_utils import ( IS_LINUX, IS_X86, MI300_ARCH, + MI350_ARCH, parametrize, skipIfNoXPU, skipIfRocm, @@ -1187,7 +1188,7 @@ class TestPatternMatcher(TestPatternMatcherBase): @skipIfNoDynamoSupport @skipIfNoONEDNNBF16 @skipIfNoONEDNN - @skipIfRocmArch(MI300_ARCH) + @skipIfRocmArch(MI300_ARCH + MI350_ARCH) def test_qconv2d_int8_mixed_bf16(self): r""" This testcase will quantize a single Conv2d module with int8_mixed_bf16 quantization. @@ -1197,7 +1198,7 @@ class TestPatternMatcher(TestPatternMatcherBase): @skipIfNoDynamoSupport @skipIfNoONEDNNBF16 @skipIfNoONEDNN - @skipIfRocmArch(MI300_ARCH) + @skipIfRocmArch(MI300_ARCH + MI350_ARCH) def test_qconv2d_int8_mixed_bf16_use_autocast(self): r""" This testcase will quantize a single Conv2d module with int8_mixed_bf16 quantization. diff --git a/test/test_flop_counter.py b/test/test_flop_counter.py index 17e699e04e58..03eb15744b54 100644 --- a/test/test_flop_counter.py +++ b/test/test_flop_counter.py @@ -13,6 +13,7 @@ from torch.testing._internal.common_cuda import ( PLATFORM_SUPPORTS_FP8, PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, ) +from torch.testing._internal.common_device_type import e4m3_type from torch.testing._internal.common_utils import ( run_tests, TEST_WITH_TORCHDYNAMO, @@ -853,7 +854,7 @@ class TestFlopCounter(TestCase): "FP8 is only supported on H100+, SM 8.9 and MI300+ devices", ) def test_scaled_mm(self): - dtype = torch.float8_e4m3fnuz if torch.version.hip else torch.float8_e4m3fn + dtype = e4m3_type with FlopCounterMode() as mode: torch._scaled_mm( torch.randn((3 * 16, 5 * 16), device="cuda").to(dtype), diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index ce1d42144aed..d7803d5c7c6e 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -102,6 +102,7 @@ except ImportError: SEED = 1234 +MI350_ARCH = ("gfx950",) MI300_ARCH = ("gfx942",) MI200_ARCH = ("gfx90a") NAVI_ARCH = ("gfx1030", "gfx1100", "gfx1101", "gfx1200", "gfx1201")
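---

A note on the workflow change: adding ciflow/rocm-mi355 to ciflow_push_tags registers the tag with the PyTorch probot, which (as I understand the ciflow flow) pushes a ciflow/rocm-mi355/<PR number> tag when the corresponding label is applied to a PR; the new push/tags trigger in rocm-mi355.yml matches that tag, so the workflow now runs on PRs in addition to the schedule.

GitHub Actions expressions have no ternary operator, so tests-to-include uses the `cond && a || b` short-circuit idiom: scheduled runs keep the curated test list, while tag-triggered PR runs pass an empty string, leaving the test matrix unfiltered so the full set of UTs runs (per the subject line). The idiom evaluates like Python's `and`/`or`; a minimal sketch of the semantics (names are illustrative):

    # How `github.event_name == 'schedule' && '<tests>' || ''` evaluates.
    # GHA's `&&`/`||` short-circuit exactly like Python's `and`/`or`.
    SCHEDULED_TESTS = (
        "test_nn test_torch test_cuda test_ops test_unary_ufuncs "
        "test_binary_ufuncs test_autograd inductor/test_torchinductor "
        "test_matmul_cuda test_scaled_matmul_cuda"
    )

    def tests_to_include(event_name: str) -> str:
        # `cond and a or b` yields `a` when cond is true, else `b`.
        # Safe here because SCHEDULED_TESTS is a non-empty (truthy) string.
        return event_name == "schedule" and SCHEDULED_TESTS or ""

    assert tests_to_include("schedule") == SCHEDULED_TESTS
    assert tests_to_include("push") == ""  # empty => full test matrix on PRs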
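The test skips build on the new MI350_ARCH tuple; the trailing comma in MI350_ARCH = ("gfx950",) is what makes it a one-element tuple rather than a parenthesized string, and since the arch lists are plain tuples, MI300_ARCH + MI350_ARCH in test_mkldnn_pattern_matcher.py simply concatenates them: ("gfx942",) + ("gfx950",) == ("gfx942", "gfx950"). A simplified sketch of how a skipIfRocmArch-style decorator can work (an illustration, not the exact implementation in common_utils.py):

    import unittest

    import torch

    def skipIfRocmArch(archs: tuple):
        """Skip the test on ROCm when the GPU is one of the given archs.

        Sketch only: the real helper in common_utils.py also covers
        non-ROCm builds and other corner cases.
        """
        def decorator(fn):
            def wrapper(self, *args, **kwargs):
                if torch.version.hip:
                    # gcnArchName can look like "gfx942:sramecc+:xnack-",
                    # so compare only the arch prefix.
                    arch = torch.cuda.get_device_properties(0).gcnArchName
                    if arch.split(":")[0] in archs:
                        raise unittest.SkipTest(f"skipped on {archs}")
                return fn(self, *args, **kwargs)
            return wrapper
        return decorator

The C++ side mirrors this gating: int4mm.cu extends both the compile-time CDNA2_OR_LATER gate (via __gfx950__) and the runtime isCDNA2orLater check to gfx950, so the int4 kernels are compiled and selected on MI350-class parts.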
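The test_flop_counter.py change replaces the inline per-platform fp8 dtype selection with the shared e4m3_type helper from common_device_type, centralizing the choice. A rough sketch of what the removed inline expression did (the actual helper may additionally key off the GPU arch, since newer ROCm parts support the OCP e4m3fn format):

    import torch

    # HIP builds historically use the fnuz fp8 variant; CUDA uses e4m3fn.
    e4m3_type = (
        torch.float8_e4m3fnuz if torch.version.hip else torch.float8_e4m3fn
    )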