[ROCm] Enable MI355 CI on PRs, and run full set of UTs on PRs (#160215)

Useful to have PR testing for PRs such as https://github.com/pytorch/pytorch/pull/151360

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160215
Approved by: https://github.com/malfet, https://github.com/atalman

Co-authored-by: Jeff Daily <jeff.daily@amd.com>
This commit is contained in:
Jithun Nair
2025-10-09 18:03:08 +00:00
committed by PyTorch MergeBot
parent 3c0577bd15
commit ee6a1ecb0a
7 changed files with 18 additions and 6 deletions

View File

@@ -30,6 +30,7 @@ ciflow_push_tags:
- ciflow/riscv64
- ciflow/rocm
- ciflow/rocm-mi300
- ciflow/rocm-mi355
- ciflow/s390
- ciflow/slow
- ciflow/torchbench

View File

@@ -1,6 +1,9 @@
name: rocm-mi355
on:
push:
tags:
- ciflow/rocm-mi355/*
workflow_dispatch:
schedule:
- cron: 30 11,1 * * * # about 4:30am PDT and 6:30pm PDT
@@ -64,5 +67,7 @@ jobs:
build-environment: linux-noble-rocm-py3.12-mi355
docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }}
test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }}
tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor"
tests-to-include: >-
${{ github.event_name == 'schedule' && 'test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor test_matmul_cuda test_scaled_matmul_cuda'
|| '' }}
secrets: inherit

View File

@@ -127,7 +127,7 @@ inline __host__ __device__ uint32_t getAlignmentRoundUp(const void* p) {
return diff == 0 ? 0 : uint32_t(Align) - diff;
}
#if defined (__gfx90a__) || defined(__gfx942__)
#if defined (__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)
#define CDNA2_OR_LATER 1
#else
#define CDNA2_OR_LATER 0
@@ -143,7 +143,7 @@ template<typename T, uint32_t Rank>
using VecT = T __attribute__((ext_vector_type(Rank)));
static bool isCDNA2orLater(int index) {
return at::detail::getCUDAHooks().isGPUArch({"gfx90a", "gfx942"}, index);
return at::detail::getCUDAHooks().isGPUArch({"gfx90a", "gfx942", "gfx950"}, index);
}
#else

View File

@@ -39,6 +39,8 @@ from torch.testing._internal.common_utils import (
DeterministicGuard,
freeze_rng_state,
IS_FBCODE,
MI350_ARCH,
skipIfRocmArch,
TEST_WITH_ASAN,
TEST_WITH_ROCM,
xfailIfPy312Plus,
@@ -218,6 +220,7 @@ class CudaReproTests(TestCase):
# dont check rng state
self.assertEqual(out[:2], fn(query, key, value, input_tensor2)[:2])
@skipIfRocmArch(MI350_ARCH)
def test_effn_attn_bias_padding_misaligned(self):
seqlen_start = 1008

View File

@@ -31,6 +31,7 @@ from torch.testing._internal.common_utils import (
IS_LINUX,
IS_X86,
MI300_ARCH,
MI350_ARCH,
parametrize,
skipIfNoXPU,
skipIfRocm,
@@ -1187,7 +1188,7 @@ class TestPatternMatcher(TestPatternMatcherBase):
@skipIfNoDynamoSupport
@skipIfNoONEDNNBF16
@skipIfNoONEDNN
@skipIfRocmArch(MI300_ARCH)
@skipIfRocmArch(MI300_ARCH + MI350_ARCH)
def test_qconv2d_int8_mixed_bf16(self):
r"""
This testcase will quantize a single Conv2d module with int8_mixed_bf16 quantization.
@@ -1197,7 +1198,7 @@ class TestPatternMatcher(TestPatternMatcherBase):
@skipIfNoDynamoSupport
@skipIfNoONEDNNBF16
@skipIfNoONEDNN
@skipIfRocmArch(MI300_ARCH)
@skipIfRocmArch(MI300_ARCH + MI350_ARCH)
def test_qconv2d_int8_mixed_bf16_use_autocast(self):
r"""
This testcase will quantize a single Conv2d module with int8_mixed_bf16 quantization.

View File

@@ -13,6 +13,7 @@ from torch.testing._internal.common_cuda import (
PLATFORM_SUPPORTS_FP8,
PLATFORM_SUPPORTS_MEM_EFF_ATTENTION,
)
from torch.testing._internal.common_device_type import e4m3_type
from torch.testing._internal.common_utils import (
run_tests,
TEST_WITH_TORCHDYNAMO,
@@ -853,7 +854,7 @@ class TestFlopCounter(TestCase):
"FP8 is only supported on H100+, SM 8.9 and MI300+ devices",
)
def test_scaled_mm(self):
dtype = torch.float8_e4m3fnuz if torch.version.hip else torch.float8_e4m3fn
dtype = e4m3_type
with FlopCounterMode() as mode:
torch._scaled_mm(
torch.randn((3 * 16, 5 * 16), device="cuda").to(dtype),

View File

@@ -102,6 +102,7 @@ except ImportError:
SEED = 1234
MI350_ARCH = ("gfx950",)
MI300_ARCH = ("gfx942",)
MI200_ARCH = ("gfx90a")
NAVI_ARCH = ("gfx1030", "gfx1100", "gfx1101", "gfx1200", "gfx1201")