From ee6a1ecb0a1035f068484c8fcfba44b2efc9e837 Mon Sep 17 00:00:00 2001 From: Jithun Nair Date: Thu, 9 Oct 2025 18:03:08 +0000 Subject: [PATCH] [ROCm] Enable MI355 CI on PRs, and run full set of UTs on PRs (#160215) Useful to have PR testing for PRs such as https://github.com/pytorch/pytorch/pull/151360 Pull Request resolved: https://github.com/pytorch/pytorch/pull/160215 Approved by: https://github.com/malfet, https://github.com/atalman Co-authored-by: Jeff Daily --- .github/pytorch-probot.yml | 1 + .github/workflows/rocm-mi355.yml | 7 ++++++- aten/src/ATen/native/cuda/int4mm.cu | 4 ++-- test/inductor/test_cuda_repro.py | 3 +++ test/inductor/test_mkldnn_pattern_matcher.py | 5 +++-- test/test_flop_counter.py | 3 ++- torch/testing/_internal/common_utils.py | 1 + 7 files changed, 18 insertions(+), 6 deletions(-) diff --git a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index 086b016b93d8..b682a0990b60 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -30,6 +30,7 @@ ciflow_push_tags: - ciflow/riscv64 - ciflow/rocm - ciflow/rocm-mi300 +- ciflow/rocm-mi355 - ciflow/s390 - ciflow/slow - ciflow/torchbench diff --git a/.github/workflows/rocm-mi355.yml b/.github/workflows/rocm-mi355.yml index 5403a7300615..bd791e61f443 100644 --- a/.github/workflows/rocm-mi355.yml +++ b/.github/workflows/rocm-mi355.yml @@ -1,6 +1,9 @@ name: rocm-mi355 on: + push: + tags: + - ciflow/rocm-mi355/* workflow_dispatch: schedule: - cron: 30 11,1 * * * # about 4:30am PDT and 6:30pm PDT @@ -64,5 +67,7 @@ jobs: build-environment: linux-noble-rocm-py3.12-mi355 docker-image: ${{ needs.linux-noble-rocm-py3_12-build.outputs.docker-image }} test-matrix: ${{ needs.linux-noble-rocm-py3_12-build.outputs.test-matrix }} - tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor" + tests-to-include: >- + ${{ github.event_name == 'schedule' && 'test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor test_matmul_cuda test_scaled_matmul_cuda' + || '' }} secrets: inherit diff --git a/aten/src/ATen/native/cuda/int4mm.cu b/aten/src/ATen/native/cuda/int4mm.cu index 5444bb57eba7..e16a5d1f4d96 100644 --- a/aten/src/ATen/native/cuda/int4mm.cu +++ b/aten/src/ATen/native/cuda/int4mm.cu @@ -127,7 +127,7 @@ inline __host__ __device__ uint32_t getAlignmentRoundUp(const void* p) { return diff == 0 ? 
0 : uint32_t(Align) - diff; } -#if defined (__gfx90a__) || defined(__gfx942__) +#if defined (__gfx90a__) || defined(__gfx942__) || defined(__gfx950__) #define CDNA2_OR_LATER 1 #else #define CDNA2_OR_LATER 0 @@ -143,7 +143,7 @@ template using VecT = T __attribute__((ext_vector_type(Rank))); static bool isCDNA2orLater(int index) { - return at::detail::getCUDAHooks().isGPUArch({"gfx90a", "gfx942"}, index); + return at::detail::getCUDAHooks().isGPUArch({"gfx90a", "gfx942", "gfx950"}, index); } #else diff --git a/test/inductor/test_cuda_repro.py b/test/inductor/test_cuda_repro.py index 911d588df536..2119a998f6d7 100644 --- a/test/inductor/test_cuda_repro.py +++ b/test/inductor/test_cuda_repro.py @@ -39,6 +39,8 @@ from torch.testing._internal.common_utils import ( DeterministicGuard, freeze_rng_state, IS_FBCODE, + MI350_ARCH, + skipIfRocmArch, TEST_WITH_ASAN, TEST_WITH_ROCM, xfailIfPy312Plus, @@ -218,6 +220,7 @@ class CudaReproTests(TestCase): # dont check rng state self.assertEqual(out[:2], fn(query, key, value, input_tensor2)[:2]) + @skipIfRocmArch(MI350_ARCH) def test_effn_attn_bias_padding_misaligned(self): seqlen_start = 1008 diff --git a/test/inductor/test_mkldnn_pattern_matcher.py b/test/inductor/test_mkldnn_pattern_matcher.py index 55e880ffad07..16f88b3c9419 100644 --- a/test/inductor/test_mkldnn_pattern_matcher.py +++ b/test/inductor/test_mkldnn_pattern_matcher.py @@ -31,6 +31,7 @@ from torch.testing._internal.common_utils import ( IS_LINUX, IS_X86, MI300_ARCH, + MI350_ARCH, parametrize, skipIfNoXPU, skipIfRocm, @@ -1187,7 +1188,7 @@ class TestPatternMatcher(TestPatternMatcherBase): @skipIfNoDynamoSupport @skipIfNoONEDNNBF16 @skipIfNoONEDNN - @skipIfRocmArch(MI300_ARCH) + @skipIfRocmArch(MI300_ARCH + MI350_ARCH) def test_qconv2d_int8_mixed_bf16(self): r""" This testcase will quantize a single Conv2d module with int8_mixed_bf16 quantization. @@ -1197,7 +1198,7 @@ class TestPatternMatcher(TestPatternMatcherBase): @skipIfNoDynamoSupport @skipIfNoONEDNNBF16 @skipIfNoONEDNN - @skipIfRocmArch(MI300_ARCH) + @skipIfRocmArch(MI300_ARCH + MI350_ARCH) def test_qconv2d_int8_mixed_bf16_use_autocast(self): r""" This testcase will quantize a single Conv2d module with int8_mixed_bf16 quantization. diff --git a/test/test_flop_counter.py b/test/test_flop_counter.py index 17e699e04e58..03eb15744b54 100644 --- a/test/test_flop_counter.py +++ b/test/test_flop_counter.py @@ -13,6 +13,7 @@ from torch.testing._internal.common_cuda import ( PLATFORM_SUPPORTS_FP8, PLATFORM_SUPPORTS_MEM_EFF_ATTENTION, ) +from torch.testing._internal.common_device_type import e4m3_type from torch.testing._internal.common_utils import ( run_tests, TEST_WITH_TORCHDYNAMO, @@ -853,7 +854,7 @@ class TestFlopCounter(TestCase): "FP8 is only supported on H100+, SM 8.9 and MI300+ devices", ) def test_scaled_mm(self): - dtype = torch.float8_e4m3fnuz if torch.version.hip else torch.float8_e4m3fn + dtype = e4m3_type with FlopCounterMode() as mode: torch._scaled_mm( torch.randn((3 * 16, 5 * 16), device="cuda").to(dtype), diff --git a/torch/testing/_internal/common_utils.py b/torch/testing/_internal/common_utils.py index ce1d42144aed..d7803d5c7c6e 100644 --- a/torch/testing/_internal/common_utils.py +++ b/torch/testing/_internal/common_utils.py @@ -102,6 +102,7 @@ except ImportError: SEED = 1234 +MI350_ARCH = ("gfx950",) MI300_ARCH = ("gfx942",) MI200_ARCH = ("gfx90a") NAVI_ARCH = ("gfx1030", "gfx1100", "gfx1101", "gfx1200", "gfx1201")
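---

A note on the workflow change: adding ciflow/rocm-mi355 to ciflow_push_tags registers the tag with the PyTorch probot, which (as I understand the ciflow flow) pushes a ciflow/rocm-mi355/<PR number> tag when the corresponding label is applied to a PR; the new push/tags trigger in rocm-mi355.yml matches that tag, so the workflow now runs on PRs in addition to the schedule.

GitHub Actions expressions have no ternary operator, so tests-to-include uses the `cond && a || b` short-circuit idiom: scheduled runs keep the curated test list, while tag-triggered PR runs pass an empty string, leaving the test matrix unfiltered so the full set of UTs runs (per the subject line). The idiom evaluates like Python's `and`/`or`; a minimal sketch of the semantics (names are illustrative):

    # How `github.event_name == 'schedule' && '<tests>' || ''` evaluates.
    # GHA's `&&`/`||` short-circuit exactly like Python's `and`/`or`.
    SCHEDULED_TESTS = (
        "test_nn test_torch test_cuda test_ops test_unary_ufuncs "
        "test_binary_ufuncs test_autograd inductor/test_torchinductor "
        "test_matmul_cuda test_scaled_matmul_cuda"
    )

    def tests_to_include(event_name: str) -> str:
        # `cond and a or b` yields `a` when cond is true, else `b`.
        # Safe here because SCHEDULED_TESTS is a non-empty (truthy) string.
        return event_name == "schedule" and SCHEDULED_TESTS or ""

    assert tests_to_include("schedule") == SCHEDULED_TESTS
    assert tests_to_include("push") == ""  # empty => full test matrix on PRs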
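The test skips build on the new MI350_ARCH tuple; the trailing comma in MI350_ARCH = ("gfx950",) is what makes it a one-element tuple rather than a parenthesized string, and since the arch lists are plain tuples, MI300_ARCH + MI350_ARCH in test_mkldnn_pattern_matcher.py simply concatenates them: ("gfx942",) + ("gfx950",) == ("gfx942", "gfx950"). A simplified sketch of how a skipIfRocmArch-style decorator can work (an illustration, not the exact implementation in common_utils.py):

    import unittest

    import torch

    def skipIfRocmArch(archs: tuple):
        """Skip the test on ROCm when the GPU is one of the given archs.

        Sketch only: the real helper in common_utils.py also covers
        non-ROCm builds and other corner cases.
        """
        def decorator(fn):
            def wrapper(self, *args, **kwargs):
                if torch.version.hip:
                    # gcnArchName can look like "gfx942:sramecc+:xnack-",
                    # so compare only the arch prefix.
                    arch = torch.cuda.get_device_properties(0).gcnArchName
                    if arch.split(":")[0] in archs:
                        raise unittest.SkipTest(f"skipped on {archs}")
                return fn(self, *args, **kwargs)
            return wrapper
        return decorator

The C++ side mirrors this gating: int4mm.cu extends both the compile-time CDNA2_OR_LATER gate (via __gfx950__) and the runtime isCDNA2orLater check to gfx950, so the int4 kernels are compiled and selected on MI350-class parts.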
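The test_flop_counter.py change replaces the inline per-platform fp8 dtype selection with the shared e4m3_type helper from common_device_type, centralizing the choice. A rough sketch of what the removed inline expression did (the actual helper may additionally key off the GPU arch, since newer ROCm parts support the OCP e4m3fn format):

    import torch

    # HIP builds historically use the fnuz fp8 variant; CUDA uses e4m3fn.
    e4m3_type = (
        torch.float8_e4m3fnuz if torch.version.hip else torch.float8_e4m3fn
    )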