Revert "[cutlass backend][BE][ez] Make matmul layouts be row x column (#156656)"

This reverts commit 84c588e5eada9e7921608065edc444a15c22cb1c.

Reverted https://github.com/pytorch/pytorch/pull/156656 on behalf of https://github.com/henrylhtsang due to breaking fbcode A100 tests ([comment](https://github.com/pytorch/pytorch/pull/156656#issuecomment-3020769914))
Author: PyTorch MergeBot
Date:   2025-06-30 21:16:04 +00:00
Parent: 3684be056d
Commit: d3efd73234

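For context on what changes below: #156656 built the second GEMM operand as a transposed row-major tensor (a column-major layout), and this revert restores the directly allocated row-major version. A minimal sketch of the difference (requires a CUDA device; the sizes and the `b_col`/`b_row` names are illustrative, not taken from the test suite):

```python
import torch

# Illustrative sizes, matching one of the reverted tests.
M, N, K = 128, 128, 16

a = torch.randn(M, K).cuda().half()

# Layout from #156656 (removed by this revert): a (K, N) view of a
# row-major (N, K) tensor, i.e. a column-major second operand.
b_col = torch.randn(N, K).cuda().half().t()

# Layout restored by this revert: a contiguous, row-major (K, N) tensor.
b_row = torch.randn(K, N).cuda().half()

# Same shape, different strides: the CUTLASS backend sees a row x column
# GEMM in the first case and a row x row GEMM in the second.
assert b_col.shape == b_row.shape == (K, N)
assert b_col.stride() == (1, K)  # column-major
assert b_row.stride() == (N, 1)  # row-major
```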

@@ -261,7 +261,7 @@ class TestCutlassBackend(TestCase):
         M, N, K = 4096, 2048, 25728
         a = torch.randn(M, K).cuda().half()
-        b = torch.randn(N, K).cuda().half().t()
+        b = torch.randn(K, N).cuda().half()
         with config.patch(
             {
@@ -289,7 +289,7 @@ class TestCutlassBackend(TestCase):
         M, N, K = 4096, 2048, 25728
         a = torch.randn(M, K).cuda().half()
-        b = torch.randn(N, K).cuda().half().t()
+        b = torch.randn(K, N).cuda().half()
         x_shapes = [
             (M, N),
@@ -326,7 +326,7 @@ class TestCutlassBackend(TestCase):
         B, M, N, K = 10, 4096, 2048, 25728
         a = torch.randn(B, M, K).cuda().half()
-        b = torch.randn(B, N, K).cuda().half().permute(0, 2, 1)
+        b = torch.randn(B, K, N).cuda().half()
         with config.patch(
             {
@@ -358,8 +358,8 @@ class TestCutlassBackend(TestCase):
         model = MyModel()
         a = torch.randn(128, 16).cuda().half()
-        b = torch.randn(128, 16).cuda().half().t()
-        c = torch.randn(512, 16).cuda().half().t()
+        b = torch.randn(16, 128).cuda().half()
+        c = torch.randn(16, 512).cuda().half()
         with config.patch(
             {
@@ -400,8 +400,8 @@ class TestCutlassBackend(TestCase):
         model = MyModel()
         a = torch.randn(128, 16).cuda().half()
-        b = torch.randn(128, 16).cuda().half().t()
-        c = torch.randn(512, 16).cuda().half().t()
+        b = torch.randn(16, 128).cuda().half()
+        c = torch.randn(16, 512).cuda().half()
         with config.patch(
             {
@@ -465,7 +465,7 @@ class TestCutlassBackend(TestCase):
         model = MyModel().cuda()
         inputs = [
-            (torch.randn(M, K).cuda().to(dtype), torch.randn(N, K).cuda().to(dtype).t())
+            (torch.randn(M, K).cuda().to(dtype), torch.randn(K, N).cuda().to(dtype))
             for (M, N, K) in shapes
         ]
@@ -633,7 +633,7 @@ class TestCutlassBackend(TestCase):
             (
                 torch.randn(x_shape(M, N)).cuda().to(dtype),
                 torch.randn(M, K).cuda().to(dtype),
-                torch.randn(N, K).cuda().to(dtype).t(),
+                torch.randn(K, N).cuda().to(dtype),
             )
             for (M, N, K) in shapes
         ]
@@ -744,7 +744,7 @@ class TestCutlassBackend(TestCase):
             return a @ b

         a = torch.randn(128, 16).cuda().half()
-        b = torch.randn(128, 16).cuda().half().t()
+        b = torch.randn(16, 128).cuda().half()
         with config.patch(
             {
@@ -770,7 +770,7 @@ class TestCutlassBackend(TestCase):
             ),
         ):
             a = torch.randn(M, K).cuda().half()
-            b = torch.randn(N, K).cuda().half().t()
+            b = torch.randn(K, N).cuda().half()
             Y_compiled = torch.compile(mm, dynamic=dynamic)(a, b)
             Y = mm(a, b)
             # we need relaxed numerical limits due to the sheer size of the
@@ -935,7 +935,7 @@ class TestCutlassBackend(TestCase):
         }
         x = torch.randn(M, K).cuda().half()
-        w = torch.randn(N, K).cuda().half().t()
+        w = torch.randn(K, N).cuda().half()
         actual = AOTIRunnerUtil.run(
             model,
@@ -973,7 +973,7 @@ class TestCutlassBackend(TestCase):
         }
         x = torch.randn(M, K).cuda().half()
-        w = torch.randn(N, K).cuda().half().t()
+        w = torch.randn(K, N).cuda().half()
         actual = AOTIRunnerUtil.run(
             model,
@@ -1003,7 +1003,7 @@ class TestCutlassBackend(TestCase):
         M, N, K = 200, 5216, 10_432
         x = torch.randn(M, K).cuda().half()
-        w = torch.randn(N, K).cuda().half().t()
+        w = torch.randn(K, N).cuda().half()
         actual = AOTIRunnerUtil.run(
             model,
@@ -1032,7 +1032,7 @@ class TestCutlassBackend(TestCase):
         mask = torch.tensor([0, 0, 1, 1]).tile(m, k // 4).cuda().half()
         a = torch.rand(m, k).cuda().half() * mask
         a_sparse = to_sparse_semi_structured(a)
-        b = torch.rand(n, k).cuda().half().t()
+        b = torch.rand(k, n).cuda().half()
         with config.patch(
             {
@@ -1335,7 +1335,7 @@ class TestCutlassBackend(TestCase):
         M, N, K = (128, 128, 16)
         A = torch.randn(M, K).cuda().half()
-        B = torch.randn(N, K).cuda().half().t()
+        B = torch.randn(K, N).cuda().half()

         def select_no_algorithm(*args, **kwargs):
             raise NoValidChoicesError
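The batched hunk above (@@ -326) makes the same change per batch: `permute(0, 2, 1)` on a row-major (B, N, K) tensor yields column-major matrices within each batch, while the restored code allocates a contiguous (B, K, N) tensor. A small sketch under the same caveats (CUDA device; illustrative sizes and names, the reverted test itself uses B, M, N, K = 10, 4096, 2048, 25728):

```python
import torch

B, K, N = 2, 4, 8  # small illustrative sizes

# Batched column layout from #156656 (removed): a (B, K, N) view of a
# row-major (B, N, K) tensor.
b_col = torch.randn(B, N, K).cuda().half().permute(0, 2, 1)

# Batched row layout restored by this revert: contiguous (B, K, N).
b_row = torch.randn(B, K, N).cuda().half()

assert b_col.shape == b_row.shape == (B, K, N)
assert b_col.stride()[1:] == (1, K)  # per-batch column-major
assert b_row.stride()[1:] == (N, 1)  # per-batch row-major
```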