[AOTI][reland] Fix assert_function call in cpu autotune template (#135920)

Summary: Reland https://github.com/pytorch/pytorch/pull/135086. In ABI-compatible mode, assert_function should be AOTI_TORCH_CHECK rather than TORCH_CHECK.
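
For illustration only, a minimal paraphrase of the behavior change; it mirrors the CppKernel.assert_function hunk further down and assumes nothing beyond what that diff shows:

# Not the real CppKernel code, just a sketch of how the check macro is now chosen.
# Previously the branch keyed on V.graph.aot_mode; after this change it keys on
# config.abi_compatible, since the ABI-compatible C shim header provides AOTI_TORCH_CHECK.
def assert_function(abi_compatible: bool) -> str:
    return "AOTI_TORCH_CHECK" if abi_compatible else "TORCH_CHECK"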

Test Plan: CI

Differential Revision: D62500592

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135920
Approved by: https://github.com/chenyang78
Author: Bin Bao
Date: 2024-09-13 12:21:57 +00:00
Committed by: PyTorch MergeBot
Parent: 2f53d570fe
Commit: ea2ecab15b
7 changed files with 21 additions and 23 deletions

View File

@@ -3270,7 +3270,9 @@ class AOTInductorTestsTemplate:
             model, example_inputs_list, dynamic_shapes=dynamic_shapes
         )
 
-    @common_utils.parametrize("max_autotune", [False, True])
+    # max_autotune is disabled due to https://github.com/pytorch/pytorch/issues/135106
+    # @common_utils.parametrize("max_autotune", [False, True])
+    @common_utils.parametrize("max_autotune", [False])
     def test_misc_1(self, max_autotune):
         if self.device == "cpu" and IS_MACOS and max_autotune:
             raise unittest.SkipTest("max_autotune not supported on macos")

View File

@@ -87,13 +87,7 @@ if TEST_WITH_ROCM:
         }
     )
 if config.abi_compatible:
-    xfail_list = [
-        *[
-            func
-            for func in dir(test_cpu_select_algorithm.TestSelectAlgorithmCPU())
-            if func.startswith("test_linear_with_pointwise")
-        ],
-    ]
+    xfail_list = []
     for test_name in xfail_list:
         test_failures_cpp_wrapper[test_name] = test_torchinductor.TestFailure(
             ("cpp_wrapper",), is_skip=False
@@ -103,6 +97,11 @@ if config.abi_compatible:
     ] = test_torchinductor.TestFailure(("cpp_wrapper",), is_skip=False)
     skip_list = [
         "test_multihead_attention_cpu",
+        *[
+            func
+            for func in dir(test_cpu_select_algorithm.TestSelectAlgorithmCPU())
+            if func.startswith("test_linear_with_pointwise")
+        ],
     ]
     for test_name in skip_list:
         test_failures_cpp_wrapper[test_name] = test_torchinductor.TestFailure(

View File

@@ -2146,9 +2146,7 @@ class CppKernel(Kernel):
     @property
     def assert_function(self) -> str:
-        if V.graph.aot_mode:
-            # TODO: Using AOTI_TORCH_CHECK is causing performance drop for some models
-            # compared with JIT Inductor which uses TORCH_CHECK
+        if config.abi_compatible:
             return "AOTI_TORCH_CHECK"
         else:
             return "TORCH_CHECK"

View File

@ -332,8 +332,8 @@ class CppMicroGemmFP32Vec(CppMicroGemm):
TEMPLATE_ENTRY = r""" TEMPLATE_ENTRY = r"""
{{declare_kernel}} { {{declare_kernel}} {
TORCH_CHECK(N % {{block_n}} == 0, "N dimension must be multiple of {{block_n}}"); {{kernel.assert_function}}(N % {{block_n}} == 0, "N dimension must be multiple of {{block_n}}");
TORCH_CHECK(K % {{block_k}} == 0, "K dimension must be multiple of {{block_k}}"); {{kernel.assert_function}}(K % {{block_k}} == 0, "K dimension must be multiple of {{block_k}}");
// TODO(jgong5): loop unroll for M and N // TODO(jgong5): loop unroll for M and N
for (int64_t m = 0; m < M; m += {{block_m}}) { for (int64_t m = 0; m < M; m += {{block_m}}) {
int64_t block_m = std::min<int64_t>(M - m, {{block_m}}); int64_t block_m = std::min<int64_t>(M - m, {{block_m}});
@ -364,7 +364,7 @@ class CppMicroGemmFP32Vec(CppMicroGemm):
break; break;
{%- endfor %} {%- endfor %}
default: default:
{{kernel.assert_function}}(false, "Unsupported block_m: ", block_m); {{kernel.assert_function}}(false, "Unsupported block_m: {{block_m}}");
} }
} }
} }
@ -509,8 +509,8 @@ class CppMicroGemmAMX(CppMicroGemm):
TEMPLATE_ENTRY = r""" TEMPLATE_ENTRY = r"""
{{declare_kernel}} { {{declare_kernel}} {
TORCH_CHECK(N % {{block_n}} == 0, "N dimension must be multiple of {{block_n}}"); {{kernel.assert_function}}(N % {{block_n}} == 0, "N dimension must be multiple of {{block_n}}");
TORCH_CHECK(K % 2 == 0, "K dimension must be multiple of 2"); {{kernel.assert_function}}(K % 2 == 0, "K dimension must be multiple of 2");
// TODO(jgong5): loop unroll for M and N // TODO(jgong5): loop unroll for M and N
for (int64_t m = 0; m < M; m += {{block_m}}) { for (int64_t m = 0; m < M; m += {{block_m}}) {
int64_t block_m = std::min<int64_t>(M - m, {{block_m}}); int64_t block_m = std::min<int64_t>(M - m, {{block_m}});
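
For context, a throwaway sketch of how the {{kernel.assert_function}} placeholder in the templates above renders into C++ under each mode. This is not Inductor's actual template machinery; the FakeKernel class, the jinja2 dependency, and the block size of 16 are assumptions made up for the example:

# Illustrative only: a stand-in kernel object plus one Jinja template line, showing
# how the same template text renders to AOTI_TORCH_CHECK or TORCH_CHECK by mode.
from jinja2 import Template


class FakeKernel:
    """Hypothetical stand-in for the CppKernel object the real templates receive."""

    def __init__(self, abi_compatible: bool) -> None:
        self.abi_compatible = abi_compatible

    @property
    def assert_function(self) -> str:
        # Mirrors the assert_function property changed in this commit.
        return "AOTI_TORCH_CHECK" if self.abi_compatible else "TORCH_CHECK"


CHECK_LINE = Template(
    '{{kernel.assert_function}}(N % {{block_n}} == 0, '
    '"N dimension must be multiple of {{block_n}}");'
)

print(CHECK_LINE.render(kernel=FakeKernel(abi_compatible=True), block_n=16))
# -> AOTI_TORCH_CHECK(N % 16 == 0, "N dimension must be multiple of 16");
print(CHECK_LINE.render(kernel=FakeKernel(abi_compatible=False), block_n=16))
# -> TORCH_CHECK(N % 16 == 0, "N dimension must be multiple of 16");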

View File

@@ -26,6 +26,7 @@
 #include <c10/util/generic_math.h>
 #include <c10/util/Half.h>
 #include <c10/util/TypeCast.h>
+#include <torch/csrc/inductor/aoti_torch/c/shim.h>
 #if defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_ZVECTOR) || defined(CPU_CAPABILITY_NEON) || defined(CPU_CAPABILITY_VSX)
 #define INDUCTOR_USE_VECTOR_TYPES() 1

View File

@@ -111,11 +111,10 @@ class CppTemplate(KernelTemplate):
     def header(self) -> IndentedBuffer:
         res = IndentedBuffer()
         res.writeline(codecache.cpp_prefix())
-        res.splice(
-            """
-                #include "c10/util/Unroll.h"
-            """
-        )
+        # TODO: add c10::ForcedUnroll test to test_aoti_abi_check
+        res.splice("""#include <c10/util/Unroll.h>""")
+        if config.abi_compatible:
+            res.splice("""#include <torch/csrc/inductor/aoti_torch/c/shim.h>""")
         enable_kernel_profile = config.cpp.enable_kernel_profile and sys.platform in [
             "linux",
             "win32",

View File

@@ -854,7 +854,6 @@ class GraphLowering(torch.fx.Interpreter):
     def allocate_non_dup_const_name(
         self, name: Optional[str], data: Union[Tensor]
     ) -> str:
-        orig_name = name
         if not config.aot_inductor.use_runtime_constant_folding:
             for constant_name, value in self.constants.items():
                 if (
@@ -871,7 +870,7 @@ class GraphLowering(torch.fx.Interpreter):
         if name is None:
             name = f"constant{len(self.constants)}"
-        assert name is not None
+        orig_name = name
         if name[0].isdigit():
             name = f"constant_{name}"
         name = self.qualify_name(name)
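
As a side note on the last two hunks, a small self-contained sketch of the naming flow after orig_name is moved; it assumes nothing beyond what is visible in the diff (the real GraphLowering method also deduplicates against self.constants and qualifies the final name), and pick_constant_name is a made-up helper for illustration:

from typing import Optional, Tuple


def pick_constant_name(name: Optional[str], num_existing_constants: int) -> Tuple[str, str]:
    """Return (final_name, orig_name) following only the logic visible in the hunks above."""
    if name is None:
        name = f"constant{num_existing_constants}"
    # Capturing orig_name here, after the None default is filled in, guarantees it is
    # a real string, which makes the removed `assert name is not None` unnecessary.
    orig_name = name
    if name[0].isdigit():
        name = f"constant_{name}"
    return name, orig_name


assert pick_constant_name(None, 2) == ("constant2", "constant2")
assert pick_constant_name("0_weight", 0) == ("constant_0_weight", "0_weight")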