[AOTI][CPP] add flag TORCHINDUCTOR_CPP_FORCE_INLINE_KERNEL (#157949)

Summary: Add the flag TORCHINDUCTOR_CPP_FORCE_INLINE_KERNEL, which forces the generated kernel function to be inlined when it is set to 1 (TORCHINDUCTOR_CPP_FORCE_INLINE_KERNEL=1). It is disabled by default because forced inlining may increase build time. Differential Revision: D77915987 Pull Request resolved: https://github.com/pytorch/pytorch/pull/157949 Approved by: https://github.com/desertfire
2025-10-20 21:14:14 +08:00 · 2025-07-15 10:51:43 +00:00
parent 6200584193
commit 156a377f4c
2 changed files with 13 additions and 1 deletions
--- a/torch/_inductor/codegen/cpp.py
+++ b/torch/_inductor/codegen/cpp.py
@ -5282,8 +5282,11 @@ class KernelGroup:
        arg_defs, _, _ = self.args.cpp_argdefs()
        arg_defs = ",\n".ljust(25).join(arg_defs)
        func_export_decl = get_export_declaration()
+        inline_attr = (
+            "C10_ALWAYS_INLINE_ATTRIBUTE" if config.cpp.force_inline_kernel else ""
+        )
        code.writeline(
-            f'extern "C" {func_export_decl} void {kernel_decl_name}({arg_defs})'
+            f'extern "C" {func_export_decl} void {inline_attr} {kernel_decl_name}({arg_defs})'
        )

        # 3. Function body
--- a/torch/_inductor/config.py
+++ b/torch/_inductor/config.py
@ -1004,6 +1004,11 @@ autotune_lookup_table: dict[str, dict[str, Any]] = {}

 # config specific to codegen/cpp.py
 class cpp:
+    """
+    Settings for cpp backend.
+    This class provides a centralized location for managing cpp backend settings.
+    """
+
    # set to torch.get_num_threads()
    threads = -1

@ -1119,6 +1124,10 @@ class cpp:
    # Use a small dequant buffer for wgt of woq int4 size as: [q_group_size, Nr]
    use_small_dequant_buffer = False

+    force_inline_kernel = (
+        os.environ.get("TORCHINDUCTOR_CPP_FORCE_INLINE_KERNEL", "0") == "1"
+    )
+

 class triton:
    """