[AOTI][CPP] add flag TORCHINDUCTOR_CPP_FORCE_INLINE_KERNEL (#157949)

Summary: Add the flag TORCHINDUCTOR_CPP_FORCE_INLINE_KERNEL, which forces the kernel function to be inlined when TORCHINDUCTOR_CPP_FORCE_INLINE_KERNEL=1. It is disabled by default because force-inlining may increase build time.

Differential Revision: D77915987

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157949
Approved by: https://github.com/desertfire
This commit is contained in:
Xiangyang (Mark) Guo
2025-07-15 10:51:43 +00:00
committed by PyTorch MergeBot
parent 6200584193
commit 156a377f4c
2 changed files with 13 additions and 1 deletions

View File

@ -5282,8 +5282,11 @@ class KernelGroup:
arg_defs, _, _ = self.args.cpp_argdefs()
arg_defs = ",\n".ljust(25).join(arg_defs)
func_export_decl = get_export_declaration()
inline_attr = (
"C10_ALWAYS_INLINE_ATTRIBUTE" if config.cpp.force_inline_kernel else ""
)
code.writeline(
f'extern "C" {func_export_decl} void {kernel_decl_name}({arg_defs})'
f'extern "C" {func_export_decl} void {inline_attr} {kernel_decl_name}({arg_defs})'
)
# 3. Function body

View File

@ -1004,6 +1004,11 @@ autotune_lookup_table: dict[str, dict[str, Any]] = {}
# config specific to codegen/cpp.py
class cpp:
"""
Settings for cpp backend.
This class provides a centralized location for managing cpp backend settings.
"""
# set to torch.get_num_threads()
threads = -1
@ -1119,6 +1124,10 @@ class cpp:
# Use a small dequant buffer of size [q_group_size, Nr] for the weight (wgt) of WOQ int4
use_small_dequant_buffer = False
force_inline_kernel = (
os.environ.get("TORCHINDUCTOR_CPP_FORCE_INLINE_KERNEL", "0") == "1"
)
class triton:
"""