mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
[AOTI][CPP] add flag TORCHINDUCTOR_CPP_FORCE_INLINE_KERNEL (#157949)
Summary: Add the environment flag TORCHINDUCTOR_CPP_FORCE_INLINE_KERNEL. Setting TORCHINDUCTOR_CPP_FORCE_INLINE_KERNEL=1 force-inlines the kernel function. The flag is disabled by default because force-inlining may increase build time. Differential Revision: D77915987 Pull Request resolved: https://github.com/pytorch/pytorch/pull/157949 Approved by: https://github.com/desertfire
This commit is contained in:
committed by
PyTorch MergeBot
parent
6200584193
commit
156a377f4c
@ -5282,8 +5282,11 @@ class KernelGroup:
|
||||
arg_defs, _, _ = self.args.cpp_argdefs()
|
||||
arg_defs = ",\n".ljust(25).join(arg_defs)
|
||||
func_export_decl = get_export_declaration()
|
||||
inline_attr = (
|
||||
"C10_ALWAYS_INLINE_ATTRIBUTE" if config.cpp.force_inline_kernel else ""
|
||||
)
|
||||
code.writeline(
|
||||
f'extern "C" {func_export_decl} void {kernel_decl_name}({arg_defs})'
|
||||
f'extern "C" {func_export_decl} void {inline_attr} {kernel_decl_name}({arg_defs})'
|
||||
)
|
||||
|
||||
# 3. Function body
|
||||
|
@ -1004,6 +1004,11 @@ autotune_lookup_table: dict[str, dict[str, Any]] = {}
|
||||
|
||||
# config specific to codegen/cpp.py
|
||||
class cpp:
|
||||
"""
|
||||
Settings for cpp backend.
|
||||
This class provides a centralized location for managing cpp backend settings.
|
||||
"""
|
||||
|
||||
# set to torch.get_num_threads()
|
||||
threads = -1
|
||||
|
||||
@ -1119,6 +1124,10 @@ class cpp:
|
||||
# Use a small dequant buffer for wgt of woq int4 size as: [q_group_size, Nr]
|
||||
use_small_dequant_buffer = False
|
||||
|
||||
force_inline_kernel = (
|
||||
os.environ.get("TORCHINDUCTOR_CPP_FORCE_INLINE_KERNEL", "0") == "1"
|
||||
)
|
||||
|
||||
|
||||
class triton:
|
||||
"""
|
||||
|
Reference in New Issue
Block a user