Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 21:14:14 +08:00
cpp_wrapper: build non-performance-sensitive code at O1 (#148773)
Builds on #148212, applying the same improvements to `cpp_wrapper` mode.

Benchmark results:

* [A100 Benchmarks](https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Wed%2C%2014%20May%202025%2015%3A10%3A05%20GMT&stopTime=Wed%2C%2021%20May%202025%2015%3A10%3A05%20GMT&granularity=hour&mode=inference&dtype=bfloat16&deviceName=cuda%20(a100)&lBranch=gh/benjaminglass1/77/orig&lCommit=ca7d0a3f16e3c511534d2cd03d695be8524570d3&rBranch=main&rCommit=1075bb37d34e483763a09c7810790d5491441e13)
* [x86 Benchmarks](https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Wed%2C%2014%20May%202025%2015%3A10%3A05%20GMT&stopTime=Wed%2C%2021%20May%202025%2015%3A10%3A05%20GMT&granularity=hour&mode=inference&dtype=bfloat16&deviceName=cpu%20(x86)&lBranch=gh/benjaminglass1/77/orig&lCommit=ca7d0a3f16e3c511534d2cd03d695be8524570d3&rBranch=main&rCommit=1075bb37d34e483763a09c7810790d5491441e13)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/148773
Approved by: https://github.com/desertfire
commit 768cb734ec (parent 3c0cbf4b44), committed by PyTorch MergeBot
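The gist of the change: in `cpp_wrapper` mode, the generated C++ is split so that the non-performance-sensitive wrapper code is compiled at -O1 (cutting compile time), while kernel code is still compiled at -O3, and the two objects are linked into one library. A minimal sketch of that strategy outside of Inductor, assuming a GCC-style toolchain on PATH (the helper and file names are illustrative, not the PR's actual code):

```python
import os
import subprocess
import tempfile

def split_build(main_src: str, kernel_src: str, out: str = "wrapper.so") -> str:
    """Compile wrapper glue at -O1 and kernels at -O3, then link them."""
    with tempfile.TemporaryDirectory() as tmp:
        main_cpp = os.path.join(tmp, "main.cpp")
        kernel_cpp = os.path.join(tmp, "kernel.cpp")
        with open(main_cpp, "w") as f:
            f.write(main_src)
        with open(kernel_cpp, "w") as f:
            f.write(kernel_src)
        main_o = os.path.join(tmp, "main.o")
        kernel_o = os.path.join(tmp, "kernel.o")
        # Non-performance-sensitive wrapper code: optimize for compile time.
        subprocess.check_call(["g++", "-O1", "-fPIC", "-c", main_cpp, "-o", main_o])
        # Kernel code: optimize for runtime performance.
        subprocess.check_call(["g++", "-O3", "-fPIC", "-c", kernel_cpp, "-o", kernel_o])
        # Link both objects into a single shared library.
        subprocess.check_call(["g++", "-shared", main_o, kernel_o, "-o", out])
    return out
```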
benchmarks/dynamo/pr_time_benchmarks/expected_results.csv:

```diff
@@ -1,8 +1,8 @@
-add_loop_eager,compile_time_instruction_count,2987000000,0.015
+add_loop_eager,compile_time_instruction_count,2960000000,0.015
-add_loop_eager_dynamic,compile_time_instruction_count,5928000000,0.025
+add_loop_eager_dynamic,compile_time_instruction_count,5827000000,0.025
@@ -10,7 +10,7 @@ add_loop_inductor,compile_time_instruction_count,29370000000,0.015
-add_loop_inductor_dynamic_gpu,compile_time_instruction_count,44480000000,0.025
+add_loop_inductor_dynamic_gpu,compile_time_instruction_count,44080000000,0.025
@@ -34,15 +34,31 @@ basic_modules_ListOfLinears_inductor_gpu,compile_time_instruction_count,10370000
-update_hint_regression,compile_time_instruction_count,1715000000,0.02
+update_hint_regression,compile_time_instruction_count,1683000000,0.02
-float_args,compile_time_instruction_count,444500000,0.015
+float_args,compile_time_instruction_count,455100000,0.015
-sum_floordiv_regression,compile_time_instruction_count,1009000000,0.015
+sum_floordiv_regression,compile_time_instruction_count,1000000000,0.015
+basic_InlineMod_eager,compile_time_instruction_count,7101000000,0.015
+basic_NestedModule_eager,compile_time_instruction_count,8241000000,0.015
+mm_loop_inductor_gpu,compile_time_instruction_count,4407000000,0.015
+mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,7381000000,0.015
@@ -66,7 +82,7 @@ aotdispatcher_partitioner_cpu,compile_time_instruction_count,8630000000,0.015
-aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1900000000,0.015
+aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1890000000,0.015
```
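Each row above is benchmark name, metric, expected value, and what reads as a relative tolerance; under that reading (an assumption about the harness, not its actual code), the pass/fail check would look like:

```python
def within_tolerance(actual: int, expected: int, rel_tol: float) -> bool:
    # e.g. within_tolerance(2960000000, 2987000000, 0.015) -> True
    return abs(actual - expected) <= rel_tol * expected
```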
torch/_inductor/codecache.py:

```diff
@@ -138,6 +138,7 @@ if TYPE_CHECKING:
     from concurrent.futures import Future

     from .compile_fx import _CompileFxKwargs
+    from .cpp_builder import BuildOptionsBase
     from .graph import GraphLowering
     from .ir import ChoiceCaller
     from .output_code import CompiledFxGraphConstants, OutputCode
```
```diff
@@ -2225,7 +2226,7 @@ def _precompile_header(
    os.makedirs(_HEADER_LOCK_DIR, exist_ok=True)
    _worker_compile_cpp(
        os.path.join(_HEADER_LOCK_DIR, f"{header_hash}.lock"),
-       cpp_builder,
+       (cpp_builder,),
    )

    return header_full_path
```
```diff
@@ -2251,6 +2252,9 @@ def _get_cpp_wrapper_header(device: str, aot_mode: bool = False) -> str:

 @clear_on_fresh_inductor_cache
 class CppCodeCache:
+    """Compiles and caches C++ libraries. Users of this class supply the source code to
+    be compiled, while compilation flags are set by CppBuilder."""
+
     cache: dict[str, Callable[[], Union[CDLL, ModuleType]]] = {}
     cache_clear = staticmethod(cache.clear)
     cpp_compile_command_flags: dict[str, Any] = {}
```
```diff
@@ -2292,11 +2296,14 @@ class CppCodeCache:
     @classmethod
     def load_async(
         cls,
-        source_code: str,
+        main_code: str,
         device_type: str = "cpu",
         submit_fn: Any = None,
         extra_flags: Sequence[str] = (),
+        optimized_code: Optional[str] = None,
     ) -> Any:
+        """Compile and load a C++ library. Returns a callable that returns the loaded
+        library."""
         compile_command = {
             **cls.cpp_compile_command_flags,
             "device_type": device_type,
```
```diff
@@ -2307,48 +2314,112 @@ class CppCodeCache:

         _set_gpu_runtime_env()  # cpp_extension consults the env

-        cpp_build_option = CppTorchDeviceOptions(**compile_command)
-        command_gen = CppBuilder(name="o", sources="i", BuildOption=cpp_build_option)
-        # write function will calc source_code hash, the same source code with different
-        # ISA level should be generate different hash.
-        # So we need get a command_line which contains isa related parameter as a part of hash key.
-        # And then pass the command_line to below write function as extra parameter to
-        # guarantee the source code hash contains ISA difference.
-        vec_isa_cmd = repr(command_gen.get_command_line())
-        key, input_path = write(source_code, "cpp", extra=vec_isa_cmd)
+        # Note the distinction between the two booleans. We do minimal optimization if
+        # the optimized_code argument is present at all, since that's how the user of
+        # this function opts in, but we do compilation and linking in one step if the
+        # optimized_code argument is empty (as a micro-optimization).
+        main_build_option = CppTorchDeviceOptions(
+            compile_only=bool(optimized_code),
+            min_optimize=optimized_code is not None,
+            **compile_command,
+        )
+        optimized_build_option = CppTorchDeviceOptions(
+            compile_only=True, **compile_command
+        )
+
+        def get_hashable_command_line(build_option: BuildOptionsBase) -> str:
+            """Writing the code to file will calculate a hash, which we need to vary if
+            the command line flags change. This implements a mostly-generic way of
+            validating that."""
+            return CppBuilder(
+                name="o", sources="i", BuildOption=build_option
+            ).get_command_line()
+
+        main_cmd_line = get_hashable_command_line(main_build_option)
+        optimized_cmd_line = get_hashable_command_line(optimized_build_option)
+
+        key, main_path = write(
+            main_code, "main.cpp", extra=f"{optimized_code} {main_cmd_line}"
+        )
+
+        # Don't bother writing if the argument is empty.
+        if optimized_code:
+            _, optimized_path = write(
+                optimized_code, "optimized.cpp", extra=optimized_cmd_line
+            )
+        else:
+            # Unused, but makes type checkers happy.
+            optimized_path = os.devnull

         if key not in cls.cache:
             from torch.utils._filelock import FileLock

             lock_path = os.path.join(get_lock_dir(), key + ".lock")
-            output_name, output_dir = get_name_and_dir_from_output_file_path(input_path)
             future: Optional[Future[Any]] = None
             lib = None

             # if requested, pre-compile any headers
-            if (
-                config.cpp_cache_precompile_headers
-                and not _IS_WINDOWS
-                and (header_file := cls._get_uncompiled_header(device_type))
-            ):
-                cpp_build_option.precompiled_header = _precompile_header(
-                    header_file,
-                    vec_isa_cmd,
-                    **compile_command,
-                )
+            if config.cpp_cache_precompile_headers and not _IS_WINDOWS:
+                if header := cls._get_uncompiled_header(device_type):
+                    main_build_option.precompiled_header = _precompile_header(
+                        header,
+                        main_cmd_line,
+                        min_optimize=optimized_code is not None,
+                        **compile_command,
+                    )
+
+                # Currently, the optimized_code field is only used for cpp kernel code,
+                # so go ahead and precompile the relevant header here. Revisit this
+                # decision if that ever changes.
+                if optimized_code and (header := _get_cpp_prefix_header(device_type)):
+                    optimized_build_option.precompiled_header = _precompile_header(
+                        header,
+                        optimized_cmd_line,
+                        **compile_command,
+                    )
+
+            main_name, output_dir = get_name_and_dir_from_output_file_path(main_path)
+            main_builder = CppBuilder(
+                name=main_name,
+                sources=main_path,
+                BuildOption=main_build_option,
+                output_dir=output_dir,
+            )

-            cpp_builder = CppBuilder(
-                name=output_name,
-                sources=input_path,
-                output_dir=output_dir,
-                BuildOption=cpp_build_option,
-            )
-            worker_fn = functools.partial(
-                _worker_compile_cpp,
-                lock_path,
-                cpp_builder,
-            )
-            binary_path = normalize_path_separator(cpp_builder.get_target_file_path())
+            if optimized_code:
+                optimized_name, _ = get_name_and_dir_from_output_file_path(
+                    optimized_path
+                )
+                optimized_builder = CppBuilder(
+                    name=optimized_name,
+                    sources=optimized_path,
+                    BuildOption=optimized_build_option,
+                    output_dir=output_dir,
+                )
+
+                linker = CppBuilder(
+                    name=main_name,
+                    sources=[
+                        main_builder.get_target_file_path(),
+                        optimized_builder.get_target_file_path(),
+                    ],
+                    BuildOption=CppTorchDeviceOptions(**compile_command),
+                    output_dir=output_dir,
+                )
+
+                worker_fn = functools.partial(
+                    _worker_compile_cpp,
+                    lock_path,
+                    (main_builder, optimized_builder, linker),
+                )
+                binary_path = normalize_path_separator(linker.get_target_file_path())
+            else:
+                worker_fn = functools.partial(
+                    _worker_compile_cpp, lock_path, (main_builder,)
+                )
+                binary_path = normalize_path_separator(
+                    main_builder.get_target_file_path()
+                )

             def load_fn() -> Any:
                 nonlocal lib
```
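The `get_hashable_command_line` helper exists because the cache is keyed off a content hash of the written source: folding the exact compiler command line into the hashed text keeps builds of the same source under different flags (-O1 vs. -O3, differing ISA options) from colliding. A minimal sketch of that keying idea (a hypothetical helper, not the actual `write()` implementation):

```python
import hashlib

def cache_key(source_code: str, command_line: str) -> str:
    # Hash the source together with the exact compiler invocation, so
    # identical source built under different flags maps to distinct artifacts.
    payload = f"{command_line}\n{source_code}".encode()
    return hashlib.sha256(payload).hexdigest()[:16]
```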
```diff
@@ -2371,19 +2442,20 @@ class CppCodeCache:
         return cls.cache[key]

     @classmethod
-    def load(cls, source_code: str, device_type: str = "cpu") -> Any:
-        return cls.load_async(source_code, device_type)()
+    def load(cls, *args: Any, **kwargs: Any) -> Any:
+        return cls.load_async(*args, **kwargs)()


 def _worker_compile_cpp(
     lock_path: str,
-    cpp_builder: CppBuilder,
+    cpp_builders: Sequence[CppBuilder],
 ) -> None:
     from torch.utils._filelock import FileLock

     with FileLock(lock_path, timeout=LOCK_TIMEOUT):
-        if not os.path.exists(cpp_builder.get_target_file_path()):
-            cpp_builder.build()
+        for builder in cpp_builders:
+            if not os.path.exists(builder.get_target_file_path()):
+                builder.build()


 # Customized Python binding for cpp kernels
```
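Note that the worker now builds its sequence in order, which matters for the split build: the tuple is `(main_builder, optimized_builder, linker)`, and the linker consumes the first two builders' object files. A stripped-down sketch of the pattern (the `Builder` protocol is illustrative, not an Inductor type):

```python
import os
from typing import Protocol, Sequence

class Builder(Protocol):
    def get_target_file_path(self) -> str: ...
    def build(self) -> None: ...

def compile_in_order(builders: Sequence[Builder]) -> None:
    # Build each missing artifact in sequence; later builders (the linker)
    # consume the outputs of earlier ones, so order is significant.
    for builder in builders:
        if not os.path.exists(builder.get_target_file_path()):
            builder.build()
```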
```diff
@@ -2513,19 +2585,24 @@ class CppPythonBindingsCodeCache(CppCodeCache):
     @classmethod
     def load_pybinding_async(
         cls,
-        argtypes: list[str],
-        source_code: str,
+        argtypes: Sequence[str],
+        main_code: str,
         device_type: str = "cpu",
         num_outputs: int = -1,
         submit_fn: Any = None,
         extra_flags: Sequence[str] = (),
+        kernel_code: Optional[str] = None,
     ) -> Any:
         """
         Wrap a C++ function in fast Python bindings.

         Args:
             argtypes: The types of args to ENTRY_FUNCTION(), e.g. ["float*", "long"]
-            source_code: C++ source code containing a ENTRY_FUNCTION() function
+            main_code: C++ source code containing ENTRY_FUNCTION(). Will be built at
+                -O3 if kernel_code is None (to maximize performance in any kernels
+                that are present), or -O1 otherwise (to minimize compile time).
+            kernel_code: If present, C++ source code that will be built at -O3 and
+                linked to main_code.

         Returns:
             A python version of ENTRY_FUNCTION()
```
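The docstring's optimization-level rule, restated as a tiny decision helper (illustrative only, not code from the PR):

```python
from typing import Optional

def optimization_levels(kernel_code: Optional[str]) -> tuple:
    # Returns (main_code opt level, kernel_code opt level).
    if kernel_code is None:
        # Single build: main_code may itself contain kernels, so keep -O3.
        return ("-O3", None)
    # Split build: main_code is glue (-O1); kernel_code is hot (-O3).
    return ("-O1", "-O3")
```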
```diff
@@ -2541,10 +2618,11 @@ class CppPythonBindingsCodeCache(CppCodeCache):
             extra_parse_arg=cls.extra_parse_arg.format(array_len=num_outputs),
         )
         get_result = cls.load_async(
-            source_code + suffix,
+            main_code + suffix,
             device_type,
             submit_fn=submit_fn,
             extra_flags=extra_flags,
+            optimized_code=kernel_code,
         )
         result = None
```
torch/_inductor/codegen/cpp_wrapper_cpu.py:

```diff
@@ -1074,6 +1074,7 @@ class CppWrapperCpu(PythonWrapperCodegen):
         result.writeline("} // inductor_entry_impl")

     def generate_end(self, result):
+        """Generates the end of the code block, and any code needed to call it."""
         if V.graph.aot_mode:
             if V.graph.is_const_graph:
                 result.writeline("} // AOTInductorModel::_const_run_impl")
```
```diff
@@ -1081,19 +1082,29 @@ class CppWrapperCpu(PythonWrapperCodegen):
             result.writeline("} // namespace torch::aot_inductor\n\n\n")
             return

-        # Add any kernel definitions into the wrapped code. We currently only build
-        # them in separate files in AOT mode.
-        result.splice(self.kernel_declarations.getvalue())
-        self.kernel_declarations.clear()
+        # Close the wrapper code block, then write any kernel definitions.
+        result.splice("'''\n)")
+        if self.kernel_declarations:
+            result.splice("\nkernel_src = (\nr'''")
+            result.splice(self.kernel_declarations.getvalue())
+            result.splice("'''\n)")
+        else:
+            result.splice(
+                """
+                kernel_src = ''
+                """
+            )

         # cpp entry function for JIT with cpp wrapper
         result.splice(
             f"""
-            '''
-            )
-
             inductor_entry = CppWrapperCodeCache.load_pybinding(
-                ["std::vector<AtenTensorHandle>"], cpp_wrapper_src, "{self.device}", {len(V.graph.graph_outputs)})
+                argtypes=["std::vector<AtenTensorHandle>"],
+                main_code=cpp_wrapper_src,
+                device_type="{self.device}",
+                num_outputs={len(V.graph.graph_outputs)},
+                kernel_code=kernel_src,
+            )
             """
         )
```
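For readability, here is roughly what the emitted Python scaffold looks like after this change, with the C++ bodies elided (an abbreviated illustration of the generated output, not verbatim codegen; the device and output count are placeholders):

```python
cpp_wrapper_src = (
    r'''
    // ... wrapper C++, built at -O1 whenever kernel_src is non-empty ...
    '''
)

kernel_src = (
    r'''
    // ... kernel C++ definitions, built at -O3 ...
    '''
)

inductor_entry = CppWrapperCodeCache.load_pybinding(
    argtypes=["std::vector<AtenTensorHandle>"],
    main_code=cpp_wrapper_src,
    device_type="cpu",
    num_outputs=1,
    kernel_code=kernel_src,
)
```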
torch/_inductor/graph.py:

```diff
@@ -2284,8 +2284,8 @@ class GraphLowering(torch.fx.Interpreter):
         return self._compile_to_module()

     def _compile_to_module(self) -> CompiledModule:
-        # Currently, if we're here, we don't have to worry about the kernel code, which
-        # is only available in AOTInductor mode.
+        # If we're here, we don't have to worry about the kernel code, which is only
+        # returned separately in AOTInductor mode.
         wrapper_code, _ = (
             self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen()
         )
```