Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 21:14:14 +08:00
cpp_wrapper: build non-performance-sensitive code at O1 (#148773)
Builds on #148212, applying the same improvements to `cpp_wrapper` mode.

Benchmark results:

* [A100 Benchmarks](https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Wed%2C%2014%20May%202025%2015%3A10%3A05%20GMT&stopTime=Wed%2C%2021%20May%202025%2015%3A10%3A05%20GMT&granularity=hour&mode=inference&dtype=bfloat16&deviceName=cuda%20(a100)&lBranch=gh/benjaminglass1/77/orig&lCommit=ca7d0a3f16e3c511534d2cd03d695be8524570d3&rBranch=main&rCommit=1075bb37d34e483763a09c7810790d5491441e13)
* [x86 Benchmarks](https://hud.pytorch.org/benchmark/compilers?dashboard=torchinductor&startTime=Wed%2C%2014%20May%202025%2015%3A10%3A05%20GMT&stopTime=Wed%2C%2021%20May%202025%2015%3A10%3A05%20GMT&granularity=hour&mode=inference&dtype=bfloat16&deviceName=cpu%20(x86)&lBranch=gh/benjaminglass1/77/orig&lCommit=ca7d0a3f16e3c511534d2cd03d695be8524570d3&rBranch=main&rCommit=1075bb37d34e483763a09c7810790d5491441e13)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/148773
Approved by: https://github.com/desertfire
commit 768cb734ec (parent 3c0cbf4b44), committed by PyTorch MergeBot
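The gist of the change: in `cpp_wrapper` mode, the generated C++ is split so that the non-performance-sensitive wrapper code is compiled at -O1 (cutting compile time), while kernel code is still compiled at -O3, and the two objects are linked into one library. A minimal sketch of that strategy outside of Inductor, assuming a GCC-style toolchain on PATH (the helper and file names are illustrative, not the PR's actual code):

```python
import os
import subprocess
import tempfile

def split_build(main_src: str, kernel_src: str, out: str = "wrapper.so") -> str:
    """Compile wrapper glue at -O1 and kernels at -O3, then link them."""
    with tempfile.TemporaryDirectory() as tmp:
        main_cpp = os.path.join(tmp, "main.cpp")
        kernel_cpp = os.path.join(tmp, "kernel.cpp")
        with open(main_cpp, "w") as f:
            f.write(main_src)
        with open(kernel_cpp, "w") as f:
            f.write(kernel_src)
        main_o = os.path.join(tmp, "main.o")
        kernel_o = os.path.join(tmp, "kernel.o")
        # Non-performance-sensitive wrapper code: optimize for compile time.
        subprocess.check_call(["g++", "-O1", "-fPIC", "-c", main_cpp, "-o", main_o])
        # Kernel code: optimize for runtime performance.
        subprocess.check_call(["g++", "-O3", "-fPIC", "-c", kernel_cpp, "-o", kernel_o])
        # Link both objects into a single shared library.
        subprocess.check_call(["g++", "-shared", main_o, kernel_o, "-o", out])
    return out
```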
benchmarks/dynamo/pr_time_benchmarks/expected_results.csv:

```diff
@@ -1,8 +1,8 @@
-add_loop_eager,compile_time_instruction_count,2987000000,0.015
+add_loop_eager,compile_time_instruction_count,2960000000,0.015
-add_loop_eager_dynamic,compile_time_instruction_count,5928000000,0.025
+add_loop_eager_dynamic,compile_time_instruction_count,5827000000,0.025
@@ -10,7 +10,7 @@ add_loop_inductor,compile_time_instruction_count,29370000000,0.015
-add_loop_inductor_dynamic_gpu,compile_time_instruction_count,44480000000,0.025
+add_loop_inductor_dynamic_gpu,compile_time_instruction_count,44080000000,0.025
@@ -34,15 +34,31 @@ basic_modules_ListOfLinears_inductor_gpu,compile_time_instruction_count,10370000
-update_hint_regression,compile_time_instruction_count,1715000000,0.02
+update_hint_regression,compile_time_instruction_count,1683000000,0.02
-float_args,compile_time_instruction_count,444500000,0.015
+float_args,compile_time_instruction_count,455100000,0.015
-sum_floordiv_regression,compile_time_instruction_count,1009000000,0.015
+sum_floordiv_regression,compile_time_instruction_count,1000000000,0.015
+basic_InlineMod_eager,compile_time_instruction_count,7101000000,0.015
+basic_NestedModule_eager,compile_time_instruction_count,8241000000,0.015
+mm_loop_inductor_gpu,compile_time_instruction_count,4407000000,0.015
+mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,7381000000,0.015
@@ -66,7 +82,7 @@ aotdispatcher_partitioner_cpu,compile_time_instruction_count,8630000000,0.015
-aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1900000000,0.015
+aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1890000000,0.015
```
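Each row above is benchmark name, metric, expected value, and what reads as a relative tolerance; under that reading (an assumption about the harness, not its actual code), the pass/fail check would look like:

```python
def within_tolerance(actual: int, expected: int, rel_tol: float) -> bool:
    # e.g. within_tolerance(2960000000, 2987000000, 0.015) -> True
    return abs(actual - expected) <= rel_tol * expected
```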
torch/_inductor/codecache.py:

```diff
@@ -138,6 +138,7 @@ if TYPE_CHECKING:
     from concurrent.futures import Future

     from .compile_fx import _CompileFxKwargs
+    from .cpp_builder import BuildOptionsBase
     from .graph import GraphLowering
     from .ir import ChoiceCaller
     from .output_code import CompiledFxGraphConstants, OutputCode
```
```diff
@@ -2225,7 +2226,7 @@ def _precompile_header(
    os.makedirs(_HEADER_LOCK_DIR, exist_ok=True)
    _worker_compile_cpp(
        os.path.join(_HEADER_LOCK_DIR, f"{header_hash}.lock"),
-       cpp_builder,
+       (cpp_builder,),
    )

    return header_full_path
```
```diff
@@ -2251,6 +2252,9 @@ def _get_cpp_wrapper_header(device: str, aot_mode: bool = False) -> str:

 @clear_on_fresh_inductor_cache
 class CppCodeCache:
+    """Compiles and caches C++ libraries. Users of this class supply the source code to
+    be compiled, while compilation flags are set by CppBuilder."""
+
     cache: dict[str, Callable[[], Union[CDLL, ModuleType]]] = {}
     cache_clear = staticmethod(cache.clear)
     cpp_compile_command_flags: dict[str, Any] = {}
```
```diff
@@ -2292,11 +2296,14 @@ class CppCodeCache:
     @classmethod
     def load_async(
         cls,
-        source_code: str,
+        main_code: str,
         device_type: str = "cpu",
         submit_fn: Any = None,
         extra_flags: Sequence[str] = (),
+        optimized_code: Optional[str] = None,
     ) -> Any:
+        """Compile and load a C++ library. Returns a callable that returns the loaded
+        library."""
         compile_command = {
             **cls.cpp_compile_command_flags,
             "device_type": device_type,
```
```diff
@@ -2307,48 +2314,112 @@ class CppCodeCache:

         _set_gpu_runtime_env()  # cpp_extension consults the env

-        cpp_build_option = CppTorchDeviceOptions(**compile_command)
-        command_gen = CppBuilder(name="o", sources="i", BuildOption=cpp_build_option)
-        # write function will calc source_code hash, the same source code with different
-        # ISA level should be generate different hash.
-        # So we need get a command_line which contains isa related parameter as a part of hash key.
-        # And then pass the command_line to below write function as extra parameter to
-        # guarantee the source code hash contains ISA difference.
-        vec_isa_cmd = repr(command_gen.get_command_line())
-        key, input_path = write(source_code, "cpp", extra=vec_isa_cmd)
+        # Note the distinction between the two booleans. We do minimal optimization if
+        # the optimized_code argument is present at all, since that's how the user of
+        # this function opts in, but we do compilation and linking in one step if the
+        # optimized_code argument is empty (as a micro-optimization).
+        main_build_option = CppTorchDeviceOptions(
+            compile_only=bool(optimized_code),
+            min_optimize=optimized_code is not None,
+            **compile_command,
+        )
+        optimized_build_option = CppTorchDeviceOptions(
+            compile_only=True, **compile_command
+        )
+
+        def get_hashable_command_line(build_option: BuildOptionsBase) -> str:
+            """Writing the code to file will calculate a hash, which we need to vary if
+            the command line flags change. This implements a mostly-generic way of
+            validating that."""
+            return CppBuilder(
+                name="o", sources="i", BuildOption=build_option
+            ).get_command_line()
+
+        main_cmd_line = get_hashable_command_line(main_build_option)
+        optimized_cmd_line = get_hashable_command_line(optimized_build_option)
+
+        key, main_path = write(
+            main_code, "main.cpp", extra=f"{optimized_code} {main_cmd_line}"
+        )
+
+        # Don't bother writing if the argument is empty.
+        if optimized_code:
+            _, optimized_path = write(
+                optimized_code, "optimized.cpp", extra=optimized_cmd_line
+            )
+        else:
+            # Unused, but makes type checkers happy.
+            optimized_path = os.devnull

         if key not in cls.cache:
             from torch.utils._filelock import FileLock

             lock_path = os.path.join(get_lock_dir(), key + ".lock")
-            output_name, output_dir = get_name_and_dir_from_output_file_path(input_path)
             future: Optional[Future[Any]] = None
             lib = None

             # if requested, pre-compile any headers
-            if (
-                config.cpp_cache_precompile_headers
-                and not _IS_WINDOWS
-                and (header_file := cls._get_uncompiled_header(device_type))
-            ):
-                cpp_build_option.precompiled_header = _precompile_header(
-                    header_file,
-                    vec_isa_cmd,
-                    **compile_command,
-                )
+            if config.cpp_cache_precompile_headers and not _IS_WINDOWS:
+                if header := cls._get_uncompiled_header(device_type):
+                    main_build_option.precompiled_header = _precompile_header(
+                        header,
+                        main_cmd_line,
+                        min_optimize=optimized_code is not None,
+                        **compile_command,
+                    )
+
+                # Currently, the optimized_code field is only used for cpp kernel code,
+                # so go ahead and precompile the relevant header here. Revisit this
+                # decision if that ever changes.
+                if optimized_code and (header := _get_cpp_prefix_header(device_type)):
+                    optimized_build_option.precompiled_header = _precompile_header(
+                        header,
+                        optimized_cmd_line,
+                        **compile_command,
+                    )
+
+            main_name, output_dir = get_name_and_dir_from_output_file_path(main_path)
+            main_builder = CppBuilder(
+                name=main_name,
+                sources=main_path,
+                BuildOption=main_build_option,
+                output_dir=output_dir,
+            )

-            cpp_builder = CppBuilder(
-                name=output_name,
-                sources=input_path,
-                output_dir=output_dir,
-                BuildOption=cpp_build_option,
-            )
-            worker_fn = functools.partial(
-                _worker_compile_cpp,
-                lock_path,
-                cpp_builder,
-            )
-            binary_path = normalize_path_separator(cpp_builder.get_target_file_path())
+            if optimized_code:
+                optimized_name, _ = get_name_and_dir_from_output_file_path(
+                    optimized_path
+                )
+                optimized_builder = CppBuilder(
+                    name=optimized_name,
+                    sources=optimized_path,
+                    BuildOption=optimized_build_option,
+                    output_dir=output_dir,
+                )
+
+                linker = CppBuilder(
+                    name=main_name,
+                    sources=[
+                        main_builder.get_target_file_path(),
+                        optimized_builder.get_target_file_path(),
+                    ],
+                    BuildOption=CppTorchDeviceOptions(**compile_command),
+                    output_dir=output_dir,
+                )
+
+                worker_fn = functools.partial(
+                    _worker_compile_cpp,
+                    lock_path,
+                    (main_builder, optimized_builder, linker),
+                )
+                binary_path = normalize_path_separator(linker.get_target_file_path())
+            else:
+                worker_fn = functools.partial(
+                    _worker_compile_cpp, lock_path, (main_builder,)
+                )
+                binary_path = normalize_path_separator(
+                    main_builder.get_target_file_path()
+                )

             def load_fn() -> Any:
                 nonlocal lib
```
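The `get_hashable_command_line` helper exists because the cache is keyed off a content hash of the written source: folding the exact compiler command line into the hashed text keeps builds of the same source under different flags (-O1 vs. -O3, differing ISA options) from colliding. A minimal sketch of that keying idea (a hypothetical helper, not the actual `write()` implementation):

```python
import hashlib

def cache_key(source_code: str, command_line: str) -> str:
    # Hash the source together with the exact compiler invocation, so
    # identical source built under different flags maps to distinct artifacts.
    payload = f"{command_line}\n{source_code}".encode()
    return hashlib.sha256(payload).hexdigest()[:16]
```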
```diff
@@ -2371,19 +2442,20 @@ class CppCodeCache:
         return cls.cache[key]

     @classmethod
-    def load(cls, source_code: str, device_type: str = "cpu") -> Any:
-        return cls.load_async(source_code, device_type)()
+    def load(cls, *args: Any, **kwargs: Any) -> Any:
+        return cls.load_async(*args, **kwargs)()


 def _worker_compile_cpp(
     lock_path: str,
-    cpp_builder: CppBuilder,
+    cpp_builders: Sequence[CppBuilder],
 ) -> None:
     from torch.utils._filelock import FileLock

     with FileLock(lock_path, timeout=LOCK_TIMEOUT):
-        if not os.path.exists(cpp_builder.get_target_file_path()):
-            cpp_builder.build()
+        for builder in cpp_builders:
+            if not os.path.exists(builder.get_target_file_path()):
+                builder.build()


 # Customized Python binding for cpp kernels
```
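Note that the worker now builds its sequence in order, which matters for the split build: the tuple is `(main_builder, optimized_builder, linker)`, and the linker consumes the first two builders' object files. A stripped-down sketch of the pattern (the `Builder` protocol is illustrative, not an Inductor type):

```python
import os
from typing import Protocol, Sequence

class Builder(Protocol):
    def get_target_file_path(self) -> str: ...
    def build(self) -> None: ...

def compile_in_order(builders: Sequence[Builder]) -> None:
    # Build each missing artifact in sequence; later builders (the linker)
    # consume the outputs of earlier ones, so order is significant.
    for builder in builders:
        if not os.path.exists(builder.get_target_file_path()):
            builder.build()
```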
```diff
@@ -2513,19 +2585,24 @@ class CppPythonBindingsCodeCache(CppCodeCache):
     @classmethod
     def load_pybinding_async(
         cls,
-        argtypes: list[str],
-        source_code: str,
+        argtypes: Sequence[str],
+        main_code: str,
         device_type: str = "cpu",
         num_outputs: int = -1,
         submit_fn: Any = None,
         extra_flags: Sequence[str] = (),
+        kernel_code: Optional[str] = None,
     ) -> Any:
         """
         Wrap a C++ function in fast Python bindings.

         Args:
             argtypes: The types of args to ENTRY_FUNCTION(), e.g. ["float*", "long"]
-            source_code: C++ source code containing a ENTRY_FUNCTION() function
+            main_code: C++ source code containing ENTRY_FUNCTION(). Will be built at
+                -O3 if kernel_code is None (to maximize performance in any kernels
+                that are present), or -O1 otherwise (to minimize compile time).
+            kernel_code: If present, C++ source code that will be built at -O3 and
+                linked to main_code.

         Returns:
             A python version of ENTRY_FUNCTION()
```
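The docstring's optimization-level rule, restated as a tiny decision helper (illustrative only, not code from the PR):

```python
from typing import Optional

def optimization_levels(kernel_code: Optional[str]) -> tuple:
    # Returns (main_code opt level, kernel_code opt level).
    if kernel_code is None:
        # Single build: main_code may itself contain kernels, so keep -O3.
        return ("-O3", None)
    # Split build: main_code is glue (-O1); kernel_code is hot (-O3).
    return ("-O1", "-O3")
```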
```diff
@@ -2541,10 +2618,11 @@ class CppPythonBindingsCodeCache(CppCodeCache):
             extra_parse_arg=cls.extra_parse_arg.format(array_len=num_outputs),
         )
         get_result = cls.load_async(
-            source_code + suffix,
+            main_code + suffix,
             device_type,
             submit_fn=submit_fn,
             extra_flags=extra_flags,
+            optimized_code=kernel_code,
         )
         result = None
```
torch/_inductor/codegen/cpp_wrapper_cpu.py:

```diff
@@ -1074,6 +1074,7 @@ class CppWrapperCpu(PythonWrapperCodegen):
         result.writeline("} // inductor_entry_impl")

     def generate_end(self, result):
+        """Generates the end of the code block, and any code needed to call it."""
         if V.graph.aot_mode:
             if V.graph.is_const_graph:
                 result.writeline("} // AOTInductorModel::_const_run_impl")
```
```diff
@@ -1081,19 +1082,29 @@ class CppWrapperCpu(PythonWrapperCodegen):
             result.writeline("} // namespace torch::aot_inductor\n\n\n")
             return

-        # Add any kernel definitions into the wrapped code. We currently only build
-        # them in separate files in AOT mode.
-        result.splice(self.kernel_declarations.getvalue())
-        self.kernel_declarations.clear()
+        # Close the wrapper code block, then write any kernel definitions.
+        result.splice("'''\n)")
+        if self.kernel_declarations:
+            result.splice("\nkernel_src = (\nr'''")
+            result.splice(self.kernel_declarations.getvalue())
+            result.splice("'''\n)")
+        else:
+            result.splice(
+                """
+                kernel_src = ''
+                """
+            )

         # cpp entry function for JIT with cpp wrapper
         result.splice(
             f"""
-            '''
-            )
-
             inductor_entry = CppWrapperCodeCache.load_pybinding(
-                ["std::vector<AtenTensorHandle>"], cpp_wrapper_src, "{self.device}", {len(V.graph.graph_outputs)})
+                argtypes=["std::vector<AtenTensorHandle>"],
+                main_code=cpp_wrapper_src,
+                device_type="{self.device}",
+                num_outputs={len(V.graph.graph_outputs)},
+                kernel_code=kernel_src,
+            )
             """
         )
```
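For readability, here is roughly what the emitted Python scaffold looks like after this change, with the C++ bodies elided (an abbreviated illustration of the generated output, not verbatim codegen; the device and output count are placeholders):

```python
cpp_wrapper_src = (
    r'''
    // ... wrapper C++, built at -O1 whenever kernel_src is non-empty ...
    '''
)

kernel_src = (
    r'''
    // ... kernel C++ definitions, built at -O3 ...
    '''
)

inductor_entry = CppWrapperCodeCache.load_pybinding(
    argtypes=["std::vector<AtenTensorHandle>"],
    main_code=cpp_wrapper_src,
    device_type="cpu",
    num_outputs=1,
    kernel_code=kernel_src,
)
```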
torch/_inductor/graph.py:

```diff
@@ -2284,8 +2284,8 @@ class GraphLowering(torch.fx.Interpreter):
         return self._compile_to_module()

     def _compile_to_module(self) -> CompiledModule:
-        # Currently, if we're here, we don't have to worry about the kernel code, which
-        # is only available in AOTInductor mode.
+        # If we're here, we don't have to worry about the kernel code, which is only
+        # returned separately in AOTInductor mode.
         wrapper_code, _ = (
             self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen()
         )
```