Benjamin Glass
2025-05-22 21:10:00 +00:00
committed by PyTorch MergeBot
parent 3c0cbf4b44
commit 768cb734ec
4 changed files with 164 additions and 59 deletions

View File

@@ -1,8 +1,8 @@
-add_loop_eager,compile_time_instruction_count,2987000000,0.015
+add_loop_eager,compile_time_instruction_count,2960000000,0.015
-add_loop_eager_dynamic,compile_time_instruction_count,5928000000,0.025
+add_loop_eager_dynamic,compile_time_instruction_count,5827000000,0.025
@@ -10,7 +10,7 @@ add_loop_inductor,compile_time_instruction_count,29370000000,0.015
-add_loop_inductor_dynamic_gpu,compile_time_instruction_count,44480000000,0.025
+add_loop_inductor_dynamic_gpu,compile_time_instruction_count,44080000000,0.025
@@ -34,15 +34,31 @@ basic_modules_ListOfLinears_inductor_gpu,compile_time_instruction_count,10370000
-update_hint_regression,compile_time_instruction_count,1715000000,0.02
+basic_InlineMod_eager,compile_time_instruction_count,7101000000,0.015
-float_args,compile_time_instruction_count,444500000,0.015
+update_hint_regression,compile_time_instruction_count,1683000000,0.02
-sum_floordiv_regression,compile_time_instruction_count,1009000000,0.015
+float_args,compile_time_instruction_count,455100000,0.015
+mm_loop_inductor_gpu,compile_time_instruction_count,4407000000,0.015
+mm_loop_inductor_dynamic_gpu,compile_time_instruction_count,7381000000,0.015
+basic_NestedModule_eager,compile_time_instruction_count,8241000000,0.015
+sum_floordiv_regression,compile_time_instruction_count,1000000000,0.015
@@ -66,7 +82,7 @@ aotdispatcher_partitioner_cpu,compile_time_instruction_count,8630000000,0.015
-aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1900000000,0.015
+aotdispatcher_partitioner_cpu2,compile_time_instruction_count,1890000000,0.015
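
Each row of this expected-results file reads: benchmark name, metric, expected value, relative noise threshold (e.g. 0.015 allows ±1.5% drift before a run is flagged). As a rough illustration of how such a file can be consumed — a hypothetical helper, not PyTorch's actual benchmark harness:

import csv

def check_expected_results(path: str, measured: dict[str, int]) -> list[str]:
    """Flag benchmarks whose measured count drifts outside the noise band
    (hypothetical helper; row format: name, metric, expected, tolerance)."""
    failures = []
    with open(path) as f:
        for row in csv.reader(f):
            if len(row) != 4:
                continue  # skip blank or malformed lines
            name, metric, expected_s, tolerance_s = row
            expected, tolerance = int(expected_s), float(tolerance_s)
            actual = measured.get(name)
            if actual is not None and abs(actual - expected) > expected * tolerance:
                failures.append(f"{name} ({metric}): {actual} vs expected {expected}")
    return failures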


View File

@@ -138,6 +138,7 @@ if TYPE_CHECKING:
    from concurrent.futures import Future
    from .compile_fx import _CompileFxKwargs
+    from .cpp_builder import BuildOptionsBase
    from .graph import GraphLowering
    from .ir import ChoiceCaller
    from .output_code import CompiledFxGraphConstants, OutputCode
@@ -2225,7 +2226,7 @@ def _precompile_header(
    os.makedirs(_HEADER_LOCK_DIR, exist_ok=True)
    _worker_compile_cpp(
        os.path.join(_HEADER_LOCK_DIR, f"{header_hash}.lock"),
-        cpp_builder,
+        (cpp_builder,),
    )
    return header_full_path
@@ -2251,6 +2252,9 @@ def _get_cpp_wrapper_header(device: str, aot_mode: bool = False) -> str:
@clear_on_fresh_inductor_cache
class CppCodeCache:
+    """Compiles and caches C++ libraries. Users of this class supply the source code to
+    be compiled, while compilation flags are set by CppBuilder."""
+
    cache: dict[str, Callable[[], Union[CDLL, ModuleType]]] = {}
    cache_clear = staticmethod(cache.clear)
    cpp_compile_command_flags: dict[str, Any] = {}
@@ -2292,11 +2296,14 @@
    @classmethod
    def load_async(
        cls,
-        source_code: str,
+        main_code: str,
        device_type: str = "cpu",
        submit_fn: Any = None,
        extra_flags: Sequence[str] = (),
+        optimized_code: Optional[str] = None,
    ) -> Any:
+        """Compile and load a C++ library. Returns a callable that returns the loaded
+        library."""
        compile_command = {
            **cls.cpp_compile_command_flags,
            "device_type": device_type,
@@ -2307,48 +2314,112 @@
        _set_gpu_runtime_env()  # cpp_extension consults the env
-        cpp_build_option = CppTorchDeviceOptions(**compile_command)
-        command_gen = CppBuilder(name="o", sources="i", BuildOption=cpp_build_option)
-        # The write function calculates a hash of source_code, and the same source code
-        # built at different ISA levels should generate different hashes. We therefore
-        # build a command line that includes the ISA-related parameters and pass it to
-        # the write function below as an extra parameter, guaranteeing that the source
-        # code hash reflects ISA differences.
-        vec_isa_cmd = repr(command_gen.get_command_line())
-        key, input_path = write(source_code, "cpp", extra=vec_isa_cmd)
+        # Note the distinction between the two booleans. We do minimal optimization if
+        # the optimized_code argument is present at all, since that's how the user of
+        # this function opts in, but we do compilation and linking in one step if the
+        # optimized_code argument is empty (as a micro-optimization).
+        main_build_option = CppTorchDeviceOptions(
+            compile_only=bool(optimized_code),
+            min_optimize=optimized_code is not None,
+            **compile_command,
+        )
+        optimized_build_option = CppTorchDeviceOptions(
+            compile_only=True, **compile_command
+        )
+
+        def get_hashable_command_line(build_option: BuildOptionsBase) -> str:
+            """Writing the code to file will calculate a hash, which we need to vary if
+            the command line flags change. This implements a mostly-generic way of
+            validating that."""
+            return CppBuilder(
+                name="o", sources="i", BuildOption=build_option
+            ).get_command_line()
+
+        main_cmd_line = get_hashable_command_line(main_build_option)
+        optimized_cmd_line = get_hashable_command_line(optimized_build_option)
+
+        key, main_path = write(
+            main_code, "main.cpp", extra=f"{optimized_code} {main_cmd_line}"
+        )
+
+        # Don't bother writing if the argument is empty.
+        if optimized_code:
+            _, optimized_path = write(
+                optimized_code, "optimized.cpp", extra=optimized_cmd_line
+            )
+        else:
+            # Unused, but makes type checkers happy.
+            optimized_path = os.devnull

        if key not in cls.cache:
            from torch.utils._filelock import FileLock

            lock_path = os.path.join(get_lock_dir(), key + ".lock")
-            output_name, output_dir = get_name_and_dir_from_output_file_path(input_path)
            future: Optional[Future[Any]] = None
            lib = None

            # if requested, pre-compile any headers
-            if (
-                config.cpp_cache_precompile_headers
-                and not _IS_WINDOWS
-                and (header_file := cls._get_uncompiled_header(device_type))
-            ):
-                cpp_build_option.precompiled_header = _precompile_header(
-                    header_file,
-                    vec_isa_cmd,
-                    **compile_command,
-                )
+            if config.cpp_cache_precompile_headers and not _IS_WINDOWS:
+                if header := cls._get_uncompiled_header(device_type):
+                    main_build_option.precompiled_header = _precompile_header(
+                        header,
+                        main_cmd_line,
+                        min_optimize=optimized_code is not None,
+                        **compile_command,
+                    )
+
+                # Currently, the optimized_code field is only used for cpp kernel code,
+                # so go ahead and precompile the relevant header here. Revisit this
+                # decision if that ever changes.
+                if optimized_code and (header := _get_cpp_prefix_header(device_type)):
+                    optimized_build_option.precompiled_header = _precompile_header(
+                        header,
+                        optimized_cmd_line,
+                        **compile_command,
+                    )
+
+            main_name, output_dir = get_name_and_dir_from_output_file_path(main_path)
+            main_builder = CppBuilder(
+                name=main_name,
+                sources=main_path,
+                BuildOption=main_build_option,
+                output_dir=output_dir,
+            )
+
+            if optimized_code:
+                optimized_name, _ = get_name_and_dir_from_output_file_path(
+                    optimized_path
+                )
+                optimized_builder = CppBuilder(
+                    name=optimized_name,
+                    sources=optimized_path,
+                    BuildOption=optimized_build_option,
+                    output_dir=output_dir,
+                )
-            cpp_builder = CppBuilder(
-                name=output_name,
-                sources=input_path,
-                output_dir=output_dir,
-                BuildOption=cpp_build_option,
-            )
-            worker_fn = functools.partial(
-                _worker_compile_cpp,
-                lock_path,
-                cpp_builder,
-            )
-            binary_path = normalize_path_separator(cpp_builder.get_target_file_path())
+                linker = CppBuilder(
+                    name=main_name,
+                    sources=[
+                        main_builder.get_target_file_path(),
+                        optimized_builder.get_target_file_path(),
+                    ],
+                    BuildOption=CppTorchDeviceOptions(**compile_command),
+                    output_dir=output_dir,
+                )
+
+                worker_fn = functools.partial(
+                    _worker_compile_cpp,
+                    lock_path,
+                    (main_builder, optimized_builder, linker),
+                )
+                binary_path = normalize_path_separator(linker.get_target_file_path())
+            else:
+                worker_fn = functools.partial(
+                    _worker_compile_cpp, lock_path, (main_builder,)
+                )
+                binary_path = normalize_path_separator(
+                    main_builder.get_target_file_path()
+                )

            def load_fn() -> Any:
                nonlocal lib
@@ -2371,19 +2442,20 @@
        return cls.cache[key]

    @classmethod
-    def load(cls, source_code: str, device_type: str = "cpu") -> Any:
-        return cls.load_async(source_code, device_type)()
+    def load(cls, *args: Any, **kwargs: Any) -> Any:
+        return cls.load_async(*args, **kwargs)()


def _worker_compile_cpp(
    lock_path: str,
-    cpp_builder: CppBuilder,
+    cpp_builders: Sequence[CppBuilder],
) -> None:
    from torch.utils._filelock import FileLock

    with FileLock(lock_path, timeout=LOCK_TIMEOUT):
-        if not os.path.exists(cpp_builder.get_target_file_path()):
-            cpp_builder.build()
+        for builder in cpp_builders:
+            if not os.path.exists(builder.get_target_file_path()):
+                builder.build()
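
When optimized_code is supplied, the builders passed to _worker_compile_cpp form a three-step pipeline: compile the main code at minimal optimization, compile the kernel code at full optimization, then link the two objects. The same idea expressed with plain compiler invocations — a standalone sketch assuming g++ on PATH, not Inductor's CppBuilder API:

import subprocess

def split_build(main_cpp: str, kernel_cpp: str, out_so: str) -> None:
    """Compile the wrapper cheaply (-O1), the kernels thoroughly (-O3),
    then link both objects into one shared library (illustrative sketch)."""
    subprocess.check_call(["g++", "-O1", "-fPIC", "-c", main_cpp, "-o", "main.o"])
    subprocess.check_call(["g++", "-O3", "-fPIC", "-c", kernel_cpp, "-o", "kernel.o"])
    subprocess.check_call(["g++", "-shared", "main.o", "kernel.o", "-o", out_so])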
# Customized Python binding for cpp kernels
@@ -2513,19 +2585,24 @@ class CppPythonBindingsCodeCache(CppCodeCache):
    @classmethod
    def load_pybinding_async(
        cls,
-        argtypes: list[str],
-        source_code: str,
+        argtypes: Sequence[str],
+        main_code: str,
        device_type: str = "cpu",
        num_outputs: int = -1,
        submit_fn: Any = None,
        extra_flags: Sequence[str] = (),
+        kernel_code: Optional[str] = None,
    ) -> Any:
        """
        Wrap a C++ function in fast Python bindings.

        Args:
            argtypes: The types of args to ENTRY_FUNCTION(), e.g. ["float*", "long"]
-            source_code: C++ source code containing a ENTRY_FUNCTION() function
+            main_code: C++ source code containing ENTRY_FUNCTION(). Will be built at
+                -O3 if kernel_code is None (to maximize performance in any kernels that
+                are present), or -O1 otherwise (to minimize compile time).
+            kernel_code: If present, C++ source code that will be built at -O3 and
+                linked to main_code.

        Returns:
            A python version of ENTRY_FUNCTION()
@@ -2541,10 +2618,11 @@ class CppPythonBindingsCodeCache(CppCodeCache):
            extra_parse_arg=cls.extra_parse_arg.format(array_len=num_outputs),
        )
        get_result = cls.load_async(
-            source_code + suffix,
+            main_code + suffix,
            device_type,
            submit_fn=submit_fn,
            extra_flags=extra_flags,
+            optimized_code=kernel_code,
        )
        result = None
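
Under the new signature, a caller splits the thin binding shim from the hot kernel code. A schematic example of the split — hypothetical sources and variable names; the ENTRY_FUNCTION() definition comes from class templates not shown in this hunk:

# Hypothetical caller of the split-source API: the shim that forwards to the
# kernel goes in main_code (built at -O1), the hot loop in kernel_code (-O3).
main_code = r"""
extern void kernel_impl(float* out, long n);  // defined in kernel_code
// ... ENTRY_FUNCTION() definition calling kernel_impl() goes here ...
"""
kernel_code = r"""
void kernel_impl(float* out, long n) {
    for (long i = 0; i < n; ++i) out[i] *= 2.0f;
}
"""
double_fn = CppPythonBindingsCodeCache.load_pybinding(
    argtypes=["float*", "long"],
    main_code=main_code,
    device_type="cpu",
    kernel_code=kernel_code,
)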

View File

@@ -1074,6 +1074,7 @@ class CppWrapperCpu(PythonWrapperCodegen):
        result.writeline("} // inductor_entry_impl")

    def generate_end(self, result):
+        """Generates the end of the code block, and any code needed to call it."""
        if V.graph.aot_mode:
            if V.graph.is_const_graph:
                result.writeline("} // AOTInductorModel::_const_run_impl")
@@ -1081,19 +1082,29 @@ class CppWrapperCpu(PythonWrapperCodegen):
            result.writeline("} // namespace torch::aot_inductor\n\n\n")
            return

-        # Add any kernel definitions into the wrapped code. We currently only build
-        # them in separate files in AOT mode.
-        result.splice(self.kernel_declarations.getvalue())
-        self.kernel_declarations.clear()
+        # Close the wrapper code block, then write any kernel definitions.
+        result.splice("'''\n)")
+        if self.kernel_declarations:
+            result.splice("\nkernel_src = (\nr'''")
+            result.splice(self.kernel_declarations.getvalue())
+            result.splice("'''\n)")
+        else:
+            result.splice(
+                """
+                kernel_src = ''
+                """
+            )

        # cpp entry function for JIT with cpp wrapper
        result.splice(
            f"""
-            '''
-            )
-
            inductor_entry = CppWrapperCodeCache.load_pybinding(
-                ["std::vector<AtenTensorHandle>"], cpp_wrapper_src, "{self.device}", {len(V.graph.graph_outputs)})
+                argtypes=["std::vector<AtenTensorHandle>"],
+                main_code=cpp_wrapper_src,
+                device_type="{self.device}",
+                num_outputs={len(V.graph.graph_outputs)},
+                kernel_code=kernel_src,
+            )
            """
        )
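
Putting these splices together, the generated JIT wrapper module now has roughly this shape — a schematic with illustrative device and output-count values; actual contents are emitted per graph:

# Schematic of the generated JIT cpp-wrapper module (C++ bodies elided):
cpp_wrapper_src = (
r'''
// ... C++ wrapper defining inductor_entry_impl(), built at -O1 ...
'''
)

kernel_src = (
r'''
// ... C++ kernel definitions, compiled separately at -O3 ...
'''
)

inductor_entry = CppWrapperCodeCache.load_pybinding(
    argtypes=["std::vector<AtenTensorHandle>"],
    main_code=cpp_wrapper_src,
    device_type="cpu",  # illustrative
    num_outputs=1,      # illustrative
    kernel_code=kernel_src,
)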

View File

@@ -2284,8 +2284,8 @@ class GraphLowering(torch.fx.Interpreter):
        return self._compile_to_module()

    def _compile_to_module(self) -> CompiledModule:
-        # Currently, if we're here, we don't have to worry about the kernel code, which
-        # is only available in AOTInductor mode.
+        # If we're here, we don't have to worry about the kernel code, which is only
+        # returned separately in AOTInductor mode.
        wrapper_code, _ = (
            self.codegen_with_cpp_wrapper() if self.cpp_wrapper else self.codegen()
        )