[AOTI] Add more default options to compile_standalone (#158560)

Summary: When compiling for standalone, make embed_kernel_binary and emit_multi_arch_kernel default to True, and add a default name for model_name_for_generated_files to make the generated C++ project easier to understand. Also improve the naming of the weights object file to make it more readable. Pull Request resolved: https://github.com/pytorch/pytorch/pull/158560 Approved by: https://github.com/yushangdi
2025-10-20 21:14:14 +08:00 · 2025-07-21 09:23:02 -07:00
parent 9e0473b566
commit a991e285ae
8 changed files with 171 additions and 62 deletions
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@ -1674,12 +1674,6 @@ class AotCodeCompiler:
            wrapper_code = "\n".join((wrapper_code, kernel_code))
            kernel_code = ""

-        from .utils import aoti_model_name_from_config
-
-        model_class_name = ""
-        if config.aot_inductor.compile_standalone:
-            model_class_name = aoti_model_name_from_config()
-
        wrapper_key, wrapper_path = write(
            wrapper_code,
            "wrapper.cpp",
@ -1712,6 +1706,8 @@ class AotCodeCompiler:
                    "model.h",
                )
            ) as f:
+                # model_name_for_generated_files is guaranteed to be non-empty when compile_standalone is enabled
+                model_class_name = config.aot_inductor.model_name_for_generated_files
                class_name = f"AOTInductorModel{model_class_name}"
                header_code = f.read()

@ -1726,7 +1722,7 @@ class AotCodeCompiler:
                    header_code,
                    "h",
                    specified_dir=specified_output_path,
-                    key=f"{model_class_name}",
+                    key=model_class_name,
                )

        # Log the AOTInductor wrapper and kernel code, if needed.
@ -1840,7 +1836,7 @@ class AotCodeCompiler:
                    consts_asm += f"\t.space {len(consts) - 8}\n"
                consts_asm += f".globl\t{symbol_prefix}_binary_constants_bin_end\n"
                consts_asm += f"{symbol_prefix}_binary_constants_bin_end:\n"
-                return consts_asm, "S"
+                return consts_asm, "weights.S"

            # Use c++ to convert consts to object file can support more compilers, such as msvc and icx.
            def format_consts_to_cpp(
@ -1865,7 +1861,7 @@ ATTRIBUTE_NO_SANITIZE_ADDRESS\t\n"""
                        const_cpp += "\t\n"
                const_cpp += "};\t\n"
                const_cpp += f"alignas({align_bytes}) extern unsigned char * {symbol_prefix}_binary_constants_bin_end;\t\n"
-                return const_cpp, "cpp"
+                return const_cpp, "weights.cpp"

            if use_asm_build:
                consts_code, code_ext = format_consts_to_asm(
@ -1880,6 +1876,7 @@ ATTRIBUTE_NO_SANITIZE_ADDRESS\t\n"""
                consts_code,
                code_ext,
                specified_dir=str(specified_sub_dir),
+                key=config.aot_inductor.model_name_for_generated_files,
            )
            consts_s = Path(consts_s)
            object_build_options = CppTorchDeviceOptions(
@ -2173,7 +2170,13 @@ ATTRIBUTE_NO_SANITIZE_ADDRESS\t\n"""
            asm_files = []
            if not _IS_WINDOWS:
                ld, objcopy = get_ld_and_objcopy(use_relative_path)
+                kernels = getattr(V.graph.wrapper_code, "_kernel_name_to_body", {})
                for kernel_name, value in CudaKernelParamCache.cache.items():
+                    if kernel_name not in kernels:
+                        # It is possible that CudaKernelParamCache contains more Triton kernels
+                        # than what the current graph uses
+                        continue
+
                    if asm_file := value["asm"]:
                        asm_files.append(asm_file)