[AOTI] Embed cubin files into .so (#150739)

Summary: Embed cubin files into the .so so that AOTI is one step closer to generating a single binary. The behavior is controlled by a flag and is off by default.
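
For context, a minimal sketch of how the flag can be exercised when packaging a model. The export and config plumbing below mirrors the test changes in this diff; the module, input shapes, and the `inductor_configs` keyword are illustrative assumptions, not part of this commit:

    import torch

    class M(torch.nn.Module):
        def forward(self, x):
            return torch.relu(x) + 1

    model = M().cuda()
    example_inputs = (torch.randn(8, 8, device="cuda"),)
    ep = torch.export.export(model, example_inputs, strict=True)

    # With embed_cubin=True, generated Triton .cubin files are converted to .o
    # objects and linked into the produced .so instead of being shipped as
    # separate files in the package.
    package_path = torch._inductor.aoti_compile_and_package(
        ep,
        inductor_configs={"aot_inductor.embed_cubin": True},
    )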

Differential Revision: [D72535357](https://our.internmc.facebook.com/intern/diff/D72535357)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/150739
Approved by: https://github.com/angelayi
Bin Bao, 2025-05-18 13:24:40 -07:00, committed by PyTorch MergeBot
Parent: a8986963da
Commit: a2d0ef242d
10 changed files with 170 additions and 35 deletions


@@ -124,7 +124,8 @@ except (unittest.SkipTest, ImportError):
 class AOTInductorTestsTemplate:
-    def test_simple(self):
+    @common_utils.parametrize("embed_cubin", [False, True])
+    def test_simple(self, embed_cubin):
         class Model(torch.nn.Module):
             def __init__(self) -> None:
                 super().__init__()
@@ -138,7 +139,18 @@ class AOTInductorTestsTemplate:
             torch.randn(10, 10, device=self.device),
         )
         model = Model()
-        self.check_model(model, example_inputs)
+        with config.patch({"aot_inductor.embed_cubin": embed_cubin}):
+            self.check_model(model, example_inputs)
+            _, code = run_and_get_cpp_code(
+                AOTIRunnerUtil.compile, model, example_inputs
+            )
+            if self.device == GPU_TYPE:
+                FileCheck().check("launchKernel(").run(code)
+                if config.aot_inductor.embed_cubin:
+                    # Not expect to see launchKernel("CUBIN_FILE_NAME"
+                    FileCheck().check_not('launchKernel("').run(code)
+
         if self.use_minimal_arrayref_interface:
             self.code_check_count(
                 model, example_inputs, "AOTInductorModelRunMinimalArrayrefInterface(", 1
@@ -3234,7 +3246,8 @@ class AOTInductorTestsTemplate:
         self.check_model(Model(), inputs)

-    def test_repeated_user_defined_triton_kernel(self):
+    @common_utils.parametrize("embed_cubin", [False, True])
+    def test_repeated_user_defined_triton_kernel(self, embed_cubin):
         if self.device != GPU_TYPE:
             raise unittest.SkipTest("requires GPU")
@@ -3248,7 +3261,14 @@ class AOTInductorTestsTemplate:
             return x

         inputs = (torch.randn(4, 4, device=self.device),)
-        self.check_model(Model(), inputs)
+        with config.patch({"aot_inductor.embed_cubin": embed_cubin}):
+            model = Model()
+            self.check_model(model, inputs)
+            _, code = run_and_get_cpp_code(AOTIRunnerUtil.compile, model, inputs)
+            FileCheck().check("launchKernel(").run(code)
+            if config.aot_inductor.embed_cubin:
+                # Not expect to see launchKernel("CUBIN_FILE_NAME"
+                FileCheck().check_not('launchKernel("').run(code)

     @unittest.skipIf(
         not IS_BIG_GPU, "Skipping triton backend only since not big GPU (not enough SM)"


@@ -205,6 +205,8 @@ class TestAOTInductorPackage(TestCase):
         options = {
             "aot_inductor.package_cpp_only": self.package_cpp_only,
+            # Require kernels to be compiled into .o files
+            "aot_inductor.embed_cubin": True,
         }
         ep = torch.export.export(model, example_inputs, strict=True)
         package_path = torch._inductor.aoti_compile_and_package(
@@ -216,6 +218,10 @@ class TestAOTInductorPackage(TestCase):
             zip_ref.extractall(tmp_dir)
             tmp_path = Path(tmp_dir) / "data" / "aotinductor" / "model"
             self.assertTrue(tmp_path.exists())
+            if self.device == GPU_TYPE:
+                self.assertTrue(not list(tmp_path.glob("*.cubin")))
+                self.assertTrue(list(tmp_path.glob("*.cubin.o")))
+
             build_path = tmp_path / "build"
             self.assertTrue(not build_path.exists())


@@ -79,6 +79,21 @@ class TestCppWrapperHipify(TestCase):
     return func;
 }

+static inline hipFunction_t loadKernel(const void* start, const std::string &funcName, uint32_t sharedMemBytes) {
+    hipModule_t mod;
+    hipFunction_t func;
+    CUDA_DRIVER_CHECK(hipModuleLoadData(&mod, start));
+    CUDA_DRIVER_CHECK(hipModuleGetFunction(&func, mod, funcName.c_str()));
+    if (sharedMemBytes > 0) {
+        CUDA_DRIVER_CHECK(hipFuncSetAttribute(
+            func,
+            hipFuncAttributeMaxDynamicSharedMemorySize,
+            sharedMemBytes
+        ))
+    }
+    return func;
+}
+
 static inline void launchKernel(
     hipFunction_t func,
     uint32_t gridX,


@@ -62,10 +62,12 @@ from torch._inductor.cpp_builder import (
     _set_gpu_runtime_env,
     _TORCH_PATH,
     _transform_cuda_paths,
+    convert_cubin_to_obj,
     CppBuilder,
     CppOptions,
     CppTorchDeviceOptions,
     get_compiler_version_info,
+    get_ld_and_objcopy,
     get_name_and_dir_from_output_file_path,
     normalize_path_separator,
 )
@@ -1960,7 +1962,16 @@ class AotCodeCompiler:
                 for entry in gpu_codecache.cache.values()
                 if entry.output_path.endswith(".o")
             ]
-            gpu_kernels_o = " ".join(gpu_kernels_o)
+            cubins_o = []
+            if config.aot_inductor.embed_cubin:
+                # Embed cubin files into .so using objcopy
+                ld, objcopy = get_ld_and_objcopy(use_relative_path)
+                for kernel_name, value in CudaKernelParamCache.cache.items():
+                    cubin_file = value[get_cpp_wrapper_cubin_path_name()]
+                    cubins_o.append(
+                        convert_cubin_to_obj(cubin_file, kernel_name, ld, objcopy)
+                    )

             output_name, output_dir = get_name_and_dir_from_output_file_path(output_so)
             so_build_options = CppTorchDeviceOptions(
@@ -1970,11 +1981,10 @@ class AotCodeCompiler:
                 use_relative_path=use_relative_path,
             )

+            obj_srcs = [wrapper_o, kernel_o, consts_o, *gpu_kernels_o, *cubins_o]
             so_builder = CppBuilder(
                 name=output_name,
-                sources=[wrapper_o, kernel_o, consts_o, gpu_kernels_o]
-                if gpu_kernels_o
-                else [wrapper_o, kernel_o, consts_o],
+                sources=obj_srcs,
                 output_dir=output_dir,
                 BuildOption=so_build_options,
             )
@@ -2019,17 +2029,14 @@ class AotCodeCompiler:
                     generated_files.append(weight_file)

-                generated_files.append(consts_o)
-                generated_files.append(gpu_kernels_o)
-
-                so_builder.save_src_to_cmake(cmake_path, consts_o)
-                for gpu_o in gpu_kernels_o.split():
-                    so_builder.save_src_to_cmake(cmake_path, gpu_o)
+                obj_srcs = [consts_o, *gpu_kernels_o, *cubins_o]
+                generated_files.extend(obj_srcs)
+                for obj in obj_srcs:
+                    so_builder.save_src_to_cmake(cmake_path, obj)
                 so_builder.save_link_cmd_to_cmake(cmake_path)
             else:
                 so_builder.build()
-                for o_file in [wrapper_o, kernel_o, consts_o]:
+                for o_file in obj_srcs:
                     # Remove these as they are not needed anymore
                     os.remove(o_file)


@@ -666,7 +666,15 @@ class CppWrapperCpu(PythonWrapperCodegen):
                 signature = kernel.get_signature().replace(name, kernel_ptr)
                 self.prefix.writeline(f"    {signature} = torch::aot_inductor::{name};")
             self.prefix.writeline("};")
-            self.prefix.writeline("} // namespace")
+            self.prefix.writeline("} // namespace\n\n")
+
+            if config.aot_inductor.embed_cubin:
+                self.prefix.writeline('extern "C" {')
+                for name in sorted(declare_kernel):
+                    self.prefix.writeline(
+                        f"    extern const unsigned char __{name}_start[];"
+                    )
+                self.prefix.writeline("}")

     def codegen_model_constructor(self):
         """


@@ -58,6 +58,9 @@ class DeferredTritonCallWrapper:
     arg_types: list[Any]

     def generate(self, wrapper: CppWrapperGpu):
+        """
+        Generate the GPU kernel definition, as well as load and launch code.
+        """
         prefix = wrapper.prefix
         if self.kernel_name.startswith("multi_kernel_"):
             # MultiKernel will select one kernel after running the autotune block
@@ -132,10 +135,12 @@ class DeferredTritonCallWrapper:
             self.generate_load_kernel(prefix, kernel_var_name, params)
             self.generate_launch_kernel(prefix, wrapper, kernel_var_name, params)
         prefix.writeline("}")
-        # Ensure the cubin file is included in the package
-        V.graph.wrapper_code.additional_files.append(
-            params[get_cpp_wrapper_cubin_path_name()]
-        )
+
+        if not config.aot_inductor.embed_cubin:
+            # Ensure the cubin file is included in the package
+            V.graph.wrapper_code.additional_files.append(
+                params[get_cpp_wrapper_cubin_path_name()]
+            )

     def generate_grid(
         self,
@@ -160,12 +165,20 @@ class DeferredTritonCallWrapper:
     def generate_load_kernel(self, prefix, kernel_var_name, params):
         prefix.writeline(f"if ({kernel_var_name} == nullptr) {{")
         with prefix.indent():
-            load_kernel_args = [
-                cpp_string_literal(params[get_cpp_wrapper_cubin_path_name()]),
-                cpp_string_literal(params["mangled_name"]),
-                str(params["shared_mem"]),
-                "cubin_dir_",
-            ]
+            load_kernel_args = (
+                [
+                    f"__{params['inductor_meta']['kernel_name']}_start",
+                    cpp_string_literal(params["mangled_name"]),
+                    str(params["shared_mem"]),
+                ]
+                if V.graph.aot_mode and config.aot_inductor.embed_cubin
+                else [
+                    cpp_string_literal(params[get_cpp_wrapper_cubin_path_name()]),
+                    cpp_string_literal(params["mangled_name"]),
+                    str(params["shared_mem"]),
+                    "cubin_dir_",
+                ]
+            )
             prefix.writeline(
                 f"{kernel_var_name} = loadKernel({', '.join(load_kernel_args)}); "
            )
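
As an illustration of the two load paths above, a simplified Python sketch of the argument list that ends up in the generated loadKernel(...) call. The params dict and its key names here are hypothetical stand-ins for the real kernel metadata:

    params = {
        "cubin_path": "/tmp/model/triton_kernel_0.cubin",  # hypothetical path
        "mangled_name": "triton_kernel_0",
        "shared_mem": 0,
        "inductor_meta": {"kernel_name": "triton_kernel_0"},
    }

    def load_kernel_args(embed_cubin: bool, aot_mode: bool = True) -> list[str]:
        if aot_mode and embed_cubin:
            # Kernel bytes are linked into the .so; pass the embedded symbol
            # to the loadKernel(const void* start, ...) overload.
            return [
                "__" + params["inductor_meta"]["kernel_name"] + "_start",
                '"' + params["mangled_name"] + '"',
                str(params["shared_mem"]),
            ]
        # Otherwise the kernel stays in a separate .cubin file looked up at runtime.
        return [
            '"' + params["cubin_path"] + '"',
            '"' + params["mangled_name"] + '"',
            str(params["shared_mem"]),
            "cubin_dir_",
        ]

    print(f"kernel = loadKernel({', '.join(load_kernel_args(True))});")
    # kernel = loadKernel(__triton_kernel_0_start, "triton_kernel_0", 0);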


@@ -88,6 +88,21 @@ class CUDADeviceOpOverrides(DeviceOpOverrides):
     return func;
 }

+static inline CUfunction loadKernel(const void* start, const std::string &funcName, uint32_t sharedMemBytes) {
+    CUmodule mod;
+    CUfunction func;
+    CUDA_DRIVER_CHECK(cuModuleLoadData(&mod, start));
+    CUDA_DRIVER_CHECK(cuModuleGetFunction(&func, mod, funcName.c_str()));
+    if (sharedMemBytes > 0) {
+        CUDA_DRIVER_CHECK(cuFuncSetAttribute(
+            func,
+            CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES,
+            sharedMemBytes
+        ))
+    }
+    return func;
+}
+
 static inline void launchKernel(
     CUfunction func,
     uint32_t gridX,


@@ -1295,6 +1295,9 @@ class aot_inductor:
     # Experimental. Controls automatic precompiling of common AOTI include files.
     precompile_headers: bool = not is_fbcode()

+    # Embed generated .cubin files into the .so
+    embed_cubin: bool = False
+

 class cuda:
     """Settings for cuda backend, today this consists of cutlass"""


@@ -157,6 +157,51 @@ def get_cpp_compiler() -> str:
     return compiler


+def get_ld_and_objcopy(use_relative_path: bool) -> tuple[str, str]:
+    if _IS_WINDOWS:
+        raise RuntimeError("Windows is not supported yet.")
+    else:
+        if config.is_fbcode():
+            ld = build_paths.ld
+            objcopy = (
+                build_paths.objcopy_fallback
+                if use_relative_path
+                else build_paths.objcopy
+            )
+        else:
+            ld = "ld"
+            objcopy = "objcopy"
+    return ld, objcopy
+
+
+def convert_cubin_to_obj(
+    cubin_file: str,
+    kernel_name: str,
+    ld: str,
+    objcopy: str,
+) -> str:
+    obj_file = cubin_file + ".o"
+    # Convert .cubin to .o
+    cmd = f"{ld} -r -b binary -z noexecstack -o {obj_file} {cubin_file}"
+    subprocess.run(cmd.split(), capture_output=True, text=True)
+    os.remove(cubin_file)
+    # Rename .data to .rodata
+    cmd = f"{objcopy} --rename-section .data=.rodata,alloc,load,readonly,data,contents {obj_file}"
+    subprocess.run(cmd.split(), capture_output=True, text=True)
+    # By default objcopy will create *_start, *_size, *_end symbols using the full path.
+    # Rename them to use the unique kernel name.
+    file_name = re.sub(r"[\W]", "_", cubin_file)
+    cmd = (
+        objcopy
+        + f" --redefine-sym _binary_{file_name}_start=__{kernel_name}_start "
+        + f"--redefine-sym _binary_{file_name}_size=__{kernel_name}_size "
+        + f"--redefine-sym _binary_{file_name}_end=__{kernel_name}_end "
+        + obj_file
+    )
+    subprocess.run(cmd.split(), capture_output=True, text=True)
+    return obj_file
+
+
 @functools.lru_cache(None)
 def _is_apple_clang(cpp_compiler: str) -> bool:
     version_string = subprocess.check_output([cpp_compiler, "--version"]).decode("utf8")
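
As a side note on the renaming above: ld -r -b binary derives its _binary_* symbol names from the sanitized input path, which is why convert_cubin_to_obj rewrites them to stable, kernel-derived names. A small illustrative sketch of that mapping; the path and kernel name are hypothetical:

    import re

    def default_binary_symbols(cubin_path: str) -> list[str]:
        # ld -r -b binary replaces non-word characters in the path with '_'
        # when it synthesizes the _binary_*_start/_end/_size symbols.
        stem = re.sub(r"[\W]", "_", cubin_path)
        return [f"_binary_{stem}_{suffix}" for suffix in ("start", "end", "size")]

    def embedded_symbols(kernel_name: str) -> list[str]:
        # Names after the --redefine-sym step; these are what the generated
        # C++ wrapper declares and passes to loadKernel.
        return [f"__{kernel_name}_{suffix}" for suffix in ("start", "end", "size")]

    print(default_binary_symbols("/tmp/abc/triton_kernel_0.cubin")[0])
    # _binary__tmp_abc_triton_kernel_0_cubin_start
    print(embedded_symbols("triton_kernel_0")[0])
    # __triton_kernel_0_start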


@@ -264,7 +264,7 @@ bool recursive_rmdir(const std::string& path) {

 std::string compile_so(
     const std::string& cpp_filename,
-    const std::string& consts_filename) {
+    std::vector<std::string>& obj_filenames) {
   // Compile the cpp file into a .so

   size_t lastindex = cpp_filename.find_last_of('.');
@@ -280,8 +280,9 @@ std::string compile_so(
       cpp_filename.substr(0, lastindex) + "_linker_flags.json";
   const nlohmann::json linker_flags = load_json_file(linker_flags_path);

-  auto [link_cmd, output_so] = get_cpp_compile_command(
-      filename, {output_o, consts_filename}, linker_flags);
+  obj_filenames.push_back(output_o);
+  auto [link_cmd, output_so] =
+      get_cpp_compile_command(filename, obj_filenames, linker_flags);

   // Run the commands to generate a .so file
   int status = system(compile_cmd.c_str());
@@ -369,7 +370,7 @@ AOTIModelPackageLoader::AOTIModelPackageLoader(
   temp_dir_ = create_temp_dir();
   std::string so_filename;
   std::string cpp_filename;
-  std::string consts_filename;
+  std::vector<std::string> obj_filenames;
   std::string found_filenames; // Saving for bookkeeping
   std::string model_directory =
       "data" + k_separator + "aotinductor" + k_separator + model_name;
@@ -408,8 +409,10 @@ AOTIModelPackageLoader::AOTIModelPackageLoader(
      if (lastSlash != std::string::npos) {
        filename = filename_str.substr(lastSlash + 1);
      }
-      output_path_str +=
-          k_separator + model_directory + k_separator + filename;
+      output_path_str.append(k_separator)
+          .append(model_directory)
+          .append(k_separator)
+          .append(filename);
     }

     LOG(INFO) << "Extract file: " << filename_str << " to "
@@ -440,7 +443,7 @@ AOTIModelPackageLoader::AOTIModelPackageLoader(
     if (filename_extension == ".cpp") {
       cpp_filename = output_path_str;
     } else if (filename_extension == ".o") {
-      consts_filename = output_path_str;
+      obj_filenames.push_back(output_path_str);
     } else if (filename_extension == ".so") {
       so_filename = output_path_str;
     }
@@ -465,7 +468,7 @@ AOTIModelPackageLoader::AOTIModelPackageLoader(
   // Compile the .so
   std::string so_path = !so_filename.empty()
       ? so_filename
-      : compile_so(cpp_filename, consts_filename);
+      : compile_so(cpp_filename, obj_filenames);

   // Load metadata which can be queried by user
   load_metadata(cpp_filename);