Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 12:54:11 +08:00
[AOTI] codegen for static linkage (#157129)
Design doc: https://docs.google.com/document/d/1ncV7RpJ8xDwy8-_aCBfvZmpTTL824C-aoNPBLLVkOHM/edit?tab=t.0 (internal)

- Add codegen for static linkage
- Refactor test code for the test_compile_after_package tests

For now, the following option must be used together with `"aot_inductor.compile_standalone": True`:

    "aot_inductor.package_cpp_only": True,

A follow-up PR will change `"aot_inductor.package_cpp_only"` to be set to True automatically.

```
python test/inductor/test_aot_inductor_package.py -k test_compile_after_package
python test/inductor/test_aot_inductor_package.py -k test_run_static_linkage_model
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157129
Approved by: https://github.com/desertfire
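For context, a minimal sketch of how the standalone-compile path is exercised (mirroring the new tests in this PR; the module and inputs below are placeholders, not part of the change):

```python
import torch


class MyModel(torch.nn.Module):  # placeholder model for illustration
    def forward(self, x, y):
        return x + y


example_inputs = (torch.randn(10, 10), torch.randn(10, 10))

# Per the note above, both options currently need to be set together;
# a follow-up PR is expected to flip package_cpp_only automatically.
options = {
    "aot_inductor.compile_standalone": True,
    "aot_inductor.package_cpp_only": True,
}

ep = torch.export.export(MyModel(), example_inputs)
package_path = torch._inductor.aoti_compile_and_package(ep, inductor_configs=options)
# The resulting package can be extracted and built with CMake/Make, which is
# what test_compile_after_package and test_run_static_linkage_model do below.
print(package_path)
```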
Committed by: PyTorch MergeBot
Parent: 9bdf87e891
Commit: 4781d72faa
@@ -231,7 +231,8 @@ include_patterns = [
    'c10/**/*.cpp',
    'c10/**/*.h',
    'torch/*.h',
    'torch/_inductor/codegen/aoti_runtime/interface.cpp',
    'torch/_inductor/codegen/aoti_runtime/*.h',
    'torch/_inductor/codegen/aoti_runtime/*.cpp',
    'torch/csrc/*.h',
    'torch/csrc/*.cpp',
    'torch/csrc/**/*.h',

setup.py
@@ -1310,7 +1310,9 @@ def main() -> None:
        "include/**/*.hpp",
        "include/*.cuh",
        "include/**/*.cuh",
        "csrc/inductor/aoti_runtime/model.h",
        "_inductor/codegen/*.h",
        "_inductor/codegen/aoti_runtime/*.h",
        "_inductor/codegen/aoti_runtime/*.cpp",
        "_inductor/script.ld",
        "_export/serde/*.yaml",

@@ -30,6 +30,20 @@ from torch.testing._internal.common_utils import (
from torch.testing._internal.inductor_utils import GPU_TYPE, HAS_GPU


try:
    from test_static_linkage_utils import (
        get_static_linkage_main_cpp_file,
        get_static_linkage_makelist_file_cpu,
        get_static_linkage_makelist_file_cuda,
    )
except ImportError:
    from .test_static_linkage_utils import (
        get_static_linkage_main_cpp_file,
        get_static_linkage_makelist_file_cpu,
        get_static_linkage_makelist_file_cuda,
    )


def skipif(predicate: Callable[[str, bool], bool], reason: str):
    def decorator(func):
        @functools.wraps(func)

@@ -126,6 +140,54 @@ class TestAOTInductorPackage(TestCase):
        self.assertEqual(actual, expected, atol=atol, rtol=rtol)
        return compiled_model

    def check_package_cpp_only(self: TestCase) -> None:
        """
        Check if cmake and make are available.
        Skip tests where self.package_cpp_only is False.
        """
        if not self.package_cpp_only:
            raise unittest.SkipTest("Only meant to test cpp package")
        if shutil.which("cmake") is None:
            raise unittest.SkipTest("cmake is not available")
        if shutil.which("make") is None:
            raise unittest.SkipTest("make is not available")

    def cmake_compile(self, model, example_inputs, options, tmp_dir):
        """
        Exports model, compiles it using AOTInductor, extracts the
        generated files to tmp_dir, and builds the C++ code using CMake and Make.

        Returns:
            - build_path (Path): Path to the CMake build directory containing the compiled binary.
            - tmp_path (Path): Path to the extracted model source directory.
        """
        ep = torch.export.export(model, example_inputs)
        package_path = torch._inductor.aoti_compile_and_package(
            ep, inductor_configs=options
        )
        with (
            zipfile.ZipFile(package_path, "r") as zip_ref,
        ):
            filenames = zip_ref.namelist()
            prefix = filenames[0].split("/")[0]
            zip_ref.extractall(tmp_dir)
        tmp_path = Path(tmp_dir) / prefix / "data" / "aotinductor" / "model"
        self.assertTrue(tmp_path.exists())
        # Create a build directory to run cmake
        build_path = tmp_path / "build"
        self.assertTrue(not build_path.exists())
        build_path.mkdir()
        custom_env = os.environ.copy()
        custom_env["CMAKE_PREFIX_PATH"] = str(Path(torch.__file__).parent)
        subprocess.run(
            ["cmake", ".."],
            cwd=build_path,
            env=custom_env,
            check=True,
        )
        subprocess.run(["make"], cwd=build_path, check=True)
        return build_path, tmp_path

    def test_add(self):
        class Model(torch.nn.Module):
            def forward(self, x, y):

@@ -189,12 +251,7 @@ class TestAOTInductorPackage(TestCase):
    @unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode")
    @skipIfXpu  # build system may be different
    def test_compile_after_package(self):
        if not self.package_cpp_only:
            raise unittest.SkipTest("Only meant to test cpp package")
        if shutil.which("cmake") is None:
            raise unittest.SkipTest("cmake is not available")
        if shutil.which("make") is None:
            raise unittest.SkipTest("make is not available")
        self.check_package_cpp_only()

        class Model(torch.nn.Module):
            def __init__(self) -> None:

@@ -217,39 +274,19 @@ class TestAOTInductorPackage(TestCase):
            # Require kernels to be compiled into .o files
            "aot_inductor.embed_kernel_binary": True,
        }
        ep = torch.export.export(model, example_inputs, strict=True)
        package_path = torch._inductor.aoti_compile_and_package(
            ep, inductor_configs=options
        )
        with (
            tempfile.TemporaryDirectory() as tmp_dir,
            zipfile.ZipFile(package_path, "r") as zip_ref,
        ):
            filenames = zip_ref.namelist()
            prefix = filenames[0].split("/")[0]
            zip_ref.extractall(tmp_dir)
            tmp_path = Path(tmp_dir) / prefix / "data" / "aotinductor" / "model"
            self.assertTrue(tmp_path.exists())
            build_path, tmp_path = self.cmake_compile(
                model, example_inputs, options, tmp_dir
            )

            if self.device == GPU_TYPE:
                kernel_bin = get_kernel_bin_format(self.device)
                self.assertTrue(not list(tmp_path.glob(f"*.{kernel_bin}")))
                # Check if .cubin.o files exist and use unique kernel names
                self.assertTrue(list(tmp_path.glob(f"triton_*.{kernel_bin}.o")))

            build_path = tmp_path / "build"
            self.assertTrue(not build_path.exists())

            # Create a build directory to run cmake
            build_path.mkdir()
            custom_env = os.environ.copy()
            custom_env["CMAKE_PREFIX_PATH"] = str(Path(torch.__file__).parent)
            subprocess.run(
                ["cmake", ".."],
                cwd=build_path,
                env=custom_env,
            )
            subprocess.run(["make"], cwd=build_path)

            # Check if the .so file was built successfully
            so_path = build_path / "libaoti_model.so"
            self.assertTrue(so_path.exists())

@@ -263,12 +300,7 @@ class TestAOTInductorPackage(TestCase):
    def test_compile_after_package_multi_arch(self):
        if self.device != GPU_TYPE:
            raise unittest.SkipTest("Only meant to test GPU_TYPE")
        if not self.package_cpp_only:
            raise unittest.SkipTest("Only meant to test cpp package")
        if shutil.which("cmake") is None:
            raise unittest.SkipTest("cmake is not available")
        if shutil.which("make") is None:
            raise unittest.SkipTest("make is not available")
        self.check_package_cpp_only()

        class Model(torch.nn.Module):
            def __init__(self) -> None:

@@ -293,31 +325,12 @@ class TestAOTInductorPackage(TestCase):
            "aot_inductor.emit_multi_arch_kernel": True,
            "aot_inductor.embed_kernel_binary": True,
        }
        ep = torch.export.export(model, example_inputs)
        package_path = torch._inductor.aoti_compile_and_package(
            ep, inductor_configs=options
        )
        with (
            tempfile.TemporaryDirectory() as tmp_dir,
            zipfile.ZipFile(package_path, "r") as zip_ref,
        ):
            filenames = zip_ref.namelist()
            prefix = filenames[0].split("/")[0]
            zip_ref.extractall(tmp_dir)
            tmp_path = Path(tmp_dir) / prefix / "data" / "aotinductor" / "model"
            self.assertTrue(tmp_path.exists())
            # Create a build directory to run cmake
            build_path = tmp_path / "build"
            build_path.mkdir()
            custom_env = os.environ.copy()
            custom_env["CMAKE_PREFIX_PATH"] = str(Path(torch.__file__).parent)
            subprocess.run(
                ["cmake", ".."],
                cwd=build_path,
                env=custom_env,
            build_path, _ = self.cmake_compile(
                model, example_inputs, options, tmp_dir
            )
            subprocess.run(["make"], cwd=build_path)

            # Check if the .so file was built successfully
            so_path = build_path / "libaoti_model.so"
            self.assertTrue(so_path.exists())

@@ -325,6 +338,137 @@ class TestAOTInductorPackage(TestCase):
        actual = optimized(*example_inputs)
        self.assertTrue(torch.allclose(actual, expected))

    @unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode")
    @skipIfXpu  # build system may be different
    def test_compile_after_package_static(self):
        # compile_standalone will set package_cpp_only=True
        self.check_package_cpp_only()

        class Model(torch.nn.Module):
            def __init__(self) -> None:
                super().__init__()
                self.linear = torch.nn.Linear(10, 10)

            def forward(self, x, y):
                return x + self.linear(y)

        with torch.no_grad():
            example_inputs = (
                torch.randn(10, 10, device=self.device),
                torch.randn(10, 10, device=self.device),
            )
            model = Model().to(device=self.device)

            # Test compilation when no name is passed in
            options = {
                "aot_inductor.compile_standalone": True,
            }
            with (
                tempfile.TemporaryDirectory() as tmp_dir,
            ):
                build_path, _ = self.cmake_compile(
                    model, example_inputs, options, tmp_dir
                )
                # Check if the .a file was built successfully
                a_path = build_path / "libaoti_model.a"
                self.assertTrue(a_path.exists())

            # Test compilation when model name is passed in
            options = {
                "aot_inductor.compile_standalone": True,
                "aot_inductor.model_name_for_generated_files": "linear",
            }
            with (
                tempfile.TemporaryDirectory() as tmp_dir,
            ):
                build_path, _ = self.cmake_compile(
                    model, example_inputs, options, tmp_dir
                )
                # Check if the .a file was built successfully
                a_path = build_path / "liblinear.a"
                self.assertTrue(a_path.exists())

            # Test invalid model name
            options = {
                "aot_inductor.compile_standalone": True,
                "aot_inductor.model_name_for_generated_files": "linear/linear",
            }
            with self.assertRaisesRegex(Exception, "Invalid AOTI model name"):
                self.cmake_compile(model, example_inputs, options, "")

    @unittest.skipIf(IS_FBCODE, "cmake won't work in fbcode")
    @skipIfRocm  # doesn't support multi-arch binary
    @skipIfXpu  # doesn't support multi-arch binary
    def test_run_static_linkage_model(self):
        self.check_package_cpp_only()

        class Model1(torch.nn.Module):
            def forward(self, x, y):
                return x + y

        class Model2(torch.nn.Module):
            def forward(self, x, y):
                return x - y

        example_inputs = (
            torch.randn(10, 10, device=self.device),
            torch.randn(10, 10, device=self.device),
        )

        model1 = Model1().to(self.device)
        model2 = Model2().to(self.device)

        models = [model1, model2]

        i = 0
        model_names = ["Plus", "Minus"]
        with (
            tempfile.TemporaryDirectory() as tmp_dir,
        ):
            for i in range(2):
                model = models[i]
                # TODO: should be done through _ExportPackage
                ep = torch.export.export(model, example_inputs)

                package_path = torch._inductor.aoti_compile_and_package(
                    ep,
                    inductor_configs={
                        "aot_inductor.compile_standalone": True,
                        "always_keep_tensor_constants": True,
                        "aot_inductor.model_name_for_generated_files": model_names[i],
                    },
                )
                with (
                    zipfile.ZipFile(package_path, "r") as zip_ref,
                ):
                    zip_ref.extractall(tmp_dir)

            file_str = get_static_linkage_main_cpp_file()
            with open(Path(tmp_dir) / "main.cpp", "w") as f:
                f.write(file_str)

            if self.device == GPU_TYPE:
                cmake_file_str = get_static_linkage_makelist_file_cuda()
            else:
                cmake_file_str = get_static_linkage_makelist_file_cpu()
            with open(Path(tmp_dir) / "CMakeLists.txt", "w") as f:
                f.write(cmake_file_str)

            build_path = Path(tmp_dir) / "build"
            build_path.mkdir()
            custom_env = os.environ.copy()
            custom_env["CMAKE_PREFIX_PATH"] = str(Path(torch.__file__).parent)
            subprocess.run(
                ["cmake", ".."],
                cwd=build_path,
                env=custom_env,
            )

            subprocess.run(["make"], cwd=build_path, check=True)
            subprocess.run(
                ["./main", f"{tmp_dir}/", self.device], cwd=build_path, check=True
            )

    def test_metadata(self):
        class Model(torch.nn.Module):
            def __init__(self) -> None:

test/inductor/test_static_linkage_utils.py (new file, 157 lines)
@@ -0,0 +1,157 @@
# Owner(s): ["module: inductor"]
from torch.testing._internal.common_utils import run_tests


def get_static_linkage_main_cpp_file():
    return """
#include <dlfcn.h>
#include <iostream>
#include <memory>
#include <torch/torch.h>
#include <vector>

#include <cuda.h>
#include <cuda_runtime_api.h>
// Include the AOTInductor headers
#include "Minus.wrapper/data/aotinductor/model/Minus.h"
#include "Plus.wrapper/data/aotinductor/model/Plus.h"
#include <torch/csrc/inductor/aoti_runtime/model_container.h>
#include <torch/csrc/inductor/aoti_torch/tensor_converter.h>

using torch::aot_inductor::AOTInductorModelMinus;
using torch::aot_inductor::AOTInductorModelPlus;
using torch::aot_inductor::ConstantHandle;
using torch::aot_inductor::ConstantMap;


int main(int argc, char* argv[]) {
  if (argc < 3) {
    std::cerr
        << "Usage: ./main <path> <device>"
        << std::endl;
    return 1;
  }
  std::string path = argv[1];
  std::string device_str = argv[2];
  try {
    torch::Device device(device_str);

    // Create two input tensors (10x10)
    auto tensor1 = torch::ones({10, 10}, device);
    auto tensor2 = torch::ones({10, 10}, device);
    // Create two input tensors (10x10)
    auto tensor3 = torch::ones({10, 10}, device);
    auto tensor4 = torch::ones({10, 10}, device);

    std::vector<at::Tensor> input_tensors = {tensor1, tensor2};
    std::vector<at::Tensor> input_tensors2 = {tensor3, tensor4};

    // Create array of input handles
    auto input_handles1 =
        torch::aot_inductor::unsafe_alloc_new_handles_from_tensors(
            input_tensors);
    auto input_handles2 =
        torch::aot_inductor::unsafe_alloc_new_handles_from_tensors(
            input_tensors2);

    // Create array for output handle
    AtenTensorHandle output_handle1;
    AtenTensorHandle output_handle2;

    auto constants_map = std::make_shared<ConstantMap>();
    auto constants_array = std::make_shared<std::vector<ConstantHandle>>();
    auto model1 = AOTInductorModelPlus::Create(
        constants_map, constants_array, device_str,
        path + "Plus.wrapper/data/"
               "aotinductor/model/");
    model1->load_constants();

    auto constants_map2 = std::make_shared<ConstantMap>();
    auto constants_array2 = std::make_shared<std::vector<ConstantHandle>>();
    auto model2 = AOTInductorModelMinus::Create(
        constants_map2, constants_array2, device_str,
        path + "Minus.wrapper/data/"
               "aotinductor/model/");
    model2->load_constants();

    // Run the models
    torch::aot_inductor::DeviceStreamType stream1 = nullptr;
    torch::aot_inductor::DeviceStreamType stream2 = nullptr;
    model1->run(&input_handles1[0], &output_handle1, stream1, nullptr);
    model2->run(&input_handles2[0], &output_handle2, stream2, nullptr);

    // Convert output handles to tensors
    auto output_tensor1 =
        torch::aot_inductor::alloc_tensors_by_stealing_from_handles(
            &output_handle1, 1);
    auto output_tensor2 =
        torch::aot_inductor::alloc_tensors_by_stealing_from_handles(
            &output_handle2, 1);

    if (!(torch::all(output_tensor1[0] == 2).item<bool>())) {
      std::cout << "Wrong Output for Plus Model: " << output_tensor1 << std::endl;
      throw std::runtime_error("Tensor does not contain only the expected value 2.");
    }
    if (!(torch::all(output_tensor2[0] == 0).item<bool>())) {
      std::cout << "Wrong Output for Minus Model: " << output_tensor2 << std::endl;
      throw std::runtime_error("Tensor does not contain only the expected value 0.");
    }

    return 0;
  } catch (const std::exception &e) {
    std::cerr << "Error: " << e.what() << std::endl;
    return 1;
  }
}

"""


def get_static_linkage_makelist_file_cuda():
    return """
cmake_minimum_required(VERSION 3.10)
project(TestProject)

set(CMAKE_CXX_STANDARD 17)

find_package(Torch REQUIRED)
find_package(CUDA REQUIRED)

add_subdirectory(Plus.wrapper/data/aotinductor/model/)
add_subdirectory(Minus.wrapper/data/aotinductor/model/)

# Create executable
add_executable(main main.cpp)

target_compile_definitions(main PRIVATE USE_CUDA)

target_link_libraries(main PRIVATE torch cuda
    ${CUDA_LIBRARIES}
    Plus
    Minus)
"""


def get_static_linkage_makelist_file_cpu():
    return """
cmake_minimum_required(VERSION 3.10)
project(TestProject)

set(CMAKE_CXX_STANDARD 17)

find_package(Torch REQUIRED)

add_subdirectory(Plus.wrapper/data/aotinductor/model/)
add_subdirectory(Minus.wrapper/data/aotinductor/model/)

# Create executable
add_executable(main main.cpp)

target_link_libraries(main PRIVATE torch
    Plus
    Minus)
"""


if __name__ == "__main__":
    run_tests()

@@ -1660,6 +1660,12 @@ class AotCodeCompiler:
            wrapper_code = "\n".join((wrapper_code, kernel_code))
            kernel_code = ""

        from .utils import aoti_model_name_from_config

        model_class_name = ""
        if config.aot_inductor.compile_standalone:
            model_class_name = aoti_model_name_from_config()

        wrapper_key, wrapper_path = write(
            wrapper_code,
            "wrapper.cpp",

@@ -1679,6 +1685,36 @@ class AotCodeCompiler:
            key=config.aot_inductor.model_name_for_generated_files,
        )

        header_code = ""
        header_path = ""
        if config.aot_inductor.compile_standalone:
            # to link statically, we also need a header file
            with open(
                os.path.join(
                    os.path.dirname(os.path.dirname(__file__)),
                    "csrc",
                    "inductor",
                    "aoti_runtime",
                    "model.h",
                )
            ) as f:
                class_name = f"AOTInductorModel{model_class_name}"
                header_code = f.read()

                # we replace like this to avoid replacing
                # AOTInductorModelBase and AOTInductorModelKernelsBase
                header_code = (
                    header_code.replace("<AOTInductorModel>", f"<{class_name}>")
                    .replace("AOTInductorModel(", f"{class_name}(")
                    .replace("AOTInductorModel :", f"{class_name} :")
                )
            _, header_path = write(
                header_code,
                "h",
                specified_dir=specified_output_path,
                key=f"{model_class_name}",
            )

        # Log the AOTInductor wrapper and kernel code, if needed.
        with tempfile.NamedTemporaryFile("w+") as t:
            t.writelines((wrapper_code, "\n", kernel_code, "\n"))

@@ -1689,6 +1725,8 @@ class AotCodeCompiler:
        generated_files.append(wrapper_path)
        if not config.aot_inductor.package_cpp_only:
            generated_files.append(kernel_path)
        if config.aot_inductor.compile_standalone:
            generated_files.append(header_path)

        output_code_log.info("Wrapper code written to: %s", wrapper_path)
        output_code_log.info("Kernel code written to: %s", kernel_path)

@@ -1710,6 +1748,17 @@ class AotCodeCompiler:
            },
            payload_fn=lambda: kernel_code,
        )
        if config.aot_inductor.compile_standalone:
            output_code_log.info("Header code written to: %s", header_path)
            trace_structured(
                "graph_dump",
                lambda: {
                    "name": "inductor_aot_header_code",
                    "type": "cpp",
                    "filename": header_path,
                },
                payload_fn=lambda: header_code,
            )

        # We use a file lock below to protect FS operations. The lock file
        # is scoped to the 'key', so make sure the consts_s is protected

@@ -22,7 +22,13 @@ from torch.utils._ordered_set import OrderedSet
from torch.utils._sympy.symbol import symbol_is_type, SymT

from .. import config, ir
from ..utils import _align, DeferredLineBase, LineContext, normalize_name
from ..utils import (
    _align,
    aoti_model_name_from_config,
    DeferredLineBase,
    LineContext,
    normalize_name,
)
from ..virtualized import V
from .aoti_hipify_utils import maybe_hipify_code_wrapper
from .common import get_device_op_overrides, IndentedBuffer, Kernel

@@ -58,6 +64,10 @@ class CppWrapperCpu(PythonWrapperCodegen):
        self.device = "cpu"
        # must be initialized prior to calling super().__init__()
        self.included_devices: OrderedSet[str] = OrderedSet()
        self.model_class_name_suffix = ""
        if config.aot_inductor.compile_standalone:
            self.model_class_name_suffix = aoti_model_name_from_config()
        self.aoti_model_class_name = f"AOTInductorModel{self.model_class_name_suffix}"
        super().__init__()
        self.declare = "auto "
        self.declare_maybe_reference = "decltype(auto) "

@@ -208,10 +218,16 @@ class CppWrapperCpu(PythonWrapperCodegen):
        self.add_device_include(self.device)

        if V.graph.aot_mode:
            with open(
                os.path.join(os.path.dirname(__file__), "aoti_runtime", "interface.cpp")
            ) as f:
                self.header.splice(f.read())
            if not config.aot_inductor.compile_standalone:
                with open(
                    os.path.join(
                        os.path.dirname(__file__), "aoti_runtime", "interface.cpp"
                    )
                ) as f:
                    self.header.splice(f.read())
            else:
                # we produce a separate model header for each model in static linkage
                self.header.splice(f"""#include \"{self.model_class_name_suffix}.h\"""")
        self.header.splice("\n")

        enable_kernel_profile = config.cpp.enable_kernel_profile and sys.platform in [

@@ -508,12 +524,12 @@ class CppWrapperCpu(PythonWrapperCodegen):

        if V.graph.is_const_graph:
            self.prefix.splice(
                """
                void AOTInductorModel::_const_run_impl(
                f"""
                void {self.aoti_model_class_name}::_const_run_impl(
                    std::vector<AtenTensorHandle>& output_handles,
                    DeviceStreamType stream,
                    AOTIProxyExecutorHandle proxy_executor
                ) {
                ) {{
                """
            )
        else:

@@ -521,18 +537,18 @@ class CppWrapperCpu(PythonWrapperCodegen):
            # If we do not split the constant graph, we'll just create
            # an empty implementation when wrapping the main module.
            self.prefix.splice(
                """
                void AOTInductorModel::_const_run_impl(
                f"""
                void {self.aoti_model_class_name}::_const_run_impl(
                    std::vector<AtenTensorHandle>& output_handles,
                    DeviceStreamType stream,
                    AOTIProxyExecutorHandle proxy_executor
                ) {}
                ) {{}}

                """
            )

        run_impl_proto = """
        void AOTInductorModel::run_impl(
        run_impl_proto = f"""
        void {self.aoti_model_class_name}::run_impl(
            AtenTensorHandle*
                input_handles, // array of input AtenTensorHandle; handles
                               // are stolen; the array itself is borrowed

@@ -542,7 +558,7 @@ class CppWrapperCpu(PythonWrapperCodegen):
                // borrowed
            DeviceStreamType stream,
            AOTIProxyExecutorHandle proxy_executor
        ) {
        ) {{
            __check_inputs_outputs(input_handles, output_handles);
        """

@@ -734,7 +750,7 @@ class CppWrapperCpu(PythonWrapperCodegen):
        )
        self.prefix.splice(
            f"""
            AOTInductorModel::AOTInductorModel(std::shared_ptr<ConstantMap> constants_map,
            {self.aoti_model_class_name}::{self.aoti_model_class_name}(std::shared_ptr<ConstantMap> constants_map,
                                               std::shared_ptr<std::vector<ConstantHandle>> constants_array,
                                               const std::string& device_str,
                                               std::optional<std::string> cubin_dir)

@@ -891,12 +907,12 @@ class CppWrapperCpu(PythonWrapperCodegen):
            """

        self.prefix.splice(
            """
            std::unordered_map<std::string, AtenTensorHandle> AOTInductorModel::const_run_impl(
            f"""
            std::unordered_map<std::string, AtenTensorHandle> {self.aoti_model_class_name}::const_run_impl(
                DeviceStreamType stream,
                AOTIProxyExecutorHandle proxy_executor,
                bool initialization
            ) {
            ) {{
            """
        )
        if not config.aot_inductor.use_runtime_constant_folding:

@@ -1079,7 +1095,7 @@ class CppWrapperCpu(PythonWrapperCodegen):
    def generate_before_suffix(self, result):
        if not V.graph.is_const_graph:
            if V.graph.aot_mode:
                result.writeline("} // AOTInductorModel::run_impl")
                result.writeline(f"}} // {self.aoti_model_class_name}::run_impl")
            else:
                result.writeline("} // inductor_entry_impl")

@@ -1087,7 +1103,7 @@ class CppWrapperCpu(PythonWrapperCodegen):
        """Generates the end of the code block, and any code needed to call it."""
        if V.graph.aot_mode:
            if V.graph.is_const_graph:
                result.writeline("} // AOTInductorModel::_const_run_impl")
                result.writeline(f"}} // {self.aoti_model_class_name}::_const_run_impl")
            else:
                result.writeline("} // namespace torch::aot_inductor\n\n\n")
            return

@@ -2407,6 +2407,10 @@ def compile_fx(
        )

    if V.aot_compilation:
        from .utils import is_valid_aoti_model_name

        is_valid_aoti_model_name()

    with functorch_config.patch(unlift_effect_tokens=True):
        gm, graph_signature = aot_export_module(
            model_,

@@ -1421,6 +1421,13 @@ class aot_inductor:

    # If not None, the generated files will use this name as the file stem.
    # If None, we will use a hash to name files.
    #
    # If package_cpp_only, this name is also used for the target name in CMakeLists.txt.
    # The default target name is "aoti_model".
    #
    # If compile_standalone, the aoti model class name is f"AOTInductorModel{name}".
    #
    # This name can only contain letters, numbers, and underscores.
    model_name_for_generated_files: Optional[str] = None

    # Custom ops that have implemented C shim wrappers, defined as an op to C shim declaration dict

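For illustration, a hedged sketch of how this option combines with compile_standalone; the expected artifact names follow the new test_compile_after_package_static test above, and `ep` is an exported program as in the earlier sketch:

```python
options = {
    "aot_inductor.compile_standalone": True,
    "aot_inductor.model_name_for_generated_files": "linear",
}
# With these settings the test above expects the CMake build to produce
# "liblinear.a" (the default target name would be "aoti_model"), and the
# generated header to declare a class named f"AOTInductorModel{model_name}".
# The name must match [a-zA-Z_][a-zA-Z0-9_]*; otherwise compilation raises
# "Invalid AOTI model name".
package_path = torch._inductor.aoti_compile_and_package(ep, inductor_configs=options)
```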
@@ -28,6 +28,7 @@ from torch._dynamo.utils import dynamo_timed
from torch._inductor import config, exc
from torch._inductor.cpu_vec_isa import invalid_vec_isa, VecISA
from torch._inductor.runtime.runtime_utils import cache_dir
from torch._inductor.utils import aoti_model_name_from_config
from torch.torch_version import TorchVersion

@@ -1505,6 +1506,7 @@ class CppBuilder:
        self._aot_mode: bool = False

        self._name = name
        self._target_name = aoti_model_name_from_config()

        # Code starts here; initialize internal variables first.
        self._build_option = BuildOption

@@ -1730,25 +1732,29 @@ class CppBuilder:
        """

        definitions = " ".join(self._build_option.get_definitions())
        target_library_type = (
            "STATIC" if config.aot_inductor.compile_standalone else "SHARED"
        )

        contents = textwrap.dedent(
            f"""
            cmake_minimum_required(VERSION 3.27 FATAL_ERROR)
            project(aoti_model LANGUAGES CXX)
            project({self._target_name} LANGUAGES CXX)
            set(CMAKE_CXX_STANDARD 17)

            # May need to point CMAKE_PREFIX_PATH to the right torch location
            find_package(Torch REQUIRED)

            # Set a shared library target
            add_library(aoti_model SHARED)
            add_library({self._target_name} {target_library_type})

            # Add macro definitions
            target_compile_definitions(aoti_model PRIVATE {definitions})
            target_compile_definitions({self._target_name} PRIVATE {definitions})

            # Add compile flags
            target_compile_options(aoti_model PRIVATE {self._cflags_args})
            target_compile_options({self._target_name} PRIVATE {self._cflags_args})
            # Backend specific flags
            target_compile_options(aoti_model PRIVATE {self._passthrough_parameters_args} -c)
            target_compile_options({self._target_name} PRIVATE {self._passthrough_parameters_args} -c)

            """
        )

@@ -1823,7 +1829,7 @@ class CppBuilder:
            # Remove the directory part of file_path
            src_path = "${CMAKE_CURRENT_SOURCE_DIR}/" + Path(src_path).name
        with open(cmake_path, "a") as f:
            f.write(f"target_sources(aoti_model PRIVATE {src_path})\n")
            f.write(f"target_sources({self._target_name} PRIVATE {src_path})\n")

    def save_kernel_asm_to_cmake(self, cmake_path: str, asm_files: list[str]) -> None:
        # TODO: make this work beyond CUDA

@@ -1837,9 +1843,9 @@ class CppBuilder:
            """
        )
        f.write(contents)
        f.write("add_dependencies(aoti_model ${KERNEL_TARGETS})\n")
        f.write(f"add_dependencies({self._target_name} ${{KERNEL_TARGETS}})\n")
        f.write(
            "target_link_libraries(aoti_model PRIVATE ${KERNEL_OBJECT_FILES})\n"
            f"target_link_libraries({self._target_name} PRIVATE ${{KERNEL_OBJECT_FILES}})\n"
        )

def save_link_cmd_to_cmake(self, cmake_path: str) -> None:
|
||||
@ -1848,10 +1854,10 @@ class CppBuilder:
|
||||
contents = textwrap.dedent(
|
||||
f"""
|
||||
# Add linker flags
|
||||
target_link_options(aoti_model PRIVATE {lflags})
|
||||
target_link_options({self._target_name} PRIVATE {lflags})
|
||||
|
||||
# Add libraries
|
||||
target_link_libraries(aoti_model PRIVATE {libs})
|
||||
target_link_libraries({self._target_name} PRIVATE {libs})
|
||||
"""
|
||||
)
|
||||
|
||||
|
@@ -3308,3 +3308,38 @@ def maybe_aoti_standalone_config(config_patches: dict[str, Any]) -> dict[str, Any]:
            "Please set aot_inductor.package_cpp_only=True in your inductor config."
        )
    return config_patches


def is_valid_aoti_model_name() -> bool:
    """
    Validates if a model name is suitable for use in code generation.
    """
    from torch._inductor import config

    model_name = config.aot_inductor.model_name_for_generated_files

    if model_name is None:
        return True

    if not isinstance(model_name, str):
        raise ValueError("Invalid AOTI model name: Model name must be a string")

    if model_name == "":
        return True

    # Can only contain alphanumeric characters and underscores
    if not re.match(r"^[a-zA-Z_][a-zA-Z0-9_]*$", model_name):
        raise ValueError(
            "Invalid AOTI model name: Model name can only contain letters, numbers, and underscores"
        )

    return True


def aoti_model_name_from_config() -> str:
    from torch._inductor import config

    model_name = config.aot_inductor.model_name_for_generated_files
    model_name = "aoti_model" if model_name is None else model_name
    return model_name