[AOTInductor] Use CudaCachingAllocator for memory allocation (#162893)

Summary: Use c10::CudaCachingAllocator for AOTInductor's initial constant buffer allocation. Test Plan: Activate test under test/cpp/aoti_inference/test.cpp Reviewers: Subscribers: Tasks: Tags: Pull Request resolved: https://github.com/pytorch/pytorch/pull/162893 Approved by: https://github.com/desertfire
2025-10-21 05:34:18 +08:00 · 2025-09-13 18:21:59 -07:00
parent 0e9f9c3a61
commit 2291199e9b
5 changed files with 69 additions and 4 deletions
--- a/test/cpp/aoti_inference/test.cpp
+++ b/test/cpp/aoti_inference/test.cpp
@ -879,12 +879,15 @@ void test_cuda_alloc_test() {
  if (cudaStatus != cudaSuccess || device_idx == -1) {
    throw std::runtime_error("cudaGetDevice failed!");
  }
  c10::cuda::CUDACachingAllocator::emptyCache();
  c10::cuda::CUDACachingAllocator::DeviceStats stats =
      c10::cuda::CUDACachingAllocator::getDeviceStats(device_idx);
-  size_t initTorchActive = stats.active_bytes[0].current;
+  size_t initTorchActive = stats.allocated_bytes[0].current;
  auto runner = std::make_unique<torch::inductor::AOTIModelContainerRunnerCuda>(
      model_so_path);
-  size_t torchActive = stats.active_bytes[0].current;
+  stats = c10::cuda::CUDACachingAllocator::getDeviceStats(device_idx);
  size_t torchActive = stats.allocated_bytes[0].current;
  ASSERT_EQ(initTorchActive + DATASIZE, torchActive);
@ -1113,8 +1116,7 @@ TEST(AotInductorTest, MultiStreamTestCuda) {
  test_multi_cuda_streams("cuda");
 }
-// TODO: ENABLE CUDACachingAllocator Test
+TEST(AotInductorTest, CudaAllocTestCuda) {
 TEST(DISABLED_AotInductorTest, CudaAllocTestCuda) {
  test_cuda_alloc_test();
 }
 #endif
--- a/torch/_inductor/cpp_builder.py
+++ b/torch/_inductor/cpp_builder.py
@ -1345,6 +1345,15 @@ def get_mmap_self_macro(use_mmap_weights: bool) -> list[str]:
    return macros
 def get_caching_allocator_macro() -> list[str]:
    from torch._inductor import config
    macros = []
    if config.aot_inductor.weight_use_caching_allocator:
        macros.append(" AOT_INDUCTOR_USE_CACHING_ALLOCATOR")
    return macros
 def get_cpp_torch_options(
    cpp_compiler: str,
    vec_isa: VecISA,
@ -1401,6 +1410,7 @@ def get_cpp_torch_options(
    fb_macro_passthrough_args = _use_fb_internal_macros()
    mmap_self_macros = get_mmap_self_macro(use_mmap_weights)
    caching_allocator_macros = get_caching_allocator_macro()
    definitions = (
        torch_cpp_wrapper_definitions
@ -1408,6 +1418,7 @@ def get_cpp_torch_options(
        + isa_macros
        + fb_macro_passthrough_args
        + mmap_self_macros
        + caching_allocator_macros
    )
    include_dirs = (
        sys_libs_include_dirs
--- a/torch/csrc/inductor/aoti_runtime/model_base.h
+++ b/torch/csrc/inductor/aoti_runtime/model_base.h
@ -325,10 +325,23 @@ using RAIIDataPtr = std::unique_ptr<void, std::function<void(void*)>>;
 // NOLINTNEXTLINE(clang-diagnostic-unneeded-internal-declaration)
 RAIIDataPtr RAII_gpuMalloc(size_t num_bytes) {
 #ifdef AOT_INDUCTOR_USE_CACHING_ALLOCATOR
  // Use caching allocator for allocating GPU memory
  void* data_ptr = nullptr;
  AOTI_TORCH_ERROR_CODE_CHECK(
      aoti_torch_cuda_caching_allocator_raw_alloc(num_bytes, &data_ptr));
  auto deleter = [](void* ptr) {
    AOTI_TORCH_ERROR_CODE_CHECK(
        aoti_torch_cuda_caching_allocator_raw_delete(ptr));
  };
  return RAIIDataPtr(data_ptr, deleter);
 #else
  // Use cudaMalloc directly for allocating GPU memory
  void* data_ptr = nullptr;
  AOTI_RUNTIME_CUDA_CHECK(cudaMalloc((void**)&data_ptr, num_bytes));
  auto deleter = [](void* ptr) { AOTI_RUNTIME_CUDA_CHECK(cudaFree(ptr)); };
  return RAIIDataPtr(data_ptr, deleter);
 #endif
 }
 #elif defined(USE_XPU)
--- a/torch/csrc/inductor/aoti_torch/c/shim.h
+++ b/torch/csrc/inductor/aoti_torch/c/shim.h
@ -579,6 +579,15 @@ aoti_torch_delete_cuda_stream_guard(CUDAStreamGuardHandle guard);
 AOTI_TORCH_EXPORT AOTITorchError
 aoti_torch_get_current_cuda_stream(int32_t device_index, void** ret_stream);
 // CUDA memory allocation using CUDACachingAllocator
 AOTI_TORCH_EXPORT AOTITorchError aoti_torch_cuda_caching_allocator_raw_alloc(
    uint64_t nbytes,
    void** ret_ptr // returns raw GPU memory pointer
 );
 AOTI_TORCH_EXPORT AOTITorchError
 aoti_torch_cuda_caching_allocator_raw_delete(void* ptr);
 #endif // USE_CUDA
 // See `ProxyExecutor Design Note` in ir.py for more details
--- a/torch/csrc/inductor/aoti_torch/shim_cuda.cpp
+++ b/torch/csrc/inductor/aoti_torch/shim_cuda.cpp
@ -2,6 +2,7 @@
 #include <torch/csrc/inductor/aoti_torch/c/shim.h>
 #include <torch/csrc/inductor/aoti_torch/utils.h>
 #include <c10/cuda/CUDACachingAllocator.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/cuda/CUDAStream.h>
@ -53,3 +54,32 @@ AOTITorchError aoti_torch_get_current_cuda_stream(
    *(cudaStream_t*)(ret_stream) = at::cuda::getCurrentCUDAStream(device_index);
  });
 }
 AOTITorchError aoti_torch_cuda_caching_allocator_raw_alloc(
    uint64_t nbytes,
    void** ret_ptr) {
  AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({
    if (nbytes == 0) {
      *ret_ptr = nullptr;
      return AOTI_TORCH_SUCCESS;
    }
    *ret_ptr = c10::cuda::CUDACachingAllocator::raw_alloc(nbytes);
    if (*ret_ptr == nullptr) {
      TORCH_CHECK(
          false,
          "Failed to allocate ",
          nbytes,
          " bytes from CUDA caching allocator");
    }
  });
 }
 AOTITorchError aoti_torch_cuda_caching_allocator_raw_delete(void* ptr) {
  AOTI_TORCH_CONVERT_EXCEPTION_TO_ERROR_CODE({
    if (ptr != nullptr) {
      c10::cuda::CUDACachingAllocator::raw_delete(ptr);
    }
  });
 }