[AOTInductor] Use CUDACachingAllocator for memory allocation (#162893)

Summary:
Use c10::cuda::CUDACachingAllocator for AOTInductor's initial constant buffer
allocation.
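
The patch itself lands in the AOTInductor runtime; as a minimal sketch of the
idea (hypothetical helper names, assuming the constant blob was previously
obtained with a bare cudaMalloc), allocation moves to the caching allocator's
raw interface:

    #include <c10/cuda/CUDACachingAllocator.h>
    #include <cstddef>

    // Hypothetical helpers, not the actual AOTInductor code: route the
    // constant-buffer allocation through the caching allocator so it is
    // counted in allocator stats and can reuse pooled blocks.
    void* alloc_constant_blob(std::size_t nbytes) {
      // Before: cudaMalloc(&ptr, nbytes), which bypasses allocator bookkeeping.
      return c10::cuda::CUDACachingAllocator::raw_alloc(nbytes);
    }

    void free_constant_blob(void* ptr) {
      c10::cuda::CUDACachingAllocator::raw_delete(ptr);
    }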

Test Plan:
Re-enable the previously disabled CUDACachingAllocator test
(AotInductorTest.CudaAllocTestCuda) in test/cpp/aoti_inference/test.cpp.
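Once re-enabled, the test can be selected with a standard GoogleTest filter,
e.g. --gtest_filter=AotInductorTest.CudaAllocTestCuda (the exact test binary
name depends on how the aoti_inference tests are built).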

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162893
Approved by: https://github.com/desertfire
Author: Mu-Chu Lee
Date: 2025-09-13 18:21:59 -07:00
Committed by: PyTorch MergeBot
Commit: 2291199e9b (parent 0e9f9c3a61)
5 changed files with 69 additions and 4 deletions

test/cpp/aoti_inference/test.cpp

@@ -879,12 +879,15 @@ void test_cuda_alloc_test() {
   if (cudaStatus != cudaSuccess || device_idx == -1) {
     throw std::runtime_error("cudaGetDevice failed!");
   }
+  c10::cuda::CUDACachingAllocator::emptyCache();
   c10::cuda::CUDACachingAllocator::DeviceStats stats =
       c10::cuda::CUDACachingAllocator::getDeviceStats(device_idx);
-  size_t initTorchActive = stats.active_bytes[0].current;
+  size_t initTorchActive = stats.allocated_bytes[0].current;
   auto runner = std::make_unique<torch::inductor::AOTIModelContainerRunnerCuda>(
       model_so_path);
-  size_t torchActive = stats.active_bytes[0].current;
+  stats = c10::cuda::CUDACachingAllocator::getDeviceStats(device_idx);
+  size_t torchActive = stats.allocated_bytes[0].current;

   ASSERT_EQ(initTorchActive + DATASIZE, torchActive);
@@ -1113,8 +1116,7 @@ TEST(AotInductorTest, MultiStreamTestCuda) {
   test_multi_cuda_streams("cuda");
 }

-// TODO: ENABLE CUDACachingAllocator Test
-TEST(DISABLED_AotInductorTest, CudaAllocTestCuda) {
+TEST(AotInductorTest, CudaAllocTestCuda) {
   test_cuda_alloc_test();
 }
 #endif
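
A note on the test fix above: getDeviceStats() returns a DeviceStats snapshot
by value, so the old code that re-read torchActive from the same stats object
could never observe the runner's allocation; the fix queries the stats again
and switches from active_bytes to allocated_bytes. A self-contained sketch of
that measurement pattern (assuming device_idx is the current CUDA device;
index 0 is the aggregate stat bucket):

    #include <c10/cuda/CUDACachingAllocator.h>
    #include <cstdint>

    // Fresh snapshot each call: DeviceStats is returned by value, so a stale
    // copy never reflects later allocations.
    int64_t allocated_now(c10::DeviceIndex device_idx) {
      auto stats = c10::cuda::CUDACachingAllocator::getDeviceStats(device_idx);
      return stats.allocated_bytes[0].current;  // [0] == StatType::AGGREGATE
    }

    void measure_one_alloc(c10::DeviceIndex device_idx) {
      c10::cuda::CUDACachingAllocator::emptyCache();  // drop cached, unused blocks first
      int64_t before = allocated_now(device_idx);
      void* buf = c10::cuda::CUDACachingAllocator::raw_alloc(1 << 20);  // 1 MiB
      int64_t after = allocated_now(device_idx);  // must re-query; 'before' is stale
      // Expect after - before >= 1 MiB; the allocator may round block sizes up.
      c10::cuda::CUDACachingAllocator::raw_delete(buf);
    }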