[AOTInductor] Use CUDACachingAllocator for memory allocation (#162893)

Summary:
Use c10::cuda::CUDACachingAllocator for AOTInductor's initial constant buffer
allocation.
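
The patch itself lands in the AOTInductor runtime; as a minimal sketch of the
idea (hypothetical helper names, assuming the constant blob was previously
obtained with a bare cudaMalloc), allocation moves to the caching allocator's
raw interface:

    #include <c10/cuda/CUDACachingAllocator.h>
    #include <cstddef>

    // Hypothetical helpers, not the actual AOTInductor code: route the
    // constant-buffer allocation through the caching allocator so it is
    // counted in allocator stats and can reuse pooled blocks.
    void* alloc_constant_blob(std::size_t nbytes) {
      // Before: cudaMalloc(&ptr, nbytes), which bypasses allocator bookkeeping.
      return c10::cuda::CUDACachingAllocator::raw_alloc(nbytes);
    }

    void free_constant_blob(void* ptr) {
      c10::cuda::CUDACachingAllocator::raw_delete(ptr);
    }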

Test Plan:
Re-enable the previously disabled CUDACachingAllocator test
(AotInductorTest.CudaAllocTestCuda) in test/cpp/aoti_inference/test.cpp.
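Once re-enabled, the test can be selected with a standard GoogleTest filter,
e.g. --gtest_filter=AotInductorTest.CudaAllocTestCuda (the exact test binary
name depends on how the aoti_inference tests are built).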

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162893
Approved by: https://github.com/desertfire
Author: Mu-Chu Lee
Date: 2025-09-13 18:21:59 -07:00
Committed by: PyTorch MergeBot
Commit: 2291199e9b (parent 0e9f9c3a61)
5 changed files with 69 additions and 4 deletions

test/cpp/aoti_inference/test.cpp

@@ -879,12 +879,15 @@ void test_cuda_alloc_test() {
   if (cudaStatus != cudaSuccess || device_idx == -1) {
     throw std::runtime_error("cudaGetDevice failed!");
   }
+  c10::cuda::CUDACachingAllocator::emptyCache();
   c10::cuda::CUDACachingAllocator::DeviceStats stats =
       c10::cuda::CUDACachingAllocator::getDeviceStats(device_idx);
-  size_t initTorchActive = stats.active_bytes[0].current;
+  size_t initTorchActive = stats.allocated_bytes[0].current;
   auto runner = std::make_unique<torch::inductor::AOTIModelContainerRunnerCuda>(
       model_so_path);
-  size_t torchActive = stats.active_bytes[0].current;
+  stats = c10::cuda::CUDACachingAllocator::getDeviceStats(device_idx);
+  size_t torchActive = stats.allocated_bytes[0].current;

   ASSERT_EQ(initTorchActive + DATASIZE, torchActive);
@@ -1113,8 +1116,7 @@ TEST(AotInductorTest, MultiStreamTestCuda) {
   test_multi_cuda_streams("cuda");
 }

-// TODO: ENABLE CUDACachingAllocator Test
-TEST(DISABLED_AotInductorTest, CudaAllocTestCuda) {
+TEST(AotInductorTest, CudaAllocTestCuda) {
   test_cuda_alloc_test();
 }
 #endif
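
A note on the test fix above: getDeviceStats() returns a DeviceStats snapshot
by value, so the old code that re-read torchActive from the same stats object
could never observe the runner's allocation; the fix queries the stats again
and switches from active_bytes to allocated_bytes. A self-contained sketch of
that measurement pattern (assuming device_idx is the current CUDA device;
index 0 is the aggregate stat bucket):

    #include <c10/cuda/CUDACachingAllocator.h>
    #include <cstdint>

    // Fresh snapshot each call: DeviceStats is returned by value, so a stale
    // copy never reflects later allocations.
    int64_t allocated_now(c10::DeviceIndex device_idx) {
      auto stats = c10::cuda::CUDACachingAllocator::getDeviceStats(device_idx);
      return stats.allocated_bytes[0].current;  // [0] == StatType::AGGREGATE
    }

    void measure_one_alloc(c10::DeviceIndex device_idx) {
      c10::cuda::CUDACachingAllocator::emptyCache();  // drop cached, unused blocks first
      int64_t before = allocated_now(device_idx);
      void* buf = c10::cuda::CUDACachingAllocator::raw_alloc(1 << 20);  // 1 MiB
      int64_t after = allocated_now(device_idx);  // must re-query; 'before' is stale
      // Expect after - before >= 1 MiB; the allocator may round block sizes up.
      c10::cuda::CUDACachingAllocator::raw_delete(buf);
    }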