From 12d2ef557f6e127100267c31a31572d8ab5cc788 Mon Sep 17 00:00:00 2001 From: morrison-turnansky Date: Wed, 8 Oct 2025 06:41:42 +0000 Subject: [PATCH] Update round size with 1 division behavior (#162203) Make round size return the smallest power of 2 greater than or equal to size when roundup_power2_divisions is 1. Fixes #161139 Pull Request resolved: https://github.com/pytorch/pytorch/pull/162203 Approved by: https://github.com/ezyang --- c10/cuda/CUDACachingAllocator.cpp | 2 ++ test/test_cuda.py | 15 +++++++++++++++ 2 files changed, 17 insertions(+) diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp index 7ef2947e15e6..2ce4761d55b3 100644 --- a/c10/cuda/CUDACachingAllocator.cpp +++ b/c10/cuda/CUDACachingAllocator.cpp @@ -2502,6 +2502,8 @@ class DeviceCachingAllocator { auto divisions = CUDAAllocatorConfig::roundup_power2_divisions(size); if (divisions > 1 && size > (kMinBlockSize * divisions)) { return roundup_power2_next_division(size, divisions); + } else if (divisions == 1) { + return llvm::PowerOf2Ceil(size); } else { return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize); } diff --git a/test/test_cuda.py b/test/test_cuda.py index 34f9cb6e8f71..74cfdec2e904 100644 --- a/test/test_cuda.py +++ b/test/test_cuda.py @@ -4522,6 +4522,21 @@ class TestCudaMallocAsync(TestCase): reg_mem = torch.cuda.memory_stats()[key_allocated] self.assertEqual(reg_mem - start_mem, nbytes) + # Test division==1 case. 
+ torch.cuda.memory.empty_cache() + div1_start_mem = torch.cuda.memory_stats()[key_allocated] + div1_start_requested = torch.cuda.memory_stats()[key_requested] + torch.cuda.memory._set_allocator_settings("roundup_power2_divisions:1") + torch.rand(nelems, device="cuda") + div1_end_mem = torch.cuda.memory_stats()[key_allocated] + div1_end_requested = torch.cuda.memory_stats()[key_requested] + + self.assertEqual(div1_start_mem - start_mem, nbytes) + if not TEST_CUDAMALLOCASYNC: + # not supported with the cudaMallocAsync backend + self.assertEqual(div1_end_mem - div1_start_mem, power2_div(nbytes, 1)) + self.assertEqual(div1_end_requested - div1_start_requested, nbytes) + with self.assertRaises(RuntimeError): torch.cuda.memory._set_allocator_settings("foo:1,bar:2")