mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
Update round_size behavior when roundup_power2_divisions is 1 (#162203)
Have round_size return the nearest power of 2 greater than or equal to the requested size when roundup_power2_divisions is 1. Fixes #161139. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162203 Approved by: https://github.com/ezyang
This commit is contained in:
committed by
PyTorch MergeBot
parent
65aa62d50d
commit
12d2ef557f
@ -2502,6 +2502,8 @@ class DeviceCachingAllocator {
|
||||
auto divisions = CUDAAllocatorConfig::roundup_power2_divisions(size);
|
||||
if (divisions > 1 && size > (kMinBlockSize * divisions)) {
|
||||
return roundup_power2_next_division(size, divisions);
|
||||
} else if (divisions == 1) {
|
||||
return llvm::PowerOf2Ceil(size);
|
||||
} else {
|
||||
return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize);
|
||||
}
|
||||
|
@ -4522,6 +4522,21 @@ class TestCudaMallocAsync(TestCase):
|
||||
reg_mem = torch.cuda.memory_stats()[key_allocated]
|
||||
self.assertEqual(reg_mem - start_mem, nbytes)
|
||||
|
||||
# Test division==1 case.
|
||||
torch.cuda.memory.empty_cache()
|
||||
div1_start_mem = torch.cuda.memory_stats()[key_allocated]
|
||||
div1_start_requested = torch.cuda.memory_stats()[key_requested]
|
||||
torch.cuda.memory._set_allocator_settings("roundup_power2_divisions:1")
|
||||
torch.rand(nelems, device="cuda")
|
||||
div1_end_mem = torch.cuda.memory_stats()[key_allocated]
|
||||
div1_end_requested = torch.cuda.memory_stats()[key_requested]
|
||||
|
||||
self.assertEqual(div1_start_mem - start_mem, nbytes)
|
||||
if not TEST_CUDAMALLOCASYNC:
|
||||
# not supported with the cudaMallocAsync backend
|
||||
self.assertEqual(div1_end_mem - div1_start_mem, power2_div(nbytes, 1))
|
||||
self.assertEqual(div1_end_requested - div1_start_requested, nbytes)
|
||||
|
||||
with self.assertRaises(RuntimeError):
|
||||
torch.cuda.memory._set_allocator_settings("foo:1,bar:2")
|
||||
|
||||
|
Reference in New Issue
Block a user