From 12d2ef557f6e127100267c31a31572d8ab5cc788 Mon Sep 17 00:00:00 2001
From: morrison-turnansky <mturnans@redhat.com>
Date: Wed, 8 Oct 2025 06:41:42 +0000
Subject: [PATCH] Update round size with 1 division behavior (#162203)

Make round_size return the smallest power of 2 greater than or equal to the requested size when roundup_power2_divisions is set to 1.

Fixes #161139

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162203
Approved by: https://github.com/ezyang
---
 c10/cuda/CUDACachingAllocator.cpp |  2 ++
 test/test_cuda.py                 | 15 +++++++++++++++
 2 files changed, 17 insertions(+)

diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index 7ef2947e15e6..2ce4761d55b3 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -2502,6 +2502,8 @@ class DeviceCachingAllocator {
       auto divisions = CUDAAllocatorConfig::roundup_power2_divisions(size);
       if (divisions > 1 && size > (kMinBlockSize * divisions)) {
         return roundup_power2_next_division(size, divisions);
+      } else if (divisions == 1) {
+        return llvm::PowerOf2Ceil(size);
       } else {
         return kMinBlockSize * ((size + kMinBlockSize - 1) / kMinBlockSize);
       }
diff --git a/test/test_cuda.py b/test/test_cuda.py
index 34f9cb6e8f71..74cfdec2e904 100644
--- a/test/test_cuda.py
+++ b/test/test_cuda.py
@@ -4522,6 +4522,21 @@ class TestCudaMallocAsync(TestCase):
         reg_mem = torch.cuda.memory_stats()[key_allocated]
         self.assertEqual(reg_mem - start_mem, nbytes)
 
+        # Test the roundup_power2_divisions:1 case (sizes round up to a power of 2).
+        torch.cuda.memory.empty_cache()
+        div1_start_mem = torch.cuda.memory_stats()[key_allocated]
+        div1_start_requested = torch.cuda.memory_stats()[key_requested]
+        torch.cuda.memory._set_allocator_settings("roundup_power2_divisions:1")
+        torch.rand(nelems, device="cuda")
+        div1_end_mem = torch.cuda.memory_stats()[key_allocated]
+        div1_end_requested = torch.cuda.memory_stats()[key_requested]
+
+        self.assertEqual(div1_start_mem - start_mem, nbytes)
+        if not TEST_CUDAMALLOCASYNC:
+            # not supported with the cudaMallocAsync backend
+            self.assertEqual(div1_end_mem - div1_start_mem, power2_div(nbytes, 1))
+            self.assertEqual(div1_end_requested - div1_start_requested, nbytes)
+
         with self.assertRaises(RuntimeError):
             torch.cuda.memory._set_allocator_settings("foo:1,bar:2")