diff --git a/c10/cuda/CUDAAllocatorConfig.cpp b/c10/cuda/CUDAAllocatorConfig.cpp
index 19aedb2cbb02..7b410bdd6ef5 100644
--- a/c10/cuda/CUDAAllocatorConfig.cpp
+++ b/c10/cuda/CUDAAllocatorConfig.cpp
@@ -12,6 +12,7 @@ constexpr size_t kRoundUpPowerOfTwoIntervals = 16;
 
 CUDAAllocatorConfig::CUDAAllocatorConfig()
     : m_max_split_size(std::numeric_limits<size_t>::max()),
+      m_max_non_split_rounding_size(kLargeBuffer),
       m_garbage_collection_threshold(0),
       m_pinned_num_register_threads(1),
       m_expandable_segments(false),
@@ -94,6 +95,27 @@ size_t CUDAAllocatorConfig::parseMaxSplitSize(
   return i;
 }
 
+size_t CUDAAllocatorConfig::parseMaxNonSplitRoundingSize(
+    const std::vector<std::string>& config,
+    size_t i) {
+  consumeToken(config, ++i, ':');
+  constexpr int mb = 1024 * 1024;
+  if (++i < config.size()) {
+    size_t val1 = stoi(config[i]);
+    TORCH_CHECK(
+        val1 > kLargeBuffer / mb,
+        "CachingAllocator option max_non_split_rounding_mb too small, must be > ",
+        kLargeBuffer / mb,
+        "");
+    val1 = std::max(val1, kLargeBuffer / mb);
+    val1 = std::min(val1, (std::numeric_limits<size_t>::max() / mb));
+    m_max_non_split_rounding_size = val1 * 1024 * 1024;
+  } else {
+    TORCH_CHECK(false, "Error, expecting max_non_split_rounding_mb value", "");
+  }
+  return i;
+}
+
 size_t CUDAAllocatorConfig::parseGarbageCollectionThreshold(
     const std::vector<std::string>& config,
     size_t i) {
@@ -258,6 +280,9 @@ void CUDAAllocatorConfig::parseArgs(const char* env) {
     if (config_item_view == "max_split_size_mb") {
       i = parseMaxSplitSize(config, i);
       used_native_specific_option = true;
+    } else if (config_item_view == "max_non_split_rounding_mb") {
+      i = parseMaxNonSplitRoundingSize(config, i);
+      used_native_specific_option = true;
     } else if (config_item_view == "garbage_collection_threshold") {
       i = parseGarbageCollectionThreshold(config, i);
       used_native_specific_option = true;
diff --git a/c10/cuda/CUDAAllocatorConfig.h b/c10/cuda/CUDAAllocatorConfig.h
index 3106fc1b46ba..38adc4732e3d 100644
--- a/c10/cuda/CUDAAllocatorConfig.h
+++ b/c10/cuda/CUDAAllocatorConfig.h
@@ -63,6 +63,10 @@ class C10_CUDA_API CUDAAllocatorConfig {
     return instance().m_roundup_power2_divisions;
   }
 
+  static size_t max_non_split_rounding_size() {
+    return instance().m_max_non_split_rounding_size;
+  }
+
   static std::string last_allocator_settings() {
     std::lock_guard<std::mutex> lock(
         instance().m_last_allocator_settings_mutex);
@@ -90,6 +94,9 @@ class C10_CUDA_API CUDAAllocatorConfig {
       size_t i,
       const char c);
   size_t parseMaxSplitSize(const std::vector<std::string>& config, size_t i);
+  size_t parseMaxNonSplitRoundingSize(
+      const std::vector<std::string>& config,
+      size_t i);
   size_t parseGarbageCollectionThreshold(
       const std::vector<std::string>& config,
       size_t i);
@@ -108,6 +115,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
       size_t i);
 
   std::atomic<size_t> m_max_split_size;
+  std::atomic<size_t> m_max_non_split_rounding_size;
   std::vector<size_t> m_roundup_power2_divisions;
   std::atomic<double> m_garbage_collection_threshold;
   std::atomic<size_t> m_pinned_num_register_threads;
diff --git a/c10/cuda/CUDACachingAllocator.cpp b/c10/cuda/CUDACachingAllocator.cpp
index 3da3c6d4f5d0..a67a720717bb 100644
--- a/c10/cuda/CUDACachingAllocator.cpp
+++ b/c10/cuda/CUDACachingAllocator.cpp
@@ -2527,7 +2527,8 @@ class DeviceCachingAllocator {
       return false;
     // Allow oversized block size to be rounded up but within a limit
     if ((p.size() >= CUDAAllocatorConfig::max_split_size()) &&
-        ((*it)->size >= p.size() + kLargeBuffer))
+        ((*it)->size >=
+         p.size() + CUDAAllocatorConfig::max_non_split_rounding_size()))
      return false;
     p.block = *it;
     pool.blocks.erase(it);
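The hunk above only replaces the hard-coded kLargeBuffer limit in the free-block lookup with the new configurable one. As an editor's illustration (not PyTorch code), the sketch below restates that reuse rule in isolation: the function name, the standalone structure, and the omission of the max_split_size_mb precondition are my simplifications, and the 20MiB default is the one quoted in the docs change below.

// Standalone sketch of the non-split rounding rule: a cached block may serve
// a smaller request only while the wasted "rounding" stays below the limit.
#include <cstddef>
#include <cstdio>

constexpr std::size_t MiB = 1024 * 1024;

bool fits_within_rounding_limit(
    std::size_t cached_block_size,
    std::size_t requested_size,
    std::size_t max_non_split_rounding) {
  // Accept only blocks at least as large as the request but less than
  // request + limit; larger blocks are rejected (kept for bigger requests).
  return cached_block_size >= requested_size &&
      cached_block_size < requested_size + max_non_split_rounding;
}

int main() {
  const std::size_t request = 512 * MiB;
  // Default limit (kLargeBuffer, 20MiB): a 1024MiB cached block is rejected.
  std::printf(
      "%d\n", fits_within_rounding_limit(1024 * MiB, request, 20 * MiB));
  // With max_non_split_rounding_mb:1024, the same 1024MiB block is accepted.
  std::printf(
      "%d\n", fits_within_rounding_limit(1024 * MiB, request, 1024 * MiB));
  return 0;
}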
diff --git a/docs/source/notes/cuda.rst b/docs/source/notes/cuda.rst
index 7d434bbbba64..c0b82adc7e07 100644
--- a/docs/source/notes/cuda.rst
+++ b/docs/source/notes/cuda.rst
@@ -471,6 +471,13 @@ Available options:
   set the knob value to: [256:1,512:2,1024:4,>:8].
   ``roundup_power2_divisions`` is only meaningful with ``backend:native``.
   With ``backend:cudaMallocAsync``, ``roundup_power2_divisions`` is ignored.
+* ``max_non_split_rounding_mb`` allows oversized non-split blocks to be rounded up
+  for better reuse, e.g., a 1024MB cached block can serve a 512MB allocation request.
+  By default we only allow up to 20MB of rounding for non-split blocks, so a 512MB
+  request can only be served by a cached block of 512-532MB. If this option is set
+  to 1024, blocks of 512-1536MB can be used for a 512MB request, which increases
+  reuse of larger blocks. This also helps reduce stalls by avoiding expensive
+  cudaMalloc calls.
 * ``garbage_collection_threshold`` helps actively reclaiming unused GPU memory to
   avoid triggering expensive sync-and-reclaim-all operation (release_cached_blocks),
   which can be unfavorable to latency-critical GPU applications (e.g., servers).
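Usage note: like the other native-allocator options listed in this section of cuda.rst, the new knob is read from the allocator settings string parsed above, so it would be enabled with something like ``PYTORCH_CUDA_ALLOC_CONF=max_non_split_rounding_mb:1024``; the option name comes from the parser in this diff, and the environment variable is the one this doc section already covers.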