[PyTorch Pinned Allocator] Add support for a background thread to process events (#135524)

Summary: Currently we process events on the regular allocation path, calling cudaEventQuery to check whether each outstanding event has completed, and this call can take locks inside the libcuda driver. Processing events on the allocation path is not strictly necessary: we can instead move this work to a background thread that keeps processing events at regular intervals and returns each freed block to the free list.
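For context, the pattern the summary describes looks roughly like the sketch below. This is a hand-written illustration, not the actual PyTorch CachingHostAllocator code; every name (PinnedEventProcessor, defer, pending_, free_list_, Block) is hypothetical, and only the CUDA runtime calls (cudaEventQuery, cudaEventDestroy) are real APIs.

// Illustrative sketch only; NOT the PyTorch implementation.
#include <cuda_runtime.h>
#include <atomic>
#include <chrono>
#include <deque>
#include <mutex>
#include <thread>
#include <utility>

struct Block;  // a pinned host-memory block; details omitted

class PinnedEventProcessor {
 public:
  PinnedEventProcessor() : worker_([this] { loop(); }) {}

  ~PinnedEventProcessor() {
    stop_.store(true);
    worker_.join();
  }

  // Called on the free path: record the (event, block) pair and hand it to
  // the background thread instead of querying events during allocation.
  void defer(cudaEvent_t event, Block* block) {
    std::lock_guard<std::mutex> g(mtx_);
    pending_.emplace_back(event, block);
  }

 private:
  void loop() {
    while (!stop_.load()) {
      {
        std::lock_guard<std::mutex> g(mtx_);
        // Events are recorded roughly in order, so retire from the front
        // and stop at the first one that has not completed yet.
        while (!pending_.empty()) {
          auto& [event, block] = pending_.front();
          if (cudaEventQuery(event) == cudaErrorNotReady) {
            break;
          }
          cudaEventDestroy(event);
          free_list_.push_back(block);  // block is now safe to reuse
          pending_.pop_front();
        }
      }
      // Poll periodically rather than on every allocation, keeping the
      // driver-lock traffic off the hot path.
      std::this_thread::sleep_for(std::chrono::microseconds(100));
    }
  }

  std::mutex mtx_;
  std::deque<std::pair<cudaEvent_t, Block*>> pending_;
  std::deque<Block*> free_list_;
  std::atomic<bool> stop_{false};
  std::thread worker_;
};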

Differential Revision: D62396585

Pull Request resolved: https://github.com/pytorch/pytorch/pull/135524
Approved by: https://github.com/zyan0
Author: Banit Agrawal
Date: 2024-09-17 21:08:10 +00:00
Committed by: PyTorch MergeBot
Parent: 48d18fbd4c
Commit: a575ce0dc6
5 changed files with 155 additions and 37 deletions


@@ -46,6 +46,10 @@ class C10_CUDA_API CUDAAllocatorConfig {
     return instance().m_pinned_num_register_threads;
   }
 
+  static bool pinned_use_background_threads() {
+    return instance().m_pinned_use_background_threads;
+  }
+
   static size_t pinned_max_register_threads() {
     // Based on the benchmark results, we see better allocation performance
     // with 8 threads. However on future systems, we may need more threads
@@ -113,6 +117,9 @@ class C10_CUDA_API CUDAAllocatorConfig {
   size_t parsePinnedNumRegisterThreads(
       const std::vector<std::string>& config,
       size_t i);
+  size_t parsePinnedUseBackgroundThreads(
+      const std::vector<std::string>& config,
+      size_t i);
 
   std::atomic<size_t> m_max_split_size;
   std::atomic<size_t> m_max_non_split_rounding_size;
@@ -122,6 +129,7 @@ class C10_CUDA_API CUDAAllocatorConfig {
   std::atomic<bool> m_expandable_segments;
   std::atomic<bool> m_release_lock_on_cudamalloc;
   std::atomic<bool> m_pinned_use_cuda_host_register;
+  std::atomic<bool> m_pinned_use_background_threads;
   std::string m_last_allocator_settings;
   std::mutex m_last_allocator_settings_mutex;
 };
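Given the parser hook and the accessor above, the feature should be opt-in through the usual allocator settings string. Assuming the config key matches the accessor name (as it does for the existing pinned_use_cuda_host_register option; check the PyTorch docs for the exact key), enabling it would look like:

  PYTORCH_CUDA_ALLOC_CONF=pinned_use_background_threads:True

Storing the flag in a std::atomic<bool> mirrors the neighboring options, so the allocation path can read it without taking the settings mutex.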