Re-land "Fix thread safety in getCurrentCUDABlasHandle and getCUDABlasLtWorkspace"

Summary: getCurrentCUDABlasHandle() and getCUDABlasLtWorkspace() use static mutable maps that are not protected from concurrent read-and-write. This leads to crashes. This diff adds mutexes to synchronize access to the static maps. Note: this is a re-land of D86316117 / https://github.com/pytorch/pytorch/pull/167248 Test Plan: Use a GPU OD, run multi-threaded tests (cuda_cublas_handle_pool_test) with TSAN: ``` buck test fbcode//mode/dev-tsan fbcode//caffe2:cuda_cublas_handle_pool_test -- --stress-runs 100 ``` https://www.internalfb.com/intern/testinfra/testrun/14355223937501118 TSAN output (before synchronization was added): P2026731804 Differential Revision: D86964261
2025-11-18 09:34:57 +08:00 · 2025-11-13 04:33:11 -08:00
4 changed files with 164 additions and 19 deletions
--- a/aten/src/ATen/cuda/CUDAContextLight.h
+++ b/aten/src/ATen/cuda/CUDAContextLight.h
@ -3,6 +3,7 @@

 #include <cstdint>
 #include <map>
+#include <shared_mutex>

 #include <cuda_runtime_api.h>
 #include <cusparse.h>
@ -88,8 +89,13 @@ TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle();
 TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle();

 TORCH_CUDA_CPP_API void clearCublasWorkspaces();
-TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace();
-TORCH_CUDA_CPP_API std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace();
+struct WorkspaceMapWithMutex {
+  std::map<std::tuple<void*, void*>, at::DataPtr> map;
+  std::shared_mutex mutex;
+};
+
+TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublas_handle_stream_to_workspace();
+TORCH_CUDA_CPP_API WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace();
 TORCH_CUDA_CPP_API size_t getChosenWorkspaceSize();
 TORCH_CUDA_CPP_API size_t getCUDABlasLtWorkspaceSize();
 TORCH_CUDA_CPP_API void* getCUDABlasLtWorkspace();
--- a/aten/src/ATen/cuda/CublasHandlePool.cpp
+++ b/aten/src/ATen/cuda/CublasHandlePool.cpp
@ -99,7 +99,7 @@ void destroyCublasHandle(cublasHandle_t handle) {
 //   - Comments of @soumith copied from cuDNN handle pool implementation
 #ifdef NO_CUDNN_DESTROY_HANDLE
 #else
-    cublasDestroy(handle);
+  cublasDestroy(handle);
 #endif
 }

@ -107,19 +107,27 @@ using CuBlasPoolType = DeviceThreadHandlePool<cublasHandle_t, createCublasHandle

 } // namespace

-std::map<std::tuple<void *, void *>, at::DataPtr>& cublas_handle_stream_to_workspace() {
-  static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
+WorkspaceMapWithMutex& cublas_handle_stream_to_workspace() {
+  static auto& instance = *new WorkspaceMapWithMutex;
  return instance;
 }

-std::map<std::tuple<void *, void *>, at::DataPtr>& cublaslt_handle_stream_to_workspace() {
-  static auto& instance = *new std::map<std::tuple<void *, void *>, at::DataPtr>;
+WorkspaceMapWithMutex& cublaslt_handle_stream_to_workspace() {
+  static auto& instance = *new WorkspaceMapWithMutex;
  return instance;
 }

 void clearCublasWorkspaces() {
-  cublas_handle_stream_to_workspace().clear();
-  cublaslt_handle_stream_to_workspace().clear();
+  {
+    auto& workspace = cublas_handle_stream_to_workspace();
+    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
+    workspace.map.clear();
+  }
+  {
+    auto& workspace = cublaslt_handle_stream_to_workspace();
+    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
+    workspace.map.clear();
+  }
 }

 size_t parseChosenWorkspaceSize() {
@ -241,8 +249,10 @@ void* getCUDABlasLtWorkspace() {
    auto stream = c10::cuda::getCurrentCUDAStream();
    cudaStream_t _stream = stream;
    auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
-    auto workspace_it = at::cuda::cublas_handle_stream_to_workspace().find(key);
-    TORCH_INTERNAL_ASSERT(workspace_it != at::cuda::cublas_handle_stream_to_workspace().end());
+    auto& workspace = at::cuda::cublas_handle_stream_to_workspace();
+    std::shared_lock<std::shared_mutex> lock(workspace.mutex);
+    auto workspace_it = workspace.map.find(key);
+    TORCH_INTERNAL_ASSERT(workspace_it != workspace.map.end());
    return workspace_it->second.mutable_get();
  }
 #endif
@ -250,11 +260,34 @@ void* getCUDABlasLtWorkspace() {
  auto stream = c10::cuda::getCurrentCUDAStream();
  cudaStream_t _stream = stream;
  auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
-  auto workspace_it = cublaslt_handle_stream_to_workspace().find(key);
-  if (workspace_it == cublaslt_handle_stream_to_workspace().end()) {
-    workspace_it = cublaslt_handle_stream_to_workspace().insert(workspace_it, {key, getNewCUDABlasLtWorkspace()});
+
+  auto& workspace = cublaslt_handle_stream_to_workspace();
+
+  // Fast path: check if workspace already exists
+  {
+    std::shared_lock<std::shared_mutex> lock(workspace.mutex);
+    auto workspace_it = workspace.map.find(key);
+    if (workspace_it != workspace.map.end()) {
+      return workspace_it->second.mutable_get();
+    }
+  }
+
+  // Slow path: allocate workspace outside the lock
+  auto new_workspace = getNewCUDABlasLtWorkspace();
+
+  // Insert with lock (double-check in case another thread inserted while we
+  // were allocating)
+  {
+    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
+    auto workspace_it = workspace.map.find(key);
+    if (workspace_it == workspace.map.end()) {
+      workspace_it =
+          workspace.map.emplace(key, std::move(new_workspace)).first;
+    }
+    // else: another thread inserted it, our new_workspace will be automatically
+    // freed
+    return workspace_it->second.mutable_get();
  }
-  return workspace_it->second.mutable_get();
 }

 cublasHandle_t getCurrentCUDABlasHandle() {
@ -300,11 +333,39 @@ cublasHandle_t getCurrentCUDABlasHandle() {
  // all the memory and cublas's cudaMallocAsync will return OOM
  cudaStream_t _stream = stream;
  auto key = std::make_tuple(static_cast<void *>(handle), static_cast<void *>(_stream));
-  auto workspace_it = cublas_handle_stream_to_workspace().find(key);
-  if (workspace_it == cublas_handle_stream_to_workspace().end()) {
-    workspace_it = cublas_handle_stream_to_workspace().insert(workspace_it, {key, getNewWorkspace()});
+
+  auto& workspace = cublas_handle_stream_to_workspace();
+
+  size_t workspace_size = getChosenWorkspaceSize();
+
+  // Fast path: check if workspace already exists
+  {
+    std::shared_lock<std::shared_mutex> lock(workspace.mutex);
+    auto workspace_it = workspace.map.find(key);
+    if (workspace_it != workspace.map.end()) {
+      TORCH_CUDABLAS_CHECK(cublasSetWorkspace(
+          handle, workspace_it->second.get(), workspace_size));
+      return handle;
+    }
+  }
+
+  // Slow path: allocate workspace outside the lock
+  auto new_workspace = getNewWorkspace();
+
+  // Insert with lock (double-check in case another thread inserted while we
+  // were allocating)
+  {
+    std::unique_lock<std::shared_mutex> lock(workspace.mutex);
+    auto workspace_it = workspace.map.find(key);
+    if (workspace_it == workspace.map.end()) {
+      workspace_it =
+          workspace.map.emplace(key, std::move(new_workspace)).first;
+    }
+    // else: another thread inserted it, our new_workspace will be automatically
+    // freed
+    TORCH_CUDABLAS_CHECK(
+        cublasSetWorkspace(handle, workspace_it->second.get(), workspace_size));
  }
-  TORCH_CUDABLAS_CHECK(cublasSetWorkspace(handle, workspace_it->second.get(), getChosenWorkspaceSize()));
 #if !defined(USE_ROCM)
  // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup
  // FP32 data type calculations based on the value of the allow_tf32 flag.
--- a/aten/src/ATen/test/CMakeLists.txt
+++ b/aten/src/ATen/test/CMakeLists.txt
@ -61,6 +61,7 @@ list(APPEND ATen_CUDA_TEST_SRCS
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_math_test.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_complex_test.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_cub_test.cu
+  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_cublas_handle_pool_test.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_device_test.cpp
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_distributions_test.cu
  ${CMAKE_CURRENT_SOURCE_DIR}/cuda_dlconvertor_test.cpp
--- a/aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp
+++ b/aten/src/ATen/test/cuda_cublas_handle_pool_test.cpp
@ -0,0 +1,77 @@
+#include <gtest/gtest.h>
+
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDACachingAllocator.h>
+#include <c10/cuda/CUDAGuard.h>
+
+#include <atomic>
+#include <thread>
+#include <vector>
+
+// Test concurrent access to getCurrentCUDABlasHandle and getCUDABlasLtWorkspace
+// to verify that the data race fix is working correctly
+
+TEST(CUDABlasHandlePoolTest, ConcurrentGetAndClearWorkspaces) {
+  if (!at::cuda::is_available()) {
+    return;
+  }
+
+  constexpr int num_accessor_threads = 15;
+  constexpr int num_clear_threads = 5;
+  constexpr int iterations_per_thread = 50;
+
+  std::atomic<bool> stop{false};
+  std::atomic<int> error_count{0};
+  std::vector<std::thread> threads;
+  threads.reserve(num_accessor_threads + num_clear_threads);
+
+  // Launch accessor threads
+  for (int i = 0; i < num_accessor_threads; ++i) {
+    threads.emplace_back([&stop, &error_count]() {
+      try {
+        at::cuda::CUDAGuard device_guard(0);
+
+        while (!stop.load(std::memory_order_relaxed)) {
+          const auto handle = at::cuda::getCurrentCUDABlasHandle();
+          const auto workspace = at::cuda::getCUDABlasLtWorkspace();
+
+          if (handle == nullptr || workspace == nullptr) {
+            error_count++;
+          }
+        }
+      } catch (const std::exception& e) {
+        error_count++;
+      }
+    });
+  }
+
+  // Launch threads that clear workspaces
+  for (int i = 0; i < num_clear_threads; ++i) {
+    threads.emplace_back([&error_count]() {
+      try {
+        for (int j = 0; j < iterations_per_thread; ++j) {
+          at::cuda::clearCublasWorkspaces();
+          std::this_thread::yield();
+        }
+      } catch (const std::exception& e) {
+        error_count++;
+      }
+    });
+  }
+
+  // Let them run for a bit
+  std::this_thread::sleep_for(std::chrono::milliseconds(100));
+  stop.store(true, std::memory_order_relaxed);
+
+  for (auto& thread : threads) {
+    thread.join();
+  }
+
+  EXPECT_EQ(error_count.load(), 0);
+}
+
+int main(int argc, char* argv[]) {
+  ::testing::InitGoogleTest(&argc, argv);
+  c10::cuda::CUDACachingAllocator::init(1);
+  return RUN_ALL_TESTS();
+}