From 0254646654d1875b8d3fdd55d9ea720c9b7115c4 Mon Sep 17 00:00:00 2001
From: Natalia Gimelshein
Date: Mon, 18 Aug 2025 22:35:46 +0000
Subject: [PATCH] harden fabric checks for symmetric memory (#160790)

Currently we check only that the fabric allocation succeeds, but
sometimes export or import fails afterwards, with no recourse. Check the
full cycle before attempting to allocate memory with the fabric.

TODO: move it to c10/cuda so that it can be used from
CUDACachingAllocator too

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160790
Approved by: https://github.com/Skylion007
---
 aten/src/ATen/cuda/PeerToPeerAccess.cpp    | 135 +++++++++++++++++-
 aten/src/ATen/cuda/PeerToPeerAccess.h      |   1 +
 c10/cuda/driver_api.cpp                    |   7 +
 c10/cuda/driver_api.h                      |   3 +
 caffe2/CMakeLists.txt                      |   5 +
 .../c10d/symm_mem/CUDASymmetricMemory.cu   |  32 +----
 6 files changed, 150 insertions(+), 33 deletions(-)

diff --git a/aten/src/ATen/cuda/PeerToPeerAccess.cpp b/aten/src/ATen/cuda/PeerToPeerAccess.cpp
index 91b487cd9c83..66a75db6ea06 100644
--- a/aten/src/ATen/cuda/PeerToPeerAccess.cpp
+++ b/aten/src/ATen/cuda/PeerToPeerAccess.cpp
@@ -4,6 +4,9 @@
 #include <ATen/Context.h>
 #include <ATen/cuda/CUDAContext.h>
 
+#if !defined(USE_ROCM) && defined(PYTORCH_C10_DRIVER_API_SUPPORTED)
+#include <c10/cuda/driver_api.h>
+#endif
 #include <c10/util/Exception.h>
 #include <c10/util/irange.h>
 
@@ -12,6 +15,7 @@
 namespace at::cuda {
 
 static std::vector<int8_t> p2pAccessEnabled_;
+static std::vector<int8_t> fabricAccessEnabled_;
 static int64_t num_devices_ = -1;
 
 namespace detail {
@@ -29,20 +33,23 @@ void init_p2p_access_cache(int64_t num_devices) {
   for (const auto i : c10::irange(num_devices)) {
     p2pAccessEnabled_[i * num_devices + i] = 1;
   }
+  fabricAccessEnabled_.clear();
+  fabricAccessEnabled_.resize(num_devices, -1);
 }
 
-}  // namespace detail
+} // namespace detail
 
 bool get_p2p_access(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) {
   at::globalContext().lazyInitDevice(c10::DeviceType::CUDA);
 
-  TORCH_CHECK(dev >= 0 || dev < num_devices_,
-              dev, " is not a device");
-  TORCH_CHECK(dev_to_access >= 0 || dev_to_access < num_devices_,
-              dev_to_access, " is not a device");
+  TORCH_CHECK(dev >= 0 && dev < num_devices_, dev, " is not a device");
+  TORCH_CHECK(
+      dev_to_access >= 0 && dev_to_access < num_devices_,
+      dev_to_access,
+      " is not a device");
   TORCH_INTERNAL_ASSERT(num_devices_ >= 0, "p2p access cache not initialized");
 
-  auto &cache = p2pAccessEnabled_[dev * num_devices_ + dev_to_access];
+  auto& cache = p2pAccessEnabled_[dev * num_devices_ + dev_to_access];
 
   if (cache != -1) {
     return cache;
@@ -58,4 +65,118 @@ bool get_p2p_access(c10::DeviceIndex dev, c10::DeviceIndex dev_to_access) {
   return cache;
 }
 
-}  // namespace at::cuda::detail
+namespace {
+#if !defined USE_ROCM && defined CUDA_VERSION && CUDA_VERSION >= 12040 && defined PYTORCH_C10_DRIVER_API_SUPPORTED
+
+// Resolves the NVML handle for `dev` by formatting the PCI bus id from the
+// CUDA device properties; NVML is lazily initialized on first use.
+nvmlDevice_t get_nvml_device(c10::DeviceIndex dev) {
+  static bool nvml_init [[maybe_unused]] = []() {
+    TORCH_INTERNAL_ASSERT(NVML_SUCCESS == DriverAPI::get()->nvmlInit_v2_());
+    return true;
+  }();
+
+  auto prop = at::cuda::getDeviceProperties(dev);
+  char pci_id // NOLINT(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays)
+      [NVML_DEVICE_PCI_BUS_ID_BUFFER_SIZE];
+  snprintf(
+      pci_id,
+      sizeof(pci_id),
+      NVML_DEVICE_PCI_BUS_ID_FMT,
+      prop->pciDomainID,
+      prop->pciBusID,
+      prop->pciDeviceID);
+
+  nvmlDevice_t nvml_device = nullptr;
+  TORCH_INTERNAL_ASSERT(
+      NVML_SUCCESS ==
+      DriverAPI::get()->nvmlDeviceGetHandleByPciBusId_v2_(
+          pci_id, &nvml_device));
+  return nvml_device;
+}
+
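+// Probe the full fabric handle lifecycle: allocate a small piece of
+// physical memory with a FABRIC handle type, export it to a shareable
+// handle, and import it back. Allocation alone succeeding is not enough;
+// on some systems export or import fails afterwards, so every step is
+// verified before fabric handles are chosen for real allocations.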
+bool isFabricSupported() {
+  // 1. try allocating memory
+  CUmemGenericAllocationHandle handle = 0;
+  CUmemAllocationProp prop = {};
+  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
+  prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;
+  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
+
+  size_t granularity{};
+  const auto driver_api = c10::cuda::DriverAPI::get();
+  C10_CUDA_DRIVER_CHECK(driver_api->cuMemGetAllocationGranularity_(
+      &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
+
+  auto status = driver_api->cuMemCreate_(&handle, granularity, &prop, 0);
+  if (status != CUDA_SUCCESS) {
+    LOG(INFO)
+        << "status " << status
+        << " Could not allocate memory with FABRIC handle, falling back to fd handle exchange\n";
+    return false;
+  }
+  // 2. check export
+  CUmemFabricHandle sharedHandle;
+  status = driver_api->cuMemExportToShareableHandle_(
+      &sharedHandle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0);
+  if (status != CUDA_SUCCESS) {
+    LOG(INFO)
+        << "status " << status
+        << " Could not export FABRIC handle, falling back to fd handle exchange\n";
+    driver_api->cuMemRelease_(handle);
+    return false;
+  }
+  // 3. check import
+  CUmemGenericAllocationHandle import_handle = 0;
+  status = driver_api->cuMemImportFromShareableHandle_(
+      &import_handle, &sharedHandle, CU_MEM_HANDLE_TYPE_FABRIC);
+  if (status != CUDA_SUCCESS) {
+    LOG(INFO)
+        << "status " << status
+        << " Could not import FABRIC handle, falling back to fd handle exchange\n";
+    driver_api->cuMemRelease_(handle);
+    return false;
+  }
+  driver_api->cuMemRelease_(import_handle);
+  driver_api->cuMemRelease_(handle);
+  LOG(INFO) << "using fabric to exchange memory handles\n";
+  return true;
+}
+#endif
+} // namespace
+
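+// Reports whether `dev` can exchange memory handles over the NVLink
+// fabric. The result is cached per device. The NVML fabric-info query is
+// gated on the optional nvmlDeviceGetGpuFabricInfoV symbol, which older
+// drivers do not export.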
+bool get_fabric_access(c10::DeviceIndex dev) {
+#if !defined USE_ROCM && defined CUDA_VERSION && CUDA_VERSION >= 12040 && defined PYTORCH_C10_DRIVER_API_SUPPORTED
+  at::globalContext().lazyInitDevice(c10::DeviceType::CUDA);
+
+  TORCH_CHECK(dev >= 0 && dev < num_devices_, dev, " is not a device");
+  auto& cache = fabricAccessEnabled_[dev];
+  if (cache != -1) {
+    return cache;
+  }
+  auto nvml_device = get_nvml_device(dev);
+  if (nvml_device != nullptr) {
+    nvmlGpuFabricInfoV_t fabricInfo;
+    fabricInfo.state = NVML_GPU_FABRIC_STATE_NOT_SUPPORTED;
+    fabricInfo.version = nvmlGpuFabricInfo_v2;
+    if (DriverAPI::get()->nvmlDeviceGetGpuFabricInfoV_ == nullptr) {
+      return false;
+    }
+    TORCH_CHECK(
+        NVML_SUCCESS ==
+        DriverAPI::get()->nvmlDeviceGetGpuFabricInfoV_(
+            nvml_device, &fabricInfo));
+    auto state = fabricInfo.state != NVML_GPU_FABRIC_STATE_NOT_SUPPORTED;
+    if (state) {
+      // now perform the full cycle of allocating - exporting - importing memory
+      state = isFabricSupported();
+    }
+    cache = state ? 1 : 0;
+    return cache;
+  } else {
+    return false;
+  }
+#else
+  return false;
+#endif
+}
+
+} // namespace at::cuda
diff --git a/aten/src/ATen/cuda/PeerToPeerAccess.h b/aten/src/ATen/cuda/PeerToPeerAccess.h
index 5b63a855f3f4..30d21af83ed8 100644
--- a/aten/src/ATen/cuda/PeerToPeerAccess.h
+++ b/aten/src/ATen/cuda/PeerToPeerAccess.h
@@ -8,5 +8,6 @@ void init_p2p_access_cache(int64_t num_devices);
 }
 
 TORCH_CUDA_CPP_API bool get_p2p_access(c10::DeviceIndex source_dev, c10::DeviceIndex dest_dev);
+TORCH_CUDA_CPP_API bool get_fabric_access(c10::DeviceIndex device);
 
 } // namespace at::cuda
diff --git a/c10/cuda/driver_api.cpp b/c10/cuda/driver_api.cpp
index f4b62e53fcc0..f936b02ec9ab 100644
--- a/c10/cuda/driver_api.cpp
+++ b/c10/cuda/driver_api.cpp
@@ -38,6 +38,13 @@ DriverAPI create_driver_api() {
    C10_NVML_DRIVER_API(LOOKUP_NVML_ENTRY)
 #undef LOOKUP_NVML_ENTRY
   }
+
+  if (handle_1) {
+    // Optional NVML entry points: a symbol missing from the driver is
+    // recorded as nullptr rather than treated as an error, so callers
+    // must check for nullptr before use.
+#define LOOKUP_NVML_ENTRY_OPTIONAL(name) \
+  r.name##_ = ((decltype(&name))dlsym(handle_1, #name));
+    C10_NVML_DRIVER_API_OPTIONAL(LOOKUP_NVML_ENTRY_OPTIONAL)
+#undef LOOKUP_NVML_ENTRY_OPTIONAL
+  }
   return r;
 }
 
diff --git a/c10/cuda/driver_api.h b/c10/cuda/driver_api.h
index 6702cb9b532d..405870bdf342 100644
--- a/c10/cuda/driver_api.h
+++ b/c10/cuda/driver_api.h
@@ -67,6 +67,8 @@
   _(nvmlDeviceGetComputeRunningProcesses) \
   _(nvmlSystemGetCudaDriverVersion_v2)
 
+#define C10_NVML_DRIVER_API_OPTIONAL(_) _(nvmlDeviceGetGpuFabricInfoV)
+
 namespace c10::cuda {
 
 struct DriverAPI {
@@ -75,6 +77,7 @@ struct DriverAPI {
   C10_LIBCUDA_DRIVER_API_REQUIRED(CREATE_MEMBER_VERSIONED)
   C10_LIBCUDA_DRIVER_API_OPTIONAL(CREATE_MEMBER_VERSIONED)
   C10_NVML_DRIVER_API(CREATE_MEMBER)
+  C10_NVML_DRIVER_API_OPTIONAL(CREATE_MEMBER)
 #undef CREATE_MEMBER_VERSIONED
 #undef CREATE_MEMBER
 
diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt
index 96ed0c3b918e..781e134ad0d3 100644
--- a/caffe2/CMakeLists.txt
+++ b/caffe2/CMakeLists.txt
@@ -1122,6 +1122,11 @@ elseif(USE_CUDA)
     set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/sparse/cuda/SparseCUDABlas.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
     set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/native/sparse/cuda/SparseBlasImpl.cpp PROPERTIES COMPILE_FLAGS -Wno-deprecated-declarations)
   endif()
+  # Define PYTORCH_C10_DRIVER_API_SUPPORTED for PeerToPeerAccess
+  if(NOT WIN32)
+    set_source_files_properties(${CMAKE_CURRENT_SOURCE_DIR}/../aten/src/ATen/cuda/PeerToPeerAccess.cpp PROPERTIES COMPILE_FLAGS "-DPYTORCH_C10_DRIVER_API_SUPPORTED=1")
+  endif()
+
 endif()
 
 if(USE_XPU)
diff --git a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
index ef155a443a72..110ff4606a01 100644
--- a/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
+++ b/torch/csrc/distributed/c10d/symm_mem/CUDASymmetricMemory.cu
@@ -5,6 +5,7 @@
 #include <ATen/ceil_div.h>
 #include <ATen/cuda/CUDAContext.h>
 
+#include <ATen/cuda/PeerToPeerAccess.h>
 #include <c10/cuda/CUDACachingAllocator.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/util/error.h>
 
@@ -420,23 +421,11 @@ void* CUDASymmetricMemoryAllocator::alloc(
   prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
   // NOLINTNEXTLINE(bugprone-signed-char-misuse)
   prop.location.id = device_idx;
-  const auto driver_api = c10::cuda::DriverAPI::get();
-
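+  // The fabric probe runs once up front, so an UNSPECIFIED handle type is
+  // resolved immediately instead of retrying a failed cuMemCreate with a
+  // POSIX fd handle type, as the previous code did.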
+  bool has_fabric_support = at::cuda::get_fabric_access(device_idx);
+  LOG(INFO) << "CUDASymmetricMemoryAllocator::alloc: has_fabric_support " << has_fabric_support;
   if (handle_type_ == Expandable_Segments_Handle_Type::UNSPECIFIED) {
-    // Initialize NVML
-    if (driver_api->nvmlInit_v2_() == NVML_SUCCESS) {
-      // Get the driver version
-      int version = -1;
-      const auto res = driver_api->nvmlSystemGetCudaDriverVersion_v2_(&version);
-      if (res == NVML_SUCCESS) {
-        // Check if driver is sufficiently new
-        if (version < 12040) {
-          handle_type_ = Expandable_Segments_Handle_Type::POSIX_FD;
-        }
-      }
-    }
+    handle_type_ = has_fabric_support ? Expandable_Segments_Handle_Type::FABRIC_HANDLE : Expandable_Segments_Handle_Type::POSIX_FD;
   }
-
   if (handle_type_ == Expandable_Segments_Handle_Type::POSIX_FD) {
     prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
   } else {
@@ -444,22 +433,13 @@ void* CUDASymmetricMemoryAllocator::alloc(
   }
 
   size_t granularity;
+  auto driver_api = c10::cuda::DriverAPI::get();
   C10_CUDA_DRIVER_CHECK(driver_api->cuMemGetAllocationGranularity_(
       &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
   block_size = at::round_up(block_size, granularity);
 
   HandleType handle;
-  auto status = driver_api->cuMemCreate_(&handle, block_size, &prop, 0);
-  if (handle_type_ == Expandable_Segments_Handle_Type::UNSPECIFIED) {
-    if (status != CUDA_SUCCESS) {
-      prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
-      handle_type_ = Expandable_Segments_Handle_Type::POSIX_FD;
-      status = driver_api->cuMemCreate_(&handle, block_size, &prop, 0);
-    } else {
-      handle_type_ = Expandable_Segments_Handle_Type::FABRIC_HANDLE;
-    }
-  }
-  C10_CUDA_DRIVER_CHECK(status);
+  C10_CUDA_DRIVER_CHECK(driver_api->cuMemCreate_(&handle, block_size, &prop, 0));
 #elif defined(USE_ROCM)
   hipMemAllocationProp prop = {};