mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
### Summary - Added multicast support to SymmetricMemory. If the CUDA runtime and CUDA driver have multicast support, SymmetricMemory associates all peer buffers with a multicast object and exposes the multicast virtual address. - Implemented `multimem_all_reduce_` and `multimem_one_shot_all_reduce` based on the multicast support. The two variants show different performance characteristics for different message sizes. We plan to use Inductor for collective algo selection (and required symmetric memory buffer allocation). ### Benchmark 8xH100 (non-standard version with HBM2e at 650W). NVSwitch V3 with NVLS support.   Differential Revision: [D61682507](https://our.internmc.facebook.com/intern/diff/D61682507) Pull Request resolved: https://github.com/pytorch/pytorch/pull/133424 Approved by: https://github.com/yf225, https://github.com/weifengpy
63 lines
2.3 KiB
C++
63 lines
2.3 KiB
C++
#pragma once
|
|
#include <cuda.h>
|
|
#define NVML_NO_UNVERSIONED_FUNC_DEFS
|
|
#include <nvml.h>
|
|
|
|
// Evaluates EXPR (an expression yielding a CUresult) exactly once and, when
// the result is anything other than CUDA_SUCCESS, reports it via AT_ERROR.
//
// The human-readable message is looked up through the cuGetErrorString_
// function pointer held by c10::cuda::DriverAPI::get(). If that lookup itself
// fails, or hands back a null string, the generic
// "CUDA driver error: unknown error" message is used instead.
//
// Notes:
//  - EXPR is parenthesized so that any expression form initializes the local
//    correctly.
//  - Locals carry a macro-specific prefix and trailing underscore rather than
//    the reserved `__` spelling, and to make clashes with names used inside
//    EXPR unlikely.
//  - Comments are kept outside the macro body on purpose: a `//` comment on a
//    backslash-continued line would swallow the following lines.
//  - AT_ERROR and C10_UNUSED are not defined by this header; the including
//    translation unit must provide them.
#define C10_CUDA_DRIVER_CHECK(EXPR)                                      \
  do {                                                                   \
    const CUresult c10_driver_check_err_ = (EXPR);                       \
    if (c10_driver_check_err_ != CUDA_SUCCESS) {                         \
      const char* c10_driver_check_str_ = nullptr;                       \
      const CUresult c10_driver_check_lookup_ C10_UNUSED =               \
          c10::cuda::DriverAPI::get()->cuGetErrorString_(                \
              c10_driver_check_err_, &c10_driver_check_str_);            \
      if (c10_driver_check_lookup_ != CUDA_SUCCESS ||                    \
          c10_driver_check_str_ == nullptr) {                            \
        AT_ERROR("CUDA driver error: unknown error");                    \
      } else {                                                           \
        AT_ERROR("CUDA driver error: ", c10_driver_check_str_);          \
      }                                                                  \
    }                                                                    \
  } while (0)
|
|
|
|
// X-macro listing the CUDA driver (`cu*`) entry points this header exposes.
// Invoke it with a one-argument macro `_`; `_` is applied once to each symbol
// name, in the order written below. Within this file it is used to declare
// one function-pointer member per symbol in c10::cuda::DriverAPI, so the
// order here determines the member order of that struct.
#define C10_LIBCUDA_DRIVER_API(_)   \
  _(cuMemAddressReserve)            \
  _(cuMemRelease)                   \
  _(cuMemMap)                       \
  _(cuMemAddressFree)               \
  _(cuMemSetAccess)                 \
  _(cuMemUnmap)                     \
  _(cuMemCreate)                    \
  _(cuMemGetAllocationGranularity)  \
  _(cuMemExportToShareableHandle)   \
  _(cuMemImportFromShareableHandle) \
  _(cuGetErrorString)
|
|
|
|
// X-macro listing the multicast driver entry points. The list is only
// populated when CUDA_VERSION is defined and is at least 12030 (presumably
// CUDA 12.3 under the usual CUDA_VERSION encoding -- confirm against cuda.h);
// otherwise the macro expands to nothing, so users of the list need no
// version checks of their own.
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12030)
#define C10_LIBCUDA_DRIVER_API_12030(_) \
  _(cuMulticastAddDevice)               \
  _(cuMulticastBindMem)                 \
  _(cuMulticastCreate)
#else
#define C10_LIBCUDA_DRIVER_API_12030(_)
#endif
|
|
|
|
// X-macro listing the NVML (`nvml*`) entry points this header exposes.
// Invoke it with a one-argument macro `_`; `_` is applied once to each symbol
// name, in the order written below. The versioned (`_v2`) spellings are named
// explicitly; this file defines NVML_NO_UNVERSIONED_FUNC_DEFS before including
// nvml.h (presumably so the unversioned aliases are not defined -- confirm
// against nvml.h).
#define C10_NVML_DRIVER_API(_)           \
  _(nvmlInit_v2)                         \
  _(nvmlDeviceGetHandleByPciBusId_v2)    \
  _(nvmlDeviceGetNvLinkRemoteDeviceType) \
  _(nvmlDeviceGetNvLinkRemotePciInfo_v2) \
  _(nvmlDeviceGetComputeRunningProcesses)
|
|
|
|
namespace c10::cuda {
|
|
|
|
struct DriverAPI {
|
|
#define CREATE_MEMBER(name) decltype(&name) name##_;
|
|
C10_LIBCUDA_DRIVER_API(CREATE_MEMBER)
|
|
C10_LIBCUDA_DRIVER_API_12030(CREATE_MEMBER)
|
|
C10_NVML_DRIVER_API(CREATE_MEMBER)
|
|
#undef CREATE_MEMBER
|
|
static DriverAPI* get();
|
|
static void* get_nvml_handle();
|
|
};
|
|
|
|
} // namespace c10::cuda
|