pytorch/c10/cuda/driver_api.h
Natalia Gimelshein 0254646654 harden fabric checks for symmetric memory (#160790)
Previously we checked only that the fabric allocation succeeded, but export or import could still fail afterwards, with no recourse. Now we check the full allocate/export/import cycle before attempting to allocate memory with the fabric (see the illustrative sketch below).
TODO: move this to c10/cuda so that it can be used from CUDACachingAllocator too

Pull Request resolved: https://github.com/pytorch/pytorch/pull/160790
Approved by: https://github.com/Skylion007
2025-08-18 22:35:50 +00:00
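A minimal sketch of such a full-cycle probe (illustrative only: probe_fabric_support is a hypothetical helper, CU_MEM_HANDLE_TYPE_FABRIC requires CUDA 12.3+ headers, cuInit must already have been called, and the real check would go through the DriverAPI function pointers declared in this header rather than calling the driver directly):

// Allocate the smallest possible fabric-backed handle, then verify that
// export and re-import both succeed; only then report the fabric as usable.
static bool probe_fabric_support(int device) {
  CUmemAllocationProp prop = {};
  prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
  prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
  prop.location.id = device;
  prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_FABRIC;

  size_t granularity = 0;
  if (cuMemGetAllocationGranularity(
          &granularity, &prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM) !=
      CUDA_SUCCESS) {
    return false;
  }

  CUmemGenericAllocationHandle handle = 0;
  if (cuMemCreate(&handle, granularity, &prop, 0) != CUDA_SUCCESS) {
    return false; // the allocation itself failed
  }

  // Export and import can fail even when the allocation succeeded (e.g. if
  // the fabric/IMEX infrastructure is not fully configured on the node).
  CUmemFabricHandle fabric_handle;
  bool ok = cuMemExportToShareableHandle(
                &fabric_handle, handle, CU_MEM_HANDLE_TYPE_FABRIC, 0) ==
      CUDA_SUCCESS;
  if (ok) {
    CUmemGenericAllocationHandle imported = 0;
    ok = cuMemImportFromShareableHandle(
             &imported, &fabric_handle, CU_MEM_HANDLE_TYPE_FABRIC) ==
        CUDA_SUCCESS;
    if (ok) {
      (void)cuMemRelease(imported);
    }
  }
  (void)cuMemRelease(handle);
  return ok;
}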


#pragma once
#include <cuda.h>
#define NVML_NO_UNVERSIONED_FUNC_DEFS
#include <nvml.h>
#include <c10/util/Exception.h>
#define C10_CUDA_DRIVER_CHECK(EXPR)                                        \
  do {                                                                     \
    CUresult __err = EXPR;                                                 \
    if (__err != CUDA_SUCCESS) {                                           \
      const char* err_str;                                                 \
      CUresult get_error_str_err [[maybe_unused]] =                        \
          c10::cuda::DriverAPI::get()->cuGetErrorString_(__err, &err_str); \
      if (get_error_str_err != CUDA_SUCCESS) {                             \
        TORCH_CHECK(false, "CUDA driver error: unknown error");            \
      } else {                                                             \
        TORCH_CHECK(false, "CUDA driver error: ", err_str);                \
      }                                                                    \
    }                                                                      \
  } while (0)
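// Example usage (illustrative, not part of this header): wrap driver calls
// made through the DriverAPI function pointers so that failures surface as
// TORCH_CHECK errors carrying the driver's error string, e.g.
//   CUdeviceptr ptr = 0;
//   C10_CUDA_DRIVER_CHECK(c10::cuda::DriverAPI::get()->cuMemAddressReserve_(
//       &ptr, size, /*alignment=*/0, /*addr=*/0, /*flags=*/0));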
// The integer in the second column specifies the requested CUDA Driver API
// version. The dynamic loader will accept a driver with a newer version, but
// resolving a symbol at the requested version guarantees that the symbol
// already existed by that version (see the illustrative resolution sketch
// after the list below).
// Keep these requested versions as low as possible to maximize compatibility
// across different driver versions.
// Why do we pin to an older version instead of using the latest?
// If a user installs a newer driver, blindly resolving the symbol may bind to
// a newer variant of the function with different behavior, potentially
// breaking PyTorch.
#define C10_LIBCUDA_DRIVER_API_REQUIRED(_) \
  _(cuDeviceGetAttribute, 12000)           \
  _(cuMemAddressReserve, 12000)            \
  _(cuMemRelease, 12000)                   \
  _(cuMemMap, 12000)                       \
  _(cuMemAddressFree, 12000)               \
  _(cuMemSetAccess, 12000)                 \
  _(cuMemUnmap, 12000)                     \
  _(cuMemCreate, 12000)                    \
  _(cuMemGetAllocationGranularity, 12000)  \
  _(cuMemExportToShareableHandle, 12000)   \
  _(cuMemImportFromShareableHandle, 12000) \
  _(cuMemsetD32Async, 12000)               \
  _(cuStreamWriteValue32, 12000)           \
  _(cuGetErrorString, 12000)
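// One way a loader can honor these version pins (an illustrative sketch; the
// actual lookup logic lives in the accompanying .cpp file, not in this
// header). With CUDA 12 headers, cuGetProcAddress takes a requested version
// and resolves the symbol as it behaved at that version:
//   void* fn = nullptr;
//   C10_CUDA_DRIVER_CHECK(cuGetProcAddress(
//       "cuMemCreate", &fn, 12000, CU_GET_PROC_ADDRESS_DEFAULT, nullptr));
//   auto cuMemCreate_ = reinterpret_cast<decltype(&cuMemCreate)>(fn);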
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12030)
#define C10_LIBCUDA_DRIVER_API_OPTIONAL(_) \
  _(cuMulticastAddDevice, 12030)           \
  _(cuMulticastBindMem, 12030)             \
  _(cuMulticastCreate, 12030)              \
  _(cuMulticastUnbind, 12030)
#else
#define C10_LIBCUDA_DRIVER_API_OPTIONAL(_)
#endif
#define C10_NVML_DRIVER_API(_)             \
  _(nvmlInit_v2)                           \
  _(nvmlDeviceGetHandleByPciBusId_v2)      \
  _(nvmlDeviceGetNvLinkRemoteDeviceType)   \
  _(nvmlDeviceGetNvLinkRemotePciInfo_v2)   \
  _(nvmlDeviceGetComputeRunningProcesses)  \
  _(nvmlSystemGetCudaDriverVersion_v2)
#define C10_NVML_DRIVER_API_OPTIONAL(_) _(nvmlDeviceGetGpuFabricInfoV)
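// Illustrative use of the optional fabric query (a sketch assuming an NVML
// new enough to define nvmlGpuFabricInfoV_t and nvmlGpuFabricInfo_v2):
//   nvmlGpuFabricInfoV_t info{};
//   info.version = nvmlGpuFabricInfo_v2;
//   if (DriverAPI::get()->nvmlDeviceGetGpuFabricInfoV_(device, &info) ==
//           NVML_SUCCESS &&
//       info.state == NVML_GPU_FABRIC_STATE_COMPLETED) {
//     // The GPU has completed fabric registration; fabric handles may work.
//   }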
namespace c10::cuda {
struct DriverAPI {
#define CREATE_MEMBER_VERSIONED(name, version) decltype(&name) name##_;
#define CREATE_MEMBER(name) decltype(&name) name##_;
  C10_LIBCUDA_DRIVER_API_REQUIRED(CREATE_MEMBER_VERSIONED)
  C10_LIBCUDA_DRIVER_API_OPTIONAL(CREATE_MEMBER_VERSIONED)
  C10_NVML_DRIVER_API(CREATE_MEMBER)
  C10_NVML_DRIVER_API_OPTIONAL(CREATE_MEMBER)
#undef CREATE_MEMBER_VERSIONED
#undef CREATE_MEMBER
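// For reference, CREATE_MEMBER_VERSIONED(cuMemCreate, 12000) expands to
//   decltype(&cuMemCreate) cuMemCreate_;
// i.e. a member that is a function pointer with the driver function's exact
// signature, to be filled in at runtime by the accompanying loader.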
  static DriverAPI* get();
  static void* get_nvml_handle();
};
} // namespace c10::cuda