[SymmMem] Add runtime detection of NVSHMEM (#156291)

This way, we can pick the default backend for SymmetricMemory without
fully relying on the env var `TORCH_SYMMMEM=CUDA | NVSHMEM`.

On the Python side, the following API is added:
`torch.distributed._symmetric_memory.is_nvshmem_available()`
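For illustration, a minimal sketch of how the new API can be queried (nothing here beyond the call itself is part of this change):

import torch.distributed._symmetric_memory as symm_mem

if symm_mem.is_nvshmem_available():
    print("NVSHMEM can back SymmetricMemory on this system")
else:
    print("Falling back to the in-house CUDA backend")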

Pull Request resolved: https://github.com/pytorch/pytorch/pull/156291
Approved by: https://github.com/Skylion007
ghstack dependencies: #155506, #155835, #155968, #155971, #155975, #156116, #156117
commit 8fcda2c60d
parent eabf7cd3c5
Author: Ke Wen
Date: 2025-06-18 19:07:34 -07:00
Committed by: PyTorch MergeBot
8 changed files with 73 additions and 24 deletions

View File

@ -1027,6 +1027,7 @@ elseif(USE_CUDA)
nvshmem_device
)
target_compile_definitions(torch_cuda PUBLIC USE_NVSHMEM)
target_compile_definitions(nvshmem_extension PUBLIC USE_NVSHMEM)
target_link_libraries(torch_cuda PRIVATE nvshmem_extension)
install(TARGETS nvshmem_extension EXPORT Caffe2Targets DESTINATION lib)
endif()

View File

@ -2,11 +2,7 @@
# To run:
# TORCH_SYMMMEM=NVSHMEM python test/distributed/test_nvshmem.py
# OR
# TORCH_SYMMMEM=NVSHMEM torchrun --nproc-per-node 4 test/distributed/test_nvshmem.py
import os
import sys
import torch
import torch.distributed as dist
@ -24,21 +20,11 @@ from torch.testing._internal.common_utils import (
from torch.testing._internal.inductor_utils import requires_triton
symm_mem_backend = os.getenv("TORCH_SYMMMEM")
if symm_mem_backend != "NVSHMEM":
print(
"test_nvshmem requires setting `TORCH_SYMMMEM=NVSHMEM`, skipping tests",
file=sys.stderr,
)
sys.exit(0)
# Decorator
def requires_nvshmem():
return skip_but_pass_in_sandcastle_if(
not symm_mem.is_nvshmem_available(),
"test_nvshmem requires NVSHMEM, skipping tests",
)
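
For illustration only, a hypothetical test guarded by the updated decorator (the test name and body are made up; only requires_nvshmem comes from this change):

@requires_nvshmem()
def test_nvshmem_roundtrip() -> None:
    # Runs only when NVSHMEM can be loaded at runtime; otherwise the test
    # is skipped (or passed in sandcastle).
    ...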

View File

@ -705,6 +705,9 @@ def _unregister_process_group(group_name: str) -> None: ...
# Python. At C++ interface, it is converted to a uintptr_t.
def _nvshmemx_cumodule_init(module: int) -> None: ...
# Check if NVSHMEM is available on current system.
def _is_nvshmem_available() -> bool: ...
class _SymmetricMemory:
@staticmethod
def set_group_info(

View File

@ -1005,13 +1005,17 @@ This class does not support ``__members__`` property.)");
return ::c10d::unregister_all_process_groups();
});
#ifdef USE_NVSHMEM
// Initializes the device state in CUmodule so that it is able to perform
// NVSHMEM operations.
module.def(
"_nvshmemx_cumodule_init",
::c10d::nvshmem_extension::nvshmemx_cumodule_init,
py::arg("module"));
// Check if NVSHMEM is available on current system.
module.def(
"_is_nvshmem_available", ::c10d::nvshmem_extension::is_nvshmem_available);
#endif
py::class_<::c10d::BroadcastOptions>(module, "BroadcastOptions")

View File

@ -32,15 +32,23 @@ bool allow_overlapping_devices() {
// Query environment variable to get the backend used for CUDA Symmetric Memory.
std::string getSymmMemBackendCUDA() {
// TORCH_SYMMMEM environment variable can be used to indicate the preferred
// backend.
static auto val = c10::utils::get_env("TORCH_SYMMMEM");
if (val.has_value()) {
// Supported backends:
// - "CUDA": `CUDASymmetricMemory` (in-house implementation)
// - "NVSHMEM": `NVSHMEMSymmetricMemory`
TORCH_CHECK(
val.value() == "CUDA" || val.value() == "NVSHMEM" ||
val.value() == "NCCL",
"TORCH_SYMMMEM environment variable must be one of 'CUDA', 'NVSHMEM', 'NCCL'.");
return val.value();
}
// If TORCH_SYMMMEM is not set, check if NVSHMEM is available (for broader
// support).
// TODO: uncomment this once all single-node tests work with NVSHMEM
// if (is_nvshmem_available()) {
// return "NVSHMEM";
// }
return "CUDA";
}
IpcChannel::IpcChannel()
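
Roughly, the selection logic above behaves like this Python sketch (names and structure are illustrative, not the actual implementation):

import os

def get_symm_mem_backend_cuda() -> str:
    # An explicit user override via TORCH_SYMMMEM wins.
    val = os.getenv("TORCH_SYMMMEM")
    if val is not None:
        assert val in ("CUDA", "NVSHMEM", "NCCL"), (
            "TORCH_SYMMMEM must be one of 'CUDA', 'NVSHMEM', 'NCCL'"
        )
        return val
    # Once single-node tests pass with NVSHMEM, runtime availability could
    # promote it to the default:
    # if is_nvshmem_available():
    #     return "NVSHMEM"
    return "CUDA"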

View File

@ -1,3 +1,4 @@
#include <dlfcn.h>
#include <c10/cuda/CUDAGuard.h>
#include <torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cuh>
@ -20,6 +21,28 @@ static StoreExchange storeExchange = StoreExchange("nvshmem_ext");
constexpr int MiB = 1024 * 1024;
// Check if NVSHMEM is available
bool is_nvshmem_available() {
// Runtime check
static std::mutex mutex;
static int is_available = -2;
std::lock_guard<std::mutex> lock(mutex);
if (is_available == -2) {
void* handle{};
// Open the shared library, RTLD_LAZY defers symbol resolution until needed
handle = dlopen("libnvshmem_host.so.3", RTLD_LAZY);
if (!handle) {
std::cerr << dlerror() << "\n";
is_available = 0;
} else {
is_available = 1;
// Close the shared library
dlclose(handle);
}
}
return is_available == 1;
}
// Bootstrap based on user's setting for NCCL
// Long term, this may be a bit unclean; short term, it improves UX
void maybe_initialize_env_vars() {
@ -71,6 +94,11 @@ void initialize_nvshmem_with_store(
"nvshmemx_init_attr failed");
is_initialized = true;
// Print version
int major, minor;
::nvshmem_info_get_version(&major, &minor);
LOG(INFO) << "NVSHMEM is available, version: " << major << "." << minor;
}
// Initializes the device state in CUmodule so that it is able to perform NVSHMEM
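
The same runtime probe can be approximated from Python with ctypes; this is a rough, illustrative equivalent of the dlopen check above (only the library name comes from the code, the rest is assumed):

import ctypes
from functools import lru_cache

@lru_cache(maxsize=1)
def probe_nvshmem_host() -> bool:
    # Mirrors the cached dlopen("libnvshmem_host.so.3") probe above:
    # success means the NVSHMEM host library is resolvable at runtime.
    try:
        ctypes.CDLL("libnvshmem_host.so.3")
        return True
    except OSError:
        return False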

View File

@ -11,6 +11,9 @@ void initialize_nvshmem_with_store(
int rank,
int world_size);
// Check if NVSHMEM is available
TORCH_API bool is_nvshmem_available();
// Initializes the device state in CUmodule so that it is able to perform NVSHMEM
// operations.
TORCH_API void nvshmemx_cumodule_init(uintptr_t module);

View File

@ -1704,4 +1704,20 @@ def rendezvous(
return _SymmetricMemory.rendezvous(tensor, group_name)
__all__ = ["empty", "rendezvous"]
def is_nvshmem_available() -> bool:
r"""
is_nvshmem_available() -> bool
Check if NVSHMEM is available in current build and on current system.
"""
try:
from torch._C._distributed_c10d import _is_nvshmem_available
except ImportError:
# Not all builds have NVSHMEM support.
return False
# Check if NVSHMEM is available on current system.
return _is_nvshmem_available()
__all__ = ["empty", "rendezvous", "is_nvshmem_available"]
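
A small end-to-end sketch of how a launcher script might combine the new check with the existing env-var override (purely illustrative; TORCH_SYMMMEM must be set before any SymmetricMemory use, since the backend choice is read once and cached):

import os
import torch.distributed._symmetric_memory as symm_mem

# Opt in to NVSHMEM when the host library is present and the user has not
# already pinned a backend via TORCH_SYMMMEM.
if "TORCH_SYMMMEM" not in os.environ and symm_mem.is_nvshmem_available():
    os.environ["TORCH_SYMMMEM"] = "NVSHMEM"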