Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 21:14:14 +08:00
[SymmMem] Add runtime detection of NVSHMEM (#156291)
Add runtime detection of NVSHMEM so that the default backend for SymmetricMemory can be picked without relying entirely on the environment variable `TORCH_SYMMMEM=CUDA | NVSHMEM`. On the Python side, the following API is added: `torch.distributed._symmetric_memory.is_nvshmem_available()`.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/156291
Approved by: https://github.com/Skylion007
ghstack dependencies: #155506, #155835, #155968, #155971, #155975, #156116, #156117
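A minimal usage sketch of the new detection API (only `is_nvshmem_available()` is introduced by this PR; the module alias and print messages are illustrative):

import torch.distributed._symmetric_memory as symm_mem

# Runtime check added by this PR: returns False on builds without NVSHMEM
# support, or when the NVSHMEM host library cannot be loaded on this system.
if symm_mem.is_nvshmem_available():
    print("NVSHMEM backend can be used for SymmetricMemory")
else:
    print("Falling back to the in-house CUDA backend")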
@@ -1027,6 +1027,7 @@ elseif(USE_CUDA)
      nvshmem_device
    )
    target_compile_definitions(torch_cuda PUBLIC USE_NVSHMEM)
    target_compile_definitions(nvshmem_extension PUBLIC USE_NVSHMEM)
    target_link_libraries(torch_cuda PRIVATE nvshmem_extension)
    install(TARGETS nvshmem_extension EXPORT Caffe2Targets DESTINATION lib)
  endif()
@@ -2,11 +2,7 @@
# To run:
# TORCH_SYMMMEM=NVSHMEM python test/distributed/test_nvshmem.py
# OR
# TORCH_SYMMMEM=NVSHMEM torchrun --nproc-per-node 4 test/distributed/test_nvshmem.py

import os
import sys

import torch
import torch.distributed as dist
@@ -24,21 +20,11 @@ from torch.testing._internal.common_utils import (
from torch.testing._internal.inductor_utils import requires_triton


symm_mem_backend = os.getenv("TORCH_SYMMMEM")

if symm_mem_backend != "NVSHMEM":
    print(
        "test_nvshmem requires setting `TORCH_SYMMMEM=NVSHMEM`, skipping tests",
        file=sys.stderr,
    )
    sys.exit(0)


# Decorator
def requires_nvshmem():
    return skip_but_pass_in_sandcastle_if(
        symm_mem_backend != "NVSHMEM",
        "test_nvshmem requires setting `TORCH_SYMMMEM=NVSHMEM`",
        not symm_mem.is_nvshmem_available(),
        "test_nvshmem requires NVSHMEM, skipping tests",
    )
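As a usage illustration (the class and test names below are hypothetical, not part of this diff), the decorator returned by `requires_nvshmem()` now gates tests on runtime detection instead of the environment variable:

# Illustration only: in practice this would be a method on a distributed TestCase.
class NVSHMEMSmokeTest:
    @requires_nvshmem()
    def test_nvshmem_smoke(self) -> None:
        # Body runs only when NVSHMEM was detected at runtime; otherwise the
        # test is skipped (but reported as passed in sandcastle).
        ...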
@@ -705,6 +705,9 @@ def _unregister_process_group(group_name: str) -> None: ...
# Python. At C++ interface, it is converted to a uintptr_t.
def _nvshmemx_cumodule_init(module: int) -> None: ...

# Check if NVSHMEM is available on current system.
def _is_nvshmem_available() -> bool: ...

class _SymmetricMemory:
    @staticmethod
    def set_group_info(
@@ -1005,13 +1005,17 @@ This class does not support ``__members__`` property.)");
        return ::c10d::unregister_all_process_groups();
      });

#ifdef USE_NVSHMEM
  // Initializes the device state in CUmodule so that it’s able to perform
  // NVSHMEM operations.
#ifdef USE_NVSHMEM
  module.def(
      "_nvshmemx_cumodule_init",
      ::c10d::nvshmem_extension::nvshmemx_cumodule_init,
      py::arg("module"));

  // Check if NVSHMEM is available on current system.
  module.def(
      "_is_nvshmem_available", ::c10d::nvshmem_extension::is_nvshmem_available);
#endif

  py::class_<::c10d::BroadcastOptions>(module, "BroadcastOptions")
@@ -32,15 +32,23 @@ bool allow_overlapping_devices() {
// Query environment variable to get the backend used for CUDA Symmetric Memory.
std::string getSymmMemBackendCUDA() {
  // TORCH_SYMMMEM environment variable can be used to indicate the preferred
  // backend.
  static auto val = c10::utils::get_env("TORCH_SYMMMEM");
  if (!val.has_value()) {
    // In-house implementation: `CUDASymmetricMemory`
    return "CUDA";
  } else {
    // Other backends:
    // - "NVSHMEM": `NVSHMEMSymmetricMemory`
  if (val.has_value()) {
    TORCH_CHECK(
        val.value() == "CUDA" || val.value() == "NVSHMEM" ||
            val.value() == "NCCL",
        "TORCH_SYMMMEM environment variable must be one of 'CUDA', 'NVSHMEM', 'NCCL'.");
    return val.value();
  }
  // If TORCH_SYMMMEM is not set, check if NVSHMEM is available (for broader
  // support).
  // TODO: uncomment this once all single-node tests work with NVSHMEM
  // if (is_nvshmem_available()) {
  //   return "NVSHMEM";
  // }
  return "CUDA";
}

IpcChannel::IpcChannel()
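For intuition, a rough Python-level sketch of the selection precedence implemented above (the function name is illustrative; the real logic stays in C++): an explicit, validated `TORCH_SYMMMEM` wins, and the NVSHMEM auto-pick remains disabled until the single-node tests pass.

import os

def pick_symm_mem_backend() -> str:
    # Sketch of getSymmMemBackendCUDA(): explicit TORCH_SYMMMEM wins; otherwise
    # default to the in-house CUDA backend (NVSHMEM auto-selection is still a TODO).
    val = os.getenv("TORCH_SYMMMEM")
    if val is not None:
        if val not in ("CUDA", "NVSHMEM", "NCCL"):
            raise ValueError(
                "TORCH_SYMMMEM environment variable must be one of 'CUDA', 'NVSHMEM', 'NCCL'."
            )
        return val
    return "CUDA"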
@@ -1,3 +1,4 @@
#include <dlfcn.h>
#include <c10/cuda/CUDAGuard.h>

#include <torch/csrc/distributed/c10d/symm_mem/nvshmem_extension.cuh>
@@ -20,6 +21,28 @@ static StoreExchange storeExchange = StoreExchange("nvshmem_ext");
constexpr int MiB = 1024 * 1024;

// Check if NVSHMEM is available
bool is_nvshmem_available() {
  // Runtime check
  static std::mutex mutex;
  static int is_available = -2;
  std::lock_guard<std::mutex> lock(mutex);
  if (is_available == -2) {
    void* handle{};
    // Open the shared library, RTLD_LAZY defers symbol resolution until needed
    handle = dlopen("libnvshmem_host.so.3", RTLD_LAZY);
    if (!handle) {
      std::cerr << dlerror() << "\n";
      is_available = 0;
    } else {
      is_available = 1;
      // Close the shared library
      dlclose(handle);
    }
  }
  return is_available == 1;
}

// Bootstrap based on user's setting for NCCL
// Long term, this may be a bit unclean; short term, it improves UX
void maybe_initialize_env_vars() {
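A rough Python analogue of the dlopen probe above, using ctypes (illustrative only; the helper name is an assumption and the actual check runs in C++):

import ctypes
import os

def probe_nvshmem_host_lib() -> bool:
    # dlopen the NVSHMEM host library with lazy symbol resolution, mirroring
    # the C++ check; an OSError means it is absent or unloadable (POSIX only).
    try:
        ctypes.CDLL("libnvshmem_host.so.3", mode=os.RTLD_LAZY)
        return True
    except OSError:
        return False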
@@ -71,6 +94,11 @@ void initialize_nvshmem_with_store(
      "nvshmemx_init_attr failed");

  is_initialized = true;

  // Print version
  int major, minor;
  ::nvshmem_info_get_version(&major, &minor);
  LOG(INFO) << "NVSHMEM is available, version: " << major << "." << minor;
}

// Initializes the device state in CUmodule so that it’s able to perform NVSHMEM
@@ -11,6 +11,9 @@ void initialize_nvshmem_with_store(
    int rank,
    int world_size);

// Check if NVSHMEM is available
TORCH_API bool is_nvshmem_available();

// Initializes the device state in CUmodule so that it’s able to perform NVSHMEM
// operations.
TORCH_API void nvshmemx_cumodule_init(uintptr_t module);
@@ -1704,4 +1704,20 @@ def rendezvous(
    return _SymmetricMemory.rendezvous(tensor, group_name)


__all__ = ["empty", "rendezvous"]
def is_nvshmem_available() -> bool:
    r"""
    is_nvshmem_available() -> bool

    Check if NVSHMEM is available in current build and on current system.
    """
    try:
        from torch._C._distributed_c10d import _is_nvshmem_available
    except ImportError:
        # Not all builds have NVSHMEM support.
        return False

    # Check if NVSHMEM is available on current system.
    return _is_nvshmem_available()


__all__ = ["empty", "rendezvous", "is_nvshmem_available"]