Detect NVSHMEM location (#153010)

### Changes
- Detect the NVSHMEM install location via `sysconfig.get_path("purelib")`, which typically resolves to `<conda_env>/lib/python<ver>/site-packages`; the NVSHMEM `include` and `lib` directories live under `nvidia/nvshmem` inside it (see the sketch after this list)
- Added link dir via `target_link_directories`
- Removed direct dependency on mlx5
- Added preload rule (following other NVIDIA libs)
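
A minimal sketch of the detection logic, mirroring the `tools/setup_helpers/cmake.py` change below (the `nvidia/nvshmem` subdirectory is the layout the NVSHMEM wheel ships; the `print` is illustrative):

```python
import os
import sysconfig

# Wheels install into the "purelib" site-packages directory,
# typically <env>/lib/python3.x/site-packages.
py_lib_path = sysconfig.get_path("purelib")

# The NVSHMEM wheel places its include/ and lib/ under nvidia/nvshmem.
nvshmem_home = os.path.join(py_lib_path, "nvidia", "nvshmem")
if os.path.exists(nvshmem_home):
    # In the real change, this path is handed to CMake as NVSHMEM_HOME.
    print(f"NVSHMEM_HOME = {nvshmem_home}")
```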

### Plan of Record
1. End-user experience: link against NVSHMEM dynamically (the NVSHMEM lib is ~100 MB, similar to NCCL, so we'd rather have users `pip install nvshmem` than have torch carry the bits)
2. Developer experience: at compile time, prefer a wheel dependency over a Git submodule.
General rule: use a submodule for a small lib that torch can statically link with.
If users `pip install` a lib, our CI build process should do the same, rather than building from a Git submodule (just for its headers, for example)
3. Keep `USE_NVSHMEM` to gate non-Linux platforms such as Windows and Mac
4. At configuration time, detect whether NVSHMEM is available; if not, don't build `NVSHMEMSymmetricMemory` at all.

For now, we have symbol dependencies on two particular libs from NVSHMEM (see the sketch after this list):
- libnvshmem_host.so: contains the host-side APIs;
- libnvshmem_device.a: contains device-side global variables AND device function impls.
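
Since `libnvshmem_device.a` is linked statically at build time, only the host library needs resolving at runtime. A hedged sketch of what the preload rule amounts to, assuming the wheel layout above (the glob pattern matches the `torch/__init__.py` diff below; the path construction is illustrative):

```python
import ctypes
import glob
import os
import sysconfig

# Assumed wheel layout: site-packages/nvidia/nvshmem/lib (illustrative).
lib_dir = os.path.join(sysconfig.get_path("purelib"), "nvidia", "nvshmem", "lib")

# The wheel ships a versioned .so (libnvshmem_host.so.3), hence the glob.
candidates = glob.glob(os.path.join(lib_dir, "libnvshmem_host.so.*[0-9]"))
if candidates:
    # RTLD_GLOBAL makes the host-side symbols visible to later-loaded extensions.
    ctypes.CDLL(candidates[0], mode=ctypes.RTLD_GLOBAL)
```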

Pull Request resolved: https://github.com/pytorch/pytorch/pull/153010
Approved by: https://github.com/ngimel, https://github.com/fduwjj, https://github.com/Skylion007
Author: Ke Wen
Date: 2025-05-06 18:17:43 -07:00
Committed by: PyTorch MergeBot
Parent: df1ec045b5
Commit: 5bf0c3518c

3 changed files with 26 additions and 9 deletions

**caffe2/CMakeLists.txt**

```diff
@@ -979,10 +979,21 @@ elseif(USE_CUDA)
   endif()
   # Use env var for these for now for prototyping purposes
-  set(USE_NVSHMEM $ENV{USE_NVSHMEM} CACHE BOOL "Enable NVSHMEM support")
-  set(NVSHMEM_HOME $ENV{NVSHMEM_HOME} CACHE PATH "Path to NVSHMEM build dir")
+  set(USE_NVSHMEM $ENV{USE_NVSHMEM} CACHE BOOL "Whether to build with NVSHMEM support")
+  # If user has specified NVSHMEM_HOME, we use it;
+  # Otherwise, NVSHMEM_HOME is auto detected in tools/setup_helpers/cmake.py
+  if($ENV{NVSHMEM_HOME})
+    set(NVSHMEM_HOME $ENV{NVSHMEM_HOME} CACHE PATH "Path to NVSHMEM build dir")
+  endif()
+
+  if(USE_NVSHMEM AND NOT DEFINED NVSHMEM_HOME)
+    message(WARNING "USE_NVSHMEM set to 1 but NVSHMEM_HOME not found. Please run `pip install nvidia-nvshmem-<version>`, or set NVSHMEM_HOME to the NVSHMEM build dir")
+    # Disable nvshmem if NVSHMEM_HOME is not found
+    set(USE_NVSHMEM FALSE CACHE BOOL "Whether to build with NVSHMEM support")
+  endif()
+
   if(USE_NVSHMEM)
     message("Building with NVSHMEM support: '${NVSHMEM_HOME}'")
     set(NVSHMEM_INCLUDE_DIR "${NVSHMEM_HOME}/include")
     set(NVSHMEM_LIB_DIR "${NVSHMEM_HOME}/lib")
@@ -1000,18 +1011,17 @@ elseif(USE_CUDA)
     set_target_properties(nvshmem_extension PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
     target_compile_options(nvshmem_extension PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true>)
     target_compile_options(nvshmem_extension PRIVATE "-U__CUDA_NO_HALF_OPERATORS__")
+    target_link_directories(nvshmem_extension PRIVATE ${NVSHMEM_LIB_DIR})
     target_link_libraries(nvshmem_extension PRIVATE
-      ${NVSHMEM_LIB_DIR}/libnvshmem.a
-      ${NVSHMEM_LIB_DIR}/nvshmem_bootstrap_uid.so
+      # Full path needed bc nvshmem wheel ships with .so.3 instead of .so;
+      # otherwise, we could just write `nvshmem_host`
+      ${NVSHMEM_LIB_DIR}/libnvshmem_host.so.3
+      nvshmem_device
     )
-    target_link_libraries(nvshmem_extension PRIVATE mlx5)
+
     target_link_libraries(torch_cuda PRIVATE nvshmem_extension)
     install(TARGETS nvshmem_extension EXPORT Caffe2Targets DESTINATION lib)
-    install(
-      FILES ${NVSHMEM_LIB_DIR}/nvshmem_bootstrap_uid.so
-      DESTINATION lib
-    )
   endif()
   if(USE_UCC)
     target_link_libraries(torch_cuda PRIVATE __caffe2_ucc)
     target_compile_definitions(torch_cuda PRIVATE USE_UCC)
```

**tools/setup_helpers/cmake.py**

```diff
@@ -288,6 +288,12 @@ class CMake:
             }
         )
+        # Detect build dependencies from python lib path (in order to set
+        # *_HOME variables)
+        # NVSHMEM
+        nvshmem_home = py_lib_path + "/nvidia/nvshmem"
+        if os.path.exists(nvshmem_home):
+            build_options["NVSHMEM_HOME"] = nvshmem_home
         # Options starting with CMAKE_
         cmake__options = {
             "CMAKE_INSTALL_PREFIX": install_dir,
```

**torch/__init__.py**

```diff
@@ -357,6 +357,7 @@ def _load_global_deps() -> None:
         "cusolver": "libcusolver.so.*[0-9]",
         "nccl": "libnccl.so.*[0-9]",
         "nvtx": "libnvToolsExt.so.*[0-9]",
+        "nvshmem": "libnvshmem_host.so.*[0-9]",
     }
     # cufile is only available on cuda 12+
     # TODO: Remove once CUDA 11.8 binaries are deprecated
```