[SymmMem] find_path does not search /usr/local/lib (#157695)
This PR uses `find_library` in place of `find_path`: unlike `find_path` with explicit `PATHS`, `find_library` also searches the default system library directories such as /usr/local/lib. It also searches for the NVSHMEM host library and device library separately. Tested against a system install location: /usr/local/lib and /usr/local/include.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/157695
Approved by: https://github.com/Skylion007
ghstack dependencies: #157513
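For context on the root cause: `find_path` looks for a plain file, and its default fallback search appends include-style suffixes (<prefix>/include), so it never looks under /usr/local/lib even when falling back to system prefixes. `find_library` instead searches library directories (<prefix>/lib, <prefix>/lib64, ...) by default and expands bare names with the platform conventions (foo -> libfoo.so / libfoo.a). A minimal sketch of the two behaviors, using an illustrative library name `foo` (nothing below is from the PR itself):

# find_path: searches the named file in the given PATHS plus
# include-oriented default locations (e.g. /usr/local/include),
# so a library installed to /usr/local/lib is never found this way.
find_path(FOO_LIB_DIR
  NAMES libfoo.so
  PATHS $ENV{FOO_HOME}/lib)

# find_library: expands bare NAMES with the platform prefix/suffix,
# tries each HINTS directory with the PATH_SUFFIXES, and falls back to
# system library dirs such as /usr/local/lib and /usr/local/lib64.
# On success the variable holds the full path to the matched file.
find_library(FOO_LIB
  NAMES foo foo.so.3   # the versioned name covers pip-wheel layouts
  HINTS $ENV{FOO_HOME}
  PATH_SUFFIXES lib lib64)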
@@ -999,20 +999,30 @@ elseif(USE_CUDA)
     # location, e.g.
     # `/path/to/conda/lib/python3.10/site-packages/nvidia/nvshmem`,
     # 3. Let CMake find it in the default system paths, e.g. /usr/local.
-    find_path(NVSHMEM_LIB_DIR
+    find_library(NVSHMEM_HOST_LIB
       # In pip install case, the lib suffix is `.so.3` instead of `.so`
-      NAMES libnvshmem_host.so libnvshmem_host.so.3
-      PATHS $ENV{NVSHMEM_HOME}/lib ${NVSHMEM_PY_DIR}/lib
-      DOC "The location of NVSHMEM library.")
+      NAMES nvshmem_host nvshmem_host.so.3
+      HINTS $ENV{NVSHMEM_HOME} ${NVSHMEM_PY_DIR}
+      PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64
+      DOC "The location of NVSHMEM host library.")
+    find_library(NVSHMEM_DEVICE_LIB
+      # Device lib is a `.a` file
+      NAMES nvshmem_device
+      HINTS $ENV{NVSHMEM_HOME} ${NVSHMEM_PY_DIR}
+      PATH_SUFFIXES lib lib64 cuda/lib cuda/lib64 lib/x64
+      DOC "The location of NVSHMEM device library.")
     find_path(NVSHMEM_INCLUDE_DIR
       NAMES nvshmem.h
-      PATHS $ENV{NVSHMEM_HOME}/include ${NVSHMEM_PY_DIR}/include
+      HINTS $ENV{NVSHMEM_HOME}/include ${NVSHMEM_PY_DIR}/include
       DOC "The location of NVSHMEM headers.")
+    message(STATUS "NVSHMEM_HOST_LIB: '${NVSHMEM_HOST_LIB}'")
+    message(STATUS "NVSHMEM_DEVICE_LIB: '${NVSHMEM_DEVICE_LIB}'")
+    message(STATUS "NVSHMEM_INCLUDE_DIR: '${NVSHMEM_INCLUDE_DIR}'")
   endif()

   # If NVSHMEM_LIBRARY is found, we build torch_cuda with NVSHMEM support.
-  if(NVSHMEM_LIB_DIR AND NVSHMEM_INCLUDE_DIR)
-    message(STATUS "Building with NVSHMEM support: '${NVSHMEM_LIB_DIR}'")
+  if(NVSHMEM_HOST_LIB AND NVSHMEM_DEVICE_LIB AND NVSHMEM_INCLUDE_DIR)
+    message(STATUS "NVSHMEM found, building with NVSHMEM support")
     include_directories(${NVSHMEM_INCLUDE_DIR})

     # Linking with nvshmem requires the source binary to be built with -rdc
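Background for the next hunk: NVSHMEM's device library is a static archive of relocatable device code, so any target linking it must be compiled with -rdc (relocatable device code) and device-linked. In CMake this is a single target property; a minimal, hypothetical sketch (the target name is illustrative, not from the PR):

# Any target whose kernels call NVSHMEM device APIs needs this.
add_library(my_nvshmem_ext SHARED my_ext.cu)

# Compiles each .cu with relocatable device code (-rdc=true) and adds
# a device-link step before the final host link.
set_target_properties(my_nvshmem_ext PROPERTIES CUDA_SEPARABLE_COMPILATION ON)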
@@ -1027,12 +1037,9 @@ elseif(USE_CUDA)
     set_target_properties(nvshmem_extension PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
     target_compile_options(nvshmem_extension PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-rdc=true>)
     target_compile_options(nvshmem_extension PRIVATE "-U__CUDA_NO_HALF_OPERATORS__")
-    target_link_directories(nvshmem_extension PRIVATE ${NVSHMEM_LIB_DIR})
     target_link_libraries(nvshmem_extension PRIVATE
-      # Full path needed bc nvshmem wheel ships with .so.3 instead of .so;
-      # otherwise, we could just write `nvshmem_host`
-      ${NVSHMEM_LIB_DIR}/libnvshmem_host.so.3
-      nvshmem_device
+      ${NVSHMEM_HOST_LIB}
+      ${NVSHMEM_DEVICE_LIB}
     )
     target_compile_definitions(torch_cuda PUBLIC USE_NVSHMEM)
     target_compile_definitions(nvshmem_extension PUBLIC USE_NVSHMEM)
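A side effect visible in the hunk above: because `find_library` stores the absolute path of the matched file (including a versioned name like libnvshmem_host.so.3), its result can be handed directly to `target_link_libraries`, and the separate `target_link_directories` call plus the hand-assembled file path become unnecessary. An illustrative before/after sketch (FOO_* names are hypothetical):

# Before: directory variable + manually spelled versioned file name.
target_link_directories(my_ext PRIVATE ${FOO_LIB_DIR})
target_link_libraries(my_ext PRIVATE ${FOO_LIB_DIR}/libfoo.so.3)

# After: the find_library result is already a full path, versioned or not.
target_link_libraries(my_ext PRIVATE ${FOO_LIB})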
@@ -172,7 +172,7 @@ function(caffe2_print_configuration_summary)
   if(${USE_NCCL})
     message(STATUS "  USE_SYSTEM_NCCL     : ${USE_SYSTEM_NCCL}")
   endif()
-  message(STATUS "  NVSHMEM_LIB_DIR     : ${NVSHMEM_LIB_DIR}")
+  message(STATUS "  Found NVSHMEM       : ${NVSHMEM_INCLUDE_DIR}")
   message(STATUS "  USE_NNPACK          : ${USE_NNPACK}")
   message(STATUS "  USE_NUMPY           : ${USE_NUMPY}")
   message(STATUS "  USE_OBSERVERS       : ${USE_OBSERVERS}")
@@ -11,6 +11,7 @@ from torch.testing._internal.common_distributed import MultiProcContinousTest
 from torch.testing._internal.common_utils import (
+    instantiate_parametrized_tests,
     parametrize,
     requires_cuda_p2p_access,
     run_tests,
     skip_but_pass_in_sandcastle_if,
     skipIfRocm,
@@ -32,6 +33,7 @@ device_module = torch.get_device_module(device_type)

+@instantiate_parametrized_tests
 @requires_nvshmem()
 @requires_cuda_p2p_access()
 class NVSHMEMSymmetricMemoryTest(MultiProcContinousTest):
     def _init_device(self) -> None:
         # TODO: relieve this (seems to hang if without)
@@ -19,6 +19,7 @@
 #endif // Must be done before nvshmem.h is included

 #include <nvshmem.h>
+#include <nvshmemx.h>

 namespace c10d::nvshmem_extension {
