[ProcessGroupNCCL] Remove jumper to UCC (#114170)

The "jumper" to UCC lib in ProcessGroupNCCL was a temporary solution a while back. Cleaning it now that UCC has its own "PG" representation.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/114170
Approved by: https://github.com/wconstab, https://github.com/fduwjj, https://github.com/XilunWu, https://github.com/Aidyn-A
Author: Ke Wen
Date: 2023-11-22 15:35:03 +00:00
Committed by: PyTorch MergeBot
Parent: d7f698102e
Commit: f2ca07b680

7 changed files with 1 addition and 81 deletions
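With the jumper removed, UCC is no longer reached through ProcessGroupNCCL; it is selected as its own backend. A minimal usage sketch, assuming PyTorch was built with native UCC support (e.g. USE_C10D_UCC) and a working UCC installation; the single-process rendezvous values below are purely illustrative:

    import os
    import torch.distributed as dist

    # Illustrative single-process rendezvous; real jobs get these from the launcher.
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")

    # Create the UCC process group directly rather than via the old NCCL "jumper".
    dist.init_process_group(backend="ucc", rank=0, world_size=1)
    print(dist.get_backend())  # "ucc"
    dist.destroy_process_group()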

@@ -328,9 +328,6 @@ cmake_dependent_option(
USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF)
cmake_dependent_option(
USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF)
cmake_dependent_option(
USE_NCCL_WITH_UCC "Enable UCC support for ProcessGroupNCCL. Only available if USE_C10D_NCCL is on." OFF
"USE_C10D_NCCL" OFF)
cmake_dependent_option(
USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF)
cmake_dependent_option(

@@ -1330,9 +1330,6 @@ if(USE_DISTRIBUTED)
target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
else()
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
if(USE_NCCL_WITH_UCC)
target_compile_definitions(torch_cuda PUBLIC USE_NCCL_WITH_UCC)
endif()
endif()
endif()
if(USE_MPI AND USE_C10D_MPI)

@@ -154,7 +154,6 @@ function(caffe2_print_configuration_summary)
message(STATUS " USE_NCCL : ${USE_NCCL}")
if(${USE_NCCL})
message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}")
message(STATUS " USE_NCCL_WITH_UCC : ${USE_NCCL_WITH_UCC}")
endif()
message(STATUS " USE_NNPACK : ${USE_NNPACK}")
message(STATUS " USE_NUMPY : ${USE_NUMPY}")

@@ -1,6 +1,5 @@
#include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
#include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
#include <torch/csrc/distributed/c10d/UCCForNCCL.hpp>
#include <fstream>
#include <mutex>
#include <sstream>
@@ -1064,28 +1063,6 @@ ProcessGroupNCCL::ProcessGroupNCCL(
&cacheAllocatorDeregisterHook);
allocatorHooksAttached = true;
}
#ifdef USE_NCCL_WITH_UCC
static c10::once_flag initialize_ucc_lib_flag;
c10::call_once(initialize_ucc_lib_flag, [&] {
uccLib_ = loadTorchUCC();
if (uccLib_ != nullptr) {
LOG(INFO) << "[Rank " << rank_ << "] torch_ucc.so loaded";
}
});
if (uccLib_ != nullptr) {
LOG(INFO) << "[Rank " << rank_ << "] torch_ucc.so loaded";
typedef c10::intrusive_ptr<Backend> fn(
const c10::intrusive_ptr<Store>& store, int rank, int size);
auto createProcessGroupUCC =
reinterpret_cast<fn*>(uccLib_->sym("createProcessGroupUCC"));
if (createProcessGroupUCC != nullptr) {
uccPG_ = createProcessGroupUCC(store, rank_, size_);
LOG(INFO) << "[Rank " << rank_ << "] ProcessGroupUCC created.";
}
}
#endif
}
void ProcessGroupNCCL::runHealthCheck() {
@@ -4134,18 +4111,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::_allgather_base(
avoidRecordStreams);
}
#ifdef USE_NCCL_WITH_UCC
std::shared_ptr<at::DynamicLibrary> ProcessGroupNCCL::uccLib_ = nullptr;
#endif
bool ProcessGroupNCCL::isUCCAvailable() const {
#ifdef USE_NCCL_WITH_UCC
return (uccPG_ != nullptr);
#else
return false;
#endif
}
} // namespace c10d
#endif // USE_C10D_NCCL

@@ -12,7 +12,6 @@
#include <torch/csrc/distributed/c10d/Backend.hpp>
#include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
#include <torch/csrc/distributed/c10d/Store.hpp>
#include <torch/csrc/distributed/c10d/UCCForNCCL.hpp>
#include <ATen/DynamicLibrary.h>
#include <ATen/cuda/CUDAContext.h>
@@ -530,9 +529,6 @@ class TORCH_API ProcessGroupNCCL : public Backend {
// Provide an API for users to define their own ways to store NCCL debug info.
void registerDebugInfoWriter(std::unique_ptr<DebugInfoWriter> writer);
// Tests if the UCC fallback path is available
bool isUCCAvailable() const;
// Provides an API to abort the ProcessGroup (similar to ncclCommAbort)
// instead of relying on ProcessGroupNCCL destructor.
void abort(c10::optional<std::string> abortReason = c10::nullopt);
@@ -899,11 +895,6 @@ class TORCH_API ProcessGroupNCCL : public Backend {
// The callback function to store NCCL debug info.
std::unique_ptr<DebugInfoWriter> debugInfoWriter_ = nullptr;
#ifdef USE_NCCL_WITH_UCC
// ProcessGroupUCC shared library handle and ProcessGroup pointer
static std::shared_ptr<at::DynamicLibrary> uccLib_;
c10::intrusive_ptr<Backend> uccPG_;
#endif
size_t uid_;
};
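The isUCCAvailable() query removed from the header above (and its is_ucc_available Python binding, removed in the last hunk below) has no direct replacement on ProcessGroupNCCL. A hedged sketch of how a script can check for compiled-in UCC support instead, using the torch.distributed helpers that exist independently of this change:

    import torch.distributed as dist

    # True only when this PyTorch build includes native UCC / NCCL support.
    print("UCC available:", dist.is_ucc_available())
    print("NCCL available:", dist.is_nccl_available())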

@@ -1,27 +0,0 @@
#pragma once
#include <cassert>
#include <memory>
#include <string>
#include <vector>
#include <ATen/DynamicLibrary.h>
namespace c10d {
inline std::shared_ptr<at::DynamicLibrary> loadTorchUCC() {
const char* path = std::getenv("TORCH_UCC_LIBRARY_PATH");
if (path != nullptr) {
try {
return std::make_shared<at::DynamicLibrary>(path);
} catch (const c10::DynamicLibraryError& e) {
TORCH_WARN(
"TORCH_UCC_LIBRARY_PATH is set, "
"but the loading of torch_ucc.so failed with:",
e.msg());
}
}
return nullptr;
}
} // namespace c10d
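The deleted header above implemented the env-var loader: if TORCH_UCC_LIBRARY_PATH was set, torch_ucc.so was loaded via at::DynamicLibrary and createProcessGroupUCC was resolved by symbol. A hedged sketch of the general-purpose alternative for out-of-tree backends, torch.distributed's explicit registration API; the backend name and constructor below are hypothetical placeholders, not part of this change:

    import torch.distributed as dist

    def _create_my_backend(store, rank, world_size, timeout):
        # Hypothetical constructor: an out-of-tree extension would build and
        # return its c10d backend instance here.
        raise NotImplementedError("illustrative stub only")

    # Register the name so init_process_group(backend="my_backend") can find it.
    dist.Backend.register_backend("my_backend", _create_my_backend)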

@@ -2294,9 +2294,7 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`).
"comm_split_count",
&::c10d::ProcessGroupNCCL::getCommSplitCounter)
.def_property_readonly(
"options", &::c10d::ProcessGroupNCCL::getOptions)
.def_property_readonly(
"is_ucc_available", &::c10d::ProcessGroupNCCL::isUCCAvailable);
"options", &::c10d::ProcessGroupNCCL::getOptions);
#ifdef NCCL_HAS_COMM_CTA_CGA
py::class_<ncclConfig_t>(