mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-21 05:34:18 +08:00
[ProcessGroupNCCL] Remove jumper to UCC (#114170)
The "jumper" to the UCC library in ProcessGroupNCCL was a temporary solution introduced a while back. It is being cleaned up now that UCC has its own process group ("PG") representation.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/114170
Approved by: https://github.com/wconstab, https://github.com/fduwjj, https://github.com/XilunWu, https://github.com/Aidyn-A
This commit is contained in:
@ -328,9 +328,6 @@ cmake_dependent_option(
|
|||||||
USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF)
|
USE_C10D_GLOO "USE C10D GLOO" ON "USE_DISTRIBUTED;USE_GLOO" OFF)
|
||||||
cmake_dependent_option(
|
cmake_dependent_option(
|
||||||
USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF)
|
USE_C10D_NCCL "USE C10D NCCL" ON "USE_DISTRIBUTED;USE_NCCL" OFF)
|
||||||
cmake_dependent_option(
|
|
||||||
USE_NCCL_WITH_UCC "Enable UCC support for ProcessGroupNCCL. Only available if USE_C10D_NCCL is on." OFF
|
|
||||||
"USE_C10D_NCCL" OFF)
|
|
||||||
cmake_dependent_option(
|
cmake_dependent_option(
|
||||||
USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF)
|
USE_C10D_MPI "USE C10D MPI" ON "USE_DISTRIBUTED;USE_MPI" OFF)
|
||||||
cmake_dependent_option(
|
cmake_dependent_option(
|
||||||
|
@ -1330,9 +1330,6 @@ if(USE_DISTRIBUTED)
|
|||||||
target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
|
target_compile_definitions(torch_hip PUBLIC USE_C10D_NCCL)
|
||||||
else()
|
else()
|
||||||
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
|
target_compile_definitions(torch_cuda PUBLIC USE_C10D_NCCL)
|
||||||
if(USE_NCCL_WITH_UCC)
|
|
||||||
target_compile_definitions(torch_cuda PUBLIC USE_NCCL_WITH_UCC)
|
|
||||||
endif()
|
|
||||||
endif()
|
endif()
|
||||||
endif()
|
endif()
|
||||||
if(USE_MPI AND USE_C10D_MPI)
|
if(USE_MPI AND USE_C10D_MPI)
|
||||||
|
@ -154,7 +154,6 @@ function(caffe2_print_configuration_summary)
|
|||||||
message(STATUS " USE_NCCL : ${USE_NCCL}")
|
message(STATUS " USE_NCCL : ${USE_NCCL}")
|
||||||
if(${USE_NCCL})
|
if(${USE_NCCL})
|
||||||
message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}")
|
message(STATUS " USE_SYSTEM_NCCL : ${USE_SYSTEM_NCCL}")
|
||||||
message(STATUS " USE_NCCL_WITH_UCC : ${USE_NCCL_WITH_UCC}")
|
|
||||||
endif()
|
endif()
|
||||||
message(STATUS " USE_NNPACK : ${USE_NNPACK}")
|
message(STATUS " USE_NNPACK : ${USE_NNPACK}")
|
||||||
message(STATUS " USE_NUMPY : ${USE_NUMPY}")
|
message(STATUS " USE_NUMPY : ${USE_NUMPY}")
|
||||||
|
@ -1,6 +1,5 @@
|
|||||||
#include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
|
#include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
|
||||||
#include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
|
#include <torch/csrc/distributed/c10d/ProcessGroupNCCL.hpp>
|
||||||
#include <torch/csrc/distributed/c10d/UCCForNCCL.hpp>
|
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
#include <mutex>
|
#include <mutex>
|
||||||
#include <sstream>
|
#include <sstream>
|
||||||
@ -1064,28 +1063,6 @@ ProcessGroupNCCL::ProcessGroupNCCL(
|
|||||||
&cacheAllocatorDeregisterHook);
|
&cacheAllocatorDeregisterHook);
|
||||||
allocatorHooksAttached = true;
|
allocatorHooksAttached = true;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef USE_NCCL_WITH_UCC
|
|
||||||
static c10::once_flag initialize_ucc_lib_flag;
|
|
||||||
c10::call_once(initialize_ucc_lib_flag, [&] {
|
|
||||||
uccLib_ = loadTorchUCC();
|
|
||||||
if (uccLib_ != nullptr) {
|
|
||||||
LOG(INFO) << "[Rank " << rank_ << "] torch_ucc.so loaded";
|
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
if (uccLib_ != nullptr) {
|
|
||||||
LOG(INFO) << "[Rank " << rank_ << "] torch_ucc.so loaded";
|
|
||||||
typedef c10::intrusive_ptr<Backend> fn(
|
|
||||||
const c10::intrusive_ptr<Store>& store, int rank, int size);
|
|
||||||
auto createProcessGroupUCC =
|
|
||||||
reinterpret_cast<fn*>(uccLib_->sym("createProcessGroupUCC"));
|
|
||||||
if (createProcessGroupUCC != nullptr) {
|
|
||||||
uccPG_ = createProcessGroupUCC(store, rank_, size_);
|
|
||||||
LOG(INFO) << "[Rank " << rank_ << "] ProcessGroupUCC created.";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void ProcessGroupNCCL::runHealthCheck() {
|
void ProcessGroupNCCL::runHealthCheck() {
|
||||||
@ -4134,18 +4111,6 @@ c10::intrusive_ptr<Work> ProcessGroupNCCL::_allgather_base(
|
|||||||
avoidRecordStreams);
|
avoidRecordStreams);
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef USE_NCCL_WITH_UCC
|
|
||||||
std::shared_ptr<at::DynamicLibrary> ProcessGroupNCCL::uccLib_ = nullptr;
|
|
||||||
#endif
|
|
||||||
|
|
||||||
bool ProcessGroupNCCL::isUCCAvailable() const {
|
|
||||||
#ifdef USE_NCCL_WITH_UCC
|
|
||||||
return (uccPG_ != nullptr);
|
|
||||||
#else
|
|
||||||
return false;
|
|
||||||
#endif
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace c10d
|
} // namespace c10d
|
||||||
|
|
||||||
#endif // USE_C10D_NCCL
|
#endif // USE_C10D_NCCL
|
||||||
|
@ -12,7 +12,6 @@
|
|||||||
#include <torch/csrc/distributed/c10d/Backend.hpp>
|
#include <torch/csrc/distributed/c10d/Backend.hpp>
|
||||||
#include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
|
#include <torch/csrc/distributed/c10d/NCCLUtils.hpp>
|
||||||
#include <torch/csrc/distributed/c10d/Store.hpp>
|
#include <torch/csrc/distributed/c10d/Store.hpp>
|
||||||
#include <torch/csrc/distributed/c10d/UCCForNCCL.hpp>
|
|
||||||
|
|
||||||
#include <ATen/DynamicLibrary.h>
|
#include <ATen/DynamicLibrary.h>
|
||||||
#include <ATen/cuda/CUDAContext.h>
|
#include <ATen/cuda/CUDAContext.h>
|
||||||
@ -530,9 +529,6 @@ class TORCH_API ProcessGroupNCCL : public Backend {
|
|||||||
// Provide an API for users to define their own ways to store NCCL debug info.
|
// Provide an API for users to define their own ways to store NCCL debug info.
|
||||||
void registerDebugInfoWriter(std::unique_ptr<DebugInfoWriter> writer);
|
void registerDebugInfoWriter(std::unique_ptr<DebugInfoWriter> writer);
|
||||||
|
|
||||||
// Tests if the UCC fallback path is available
|
|
||||||
bool isUCCAvailable() const;
|
|
||||||
|
|
||||||
// Provides an API to abort the ProcessGroup (similar to ncclCommAbort)
|
// Provides an API to abort the ProcessGroup (similar to ncclCommAbort)
|
||||||
// instead of relying on ProcessGroupNCCL destructor.
|
// instead of relying on ProcessGroupNCCL destructor.
|
||||||
void abort(c10::optional<std::string> abortReason = c10::nullopt);
|
void abort(c10::optional<std::string> abortReason = c10::nullopt);
|
||||||
@ -899,11 +895,6 @@ class TORCH_API ProcessGroupNCCL : public Backend {
|
|||||||
// The callback function to store NCCL debug info.
|
// The callback function to store NCCL debug info.
|
||||||
std::unique_ptr<DebugInfoWriter> debugInfoWriter_ = nullptr;
|
std::unique_ptr<DebugInfoWriter> debugInfoWriter_ = nullptr;
|
||||||
|
|
||||||
#ifdef USE_NCCL_WITH_UCC
|
|
||||||
// ProcessGroupUCC shared library handle and ProcessGroup pointer
|
|
||||||
static std::shared_ptr<at::DynamicLibrary> uccLib_;
|
|
||||||
c10::intrusive_ptr<Backend> uccPG_;
|
|
||||||
#endif
|
|
||||||
size_t uid_;
|
size_t uid_;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -1,27 +0,0 @@
|
|||||||
#pragma once
|
|
||||||
|
|
||||||
#include <cassert>
|
|
||||||
#include <memory>
|
|
||||||
#include <string>
|
|
||||||
#include <vector>
|
|
||||||
|
|
||||||
#include <ATen/DynamicLibrary.h>
|
|
||||||
|
|
||||||
namespace c10d {
|
|
||||||
|
|
||||||
inline std::shared_ptr<at::DynamicLibrary> loadTorchUCC() {
|
|
||||||
const char* path = std::getenv("TORCH_UCC_LIBRARY_PATH");
|
|
||||||
if (path != nullptr) {
|
|
||||||
try {
|
|
||||||
return std::make_shared<at::DynamicLibrary>(path);
|
|
||||||
} catch (const c10::DynamicLibraryError& e) {
|
|
||||||
TORCH_WARN(
|
|
||||||
"TORCH_UCC_LIBRARY_PATH is set, "
|
|
||||||
"but the loading of torch_ucc.so failed with:",
|
|
||||||
e.msg());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return nullptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
} // namespace c10d
|
|
@ -2294,9 +2294,7 @@ options :class:`~torch.distributed.ProcessGroupNCCL.Options`).
|
|||||||
"comm_split_count",
|
"comm_split_count",
|
||||||
&::c10d::ProcessGroupNCCL::getCommSplitCounter)
|
&::c10d::ProcessGroupNCCL::getCommSplitCounter)
|
||||||
.def_property_readonly(
|
.def_property_readonly(
|
||||||
"options", &::c10d::ProcessGroupNCCL::getOptions)
|
"options", &::c10d::ProcessGroupNCCL::getOptions);
|
||||||
.def_property_readonly(
|
|
||||||
"is_ucc_available", &::c10d::ProcessGroupNCCL::isUCCAvailable);
|
|
||||||
|
|
||||||
#ifdef NCCL_HAS_COMM_CTA_CGA
|
#ifdef NCCL_HAS_COMM_CTA_CGA
|
||||||
py::class_<ncclConfig_t>(
|
py::class_<ncclConfig_t>(
|
||||||
|
Reference in New Issue
Block a user