diff --git a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
index 9620b01c3c47..94824287acf5 100644
--- a/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
+++ b/torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp
@@ -819,6 +819,12 @@ ProcessGroupNCCL::ProcessGroupNCCL(
       ValueError,
       at::cuda::getNumGPUs() != 0,
       "ProcessGroupNCCL is only supported with GPUs, no GPUs found!");
+
+  // getNcclVersion needs to be called before launching threads that may call
+  // getenv. getNcclVersion internally calls setenv to set some environment
+  // variables from the config file, which can race with getenv from other
+  // threads and cause segfaults.
+  const auto ncclVersion = getNcclVersion();
   this->setGroupUid(options_->group_name);
   this->localDeviceCount_ = at::cuda::getNumGPUs();
   logPrefix_ = createLogPrefix();
@@ -911,7 +917,7 @@ ProcessGroupNCCL::ProcessGroupNCCL(
       << ", PG Name: " << options_->group_name;
 
   LOG(INFO) << logPrefix() << "ProcessGroupNCCL environments: "
-            << "NCCL version: " << getNcclVersion()
+            << "NCCL version: " << ncclVersion
             << ", TORCH_NCCL_ASYNC_ERROR_HANDLING: " << asyncErrorHandling_
             << ", TORCH_NCCL_DUMP_ON_TIMEOUT: " << dumpOnTimeoutOrEx_
             << ", TORCH_NCCL_WAIT_TIMEOUT_DUMP_MILSEC: "
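For context, the sketch below is a minimal, standalone C++ illustration (not the PyTorch code) of the pattern this patch applies: any call that may setenv() runs exactly once on the main thread, before any worker thread that might call getenv() exists, and its result is cached for later reuse. The helper queryLibraryVersion() is a hypothetical stand-in for getNcclVersion(), and the example assumes a POSIX environment where setenv() is available.

// race_free_env_sketch.cpp -- illustrative only, assumes POSIX setenv()/getenv()
#include <cstdlib>
#include <iostream>
#include <string>
#include <thread>
#include <vector>

// Hypothetical stand-in for getNcclVersion(): resolving the version may also
// load a config file and mutate the process environment via setenv(), which is
// not safe to run concurrently with getenv() in other threads.
std::string queryLibraryVersion() {
  setenv("EXAMPLE_NCCL_CONF_OPT", "1", /*overwrite=*/0);  // env mutation
  return "2.21.5";
}

int main() {
  // 1) Resolve the version (and trigger any setenv side effects) on the main
  //    thread, before any other thread has been launched.
  const std::string version = queryLibraryVersion();

  // 2) Worker threads may now call getenv() freely: no setenv() runs anymore.
  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i) {
    workers.emplace_back([] {
      const char* dbg = std::getenv("EXAMPLE_DEBUG");  // safe: no concurrent setenv
      (void)dbg;
    });
  }
  for (auto& t : workers) {
    t.join();
  }

  // 3) Later logging reuses the cached value instead of re-querying it, which
  //    mirrors reusing ncclVersion in the LOG(INFO) statement above.
  std::cout << "NCCL version: " << version << "\n";
  return 0;
}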