mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
This adds logs if we can't acquire locks in NCCLUtils and ProcessGroupNCCL for 30s. This is motivated by some deadlocks we're seeing, and it's unclear whether they are in NCCL or on the PyTorch side of things. This required replacing most `std::mutex` with `std::timed_mutex` and `std::condition_variable_any` as appropriate. Test plan: existing CI for regressions; will add unit tests on `C10D_LOCK_GUARD`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/134131 Approved by: https://github.com/c-p-i-o, https://github.com/fduwjj
55 lines
1.2 KiB
C++
55 lines
1.2 KiB
C++
#include <gtest/gtest.h>
|
|
|
|
#include <future>
|
|
#include <thread>
|
|
|
|
#include <c10/util/Logging.h>
|
|
#include <torch/csrc/distributed/c10d/LockGuard.hpp>
|
|
|
|
// Verifies the RAII contract of C10D_LOCK_GUARD: the mutex is held for
// exactly the lifetime of the guard's scope and released on scope exit.
TEST(LockGuard, basic) {
  std::timed_mutex mutex;

  {
    C10D_LOCK_GUARD(lock, mutex);

    // The guard should be holding the mutex, so a second acquisition
    // attempt from this thread must fail.
    const bool acquiredWhileGuarded = mutex.try_lock();
    ASSERT_FALSE(acquiredWhileGuarded);
  }

  // The guard has been destroyed, so the mutex must be free again.
  const bool acquiredAfterGuard = mutex.try_lock();
  ASSERT_TRUE(acquiredAfterGuard);
  mutex.unlock();
}
|
|
|
|
// Verifies that lockWithLogging emits a periodic "waiting for lock" warning
// to stderr while the mutex is contended, and that it eventually acquires
// the lock once the holder releases it.
TEST(LockGuard, logging) {
  // set log level to INFO so the waiting-for-lock message is actually
  // emitted (0 == most verbose caffe2/c10 log level).
  FLAGS_caffe2_log_level = 0;

  std::timed_mutex mutex;

  // Hold the lock on the main thread so the background thread is forced to
  // wait and, after each 10ms timeout, log that it is still waiting.
  mutex.lock();

  auto loggingThread = std::async(std::launch::async, [&]() {
    // Deferred lock: lockWithLogging does the actual acquisition, retrying
    // with the given interval and logging on each timeout.
    std::unique_lock<std::timed_mutex> name{mutex, std::defer_lock};
    ::c10d::detail::lockWithLogging(
        name, std::chrono::milliseconds(10), "my lock", __FILE__, __LINE__);
  });

  // Poll stderr until the expected warning shows up; fail the test if it
  // has not appeared within 10 seconds (guards against a hung logger).
  auto deadline = std::chrono::system_clock::now() + std::chrono::seconds(10);
  while (true) {
    ASSERT_LT(std::chrono::system_clock::now(), deadline);

    // Capture a 20ms window of stderr — longer than the 10ms retry
    // interval, so each window should contain at least one warning.
    testing::internal::CaptureStderr();
    std::this_thread::sleep_for(std::chrono::milliseconds(20));
    std::string output = testing::internal::GetCapturedStderr();

    if (output.find("my lock: waiting for lock for 10ms") !=
        std::string::npos) {
      break;
    }
  }

  // Release the lock so lockWithLogging can acquire it and the background
  // thread can finish.
  mutex.unlock();

  // get() joins the async task and rethrows any exception it raised.
  loggingThread.get();
}
|