Move more TCPStore state from BackgroundThread to TCPStoreMasterDaemon, as it won't be used by the libuv backend. (#105674)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/105674
Approved by: https://github.com/H-Huang
ghstack dependencies: #105163, #105164, #105184, #105672
This commit is contained in:
Rodrigo Kumpera
2023-07-26 10:23:15 -07:00
committed by PyTorch MergeBot
parent 57a47ed905
commit c9c66819a1
3 changed files with 83 additions and 80 deletions

View File

@ -65,6 +65,7 @@ std::shared_ptr<TCPServer> TCPServer::start(const TCPStoreOptions& opts) {
// Builds the backend selected by opts.useLibUV, starts it, and wraps it in a
// TCPServer constructed from the daemon's port and the daemon itself.
auto startCore = [&opts]() {
auto daemon = opts.useLibUV ? create_libuv_tcpstore_backend(opts)
: create_tcpstore_backend(opts);
// The backend is started explicitly here rather than by its constructor.
daemon->start();
// std::move is only a cast and make_shared takes its arguments by reference,
// so daemon is still valid when port() is evaluated, whatever the argument
// evaluation order.
return std::make_shared<TCPServer>(daemon->port(), std::move(daemon));
};

View File

@ -28,11 +28,7 @@ namespace c10d {
namespace detail {
// Background thread parent class methods
// Constructs the base from an rvalue listening socket: the socket is moved
// into storeListenSocket_ and the stop signal is set up in the body.
BackgroundThread::BackgroundThread(Socket&& storeListenSocket)
: storeListenSocket_{std::move(storeListenSocket)} {
// Signal instance destruction to the daemon thread.
initStopSignal();
}
// Default constructor. Explicitly initialises is_running_ to false: before
// C++20 a default-constructed std::atomic<bool> holds an indeterminate value,
// so is_running() could otherwise report garbage until start() is called.
BackgroundThread::BackgroundThread() : is_running_(false) {}
// Out-of-line defaulted definition of the destructor. The class declares it
// pure virtual, but a definition is still required because destructors of
// derived classes invoke it.
BackgroundThread::~BackgroundThread() = default;
@ -44,63 +40,14 @@ void BackgroundThread::dispose() {
// Stop the run
stop();
// Join the thread
join();
// Close unclosed sockets
sockets_.clear();
// Now close the rest control pipe
closeStopSignal();
}
// Blocks the caller until the daemon thread has finished.
// NOTE(review): std::thread::join throws std::system_error when the thread is
// not joinable (for example if it was never started) — confirm callers only
// reach this after the thread has been launched.
void BackgroundThread::join() {
daemonThread_.join();
}
#ifdef _WIN32
void BackgroundThread::initStopSignal() {
ghStopEvent_ = CreateEvent(NULL, TRUE, FALSE, NULL);
if (ghStopEvent_ == NULL) {
TORCH_CHECK(
false,
"Failed to create the control pipe to start the "
"BackgroundThread run");
}
// Launches the daemon thread, which executes the virtual run() on this
// object, and then records that the thread is running.
void BackgroundThread::start() {
daemonThread_ = std::thread{&BackgroundThread::run, this};
// Set only after the thread object has been created, so the new thread may
// already be executing run() before this flag becomes true.
is_running_.store(true);
}
// Windows variant: releases the event handle stored in ghStopEvent_.
// The return value of CloseHandle is not checked.
void BackgroundThread::closeStopSignal() {
CloseHandle(ghStopEvent_);
}
// Windows variant: puts the stop event into the signaled state.
// The return value of SetEvent is not checked.
void BackgroundThread::stop() {
SetEvent(ghStopEvent_);
}
#else
// POSIX variant: creates the control pipe and stores both descriptors in
// controlPipeFd_. Raises through TORCH_CHECK when pipe() reports failure.
void BackgroundThread::initStopSignal() {
  const bool pipeFailed = (pipe(controlPipeFd_.data()) == -1);
  if (!pipeFailed) {
    return;
  }
  TORCH_CHECK(
      false,
      "Failed to create the control pipe to start the "
      "BackgroundThread run");
}
// POSIX variant: closes every control-pipe descriptor that is still open.
// Entries equal to -1 are treated as already closed and skipped.
void BackgroundThread::closeStopSignal() {
  for (const int pipeEnd : controlPipeFd_) {
    if (pipeEnd == -1) {
      continue;
    }
    ::close(pipeEnd);
  }
}
// POSIX variant: writes one byte to the write end of the control pipe, then
// closes that end and marks it as closed. Does nothing if the write end is
// already marked closed, so repeated calls are harmless.
void BackgroundThread::stop() {
  int& writeEnd = controlPipeFd_[1];
  if (writeEnd == -1) {
    return;
  }
  ::write(writeEnd, "\0", 1);
  // The write end is no longer needed once the byte has been sent.
  ::close(writeEnd);
  writeEnd = -1;
}
#endif
// Separate thread that is only launched on master
class TCPStoreMasterDaemon : public BackgroundThread {
public:
@ -110,8 +57,14 @@ class TCPStoreMasterDaemon : public BackgroundThread {
std::uint16_t port() const override;
protected:
void run() override;
void stop() override;
private:
void run();
void initStopSignal();
void closeStopSignal();
void queryFds(std::vector<struct pollfd>& fds);
void query(int socket);
void clearSocketWaitState(int socket);
@ -141,22 +94,83 @@ class TCPStoreMasterDaemon : public BackgroundThread {
std::unordered_map<std::string, std::vector<int>> waitingSockets_;
// From socket -> number of keys awaited
std::unordered_map<int, size_t> keysAwaited_;
Socket storeListenSocket_;
std::vector<Socket> sockets_{};
#ifdef _WIN32
const std::chrono::milliseconds checkTimeout_ = std::chrono::milliseconds{10};
HANDLE ghStopEvent_{};
#else
std::array<int, 2> controlPipeFd_{{-1, -1}};
#endif
};
// Simply start the daemon thread
TCPStoreMasterDaemon::TCPStoreMasterDaemon(Socket&& storeListenSocket)
: BackgroundThread{std::move(storeListenSocket)} {
daemonThread_ = std::thread{&TCPStoreMasterDaemon::run, this};
: storeListenSocket_{std::move(storeListenSocket)} {
initStopSignal();
}
// Tears the daemon down in a fixed order: dispose() first, then the client
// sockets are dropped, and finally the stop signal is released.
TCPStoreMasterDaemon::~TCPStoreMasterDaemon() {
// Must come first; the resources released below are only cleaned up once
// dispose() has returned.
dispose();
// it's now safe for us to cleanup
// Close unclosed sockets
sockets_.clear();
// Now close the rest control pipe
closeStopSignal();
}
// Returns the port reported by the daemon's listening socket.
std::uint16_t TCPStoreMasterDaemon::port() const {
return storeListenSocket_.port();
}
#ifdef _WIN32
// Windows variant: creates the event object used as the stop signal and
// stores its handle in ghStopEvent_. Raises through TORCH_CHECK when the
// event cannot be created.
void TCPStoreMasterDaemon::initStopSignal() {
  // Unnamed, manual-reset event that starts out non-signaled.
  ghStopEvent_ = CreateEvent(NULL, TRUE, FALSE, NULL);
  if (ghStopEvent_ != NULL) {
    return;
  }
  TORCH_CHECK(
      false,
      "Failed to create the control pipe to start the "
      "BackgroundThread run");
}
// Windows variant: releases the event handle stored in ghStopEvent_.
// The return value of CloseHandle is not checked.
void TCPStoreMasterDaemon::closeStopSignal() {
CloseHandle(ghStopEvent_);
}
// Windows variant: puts the stop event into the signaled state.
// The return value of SetEvent is not checked.
void TCPStoreMasterDaemon::stop() {
SetEvent(ghStopEvent_);
}
#else
// POSIX variant: opens the control pipe and records its two descriptors in
// controlPipeFd_. Raises through TORCH_CHECK when pipe() fails.
void TCPStoreMasterDaemon::initStopSignal() {
  const int rc = pipe(controlPipeFd_.data());
  if (rc != -1) {
    return;
  }
  TORCH_CHECK(
      false,
      "Failed to create the control pipe to start the "
      "BackgroundThread run");
}
// POSIX variant: closes whichever control-pipe descriptors are still open;
// slots holding -1 are skipped.
void TCPStoreMasterDaemon::closeStopSignal() {
  for (auto it = controlPipeFd_.begin(); it != controlPipeFd_.end(); ++it) {
    const int pipeEnd = *it;
    if (pipeEnd == -1) {
      continue;
    }
    ::close(pipeEnd);
  }
}
// POSIX variant: requests shutdown by writing a single byte to the write end
// of the control pipe, then closes that end and marks it closed. A second
// call finds the slot at -1 and returns without doing anything.
// NOTE(review): presumably the daemon's poll loop watches the read end of
// this pipe — that code is not visible here.
void TCPStoreMasterDaemon::stop() {
  int& writeEnd = controlPipeFd_[1];
  if (writeEnd == -1) {
    return;
  }
  ::write(writeEnd, "\0", 1);
  // Release the write end now that the byte has been sent.
  ::close(writeEnd);
  writeEnd = -1;
}
#endif
void TCPStoreMasterDaemon::queryFds(std::vector<struct pollfd>& fds) {
// Skipping the fds[0] and fds[1],
// fds[0] is master's listening socket

View File

@ -42,33 +42,21 @@ enum class WaitResponseType : uint8_t { STOP_WAITING, WAIT_CANCELED };
// shutdown sequence for the thread
// Abstract base for a TCPStore background daemon. It holds the daemon thread
// and offers start()/dispose(), while port(), run() and stop() are pure
// virtual and must be supplied by a concrete backend.
// NOTE(review): this listing appears to interleave pre- and post-change
// declarations (daemonThread_ and stop() are each declared twice, and two
// constructors are present) — confirm against the real header before relying
// on any single member.
class BackgroundThread {
public:
explicit BackgroundThread(Socket&& storeListenSocket);
explicit BackgroundThread();
// Pure virtual so the class stays abstract; a definition must still exist.
virtual ~BackgroundThread() = 0;
// Port the backend is listening on; implemented by each backend.
virtual std::uint16_t port() const = 0;
// Launches the daemon thread.
void start();
bool stop_requested();
protected:
void dispose();
Socket storeListenSocket_;
std::thread daemonThread_{};
std::vector<Socket> sockets_{};
#ifdef _WIN32
const std::chrono::milliseconds checkTimeout_ = std::chrono::milliseconds{10};
HANDLE ghStopEvent_{};
#else
// Both entries start at -1, i.e. no descriptor has been opened yet.
std::array<int, 2> controlPipeFd_{{-1, -1}};
#endif
// Body of the daemon thread; implemented by each backend.
virtual void run() = 0;
// Asks the running daemon to shut down; implemented by each backend.
virtual void stop() = 0;
// Reads the atomic running flag.
bool is_running() { return is_running_.load(); }
private:
// Initialization for shutdown signal
void initStopSignal();
// Triggers the shutdown signal
void stop();
// Joins the thread
void join();
// Clean up the shutdown signal
void closeStopSignal();
// No default member initializer is given here.
// NOTE(review): confirm the flag is initialised before is_running() can be
// called.
std::atomic<bool> is_running_;
std::thread daemonThread_{};
};
// Factory returning an owning pointer to a BackgroundThread backend built
// from the given options.
std::unique_ptr<BackgroundThread> create_tcpstore_backend(const TCPStoreOptions& opts);