mirror of
https://github.com/pytorch/pytorch.git
synced 2025-11-12 14:54:55 +08:00
This does a round trip request on socket connect -- this allows for detecting connection resets etc and retrying before the non-retryable application requests are sent. This adds support for PING to both the libuv and legacy backend. Example error: ``` [trainer85612|12]:W0701 13:41:43.421574 4776 TCPStore.cpp:182] [c10d] recvValue failed on SocketImpl(fd=24, ...): Connection reset by peer [trainer85612|12]:Exception raised from recvBytes at /mnt/code/pytorch/torch/csrc/distributed/c10d/Utils.hpp:669 (most recent call first): ... [trainer85612|12]:#9 c10d::TCPStore::incrementValueBy(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, long) from /packages/.../conda/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so:84809637 [trainer85612|12]:#10 c10d::TCPStore::waitForWorkers() from /packages/.../conda/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so:84812868 [trainer85612|12]:#11 c10d::TCPStore::TCPStore(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, c10d::TCPStoreOptions const&) from /packages/.../conda/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so:84814775 ``` Test plan: ``` python test/distributed/test_store.py -v ``` ``` tristanr@devvm4382 ~/pytorch (d4l3k/tcpstore_ping)> python ~/pt_tests/tcpstore_large_test.py starting pool started 90000 started 30000 started 70000 started 20000 started 80000 started 60000 started 0 [W702 16:16:25.301681870 TCPStore.cpp:343] [c10d] Starting store with 100000 workers but somaxconn is 4096.This might cause instability during bootstrap, consider increasing it. 
init 20000 set 20000 init 80000 set 80000 init 70000 set 70000 init 60000 set 60000 init 30000 set 30000 init 90000 set 90000 started 40000 init 40000 set 40000 started 50000 init 50000 set 50000 started 10000 init 10000 set 10000 init 0 set 0 run finished 617.2992351055145 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/129985 Approved by: https://github.com/rsdcastro, https://github.com/kurman
75 lines
1.5 KiB
C++
75 lines
1.5 KiB
C++
#pragma once
|
|
|
|
#include <thread>
|
|
|
|
#include <torch/csrc/distributed/c10d/TCPStore.hpp>
|
|
#include <torch/csrc/distributed/c10d/socket.h>
|
|
|
|
#ifdef _WIN32
|
|
#include <io.h>
|
|
#include <winsock2.h>
|
|
#else
|
|
#include <poll.h>
|
|
#include <unistd.h>
|
|
#endif
|
|
|
|
namespace c10d::detail {
|
|
|
|
// Magic number for client validation.
// Declared `inline constexpr` (C++17) so that every translation unit that
// includes this header shares one definition, instead of each getting its
// own internal-linkage copy as with `static const`.
inline constexpr uint32_t validationMagicNumber = 0x3C85F7CE;
|
|
|
|
// Identifies the kind of request being made to the store.
// The numeric values are spelled out explicitly; they are the same values
// the enumerators receive from plain declaration order.
// NOTE(review): these values are presumably exchanged between client and
// server, so existing numbers should not be reassigned - confirm.
enum class QueryType : uint8_t {
  VALIDATE = 0,
  SET = 1,
  COMPARE_SET = 2,
  GET = 3,
  ADD = 4,
  CHECK = 5,
  WAIT = 6,
  GETNUMKEYS = 7,
  DELETE_KEY = 8,
  APPEND = 9,
  MULTI_GET = 10,
  MULTI_SET = 11,
  CANCEL_WAIT = 12,
  PING = 13,
};
|
|
|
|
// Result codes for a check request.
// NOTE(review): presumably the reply to QueryType::CHECK - confirm.
enum class CheckResponseType : uint8_t {
  READY = 0,
  NOT_READY = 1,
};
|
|
|
|
// Result codes for a wait request.
// NOTE(review): presumably the reply to QueryType::WAIT, with WAIT_CANCELED
// corresponding to QueryType::CANCEL_WAIT - confirm.
enum class WaitResponseType : uint8_t {
  STOP_WAITING = 0,
  WAIT_CANCELED = 1,
};
|
|
|
|
// Abstract base class that handles the thread state for
// TCPStoreMasterDaemon. It contains the Windows/Unix implementations used
// to signal a shutdown sequence for the thread.
class BackgroundThread {
 public:
  explicit BackgroundThread();

  // Pure virtual, which keeps the class abstract. A definition is still
  // required elsewhere, since derived destructors invoke it.
  virtual ~BackgroundThread() = 0;

  // Port of the concrete backend; supplied by subclasses.
  virtual std::uint16_t port() const = 0;

  // Defined out of line.
  // NOTE(review): presumably launches run() on daemonThread_ - confirm.
  void start();

  // Defined out of line.
  // NOTE(review): presumably the inverse of is_running() - confirm.
  bool stop_requested();

 protected:
  // Defined out of line.
  // NOTE(review): presumably requests stop() and joins the thread - confirm.
  void dispose();

  // Hooks implemented by the concrete backend.
  virtual void run() = 0;
  virtual void stop() = 0;

  // Returns the current value of the running flag (sequentially consistent
  // load, i.e. the default ordering).
  bool is_running() {
    return is_running_.load(std::memory_order_seq_cst);
  }

 private:
  std::atomic<bool> is_running_{false};
  std::thread daemonThread_;
};
|
|
|
|
std::unique_ptr<BackgroundThread> create_tcpstore_backend(
|
|
const TCPStoreOptions& opts);
|
|
std::unique_ptr<BackgroundThread> create_libuv_tcpstore_backend(
|
|
const TCPStoreOptions& opts);
|
|
// Reports whether the libuv TCPStore backend is available.
// NOTE(review): defined elsewhere; presumably determined by build
// configuration - confirm.
bool is_libuv_tcpstore_backend_available();
|
|
|
|
} // namespace c10d::detail
|