Files
pytorch/torch/csrc/distributed/c10d/TCPStoreBackend.hpp
Tristan Rice 9ee8c18309 TCPStore: add ping to verify network connectivity on connect (#129985)
This does a round trip request on socket connect -- this allows for detecting connection resets etc and retrying before the non-retryable application requests are sent.

This adds support for PING to both the libuv and legacy backend.

Example error:
```
[trainer85612|12]:W0701 13:41:43.421574  4776 TCPStore.cpp:182] [c10d] recvValue failed on SocketImpl(fd=24, ...): Connection reset by peer
[trainer85612|12]:Exception raised from recvBytes at /mnt/code/pytorch/torch/csrc/distributed/c10d/Utils.hpp:669 (most recent call first):
...
[trainer85612|12]:#9 c10d::TCPStore::incrementValueBy(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, long) from /packages/.../conda/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so:84809637
[trainer85612|12]:#10 c10d::TCPStore::waitForWorkers() from /packages/.../conda/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so:84812868
[trainer85612|12]:#11 c10d::TCPStore::TCPStore(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> >, c10d::TCPStoreOptions const&) from /packages/.../conda/lib/python3.10/site-packages/torch/lib/libtorch_cpu.so:84814775
```

Test plan:

```
python test/distributed/test_store.py -v
```

```
tristanr@devvm4382 ~/pytorch (d4l3k/tcpstore_ping)> python ~/pt_tests/tcpstore_large_test.py
starting pool
started 90000
started 30000
started 70000
started 20000
started 80000
started 60000
started 0
[W702 16:16:25.301681870 TCPStore.cpp:343] [c10d] Starting store with 100000 workers but somaxconn is 4096.This might cause instability during bootstrap, consider increasing it.
init 20000
set 20000
init 80000
set 80000
init 70000
set 70000
init 60000
set 60000
init 30000
set 30000
init 90000
set 90000
started 40000
init 40000
set 40000
started 50000
init 50000
set 50000
started 10000
init 10000
set 10000
init 0
set 0
run finished 617.2992351055145
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/129985
Approved by: https://github.com/rsdcastro, https://github.com/kurman
2024-07-03 02:09:44 +00:00

75 lines
1.5 KiB
C++

#pragma once
#include <thread>
#include <torch/csrc/distributed/c10d/TCPStore.hpp>
#include <torch/csrc/distributed/c10d/socket.h>
#ifdef _WIN32
#include <io.h>
#include <winsock2.h>
#else
#include <poll.h>
#include <unistd.h>
#endif
namespace c10d::detail {
// Magic number for client validation.
static const uint32_t validationMagicNumber = 0x3C85F7CE;
enum class QueryType : uint8_t {
VALIDATE,
SET,
COMPARE_SET,
GET,
ADD,
CHECK,
WAIT,
GETNUMKEYS,
DELETE_KEY,
APPEND,
MULTI_GET,
MULTI_SET,
CANCEL_WAIT,
PING,
};
enum class CheckResponseType : uint8_t { READY, NOT_READY };
enum class WaitResponseType : uint8_t { STOP_WAITING, WAIT_CANCELED };
// Abstract base class to handle thread state for TCPStoreMasterDaemon.
// Contains the windows/unix implementations to signal a
// shutdown sequence for the thread
class BackgroundThread {
public:
explicit BackgroundThread();
virtual ~BackgroundThread() = 0;
virtual std::uint16_t port() const = 0;
void start();
bool stop_requested();
protected:
void dispose();
virtual void run() = 0;
virtual void stop() = 0;
bool is_running() {
return is_running_.load();
}
private:
std::atomic<bool> is_running_{false};
std::thread daemonThread_{};
};
std::unique_ptr<BackgroundThread> create_tcpstore_backend(
const TCPStoreOptions& opts);
std::unique_ptr<BackgroundThread> create_libuv_tcpstore_backend(
const TCPStoreOptions& opts);
bool is_libuv_tcpstore_backend_available();
} // namespace c10d::detail