mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
This adds queue operations as described in https://github.com/pytorch/pytorch/issues/150943. This works by adding two new operations `queue_push` and `queue_pop`. The semantics are designed to be blocking with a timeout. Pushing will always succeed as the queue is infinite size. Popping will first call `wait` until the key is ready and then pop the value from the queue. This implements queues for only: HashStore, TCPStore w/ libuv. FileStore and the legacy backends are not supported. `wait` and `check` work for queue operations though queue_push will only wake up the first waiter rather than all of them. This also has a few cleanups to error types/documentation in related code. Example trace: ``` [I409 16:51:43.963833529 TCPStoreLibUvBackend.cpp:829] [c10d - trace] validate magic:1015412686 address:[localhost]:55816 [I409 16:51:43.963845838 TCPStoreLibUvBackend.cpp:842] [c10d - trace] ping nonce:2840795 address:[localhost]:55816 [I409 16:51:43.963902914 TCPStoreLibUvBackend.cpp:911] [c10d - trace] add key:init/ val:1 address:[localhost]:55816 [I409 16:51:43.963939389 TCPStoreLibUvBackend.cpp:977] [c10d - trace] wait key_count:1 keys[0]:init/ address:[localhost]:55816 [I409 16:51:43.963974842 TCPStoreLibUvBackend.cpp:893] [c10d - trace] get key:init/ address:[localhost]:55816 [I409 16:51:43.964071909 TCPStoreLibUvBackend.cpp:1121] [c10d - trace] queue_push key:/test_prefix/test_queue_support address:[localhost]:55816 [I409 16:51:43.964080221 TCPStoreLibUvBackend.cpp:940] [c10d - trace] check key_count:1 keys[0]:/test_prefix/foo address:[localhost]:55816 [I409 16:51:43.964108584 TCPStoreLibUvBackend.cpp:1121] [c10d - trace] queue_push key:/test_prefix/foo address:[localhost]:55816 [I409 16:51:43.964123207 TCPStoreLibUvBackend.cpp:1121] [c10d - trace] queue_push key:/test_prefix/foo address:[localhost]:55816 [I409 16:51:43.964128194 TCPStoreLibUvBackend.cpp:940] [c10d - trace] check key_count:1 keys[0]:/test_prefix/foo address:[localhost]:55816 [I409 
16:51:43.964156347 TCPStoreLibUvBackend.cpp:977] [c10d - trace] wait key_count:1 keys[0]:/test_prefix/foo address:[localhost]:55816 [I409 16:51:43.964187493 TCPStoreLibUvBackend.cpp:977] [c10d - trace] wait key_count:1 keys[0]:/test_prefix/foo address:[localhost]:55816 [I409 16:51:43.964217709 TCPStoreLibUvBackend.cpp:1133] [c10d - trace] queue_pop key:/test_prefix/foo address:[localhost]:55816 [I409 16:51:43.964324300 TCPStoreLibUvBackend.cpp:977] [c10d - trace] wait key_count:1 keys[0]:/test_prefix/foo address:[localhost]:55816 [I409 16:51:43.964354495 TCPStoreLibUvBackend.cpp:1133] [c10d - trace] queue_pop key:/test_prefix/foo address:[localhost]:55816 [I409 16:51:43.964416299 TCPStoreLibUvBackend.cpp:940] [c10d - trace] check key_count:1 keys[0]:/test_prefix/foo address:[localhost]:55816 [I409 16:51:43.964458733 TCPStoreLibUvBackend.cpp:977] [c10d - trace] wait key_count:1 keys[0]:/test_prefix/non_existant address:[localhost]:55816 [W409 16:51:43.974516585 socket.cpp:460] [c10d] waitForInput: poll for socket SocketImpl(fd=75, addr=[localhost]:55816, remote=[localhost]:46641) returned 0, likely a timeout [W409 16:51:43.974559169 socket.cpp:485] [c10d] waitForInput: socket SocketImpl(fd=75, addr=[localhost]:55816, remote=[localhost]:46641) timed out after 10ms [I409 16:51:43.974600451 TCPStoreLibUvBackend.cpp:1101] [c10d - trace] cancel_wait address:[localhost]:55816 ``` Test plan: ``` $ pytest test/distributed/test_store.py -k queue -v -s test/distributed/test_store.py::FileStoreTest::test_queues SKIPPED [0.4351s] (Store does not support queues) test/distributed/test_store.py::HashStoreTest::test_queues PASSED [0.0009s] test/distributed/test_store.py::PrefixFileStoreTest::test_queues SKIPPED [0.0006s] (Store does not support queues) test/distributed/test_store.py::TCPStoreTest::test_queues SKIPPED [0.0012s] (Store does not support queues) test/distributed/test_store.py::LibUvTCPStoreTest::test_queues PASSED [0.0014s] 
test/distributed/test_store.py::PrefixTCPStoreTest::test_queues PASSED [0.0014s] ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/150969 Approved by: https://github.com/XilunWu, https://github.com/fduwjj
78 lines
1.5 KiB
C++
78 lines
1.5 KiB
C++
#pragma once
|
|
|
|
#include <thread>
|
|
|
|
#include <torch/csrc/distributed/c10d/TCPStore.hpp>
|
|
#include <torch/csrc/distributed/c10d/socket.h>
|
|
|
|
#ifdef _WIN32
|
|
#include <io.h>
|
|
#include <winsock2.h>
|
|
#else
|
|
#include <poll.h>
|
|
#include <unistd.h>
|
|
#endif
|
|
|
|
namespace c10d::detail {
|
|
|
|
// Magic number for client validation (0x3C85F7CE is 1015412686 in decimal).
static constexpr uint32_t validationMagicNumber{0x3C85F7CE};
|
|
|
|
// Kinds of query, stored in a single byte. Every enumerator carries an
// explicit value so the numbering cannot shift by accident when another
// entry is added; new entries should be appended with the next free number.
enum class QueryType : uint8_t {
  VALIDATE = 0,
  SET = 1,
  COMPARE_SET = 2,
  GET = 3,
  ADD = 4,
  CHECK = 5,
  WAIT = 6,
  GETNUMKEYS = 7,
  DELETE_KEY = 8,
  APPEND = 9,
  MULTI_GET = 10,
  MULTI_SET = 11,
  CANCEL_WAIT = 12,
  PING = 13,
  QUEUE_PUSH = 14,
  QUEUE_POP = 15,
  QUEUE_LEN = 16,
};
|
|
|
|
// One-byte reply code for a check request.
// NOTE(review): presumably READY means every queried key is present; confirm
// against the backend implementation.
enum class CheckResponseType : uint8_t {
  READY = 0,
  NOT_READY = 1,
};
|
|
|
|
// One-byte reply code for a wait request.
// NOTE(review): presumably WAIT_CANCELED is the answer to a CANCEL_WAIT
// query; confirm against the backend implementation.
enum class WaitResponseType : uint8_t {
  STOP_WAITING = 0,
  WAIT_CANCELED = 1,
};
|
|
|
|
// Abstract base class to handle thread state for TCPStoreMasterDaemon.
// Contains the windows/unix implementations to signal a
// shutdown sequence for the thread.
//
// Concrete backends supply port(), run() and stop(); the class itself holds
// the worker thread object and an atomic "running" flag.
class BackgroundThread {
 public:
  explicit BackgroundThread();

  // Pure virtual so the class stays abstract; defined out of line.
  virtual ~BackgroundThread() = 0;

  // Port reported by the concrete backend.
  virtual std::uint16_t port() const = 0;

  // Defined out of line.
  // NOTE(review): presumably launches run() on daemonThread_ -- confirm in
  // the implementation file.
  void start();

  // Defined out of line.
  // NOTE(review): presumably the logical inverse of is_running() -- confirm.
  bool stop_requested();

 protected:
  // Defined out of line.
  // NOTE(review): presumably requests stop() and joins daemonThread_;
  // confirm before relying on it from a subclass destructor.
  void dispose();

  // Body of the background work, implemented by the subclass.
  virtual void run() = 0;

  // Backend-specific shutdown signal, implemented by the subclass.
  virtual void stop() = 0;

  // Current value of the running flag. Const-qualified because it only
  // performs an atomic read, which lets subclasses query it from their own
  // const member functions.
  bool is_running() const {
    return is_running_.load();
  }

 private:
  std::atomic<bool> is_running_{false};
  std::thread daemonThread_{};
};
|
|
|
|
std::unique_ptr<BackgroundThread> create_tcpstore_backend(
|
|
const TCPStoreOptions& opts);
|
|
std::unique_ptr<BackgroundThread> create_libuv_tcpstore_backend(
|
|
const TCPStoreOptions& opts);
|
|
// Reports whether the libuv TCPStore backend is available.
// Declaration only; the definition lives in another translation unit.
auto is_libuv_tcpstore_backend_available() -> bool;
|
|
|
|
} // namespace c10d::detail
|