Files
pytorch/test/cpp/c10d/BackoffTest.cpp
Tristan Rice 0298560ca2 TCPStore: improve connect and retry logic (#129261)
We've been facing issues where TCPStore can successfully connect but then fail in the validate() function due to connection resets caused by listen-backlog queue overflow; this is aggravated when the reset option is enabled and when init times are long.

This PR does a few things:
* Retry that connect and validate up to the specified timeout.
* Use exponential backoff for the retry logic with jitter instead of a fixed 1s sleep.
* Eliminate the `sleep(std::chrono::milliseconds(numWorkers))` on init which can add significant delays to startup. This is no longer necessary per @XilunWu https://github.com/pytorch/pytorch/pull/116141

Test plan:

```
python test/distributed/test_store.py -v
./build/bin/BackoffTest
```

Will do internal testing with some large scale jobs to ensure TCPStore works correctly.

At 4k scale: 4x improvement

```
tristanr@devvm4382 ~/pt_tests [SIGABRT]> time TORCH_SHOW_CPP_STACKTRACES=1 python tcpstore_large_test.py                                                                                                   (pytorch-3.10)
started 0
init 0
set 0
joined all

________________________________________________________
Executed in    1.98 secs    fish           external
   usr time    0.93 secs   91.00 micros    0.93 secs
   sys time    1.98 secs  954.00 micros    1.97 secs

tristanr@devvm4382 ~/pt_tests> conda activate torchdrive-3.10                                                                                                                                              (pytorch-3.10)
tristanr@devvm4382 ~/pt_tests> time TORCH_SHOW_CPP_STACKTRACES=1 python tcpstore_large_test.py                                                                                                          (torchdrive-3.10)
started 0
init 0
set 0
joined all

________________________________________________________
Executed in    8.20 secs    fish           external
   usr time    2.15 secs    0.00 micros    2.15 secs
   sys time    2.76 secs  843.00 micros    2.76 secs
```

```py
import time
import os
import threading
from multiprocessing import Pool

WORLD_SIZE = 10000

import torch.distributed as dist

def run(rank):
    should_log = rank % (WORLD_SIZE // 10) == 0
    if should_log:
        print(f"started {rank}")
    store = dist.TCPStore(
        host_name="devvm4382.nao0.facebook.com",
        port=29500,
        world_size=WORLD_SIZE,
        is_master=rank == 0,
        use_libuv=True,
    )
    if should_log:
        print(f"init {rank}")
    store.set(f"key{rank}", "1234")
    if should_log:
        print(f"set {rank}")
    del store

def noop(rank):
    pass

print("starting pool")
with Pool(WORLD_SIZE) as pool:
    pool.map(noop, range(WORLD_SIZE), 1)
    print("pool hot")
    start = time.time()
    pool.map(run, range(WORLD_SIZE), 1)
    print("run finished", time.time()-start)
```

```
tristanr@devvm4382 ~/pt_tests> python tcpstore_large_test.py                                                                                                                                (pytorch-3.10)
starting pool
pool hot
started 0
[W624 16:58:09.086081750 TCPStore.cpp:343] [c10d] Starting store with 10000 workers but somaxconn is 4096.This might cause instability during bootstrap, consider increasing it.
started 1000
init 1000
set 1000
started 2000
init 2000
set 2000
started 3000
init 3000
set 3000
started 4000
init 4000
set 4000
started 5000
init 5000
set 5000
started 6000
init 6000
set 6000
started 7000
init 7000
set 7000
started 8000
init 8000
set 8000
started 9000
init 9000
set 9000
init 0
set 0
run finished 0.705092191696167
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/129261
Approved by: https://github.com/rsdcastro, https://github.com/wconstab, https://github.com/kurman, https://github.com/XilunWu, https://github.com/c-p-i-o
2024-06-25 19:24:22 +00:00

69 lines
2.4 KiB
C++

#include "StoreTestCommon.hpp"

#include <chrono>
#include <iostream>
#include <thread>

#include <c10/util/irange.h>
#include <torch/csrc/distributed/c10d/Backoff.hpp>
TEST(BackoffTest, exponentialBackoffDefaults) {
  // A default-constructed backoff must expose the documented defaults:
  // 500ms initial interval, 60s cap, 1.5x growth, +/-50% jitter.
  using std::chrono::milliseconds;

  c10d::ExponentialBackoffWithJitter defaults;
  EXPECT_EQ(defaults.initialInterval, milliseconds(500));
  EXPECT_EQ(defaults.maxInterval, milliseconds(60000));
  EXPECT_EQ(defaults.multiplier, 1.5);
  EXPECT_EQ(defaults.randomizationFactor, 0.5);
}
TEST(BackoffTest, exponentialBackoff) {
  using std::chrono::milliseconds;

  c10d::ExponentialBackoffWithJitter backoff;
  backoff.randomizationFactor = 0.0;
  backoff.multiplier = 2.0;
  backoff.maxInterval = milliseconds(5000);

  // With jitter disabled the sequence is deterministic: it doubles each
  // step and then saturates at maxInterval.
  const milliseconds expected[] = {
      milliseconds(500),
      milliseconds(1000),
      milliseconds(2000),
      milliseconds(4000),
      milliseconds(5000),
      milliseconds(5000),
  };
  for (const auto& want : expected) {
    EXPECT_EQ(backoff.nextBackoff(), want);
  }

  // reset() restarts the sequence from the initial interval.
  backoff.reset();
  EXPECT_EQ(backoff.nextBackoff(), milliseconds(500));
  EXPECT_EQ(backoff.nextBackoff(), milliseconds(1000));
}
// Verifies jitter bounds. (Test name previously misspelled
// "expontentialBackoffRandomization".)
TEST(BackoffTest, exponentialBackoffRandomization) {
  c10d::ExponentialBackoffWithJitter backoff;
  backoff.initialInterval = std::chrono::milliseconds(1000);
  backoff.randomizationFactor = 0.5;
  backoff.multiplier = 1.0;
  backoff.maxInterval = std::chrono::milliseconds(5000);
  // With multiplier 1.0 the base interval stays at 1000ms, so a
  // randomization factor of 0.5 must keep every sample within
  // [500ms, 1500ms]. Sample repeatedly since the jitter is random.
  for ([[maybe_unused]] const auto i : c10::irange(100)) {
    auto backoffDur = backoff.nextBackoff();
    EXPECT_GE(backoffDur, std::chrono::milliseconds(500));
    EXPECT_LE(backoffDur, std::chrono::milliseconds(1500));
  }
}
TEST(BackoffTest, fixedBackoff) {
  const std::chrono::milliseconds interval{1000};
  c10d::FixedBackoff backoff{interval};

  // A fixed backoff always yields the configured interval...
  EXPECT_EQ(backoff.nextBackoff(), interval);
  EXPECT_EQ(backoff.nextBackoff(), interval);

  // ...and reset() does not change what it returns.
  backoff.reset();
  EXPECT_EQ(backoff.nextBackoff(), interval);
}
TEST(BackoffTest, sleep) {
  std::chrono::milliseconds sleepTime{10};
  c10d::FixedBackoff backoff{sleepTime};

  EXPECT_EQ(backoff.nextBackoff(), sleepTime);

  // Measure with steady_clock: high_resolution_clock is not guaranteed
  // to be monotonic (it may alias system_clock), which could make this
  // elapsed-time assertion flaky if the wall clock is adjusted mid-test.
  const auto start = std::chrono::steady_clock::now();
  backoff.sleepBackoff();
  const auto dur = std::chrono::steady_clock::now() - start;
  EXPECT_GE(dur, sleepTime);
}