mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
We've been facing issues where TCPStore can successfully connect but then fail in the validate() function due to resets from listen backlog queue overflow when combined with reset enabled as well as long init times. This PR does a few things: * Retry that connect and validate up to the specified timeout. * Use exponential backoff for the retry logic with jitter instead of a fixed 1s sleep. * Eliminate the `sleep(std::chrono::milliseconds(numWorkers))` on init which can add significant delays to startup. This is no longer necessary per @XilunWu https://github.com/pytorch/pytorch/pull/116141 Test plan: ``` python test/distributed/test_store.py -v ./build/bin/BackoffTest ``` Will do internal testing with some large scale jobs to ensure TCPStore works correctly. At 4k scale: 4x improvement ``` tristanr@devvm4382 ~/pt_tests [SIGABRT]> time TORCH_SHOW_CPP_STACKTRACES=1 python tcpstore_large_test.py (pytorch-3.10) started 0 init 0 set 0 joined all ________________________________________________________ Executed in 1.98 secs fish external usr time 0.93 secs 91.00 micros 0.93 secs sys time 1.98 secs 954.00 micros 1.97 secs tristanr@devvm4382 ~/pt_tests> conda activate torchdrive-3.10 (pytorch-3.10) tristanr@devvm4382 ~/pt_tests> time TORCH_SHOW_CPP_STACKTRACES=1 python tcpstore_large_test.py (torchdrive-3.10) started 0 init 0 set 0 joined all ________________________________________________________ Executed in 8.20 secs fish external usr time 2.15 secs 0.00 micros 2.15 secs sys time 2.76 secs 843.00 micros 2.76 secs ``` ```py import time import os import threading from multiprocessing import Pool WORLD_SIZE = 10000 import torch.distributed as dist def run(rank): should_log = rank % (WORLD_SIZE // 10) == 0 if should_log: print(f"started {rank}") store = dist.TCPStore( host_name="devvm4382.nao0.facebook.com", port=29500, world_size=WORLD_SIZE, is_master=rank == 0, use_libuv=True, ) if should_log: print(f"init {rank}") store.set(f"key{rank}", "1234") if should_log: 
print(f"set {rank}") del store def noop(rank): pass print("starting pool") with Pool(WORLD_SIZE) as pool: pool.map(noop, range(WORLD_SIZE), 1) print("pool hot") start = time.time() pool.map(run, range(WORLD_SIZE), 1) print("run finished", time.time()-start) ``` ``` tristanr@devvm4382 ~/pt_tests> python tcpstore_large_test.py (pytorch-3.10) starting pool pool hot started 0 [W624 16:58:09.086081750 TCPStore.cpp:343] [c10d] Starting store with 10000 workers but somaxconn is 4096.This might cause instability during bootstrap, consider increasing it. started 1000 init 1000 set 1000 started 2000 init 2000 set 2000 started 3000 init 3000 set 3000 started 4000 init 4000 set 4000 started 5000 init 5000 set 5000 started 6000 init 6000 set 6000 started 7000 init 7000 set 7000 started 8000 init 8000 set 8000 started 9000 init 9000 set 9000 init 0 set 0 run finished 0.705092191696167 ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/129261 Approved by: https://github.com/rsdcastro, https://github.com/wconstab, https://github.com/kurman, https://github.com/XilunWu, https://github.com/c-p-i-o
69 lines
2.4 KiB
C++
69 lines
2.4 KiB
C++
#include <c10/util/irange.h>
|
|
#include "StoreTestCommon.hpp"
|
|
|
|
#include <iostream>
|
|
#include <thread>
|
|
|
|
#include <torch/csrc/distributed/c10d/Backoff.hpp>
|
|
|
|
// Verifies that a default-constructed ExponentialBackoffWithJitter exposes
// the expected default tuning parameters (initial 500ms, 60s cap,
// 1.5x growth, +/-50% jitter).
TEST(BackoffTest, exponentialBackoffDefaults) {
  using namespace std::chrono_literals;

  c10d::ExponentialBackoffWithJitter defaultBackoff;

  EXPECT_EQ(defaultBackoff.initialInterval, 500ms);
  EXPECT_EQ(defaultBackoff.maxInterval, 60000ms);
  EXPECT_EQ(defaultBackoff.multiplier, 1.5);
  EXPECT_EQ(defaultBackoff.randomizationFactor, 0.5);
}
|
|
|
|
// With jitter disabled the backoff sequence is fully deterministic: each
// step multiplies the interval by `multiplier` and saturates at
// `maxInterval`. reset() restarts the sequence from the initial interval.
TEST(BackoffTest, exponentialBackoff) {
  c10d::ExponentialBackoffWithJitter backoff;
  backoff.randomizationFactor = 0.0; // no jitter -> deterministic values
  backoff.multiplier = 2.0;
  backoff.maxInterval = std::chrono::milliseconds(5000);

  const std::chrono::milliseconds expected[] = {
      std::chrono::milliseconds(500),
      std::chrono::milliseconds(1000),
      std::chrono::milliseconds(2000),
      std::chrono::milliseconds(4000),
      std::chrono::milliseconds(5000), // clamped to maxInterval
      std::chrono::milliseconds(5000), // stays clamped
  };
  for (const auto& want : expected) {
    EXPECT_EQ(backoff.nextBackoff(), want);
  }

  // After reset() the sequence must start over.
  backoff.reset();
  EXPECT_EQ(backoff.nextBackoff(), std::chrono::milliseconds(500));
  EXPECT_EQ(backoff.nextBackoff(), std::chrono::milliseconds(1000));
}
|
|
|
|
// With multiplier 1.0 the base interval stays fixed at 1000ms; a
// randomizationFactor of 0.5 means every jittered sample must land in
// base * (1 +/- 0.5) == [500ms, 1500ms]. Sample many times to exercise
// the random draw.
// NOTE: test name typo fixed ("expontential" -> "exponential").
TEST(BackoffTest, exponentialBackoffRandomization) {
  c10d::ExponentialBackoffWithJitter backoff;
  backoff.initialInterval = std::chrono::milliseconds(1000);
  backoff.randomizationFactor = 0.5;
  backoff.multiplier = 1.0; // keep the base interval constant
  backoff.maxInterval = std::chrono::milliseconds(5000);

  for (int i = 0; i < 100; i++) {
    auto backoffDur = backoff.nextBackoff();
    EXPECT_GE(backoffDur, std::chrono::milliseconds(500));
    EXPECT_LE(backoffDur, std::chrono::milliseconds(1500));
  }
}
|
|
|
|
// FixedBackoff must always yield the same interval, before and after
// reset().
TEST(BackoffTest, fixedBackoff) {
  constexpr std::chrono::milliseconds kInterval{1000};
  c10d::FixedBackoff backoff{kInterval};

  EXPECT_EQ(backoff.nextBackoff(), kInterval);
  EXPECT_EQ(backoff.nextBackoff(), kInterval);
  backoff.reset();
  EXPECT_EQ(backoff.nextBackoff(), kInterval);
}
|
|
|
|
// sleepBackoff() must block for at least the configured interval.
TEST(BackoffTest, sleep) {
  std::chrono::milliseconds sleepTime{10};
  c10d::FixedBackoff backoff{sleepTime};

  EXPECT_EQ(backoff.nextBackoff(), sleepTime);

  // Measure with steady_clock (monotonic) rather than
  // high_resolution_clock: the latter may alias system_clock and jump
  // backwards on a clock adjustment, which would make this timing
  // assertion flaky.
  auto start = std::chrono::steady_clock::now();
  backoff.sleepBackoff();
  auto dur = std::chrono::steady_clock::now() - start;
  EXPECT_GE(dur, sleepTime);
}
|