From aeccec755de69ece191f4d8266c20df18d96005d Mon Sep 17 00:00:00 2001 From: Pieter Noordhuis Date: Wed, 11 Jul 2018 21:29:33 -0700 Subject: [PATCH] In Gloo backend use ring reduction by default (#9309) Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/9309 Ring reduction is faster when you're dealing with a small number of processes. At around 16 processes, the halving/doubling algorithm becomes faster, so it is still used from that point on. Reviewed By: apaszke Differential Revision: D8785364 fbshipit-source-id: 4a03326266e473026d943787186e149d0cc489f0 --- torch/lib/c10d/ProcessGroupGloo.cpp | 44 +++++++++++++++++++++-------- 1 file changed, 32 insertions(+), 12 deletions(-) diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp index 8ff0af867b62..e4d77b9afd73 100644 --- a/torch/lib/c10d/ProcessGroupGloo.cpp +++ b/torch/lib/c10d/ProcessGroupGloo.cpp @@ -1,8 +1,10 @@ #include "ProcessGroupGloo.hpp" #include +#include #include #include +#include #include #include #include @@ -320,22 +322,40 @@ void ProcessGroupGloo::createAllreduce(AlgorithmEntry& entry) { auto& context = contexts_[0]; if (backend == at::kCPU) { - entry.algorithm = std::unique_ptr<::gloo::Algorithm>( - new ::gloo::AllreduceHalvingDoubling( - context, - getDataPointers(entry.src), - entry.src[0].numel(), - reductionFunction(key.reduceOp))); + if (getSize() < 16) { + entry.algorithm = std::unique_ptr<::gloo::Algorithm>( + new ::gloo::AllreduceRingChunked( + context, + getDataPointers(entry.src), + entry.src[0].numel(), + reductionFunction(key.reduceOp))); + } else { + entry.algorithm = std::unique_ptr<::gloo::Algorithm>( + new ::gloo::AllreduceHalvingDoubling( + context, + getDataPointers(entry.src), + entry.src[0].numel(), + reductionFunction(key.reduceOp))); + } return; } if (backend == at::kCUDA) { - entry.algorithm = std::unique_ptr<::gloo::Algorithm>( - new ::gloo::CudaAllreduceHalvingDoubling( - context, - getDataPointers(entry.src), - entry.src[0].numel(), - getStreamVector(entry))); + 
if (getSize() < 16) { + entry.algorithm = std::unique_ptr<::gloo::Algorithm>( + new ::gloo::CudaAllreduceRingChunked( + context, + getDataPointers(entry.src), + entry.src[0].numel(), + getStreamVector(entry))); + } else { + entry.algorithm = std::unique_ptr<::gloo::Algorithm>( + new ::gloo::CudaAllreduceHalvingDoubling( + context, + getDataPointers(entry.src), + entry.src[0].numel(), + getStreamVector(entry))); + } return; }