From aeccec755de69ece191f4d8266c20df18d96005d Mon Sep 17 00:00:00 2001
From: Pieter Noordhuis <pietern@fb.com>
Date: Wed, 11 Jul 2018 21:29:33 -0700
Subject: [PATCH] In Gloo backend use ring reduction by default (#9309)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/9309

This is faster when you're dealing with a small number of processes.
Around the 16 processes mark the halving/doubling algorithm is faster.

Reviewed By: apaszke

Differential Revision: D8785364

fbshipit-source-id: 4a03326266e473026d943787186e149d0cc489f0
---
 torch/lib/c10d/ProcessGroupGloo.cpp | 44 +++++++++++++++++++++--------
 1 file changed, 32 insertions(+), 12 deletions(-)
diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp
index 8ff0af867b62..e4d77b9afd73 100644
--- a/torch/lib/c10d/ProcessGroupGloo.cpp
+++ b/torch/lib/c10d/ProcessGroupGloo.cpp
@@ -1,8 +1,10 @@
 #include "ProcessGroupGloo.hpp"
 
 #include <gloo/allreduce_halving_doubling.h>
+#include <gloo/allreduce_ring_chunked.h>
 #include <gloo/broadcast_one_to_all.h>
 #include <gloo/cuda_allreduce_halving_doubling.h>
+#include <gloo/cuda_allreduce_ring_chunked.h>
 #include <gloo/cuda_broadcast_one_to_all.h>
 #include <gloo/rendezvous/context.h>
 #include <gloo/transport/tcp/device.h>
@@ -320,22 +322,40 @@ void ProcessGroupGloo::createAllreduce(AlgorithmEntry& entry) {
   auto& context = contexts_[0];
 
   if (backend == at::kCPU) {
-    entry.algorithm = std::unique_ptr<::gloo::Algorithm>(
-        new ::gloo::AllreduceHalvingDoubling<T>(
-            context,
-            getDataPointers<T>(entry.src),
-            entry.src[0].numel(),
-            reductionFunction<T>(key.reduceOp)));
+    if (getSize() < 16) {
+      entry.algorithm = std::unique_ptr<::gloo::Algorithm>(
+          new ::gloo::AllreduceRingChunked<T>(
+              context,
+              getDataPointers<T>(entry.src),
+              entry.src[0].numel(),
+              reductionFunction<T>(key.reduceOp)));
+    } else {
+      entry.algorithm = std::unique_ptr<::gloo::Algorithm>(
+          new ::gloo::AllreduceHalvingDoubling<T>(
+              context,
+              getDataPointers<T>(entry.src),
+              entry.src[0].numel(),
+              reductionFunction<T>(key.reduceOp)));
+    }
     return;
   }
 
   if (backend == at::kCUDA) {
-    entry.algorithm = std::unique_ptr<::gloo::Algorithm>(
-        new ::gloo::CudaAllreduceHalvingDoubling<T>(
-            context,
-            getDataPointers<T>(entry.src),
-            entry.src[0].numel(),
-            getStreamVector(entry)));
+    if (getSize() < 16) {
+      entry.algorithm = std::unique_ptr<::gloo::Algorithm>(
+          new ::gloo::CudaAllreduceRingChunked<T>(
+              context,
+              getDataPointers<T>(entry.src),
+              entry.src[0].numel(),
+              getStreamVector(entry)));
+    } else {
+      entry.algorithm = std::unique_ptr<::gloo::Algorithm>(
+          new ::gloo::CudaAllreduceHalvingDoubling<T>(
+              context,
+              getDataPointers<T>(entry.src),
+              entry.src[0].numel(),
+              getStreamVector(entry)));
+    }
     return;
   }