From 184bfbc3d7b37e8f202f4938f6ea9ba557c93b1e Mon Sep 17 00:00:00 2001
From: Rodrigo Kumpera
Date: Thu, 30 Mar 2023 22:18:09 +0000
Subject: [PATCH] Move functional collectives to the right namespace (#97793)

This moves them from `torch._C._nn` to `torch._C._dist`

Pull Request resolved: https://github.com/pytorch/pytorch/pull/97793
Approved by: https://github.com/albanD
---
 aten/src/ATen/native/native_functions.yaml   | 12 ++--
 build.bzl                                    |  1 +
 build_variables.bzl                          |  1 +
 caffe2/CMakeLists.txt                        |  1 +
 pt_template_srcs.bzl                         |  1 +
 tools/BUCK.bzl                               |  1 +
 tools/autograd/gen_python_functions.py       | 14 ++++
 .../templates/python_dist_functions.cpp      | 68 +++++++++++++++++++
 torch/csrc/Module.cpp                        |  2 +
 torch/csrc/autograd/python_dist_functions.h  |  9 +++
 torch/distributed/_functional_collectives.py | 10 +--
 11 files changed, 108 insertions(+), 12 deletions(-)
 create mode 100644 tools/autograd/templates/python_dist_functions.cpp
 create mode 100644 torch/csrc/autograd/python_dist_functions.h

diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml
index 893715750e53..53cbfd09ebba 100644
--- a/aten/src/ATen/native/native_functions.yaml
+++ b/aten/src/ATen/native/native_functions.yaml
@@ -14718,29 +14718,25 @@
 # Collectives
 - func: all_reduce(Tensor self, str reduceOp, str tag, int[] ranks, int group_size) -> Tensor
-  # This should be changed to distributed but it requires changes all over the place to work
-  python_module: nn
+  python_module: dist
   dispatch:
     CompositeExplicitAutograd: all_reduce
   variants: function

 - func: all_gather_into_tensor(Tensor shard, str tag, int[] ranks, int group_size) -> Tensor
-  # This should be changed to distributed but it requires changes all over the place to work
-  python_module: nn
+  python_module: dist
   dispatch:
     CompositeExplicitAutograd: all_gather_into_tensor
   variants: function

 - func: reduce_scatter_tensor(Tensor input, str reduceOp, int scatter_dim, str tag, int[] ranks, int group_size) -> Tensor
-  # This should be changed to distributed but it requires changes all over the place to work
-  python_module: nn
+  python_module: dist
   dispatch:
     CompositeExplicitAutograd: reduce_scatter_tensor
   variants: function

 - func: wait_tensor(Tensor self) -> Tensor
-  # This should be changed to distributed but it requires changes all over the place to work
-  python_module: nn
+  python_module: dist
   dispatch:
     CompositeExplicitAutograd: wait_tensor
diff --git a/build.bzl b/build.bzl
index 6d1bcf7a0601..613e0727d935 100644
--- a/build.bzl
+++ b/build.bzl
@@ -260,6 +260,7 @@ _GENERATED_AUTOGRAD_PYTHON_CPP = [
     "torch/csrc/autograd/generated/python_nested_functions.cpp",
     "torch/csrc/autograd/generated/python_fft_functions.cpp",
     "torch/csrc/autograd/generated/python_linalg_functions.cpp",
+    "torch/csrc/autograd/generated/python_dist_functions.cpp",
     "torch/csrc/autograd/generated/python_return_types.cpp",
     "torch/csrc/autograd/generated/python_enum_tag.cpp",
     "torch/csrc/autograd/generated/python_sparse_functions.cpp",
diff --git a/build_variables.bzl b/build_variables.bzl
index 0f2ee809b586..2d5ba1eaf11e 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -930,6 +930,7 @@ def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"):
         "torch/csrc/autograd/generated/python_nn_functions.cpp",
         "torch/csrc/autograd/generated/python_fft_functions.cpp",
         "torch/csrc/autograd/generated/python_linalg_functions.cpp",
+        "torch/csrc/autograd/generated/python_dist_functions.cpp",
         "torch/csrc/autograd/generated/python_enum_tag.cpp",
"torch/csrc/autograd/generated/python_return_types.cpp", "torch/csrc/autograd/generated/python_sparse_functions.cpp", diff --git a/caffe2/CMakeLists.txt b/caffe2/CMakeLists.txt index 3c71c32bb935..4e373104dbe4 100644 --- a/caffe2/CMakeLists.txt +++ b/caffe2/CMakeLists.txt @@ -394,6 +394,7 @@ set(GENERATED_CXX_PYTHON "${TORCH_SRC_DIR}/csrc/autograd/generated/python_nested_functions.cpp" "${TORCH_SRC_DIR}/csrc/autograd/generated/python_sparse_functions.cpp" "${TORCH_SRC_DIR}/csrc/autograd/generated/python_special_functions.cpp" + "${TORCH_SRC_DIR}/csrc/autograd/generated/python_dist_functions.cpp" "${TORCH_SRC_DIR}/csrc/autograd/generated/python_return_types.cpp" "${TORCH_SRC_DIR}/csrc/autograd/generated/python_enum_tag.cpp" ) diff --git a/pt_template_srcs.bzl b/pt_template_srcs.bzl index feb14818700f..4caaea191a49 100644 --- a/pt_template_srcs.bzl +++ b/pt_template_srcs.bzl @@ -149,6 +149,7 @@ def get_generate_code_bin_outs(): "autograd/generated/python_return_types.cpp": ["autograd/generated/python_return_types.cpp"], "autograd/generated/python_sparse_functions.cpp": ["autograd/generated/python_sparse_functions.cpp"], "autograd/generated/python_special_functions.cpp": ["autograd/generated/python_special_functions.cpp"], + "autograd/generated/python_dist_functions.cpp": ["autograd/generated/python_dist_functions.cpp"], "autograd/generated/python_torch_functions_0.cpp": ["autograd/generated/python_torch_functions_0.cpp"], "autograd/generated/python_torch_functions_1.cpp": ["autograd/generated/python_torch_functions_1.cpp"], "autograd/generated/python_torch_functions_2.cpp": ["autograd/generated/python_torch_functions_2.cpp"], diff --git a/tools/BUCK.bzl b/tools/BUCK.bzl index 25c85afb3f88..2e4a668dfa61 100644 --- a/tools/BUCK.bzl +++ b/tools/BUCK.bzl @@ -135,6 +135,7 @@ def define_tools_targets( "autograd/templates/python_return_types.cpp", "autograd/templates/python_sparse_functions.cpp", "autograd/templates/python_special_functions.cpp", + "autograd/templates/python_dist_functions.cpp", "autograd/templates/python_torch_functions.cpp", "autograd/templates/python_variable_methods.cpp", "autograd/templates/variable_factories.h", diff --git a/tools/autograd/gen_python_functions.py b/tools/autograd/gen_python_functions.py index 795df8c2ec44..7dca30613f5d 100644 --- a/tools/autograd/gen_python_functions.py +++ b/tools/autograd/gen_python_functions.py @@ -239,6 +239,10 @@ def is_py_special_function(f: NativeFunction) -> bool: return f.python_module == "special" +def is_py_dist_function(f: NativeFunction) -> bool: + return f.python_module == "dist" + + # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # # Main Function @@ -345,6 +349,15 @@ def gen( symint=symint, ) + create_python_bindings( + fm, + functions, + is_py_dist_function, + "torch.distributed.functional", + "python_dist_functions.cpp", + method=False, + ) + # Currently, we only use `functions` to generate `return_types` bindings. # All methods which return namedtuple have function variant at this point. 
     # If any method only operator with namedtuple is added in the future,
@@ -902,6 +915,7 @@ if(check_has_torch_function(self_)) {{
             "torch.nested": "THPNestedVariableFunctionsModule",
             "torch.sparse": "THPSparseVariableFunctionsModule",
             "torch.special": "THPSpecialVariableFunctionsModule",
+            "torch.distributed.functional": "THPDistVariableFunctionsModule",
         }[module]
         if module
         else "THPVariableClass"
diff --git a/tools/autograd/templates/python_dist_functions.cpp b/tools/autograd/templates/python_dist_functions.cpp
new file mode 100644
index 000000000000..416ac5b2c7c5
--- /dev/null
+++ b/tools/autograd/templates/python_dist_functions.cpp
@@ -0,0 +1,68 @@
+#define TORCH_ASSERT_ONLY_METHOD_OPERATORS
+// ${generated_comment}
+
+#include "torch/csrc/Device.h"
+#include "torch/csrc/DynamicTypes.h"
+#include "torch/csrc/Exceptions.h"
+#include "torch/csrc/autograd/python_dist_functions.h"
+#include "torch/csrc/autograd/python_return_types.h"
+#include "torch/csrc/autograd/python_variable.h"
+#include "torch/csrc/autograd/utils/wrap_outputs.h"
+#include "torch/csrc/autograd/utils/python_arg_parsing.h"
+#include "torch/csrc/utils/pycfunction_helpers.h"
+#include "torch/csrc/utils/python_arg_parser.h"
+#include "torch/csrc/utils/structseq.h"
+#include "torch/csrc/utils/tensor_memoryformats.h"
+
+#ifndef AT_PER_OPERATOR_HEADERS
+#include <ATen/Functions.h>
+#else
+$ops_headers
+#endif
+
+using at::Tensor;
+using at::Scalar;
+using at::MemoryFormat;
+using at::Generator;
+using at::IntArrayRef;
+using at::ArrayRef;
+
+using namespace torch::autograd::utils;
+
+namespace torch { namespace autograd {
+
+// generated forward declarations start here
+
+${py_forwards}
+
+static PyMethodDef dist_functions[] = {
+  ${py_method_defs}
+  {NULL}
+};
+
+static PyObject* THPDistVariableFunctionsModule = NULL;
+
+void initDistFunctions(PyObject* module) {
+  static struct PyModuleDef def = {
+      PyModuleDef_HEAD_INIT,
+      "torch._C._dist",
+      NULL,
+      -1,
+      dist_functions
+  };
+  PyObject* dist = PyModule_Create(&def);
+  THPDistVariableFunctionsModule = dist;
+  if (!dist) {
+    throw python_error();
+  }
+  // steals a reference to dist
+  if (PyModule_AddObject(module, "_dist", dist) != 0) {
+    throw python_error();
+  }
+}
+
+// generated methods start here
+
+${py_methods}
+
+}} // namespace torch::autograd
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index 97ad819a67c6..c9ff2de5879f 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -41,6 +41,7 @@
 #include
 #include
 #include
+#include <torch/csrc/autograd/python_dist_functions.h>
 #include
 #include
 #include
@@ -1328,6 +1329,7 @@ PyObject* initModule() {
   torch::autograd::initNestedFunctions(module);
   torch::autograd::initSparseFunctions(module);
   torch::autograd::initSpecialFunctions(module);
+  torch::autograd::initDistFunctions(module);
   torch::autograd::init_legacy_variable(module);
   torch::profiler::initPythonBindings(module);
   torch::python::init_bindings(module);
diff --git a/torch/csrc/autograd/python_dist_functions.h b/torch/csrc/autograd/python_dist_functions.h
new file mode 100644
index 000000000000..f5f937f51976
--- /dev/null
+++ b/torch/csrc/autograd/python_dist_functions.h
@@ -0,0 +1,9 @@
+#pragma once
+
+namespace torch {
+namespace autograd {
+
+void initDistFunctions(PyObject* module);
+
+} // namespace autograd
+} // namespace torch
diff --git a/torch/distributed/_functional_collectives.py b/torch/distributed/_functional_collectives.py
index d53b47ed939e..952a33258d34 100644
--- a/torch/distributed/_functional_collectives.py
+++ b/torch/distributed/_functional_collectives.py
@@ -105,7 +105,7 @@ class AsyncCollectiveTensor(torch.Tensor):
     Use it inside functional collective pytorch wrappers like the following:
     def functional_collective(self, group, tag):
         tag, rankset, group_size = _expand_group(group, tag)
-        tensor = torch._C._nn.{collective}(self, tag, rankset, group_size)
+        tensor = torch._C._dist.{collective}(self, tag, rankset, group_size)
         res = AsyncCollectiveTensor(tensor)
         _register_wrapper_tensor(res, tensor)
         return res
@@ -254,7 +254,7 @@ def wait_tensor(tensor):

     Waiting follows device semantics, which means blocking on CPU and synchronizing streams on CUDA.
     """
-    return torch._C._nn.wait_tensor(tensor)  # type: ignore[attr-defined]
+    return torch._C._dist.wait_tensor(tensor)  # type: ignore[attr-defined]


 def all_reduce(self: torch.Tensor, reduceOp: str, group: RANK_TYPES, tag: str = ""):
@@ -275,7 +275,7 @@ def all_reduce(self: torch.Tensor, reduceOp: str, group: RANK_TYPES, tag: str = ""):
     that information and perform collective algebraic optimization. Use other forms of input for that.
     """
     tag, rankset, group_size = _expand_group(group, tag)
-    tensor = torch._C._nn.all_reduce(self, reduceOp, tag, rankset, group_size)  # type: ignore[attr-defined]
+    tensor = torch._C._dist.all_reduce(self, reduceOp, tag, rankset, group_size)  # type: ignore[attr-defined]
     res = AsyncCollectiveTensor(tensor)
     _register_wrapper_tensor(res, tensor)
     return res
@@ -307,7 +307,9 @@ def reduce_scatter_tensor(
     assert (
         self.size(0) % group_size == 0
     ), f"input dimension 0 ({self.size(0)} must be a multiple of group_size {group_size}"
-    tensor = torch._C._nn.reduce_scatter_tensor(self, reduceOp, scatter_dim, tag, rankset, group_size)  # type: ignore[attr-defined]
+    tensor = torch._C._dist.reduce_scatter_tensor(  # type: ignore[attr-defined]
+        self, reduceOp, scatter_dim, tag, rankset, group_size
+    )
     res = AsyncCollectiveTensor(tensor)
     _register_wrapper_tensor(res, tensor)
     return res
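
For reference (not part of the patch above): a minimal sketch of how the relocated bindings are exercised from user code. The user-facing wrappers in torch.distributed._functional_collectives keep the same signatures; only the underlying C extension namespace changes. The torchrun launch, "gloo" backend, and tensor values below are illustrative assumptions, not anything the patch prescribes.

    import torch
    import torch.distributed as dist
    import torch.distributed._functional_collectives as ft_c

    # Assumes launch via torchrun (or equivalent) so RANK/WORLD_SIZE are set;
    # the backend choice is illustrative.
    dist.init_process_group("gloo")
    ranks = list(range(dist.get_world_size()))

    t = torch.ones(4)
    # The wrapper now dispatches to torch._C._dist.all_reduce instead of
    # torch._C._nn.all_reduce; the Python-level call is unchanged.
    res = ft_c.all_reduce(t, "sum", ranks)
    # res is an AsyncCollectiveTensor; using it (here, an add) waits on the
    # underlying collective before the data is read.
    print(res + 0)

    dist.destroy_process_group()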