more build updates:

(1) Add nccl and cnmem as submodules.
(2) Add an MPI ops fallback test.
(3) A bit more Blob interface (GetRaw, ShareExternal).
(4) Fix tests.
(5) Rename caffe2.python.io to caffe2.python.dataio to avoid name conflicts.
(6) In the build system, autogenerate __init__.py instead of keeping manual
rules just to copy over an empty __init__.py.
Yangqing Jia
2016-07-22 23:58:24 -07:00
parent b2c2d0b70c
commit 1ede7a7ff0
29 changed files with 156 additions and 1629 deletions

6
.gitmodules vendored
View File

@@ -1,3 +1,9 @@
[submodule "third_party/pybind11"]
path = third_party/pybind11
url = https://github.com/pybind/pybind11.git
[submodule "third_party/nccl"]
path = third_party/nccl
url = https://github.com/nvidia/nccl.git
[submodule "third_party/cnmem"]
path = third_party/cnmem
url = https://github.com/nvidia/cnmem.git

View File

@@ -16,6 +16,6 @@ lint:
@find caffe2 -type f -exec python brewtool/cpplint.py {} \;
linecount:
@cloc --read-lang-def=brewtool/caffe.cloc caffe2 pycaffe2 || \
@cloc --read-lang-def=brewtool/caffe.cloc caffe2 || \
echo "Cloc is not available on the machine. You can install cloc with " && \
echo " sudo apt-get install cloc"

View File

@@ -108,6 +108,7 @@ class Config(object):
'arch=compute_30,code=sm_30',
'arch=compute_35,code=sm_35',
'arch=compute_50,code=sm_50',
'arch=compute_61,code=sm_61',
]
# additional CUDA cflags to pass to nvcc.
CUDA_CFLAGS = []

View File

@@ -1,4 +0,0 @@
filegroup(
name = "caffe_python",
srcs = ["__init__.py"],
)

View File

View File

@@ -4,11 +4,3 @@ proto_library(
name = 'caffe_proto',
srcs = ['caffe.proto'],
)
filegroup(
name = "caffe_proto_py",
srcs = ["__init__.py"],
deps = [
"//caffe:caffe_python",
]
)

View File

@@ -26,7 +26,7 @@ cc_library(
deps = [
":core",
":core_gpu_cu",
"//third_party/cnmem:cnmem",
"//third_party:cnmem",
"//third_party:cuda",
],
whole_archive = True,
@@ -48,6 +48,7 @@ cc_test(
excludes=["*gpu_test*"]),
deps = [
":core",
"//caffe2/operators:core_ops",
"//third_party:gtest",
"//caffe2/test:caffe2_gtest_main",
],
@@ -63,11 +64,6 @@ cc_test(
],
)
filegroup(
name = "caffe2_python",
srcs = ["__init__.py"],
)
cc_library(
name = "all_available_ops",
srcs = [],
@@ -79,6 +75,7 @@ cc_library(
optional_deps = [
"//caffe2/operators:core_ops_gpu",
"//caffe2/operators:core_ops_cudnn",
"//caffe2/contrib/nccl:nccl_ops",
"//caffe2/cuda_rtc:rtc_ops",
"//caffe2/db:db_gpu",
"//caffe2/image:image_ops",

View File

@@ -1,5 +0,0 @@
"""
Caffe2: A General Tool for Neural Networks.
"""
__author__ = 'Yangqing Jia'

10
caffe2/contrib/nccl/BREW Normal file
View File

@@ -0,0 +1,10 @@
cc_library(
name = "nccl_ops",
srcs = Glob(["*.cc"]),
hdrs = Glob(["*.h"]),
deps = [
"//caffe2:core_gpu",
"//third_party:nccl",
],
whole_archive = True,
)

View File

@@ -58,6 +58,9 @@ class Blob {
return *static_cast<const T*>(pointer_);
}
const void* GetRaw() const { return pointer_; }
void* GetRaw() { return pointer_; }
/**
* @brief Gets a mutable pointer to the stored object.
*
@@ -73,6 +76,7 @@
return static_cast<T*>(pointer_);
} else {
if (is_new_object) *is_new_object = true;
VLOG(1) << "Create new mutable object " << TypeMeta::Name<T>();
return Reset<T>(new T());
}
}
@@ -87,28 +91,53 @@
*/
template <class T>
T* Reset(T* allocated) {
if (pointer_) {
CHECK_NOTNULL(destroy_)(pointer_);
if (pointer_ && destroy_) {
destroy_(pointer_);
}
VLOG(1) << "Create new mutable object " << TypeMeta::Name<T>();
meta_ = TypeMeta::Make<T>();
pointer_ = static_cast<void*>(allocated);
destroy_ = &Destroy<T>;
return allocated;
}
/**
* Sets the underlying object to the allocated one, but does not take over
* the ownership of the passed in pointer. If there is already an object in
* the Blob, the old object is freed.
*
* Unlike Reset, this does not take over the ownership of the pointer and the
* caller is responsible for making sure that the lifetime of the allocated
* object outlasts the lifetime of any access to this blob, until another Reset
* call is made or the blob is destructed.
*/
template <class T>
typename std::remove_const<T>::type* ShareExternal(
typename std::remove_const<T>::type* allocated) {
return static_cast<T*>(
ShareExternal(static_cast<void*>(allocated),
TypeMeta::Make<typename std::remove_const<T>::type>()));
}
void* ShareExternal(void* allocated, const TypeMeta& meta) {
if (pointer_ && destroy_) {
destroy_(pointer_);
}
meta_ = meta;
pointer_ = static_cast<void*>(allocated);
destroy_ = nullptr;
return allocated;
}
/**
* Resets the Blob to an empty one.
*/
inline void Reset() {
if (pointer_) {
CHECK_NOTNULL(destroy_)(pointer_);
pointer_ = nullptr;
meta_ = TypeMeta();
destroy_ = nullptr;
if (pointer_ && destroy_) {
destroy_(pointer_);
}
pointer_ = nullptr;
meta_ = TypeMeta();
destroy_ = nullptr;
}
/**

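For context, a minimal usage sketch of the Reset/ShareExternal semantics documented above; Foo stands in for any default-constructible type and the snippet is illustrative only:

Blob blob;
// Reset takes ownership: the Blob calls the registered destructor on the
// stored pointer at the next Reset/ShareExternal or at destruction.
blob.Reset(new Foo());
// ShareExternal frees the previously owned Foo, then points at `external`
// without taking ownership; the caller must keep `external` alive until the
// Blob stops referring to it.
Foo external;
blob.ShareExternal<Foo>(&external);
CHECK(blob.IsType<Foo>());
// Reset() clears the Blob; destroy_ is nullptr for shared objects, so
// `external` is left untouched.
blob.Reset();
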
View File

@@ -69,6 +69,32 @@ TEST(BlobTest, BlobWrongType) {
ASSERT_THROW(blob.Get<int>(), EnforceNotMet);
}
TEST(BlobTest, BlobReset) {
Blob blob;
std::unique_ptr<Foo> foo(new Foo());
EXPECT_TRUE(blob.Reset(foo.release()) != nullptr);
// Also test that Reset works.
blob.Reset();
}
TEST(BlobTest, BlobShareExternalPointer) {
Blob blob;
std::unique_ptr<Foo> foo(new Foo());
EXPECT_EQ(blob.ShareExternal<Foo>(foo.get()), foo.get());
EXPECT_TRUE(blob.IsType<Foo>());
// Also test that Reset works.
blob.Reset();
}
TEST(BlobTest, BlobShareExternalObject) {
Blob blob;
Foo foo;
EXPECT_EQ(blob.ShareExternal<Foo>(&foo), &foo);
EXPECT_TRUE(blob.IsType<Foo>());
// Also test that Reset works.
blob.Reset();
}
TEST(BlobTest, StringSerialization) {
const std::string kTestString = "Hello world?";
Blob blob;
@@ -558,6 +584,7 @@ TYPED_TEST(TypedTensorTest, BigTensorSerialization) {
"DUMMY_ENGINE");
Workspace ws;
auto load_op = CreateOperator(op_def, &ws);
EXPECT_TRUE(load_op != nullptr);
LOG(INFO) << "Running operator";
load_op->Run();

View File

@@ -1,3 +1,4 @@
#include <chrono>
#include <future>
#include <random>
#include <thread>
@@ -55,6 +56,8 @@ namespace {
void TEST_GetStreamAddress(cudaStream_t* ptr) {
CUDAContext context(0);
*ptr = context.cuda_stream();
// Sleep for a while so we have concurrent thread executions
std::this_thread::sleep_for(std::chrono::seconds(1));
}
} // namespace

View File

@@ -35,9 +35,13 @@ class MPIBroadcastOp final : public Operator<Context> {
bool RunOnDevice() override {
MPI_Comm comm = OperatorBase::Input<MPICommonWorldWrapper>(0).comm();
CAFFE_ENFORCE(OperatorBase::OutputIsType<Tensor<Context>>(0),
"Output is of wrong type.");
auto* output = Output(0);
// Make sure that output is already allocated.
CHECK_GT(output->size(), 0);
CAFFE_ENFORCE(output->size() > 0,
"Broadcast op uses in-place operation so the output "
"should be already allocated.");
MPI_CHECK(MPI_Bcast(
output->raw_mutable_data(),
output->nbytes(),

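The enforce above encodes the contract that MPIBroadcast runs in place, so every rank must have the output tensor allocated before the op executes. A hedged sketch of that pre-allocation, with illustrative blob name and size:

Workspace ws;
// Pre-allocate the tensor that MPIBroadcast will overwrite in place.
auto* tensor = ws.CreateBlob("X")->GetMutable<TensorCPU>();
tensor->Resize(16);
tensor->mutable_data<float>();  // make sure memory is actually allocated
// ... then run the MPIBroadcast operator with "X" as its in-place tensor
// blob, alongside the common-world input shown above.
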
View File

@@ -54,8 +54,18 @@ class GPUFallbackOp final : public Operator<CUDAContext> {
bool RunOnDevice() override {
for (int i = 0; i < InputSize(); ++i) {
if (OperatorBase::InputIsType<TensorCUDA>(i)) {
local_input_blobs_[i]->template GetMutable<TensorCPU>()->CopyFrom(
Input(i), &context_);
} else {
VLOG(1) << "Input " << i << " is not TensorCUDA. Skipping copy.";
// Note(jiayq): This removes a const but conceptually
// local_input_blobs will only be used as const blob input for the
// base op so we are still fine.
local_input_blobs_[i]->ShareExternal(
const_cast<void*>(OperatorBase::Inputs()[i]->GetRaw()),
OperatorBase::Inputs()[i]->meta());
}
}
// Sync to make sure copies are done.
context_.FinishDeviceComputation();
@@ -65,6 +75,9 @@ class GPUFallbackOp final : public Operator<CUDAContext> {
return false;
}
for (int i = 0; i < OutputSize(); ++i) {
CAFFE_ENFORCE(local_output_blobs_[i]->IsType<TensorCPU>(),
"GPU fallback op currently does not support non-TensorCPU "
"output type.");
Output(i)->CopyFrom(
local_output_blobs_[i]->template Get<TensorCPU>(), &context_);
}

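For illustration, a hedged sketch of how a CPU-only operator might be exposed on CUDA devices through this wrapper; MyCPUOnlyOp is a hypothetical operator class, and the template-argument usage is assumed from the wrapper's design rather than shown in this hunk:

// Hypothetical registration: TensorCUDA inputs are copied to CPU, CPU blobs
// are shared zero-copy via ShareExternal, the wrapped CPU op runs, and its
// TensorCPU outputs are copied back to the CUDA outputs.
REGISTER_CUDA_OPERATOR(MyCPUOnly, GPUFallbackOp<MyCPUOnlyOp>);
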
View File

@@ -4,11 +4,3 @@ proto_library(
name = 'caffe2_proto',
srcs = Glob(['*.proto']),
)
filegroup(
name = "caffe2_proto_py",
srcs = ["__init__.py"],
deps = [
"//caffe2:caffe2_python",
]
)

View File

@@ -39,8 +39,8 @@ py_library(
srcs=Glob(["*.py"], excludes=["*_test.py"]),
deps=[
":caffe2_python_cpu",
"//caffe/proto:caffe_proto_py",
"//caffe2/proto:caffe2_proto_py",
"//caffe/proto:caffe_proto",
"//caffe2/proto:caffe2_proto",
"//caffe2/python/mint:mint",
],
optional_deps=[

View File

@@ -1,4 +0,0 @@
import atexit
from . import core, utils, workspace
from caffe2.proto import caffe2_pb2

View File

@@ -14,7 +14,7 @@ from __future__ import print_function
from __future__ import unicode_literals
from caffe2.python import core, workspace
from caffe2.python.io import Reader, Writer
from caffe2.python.dataio import Reader, Writer
from caffe2.python.schema import Struct
import numpy as np

View File

@@ -1,7 +1,6 @@
py_library(
name = "mint",
srcs = [
"__init__.py",
"app.py",
"static/css/simple-sidebar.css",
"templates/index.html",

47
third_party/BREW vendored
View File

@@ -92,10 +92,49 @@ cc_thirdparty_target(
],
)
cc_thirdparty_target(
name="cnmen",
deps=["//third_party/cnmem:cnmem"],
cc_obj_files = [],
shell_script(
name = "cnmem_header",
srcs = ["cnmem/include/cnmem.h"],
commands=[
"DST=$CAFFE2_GENDIR/third_party/include/",
"mkdir -p $DST",
"cp $CAFFE2_SRCDIR/$CAFFE2_CWD/cnmem/include/cnmem.h $DST/",
],
)
cc_library(
name = "cnmem",
srcs = [
"cnmem/src/cnmem.cpp",
],
deps = [
":cnmem_header",
":cuda",
]
)
shell_script(
name = "nccl_header",
srcs = ["nccl/src/nccl.h"],
commands=[
"DST=$CAFFE2_GENDIR/third_party/include/",
"mkdir -p $DST",
"cp $CAFFE2_SRCDIR/$CAFFE2_CWD/nccl/src/nccl.h $DST/",
],
)
cuda_library(
name = "nccl",
srcs = Glob(["nccl/src/*.cu"]),
deps = [
":nccl_header",
":cuda",
],
compiler_flags=[
"-Wno-switch", # NCCL does not follow strict switch enum check.
"-DNCCL_MAJOR=1 -DNCCL_MINOR=2 -DNCCL_PATCH=3",
"-DCUDA_MAJOR=__CUDACC_VER_MAJOR__ -DCUDA_MINOR=__CUDACC_VER_MINOR__",
],
)
###############################################################################

1
third_party/cnmem vendored Submodule

Submodule third_party/cnmem added at 28a182d495

View File

@@ -1,24 +0,0 @@
# We need to copy over the header to the right folder.
shell_script(
name = "cnmem_header",
srcs = ["cnmem.h"],
commands=[
"DST=$CAFFE2_GENDIR/third_party/include/",
"mkdir -p $DST",
"cp $CAFFE2_SRCDIR/$CAFFE2_CWD/cnmem.h $DST/",
],
)
cuda_library(
name = "cnmem",
srcs = [
"cnmem.cpp",
],
hdrs = [
"cnmem.h",
],
deps = [
"cnmem_header",
"//third_party:cuda",
]
)

File diff suppressed because it is too large.

View File

@@ -1,263 +0,0 @@
/* **********************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* ********************************************************************** */
#pragma once
#ifdef __cplusplus
#include "cstdio"
#else
#include "stdio.h"
#endif
#include "cuda_runtime_api.h"
#if defined(_MSC_VER) || defined(WIN32)
#ifdef CNMEM_DLLEXPORT
#define CNMEM_API __declspec(dllexport)
#else
#define CNMEM_API __declspec(dllimport)
#endif
#else
#ifdef CNMEM_DLLEXPORT
#define CNMEM_API __attribute__((visibility ("default")))
#else
#define CNMEM_API
#endif
#endif
#define CNMEM_VERSION 100 // It corresponds to 1.0.0
#ifdef __cplusplus
extern "C" {
#endif
/* ********************************************************************************************* */
typedef enum
{
CNMEM_STATUS_SUCCESS = 0,
CNMEM_STATUS_CUDA_ERROR,
CNMEM_STATUS_INVALID_ARGUMENT,
CNMEM_STATUS_NOT_INITIALIZED,
CNMEM_STATUS_OUT_OF_MEMORY,
CNMEM_STATUS_UNKNOWN_ERROR
} cnmemStatus_t;
/* ********************************************************************************************* */
typedef enum
{
CNMEM_FLAGS_DEFAULT = 0, /// Default flags.
CNMEM_FLAGS_CANNOT_GROW = 1, /// Prevent the manager from growing its memory consumption.
CNMEM_FLAGS_CANNOT_STEAL = 2, /// Prevent the manager from stealing memory.
} cnmemManagerFlags_t;
/* ********************************************************************************************* */
typedef struct cnmemDevice_t_
{
/** The device number. */
int device;
/** The size to allocate for that device. If 0, the implementation chooses the size. */
size_t size;
/** The number of named streams associated with the device. The NULL stream is not counted. */
int numStreams;
/** The streams associated with the device. It can be NULL. The NULL stream is managed. */
cudaStream_t *streams;
/** The size reserved for each stream. It can be 0. */
size_t *streamSizes;
} cnmemDevice_t;
/**
* \brief Initialize the library and allocate memory on the listed devices.
*
* For each device, an internal memory manager is created and the specified amount of memory is
* allocated (it is the size defined in device[i].size). For each named stream, an additional
* memory manager is created. Currently, it is implemented as a tree of memory managers: A root
* manager for the device and a list of children, one for each named stream.
*
* This function must be called before any other function in the library. It has to be called
* by a single thread since it is not thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid,
* CNMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function.
*/
cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags);
/**
* \brief Release all the allocated memory.
*
* This function must be called by a single thread and after all threads that called
* cnmemMalloc/cnmemFree have joined. This function is not thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemFinalize();
/**
* \brief Increase the internal reference counter of the context object.
*
* This function increases the internal reference counter of the library. The purpose of that
* reference counting mechanism is to give more control to the user over the lifetime of the
* library. It is useful with scoped memory allocation which may be destroyed in a final
* memory collection after the end of main(). That function is thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
*/
cnmemStatus_t CNMEM_API cnmemRetain();
/**
* \brief Decrease the internal reference counter of the context object.
*
* This function decreases the internal reference counter of the library. The purpose of that
* reference counting mechanism is to give more control to the user over the lifetime of the
* library. It is useful with scoped memory allocation which may be destroyed in a final
* memory collection after the end of main(). That function is thread-safe.
*
* You can use \c cnmemRelease to explicitly finalize the library.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
*/
cnmemStatus_t CNMEM_API cnmemRelease();
/**
* \brief Add a new stream to the pool of managed streams on a device.
*
* This function registers a new stream into a device memory manager. It is thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid,
*/
cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream);
/**
* \brief Allocate memory.
*
* This function allocates memory and initializes a pointer to device memory. If no memory
* is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe.
*
* The behavior of that function is the following:
*
* - If the stream is NULL, the root memory manager is asked to allocate a buffer of device
* memory. If there's a buffer of size larger or equal to the requested size in the list of
* free blocks, it is returned. If there's no such buffer but the manager is allowed to grow
* its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls
* cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not
* allowed to grow, the manager attempts to steal memory from one of its children (unless
* CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns
* CNMEM_STATUS_OUT_OF_MEMORY.
*
* - If the stream is a named stream, the initial request goes to the memory manager associated
* with that stream. If a free node is available in the lists of that manager, it is returned.
* Otherwise, the request is passed to the root node and works as if the request were made on
* the NULL stream.
*
* The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also the
* mechanism to steal memory from the children induces GPU synchronizations (the manager has to
* make sure no kernel uses a given buffer before stealing it) and the execution is
* sequential (in a multi-threaded context, the code is executed in a critical section inside
* the cnmem library - no need for the user to wrap cnmemMalloc with locks).
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid. For example, ptr == 0,
* CNMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream);
/**
* \brief Release memory.
*
* This function releases memory and recycles a memory block in the manager. This function is
* thread safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid. For example, ptr == 0,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream);
/* ********************************************************************************************* */
/* Utility functions. */
/* ********************************************************************************************* */
/**
* \brief Returns the amount of memory managed by the memory manager associated with a stream.
*
* The pointers totalMem and freeMem must be valid. At the moment, this function has a
* complexity linear in the number of allocated blocks so do not call it in performance
* critical sections.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream);
/**
* \brief Print a list of nodes to a file.
*
* This function is intended to be used in case of complex scenarios to help understand the
* behaviour of the memory managers/application. It is thread safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the arguments is invalid. For example, used_mem == 0
* or free_mem == 0,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream);
/**
* \brief Converts a cnmemStatus_t value to a string.
*/
const char CNMEM_API * cnmemGetErrorString(cnmemStatus_t status);
/* ********************************************************************************************* */
#ifdef __cplusplus
} // extern "C"
#endif
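
For reference, a minimal usage sketch of the cnmem API declared above; the device id, pool size, and CHECK_EQ-style error handling are illustrative only:

cnmemDevice_t device = {};  // zero-initialize all fields
device.device = 0;          // manage GPU 0
device.size = 0;            // 0 lets the implementation choose the pool size
device.numStreams = 0;      // only the NULL stream is managed
CHECK_EQ(cnmemInit(1, &device, CNMEM_FLAGS_DEFAULT), CNMEM_STATUS_SUCCESS);
void* ptr = nullptr;
CHECK_EQ(cnmemMalloc(&ptr, 1024, /* stream = */ NULL), CNMEM_STATUS_SUCCESS);
CHECK_EQ(cnmemFree(ptr, /* stream = */ NULL), CNMEM_STATUS_SUCCESS);
CHECK_EQ(cnmemFinalize(), CNMEM_STATUS_SUCCESS);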

1
third_party/nccl vendored Submodule

Submodule third_party/nccl added at b3a9e1333d