Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 21:14:14 +08:00
A memory pool implementation based on cnmem. Added cnmem license to LICENSE.
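
With this change, CUDAContext::New/Delete route GPU allocations through the new CudaMemoryPool, which is backed by cnmem and falls back to raw cudaMalloc/cudaFree until the pool is initialized. A minimal caller-side sketch based on the files added below (hypothetical main(), error handling via glog CHECK only):

#include <vector>
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cuda_memorypool.h"

int main() {
  // Initialize the pool on device 0, reserving 80% of its currently free
  // memory, before any other CUDA allocation is made through caffe2.
  std::vector<int> device_ids = {0};
  CHECK(caffe2::CudaMemoryPool::InitializeMemoryPool(device_ids, 0.8));

  // CUDAContext::New/Delete now serve allocations from the cnmem pool.
  void* ptr = caffe2::CUDAContext::New(1 << 20);
  caffe2::CUDAContext::Delete(ptr);

  // Tear the pool down once all pooled allocations have been freed.
  CHECK(caffe2::CudaMemoryPool::FinalizeMemoryPool());
  return 0;
}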

LICENSE | 33
@@ -124,3 +124,36 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.
*** end zmqhpp license ***

Some part of the caffe2 code (specifically, third_party/cnmem) comes from the
open-source cnmem code under the 2-clause BSD license. The cnmem license is
as follows:
*** begin cnmem license ***
/* **********************************************************************
 * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * ********************************************************************** */
*** end cnmem license ***

caffe2/core/BREW
@@ -38,13 +38,16 @@ cuda_library(
    srcs = [
        "blob_serialization_gpu.cc",
        "common_gpu.cc",
        "cuda_memorypool.cc",
    ],
    hdrs = [
        "common_gpu.h",
        "context_gpu.h",
        "cuda_memorypool.h",
    ],
    deps = [
        ":core",
        "//third_party/cnmem:cnmem",
    ],
    whole_archive = True,
)
@@ -87,6 +90,18 @@ cc_test(
    ],
)

cc_test(
    name = "cuda_memorypool_test",
    srcs = [
        "cuda_memorypool_test.cc",
    ],
    deps = [
        ":core_gpu",
        "//gtest:gtest",
        "//gtest:gtest_main",
    ],
)

cc_test(
    name = "registry_test",
    srcs = ["registry_test.cc"],

caffe2/core/context_gpu.h
@@ -3,6 +3,7 @@

#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context.h"
#include "caffe2/core/cuda_memorypool.h"
#include "caffe2/core/types.h"
#include "caffe2/proto/caffe2.pb.h"
#include "glog/logging.h"
@@ -85,25 +86,12 @@ class CUDAContext {
     return curand_generator_;
   }
 
-  static void* New(size_t nbytes) {
-    void* dev_ptr;
-    CUDA_CHECK(cudaMalloc(&dev_ptr, nbytes));
-    // CUDA_CHECK(cudaMemset(dev_ptr, 0, nbytes));
-    return dev_ptr;
+  static inline void* New(size_t nbytes) {
+    return CudaMemoryPool::New(nbytes);
   }
 
-  static void Delete(void* data) {
-    cudaError_t error = cudaFree(data);
-    // For some reason, in Python runtime we sometimes delete a data pointer
-    // after the cuda runtime exits - this is odd but is probably caused by
-    // a static workspace that pycaffe2 uses, and the destruction got entangled
-    // in some race condition. Anyway, since cuda runtime is exiting anyway, we
-    // will not need to worry about memory leak, so we basically ignore it.
-    // This is definitely not ideal but works for now.
-    if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
-      LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
-                 << cudaGetErrorString(error);
-    }
+  static inline void Delete(void* data) {
+    CudaMemoryPool::Delete(data);
   }
 
   template <class SrcContext, class DstContext>

caffe2/core/cuda_memorypool.cc | 118 (new file)
@@ -0,0 +1,118 @@
#include "third_party/cnmem/cnmem.h"
|
||||
#include "caffe2/core/cuda_memorypool.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
#define CNMEM_CHECK(condition) \
|
||||
do { \
|
||||
cnmemStatus_t error = condition; \
|
||||
CHECK_EQ(error, CNMEM_STATUS_SUCCESS) << cnmemGetErrorString(error); \
|
||||
} while (0)
|
||||
|
||||
bool CudaMemoryPool::is_memory_pool_setup_ = false;
|
||||
bool CudaMemoryPool::memory_allocated_before_setup_ = false;
|
||||
vector<bool> CudaMemoryPool::memory_pool_available_for_device_(0);
|
||||
vector<cudaStream_t> CudaMemoryPool::per_device_streams_(0);
|
||||
|
||||
bool CudaMemoryPool::InitializeMemoryPool(
|
||||
const vector<int>& device_ids,
|
||||
const float proportion_of_memory_to_reserve) {
|
||||
if (memory_allocated_before_setup_) {
|
||||
LOG(ERROR) << "There is cuda memory allocated before we initialize the "
|
||||
"memory pool. This should not happen: you should either "
|
||||
"use raw cudaMalloc and cudaFree and not initialize the "
|
||||
"pool at all, or initialize the pool before you allocate "
|
||||
"anything.";
|
||||
return false;
|
||||
}
|
||||
if (is_memory_pool_setup_) {
|
||||
LOG(ERROR) << "Memory pool is already set up. I cannot set up it twice.";
|
||||
return false;
|
||||
}
|
||||
|
||||
// The actual initialization.
|
||||
int device_count;
|
||||
CUDA_CHECK(cudaGetDeviceCount(&device_count));
|
||||
// Initialize the flags for the memory pool.
|
||||
memory_pool_available_for_device_.resize(device_count, false);
|
||||
per_device_streams_.resize(device_count, nullptr);
|
||||
// Push the current device so we can recover later.
|
||||
int initial_device;
|
||||
CUDA_CHECK(cudaGetDevice(&initial_device));
|
||||
|
||||
vector<cnmemDevice_t> cnmem_devs(device_ids.size());
|
||||
for (int i = 0; i < device_ids.size(); ++i) {
|
||||
const int device_id = device_ids[i];
|
||||
CHECK_GE(device_id, 0);
|
||||
CHECK_LT(device_id, device_count);
|
||||
// This ensures we do not specify the same device twice.
|
||||
CHECK(!memory_pool_available_for_device_[device_id]);
|
||||
CUDA_CHECK(cudaSetDevice(device_id));
|
||||
size_t free_memory, used_memory;
|
||||
CUDA_CHECK(cudaMemGetInfo(&free_memory, &used_memory));
|
||||
LOG(INFO) << "Reserving " << proportion_of_memory_to_reserve * 100
|
||||
<< "percent of the free memory (total " << free_memory
|
||||
<< ") on device " << device_id;
|
||||
// Note: we create a dummy non-null stream for memory allocations, so that
|
||||
// any malloc can be called from any cuda stream, since caffe2 uses a lot of
|
||||
// non-default streams for computation. We will allocate all the reserved
|
||||
// memory to that non-null stream.
|
||||
cnmem_devs[i].device = device_id;
|
||||
cnmem_devs[i].size = size_t(proportion_of_memory_to_reserve * free_memory);
|
||||
CUDA_CHECK(cudaStreamCreate(&per_device_streams_[i]));
|
||||
cnmem_devs[i].numStreams = 1;
|
||||
cnmem_devs[i].streams = &per_device_streams_[i];
|
||||
cnmem_devs[i].streamSizes = &cnmem_devs[i].size;
|
||||
memory_pool_available_for_device_[device_id] = true;
|
||||
}
|
||||
CNMEM_CHECK(
|
||||
cnmemInit(cnmem_devs.size(), cnmem_devs.data(), CNMEM_FLAGS_DEFAULT));
|
||||
// After initialization, let's set back the device.
|
||||
CUDA_CHECK(cudaSetDevice(initial_device));
|
||||
LOG(INFO) << "Set up memory pool.";
|
||||
is_memory_pool_setup_ = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool CudaMemoryPool::FinalizeMemoryPool() {
|
||||
// If it has not been set up yet, we have nothing to do.
|
||||
if (!is_memory_pool_setup_) {
|
||||
return true;
|
||||
}
|
||||
CNMEM_CHECK(cnmemFinalize());
|
||||
for (int i = 0; i < per_device_streams_.size(); ++i) {
|
||||
if (per_device_streams_[i]) {
|
||||
CUDA_CHECK(cudaStreamDestroy(per_device_streams_[i]));
|
||||
}
|
||||
}
|
||||
// Reset all the static variables
|
||||
per_device_streams_.resize(0);
|
||||
memory_pool_available_for_device_.resize(0);
|
||||
memory_allocated_before_setup_ = false;
|
||||
is_memory_pool_setup_ = false;
|
||||
return true;
|
||||
}
|
||||
|
||||
void* CudaMemoryPool::NewWithMemoryPool(size_t nbytes) {
|
||||
int device_id;
|
||||
CUDA_CHECK(cudaGetDevice(&device_id));
|
||||
CHECK(memory_pool_available_for_device_[device_id])
|
||||
<< "Trying to allocate on device " << device_id
|
||||
<< ", but memory pool is not initialized on that device.";
|
||||
void* ptr;
|
||||
CNMEM_CHECK(cnmemMalloc(&ptr, nbytes, per_device_streams_[device_id]));
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void CudaMemoryPool::DeleteWithMemoryPool(void* data) {
|
||||
cudaPointerAttributes attr;
|
||||
CUDA_CHECK(cudaPointerGetAttributes(&attr, data));
|
||||
DCHECK_EQ(attr.memoryType, cudaMemoryTypeDevice);
|
||||
CHECK(memory_pool_available_for_device_[attr.device])
|
||||
<< "Current pointer belongs to " << attr.device
|
||||
<< ", but memory pool is not initialized on that device. "
|
||||
<< "Was your pointer allocated using the memory pool?";
|
||||
CNMEM_CHECK(cnmemFree(data, per_device_streams_[attr.device]));
|
||||
}
|
||||
|
||||
} // namespace caffe2
|

caffe2/core/cuda_memorypool.h | 74 (new file)
@@ -0,0 +1,74 @@
#ifndef CAFFE2_CORE_CUDA_MEMORYPOOL_H_
#define CAFFE2_CORE_CUDA_MEMORYPOOL_H_

#include <cstddef>

#include "caffe2/core/common_gpu.h"
#include "glog/logging.h"

namespace caffe2 {

class CudaMemoryPool {
 public:
  // Initializes the memory pool on the given device ids, and reserves the
  // given proportion of the currently free memory on each device.
  static bool InitializeMemoryPool(
      const vector<int>& device_ids,
      const float proportion_of_memory_to_reserve);

  // Finalizes the memory pool. This has to be called after all memory
  // allocated by the memory pool has been freed.
  static bool FinalizeMemoryPool();

  static inline bool MemoryPoolInitialized() { return is_memory_pool_setup_; }
  static inline bool MemoryPoolAvailableForDevice(int device_id) {
    return (device_id < memory_pool_available_for_device_.size() &&
            memory_pool_available_for_device_[device_id]);
  }

  static inline void* New(size_t nbytes) {
    if (is_memory_pool_setup_) {
      return NewWithMemoryPool(nbytes);
    } else {
      // If the memory pool is not set up, use simple cudaMalloc.
      void* dev_ptr;
      CUDA_CHECK(cudaMalloc(&dev_ptr, nbytes));
      memory_allocated_before_setup_ = true;
      return dev_ptr;
    }
  }

  static inline void Delete(void* data) {
    if (is_memory_pool_setup_) {
      DeleteWithMemoryPool(data);
    } else {
      // If the memory pool is not set up, use simple cudaFree.
      cudaError_t error = cudaFree(data);
      // For some reason, in the Python runtime we sometimes delete a data
      // pointer after the cuda runtime has already exited - this is odd, but
      // is probably caused by a static workspace that pycaffe2 uses, with the
      // destruction entangled in some race condition. Since the cuda runtime
      // is exiting anyway, we do not need to worry about memory leaks, so we
      // basically ignore this case. This is definitely not ideal but works
      // for now.
      if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
        LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
                   << cudaGetErrorString(error);
      }
    }
  }

 private:
  // CudaMemoryPool is a singleton, so it should not be instantiated.
  CudaMemoryPool() {}
  static void* NewWithMemoryPool(size_t nbytes);
  static void DeleteWithMemoryPool(void* data);

  static bool is_memory_pool_setup_;
  static bool memory_allocated_before_setup_;
  static vector<bool> memory_pool_available_for_device_;
  static vector<cudaStream_t> per_device_streams_;
};

}  // namespace caffe2

#endif  // CAFFE2_CORE_CUDA_MEMORYPOOL_H_

caffe2/core/cuda_memorypool_test.cc | 64 (new file)
@@ -0,0 +1,64 @@
#include "caffe2/core/cuda_memorypool.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
struct UseMemoryPool { static const bool value = true; };
|
||||
struct NotUseMemoryPool { static const bool value = false; };
|
||||
|
||||
template <class UsePoolOrNot>
|
||||
class MemoryPoolTest : public ::testing::Test {
|
||||
protected:
|
||||
MemoryPoolTest() : device_count_(0) {}
|
||||
// virtual void SetUp() will be called before each test is run. You
|
||||
// should define it if you need to initialize the varaibles.
|
||||
// Otherwise, this can be skipped.
|
||||
void SetUp() override {
|
||||
int device_count_;
|
||||
CUDA_CHECK(cudaGetDeviceCount(&device_count_));
|
||||
// If we test with the memory pool, initialize the memory pool.
|
||||
if (UsePoolOrNot::value) {
|
||||
vector<int> device_ids(device_count_);
|
||||
for (int i = 0; i < device_count_; ++i) {
|
||||
device_ids[i] = i;
|
||||
}
|
||||
CHECK(CudaMemoryPool::InitializeMemoryPool(device_ids, 0.8));
|
||||
}
|
||||
}
|
||||
|
||||
void TearDown() override {
|
||||
if (UsePoolOrNot::value) {
|
||||
CHECK(CudaMemoryPool::FinalizeMemoryPool());
|
||||
}
|
||||
}
|
||||
|
||||
// Declares the variables your tests want to use.
|
||||
int device_count_;
|
||||
};
|
||||
|
||||
typedef ::testing::Types<UseMemoryPool, NotUseMemoryPool> MemoryPoolTestTypes;
|
||||
TYPED_TEST_CASE(MemoryPoolTest, MemoryPoolTestTypes);
|
||||
|
||||
// This just tests that setup and teardown works.
|
||||
TYPED_TEST(MemoryPoolTest, InitializeAndFinalizeWorks) {
|
||||
EXPECT_TRUE(true);
|
||||
}
|
||||
|
||||
TYPED_TEST(MemoryPoolTest, AllocateAndDeallocate) {
|
||||
const int nbytes = 1048576;
|
||||
for (int i = 0; i < this->device_count_; ++i) {
|
||||
LOG(INFO) << "Device " << i << " of " << this->device_count_;
|
||||
CUDA_CHECK(cudaSetDevice(i));
|
||||
void* allocated = CUDAContext::New(nbytes);
|
||||
EXPECT_NE(allocated, nullptr);
|
||||
cudaPointerAttributes attr;
|
||||
CUDA_CHECK(cudaPointerGetAttributes(&attr, allocated));
|
||||
EXPECT_EQ(attr.memoryType, cudaMemoryTypeDevice);
|
||||
EXPECT_EQ(attr.device, i);
|
||||
CUDAContext::Delete(allocated);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace caffe2
|

third_party/cnmem/BREW | 9 (new file, vendored)
@@ -0,0 +1,9 @@
cuda_library(
    name = "cnmem",
    srcs = [
        "cnmem.cpp",
    ],
    hdrs = [
        "cnmem.h",
    ],
)

third_party/cnmem/cnmem.cpp | 1287 (new file, vendored)
(File diff suppressed because it is too large.)

third_party/cnmem/cnmem.h | 263 (new file, vendored)
@@ -0,0 +1,263 @@
/* **********************************************************************
 * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * ********************************************************************** */
#pragma once

#ifdef __cplusplus
#include "cstdio"
#else
#include "stdio.h"
#endif
#include "cuda_runtime_api.h"

#if defined(_MSC_VER) || defined(WIN32)
#ifdef CNMEM_DLLEXPORT
#define CNMEM_API __declspec(dllexport)
#else
#define CNMEM_API __declspec(dllimport)
#endif
#else
#ifdef CNMEM_DLLEXPORT
#define CNMEM_API __attribute__((visibility ("default")))
#else
#define CNMEM_API
#endif
#endif

#define CNMEM_VERSION 100 // It corresponds to 1.0.0

#ifdef __cplusplus
extern "C" {
#endif

/* ********************************************************************************************* */

typedef enum
{
    CNMEM_STATUS_SUCCESS = 0,
    CNMEM_STATUS_CUDA_ERROR,
    CNMEM_STATUS_INVALID_ARGUMENT,
    CNMEM_STATUS_NOT_INITIALIZED,
    CNMEM_STATUS_OUT_OF_MEMORY,
    CNMEM_STATUS_UNKNOWN_ERROR
} cnmemStatus_t;

/* ********************************************************************************************* */

typedef enum
{
    CNMEM_FLAGS_DEFAULT = 0,      /// Default flags.
    CNMEM_FLAGS_CANNOT_GROW = 1,  /// Prevent the manager from growing its memory consumption.
    CNMEM_FLAGS_CANNOT_STEAL = 2, /// Prevent the manager from stealing memory.
} cnmemManagerFlags_t;

/* ********************************************************************************************* */

typedef struct cnmemDevice_t_
{
    /** The device number. */
    int device;
    /** The size to allocate for that device. If 0, the implementation chooses the size. */
    size_t size;
    /** The number of named streams associated with the device. The NULL stream is not counted. */
    int numStreams;
    /** The streams associated with the device. It can be NULL. The NULL stream is managed. */
    cudaStream_t *streams;
    /** The size reserved for each streams. It can be 0. */
    size_t *streamSizes;

} cnmemDevice_t;

/**
 * \brief Initialize the library and allocate memory on the listed devices.
 *
 * For each device, an internal memory manager is created and the specified amount of memory is
 * allocated (it is the size defined in device[i].size). For each, named stream an additional
 * memory manager is created. Currently, it is implemented as a tree of memory managers: A root
 * manager for the device and a list of children, one for each named stream.
 *
 * This function must be called before any other function in the library. It has to be called
 * by a single thread since it is not thread-safe.
 *
 * \return
 * CNMEM_STATUS_SUCCESS, if everything goes fine,
 * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
 * CNMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory,
 * CNMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function.
 */
cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags);

/**
 * \brief Release all the allocated memory.
 *
 * This function must be called by a single thread and after all threads that called
 * cnmemMalloc/cnmemFree have joined. This function is not thread-safe.
 *
 * \return
 * CNMEM_STATUS_SUCCESS, if everything goes fine,
 * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
 * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
 */
cnmemStatus_t CNMEM_API cnmemFinalize();

/**
 * \brief Increase the internal reference counter of the context object.
 *
 * This function increases the internal reference counter of the library. The purpose of that
 * reference counting mechanism is to give more control to the user over the lifetime of the
 * library. It is useful with scoped memory allocation which may be destroyed in a final
 * memory collection after the end of main(). That function is thread-safe.
 *
 * \return
 * CNMEM_STATUS_SUCCESS, if everything goes fine,
 * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
 */
cnmemStatus_t CNMEM_API cnmemRetain();

/**
 * \brief Decrease the internal reference counter of the context object.
 *
 * This function decreases the internal reference counter of the library. The purpose of that
 * reference counting mechanism is to give more control to the user over the lifetime of the
 * library. It is useful with scoped memory allocation which may be destroyed in a final
 * memory collection after the end of main(). That function is thread-safe.
 *
 * You can use \c cnmemRelease to explicitly finalize the library.
 *
 * \return
 * CNMEM_STATUS_SUCCESS, if everything goes fine,
 * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
 */
cnmemStatus_t CNMEM_API cnmemRelease();

/**
 * \brief Add a new stream to the pool of managed streams on a device.
 *
 * This function registers a new stream into a device memory manager. It is thread-safe.
 *
 * \return
 * CNMEM_STATUS_SUCCESS, if everything goes fine,
 * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
 */
cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream);

/**
 * \brief Allocate memory.
 *
 * This function allocates memory and initializes a pointer to device memory. If no memory
 * is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe.
 *
 * The behavior of that function is the following:
 *
 * - If the stream is NULL, the root memory manager is asked to allocate a buffer of device
 *   memory. If there's a buffer of size larger or equal to the requested size in the list of
 *   free blocks, it is returned. If there's no such buffer but the manager is allowed to grow
 *   its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls
 *   cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not
 *   allowed to grow, the manager attempts to steal memory from one of its children (unless
 *   CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns
 *   CNMEM_STATUS_OUT_OF_MEMORY.
 *
 * - If the stream is a named stream, the initial request goes to the memory manager associated
 *   with that stream. If a free node is available in the lists of that manager, it is returned.
 *   Otherwise, the request is passed to the root node and works as if the request were made on
 *   the NULL stream.
 *
 * The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also the
 * mechanism to steal memory from the children induces GPU synchronizations (the manager has to
 * make sure no kernel uses a given buffer before stealing it) and it the execution is
 * sequential (in a multi-threaded context, the code is executed in a critical section inside
 * the cnmem library - no need for the user to wrap cnmemMalloc with locks).
 *
 * \return
 * CNMEM_STATUS_SUCCESS, if everything goes fine,
 * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
 * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
 * CNMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available,
 * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
 */
cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream);

/**
 * \brief Release memory.
 *
 * This function releases memory and recycles a memory block in the manager. This function is
 * thread safe.
 *
 * \return
 * CNMEM_STATUS_SUCCESS, if everything goes fine,
 * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
 * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
 * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
 */
cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream);

/* ********************************************************************************************* */
/* Utility functions. */
/* ********************************************************************************************* */

/**
 * \brief Returns the amount of memory managed by the memory manager associated with a stream.
 *
 * The pointers totalMem and freeMem must be valid. At the moment, this function has a comple-
 * xity linear in the number of allocated blocks so do not call it in performance critical
 * sections.
 *
 * \return
 * CNMEM_STATUS_SUCCESS, if everything goes fine,
 * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
 * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
 * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
 */
cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream);

/**
 * \brief Print a list of nodes to a file.
 *
 * This function is intended to be used in case of complex scenarios to help understand the
 * behaviour of the memory managers/application. It is thread safe.
 *
 * \return
 * CNMEM_STATUS_SUCCESS, if everything goes fine,
 * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
 * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, used_mem == 0
 * or free_mem == 0,
 * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
 */
cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream);

/**
 * \brief Converts a cnmemStatus_t value to a string.
 */
const char CNMEM_API * cnmemGetErrorString(cnmemStatus_t status);

/* ********************************************************************************************* */

#ifdef __cplusplus
} // extern "C"
#endif
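
For reference, a minimal sketch of driving the cnmem API declared above directly, mirroring what CudaMemoryPool::InitializeMemoryPool/New/Delete do. This is illustrative code, not part of the commit; the CHECK_CNMEM helper is made up for the example, and error handling is reduced to exiting on failure.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime_api.h>
#include "third_party/cnmem/cnmem.h"

#define CHECK_CNMEM(call)                                                  \
  do {                                                                     \
    cnmemStatus_t s = (call);                                              \
    if (s != CNMEM_STATUS_SUCCESS) {                                       \
      std::fprintf(stderr, "cnmem error: %s\n", cnmemGetErrorString(s));   \
      std::exit(1);                                                        \
    }                                                                      \
  } while (0)

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  size_t free_mem = 0, total_mem = 0;
  cudaMemGetInfo(&free_mem, &total_mem);

  // Reserve half of the currently free memory on device 0 for the pool,
  // and register one named stream with its own sub-manager.
  cnmemDevice_t dev = {};
  dev.device = 0;
  dev.size = free_mem / 2;
  dev.numStreams = 1;
  dev.streams = &stream;
  size_t stream_size = dev.size;
  dev.streamSizes = &stream_size;
  CHECK_CNMEM(cnmemInit(1, &dev, CNMEM_FLAGS_DEFAULT));

  // Allocations on the named stream come out of the pool, not cudaMalloc.
  void* ptr = nullptr;
  CHECK_CNMEM(cnmemMalloc(&ptr, 1 << 20, stream));
  CHECK_CNMEM(cnmemFree(ptr, stream));

  CHECK_CNMEM(cnmemFinalize());
  cudaStreamDestroy(stream);
  return 0;
}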