A memory pool implementation based on cnmem. Added cnmem license to LICENSE.

Author: Yangqing Jia
Date: 2015-09-03 20:31:48 -07:00
parent 5325bd5049
commit ecd46d5ea0
9 changed files with 1868 additions and 17 deletions

LICENSE (+33 lines)

@@ -124,3 +124,36 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.
*** end zmqhpp license ***
Some part of the caffe2 code (specifically, third_party/cnmem) comes from the
open-source cnmem code under the 2-clause BSD license. The cnmem license is
as follows:
*** begin cnmem license ***
/* **********************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* ********************************************************************** */
*** end cnmem license ***

caffe2/core/BREW

@@ -38,13 +38,16 @@ cuda_library(
srcs = [
"blob_serialization_gpu.cc",
"common_gpu.cc",
"cuda_memorypool.cc",
],
hdrs = [
"common_gpu.h",
"context_gpu.h",
"cuda_memorypool.h",
],
deps = [
":core",
"//third_party/cnmem:cnmem",
],
whole_archive = True,
)
@@ -87,6 +90,18 @@ cc_test(
],
)
cc_test(
name = "cuda_memorypool_test",
srcs = [
"cuda_memorypool_test.cc",
],
deps = [
":core_gpu",
"//gtest:gtest",
"//gtest:gtest_main",
],
)
cc_test(
name = "registry_test",
srcs = ["registry_test.cc"],

caffe2/core/context_gpu.h

@@ -3,6 +3,7 @@
 #include "caffe2/core/common_gpu.h"
 #include "caffe2/core/context.h"
+#include "caffe2/core/cuda_memorypool.h"
 #include "caffe2/core/types.h"
 #include "caffe2/proto/caffe2.pb.h"
 #include "glog/logging.h"
@@ -85,25 +86,12 @@ class CUDAContext {
     return curand_generator_;
   }
-  static void* New(size_t nbytes) {
-    void* dev_ptr;
-    CUDA_CHECK(cudaMalloc(&dev_ptr, nbytes));
-    // CUDA_CHECK(cudaMemset(dev_ptr, 0, nbytes));
-    return dev_ptr;
+  static inline void* New(size_t nbytes) {
+    return CudaMemoryPool::New(nbytes);
   }
-  static void Delete(void* data) {
-    cudaError_t error = cudaFree(data);
-    // For some reason, in Python runtime we sometimes delete a data pointer
-    // after the cuda runtime exits - this is odd but is probably caused by
-    // a static workspace that pycaffe2 uses, and the destruction got entangled
-    // in some race condition. Anyway, since cuda runtime is exiting anyway, we
-    // will not need to worry about memory leak, so we basically ignore it.
-    // This is definitely not ideal but works for now.
-    if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
-      LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
-                 << cudaGetErrorString(error);
-    }
+  static inline void Delete(void* data) {
+    CudaMemoryPool::Delete(data);
   }
   template <class SrcContext, class DstContext>

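The net effect of the context_gpu.h change is that every GPU allocation made through CUDAContext now funnels into CudaMemoryPool, which falls back to raw cudaMalloc/cudaFree when no pool has been initialized. A minimal sketch of the resulting call path (the helper function and the 1 MB size are illustrative, not part of the commit):

// Illustrative sketch only: how a caller sees the new allocation path.
#include "caffe2/core/context_gpu.h"

void UseScratchBuffer() {
  // CUDAContext::New -> CudaMemoryPool::New -> cnmemMalloc  (pool initialized)
  //                                         -> cudaMalloc   (no pool set up)
  void* scratch = caffe2::CUDAContext::New(1 << 20);  // 1 MB on the current device
  // ... launch kernels that use scratch ...
  caffe2::CUDAContext::Delete(scratch);  // returned to the pool, or cudaFree'd
}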
caffe2/core/cuda_memorypool.cc (new file)

@@ -0,0 +1,118 @@
#include "third_party/cnmem/cnmem.h"
#include "caffe2/core/cuda_memorypool.h"
namespace caffe2 {
#define CNMEM_CHECK(condition) \
do { \
cnmemStatus_t error = condition; \
CHECK_EQ(error, CNMEM_STATUS_SUCCESS) << cnmemGetErrorString(error); \
} while (0)
bool CudaMemoryPool::is_memory_pool_setup_ = false;
bool CudaMemoryPool::memory_allocated_before_setup_ = false;
vector<bool> CudaMemoryPool::memory_pool_available_for_device_(0);
vector<cudaStream_t> CudaMemoryPool::per_device_streams_(0);
bool CudaMemoryPool::InitializeMemoryPool(
const vector<int>& device_ids,
const float proportion_of_memory_to_reserve) {
if (memory_allocated_before_setup_) {
LOG(ERROR) << "There is cuda memory allocated before we initialize the "
"memory pool. This should not happen: you should either "
"use raw cudaMalloc and cudaFree and not initialize the "
"pool at all, or initialize the pool before you allocate "
"anything.";
return false;
}
if (is_memory_pool_setup_) {
LOG(ERROR) << "Memory pool is already set up. I cannot set up it twice.";
return false;
}
// The actual initialization.
int device_count;
CUDA_CHECK(cudaGetDeviceCount(&device_count));
// Initialize the flags for the memory pool.
memory_pool_available_for_device_.resize(device_count, false);
per_device_streams_.resize(device_count, nullptr);
// Push the current device so we can recover later.
int initial_device;
CUDA_CHECK(cudaGetDevice(&initial_device));
vector<cnmemDevice_t> cnmem_devs(device_ids.size());
for (int i = 0; i < device_ids.size(); ++i) {
const int device_id = device_ids[i];
CHECK_GE(device_id, 0);
CHECK_LT(device_id, device_count);
// This ensures we do not specify the same device twice.
CHECK(!memory_pool_available_for_device_[device_id]);
CUDA_CHECK(cudaSetDevice(device_id));
size_t free_memory, total_memory;
CUDA_CHECK(cudaMemGetInfo(&free_memory, &total_memory));
LOG(INFO) << "Reserving " << proportion_of_memory_to_reserve * 100
          << " percent of the free memory (" << free_memory
          << " bytes free) on device " << device_id;
// Note: we create a dummy non-null stream for memory allocations, so that
// any malloc can be called from any cuda stream, since caffe2 uses a lot of
// non-default streams for computation. We will allocate all the reserved
// memory to that non-null stream.
cnmem_devs[i].device = device_id;
cnmem_devs[i].size = size_t(proportion_of_memory_to_reserve * free_memory);
CUDA_CHECK(cudaStreamCreate(&per_device_streams_[device_id]));
cnmem_devs[i].numStreams = 1;
cnmem_devs[i].streams = &per_device_streams_[device_id];
cnmem_devs[i].streamSizes = &cnmem_devs[i].size;
memory_pool_available_for_device_[device_id] = true;
}
CNMEM_CHECK(
cnmemInit(cnmem_devs.size(), cnmem_devs.data(), CNMEM_FLAGS_DEFAULT));
// After initialization, let's set back the device.
CUDA_CHECK(cudaSetDevice(initial_device));
LOG(INFO) << "Set up memory pool.";
is_memory_pool_setup_ = true;
return true;
}
bool CudaMemoryPool::FinalizeMemoryPool() {
// If it has not been set up yet, we have nothing to do.
if (!is_memory_pool_setup_) {
return true;
}
CNMEM_CHECK(cnmemFinalize());
for (int i = 0; i < per_device_streams_.size(); ++i) {
if (per_device_streams_[i]) {
CUDA_CHECK(cudaStreamDestroy(per_device_streams_[i]));
}
}
// Reset all the static variables
per_device_streams_.resize(0);
memory_pool_available_for_device_.resize(0);
memory_allocated_before_setup_ = false;
is_memory_pool_setup_ = false;
return true;
}
void* CudaMemoryPool::NewWithMemoryPool(size_t nbytes) {
int device_id;
CUDA_CHECK(cudaGetDevice(&device_id));
CHECK(memory_pool_available_for_device_[device_id])
<< "Trying to allocate on device " << device_id
<< ", but memory pool is not initialized on that device.";
void* ptr;
CNMEM_CHECK(cnmemMalloc(&ptr, nbytes, per_device_streams_[device_id]));
return ptr;
}
void CudaMemoryPool::DeleteWithMemoryPool(void* data) {
cudaPointerAttributes attr;
CUDA_CHECK(cudaPointerGetAttributes(&attr, data));
DCHECK_EQ(attr.memoryType, cudaMemoryTypeDevice);
CHECK(memory_pool_available_for_device_[attr.device])
<< "Current pointer belongs to " << attr.device
<< ", but memory pool is not initialized on that device. "
<< "Was your pointer allocated using the memory pool?";
CNMEM_CHECK(cnmemFree(data, per_device_streams_[attr.device]));
}
} // namespace caffe2

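For reference, a hedged sketch of the intended lifecycle around the implementation above: the pool must be initialized before the first GPU allocation and finalized only after every pooled allocation has been freed. The single-device list below is an assumption for illustration; the 0.8 fraction mirrors the value used in the test further down.

// Illustrative only: set up the pool once at program start, tear it down at exit.
#include <vector>
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cuda_memorypool.h"

int main(int argc, char** argv) {
  std::vector<int> device_ids = {0};  // assumed single-GPU machine
  // Reserve 80% of the currently free memory on device 0. This fails if any
  // CUDAContext::New call already went through the raw cudaMalloc path.
  CHECK(caffe2::CudaMemoryPool::InitializeMemoryPool(device_ids, 0.8));
  // ... run the workload; CUDAContext::New/Delete now hit cnmem ...
  CHECK(caffe2::CudaMemoryPool::FinalizeMemoryPool());
  return 0;
}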
caffe2/core/cuda_memorypool.h (new file)

@@ -0,0 +1,74 @@
#ifndef CAFFE2_CORE_CUDA_MEMORYPOOL_H_
#define CAFFE2_CORE_CUDA_MEMORYPOOL_H_
#include <cstddef>
#include "caffe2/core/common_gpu.h"
#include "glog/logging.h"
namespace caffe2 {
class CudaMemoryPool {
public:
// Initializes the memory pool on the given device ids, reserving the given
// proportion of the currently free memory on each device.
static bool InitializeMemoryPool(
const vector<int>& device_ids,
const float proportion_of_memory_to_reserve);
// Finalizes the memory pool. This has to be called after all memory allocated
// by the memory pool has been freed.
static bool FinalizeMemoryPool();
static inline bool MemoryPoolInitialized() { return is_memory_pool_setup_; }
static inline bool MemoryPoolAvailableForDevice(int device_id) {
return (device_id < memory_pool_available_for_device_.size() &&
memory_pool_available_for_device_[device_id]);
}
static inline void* New(size_t nbytes) {
if (is_memory_pool_setup_) {
return NewWithMemoryPool(nbytes);
} else {
// If memory pool is not set up, use simple cudaMalloc.
void* dev_ptr;
CUDA_CHECK(cudaMalloc(&dev_ptr, nbytes));
memory_allocated_before_setup_ = true;
return dev_ptr;
}
}
static inline void Delete(void* data) {
if (is_memory_pool_setup_) {
DeleteWithMemoryPool(data);
} else {
// If memory pool is not set up, use simple cudaFree.
cudaError_t error = cudaFree(data);
// For some reason, in the Python runtime we sometimes delete a data pointer
// after the cuda runtime has already exited - this is odd, but is probably
// caused by a static workspace that pycaffe2 uses, whose destruction gets
// entangled in some race condition. Since the cuda runtime is exiting
// anyway, we do not need to worry about memory leaks, so we simply ignore
// the error. This is not ideal, but works for now.
if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
<< cudaGetErrorString(error);
}
}
}
private:
// CudaMemoryPool is a singleton, so it should not be instantiated.
CudaMemoryPool() {}
static void* NewWithMemoryPool(size_t nbytes);
static void DeleteWithMemoryPool(void* data);
static bool is_memory_pool_setup_;
static bool memory_allocated_before_setup_;
static vector<bool> memory_pool_available_for_device_;
static vector<cudaStream_t> per_device_streams_;
};
} // namespace caffe2
#endif // CAFFE2_CORE_CUDA_MEMORYPOOL_H_

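The header also exposes MemoryPoolInitialized() and MemoryPoolAvailableForDevice() so callers can tell which path New() will take. A small illustrative helper (assumed, not part of the commit) that makes the semantics explicit:

// Illustrative helper: New() silently falls back to cudaMalloc only when no
// pool exists at all; if a pool exists but not on this device, it CHECK-fails.
#include "caffe2/core/cuda_memorypool.h"

void* AllocateOnDevice(int device_id, size_t nbytes) {
  CUDA_CHECK(cudaSetDevice(device_id));
  if (!caffe2::CudaMemoryPool::MemoryPoolInitialized()) {
    return caffe2::CudaMemoryPool::New(nbytes);  // raw cudaMalloc fallback
  }
  CHECK(caffe2::CudaMemoryPool::MemoryPoolAvailableForDevice(device_id))
      << "A memory pool exists, but device " << device_id
      << " was not included when InitializeMemoryPool was called.";
  return caffe2::CudaMemoryPool::New(nbytes);  // served by cnmem
}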
caffe2/core/cuda_memorypool_test.cc (new file)

@@ -0,0 +1,64 @@
#include "caffe2/core/cuda_memorypool.h"
#include "caffe2/core/context_gpu.h"
#include "gtest/gtest.h"
#include "glog/logging.h"
namespace caffe2 {
struct UseMemoryPool { static const bool value = true; };
struct NotUseMemoryPool { static const bool value = false; };
template <class UsePoolOrNot>
class MemoryPoolTest : public ::testing::Test {
protected:
MemoryPoolTest() : device_count_(0) {}
// virtual void SetUp() will be called before each test is run. You
// should define it if you need to initialize the variables.
// Otherwise, this can be skipped.
void SetUp() override {
CUDA_CHECK(cudaGetDeviceCount(&device_count_));
// If we test with the memory pool, initialize the memory pool.
if (UsePoolOrNot::value) {
vector<int> device_ids(device_count_);
for (int i = 0; i < device_count_; ++i) {
device_ids[i] = i;
}
CHECK(CudaMemoryPool::InitializeMemoryPool(device_ids, 0.8));
}
}
void TearDown() override {
if (UsePoolOrNot::value) {
CHECK(CudaMemoryPool::FinalizeMemoryPool());
}
}
// Declares the variables your tests want to use.
int device_count_;
};
typedef ::testing::Types<UseMemoryPool, NotUseMemoryPool> MemoryPoolTestTypes;
TYPED_TEST_CASE(MemoryPoolTest, MemoryPoolTestTypes);
// This just tests that setup and teardown works.
TYPED_TEST(MemoryPoolTest, InitializeAndFinalizeWorks) {
EXPECT_TRUE(true);
}
TYPED_TEST(MemoryPoolTest, AllocateAndDeallocate) {
const int nbytes = 1048576;
for (int i = 0; i < this->device_count_; ++i) {
LOG(INFO) << "Device " << i << " of " << this->device_count_;
CUDA_CHECK(cudaSetDevice(i));
void* allocated = CUDAContext::New(nbytes);
EXPECT_NE(allocated, nullptr);
cudaPointerAttributes attr;
CUDA_CHECK(cudaPointerGetAttributes(&attr, allocated));
EXPECT_EQ(attr.memoryType, cudaMemoryTypeDevice);
EXPECT_EQ(attr.device, i);
CUDAContext::Delete(allocated);
}
}
} // namespace caffe2

third_party/cnmem/BREW (vendored new file, 9 lines)

@@ -0,0 +1,9 @@
cuda_library(
name = "cnmem",
srcs = [
"cnmem.cpp",
],
hdrs = [
"cnmem.h",
],
)

third_party/cnmem/cnmem.cpp (vendored new file, 1287 lines)

Diff suppressed because the file is too large.

third_party/cnmem/cnmem.h (vendored new file, 263 lines)

@@ -0,0 +1,263 @@
/* **********************************************************************
* Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* * Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* * Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* * Neither the name of NVIDIA CORPORATION nor the names of its
* contributors may be used to endorse or promote products derived
* from this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
* EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
* PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
* EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
* PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
* PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
* OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
* ********************************************************************** */
#pragma once
#ifdef __cplusplus
#include "cstdio"
#else
#include "stdio.h"
#endif
#include "cuda_runtime_api.h"
#if defined(_MSC_VER) || defined(WIN32)
#ifdef CNMEM_DLLEXPORT
#define CNMEM_API __declspec(dllexport)
#else
#define CNMEM_API __declspec(dllimport)
#endif
#else
#ifdef CNMEM_DLLEXPORT
#define CNMEM_API __attribute__((visibility ("default")))
#else
#define CNMEM_API
#endif
#endif
#define CNMEM_VERSION 100 // It corresponds to 1.0.0
#ifdef __cplusplus
extern "C" {
#endif
/* ********************************************************************************************* */
typedef enum
{
CNMEM_STATUS_SUCCESS = 0,
CNMEM_STATUS_CUDA_ERROR,
CNMEM_STATUS_INVALID_ARGUMENT,
CNMEM_STATUS_NOT_INITIALIZED,
CNMEM_STATUS_OUT_OF_MEMORY,
CNMEM_STATUS_UNKNOWN_ERROR
} cnmemStatus_t;
/* ********************************************************************************************* */
typedef enum
{
CNMEM_FLAGS_DEFAULT = 0, /// Default flags.
CNMEM_FLAGS_CANNOT_GROW = 1, /// Prevent the manager from growing its memory consumption.
CNMEM_FLAGS_CANNOT_STEAL = 2, /// Prevent the manager from stealing memory.
} cnmemManagerFlags_t;
/* ********************************************************************************************* */
typedef struct cnmemDevice_t_
{
/** The device number. */
int device;
/** The size to allocate for that device. If 0, the implementation chooses the size. */
size_t size;
/** The number of named streams associated with the device. The NULL stream is not counted. */
int numStreams;
/** The streams associated with the device. It can be NULL. The NULL stream is managed. */
cudaStream_t *streams;
/** The size reserved for each streams. It can be 0. */
size_t *streamSizes;
} cnmemDevice_t;
/**
* \brief Initialize the library and allocate memory on the listed devices.
*
* For each device, an internal memory manager is created and the specified amount of memory is
* allocated (it is the size defined in device[i].size). For each, named stream an additional
* memory manager is created. Currently, it is implemented as a tree of memory managers: A root
* manager for the device and a list of children, one for each named stream.
*
* This function must be called before any other function in the library. It has to be called
* by a single thread since it is not thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
* CNMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function.
*/
cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags);
/**
* \brief Release all the allocated memory.
*
* This function must be called by a single thread and after all threads that called
* cnmemMalloc/cnmemFree have joined. This function is not thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemFinalize();
/**
* \brief Increase the internal reference counter of the context object.
*
* This function increases the internal reference counter of the library. The purpose of that
* reference counting mechanism is to give more control to the user over the lifetime of the
* library. It is useful with scoped memory allocation which may be destroyed in a final
* memory collection after the end of main(). That function is thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
*/
cnmemStatus_t CNMEM_API cnmemRetain();
/**
* \brief Decrease the internal reference counter of the context object.
*
* This function decreases the internal reference counter of the library. The purpose of that
* reference counting mechanism is to give more control to the user over the lifetime of the
* library. It is useful with scoped memory allocation which may be destroyed in a final
* memory collection after the end of main(). That function is thread-safe.
*
* You can use \c cnmemRelease to explicitly finalize the library.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
*/
cnmemStatus_t CNMEM_API cnmemRelease();
/**
* \brief Add a new stream to the pool of managed streams on a device.
*
* This function registers a new stream into a device memory manager. It is thread-safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
*/
cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream);
/**
* \brief Allocate memory.
*
* This function allocates memory and initializes a pointer to device memory. If no memory
* is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe.
*
* The behavior of that function is the following:
*
* - If the stream is NULL, the root memory manager is asked to allocate a buffer of device
* memory. If there's a buffer of size larger or equal to the requested size in the list of
* free blocks, it is returned. If there's no such buffer but the manager is allowed to grow
* its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls
* cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not
* allowed to grow, the manager attempts to steal memory from one of its children (unless
* CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns
* CNMEM_STATUS_OUT_OF_MEMORY.
*
* - If the stream is a named stream, the initial request goes to the memory manager associated
* with that stream. If a free node is available in the lists of that manager, it is returned.
* Otherwise, the request is passed to the root node and works as if the request were made on
* the NULL stream.
*
* The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also the
* mechanism to steal memory from the children induces GPU synchronizations (the manager has to
* make sure no kernel uses a given buffer before stealing it) and the execution is
* sequential (in a multi-threaded context, the code is executed in a critical section inside
* the cnmem library - no need for the user to wrap cnmemMalloc with locks).
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
* CNMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream);
/**
* \brief Release memory.
*
* This function releases memory and recycles a memory block in the manager. This function is
* thread safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream);
/* ********************************************************************************************* */
/* Utility functions. */
/* ********************************************************************************************* */
/**
* \brief Returns the amount of memory managed by the memory manager associated with a stream.
*
* The pointers totalMem and freeMem must be valid. At the moment, this function has a
* complexity linear in the number of allocated blocks, so do not call it in performance critical
* sections.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream);
/**
* \brief Print a list of nodes to a file.
*
* This function is intended to be used in case of complex scenarios to help understand the
* behaviour of the memory managers/application. It is thread safe.
*
* \return
* CNMEM_STATUS_SUCCESS, if everything goes fine,
* CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
* CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, used_mem == 0
* or free_mem == 0,
* CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
*/
cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream);
/**
* \brief Converts a cnmemStatus_t value to a string.
*/
const char CNMEM_API * cnmemGetErrorString(cnmemStatus_t status);
/* ********************************************************************************************* */
#ifdef __cplusplus
} // extern "C"
#endif