Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 21:14:14 +08:00
A memory pool implementation based on cnmem. Added cnmem license to LICENSE.
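
With this change, CUDAContext::New/Delete route GPU allocations through the new CudaMemoryPool, which is backed by cnmem and falls back to raw cudaMalloc/cudaFree until the pool is initialized. A minimal caller-side sketch based on the files added below (hypothetical main(), error handling via glog CHECK only):

#include <vector>
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/cuda_memorypool.h"

int main() {
  // Initialize the pool on device 0, reserving 80% of its currently free
  // memory, before any other CUDA allocation is made through caffe2.
  std::vector<int> device_ids = {0};
  CHECK(caffe2::CudaMemoryPool::InitializeMemoryPool(device_ids, 0.8));

  // CUDAContext::New/Delete now serve allocations from the cnmem pool.
  void* ptr = caffe2::CUDAContext::New(1 << 20);
  caffe2::CUDAContext::Delete(ptr);

  // Tear the pool down once all pooled allocations have been freed.
  CHECK(caffe2::CudaMemoryPool::FinalizeMemoryPool());
  return 0;
}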

LICENSE | 33
@@ -124,3 +124,36 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.
*** end zmqhpp license ***

Some part of the caffe2 code (specifically, third_party/cnmem) comes from the
open-source cnmem code under the 2-clause BSD license. The cnmem license is
as follows:
*** begin cnmem license ***
/* **********************************************************************
 * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * ********************************************************************** */
*** end cnmem license ***

caffe2/core/BREW
@@ -38,13 +38,16 @@ cuda_library(
    srcs = [
        "blob_serialization_gpu.cc",
        "common_gpu.cc",
        "cuda_memorypool.cc",
    ],
    hdrs = [
        "common_gpu.h",
        "context_gpu.h",
        "cuda_memorypool.h",
    ],
    deps = [
        ":core",
        "//third_party/cnmem:cnmem",
    ],
    whole_archive = True,
)
@@ -87,6 +90,18 @@ cc_test(
    ],
)

cc_test(
    name = "cuda_memorypool_test",
    srcs = [
        "cuda_memorypool_test.cc",
    ],
    deps = [
        ":core_gpu",
        "//gtest:gtest",
        "//gtest:gtest_main",
    ],
)

cc_test(
    name = "registry_test",
    srcs = ["registry_test.cc"],

caffe2/core/context_gpu.h
@@ -3,6 +3,7 @@

#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context.h"
#include "caffe2/core/cuda_memorypool.h"
#include "caffe2/core/types.h"
#include "caffe2/proto/caffe2.pb.h"
#include "glog/logging.h"
@@ -85,25 +86,12 @@ class CUDAContext {
     return curand_generator_;
   }
 
-  static void* New(size_t nbytes) {
-    void* dev_ptr;
-    CUDA_CHECK(cudaMalloc(&dev_ptr, nbytes));
-    // CUDA_CHECK(cudaMemset(dev_ptr, 0, nbytes));
-    return dev_ptr;
+  static inline void* New(size_t nbytes) {
+    return CudaMemoryPool::New(nbytes);
   }
 
-  static void Delete(void* data) {
-    cudaError_t error = cudaFree(data);
-    // For some reason, in Python runtime we sometimes delete a data pointer
-    // after the cuda runtime exits - this is odd but is probably caused by
-    // a static workspace that pycaffe2 uses, and the destruction got entangled
-    // in some race condition. Anyway, since cuda runtime is exiting anyway, we
-    // will not need to worry about memory leak, so we basically ignore it.
-    // This is definitely not ideal but works for now.
-    if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
-      LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
-                 << cudaGetErrorString(error);
-    }
+  static inline void Delete(void* data) {
+    CudaMemoryPool::Delete(data);
   }
 
   template <class SrcContext, class DstContext>

caffe2/core/cuda_memorypool.cc | 118 (new file)
@@ -0,0 +1,118 @@
#include "third_party/cnmem/cnmem.h"
|
||||
#include "caffe2/core/cuda_memorypool.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
#define CNMEM_CHECK(condition) \
|
||||
do { \
|
||||
cnmemStatus_t error = condition; \
|
||||
CHECK_EQ(error, CNMEM_STATUS_SUCCESS) << cnmemGetErrorString(error); \
|
||||
} while (0)
|
||||
|
||||
bool CudaMemoryPool::is_memory_pool_setup_ = false;
|
||||
bool CudaMemoryPool::memory_allocated_before_setup_ = false;
|
||||
vector<bool> CudaMemoryPool::memory_pool_available_for_device_(0);
|
||||
vector<cudaStream_t> CudaMemoryPool::per_device_streams_(0);
|
||||
|
||||
bool CudaMemoryPool::InitializeMemoryPool(
|
||||
const vector<int>& device_ids,
|
||||
const float proportion_of_memory_to_reserve) {
|
||||
if (memory_allocated_before_setup_) {
|
||||
LOG(ERROR) << "There is cuda memory allocated before we initialize the "
|
||||
"memory pool. This should not happen: you should either "
|
||||
"use raw cudaMalloc and cudaFree and not initialize the "
|
||||
"pool at all, or initialize the pool before you allocate "
|
||||
"anything.";
|
||||
return false;
|
||||
}
|
||||
if (is_memory_pool_setup_) {
|
||||
LOG(ERROR) << "Memory pool is already set up. I cannot set up it twice.";
|
||||
return false;
|
||||
}
|
||||
|
||||
// The actual initialization.
|
||||
int device_count;
|
||||
CUDA_CHECK(cudaGetDeviceCount(&device_count));
|
||||
// Initialize the flags for the memory pool.
|
||||
memory_pool_available_for_device_.resize(device_count, false);
|
||||
per_device_streams_.resize(device_count, nullptr);
|
||||
// Push the current device so we can recover later.
|
||||
int initial_device;
|
||||
CUDA_CHECK(cudaGetDevice(&initial_device));
|
||||
|
||||
vector<cnmemDevice_t> cnmem_devs(device_ids.size());
|
||||
for (int i = 0; i < device_ids.size(); ++i) {
|
||||
const int device_id = device_ids[i];
|
||||
CHECK_GE(device_id, 0);
|
||||
CHECK_LT(device_id, device_count);
|
||||
// This ensures we do not specify the same device twice.
|
||||
CHECK(!memory_pool_available_for_device_[device_id]);
|
||||
CUDA_CHECK(cudaSetDevice(device_id));
|
||||
size_t free_memory, used_memory;
|
||||
CUDA_CHECK(cudaMemGetInfo(&free_memory, &used_memory));
|
||||
LOG(INFO) << "Reserving " << proportion_of_memory_to_reserve * 100
|
||||
<< "percent of the free memory (total " << free_memory
|
||||
<< ") on device " << device_id;
|
||||
// Note: we create a dummy non-null stream for memory allocations, so that
|
||||
// any malloc can be called from any cuda stream, since caffe2 uses a lot of
|
||||
// non-default streams for computation. We will allocate all the reserved
|
||||
// memory to that non-null stream.
|
||||
cnmem_devs[i].device = device_id;
|
||||
cnmem_devs[i].size = size_t(proportion_of_memory_to_reserve * free_memory);
|
||||
CUDA_CHECK(cudaStreamCreate(&per_device_streams_[i]));
|
||||
cnmem_devs[i].numStreams = 1;
|
||||
cnmem_devs[i].streams = &per_device_streams_[i];
|
||||
cnmem_devs[i].streamSizes = &cnmem_devs[i].size;
|
||||
memory_pool_available_for_device_[device_id] = true;
|
||||
}
|
||||
CNMEM_CHECK(
|
||||
cnmemInit(cnmem_devs.size(), cnmem_devs.data(), CNMEM_FLAGS_DEFAULT));
|
||||
// After initialization, let's set back the device.
|
||||
CUDA_CHECK(cudaSetDevice(initial_device));
|
||||
LOG(INFO) << "Set up memory pool.";
|
||||
is_memory_pool_setup_ = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
bool CudaMemoryPool::FinalizeMemoryPool() {
|
||||
// If it has not been set up yet, we have nothing to do.
|
||||
if (!is_memory_pool_setup_) {
|
||||
return true;
|
||||
}
|
||||
CNMEM_CHECK(cnmemFinalize());
|
||||
for (int i = 0; i < per_device_streams_.size(); ++i) {
|
||||
if (per_device_streams_[i]) {
|
||||
CUDA_CHECK(cudaStreamDestroy(per_device_streams_[i]));
|
||||
}
|
||||
}
|
||||
// Reset all the static variables
|
||||
per_device_streams_.resize(0);
|
||||
memory_pool_available_for_device_.resize(0);
|
||||
memory_allocated_before_setup_ = false;
|
||||
is_memory_pool_setup_ = false;
|
||||
return true;
|
||||
}
|
||||
|
||||
void* CudaMemoryPool::NewWithMemoryPool(size_t nbytes) {
|
||||
int device_id;
|
||||
CUDA_CHECK(cudaGetDevice(&device_id));
|
||||
CHECK(memory_pool_available_for_device_[device_id])
|
||||
<< "Trying to allocate on device " << device_id
|
||||
<< ", but memory pool is not initialized on that device.";
|
||||
void* ptr;
|
||||
CNMEM_CHECK(cnmemMalloc(&ptr, nbytes, per_device_streams_[device_id]));
|
||||
return ptr;
|
||||
}
|
||||
|
||||
void CudaMemoryPool::DeleteWithMemoryPool(void* data) {
|
||||
cudaPointerAttributes attr;
|
||||
CUDA_CHECK(cudaPointerGetAttributes(&attr, data));
|
||||
DCHECK_EQ(attr.memoryType, cudaMemoryTypeDevice);
|
||||
CHECK(memory_pool_available_for_device_[attr.device])
|
||||
<< "Current pointer belongs to " << attr.device
|
||||
<< ", but memory pool is not initialized on that device. "
|
||||
<< "Was your pointer allocated using the memory pool?";
|
||||
CNMEM_CHECK(cnmemFree(data, per_device_streams_[attr.device]));
|
||||
}
|
||||
|
||||
} // namespace caffe2
|

caffe2/core/cuda_memorypool.h | 74 (new file)
@@ -0,0 +1,74 @@
#ifndef CAFFE2_CORE_CUDA_MEMORYPOOL_H_
#define CAFFE2_CORE_CUDA_MEMORYPOOL_H_

#include <cstddef>

#include "caffe2/core/common_gpu.h"
#include "glog/logging.h"

namespace caffe2 {

class CudaMemoryPool {
 public:
  // Initializes the memory pool on the given device ids, and reserves the
  // given proportion of the currently free memory on each device.
  static bool InitializeMemoryPool(
      const vector<int>& device_ids,
      const float proportion_of_memory_to_reserve);

  // Finalizes the memory pool. This has to be called after all memory
  // allocated by the memory pool has been freed.
  static bool FinalizeMemoryPool();

  static inline bool MemoryPoolInitialized() { return is_memory_pool_setup_; }
  static inline bool MemoryPoolAvailableForDevice(int device_id) {
    return (device_id < memory_pool_available_for_device_.size() &&
            memory_pool_available_for_device_[device_id]);
  }

  static inline void* New(size_t nbytes) {
    if (is_memory_pool_setup_) {
      return NewWithMemoryPool(nbytes);
    } else {
      // If the memory pool is not set up, use simple cudaMalloc.
      void* dev_ptr;
      CUDA_CHECK(cudaMalloc(&dev_ptr, nbytes));
      memory_allocated_before_setup_ = true;
      return dev_ptr;
    }
  }

  static inline void Delete(void* data) {
    if (is_memory_pool_setup_) {
      DeleteWithMemoryPool(data);
    } else {
      // If the memory pool is not set up, use simple cudaFree.
      cudaError_t error = cudaFree(data);
      // For some reason, in the Python runtime we sometimes delete a data
      // pointer after the cuda runtime has already exited - this is odd, but
      // is probably caused by a static workspace that pycaffe2 uses, with the
      // destruction entangled in some race condition. Since the cuda runtime
      // is exiting anyway, we do not need to worry about memory leaks, so we
      // basically ignore this case. This is definitely not ideal but works
      // for now.
      if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
        LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
                   << cudaGetErrorString(error);
      }
    }
  }

 private:
  // CudaMemoryPool is a singleton, so it should not be instantiated.
  CudaMemoryPool() {}
  static void* NewWithMemoryPool(size_t nbytes);
  static void DeleteWithMemoryPool(void* data);

  static bool is_memory_pool_setup_;
  static bool memory_allocated_before_setup_;
  static vector<bool> memory_pool_available_for_device_;
  static vector<cudaStream_t> per_device_streams_;
};

}  // namespace caffe2

#endif  // CAFFE2_CORE_CUDA_MEMORYPOOL_H_

caffe2/core/cuda_memorypool_test.cc | 64 (new file)
@@ -0,0 +1,64 @@
#include "caffe2/core/cuda_memorypool.h"
|
||||
#include "caffe2/core/context_gpu.h"
|
||||
#include "gtest/gtest.h"
|
||||
#include "glog/logging.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
struct UseMemoryPool { static const bool value = true; };
|
||||
struct NotUseMemoryPool { static const bool value = false; };
|
||||
|
||||
template <class UsePoolOrNot>
|
||||
class MemoryPoolTest : public ::testing::Test {
|
||||
protected:
|
||||
MemoryPoolTest() : device_count_(0) {}
|
||||
// virtual void SetUp() will be called before each test is run. You
|
||||
// should define it if you need to initialize the varaibles.
|
||||
// Otherwise, this can be skipped.
|
||||
void SetUp() override {
|
||||
int device_count_;
|
||||
CUDA_CHECK(cudaGetDeviceCount(&device_count_));
|
||||
// If we test with the memory pool, initialize the memory pool.
|
||||
if (UsePoolOrNot::value) {
|
||||
vector<int> device_ids(device_count_);
|
||||
for (int i = 0; i < device_count_; ++i) {
|
||||
device_ids[i] = i;
|
||||
}
|
||||
CHECK(CudaMemoryPool::InitializeMemoryPool(device_ids, 0.8));
|
||||
}
|
||||
}
|
||||
|
||||
void TearDown() override {
|
||||
if (UsePoolOrNot::value) {
|
||||
CHECK(CudaMemoryPool::FinalizeMemoryPool());
|
||||
}
|
||||
}
|
||||
|
||||
// Declares the variables your tests want to use.
|
||||
int device_count_;
|
||||
};
|
||||
|
||||
typedef ::testing::Types<UseMemoryPool, NotUseMemoryPool> MemoryPoolTestTypes;
|
||||
TYPED_TEST_CASE(MemoryPoolTest, MemoryPoolTestTypes);
|
||||
|
||||
// This just tests that setup and teardown works.
|
||||
TYPED_TEST(MemoryPoolTest, InitializeAndFinalizeWorks) {
|
||||
EXPECT_TRUE(true);
|
||||
}
|
||||
|
||||
TYPED_TEST(MemoryPoolTest, AllocateAndDeallocate) {
|
||||
const int nbytes = 1048576;
|
||||
for (int i = 0; i < this->device_count_; ++i) {
|
||||
LOG(INFO) << "Device " << i << " of " << this->device_count_;
|
||||
CUDA_CHECK(cudaSetDevice(i));
|
||||
void* allocated = CUDAContext::New(nbytes);
|
||||
EXPECT_NE(allocated, nullptr);
|
||||
cudaPointerAttributes attr;
|
||||
CUDA_CHECK(cudaPointerGetAttributes(&attr, allocated));
|
||||
EXPECT_EQ(attr.memoryType, cudaMemoryTypeDevice);
|
||||
EXPECT_EQ(attr.device, i);
|
||||
CUDAContext::Delete(allocated);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace caffe2
|

third_party/cnmem/BREW | 9 (new file, vendored)
@@ -0,0 +1,9 @@
cuda_library(
    name = "cnmem",
    srcs = [
        "cnmem.cpp",
    ],
    hdrs = [
        "cnmem.h",
    ],
)

third_party/cnmem/cnmem.cpp | 1287 (new file, vendored)
(File diff suppressed because it is too large.)

third_party/cnmem/cnmem.h | 263 (new file, vendored)
@@ -0,0 +1,263 @@
/* **********************************************************************
 * Copyright (c) 2015, NVIDIA CORPORATION. All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *  * Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of NVIDIA CORPORATION nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 * ********************************************************************** */
#pragma once

#ifdef __cplusplus
#include "cstdio"
#else
#include "stdio.h"
#endif
#include "cuda_runtime_api.h"

#if defined(_MSC_VER) || defined(WIN32)
#ifdef CNMEM_DLLEXPORT
#define CNMEM_API __declspec(dllexport)
#else
#define CNMEM_API __declspec(dllimport)
#endif
#else
#ifdef CNMEM_DLLEXPORT
#define CNMEM_API __attribute__((visibility ("default")))
#else
#define CNMEM_API
#endif
#endif

#define CNMEM_VERSION 100 // It corresponds to 1.0.0

#ifdef __cplusplus
extern "C" {
#endif

/* ********************************************************************************************* */

typedef enum
{
    CNMEM_STATUS_SUCCESS = 0,
    CNMEM_STATUS_CUDA_ERROR,
    CNMEM_STATUS_INVALID_ARGUMENT,
    CNMEM_STATUS_NOT_INITIALIZED,
    CNMEM_STATUS_OUT_OF_MEMORY,
    CNMEM_STATUS_UNKNOWN_ERROR
} cnmemStatus_t;

/* ********************************************************************************************* */

typedef enum
{
    CNMEM_FLAGS_DEFAULT = 0,      /// Default flags.
    CNMEM_FLAGS_CANNOT_GROW = 1,  /// Prevent the manager from growing its memory consumption.
    CNMEM_FLAGS_CANNOT_STEAL = 2, /// Prevent the manager from stealing memory.
} cnmemManagerFlags_t;

/* ********************************************************************************************* */

typedef struct cnmemDevice_t_
{
    /** The device number. */
    int device;
    /** The size to allocate for that device. If 0, the implementation chooses the size. */
    size_t size;
    /** The number of named streams associated with the device. The NULL stream is not counted. */
    int numStreams;
    /** The streams associated with the device. It can be NULL. The NULL stream is managed. */
    cudaStream_t *streams;
    /** The size reserved for each streams. It can be 0. */
    size_t *streamSizes;

} cnmemDevice_t;

/**
 * \brief Initialize the library and allocate memory on the listed devices.
 *
 * For each device, an internal memory manager is created and the specified amount of memory is
 * allocated (it is the size defined in device[i].size). For each, named stream an additional
 * memory manager is created. Currently, it is implemented as a tree of memory managers: A root
 * manager for the device and a list of children, one for each named stream.
 *
 * This function must be called before any other function in the library. It has to be called
 * by a single thread since it is not thread-safe.
 *
 * \return
 * CNMEM_STATUS_SUCCESS, if everything goes fine,
 * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
 * CNMEM_STATUS_OUT_OF_MEMORY, if the requested size exceeds the available memory,
 * CNMEM_STATUS_CUDA_ERROR, if an error happens in a CUDA function.
 */
cnmemStatus_t CNMEM_API cnmemInit(int numDevices, const cnmemDevice_t *devices, unsigned flags);

/**
 * \brief Release all the allocated memory.
 *
 * This function must be called by a single thread and after all threads that called
 * cnmemMalloc/cnmemFree have joined. This function is not thread-safe.
 *
 * \return
 * CNMEM_STATUS_SUCCESS, if everything goes fine,
 * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
 * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
 */
cnmemStatus_t CNMEM_API cnmemFinalize();

/**
 * \brief Increase the internal reference counter of the context object.
 *
 * This function increases the internal reference counter of the library. The purpose of that
 * reference counting mechanism is to give more control to the user over the lifetime of the
 * library. It is useful with scoped memory allocation which may be destroyed in a final
 * memory collection after the end of main(). That function is thread-safe.
 *
 * \return
 * CNMEM_STATUS_SUCCESS, if everything goes fine,
 * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
 */
cnmemStatus_t CNMEM_API cnmemRetain();

/**
 * \brief Decrease the internal reference counter of the context object.
 *
 * This function decreases the internal reference counter of the library. The purpose of that
 * reference counting mechanism is to give more control to the user over the lifetime of the
 * library. It is useful with scoped memory allocation which may be destroyed in a final
 * memory collection after the end of main(). That function is thread-safe.
 *
 * You can use \c cnmemRelease to explicitly finalize the library.
 *
 * \return
 * CNMEM_STATUS_SUCCESS, if everything goes fine,
 * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
 */
cnmemStatus_t CNMEM_API cnmemRelease();

/**
 * \brief Add a new stream to the pool of managed streams on a device.
 *
 * This function registers a new stream into a device memory manager. It is thread-safe.
 *
 * \return
 * CNMEM_STATUS_SUCCESS, if everything goes fine,
 * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
 */
cnmemStatus_t CNMEM_API cnmemRegisterStream(cudaStream_t stream);

/**
 * \brief Allocate memory.
 *
 * This function allocates memory and initializes a pointer to device memory. If no memory
 * is available, it returns a CNMEM_STATUS_OUT_OF_MEMORY error. This function is thread safe.
 *
 * The behavior of that function is the following:
 *
 * - If the stream is NULL, the root memory manager is asked to allocate a buffer of device
 *   memory. If there's a buffer of size larger or equal to the requested size in the list of
 *   free blocks, it is returned. If there's no such buffer but the manager is allowed to grow
 *   its memory usage (the CNMEM_FLAGS_CANNOT_GROW flag is not set), the memory manager calls
 *   cudaMalloc. If cudaMalloc fails due to no more available memory or the manager is not
 *   allowed to grow, the manager attempts to steal memory from one of its children (unless
 *   CNMEM_FLAGS_CANNOT_STEAL is set). If that attempt also fails, the manager returns
 *   CNMEM_STATUS_OUT_OF_MEMORY.
 *
 * - If the stream is a named stream, the initial request goes to the memory manager associated
 *   with that stream. If a free node is available in the lists of that manager, it is returned.
 *   Otherwise, the request is passed to the root node and works as if the request were made on
 *   the NULL stream.
 *
 * The calls to cudaMalloc are potentially costly and may induce GPU synchronizations. Also the
 * mechanism to steal memory from the children induces GPU synchronizations (the manager has to
 * make sure no kernel uses a given buffer before stealing it) and it the execution is
 * sequential (in a multi-threaded context, the code is executed in a critical section inside
 * the cnmem library - no need for the user to wrap cnmemMalloc with locks).
 *
 * \return
 * CNMEM_STATUS_SUCCESS, if everything goes fine,
 * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
 * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
 * CNMEM_STATUS_OUT_OF_MEMORY, if there is not enough memory available,
 * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
 */
cnmemStatus_t CNMEM_API cnmemMalloc(void **ptr, size_t size, cudaStream_t stream);

/**
 * \brief Release memory.
 *
 * This function releases memory and recycles a memory block in the manager. This function is
 * thread safe.
 *
 * \return
 * CNMEM_STATUS_SUCCESS, if everything goes fine,
 * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
 * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, ptr == 0,
 * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
 */
cnmemStatus_t CNMEM_API cnmemFree(void *ptr, cudaStream_t stream);

/* ********************************************************************************************* */
/* Utility functions. */
/* ********************************************************************************************* */

/**
 * \brief Returns the amount of memory managed by the memory manager associated with a stream.
 *
 * The pointers totalMem and freeMem must be valid. At the moment, this function has a comple-
 * xity linear in the number of allocated blocks so do not call it in performance critical
 * sections.
 *
 * \return
 * CNMEM_STATUS_SUCCESS, if everything goes fine,
 * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
 * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid,
 * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
 */
cnmemStatus_t CNMEM_API cnmemMemGetInfo(size_t *freeMem, size_t *totalMem, cudaStream_t stream);

/**
 * \brief Print a list of nodes to a file.
 *
 * This function is intended to be used in case of complex scenarios to help understand the
 * behaviour of the memory managers/application. It is thread safe.
 *
 * \return
 * CNMEM_STATUS_SUCCESS, if everything goes fine,
 * CNMEM_STATUS_NOT_INITIALIZED, if the ::cnmemInit function has not been called,
 * CNMEM_STATUS_INVALID_ARGUMENT, if one of the argument is invalid. For example, used_mem == 0
 * or free_mem == 0,
 * CNMEM_STATUS_CUDA_ERROR, if an error happens in one of the CUDA functions.
 */
cnmemStatus_t CNMEM_API cnmemPrintMemoryState(FILE *file, cudaStream_t stream);

/**
 * \brief Converts a cnmemStatus_t value to a string.
 */
const char CNMEM_API * cnmemGetErrorString(cnmemStatus_t status);

/* ********************************************************************************************* */

#ifdef __cplusplus
} // extern "C"
#endif
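
For reference, a minimal sketch of driving the cnmem API declared above directly, mirroring what CudaMemoryPool::InitializeMemoryPool/New/Delete do. This is illustrative code, not part of the commit; the CHECK_CNMEM helper is made up for the example, and error handling is reduced to exiting on failure.

#include <cstdio>
#include <cstdlib>
#include <cuda_runtime_api.h>
#include "third_party/cnmem/cnmem.h"

#define CHECK_CNMEM(call)                                                  \
  do {                                                                     \
    cnmemStatus_t s = (call);                                              \
    if (s != CNMEM_STATUS_SUCCESS) {                                       \
      std::fprintf(stderr, "cnmem error: %s\n", cnmemGetErrorString(s));   \
      std::exit(1);                                                        \
    }                                                                      \
  } while (0)

int main() {
  cudaStream_t stream;
  cudaStreamCreate(&stream);

  size_t free_mem = 0, total_mem = 0;
  cudaMemGetInfo(&free_mem, &total_mem);

  // Reserve half of the currently free memory on device 0 for the pool,
  // and register one named stream with its own sub-manager.
  cnmemDevice_t dev = {};
  dev.device = 0;
  dev.size = free_mem / 2;
  dev.numStreams = 1;
  dev.streams = &stream;
  size_t stream_size = dev.size;
  dev.streamSizes = &stream_size;
  CHECK_CNMEM(cnmemInit(1, &dev, CNMEM_FLAGS_DEFAULT));

  // Allocations on the named stream come out of the pool, not cudaMalloc.
  void* ptr = nullptr;
  CHECK_CNMEM(cnmemMalloc(&ptr, 1 << 20, stream));
  CHECK_CNMEM(cnmemFree(ptr, stream));

  CHECK_CNMEM(cnmemFinalize());
  cudaStreamDestroy(stream);
  return 0;
}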