#include <algorithm>
#include <atomic>
#include <cstdlib>
#include <string>

#include "cub/util_allocator.cuh"
#include "cnmem.h"

#include "caffe2/core/asan.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/init.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/tensor.h"
#include "caffe2/utils/string_utils.h"

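// CNMEM_CHECK evaluates a cnmem call and aborts with the corresponding cnmem
// error string on failure. The do { ... } while (0) wrapper makes the macro
// expand to a single statement, so it composes safely with if/else (a
// standard C/C++ macro idiom).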
#define CNMEM_CHECK(condition)                                            \
  do {                                                                    \
    cnmemStatus_t error = condition;                                      \
    CHECK_EQ(error, CNMEM_STATUS_SUCCESS) << cnmemGetErrorString(error);  \
  } while (0)

DEFINE_string(caffe2_cuda_memory_pool, "",
              "Sets the memory pool used by caffe2. Possible values are "
              "none, cnmem and cub.");
DEFINE_double(caffe2_cnmem_reserve, 0.8,
              "Sets the proportion of memory pre-allocated by the memory "
              "pool if you use cnmem.");
DEFINE_string(caffe2_cnmem_gpus, "",
              "A comma separated list containing the index of gpus that "
              "we will set the memory pool on. If not set, we will set "
              "up the memory pool on all available GPUs. This only applies "
              "to cnmem.");
// TODO(jiayq): Figure out the best default values for the params below.
// Currently we are using the setting copied from caffe.
DEFINE_int32(caffe2_cub_bin_growth, 2,
             "If using cub as the memory allocator, sets the growth of bins "
             "used by the cub pool.");
DEFINE_int32(caffe2_cub_min_bin, 6,
             "If using cub as the memory allocator, sets the min number of "
             "bins.");
DEFINE_int32(caffe2_cub_max_bin, 16,
             "If using cub as the memory allocator, sets the max number of "
             "bins.");
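
// Example (illustrative; "my_caffe2_binary" is a placeholder): a binary
// linked against caffe2 could enable the cnmem pool on GPUs 0 and 1,
// reserving half of the free memory on each:
//   ./my_caffe2_binary --caffe2_cuda_memory_pool=cnmem \
//       --caffe2_cnmem_gpus=0,1 --caffe2_cnmem_reserve=0.5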
namespace caffe2 {

CAFFE_KNOWN_TYPE(Tensor<CUDAContext>);

thread_local ThreadLocalCUDAObjects CUDAContext::cuda_objects_;

// TODO(jiayq): these variables should not be accessed during static
// initialization. We should consider moving them to a Meyers singleton to
// be totally safe against SIOF (the static initialization order fiasco).
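// Example (illustrative): a Meyers singleton wraps the global in a function
// whose function-local static is guaranteed to be initialized on first use:
//   CudaMemoryPoolType& GetPoolType() {
//     static CudaMemoryPoolType type = CudaMemoryPoolType::NONE;
//     return type;
//   }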

// Static global variables for setting up the memory pool.
CudaMemoryPoolType g_cuda_memory_pool_type;
// For cnmem allocator
vector<bool> g_cnmem_available_for_device;
// For cub allocator
unique_ptr<cub::CachingDeviceAllocator> g_cub_allocator;

CudaMemoryPoolType GetCudaMemoryPoolType() {
  return g_cuda_memory_pool_type;
}

///////////////////////////////////////////////////////////////////////////////
// A wrapper that allows us to lazily initialize all cuda environments that
// Caffe2 uses. This gets done the first time a caffe2::CUDAContext gets
// constructed or caffe2::CUDAContext::New() gets called, which is a strong
// indication that this caffe2 run is going to use GPUs. We avoid doing cuda
// initialization through the core/init.h functionalities so that we have
// minimal resource impact in case we need to run multiple caffe2 instances
// on one GPU machine.
///////////////////////////////////////////////////////////////////////////////

static void Caffe2InitializeCuda() {
  // If the current run does not have any cuda devices, do nothing.
  if (!HasCudaGPU()) {
    VLOG(1) << "No cuda gpu present. Skipping.";
    return;
  }
  // Check if the number of GPUs matches the expected compile-time max number
  // of GPUs.
  CHECK_LE(NumCudaDevices(), CAFFE2_COMPILE_TIME_MAX_GPUS)
      << "Number of CUDA devices on the machine is larger than the compiled "
         "max number of gpus expected ("
      << CAFFE2_COMPILE_TIME_MAX_GPUS
      << "). Increase that and recompile the caffe2 binary.";
  // Save the current device so we can restore it after moving across
  // different devices.
  int init_device;
  CUDA_CHECK(cudaGetDevice(&init_device));

  for (int i = 0; i < NumCudaDevices(); ++i) {
    auto err = cudaSetDevice(i);
    if (err != cudaSuccess) {
      LOG(WARNING)
          << "Cannot use device " << i
          << " due to the following error: " << cudaGetErrorString(err);
      continue;
    }
    // Enable peer access.
    for (int j = 0; j < NumCudaDevices(); ++j) {
      if (i == j) continue;
      int can_access;
      CUDA_CHECK(cudaDeviceCanAccessPeer(&can_access, i, j));
      if (can_access) {
        VLOG(1) << "Enabling peer access from " << i << " to " << j;
        // Note: just for future reference, the 0 here is not a gpu id, it is
        // a reserved flag for cudaDeviceEnablePeerAccess that should always
        // be zero currently.
        CUDA_CHECK(cudaDeviceEnablePeerAccess(j, 0));
      }
    }
  }
  // Restore the current device.
  CUDA_CHECK(cudaSetDevice(init_device));
}

static void SetUpCNMEM() {
  g_cnmem_available_for_device.assign(NumCudaDevices(), false);
  VLOG(1) << "Setting up cnmem memory pool.";
  vector<int> device_ids;
  // If the cnmem gpus are not set, set up all gpus.
  if (FLAGS_caffe2_cnmem_gpus.size() == 0) {
    device_ids.resize(NumCudaDevices());
    for (int i = 0; i < device_ids.size(); ++i) {
      device_ids[i] = i;
    }
  } else {
    vector<string> device_ids_str = split(',', FLAGS_caffe2_cnmem_gpus);
    for (const string& id_str : device_ids_str) {
      int id = 0;
      try {
        id = std::stoi(id_str);
      } catch (...) {
        CAFFE_THROW(
            "Cannot parse device id ",
            id_str,
            " as a valid int number.");
      }
      device_ids.push_back(id);
    }
  }
  CAFFE_ENFORCE(FLAGS_caffe2_cnmem_reserve >= 0 &&
                    FLAGS_caffe2_cnmem_reserve < 1.0,
                "caffe2_cnmem_reserve number must be in [0, 1)");
  vector<cnmemDevice_t> cnmem_devs(device_ids.size());
  for (int i = 0; i < device_ids.size(); ++i) {
    const int id = device_ids[i];
    CAFFE_ENFORCE(
        id >= 0 && id < NumCudaDevices(),
        "GPU id ", id, " out of the range of available GPUs.");
    DeviceGuard guard(id);
    size_t free, total;
    CUDA_CHECK(cudaMemGetInfo(&free, &total));
    VLOG(1) << "Reserving " << FLAGS_caffe2_cnmem_reserve * 100
            << " percent of the free memory (" << free << " bytes out of "
            << total << " total) on device " << id;
    // Note: we do not set up any per-stream memory reserves here (numStreams
    // is 0 and streamSizes is nullptr); all cnmem allocations are issued
    // against the default (null) stream, so a malloc can be requested from
    // any of the many non-default streams that caffe2 uses for computation.
    cnmem_devs[i].device = id;
    cnmem_devs[i].size = size_t(FLAGS_caffe2_cnmem_reserve * free);
    cnmem_devs[i].numStreams = 0;
    cnmem_devs[i].streamSizes = nullptr;
    g_cnmem_available_for_device[id] = true;
  }
  CNMEM_CHECK(
      cnmemInit(cnmem_devs.size(), cnmem_devs.data(), CNMEM_FLAGS_DEFAULT));
  VLOG(1) << "Done setting up cnmem memory pool.";
}

static void SetUpCub() {
  VLOG(1) << "Setting up cub memory pool.";
  const bool k_cub_debug =
#ifdef NDEBUG
      false;
#else
      true;
#endif
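  // A note on cub's bin geometry (from cub's documented behavior): with the
  // default flags here (bin growth 2, min bin 6, max bin 16), the pool caches
  // blocks in power-of-two bins from 2^6 = 64 bytes up to 2^16 = 64 KiB;
  // requests larger than the max bin fall through to plain cudaMalloc.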
  // Sets up the cub memory pool.
  try {
    g_cub_allocator.reset(new cub::CachingDeviceAllocator(
        FLAGS_caffe2_cub_bin_growth,
        FLAGS_caffe2_cub_min_bin,
        FLAGS_caffe2_cub_max_bin,
        static_cast<size_t>(-1),
        false,
        k_cub_debug));
  } catch (...) {
    CAFFE_THROW("Cub memory pool initialization failed.");
  }
  VLOG(1) << "Done setting up cub memory pool.";
}

static void Caffe2SetCUDAMemoryPool() {
  if (FLAGS_caffe2_cuda_memory_pool == "" ||
      FLAGS_caffe2_cuda_memory_pool == "none") {
    g_cuda_memory_pool_type = CudaMemoryPoolType::NONE;
  } else if (FLAGS_caffe2_cuda_memory_pool == "cnmem") {
    // Sets up cnmem.
    g_cuda_memory_pool_type = CudaMemoryPoolType::CNMEM;
    SetUpCNMEM();
  } else if (FLAGS_caffe2_cuda_memory_pool == "cub") {
    // Sets up cub.
    g_cuda_memory_pool_type = CudaMemoryPoolType::CUB;
    SetUpCub();
  } else {
    CAFFE_THROW("Unrecognized cuda memory pool type: ",
                FLAGS_caffe2_cuda_memory_pool);
  }
}

// An initialization function that sets the CPU side to use the pinned cpu
// allocator.
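// Pinned (page-locked) host memory can be copied to and from the GPU
// asynchronously and at full bus bandwidth, which is why the CPU-side
// allocator is swapped out once a GPU is present.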
void Caffe2UsePinnedCPUAllocator() {
#if CAFFE2_ASAN_ENABLED
  // Note(jiayq): for more details, see
  //     https://github.com/google/sanitizers/issues/629
  LOG(WARNING) << "There are known issues between address sanitizer and "
                  "cudaMallocHost. As a result, caffe2 will not enable pinned "
                  "memory allocation in asan mode. If you are expecting any "
                  "behavior that depends on asan, be advised that it is not "
                  "turned on.";
#else
  if (!HasCudaGPU()) {
    VLOG(1) << "No GPU present. Not using the pinned CPU allocator.";
    return;
  }
  VLOG(1) << "Caffe2 gpu: setting CPUAllocator to PinnedCPUAllocator.";
  SetCPUAllocator(new PinnedCPUAllocator());
#endif
}

// Caffe2CudaInitializerHelper is a minimal struct whose sole purpose is to
// detect the first hint that this Caffe2 run is going to use GPUs: either a
// CUDAContext is constructed or CUDAContext::New is called. It then runs all
// the related cuda initialization functions.
namespace {
struct Caffe2CudaInitializerHelper {
  Caffe2CudaInitializerHelper() {
    // We cannot use bool because nvcc changes bool to __nv_bool, which does
    // not have a std::atomic instantiation.
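    // fetch_and(0) atomically clears first_call and returns its previous
    // value, so exactly one thread (the first caller) runs the body below.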
    static std::atomic<char> first_call(1);
    if (first_call.fetch_and((char)0)) {
      Caffe2InitializeCuda();
      Caffe2SetCUDAMemoryPool();
      Caffe2UsePinnedCPUAllocator();
    }
  }
};
}  // namespace

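// Note: each function-local `static Caffe2CudaInitializerHelper` below is
// constructed exactly once, on the first pass through its enclosing function,
// so cuda initialization is triggered by whichever entry point runs first.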
CUDAContext::CUDAContext(const int gpu_id)
    : gpu_id_(gpu_id == -1 ? GetDefaultGPUID() : gpu_id),
      random_seed_(math::randomNumberSeed()) {
  static Caffe2CudaInitializerHelper g_cuda_initializer_;
}

CUDAContext::CUDAContext(const DeviceOption& option)
    : gpu_id_(option.has_cuda_gpu_id() ?
              option.cuda_gpu_id() : GetDefaultGPUID()),
      random_seed_(option.has_random_seed() ?
                   option.random_seed() : math::randomNumberSeed()) {
  static Caffe2CudaInitializerHelper g_cuda_initializer_;
  DCHECK_EQ(option.device_type(), CUDA);
}

void* CUDAContext::New(size_t nbytes) {
  // A one-time caffe2 cuda initializer.
  static Caffe2CudaInitializerHelper g_cuda_initializer_;
  void* ptr = nullptr;
  switch (g_cuda_memory_pool_type) {
    case CudaMemoryPoolType::NONE:
      CUDA_CHECK(cudaMalloc(&ptr, nbytes));
      return ptr;
    case CudaMemoryPoolType::CNMEM: {
      auto gpuId = GetCurrentGPUID();
      CAFFE_ENFORCE(
          gpuId < g_cnmem_available_for_device.size() &&
              g_cnmem_available_for_device[gpuId],
          "Trying to allocate on device ",
          gpuId,
          " but cnmem pool is not set up for it.");
      CNMEM_CHECK(cnmemMalloc(&ptr, nbytes, nullptr));
      return ptr;
    }
    case CudaMemoryPoolType::CUB:
      CUDA_CHECK(g_cub_allocator->DeviceAllocate(&ptr, nbytes));
      return ptr;
  }
  return nullptr;
}

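// Note: a pointer returned by CUDAContext::New must be released through
// CUDAContext::Delete below, so that it is returned to the pool that
// produced it; freeing it with raw cudaFree would bypass the cnmem/cub
// bookkeeping. Example (illustrative):
//   void* buf = caffe2::CUDAContext::New(1024);
//   caffe2::CUDAContext::Delete(buf);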
void CUDAContext::Delete(void* ptr) {
  switch (g_cuda_memory_pool_type) {
    case CudaMemoryPoolType::NONE: {
      // If memory pool is not set up, use simple cudaFree.
      cudaError_t error = cudaFree(ptr);
      // For some reason, in the Python runtime we sometimes delete a data
      // pointer after the cuda runtime has already exited - this is odd but
      // is probably caused by a static workspace that pycaffe2 uses, whose
      // destruction got entangled in some race condition. Since the cuda
      // runtime is exiting anyway, we do not need to worry about a memory
      // leak, so we basically ignore that error. This is definitely not
      // ideal but works for now.
      if (error != cudaSuccess && error != cudaErrorCudartUnloading) {
        LOG(FATAL) << "Error at: " << __FILE__ << ":" << __LINE__ << ": "
                   << cudaGetErrorString(error);
      }
      break;
    }
    case CudaMemoryPoolType::CNMEM:
      CNMEM_CHECK(cnmemFree(ptr, nullptr));
      break;
    case CudaMemoryPoolType::CUB:
      CUDA_CHECK(g_cub_allocator->DeviceFree(ptr));
      break;
  }
}

} // namespace caffe2