pytorch/caffe2/core/hip/context_hip.h
Jerry Zhang 74dc4460eb New in StaticContext returns at::DataPtr (#12029)
Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/12029

In order to remove the New() function in StaticContext (and eventually remove StaticContext itself) and converge on the Allocator design, we first change the return type of New to at::DataPtr.

Reviewed By: ezyang

Differential Revision: D9889990

fbshipit-source-id: 3257c763530b987025f428741bdd2e089d11bad4
2018-10-03 19:10:07 -07:00
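
A minimal before/after sketch of the signature change this commit describes. The "before" form is an assumption based on Caffe2's earlier pointer-plus-MemoryDeleter convention; the "after" form appears verbatim in the header below:

    // before (assumed): raw pointer paired with an explicit deleter
    std::pair<void*, MemoryDeleter> New(size_t nbytes) const;

    // after: an owning at::DataPtr that carries its own deleter
    at::DataPtr New(size_t nbytes) const override;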


#ifndef CAFFE2_CORE_CONTEXT_HIP_H_
#define CAFFE2_CORE_CONTEXT_HIP_H_
#include <hiprand.h>
#include <ctime>
#include <mutex>
#include "caffe2/core/context.h"
#include "caffe2/core/hip/common_hip.h"
#include "caffe2/core/hip/common_miopen.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/numa.h"
#include "caffe2/core/tensor.h"
#include "caffe2/core/types.h"
#include "caffe2/proto/caffe2_pb.h"
namespace caffe2 {

enum class HipMemoryPoolType {
  NONE = 0,
  CUB = 1,
  THC = 2,
};

/**
 * Gets the current memory pool type used by Caffe2.
 *
 * The memory pool is set up during Caffe2's global initialization.
 */
HipMemoryPoolType GetHipMemoryPoolType();
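
// Usage sketch (illustrative only, not part of the original header): callers
// can branch on the configured pool type, e.g.
//
//   if (GetHipMemoryPoolType() == HipMemoryPoolType::CUB) {
//     // device allocations are served by the CUB caching allocator
//   }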

/**
 * A class to host thread-local HIP objects.
 *
 * In Caffe2, each thread has its own non-default HIP stream as well as
 * related objects such as rocblas and miopen handles. This is achieved by
 * having the ThreadLocalHIPObjects wrapper that takes care of allocating
 * and deallocating these objects at the thread scope. This class is solely
 * used inside HIPContext and should not be used externally.
 */
class ThreadLocalHIPObjects {
  friend class HIPContext;

 private:
  ThreadLocalHIPObjects() {
    for (int i = 0; i < CAFFE2_COMPILE_TIME_MAX_HIP_GPUS; ++i) {
      hip_streams_[i] = vector<hipStream_t>();
      rocblas_handles_[i] = vector<rocblas_handle>();
      miopen_handles_[i] = vector<miopenHandle_t>();
    }
  }

  hipStream_t GetStream(int gpu, int stream_id) {
    vector<hipStream_t>& gpu_streams = hip_streams_[gpu];
    if (gpu_streams.size() <= (unsigned)stream_id) {
      gpu_streams.resize(stream_id + 1, nullptr);
    }
    if (!gpu_streams[stream_id]) {
      DeviceGuard guard(gpu);
      HIP_ENFORCE(hipStreamCreateWithFlags(
          &gpu_streams[stream_id], hipStreamNonBlocking));
    }
    return gpu_streams[stream_id];
  }

  rocblas_handle GetHandle(int gpu, int stream_id) {
    DeviceGuard guard(gpu);
    vector<rocblas_handle>& gpu_handles = rocblas_handles_[gpu];
    if (gpu_handles.size() <= (unsigned)stream_id) {
      gpu_handles.resize(stream_id + 1, nullptr);
    }
    if (!gpu_handles[stream_id]) {
      ROCBLAS_ENFORCE(rocblas_create_handle(&gpu_handles[stream_id]));
      // The default is rocblas_pointer_mode_host. You can override
      // it after obtaining the rocblas handle, but do that with
      // caution.
      ROCBLAS_ENFORCE(rocblas_set_pointer_mode(
          gpu_handles[stream_id], rocblas_pointer_mode_host));
      ROCBLAS_ENFORCE(rocblas_set_stream(
          gpu_handles[stream_id], GetStream(gpu, stream_id)));
    }
    return gpu_handles[stream_id];
  }

  miopenHandle_t GetMiopenHandle(int gpu, int stream_id) {
    DeviceGuard guard(gpu);
    vector<miopenHandle_t>& gpu_handles = miopen_handles_[gpu];
    if (gpu_handles.size() <= (unsigned)stream_id) {
      gpu_handles.resize(stream_id + 1, nullptr);
    }
    if (!gpu_handles[stream_id]) {
      MIOPEN_ENFORCE(miopenCreate(&gpu_handles[stream_id]));
      MIOPEN_ENFORCE(
          miopenSetStream(gpu_handles[stream_id], GetStream(gpu, stream_id)));
    }
    return gpu_handles[stream_id];
  }

  ~ThreadLocalHIPObjects() noexcept {
    for (int i = 0; i < CAFFE2_COMPILE_TIME_MAX_HIP_GPUS; ++i) {
      for (auto& handle : rocblas_handles_[i]) {
        if (handle) {
          ROCBLAS_CHECK(rocblas_destroy_handle(handle));
        }
      }
      for (auto& stream : hip_streams_[i]) {
        if (stream) {
          HIP_CHECK(hipStreamDestroy(stream));
        }
      }
      for (auto& handle : miopen_handles_[i]) {
        if (handle) {
          MIOPEN_CHECK(miopenDestroy(handle));
        }
      }
    }
  }

  vector<hipStream_t> hip_streams_[CAFFE2_COMPILE_TIME_MAX_HIP_GPUS];
  vector<rocblas_handle> rocblas_handles_[CAFFE2_COMPILE_TIME_MAX_HIP_GPUS];
  vector<miopenHandle_t> miopen_handles_[CAFFE2_COMPILE_TIME_MAX_HIP_GPUS];
};
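
// Illustrative note (not part of the original header): hip_objects_ in
// HIPContext below is thread_local, so two host threads requesting the same
// (gpu, stream_id) pair get distinct streams, e.g.
//
//   std::thread t([] {
//     // Private to this thread; does not alias the main thread's stream 0.
//     hipStream_t s = HIPContext::hip_stream(/*gpu_id=*/0, /*stream_id=*/0);
//   });
//   t.join();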

BaseStaticContext* GetHIPStaticContext();

class HIPContext final : public BaseContext {
 public:
  // The default HIP context constructor.
  explicit HIPContext(const int gpu_id = -1);
  explicit HIPContext(const DeviceOption& option);
  explicit HIPContext(const at::Device& device)
      : HIPContext(DeviceToOption(device)) {}

  ~HIPContext() override {
    if (hiprand_generator_) {
      HIPRAND_CHECK(hiprandDestroyGenerator(hiprand_generator_));
    }
    FinishDeviceComputation();
  }

  BaseStaticContext* GetStaticContext() const override {
    return GetHIPStaticContext();
  }

  static BaseStaticContext* StaticContext() {
    return GetHIPStaticContext();
  }
  inline void SwitchToDevice(int stream_id) override {
    set_stream_id(stream_id);
    CaffeHipSetDevice(gpu_id_);
  }
  using BaseContext::SwitchToDevice;

  inline void WaitEvent(const Event& ev) override {
    ev.Wait(HIP, this);
  }

  inline void Record(Event* ev, const char* err_msg = nullptr) const override {
    CAFFE_ENFORCE(ev, "Event must not be null.");
    ev->Record(HIP, this, err_msg);
  }

  void FinishDeviceComputation() override {
    hipStreamSynchronize(hip_objects_.GetStream(gpu_id_, stream_id_));
    hipError_t error = hipGetLastError();
    if (error != hipSuccess) {
      CAFFE_THROW("Encountered HIP error: ", hipGetErrorString(error));
    }
  }

  inline int hip_gpu_id() const {
    return gpu_id_;
  }

  inline hipStream_t hip_stream() {
    return hip_stream(gpu_id_, stream_id_);
  }

  inline hipStream_t hip_stream() const {
    return hip_stream(gpu_id_, stream_id_);
  }

  static hipStream_t hip_stream(int gpu_id, int stream_id) {
    return hip_objects_.GetStream(gpu_id, stream_id);
  }

  rocblas_handle rocblas_handle() {
    return hip_objects_.GetHandle(gpu_id_, stream_id_);
  }

  miopenHandle_t miopen_handle() {
    return hip_objects_.GetMiopenHandle(gpu_id_, stream_id_);
  }

  hiprandGenerator_t& hiprand_generator() {
    if (!hiprand_generator_) {
      DeviceGuard guard(gpu_id_);
      HIPRAND_ENFORCE(hiprandCreateGenerator(
          &hiprand_generator_, HIPRAND_RNG_PSEUDO_DEFAULT));
      HIPRAND_ENFORCE(hiprandSetPseudoRandomGeneratorSeed(
          hiprand_generator_, random_seed_));
      CHECK_NOTNULL(hiprand_generator_);
    }
    HIPRAND_ENFORCE(hiprandSetStream(hiprand_generator_, hip_stream()));
    return hiprand_generator_;
  }

  static at::DataPtr New(size_t nbytes) {
    return StaticContext()->New(nbytes);
  }
  // Get a mutex to lock out hipMalloc / hipFree calls when
  // NCCL kernels are being launched, to avoid deadlocks.
  static std::mutex& mutex();

  // Functions to query memory stats. Only available if the flag
  // --caffe2_gpu_memory_tracking is enabled.
  static std::vector<long> TotalMemoryByGpu();
  static std::vector<long> MaxMemoryByGpu();
  template <class SrcContext, class DstContext>
  inline void CopyBytes(size_t nbytes, const void* src, void* dst) {
    if (nbytes == 0)
      return;
    HIP_ENFORCE(hipMemcpyAsync(
        dst,
        src,
        nbytes,
        hipMemcpyDefault,
        hip_objects_.GetStream(gpu_id_, stream_id_)));
  }

  void CopyBytesSameDevice(size_t nbytes, const void* src, void* dst) override {
    CopyBytes<HIPContext, HIPContext>(nbytes, src, dst);
  }

  void CopyBytesToCPU(size_t nbytes, const void* src, void* dst) override {
    CopyBytes<HIPContext, CPUContext>(nbytes, src, dst);
  }

  void CopyBytesFromCPU(size_t nbytes, const void* src, void* dst) override {
    CopyBytes<CPUContext, HIPContext>(nbytes, src, dst);
  }

  template <typename T, class SrcContext, class DstContext>
  inline void Copy(int n, const T* src, T* dst) {
    CopyBytes<SrcContext, DstContext>(
        n * sizeof(T), static_cast<const void*>(src), static_cast<void*>(dst));
  }

  template <class SrcContext, class DstContext>
  inline void
  CopyItems(const TypeMeta& meta, size_t n, const void* src, void* dst) {
    CAFFE_ENFORCE(!meta.copy(), "HIPContext requires fundamental types.");
    CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
  }

  // By default, HIP operators have async device parts.
  static bool HasAsyncPartDefault() {
    return true;
  }

  static bool SupportsAsyncScheduling() {
    return true;
  }

  static bool IsStreamFree(const DeviceOption& option, int stream_id) {
    auto stream = HIPContext::hip_stream(option.hip_gpu_id(), stream_id);
    return hipStreamQuery(stream) == hipSuccess;
  }

  DeviceType device_type() const override {
    return HIP;
  }

  static constexpr DeviceType GetDeviceType() {
    return HIP;
  }
 protected:
  static void Delete(void* data);
  void set_stream_id(int stream_id) {
    stream_id_ = stream_id;
  }

  int gpu_id_;
  int stream_id_ = 0;
  int random_seed_;
  hiprandGenerator_t hiprand_generator_{nullptr};
  static thread_local ThreadLocalHIPObjects hip_objects_;
};
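
// Illustrative usage sketch (not part of the original header; host_src is a
// hypothetical host buffer). With this diff, New() returns an owning
// at::DataPtr, so callers no longer pair it with a manual Delete():
//
//   HIPContext context(/*gpu_id=*/0);
//   context.SwitchToDevice(0);
//   at::DataPtr buf = HIPContext::New(1024); // frees itself on destruction
//   context.CopyBytesFromCPU(1024, host_src, buf.get());
//   context.FinishDeviceComputation(); // synchronize this thread's stream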

// For the CPU context, we also allow a (probably expensive) function
// to copy the data from a HIP context. Inside the function, we create
// a temporary HIPContext object to carry out the copy. From the caller's
// side, these functions are synchronous with respect to the host, similar
// to a normal CPUContext::CopyBytes<CPUContext, CPUContext> call.
template <>
inline void CPUContext::CopyBytes<HIPContext, CPUContext>(
    size_t nbytes,
    const void* src,
    void* dst) {
  HIPContext context(GetGPUIDForPointer(src));
  context.CopyBytes<HIPContext, CPUContext>(nbytes, src, dst);
}

template <>
inline void CPUContext::CopyBytes<CPUContext, HIPContext>(
    size_t nbytes,
    const void* src,
    void* dst) {
  HIPContext context(GetGPUIDForPointer(dst));
  context.CopyBytes<CPUContext, HIPContext>(nbytes, src, dst);
}
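
// Illustrative sketch (hypothetical caller): these specializations let code
// holding only a CPUContext pull data off the device; a temporary HIPContext
// is constructed internally and the call is synchronous for the host:
//
//   CPUContext cpu_context;
//   cpu_context.CopyBytes<HIPContext, CPUContext>(
//       nbytes, device_src, host_dst);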

/**
 * An allocator that does the CPU memory allocation with pinned memory.
 *
 * This is needed because if we want to do any asynchronous HIP memcpy,
 * the underlying CPU memory also needs to be allocated in pinned memory
 * space. As a result, whenever Caffe2 is built with GPU support and there
 * is a GPU present at runtime, at global initialization time we will set
 * the CPU memory allocator to allocate pinned memory.
 */
struct PinnedCPUAllocator final : public at::Allocator {
  PinnedCPUAllocator() {}
  ~PinnedCPUAllocator() override {}

  at::DataPtr allocate(size_t nbytes) const override {
    void* data;
    at::DataPtr data_ptr;
    std::lock_guard<std::mutex> lock(HIPContext::mutex());
    if (IsNUMAEnabled()) {
      data_ptr = baseAllocator_.allocate(nbytes);
      data = data_ptr.get();
      CAFFE_ENFORCE(data);
      HIP_ENFORCE(hipHostRegister(data, nbytes, hipHostRegisterDefault));
    } else {
      HIP_ENFORCE(hipHostMalloc(&data, nbytes));
      data_ptr = {data, data, &Delete, at::Device(CPU)};
    }
    memset(data, 0, nbytes);
    return data_ptr;
  }

  at::DeleterFnPtr raw_deleter() const override {
    return &Delete;
  }

 private:
  static void Delete(void* data) {
    // Caffe2 uses a lazy way to figure out if one is actually going to use
    // GPUs or not. If a HIPContext::New() call is made, inside the HIPContext
    // function we will switch the CPU-side allocator to a PinnedCPUAllocator.
    // But, if one calls CPUContext::New() before any HIP allocations,
    // PinnedCPUAllocator can still delete the corresponding memory.
    std::lock_guard<std::mutex> lock(HIPContext::mutex());
    if (IsNUMAEnabled()) {
      HIP_ENFORCE(hipHostUnregister(data));
      DefaultCPUAllocator::Delete(data);
    } else {
      hipError_t err = hipHostFree(data);
      if (err == hipErrorInvalidValue) {
        free(data);
        // Calling hipGetLastError will reset the HIP error state.
        hipGetLastError();
      } else {
        // For all other errors, still do a HIP check.
        HIP_ENFORCE(err);
      }
    }
  }

  DefaultCPUAllocator baseAllocator_;
};
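
// Illustrative sketch (an assumption about the companion .cc file, which is
// not shown here): at global init, when a GPU is detected, the pinned
// allocator is swapped in for the default CPU allocator, along the lines of
//
//   static PinnedCPUAllocator g_pinned_cpu_alloc;
//   SetCPUAllocator(&g_pinned_cpu_alloc); // hypothetical call site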

class HIPStaticContext final : public BaseStaticContext {
 public:
  at::DataPtr New(size_t nbytes) const override;

  DeviceType GetDeviceType() override {
    return HIP;
  }

  void ExtractDeviceOption(DeviceOption* device, const void* data) override {
    device->set_device_type(TypeToProto(GetDeviceType()));
    device->set_hip_gpu_id(GetGPUIDForPointer(data));
  }

 protected:
  static void Delete(void* data);
};
typedef Tensor TensorHIP;
} // namespace caffe2
#endif // CAFFE2_CORE_CONTEXT_HIP_H_