[CUDACachingAllocator] Turn Allocator::allocate into non-const (#120969)
Ideally, this method should be non-const, since it changes the allocator's state. Several const_casts are also removed along the way.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/120969
Approved by: https://github.com/albanD
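For downstream allocator implementations, the practical effect is that allocate() overrides drop the trailing const and no longer need const_cast workarounds to call mutating helpers. The following is a minimal sketch of a custom allocator written against the new signature; it is illustrative only and mirrors the shape of the DummyCustomAllocator in the test changes below. The std::malloc backing store and the copy_data override are assumptions (copy_data is assumed to be a pure virtual on the base class at this revision):

#include <cstdlib>
#include <cstring>

#include <c10/core/Allocator.h>

// Illustrative allocator, not part of this PR.
struct ExampleAllocator final : c10::Allocator {
  // Non-const override per this change: mutating internal bookkeeping here
  // no longer requires a const_cast.
  c10::DataPtr allocate(size_t nbytes) override {
    void* data = nbytes == 0 ? nullptr : std::malloc(nbytes);
    return {data, data, &Delete, c10::Device(c10::DeviceType::CPU)};
  }

  c10::DeleterFnPtr raw_deleter() const override {
    return &Delete;
  }

  // Assumed pure virtual on the base class; a plain memcpy suffices for a
  // flat CPU buffer.
  void copy_data(void* dest, const void* src, std::size_t count) const override {
    std::memcpy(dest, src, count);
  }

 private:
  static void Delete(void* ptr) {
    std::free(ptr);
  }
};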
@@ -316,7 +316,7 @@ struct MetaAllocator final : public at::Allocator {
   static void deleter(void* const pointer) {
     TORCH_INTERNAL_ASSERT(!pointer);
   }
-  DataPtr allocate(const size_t nbytes) const override {
+  DataPtr allocate(const size_t nbytes) override {
     return {nullptr, nullptr, &deleter, at::Device(DeviceType::Meta)};
   }
   DeleterFnPtr raw_deleter() const override {
@@ -492,7 +492,7 @@ void CachingHostAllocator_emptyCache() {
 }

 struct CUDAHostAllocatorWrapper final : public at::Allocator {
-  at::DataPtr allocate(size_t size) const override {
+  at::DataPtr allocate(size_t size) override {
     auto ptr_and_ctx = getCUDAHostAllocator().allocate(size);
     return {
         ptr_and_ctx.first,
@@ -15,7 +15,7 @@ class HIPAllocatorMasqueradingAsCUDA final : public Allocator {
  public:
   explicit HIPAllocatorMasqueradingAsCUDA(Allocator* allocator)
       : allocator_(allocator) {}
-  DataPtr allocate(size_t size) const override {
+  DataPtr allocate(size_t size) override {
     DataPtr r = allocator_->allocate(size);
     r.unsafe_set_device(Device(c10::DeviceType::CUDA, r.device().index()));
     return r;
@@ -748,7 +748,7 @@ struct TORCH_API MPSAllocator final : public IMPSAllocator {
     return &Delete;
   }

-  DataPtr allocate(const size_t nbytes) const override {
+  DataPtr allocate(const size_t nbytes) override {
     __block id<MTLBuffer> buf = nbytes > 0 ? _getAllocImpl().malloc(nbytes, m_usage) : nullptr;
     return {buf, buf, &Delete, at::Device(at::DeviceType::MPS, 0)};
   }
@@ -124,7 +124,7 @@ struct ZeroTensorAllocator final : public at::Allocator {
   static void deleter(void* const pointer) {
     TORCH_INTERNAL_ASSERT(!pointer);
   }
-  DataPtr allocate(const size_t /*nbytes*/) const override {
+  DataPtr allocate(const size_t /*nbytes*/) override {
     return {nullptr, nullptr, &deleter, device_};
   }
   DeleterFnPtr raw_deleter() const override {
@@ -17,7 +17,7 @@ void* XLAMalloc(ptrdiff_t size) {
 }

 struct XLAAllocator final : public at::Allocator {
-  at::DataPtr allocate(size_t size) const override {
+  at::DataPtr allocate(size_t size) override {
     auto* ptr = XLAMalloc(size);
     return {ptr, ptr, &XLAFree, at::DeviceType::XLA};
   }
@@ -4,7 +4,7 @@

 namespace c10 {

-DataPtr Allocator::clone(const void* data, std::size_t n) const {
+DataPtr Allocator::clone(const void* data, std::size_t n) {
   DataPtr new_data = allocate(n);
   copy_data(new_data.mutable_get(), data, n);
   return new_data;
@@ -160,7 +160,7 @@ inline bool operator!=(std::nullptr_t, const DataPtr& dp) noexcept {
 struct C10_API Allocator {
   virtual ~Allocator() = default;

-  virtual DataPtr allocate(size_t n) const = 0;
+  virtual DataPtr allocate(size_t n) = 0;

   // Clones an allocation that came from this allocator.
   //
@@ -171,7 +171,7 @@ struct C10_API Allocator {
   // attached to the input data.
   //
   // Requires: input data was allocated by the same allocator.
-  DataPtr clone(const void* data, std::size_t n) const;
+  DataPtr clone(const void* data, std::size_t n);

   // Checks if DataPtr has a simple context, not wrapped with any out of the
   // ordinary contexts.
@@ -17,7 +17,7 @@ namespace c10 {

 struct C10_API DefaultCPUAllocator final : at::Allocator {
   DefaultCPUAllocator() = default;
-  at::DataPtr allocate(size_t nbytes) const override {
+  at::DataPtr allocate(size_t nbytes) override {
     void* data = nullptr;
     try {
       data = c10::alloc_cpu(nbytes);
@@ -103,7 +103,7 @@ class DefaultMobileCPUAllocator final : public at::Allocator {
     }
   }

-  DataPtr allocate(const size_t nbytes) const override {
+  DataPtr allocate(const size_t nbytes) override {
     if (C10_UNLIKELY(0u == nbytes)) {
       return {
           nullptr,
@@ -2262,7 +2262,7 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
           storage_offset_ == 0); // because we just reallocated
       return storage_.mutable_data();
     }
-    const Allocator* allocator = storage_.allocator();
+    Allocator* allocator = storage_.allocator();
     // Storage might have nullptr allocator in rare cases, for example, if
     // an external memory segment has been wrapped with Tensor and we don't
     // know how to reallocate it. However, in order to preserve legacy C2
@@ -3106,7 +3106,7 @@ class NativeCachingAllocator : public CUDAAllocator {
     return cpd;
   }

-  DataPtr allocate(size_t size) const override {
+  DataPtr allocate(size_t size) override {
     constexpr size_t one_exa_bytes = 1152921504606846976ULL;
     TORCH_CHECK_WITH(
         OutOfMemoryError,
@@ -3131,9 +3131,7 @@ class NativeCachingAllocator : public CUDAAllocator {
       }
     } else {
       if (size != 0) {
-        // Allocator declars allocate const!?
-        const_cast<NativeCachingAllocator*>(this)->malloc(
-            &devPtr, device, size, stream);
+        this->malloc(&devPtr, device, size, stream);
       }
     }

@@ -405,7 +405,7 @@ void local_raw_delete(void* ptr);

 // Same pattern as CUDACachingAllocator.cpp.
 struct CudaMallocAsyncAllocator : public CUDAAllocator {
-  DataPtr allocate(size_t size) const override {
+  DataPtr allocate(size_t size) override {
     constexpr size_t one_exa_bytes = 1152921504606846976ULL;
     TORCH_CHECK_WITH(
         OutOfMemoryError,
@@ -497,13 +497,11 @@ class XPUAllocator : public Allocator {
     device_allocators[block->device]->recordStream(block, stream);
   }

-  DataPtr allocate(size_t size) const override {
+  DataPtr allocate(size_t size) override {
     auto device = c10::xpu::current_device();
     void* r = nullptr;
     if (size != 0) {
-      // Allocator declares allocate const!
-      const_cast<XPUAllocator*>(this)->malloc(
-          &r, device, size, xpu::getCurrentXPUStream(device));
+      this->malloc(&r, device, size, xpu::getCurrentXPUStream(device));
     }
     return {r, r, &local_raw_delete, Device(DeviceType::XPU, device)};
   }
@@ -306,7 +306,7 @@ struct CAFFE2_CUDA_API PinnedCPUAllocator final : public at::Allocator {
     baseAllocator_ = GetDefaultCPUAllocator();
   }
   ~PinnedCPUAllocator() override {}
-  at::DataPtr allocate(size_t nbytes) const override {
+  at::DataPtr allocate(size_t nbytes) override {
     if (nbytes == 0) {
       // replicate c10::alloc_cpu behavior - return nullptr
       return {nullptr, nullptr, &Delete, at::Device(CPU)};
@@ -513,7 +513,7 @@ void TrackMemoryAlloc(size_t nbytes) {
 struct DefaultCUDAAllocator final : public at::Allocator {
   DefaultCUDAAllocator() {}
   ~DefaultCUDAAllocator() override {}
-  at::DataPtr allocate(size_t nbytes) const override {
+  at::DataPtr allocate(size_t nbytes) override {
     // Lock the mutex
     std::lock_guard<std::mutex> lock(CUDAContext::mutex());
     // A one-time caffe2 cuda initializer.
@@ -173,7 +173,7 @@ at::Tensor& custom_abs_out(const at::Tensor& self, at::Tensor& out) {
 // A dummy allocator for our custom device, that secretly uses the CPU
 struct DummyCustomAllocator final : at::Allocator {
   DummyCustomAllocator() = default;
-  at::DataPtr allocate(size_t nbytes) const override {
+  at::DataPtr allocate(size_t nbytes) override {
     void* data = c10::alloc_cpu(nbytes);
     return {data, data, &ReportAndDelete, at::Device(at::DeviceType::PrivateUse1, custom_device_index)};
   }
@@ -66,7 +66,7 @@ at::Tensor custom_to_device(
 // A dummy allocator for our custom device, that secretly uses the CPU
 struct DummyCustomAllocator final : at::Allocator {
   DummyCustomAllocator() = default;
-  at::DataPtr allocate(size_t nbytes) const override {
+  at::DataPtr allocate(size_t nbytes) override {
     void* data = c10::alloc_cpu(nbytes);
     return {data, data, &ReportAndDelete, at::Device(at::DeviceType::PrivateUse1, 0)};
   }
@@ -94,13 +94,11 @@ void* CUDAPluggableAllocator::malloc(
   return r;
 }

-c10::DataPtr CUDAPluggableAllocator::allocate(size_t size) const {
+c10::DataPtr CUDAPluggableAllocator::allocate(size_t size) {
   c10::DeviceIndex device = -1;
   C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
   cudaStream_t stream = c10::cuda::getCurrentCUDAStream(device);
-  void* r =
-      // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
-      const_cast<CUDAPluggableAllocator*>(this)->malloc(size, device, stream);
+  void* r = this->malloc(size, device, stream);
   c10::DataPtr data_ptr = {
       r, r, raw_deleter(), c10::Device(c10::DeviceType::CUDA, device)};
   return data_ptr;
@@ -71,7 +71,7 @@ struct CUDAPluggableAllocator

   void* malloc(size_t size, c10::DeviceIndex device, cudaStream_t stream);

-  c10::DataPtr allocate(size_t size) const override;
+  c10::DataPtr allocate(size_t size) override;
   c10::DeleterFnPtr raw_deleter() const override;

   void* raw_alloc(size_t nbytes) override;