Revert "raw_alloc ignores PYTORCH_NO_CUDA_MEMORY_CACHING (#131114)"
This reverts commit 70019074806920f95976fedad775d7570294f635. Reverted https://github.com/pytorch/pytorch/pull/131114 on behalf of https://github.com/PaliC due to failing internal builds ([comment](https://github.com/pytorch/pytorch/pull/131114#issuecomment-2390615007))
@@ -3127,28 +3127,12 @@ class DeviceCachingAllocator {
// Returns whether to force all allocations to bypass the caching allocator and
// go straight to cudaMalloc. This setting is useful when debugging GPU memory
// errors, since the caching allocator foils cuda-memcheck.
static bool forceUncachedAllocator() {
// Allow either CUDA or HIP name for env var for maximum user comfort
// the CUDA env var avoids being hipified in cuda_to_hip_mappings.py
static const char* cuda_env = getenv("PYTORCH_NO_CUDA_MEMORY_CACHING");
static const char* rocm_env = getenv("PYTORCH_NO_HIP_MEMORY_CACHING");
static bool force_uncached = (cuda_env != nullptr) || (rocm_env != nullptr);
bool forceUncachedAllocator() {
static bool force_uncached =
getenv("PYTORCH_NO_CUDA_MEMORY_CACHING") != nullptr;
return force_uncached;
}

static void* uncached_allocate(size_t size) {
void* devPtr = nullptr;
// Deliberately don't use cudaMallocMaybeCapturing here, to force an error
// if someone tries to use forceUncachedAllocator while capturing.
C10_CUDA_CHECK(cudaMalloc(&devPtr, size));
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_memory_allocation(
c10::kCUDA, reinterpret_cast<uintptr_t>(devPtr));
}
return devPtr;
}

static void uncached_delete(void* ptr) {
if (TORCH_SDT_IS_ENABLED(free)) {
TORCH_SDT_WITH_SEMAPHORE(free, ptr);
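Both versions of forceUncachedAllocator() shown above read an environment variable that makes every allocation bypass the cache and go straight to cudaMalloc, which is mainly useful when running under cuda-memcheck or compute-sanitizer. A minimal usage sketch in Python, not part of this diff; the variable must be set before the first CUDA allocation because the result is latched in a function-local static:

    import os

    # Set before any CUDA allocation: forceUncachedAllocator() caches this
    # value in a static on first use.
    os.environ["PYTORCH_NO_CUDA_MEMORY_CACHING"] = "1"

    import torch

    x = torch.empty(1024, device="cuda")  # served directly by cudaMalloc
    del x  # freed immediately with cudaFree instead of being returned to the cache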
@@ -3166,9 +3150,6 @@ void local_raw_delete(void* ptr);

class NativeCachingAllocator : public CUDAAllocator {
private:
// allows this allocator to be turned on and off programmatically
bool enable_ = true;

// Shard allocation region to have independent mutexes to reduce contention.
static constexpr size_t kNumMutexShard = 67;

@@ -3343,14 +3324,6 @@ class NativeCachingAllocator : public CUDAAllocator {
da->emptyCache();
}

void enable(bool value) override {
enable_ = value;
}

bool isEnabled() const override {
return enable_;
}

void* getBaseAllocation(void* ptr, size_t* outSize) override {
Block* block = get_allocated_block(ptr);
if (!block) {
@@ -3485,9 +3458,17 @@ class NativeCachingAllocator : public CUDAAllocator {
void (*deleteFunc)(void*) = &local_raw_delete;
CUDAStream stream = cuda::getCurrentCUDAStream(device);

if (forceUncachedAllocator() || !isEnabled()) {
if (forceUncachedAllocator()) {
deleteFunc = &uncached_delete;
devPtr = uncached_allocate(size);

// Deliberately don't use cudaMallocMaybeCapturing here, to force an error
// if someone tries to use forceUncachedAllocator while capturing.
C10_CUDA_CHECK(cudaMalloc(&devPtr, size));
const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace();
if (C10_UNLIKELY(interp)) {
(*interp)->trace_gpu_memory_allocation(
c10::kCUDA, reinterpret_cast<uintptr_t>(devPtr));
}
} else {
if (size != 0) {
this->malloc(&devPtr, device, size, stream);
@@ -3501,7 +3482,7 @@ class NativeCachingAllocator : public CUDAAllocator {
return {devPtr, devPtr, deleteFunc, Device(DeviceType::CUDA, device)};
}
DeleterFnPtr raw_deleter() const override {
if (forceUncachedAllocator() || !isEnabled()) {
if (forceUncachedAllocator()) {
return &uncached_delete;
} else {
return &local_raw_delete;
@@ -3558,14 +3539,10 @@ class NativeCachingAllocator : public CUDAAllocator {
if (nbytes == 0) {
return nullptr;
}
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
void* r = nullptr;
if (forceUncachedAllocator() || !isEnabled()) {
r = uncached_allocate(nbytes);
} else {
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
malloc(&r, device, nbytes, cuda::getCurrentCUDAStream(device));
}
malloc(&r, device, nbytes, cuda::getCurrentCUDAStream(device));
return r;
}

@@ -3573,14 +3550,10 @@ class NativeCachingAllocator : public CUDAAllocator {
if (nbytes == 0) {
return nullptr;
}
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
void* r = nullptr;
if (forceUncachedAllocator() || !isEnabled()) {
r = uncached_allocate(nbytes);
} else {
c10::DeviceIndex device = 0;
C10_CUDA_CHECK(c10::cuda::GetDevice(&device));
malloc(&r, device, nbytes, stream);
}
malloc(&r, device, nbytes, stream);
return r;
}

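The two raw_alloc hunks above are the core of the revert: with the "|| !isEnabled()" checks gone, raw allocations always go through the caching allocator and therefore always show up in its statistics. A small sketch along the lines of the test deleted later in this diff (it uses private bindings and assumes a CUDA device is available):

    import torch

    torch.cuda.empty_cache()
    device = torch._C._cuda_getDevice()
    stream = torch._C._cuda_getCurrentRawStream(device)
    torch._C._cuda_resetAccumulatedMemoryStats(device)

    raw_alloc_size = 1024 * 1024  # 1 MB
    before = torch.cuda.memory_stats()["requested_bytes.all.allocated"]
    ptr = torch._C._cuda_cudaCachingAllocator_raw_alloc(raw_alloc_size, stream)
    after = torch.cuda.memory_stats()["requested_bytes.all.allocated"]

    # With this revert applied, the raw allocation is always served by the
    # caching allocator, so the counter grows by the requested size.
    assert after - before == raw_alloc_size

    torch._C._cuda_cudaCachingAllocator_raw_delete(ptr)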
@@ -3625,11 +3598,7 @@ class NativeCachingAllocator : public CUDAAllocator {
}

void raw_delete(void* ptr) override {
if (forceUncachedAllocator() || !isEnabled()) {
uncached_delete(ptr);
} else {
this->free(ptr);
}
this->free(ptr);
}

// In CUDA IPC, sender sends a tensor to receiver via shareIPCHandle,
@@ -206,8 +206,6 @@ class CUDAAllocator : public Allocator {
virtual bool initialized() = 0;
virtual void setMemoryFraction(double fraction, c10::DeviceIndex device) = 0;
virtual void emptyCache() = 0;
virtual void enable(bool value) = 0;
virtual bool isEnabled() const = 0;
virtual void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) = 0;
virtual void* getBaseAllocation(void* ptr, size_t* size) = 0;
virtual void recordStream(const DataPtr&, CUDAStream stream) = 0;
@@ -329,14 +327,6 @@ inline void emptyCache() {
return get()->emptyCache();
}

inline void enable(bool value) {
return get()->enable(value);
}

inline bool isEnabled() {
return get()->isEnabled();
}

inline void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) {
return get()->cacheInfo(device, largestBlock);
}
@@ -496,14 +496,6 @@ struct CudaMallocAsyncAllocator : public CUDAAllocator {
}
}

void enable(bool) override {
// cannot disable
}

bool isEnabled() const override {
return true;
}

void cacheInfo(c10::DeviceIndex device, size_t* maxWorkspaceGuess) override {
// The only consumer of cacheInfo is getMaxWorkspaceSize in Conv_v7.cpp.
// Afaict, the role of cacheInfo is to give getMaxWorkspaceSize a reasonable
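For the cudaMallocAsync backend shown above, enable(bool) is deliberately a no-op ("cannot disable") and isEnabled() always reports true, since caching is handled by the CUDA driver's memory pools rather than by PyTorch. A hedged sketch of selecting that backend; PYTORCH_CUDA_ALLOC_CONF is an existing setting, not introduced by this change:

    import os

    # Pick the cudaMallocAsync backend before CUDA is initialized.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "backend:cudaMallocAsync"

    import torch

    x = torch.empty(1024, device="cuda")  # allocation handled by cudaMallocAsync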
@@ -123,15 +123,6 @@ Memory management
MemPool
MemPoolContext

.. currentmodule:: torch.cuda.memory

.. autosummary::
:toctree: generated
:nosignatures:

caching_allocator_enable

.. currentmodule:: torch.cuda
.. autoclass:: torch.cuda.use_mem_pool

.. FIXME The following doesn't seem to exist. Is it supposed to?
@@ -3770,43 +3770,6 @@ class TestCudaMallocAsync(TestCase):
"pinned_num_register_threads:1024"
)

def test_cachingAllocator_raw_alloc(self):
# Test that raw_alloc respects the setting that
# activates/deactivates the caching allocator

# Helper function that calls raw_alloc and returns
# relevant field in data structure
def requested_bytes_alloc_stats(raw_alloc_size, stream):
start = torch.cuda.memory_stats()["requested_bytes.all.allocated"]
torch._C._cuda_cudaCachingAllocator_raw_alloc(raw_alloc_size, stream)
finish = torch.cuda.memory_stats()["requested_bytes.all.allocated"]
return finish - start

torch.cuda.empty_cache()
device = torch._C._cuda_getDevice()
stream = torch._C._cuda_getCurrentRawStream(device)
torch._C._cuda_resetAccumulatedMemoryStats(device)

# size of allocation
raw_alloc_size = 1024 * 1024 # 1 MB

try:
# Deactivate the caching allocator
torch.cuda.caching_allocator_enable(False)

# For a deactivated caching allocator, result is zero
cuda_alloc_size = requested_bytes_alloc_stats(raw_alloc_size, stream)
self.assertEqual(cuda_alloc_size, 0)

finally:
# Make sure we get back to the default state that is
# an activated caching allocator
torch.cuda.caching_allocator_enable(True)

# For an active caching allocator, result matches raw_alloc_size
cuda_alloc_size = requested_bytes_alloc_stats(raw_alloc_size, stream)
self.assertEqual(cuda_alloc_size, raw_alloc_size)

@parametrize("max_split_size_mb_setting", [False, True])
def test_raises_oom(self, max_split_size_mb_setting):
if max_split_size_mb_setting:
@@ -1833,7 +1833,6 @@ def _cuda_getCompiledVersion() -> _int: ...
def _cuda_cudaHostAllocator() -> _int: ...
def _cuda_cudaCachingAllocator_raw_alloc(size: _int, cuda_stream: _int) -> _int: ...
def _cuda_cudaCachingAllocator_raw_delete(ptr: _int) -> None: ...
def _cuda_cudaCachingAllocator_enable(val: _bool) -> None: ...
def _cuda_cudaCachingAllocator_set_allocator_settings(env: str) -> None: ...
def _cuda_beginAllocateToPool(device: _int, mempool_id: Tuple[_int, _int]) -> None: ...
def _cuda_beginAllocateCurrentStreamToPool(device: _int, mempool_id: Tuple[_int, _int]) -> None: ...
@@ -2540,7 +2540,6 @@ torch_non_c_binding_in_graph_functions = dict.fromkeys(
"torch.cuda.memory._snapshot",
"torch.cuda.memory.caching_allocator_alloc",
"torch.cuda.memory.caching_allocator_delete",
"torch.cuda.memory.caching_allocator_enable",
"torch.cuda.memory.change_current_allocator",
"torch.cuda.memory.empty_cache",
"torch.cuda.memory.get_allocator_backend",
@@ -110,10 +110,6 @@ struct TORCH_CUDA_CPP_API CUDAPluggableAllocator
bool initialized() override;
void setMemoryFraction(double fraction, c10::DeviceIndex device) override;
void emptyCache() override;
void enable(bool) override {}
bool isEnabled() const override {
return true;
}
void cacheInfo(c10::DeviceIndex device, size_t* largestBlock) override;
void* getBaseAllocation(void* ptr, size_t* size) override;

@@ -428,19 +428,6 @@ PyObject* THCPModule_cudaCachingAllocator_raw_delete(
END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_cudaCachingAllocator_enable(
PyObject* _unused,
PyObject* arg) {
HANDLE_TH_ERRORS
TORCH_CHECK(
THPUtils_checkBool(arg),
"cudaCachingAllocator_enable expects a bool, but got ",
THPUtils_typename(arg));
c10::cuda::CUDACachingAllocator::enable(THPUtils_unpackBool(arg));
Py_RETURN_NONE;
END_HANDLE_TH_ERRORS
}

PyObject* THCPModule_cudaCachingAllocator_set_allocator_settings(
PyObject* _unused,
PyObject* env) {
@@ -1869,10 +1856,6 @@ static struct PyMethodDef _THCPModule_methods[] = {
THCPModule_cudaCachingAllocator_raw_delete,
METH_O,
nullptr},
{"_cuda_cudaCachingAllocator_enable",
THCPModule_cudaCachingAllocator_enable,
METH_O,
nullptr},
{"_cuda_cudaCachingAllocator_set_allocator_settings",
THCPModule_cudaCachingAllocator_set_allocator_settings,
METH_O,
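Each METH_O entry above registers a binding that takes exactly one Python argument, and THCPModule_cudaCachingAllocator_enable additionally checks that the argument is a bool before forwarding it to the C++ allocator. A minimal sketch of the call this registration exposed (and which this revert removes again):

    import torch

    # Forwards to c10::cuda::CUDACachingAllocator::enable(false); anything
    # other than a bool fails the THPUtils_checkBool check in the binding.
    torch._C._cuda_cudaCachingAllocator_enable(False)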
@@ -1578,7 +1578,6 @@ __all__ = [
"amp",
"caching_allocator_alloc",
"caching_allocator_delete",
"caching_allocator_enable",
"can_device_access_peer",
"check_error",
"cudaStatus",
@@ -29,7 +29,6 @@ from ._memory_viz import memory as _memory, segments as _segments
__all__ = [
"caching_allocator_alloc",
"caching_allocator_delete",
"caching_allocator_enable",
"set_per_process_memory_fraction",
"empty_cache",
"memory_stats",
@@ -149,12 +148,6 @@ def caching_allocator_delete(mem_ptr):
torch._C._cuda_cudaCachingAllocator_raw_delete(mem_ptr)


def caching_allocator_enable(value: bool = True) -> None:
r"""Enable or disable the CUDA memory allocator. On by default."""
if is_initialized():
torch._C._cuda_cudaCachingAllocator_enable(value)


def set_per_process_memory_fraction(
fraction, device: Union[Device, int] = None
) -> None:
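The public wrapper above, torch.cuda.caching_allocator_enable, is what the deleted test drives; before this revert it could toggle the caching allocator at runtime, typically restoring the default in a finally block. A sketch, not part of the diff:

    import torch

    torch.cuda.init()
    try:
        # With the reverted change in place, this made later allocations
        # bypass the caching allocator entirely.
        torch.cuda.caching_allocator_enable(False)
        x = torch.empty(1024, device="cuda")
        del x
    finally:
        # Restore the default state: caching allocator active.
        torch.cuda.caching_allocator_enable(True)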
@@ -8568,7 +8568,6 @@ PYTORCH_SPECIFIC_MAPPINGS = collections.OrderedDict(

CAFFE2_SPECIFIC_MAPPINGS = collections.OrderedDict(
[
("PYTORCH_NO_CUDA_MEMORY_CACHING", ("PYTORCH_NO_CUDA_MEMORY_CACHING", API_CAFFE2)),
("cuda_stream", ("hip_stream", API_CAFFE2)),
# if the header is a native hip folder (under hip directory),
# there is no need to add a hip path to it; the trie in hipify script