Refine XPU allocator message when OOM (#165509)

# Motivation
Provide more information and align with other backends to enhance the user experience.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/165509
Approved by: https://github.com/EikanWang
ghstack dependencies: #165508
This commit is contained in:
Yu, Guangye
2025-10-15 23:38:02 +00:00
committed by PyTorch MergeBot
parent d7ffa8b8a2
commit d0c32971b4

View File

@ -433,6 +433,18 @@ class DeviceCachingAllocator {
c10::xpu::DeviceProp device_prop;
c10::xpu::get_device_properties(&device_prop, device);
auto device_total = device_prop.global_mem_size;
// Estimate the available device memory when the SYCL runtime does not
// support the corresponding aspect (ext_intel_free_memory).
size_t device_free = device_prop.global_mem_size -
stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)]
.current;
auto& raw_device = c10::xpu::get_raw_device(device);
// TODO: Remove the aspect check once the SYCL runtime bug is fixed on
// affected devices.
if (raw_device.has(sycl::aspect::ext_intel_free_memory)) {
device_free =
raw_device.get_info<sycl::ext::intel::info::device::free_memory>();
}
auto allocated_bytes =
stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)]
.current;
@ -455,7 +467,9 @@ class DeviceCachingAllocator {
static_cast<int>(device),
" has a total capacity of ",
format_size(device_total),
". Of the allocated memory ",
" of which ",
format_size(device_free),
" is free. Of the allocated memory ",
format_size(allocated_bytes),
" is allocated by PyTorch, and ",
format_size(reserved_bytes - allocated_bytes),