Refine XPU allocator message when OOM (#165509)

# Motivation Report the device's free memory in the XPU allocator's out-of-memory error message, giving users more information and aligning the message with other backends. Pull Request resolved: https://github.com/pytorch/pytorch/pull/165509 Approved by: https://github.com/EikanWang ghstack dependencies: #165508
2025-10-20 21:14:14 +08:00 · 2025-10-15 23:38:02 +00:00
parent d7ffa8b8a2
commit d0c32971b4
1 changed files with 15 additions and 1 deletions
--- a/c10/xpu/XPUCachingAllocator.cpp
+++ b/c10/xpu/XPUCachingAllocator.cpp
@ -433,6 +433,18 @@ class DeviceCachingAllocator {
      c10::xpu::DeviceProp device_prop;
      c10::xpu::get_device_properties(&device_prop, device);
      auto device_total = device_prop.global_mem_size;
+      // Estimate the available device memory when the SYCL runtime does not
+      // support the corresponding aspect (ext_intel_free_memory).
+      size_t device_free = device_prop.global_mem_size -
+          stats.reserved_bytes[static_cast<size_t>(StatType::AGGREGATE)]
+              .current;
+      auto& raw_device = c10::xpu::get_raw_device(device);
+      // TODO: Remove the aspect check once the SYCL runtime bug is fixed on
+      // affected devices.
+      if (raw_device.has(sycl::aspect::ext_intel_free_memory)) {
+        device_free =
+            raw_device.get_info<sycl::ext::intel::info::device::free_memory>();
+      }
      auto allocated_bytes =
          stats.allocated_bytes[static_cast<size_t>(StatType::AGGREGATE)]
              .current;
@ -455,7 +467,9 @@ class DeviceCachingAllocator {
          static_cast<int>(device),
          " has a total capacity of ",
          format_size(device_total),
-          ". Of the allocated memory ",
+          " of which ",
+          format_size(device_free),
+          " is free. Of the allocated memory ",
          format_size(allocated_bytes),
          " is allocated by PyTorch, and ",
          format_size(reserved_bytes - allocated_bytes),