Initial implementation of host memory stats (#147660)

This is an initial attempt to provide some statistics for the pinned host memory allocations flowing through CachingHostAllocator. Many times in the past we have had inexplicable slowdowns that would be much easier to diagnose if we had some host memory characteristics.

This change tries very hard not to disrupt the initial design of the allocator, and it uses the existing locking mechanism, whenever possible, to gather statistics "for free". The only deviation from that is on the "slow path", where we incur CUDA calls anyway, so taking a short lock is not going to hurt performance much, especially in the steady state where most allocations will come from the cache.

As mentioned before, this is the first PR; it introduces the concept and lets us see whether it fits the right paradigm. We can always add more later.

Metrics that would require more involved changes to the code base and locks, like requested memory, have been punted for now. I also tried to reuse the Stat structure used in CUDA caching allocator, in order to maintain symmetry.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/147660
Approved by: https://github.com/ngimel
This commit is contained in:
Marko Radmilac
2025-02-28 18:36:44 +00:00
committed by PyTorch MergeBot
parent 982d7ba3ef
commit 945e359fc1
18 changed files with 794 additions and 86 deletions

View File

@ -593,10 +593,10 @@ PyObject* THCPModule_memoryStats(PyObject* _unused, PyObject* arg) {
TORCH_CHECK(THPUtils_checkLong(arg), "invalid argument to memory_allocated");
const auto device_index = THPUtils_unpackDeviceIndex(arg);
using c10::CachingAllocator::Stat;
using c10::CachingAllocator::StatArray;
using c10::CachingAllocator::StatType;
using c10::CachingDeviceAllocator::DeviceStats;
using c10::CachingDeviceAllocator::Stat;
using c10::CachingDeviceAllocator::StatArray;
using c10::CachingDeviceAllocator::StatType;
const auto statToDict = [](const Stat& stat) {
py::dict dict;
@ -667,6 +667,70 @@ PyObject* THCPModule_resetPeakMemoryStats(PyObject* _unused, PyObject* arg) {
Py_RETURN_NONE;
}
// Python binding: return pinned host-memory allocator statistics as a nested
// dict (counters, byte stats, and alloc/free timing stats). Takes no
// arguments; raises on error via the HANDLE_TH_ERRORS machinery.
PyObject* THCPModule_hostMemoryStats(PyObject* _unused, PyObject* noargs) {
  HANDLE_TH_ERRORS
  using at::HostStats;
  using c10::CachingAllocator::DurationStat;
  using c10::CachingAllocator::Stat;
  // NOTE: the unused `using` declarations for StatArray/StatType were removed;
  // unlike the device-memory stats above, host stats are not split per
  // StatType here.

  // Convert a counter-style Stat (current/peak/allocated/freed) to a dict.
  const auto statToDict = [](const Stat& stat) {
    py::dict dict;
    dict["current"] = stat.current;
    dict["peak"] = stat.peak;
    dict["allocated"] = stat.allocated;
    dict["freed"] = stat.freed;
    return dict;
  };

  // Convert a DurationStat to a dict; "avg" is derived as total/count and
  // reported as 0 when no samples have been recorded (avoids div-by-zero).
  const auto durationStatToDict = [](const DurationStat& stat) {
    py::dict dict;
    dict["total"] = stat.total;
    dict["max"] = stat.max;
    dict["min"] = stat.min;
    dict["count"] = stat.count;
    dict["avg"] = stat.count == 0 ? 0 : stat.total / stat.count;
    return dict;
  };

  const HostStats stats = at::cuda::CachingHostAllocator_getStats();

  py::dict result;
  result["num_host_alloc"] = stats.num_host_alloc;
  result["num_host_free"] = stats.num_host_free;
  result["allocation"] = statToDict(stats.allocation);
  result["segment"] = statToDict(stats.segment);
  result["allocated_bytes"] = statToDict(stats.allocated_bytes);
  result["reserved_bytes"] = statToDict(stats.reserved_bytes);
  result["host_alloc_time"] = durationStatToDict(stats.host_alloc_time);
  result["host_free_time"] = durationStatToDict(stats.host_free_time);

  // Transfer ownership of the dict to the caller as a raw PyObject*.
  return result.release().ptr();
  END_HANDLE_TH_ERRORS
}
// Python binding: reset the accumulated (lifetime) host-memory counters in
// the caching host allocator; peak values are left untouched. Takes no
// arguments and returns None.
PyObject* THCPModule_resetAccumulatedHostMemoryStats(
PyObject* _unused,
PyObject* noargs) {
HANDLE_TH_ERRORS
at::cuda::CachingHostAllocator_resetAccumulatedStats();
END_HANDLE_TH_ERRORS
Py_RETURN_NONE;
}
// Python binding: reset the peak ("high watermark") host-memory stats in the
// caching host allocator; accumulated counters are left untouched. Takes no
// arguments and returns None.
PyObject* THCPModule_resetPeakHostMemoryStats(
PyObject* _unused,
PyObject* noargs) {
HANDLE_TH_ERRORS
at::cuda::CachingHostAllocator_resetPeakStats();
END_HANDLE_TH_ERRORS
Py_RETURN_NONE;
}
CapturedTraceback* getFromContext(
const std::shared_ptr<c10::GatheredContext>& x) {
if (CapturedTraceback* sc = dynamic_cast<CapturedTraceback*>(x.get())) {
@ -1957,6 +2021,15 @@ static struct PyMethodDef _THCPModule_methods[] = {
THCPModule_attachOutOfMemoryObserver,
METH_O,
nullptr},
{"_cuda_hostMemoryStats", THCPModule_hostMemoryStats, METH_NOARGS, nullptr},
{"_cuda_resetAccumulatedHostMemoryStats",
THCPModule_resetAccumulatedHostMemoryStats,
METH_NOARGS,
nullptr},
{"_cuda_resetPeakHostMemoryStats",
THCPModule_resetPeakHostMemoryStats,
METH_NOARGS,
nullptr},
{"_cuda_cudaHostAllocator",
THCPModule_cudaHostAllocator,
METH_NOARGS,