Add API query for available per-process CUDA memory (#140620)

Certain `cpp_wrapper`-enabled tests were OOM-ing in the CI pipeline, with error messages suggesting that sufficient memory was accessible. This ultimately resulted from an internal memory limitation that was not queryable in the API. This PR adds querying for that limit. Additionally, the failing tests had incorrect memory availability checks, and are updated with measured memory requirements. Pull Request resolved: https://github.com/pytorch/pytorch/pull/140620 Approved by: https://github.com/malfet, https://github.com/eqy ghstack dependencies: #141367
2025-10-21 05:34:18 +08:00 · 2024-12-02 17:28:24 +00:00
parent 5c33c9202f
commit 4959784dac
15 changed files with 158 additions and 14 deletions
--- a/torch/csrc/cuda/Module.cpp
+++ b/torch/csrc/cuda/Module.cpp
@ -536,6 +536,20 @@ PyObject* THCPModule_hasPrimaryContext(PyObject* _unused, PyObject* arg) {
  END_HANDLE_TH_ERRORS
 }

+PyObject* THCPModule_getMemoryFraction(PyObject* _unused, PyObject* args) {
+  HANDLE_TH_ERRORS
+  PyObject* device_o = nullptr;
+  if (!PyArg_ParseTuple(args, "O", &device_o)) {
+    THPUtils_invalidArguments(
+        args, nullptr, "get_memory_fraction", 1, "(int device);");
+    return nullptr;
+  }
+  auto device_index = THPUtils_unpackDeviceIndex(device_o);
+  return PyFloat_FromDouble(
+      c10::cuda::CUDACachingAllocator::getMemoryFraction(device_index));
+  END_HANDLE_TH_ERRORS
+}
+
 PyObject* THCPModule_setMemoryFraction(PyObject* _unused, PyObject* args) {
  HANDLE_TH_ERRORS
  PyObject* fraction_o = nullptr;
@ -1872,6 +1886,10 @@ static struct PyMethodDef _THCPModule_methods[] = {
     METH_NOARGS,
     nullptr},
    {"_cuda_hasPrimaryContext", THCPModule_hasPrimaryContext, METH_O, nullptr},
+    {"_cuda_getMemoryFraction",
+     THCPModule_getMemoryFraction,
+     METH_VARARGS,
+     nullptr},
    {"_cuda_setMemoryFraction",
     THCPModule_setMemoryFraction,
     METH_VARARGS,