Add API query for available per-process CUDA memory (#140620)

Certain `cpp_wrapper`-enabled tests were OOM-ing in the CI pipeline, with error messages suggesting that sufficient memory was accessible.  This ultimately resulted from an internal memory limitation that was not queryable in the API.  This PR adds querying for that limit.

Additionally, the failing tests had incorrect memory availability checks, and are updated with measured memory requirements.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/140620
Approved by: https://github.com/malfet, https://github.com/eqy
ghstack dependencies: #141367
This commit is contained in:
Benjamin Glass
2024-12-02 17:28:24 +00:00
committed by PyTorch MergeBot
parent 5c33c9202f
commit 4959784dac
15 changed files with 158 additions and 14 deletions

View File

@ -536,6 +536,20 @@ PyObject* THCPModule_hasPrimaryContext(PyObject* _unused, PyObject* arg) {
END_HANDLE_TH_ERRORS
}
// Python binding that reports the CUDA caching allocator's memory fraction
// for one device.
//
// `args` is parsed as a tuple holding exactly one object (format "O"), which
// is converted to a device index via THPUtils_unpackDeviceIndex. The value
// returned by CUDACachingAllocator::getMemoryFraction for that index is
// handed back as a new Python float.
//
// If the argument tuple cannot be parsed, an invalid-arguments error is
// reported through THPUtils_invalidArguments and nullptr is returned.
PyObject* THCPModule_getMemoryFraction(PyObject* _unused, PyObject* args) {
  HANDLE_TH_ERRORS
  PyObject* device_arg = nullptr;
  const bool parsed_ok = PyArg_ParseTuple(args, "O", &device_arg) != 0;
  if (!parsed_ok) {
    THPUtils_invalidArguments(
        args, nullptr, "get_memory_fraction", 1, "(int device);");
    return nullptr;
  }

  // Resolve the Python-side device object, then query the allocator.
  const auto target_device = THPUtils_unpackDeviceIndex(device_arg);
  const auto fraction =
      c10::cuda::CUDACachingAllocator::getMemoryFraction(target_device);
  return PyFloat_FromDouble(fraction);
  END_HANDLE_TH_ERRORS
}
PyObject* THCPModule_setMemoryFraction(PyObject* _unused, PyObject* args) {
HANDLE_TH_ERRORS
PyObject* fraction_o = nullptr;
@ -1872,6 +1886,10 @@ static struct PyMethodDef _THCPModule_methods[] = {
METH_NOARGS,
nullptr},
{"_cuda_hasPrimaryContext", THCPModule_hasPrimaryContext, METH_O, nullptr},
{"_cuda_getMemoryFraction",
THCPModule_getMemoryFraction,
METH_VARARGS,
nullptr},
{"_cuda_setMemoryFraction",
THCPModule_setMemoryFraction,
METH_VARARGS,