mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-21 05:34:18 +08:00
Prevent hanging in data loader altogether
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/11985 Differential Revision: D10202374 Pulled By: SsnL fbshipit-source-id: 1ab1a07185f78a104f9b05930a87ef5a32f431e4
This commit is contained in:
committed by
Facebook Github Bot
parent
1a0d82e4f4
commit
11c31aef04
@ -1,15 +1,14 @@
|
||||
#include "DataLoader.h"
|
||||
|
||||
// In cases like DataLoader, if a worker process die due to bus error/segfault
|
||||
// or just hang, the main process, if implemented with
|
||||
// multiprocessing.queue.SimpleQueue, will hang waiting for data. This is
|
||||
// difficult to avoid on PyTorch side as it can be caused by limited shm, or
|
||||
// other libraries users call in the workers. The following methods is an effort
|
||||
// to do our best provide some error message to users when such unfortunate
|
||||
// events happen.
|
||||
// In cases like DataLoader, if a worker process dies due to bus error/segfault
|
||||
// or just hangs, the main process will hang waiting for data. This is difficult
|
||||
// to avoid on PyTorch side as it can be caused by limited shm, or other
|
||||
// libraries users call in the workers. The following methods are an effort to do
|
||||
// our best to provide some error message to users when such unfortunate events
|
||||
// happen.
|
||||
|
||||
// TODO: The following don't work on Windows. Specifically, sigaction, waitid
|
||||
// calls ,and SIGCHLD handler. Currently, dummy implementations are provided
|
||||
// calls, and SIGCHLD handler. Currently, dummy implementations are provided
|
||||
// for Windows.
|
||||
|
||||
#ifndef _WIN32
|
||||
@ -63,6 +62,7 @@ static inline void setSignalHandler(int signal, void(*handler)(int, siginfo_t *,
|
||||
SIGNAL_HANDLER(SIGBUS, handler_SIGBUS, "ERROR: Unexpected bus error encountered in worker. "
|
||||
"This might be caused by insufficient shared memory (shm).\n");
|
||||
SIGNAL_HANDLER(SIGSEGV, handler_SIGSEGV, "ERROR: Unexpected segmentation fault encountered in worker.\n");
|
||||
SIGNAL_HANDLER(SIGFPE, handler_SIGFPE, "ERROR: Unexpected floating-point exception encountered in worker.\n");
|
||||
|
||||
// When an error happens in DataLoader methods and Python starts to exit, the
|
||||
// error trace will keep the loader alive, and Python may kill the children
|
||||
@ -92,6 +92,7 @@ static PyObject *THPModule_setWorkerSignalHandlers(PyObject *module, PyObject *a
|
||||
setSignalHandler(SIGBUS, &handler_SIGBUS, nullptr);
|
||||
setSignalHandler(SIGSEGV, &handler_SIGSEGV, nullptr);
|
||||
setSignalHandler(SIGTERM, &handler_SIGTERM, nullptr);
|
||||
setSignalHandler(SIGFPE, &handler_SIGFPE, nullptr);
|
||||
Py_RETURN_NONE;
|
||||
END_HANDLE_TH_ERRORS
|
||||
}
|
||||
@ -130,9 +131,7 @@ static PyObject *THPModule_errorIfAnyWorkerFails(PyObject *module) {
|
||||
} else if (infop.si_code == CLD_KILLED || infop.si_code == CLD_DUMPED) { // killed by signal
|
||||
std::ostringstream oss;
|
||||
oss << "DataLoader worker (pid " << worker_pid << ") is killed "
|
||||
<< "by signal: " << strsignal(infop.si_status) << ". "
|
||||
<< "Details are lost due to multiprocessing. Rerunning with "
|
||||
<< "num_workers=0 may give better error trace.";
|
||||
<< "by signal: " << strsignal(infop.si_status) << ". ";
|
||||
// This is necessary. Otherwise, the runtime error will kill the other
|
||||
// workers, and trigger this again.
|
||||
pid_set->clear();
|
||||
|
Reference in New Issue
Block a user