mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Add a crash handler to async compile subprocesses (#155068)
When the async compile subprocesses crash in C++ they tend to just silently die instead of leaving any kind of trace. This installs a crash handler so that if they SEGV, ILL, or ABRT they'll attempt to output a backtrace instead. While in there I also cleaned up the CLANGTIDY warnings coming from Module.cpp. Pull Request resolved: https://github.com/pytorch/pytorch/pull/155068 Approved by: https://github.com/masnesral
This commit is contained in:
committed by
PyTorch MergeBot
parent
beb52f5c0a
commit
568ca89bac
@ -2,6 +2,7 @@
|
||||
#include <fmt/core.h>
|
||||
#include <sys/types.h>
|
||||
#include <torch/csrc/python_headers.h>
|
||||
#include <csignal>
|
||||
#include <optional>
|
||||
|
||||
#ifndef _MSC_VER
|
||||
@ -280,7 +281,8 @@ static PyObject* THPModule_crashIfvptrUBSAN(PyObject* module, PyObject* noarg) {
|
||||
virtual ~Baz() = default;
|
||||
};
|
||||
Baz x{};
|
||||
// NOLINTNEXTLINE(bugprone-casting*)
|
||||
// Purposely cast through `void*` so there's no fixups applied.
|
||||
// NOLINTNEXTLINE(bugprone-casting-through-void,-warnings-as-errors)
|
||||
auto y = static_cast<Foo*>(static_cast<void*>(&x));
|
||||
auto rc = y->bar();
|
||||
return THPUtils_packInt32(rc);
|
||||
@ -1249,6 +1251,7 @@ static PyObject* THPModule_setQEngine(PyObject* /* unused */, PyObject* arg) {
|
||||
"but got ",
|
||||
THPUtils_typename(arg));
|
||||
auto qengine = THPUtils_unpackLong(arg);
|
||||
// NOLINTNEXTLINE(clang-analyzer-optin.core.EnumCastOutOfRange)
|
||||
at::globalContext().setQEngine(static_cast<at::QEngine>(qengine));
|
||||
Py_RETURN_NONE;
|
||||
END_HANDLE_TH_ERRORS
|
||||
@ -1743,6 +1746,7 @@ static std::initializer_list<PyMethodDef> TorchMethods = {
|
||||
{nullptr, nullptr, 0, nullptr}};
|
||||
|
||||
#ifdef USE_CUDA
|
||||
// NOLINTBEGIN(misc-use-internal-linkage)
|
||||
void THCPStream_init(PyObject* module);
|
||||
void THCPEvent_init(PyObject* module);
|
||||
void THCPGraph_init(PyObject* module);
|
||||
@ -1751,6 +1755,7 @@ PyMethodDef* THCPModule_methods();
|
||||
namespace torch::cuda {
|
||||
void initModule(PyObject* module);
|
||||
} // namespace torch::cuda
|
||||
// NOLINTEND(misc-use-internal-linkage)
|
||||
#endif
|
||||
|
||||
#ifdef USE_XPU
|
||||
@ -1792,6 +1797,66 @@ class WeakTensorRef {
|
||||
}
|
||||
};
|
||||
|
||||
namespace {
|
||||
|
||||
using SigHandler = void (*)(int);
|
||||
|
||||
SigHandler* _getOldHandler(int signum) {
|
||||
#define SIG_CHECK(n) \
|
||||
if (signum == (n)) { \
|
||||
static SigHandler handler = nullptr; \
|
||||
return &handler; \
|
||||
}
|
||||
|
||||
SIG_CHECK(SIGSEGV);
|
||||
SIG_CHECK(SIGILL);
|
||||
|
||||
throw std::runtime_error("unexpected signal number");
|
||||
#undef SIG_CHECK
|
||||
}
|
||||
|
||||
extern "C" void _signalHandler(int signum) {
|
||||
// Note that technically there's not much you're allowed to do here - but
|
||||
// we're probably dying anyway so give it a try...
|
||||
|
||||
auto oldAction = *_getOldHandler(signum);
|
||||
*_getOldHandler(signum) = nullptr;
|
||||
|
||||
// If we hit another signal don't run this handler again.
|
||||
std::signal(signum, oldAction);
|
||||
|
||||
#ifdef _WIN32
|
||||
const char* signame = "<unknown>";
|
||||
#else
|
||||
const char* signame = strsignal(signum);
|
||||
#endif
|
||||
|
||||
fprintf(
|
||||
stderr,
|
||||
"Process %d crashed with signal %s (%d):\n",
|
||||
getpid(),
|
||||
signame,
|
||||
signum);
|
||||
|
||||
auto bt = c10::get_backtrace();
|
||||
fwrite(bt.data(), 1, bt.size(), stderr);
|
||||
|
||||
// Try to run the previous signal handler
|
||||
if (oldAction != SIG_IGN && oldAction != SIG_DFL) {
|
||||
oldAction(signum);
|
||||
}
|
||||
if (oldAction != SIG_IGN) {
|
||||
_exit(-1);
|
||||
}
|
||||
}
|
||||
|
||||
void _initCrashHandler() {
|
||||
*_getOldHandler(SIGILL) = std::signal(SIGILL, _signalHandler);
|
||||
*_getOldHandler(SIGSEGV) = std::signal(SIGSEGV, _signalHandler);
|
||||
}
|
||||
|
||||
} // anonymous namespace
|
||||
|
||||
extern "C" TORCH_PYTHON_API PyObject* initModule();
|
||||
// separate decl and defn for msvc error C2491
|
||||
PyObject* initModule() {
|
||||
@ -1970,6 +2035,7 @@ PyObject* initModule() {
|
||||
});
|
||||
|
||||
auto py_module = py::reinterpret_borrow<py::module>(module);
|
||||
py_module.def("_initCrashHandler", &_initCrashHandler);
|
||||
py_module.def("_demangle", &c10::demangle);
|
||||
py_module.def("_log_api_usage_once", &LogAPIUsageOnceFromPython);
|
||||
py_module.def("_log_api_usage_metadata", &LogAPIUsageMetadataFromPython);
|
||||
|
Reference in New Issue
Block a user