Add a crash handler to async compile subprocesses (#155068)

When the async compile subprocesses crash in C++ they tend to just silently die instead of leaving any kind of trace.  This installs a crash handler so that if they SEGV, ILL, or ABRT they'll attempt to output a backtrace instead.

While in there I also cleaned up the CLANGTIDY warnings coming from Module.cpp.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/155068
Approved by: https://github.com/masnesral
This commit is contained in:
Aaron Orenstein
2025-06-24 08:03:17 -07:00
committed by PyTorch MergeBot
parent beb52f5c0a
commit 568ca89bac
3 changed files with 73 additions and 1 deletions

View File

@ -2,6 +2,7 @@
#include <fmt/core.h>
#include <sys/types.h>
#include <torch/csrc/python_headers.h>
#include <csignal>
#include <optional>
#ifndef _MSC_VER
@ -280,7 +281,8 @@ static PyObject* THPModule_crashIfvptrUBSAN(PyObject* module, PyObject* noarg) {
virtual ~Baz() = default;
};
Baz x{};
// NOLINTNEXTLINE(bugprone-casting*)
// Purposely cast through `void*` so there's no fixups applied.
// NOLINTNEXTLINE(bugprone-casting-through-void,-warnings-as-errors)
auto y = static_cast<Foo*>(static_cast<void*>(&x));
auto rc = y->bar();
return THPUtils_packInt32(rc);
@ -1249,6 +1251,7 @@ static PyObject* THPModule_setQEngine(PyObject* /* unused */, PyObject* arg) {
"but got ",
THPUtils_typename(arg));
auto qengine = THPUtils_unpackLong(arg);
// NOLINTNEXTLINE(clang-analyzer-optin.core.EnumCastOutOfRange)
at::globalContext().setQEngine(static_cast<at::QEngine>(qengine));
Py_RETURN_NONE;
END_HANDLE_TH_ERRORS
@ -1743,6 +1746,7 @@ static std::initializer_list<PyMethodDef> TorchMethods = {
{nullptr, nullptr, 0, nullptr}};
#ifdef USE_CUDA
// NOLINTBEGIN(misc-use-internal-linkage)
void THCPStream_init(PyObject* module);
void THCPEvent_init(PyObject* module);
void THCPGraph_init(PyObject* module);
@ -1751,6 +1755,7 @@ PyMethodDef* THCPModule_methods();
namespace torch::cuda {
void initModule(PyObject* module);
} // namespace torch::cuda
// NOLINTEND(misc-use-internal-linkage)
#endif
#ifdef USE_XPU
@ -1792,6 +1797,66 @@ class WeakTensorRef {
}
};
namespace {
using SigHandler = void (*)(int);
SigHandler* _getOldHandler(int signum) {
#define SIG_CHECK(n) \
if (signum == (n)) { \
static SigHandler handler = nullptr; \
return &handler; \
}
SIG_CHECK(SIGSEGV);
SIG_CHECK(SIGILL);
throw std::runtime_error("unexpected signal number");
#undef SIG_CHECK
}
extern "C" void _signalHandler(int signum) {
// Note that technically there's not much you're allowed to do here - but
// we're probably dying anyway so give it a try...
auto oldAction = *_getOldHandler(signum);
*_getOldHandler(signum) = nullptr;
// If we hit another signal don't run this handler again.
std::signal(signum, oldAction);
#ifdef _WIN32
const char* signame = "<unknown>";
#else
const char* signame = strsignal(signum);
#endif
fprintf(
stderr,
"Process %d crashed with signal %s (%d):\n",
getpid(),
signame,
signum);
auto bt = c10::get_backtrace();
fwrite(bt.data(), 1, bt.size(), stderr);
// Try to run the previous signal handler
if (oldAction != SIG_IGN && oldAction != SIG_DFL) {
oldAction(signum);
}
if (oldAction != SIG_IGN) {
_exit(-1);
}
}
void _initCrashHandler() {
*_getOldHandler(SIGILL) = std::signal(SIGILL, _signalHandler);
*_getOldHandler(SIGSEGV) = std::signal(SIGSEGV, _signalHandler);
}
} // anonymous namespace
extern "C" TORCH_PYTHON_API PyObject* initModule();
// separate decl and defn for msvc error C2491
PyObject* initModule() {
@ -1970,6 +2035,7 @@ PyObject* initModule() {
});
auto py_module = py::reinterpret_borrow<py::module>(module);
py_module.def("_initCrashHandler", &_initCrashHandler);
py_module.def("_demangle", &c10::demangle);
py_module.def("_log_api_usage_once", &LogAPIUsageOnceFromPython);
py_module.def("_log_api_usage_metadata", &LogAPIUsageMetadataFromPython);