Move caffe2 signal_handler to c10. (#56717)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/56717

The signal_handler was under the caffe2 namespacee but was being used
by PyTorch as well.

I've fixed this my moving it to the c10 namespace where now both C2 and PyTorch
can use it.

The signal_handler interface in caffe2/utils/signal_handler.h is kept the same
for backward compatiblity for C2, but most of the commmon code is moved to c10.
ghstack-source-id: 127446929

Test Plan: waitforbuildbot

Reviewed By: ezyang

Differential Revision: D27946738

fbshipit-source-id: d6228d1a0108f4c807d405e7a0bb799c5375388f
This commit is contained in:
Pritam Damania
2021-04-26 23:06:55 -07:00
committed by Facebook GitHub Bot
parent 6ed5bbfb46
commit dc8a8cea79
9 changed files with 527 additions and 406 deletions

388
c10/util/signal_handler.cpp Normal file
View File

@ -0,0 +1,388 @@
#include <c10/util/Backtrace.h>
#include <c10/util/signal_handler.h>
#if defined(C10_SUPPORTS_SIGNAL_HANDLER)
// Normal signal handler implementation.
#include <cxxabi.h>
#include <dirent.h>
#include <dlfcn.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <unwind.h>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <unordered_set>
#ifdef C10_ANDROID
#ifndef SYS_gettid
#define SYS_gettid __NR_gettid
#endif
#ifndef SYS_tgkill
#define SYS_tgkill __NR_tgkill
#endif
#endif
namespace {
struct sigaction previousSighup;
struct sigaction previousSigint;
std::atomic<int> sigintCount(0);
std::atomic<int> sighupCount(0);
std::atomic<int> hookedUpCount(0);
void handleSignal(int signal) {
switch (signal) {
// TODO: what if the previous handler uses sa_sigaction?
case SIGHUP:
sighupCount += 1;
if (previousSighup.sa_handler) {
previousSighup.sa_handler(signal);
}
break;
case SIGINT:
sigintCount += 1;
if (previousSigint.sa_handler) {
previousSigint.sa_handler(signal);
}
break;
}
}
void hookupHandler() {
if (hookedUpCount++) {
return;
}
struct sigaction sa;
// Setup the handler
sa.sa_handler = &handleSignal;
// Restart the system call, if at all possible
sa.sa_flags = SA_RESTART;
// Block every signal during the handler
sigfillset(&sa.sa_mask);
// Intercept SIGHUP and SIGINT
if (sigaction(SIGHUP, &sa, &previousSighup) == -1) {
LOG(FATAL) << "Cannot install SIGHUP handler.";
}
if (sigaction(SIGINT, &sa, &previousSigint) == -1) {
LOG(FATAL) << "Cannot install SIGINT handler.";
}
}
// Set the signal handlers to the default.
void unhookHandler() {
if (--hookedUpCount > 0) {
return;
}
struct sigaction sa;
// Setup the sighub handler
sa.sa_handler = SIG_DFL;
// Restart the system call, if at all possible
sa.sa_flags = SA_RESTART;
// Block every signal during the handler
sigfillset(&sa.sa_mask);
// Intercept SIGHUP and SIGINT
if (sigaction(SIGHUP, &previousSighup, nullptr) == -1) {
LOG(FATAL) << "Cannot uninstall SIGHUP handler.";
}
if (sigaction(SIGINT, &previousSigint, nullptr) == -1) {
LOG(FATAL) << "Cannot uninstall SIGINT handler.";
}
}
} // namespace
namespace c10 {
#if defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
FatalSignalHandler& FatalSignalHandler::getInstance() {
// Leaky singleton to avoid module destructor race.
static FatalSignalHandler* handler = new FatalSignalHandler();
return *handler;
}
FatalSignalHandler::~FatalSignalHandler() {}
FatalSignalHandler::FatalSignalHandler()
: fatalSignalHandlersInstalled(false),
fatalSignalReceived(false),
fatalSignalName("<UNKNOWN>"),
writingCond(PTHREAD_COND_INITIALIZER),
writingMutex(PTHREAD_MUTEX_INITIALIZER) {}
FatalSignalHandler::signal_handler FatalSignalHandler::kSignalHandlers[] = {
{"SIGABRT", SIGABRT, {}},
{"SIGINT", SIGINT, {}},
{"SIGILL", SIGILL, {}},
{"SIGFPE", SIGFPE, {}},
{"SIGBUS", SIGBUS, {}},
{"SIGSEGV", SIGSEGV, {}},
{nullptr, 0, {}}};
struct sigaction* FatalSignalHandler::getPreviousSigaction(int signum) {
for (auto handler = kSignalHandlers; handler->name != nullptr; handler++) {
if (handler->signum == signum) {
return &handler->previous;
}
}
return nullptr;
}
const char* FatalSignalHandler::getSignalName(int signum) {
for (auto handler = kSignalHandlers; handler->name != nullptr; handler++) {
if (handler->signum == signum) {
return handler->name;
}
}
return nullptr;
}
void FatalSignalHandler::callPreviousSignalHandler(
struct sigaction* action,
int signum,
siginfo_t* info,
void* ctx) {
if (!action->sa_handler) {
return;
}
if ((action->sa_flags & SA_SIGINFO) == SA_SIGINFO) {
action->sa_sigaction(signum, info, ctx);
} else {
action->sa_handler(signum);
}
}
// needsLock signals whether we need to lock our writing mutex.
void FatalSignalHandler::stacktraceSignalHandler(bool needsLock) {
if (needsLock) {
pthread_mutex_lock(&writingMutex);
}
pid_t tid = syscall(SYS_gettid);
std::cerr << fatalSignalName << "(" << fatalSignum << "), PID: " << ::getpid()
<< ", Thread " << tid << ": " << std::endl;
std::cerr << c10::get_backtrace();
std::cerr << std::endl;
if (needsLock) {
pthread_mutex_unlock(&writingMutex);
pthread_cond_signal(&writingCond);
}
}
void FatalSignalHandler::fatalSignalHandlerPostProcess() {}
void FatalSignalHandler::fatalSignalHandlerStatic(int signum) {
getInstance().fatalSignalHandler(signum);
}
// Our fatal signal entry point
void FatalSignalHandler::fatalSignalHandler(int signum) {
// Check if this is a proper signal that we declared above.
const char* name = getSignalName(signum);
if (!name) {
return;
}
if (fatalSignalReceived) {
return;
}
// Set the flag so that our SIGUSR2 handler knows that we're aborting and
// that it should intercept any SIGUSR2 signal.
fatalSignalReceived = true;
// Set state for other threads.
fatalSignum = signum;
fatalSignalName = name;
// Linux doesn't have a nice userland API for enumerating threads so we
// need to use the proc pseudo-filesystem.
DIR* procDir = opendir("/proc/self/task");
if (procDir) {
pid_t pid = getpid();
pid_t currentTid = syscall(SYS_gettid);
struct dirent* entry;
pthread_mutex_lock(&writingMutex);
while ((entry = readdir(procDir)) != nullptr) {
if (entry->d_name[0] == '.') {
continue;
}
pid_t tid = atoi(entry->d_name);
// If we've found the current thread then we'll jump into the SIGUSR2
// handler before calling pthread_cond_wait thus deadlocking, so branch
// our directly to the backtrace handler instead of signaling it.
if (tid != currentTid) {
syscall(SYS_tgkill, pid, tid, SIGUSR2);
pthread_cond_wait(&writingCond, &writingMutex);
} else {
stacktraceSignalHandler(false);
}
}
pthread_mutex_unlock(&writingMutex);
} else {
perror("Failed to open /proc/self/task");
}
fatalSignalHandlerPostProcess();
sigaction(signum, getPreviousSigaction(signum), nullptr);
raise(signum);
}
// Our SIGUSR2 entry point
void FatalSignalHandler::stacktraceSignalHandlerStatic(
int signum,
siginfo_t* info,
void* ctx) {
getInstance().stacktraceSignalHandler(signum, info, ctx);
}
void FatalSignalHandler::stacktraceSignalHandler(
int signum,
siginfo_t* info,
void* ctx) {
if (fatalSignalReceived) {
stacktraceSignalHandler(true);
} else {
// We don't want to actually change the signal handler as we want to
// remain the signal handler so that we may get the usr2 signal later.
callPreviousSignalHandler(&previousSigusr2, signum, info, ctx);
}
}
// Installs SIGABRT signal handler so that we get stack traces
// from every thread on SIGABRT caused exit. Also installs SIGUSR2 handler
// so that threads can communicate with each other (be sure if you use SIGUSR2)
// to install your handler before initing caffe2 (we properly fall back to
// the previous handler if we didn't initiate the SIGUSR2).
void FatalSignalHandler::installFatalSignalHandlers() {
std::lock_guard<std::mutex> locker(fatalSignalHandlersInstallationMutex);
if (fatalSignalHandlersInstalled) {
return;
}
fatalSignalHandlersInstalled = true;
struct sigaction sa;
sigemptyset(&sa.sa_mask);
// Since we'll be in an exiting situation it's possible there's memory
// corruption, so make our own stack just in case.
sa.sa_flags = SA_ONSTACK | SA_SIGINFO;
sa.sa_handler = FatalSignalHandler::fatalSignalHandlerStatic;
for (auto* handler = kSignalHandlers; handler->name != nullptr; handler++) {
if (sigaction(handler->signum, &sa, &handler->previous)) {
std::string str("Failed to add ");
str += handler->name;
str += " handler!";
perror(str.c_str());
}
}
sa.sa_sigaction = FatalSignalHandler::stacktraceSignalHandlerStatic;
if (sigaction(SIGUSR2, &sa, &previousSigusr2)) {
perror("Failed to add SIGUSR2 handler!");
}
}
void FatalSignalHandler::uninstallFatalSignalHandlers() {
std::lock_guard<std::mutex> locker(fatalSignalHandlersInstallationMutex);
if (!fatalSignalHandlersInstalled) {
return;
}
fatalSignalHandlersInstalled = false;
for (auto* handler = kSignalHandlers; handler->name != nullptr; handler++) {
if (sigaction(handler->signum, &handler->previous, nullptr)) {
std::string str("Failed to remove ");
str += handler->name;
str += " handler!";
perror(str.c_str());
} else {
handler->previous = {};
}
}
if (sigaction(SIGUSR2, &previousSigusr2, nullptr)) {
perror("Failed to add SIGUSR2 handler!");
} else {
previousSigusr2 = {};
}
}
#endif // defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
SignalHandler::SignalHandler(
SignalHandler::Action SIGINT_action,
SignalHandler::Action SIGHUP_action)
: SIGINT_action_(SIGINT_action),
SIGHUP_action_(SIGHUP_action),
my_sigint_count_(sigintCount),
my_sighup_count_(sighupCount) {
hookupHandler();
}
SignalHandler::~SignalHandler() {
unhookHandler();
}
// Return true iff a SIGINT has been received since the last time this
// function was called.
bool SignalHandler::GotSIGINT() {
uint64_t count = sigintCount;
bool result = (count != my_sigint_count_);
my_sigint_count_ = count;
return result;
}
// Return true iff a SIGHUP has been received since the last time this
// function was called.
bool SignalHandler::GotSIGHUP() {
uint64_t count = sighupCount;
bool result = (count != my_sighup_count_);
my_sighup_count_ = count;
return result;
}
SignalHandler::Action SignalHandler::CheckForSignals() {
if (GotSIGHUP()) {
return SIGHUP_action_;
}
if (GotSIGINT()) {
return SIGINT_action_;
}
return SignalHandler::Action::NONE;
}
#if defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
void FatalSignalHandler::setPrintStackTracesOnFatalSignal(bool print) {
if (print) {
installFatalSignalHandlers();
} else {
uninstallFatalSignalHandlers();
}
}
bool FatalSignalHandler::printStackTracesOnFatalSignal() {
std::lock_guard<std::mutex> locker(fatalSignalHandlersInstallationMutex);
return fatalSignalHandlersInstalled;
}
#endif // defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
} // namespace c10
#else // defined(C10_SUPPORTS_SIGNAL_HANDLER)
// TODO: Currently we do not support signal handling in non-Linux yet - below is
// a minimal implementation that makes things compile.
namespace c10 {
SignalHandler::SignalHandler(
SignalHandler::Action SIGINT_action,
SignalHandler::Action SIGHUP_action) {
SIGINT_action_ = SIGINT_action;
SIGHUP_action_ = SIGHUP_action;
my_sigint_count_ = 0;
my_sighup_count_ = 0;
}
SignalHandler::~SignalHandler() {}
bool SignalHandler::GotSIGINT() {
return false;
}
bool SignalHandler::GotSIGHUP() {
return false;
}
SignalHandler::Action SignalHandler::CheckForSignals() {
return SignalHandler::Action::NONE;
}
} // namespace c10
#endif // defined(C10_SUPPORTS_SIGNAL_HANDLER)

106
c10/util/signal_handler.h Normal file
View File

@ -0,0 +1,106 @@
#pragma once
#include <atomic>
#include <csignal>
#include <mutex>
#include <c10/macros/Export.h>
#include <c10/util/Logging.h>
#if defined(__APPLE__)
#define C10_SUPPORTS_SIGNAL_HANDLER
#elif defined(__linux__) && !defined(C10_DISABLE_SIGNAL_HANDLERS)
#define C10_SUPPORTS_FATAL_SIGNAL_HANDLERS
#define C10_SUPPORTS_SIGNAL_HANDLER
#endif
#if defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
#include <pthread.h>
#endif
namespace c10 {
class TORCH_API SignalHandler {
public:
enum class Action { NONE, STOP };
// Constructor. Specify what action to take when a signal is received.
SignalHandler(Action SIGINT_action, Action SIGHUP_action);
~SignalHandler();
Action CheckForSignals();
bool GotSIGINT();
bool GotSIGHUP();
Action SIGINT_action_;
Action SIGHUP_action_;
unsigned long my_sigint_count_;
unsigned long my_sighup_count_;
};
#if defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
class TORCH_API FatalSignalHandler {
// This works by setting up certain fatal signal handlers. Previous fatal
// signal handlers will still be called when the signal is raised. Defaults
// to being off.
public:
TORCH_API void setPrintStackTracesOnFatalSignal(bool print);
TORCH_API bool printStackTracesOnFatalSignal();
static FatalSignalHandler& getInstance();
virtual ~FatalSignalHandler();
protected:
explicit FatalSignalHandler();
private:
void installFatalSignalHandlers();
void uninstallFatalSignalHandlers();
static void fatalSignalHandlerStatic(int signum);
void fatalSignalHandler(int signum);
virtual void fatalSignalHandlerPostProcess();
struct sigaction* getPreviousSigaction(int signum);
const char* getSignalName(int signum);
void callPreviousSignalHandler(
struct sigaction* action,
int signum,
siginfo_t* info,
void* ctx);
void stacktraceSignalHandler(bool needsLock);
static void stacktraceSignalHandlerStatic(
int signum,
siginfo_t* info,
void* ctx);
void stacktraceSignalHandler(int signum, siginfo_t* info, void* ctx);
// The mutex protects the bool.
std::mutex fatalSignalHandlersInstallationMutex;
bool fatalSignalHandlersInstalled;
// We need to hold a reference to call the previous SIGUSR2 handler in case
// we didn't signal it
struct sigaction previousSigusr2;
// Flag dictating whether the SIGUSR2 handler falls back to previous handlers
// or is intercepted in order to print a stack trace.
std::atomic<bool> fatalSignalReceived;
// Global state set when a fatal signal is received so that backtracing
// threads know why they're printing a stacktrace.
const char* fatalSignalName;
int fatalSignum = -1;
// This wait condition is used to wait for other threads to finish writing
// their stack trace when in fatal sig handler (we can't use pthread_join
// because there's no way to convert from a tid to a pthread_t).
pthread_cond_t writingCond;
pthread_mutex_t writingMutex;
struct signal_handler {
const char* name;
int signum;
struct sigaction previous;
};
static signal_handler kSignalHandlers[];
};
#endif // defined(C10_SUPPORTS_SIGNAL_HANDLER)
} // namespace c10

View File

@ -35,7 +35,6 @@
#include "caffe2/python/pybind_state_registry.h"
#include "caffe2/utils/cpuid.h"
#include "caffe2/utils/proto_convert.h"
#include "caffe2/utils/signal_handler.h"
#include "caffe2/utils/string_utils.h"
#include "torch/csrc/autograd/variable.h"
#include "torch/csrc/jit/python/module_python.h"
@ -1922,11 +1921,6 @@ void addGlobalMethods(py::module& m) {
return py::bytes(out);
});
#if defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
m.def("set_print_stack_traces_on_fatal_signal",
&caffe2::setPrintStackTracesOnFatalSignal);
#endif
auto initialize = [&]() {
// Initialization of the module
#ifdef USE_NUMPY

View File

@ -1,5 +1,5 @@
#include "caffe2/utils/signal_handler.h"
#if defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
#if defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
#include <gtest/gtest.h>
#include <pthread.h>
#include <unistd.h>
@ -145,4 +145,4 @@ TEST(fatalSignalTest, SIGSEGV8) {
TEST(fatalSignalTest, SIGABRT8_NOPRINT) {
TEST_FATAL_SIGNAL_NO_PRINT(SIGABRT, "SIGABRT", 8);
}
#endif // defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
#endif // defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)

View File

@ -1,369 +1,42 @@
#include "caffe2/utils/signal_handler.h"
#include "caffe2/core/logging.h"
#include <c10/util/Backtrace.h>
#if defined(CAFFE2_SUPPORTS_SIGNAL_HANDLER)
// Normal signal handler implementation.
#include <cxxabi.h>
#include <dirent.h>
#include <dlfcn.h>
#include <pthread.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>
#include <unwind.h>
#include <atomic>
#include <csignal>
#include <cstdio>
#include <cstdlib>
#include <mutex>
#include <unordered_set>
#include "caffe2/core/init.h"
#include "caffe2/core/workspace.h"
#ifdef C10_ANDROID
#ifndef SYS_gettid
#define SYS_gettid __NR_gettid
#endif
#ifndef SYS_tgkill
#define SYS_tgkill __NR_tgkill
#endif
#endif
namespace {
struct sigaction previousSighup;
struct sigaction previousSigint;
std::atomic<int> sigintCount(0);
std::atomic<int> sighupCount(0);
std::atomic<int> hookedUpCount(0);
void handleSignal(int signal) {
switch (signal) {
// TODO: what if the previous handler uses sa_sigaction?
case SIGHUP:
sighupCount += 1;
if (previousSighup.sa_handler) {
previousSighup.sa_handler(signal);
}
break;
case SIGINT:
sigintCount += 1;
if (previousSigint.sa_handler) {
previousSigint.sa_handler(signal);
}
break;
}
}
void hookupHandler() {
if (hookedUpCount++) {
return;
}
struct sigaction sa;
// Setup the handler
sa.sa_handler = &handleSignal;
// Restart the system call, if at all possible
sa.sa_flags = SA_RESTART;
// Block every signal during the handler
sigfillset(&sa.sa_mask);
// Intercept SIGHUP and SIGINT
if (sigaction(SIGHUP, &sa, &previousSighup) == -1) {
LOG(FATAL) << "Cannot install SIGHUP handler.";
}
if (sigaction(SIGINT, &sa, &previousSigint) == -1) {
LOG(FATAL) << "Cannot install SIGINT handler.";
}
}
// Set the signal handlers to the default.
void unhookHandler() {
if (--hookedUpCount > 0) {
return;
}
struct sigaction sa;
// Setup the sighub handler
sa.sa_handler = SIG_DFL;
// Restart the system call, if at all possible
sa.sa_flags = SA_RESTART;
// Block every signal during the handler
sigfillset(&sa.sa_mask);
// Intercept SIGHUP and SIGINT
if (sigaction(SIGHUP, &previousSighup, nullptr) == -1) {
LOG(FATAL) << "Cannot uninstall SIGHUP handler.";
}
if (sigaction(SIGINT, &previousSigint, nullptr) == -1) {
LOG(FATAL) << "Cannot uninstall SIGINT handler.";
}
}
#if defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
// The mutex protects the bool.
std::mutex fatalSignalHandlersInstallationMutex;
bool fatalSignalHandlersInstalled;
// We need to hold a reference to call the previous SIGUSR2 handler in case
// we didn't signal it
struct sigaction previousSigusr2;
// Flag dictating whether the SIGUSR2 handler falls back to previous handlers
// or is intercepted in order to print a stack trace.
std::atomic<bool> fatalSignalReceived(false);
// Global state set when a fatal signal is received so that backtracing threads
// know why they're printing a stacktrace.
const char* fatalSignalName("<UNKNOWN>");
int fatalSignum(-1);
// This wait condition is used to wait for other threads to finish writing
// their stack trace when in fatal sig handler (we can't use pthread_join
// because there's no way to convert from a tid to a pthread_t).
pthread_cond_t writingCond = PTHREAD_COND_INITIALIZER;
pthread_mutex_t writingMutex = PTHREAD_MUTEX_INITIALIZER;
struct {
const char* name;
int signum;
struct sigaction previous;
} kSignalHandlers[] = {{"SIGABRT", SIGABRT, {}},
{"SIGINT", SIGINT, {}},
{"SIGILL", SIGILL, {}},
{"SIGFPE", SIGFPE, {}},
{"SIGBUS", SIGBUS, {}},
{"SIGSEGV", SIGSEGV, {}},
{nullptr, 0, {}}};
struct sigaction* getPreviousSigaction(int signum) {
for (auto handler = kSignalHandlers; handler->name != nullptr; handler++) {
if (handler->signum == signum) {
return &handler->previous;
}
}
return nullptr;
}
const char* getSignalName(int signum) {
for (auto handler = kSignalHandlers; handler->name != nullptr; handler++) {
if (handler->signum == signum) {
return handler->name;
}
}
return nullptr;
}
void printBlobSizes() {
::caffe2::Workspace::ForEach(
[&](::caffe2::Workspace* ws) { ws->PrintBlobSizes(); });
}
void callPreviousSignalHandler(
struct sigaction* action,
int signum,
siginfo_t* info,
void* ctx) {
if (!action->sa_handler) {
return;
}
if ((action->sa_flags & SA_SIGINFO) == SA_SIGINFO) {
action->sa_sigaction(signum, info, ctx);
} else {
action->sa_handler(signum);
}
}
// needsLock signals whether we need to lock our writing mutex.
void stacktraceSignalHandler(bool needsLock) {
if (needsLock) {
pthread_mutex_lock(&writingMutex);
}
pid_t tid = syscall(SYS_gettid);
std::cerr << fatalSignalName << "(" << fatalSignum << "), PID: " << ::getpid()
<< ", Thread " << tid << ": " << std::endl;
std::cerr << c10::get_backtrace();
std::cerr << std::endl;
if (needsLock) {
pthread_mutex_unlock(&writingMutex);
pthread_cond_signal(&writingCond);
}
}
// Our fatal signal entry point
void fatalSignalHandler(int signum) {
// Check if this is a proper signal that we declared above.
const char* name = getSignalName(signum);
if (!name) {
return;
}
if (fatalSignalReceived) {
return;
}
// Set the flag so that our SIGUSR2 handler knows that we're aborting and
// that it should intercept any SIGUSR2 signal.
fatalSignalReceived = true;
// Set state for other threads.
fatalSignum = signum;
fatalSignalName = name;
// Linux doesn't have a nice userland API for enumerating threads so we
// need to use the proc pseudo-filesystem.
DIR* procDir = opendir("/proc/self/task");
if (procDir) {
pid_t pid = getpid();
pid_t currentTid = syscall(SYS_gettid);
struct dirent* entry;
pthread_mutex_lock(&writingMutex);
while ((entry = readdir(procDir)) != nullptr) {
if (entry->d_name[0] == '.') {
continue;
}
pid_t tid = atoi(entry->d_name);
// If we've found the current thread then we'll jump into the SIGUSR2
// handler before calling pthread_cond_wait thus deadlocking, so branch
// our directly to the backtrace handler instead of signaling it.
if (tid != currentTid) {
syscall(SYS_tgkill, pid, tid, SIGUSR2);
pthread_cond_wait(&writingCond, &writingMutex);
} else {
stacktraceSignalHandler(false);
}
}
pthread_mutex_unlock(&writingMutex);
} else {
perror("Failed to open /proc/self/task");
}
printBlobSizes();
sigaction(signum, getPreviousSigaction(signum), nullptr);
raise(signum);
}
// Our SIGUSR2 entry point
void stacktraceSignalHandler(int signum, siginfo_t* info, void* ctx) {
if (fatalSignalReceived) {
stacktraceSignalHandler(true);
} else {
// We don't want to actually change the signal handler as we want to
// remain the signal handler so that we may get the usr2 signal later.
callPreviousSignalHandler(&previousSigusr2, signum, info, ctx);
}
}
// Installs SIGABRT signal handler so that we get stack traces
// from every thread on SIGABRT caused exit. Also installs SIGUSR2 handler
// so that threads can communicate with each other (be sure if you use SIGUSR2)
// to install your handler before initing caffe2 (we properly fall back to
// the previous handler if we didn't initiate the SIGUSR2).
void installFatalSignalHandlers() {
std::lock_guard<std::mutex> locker(fatalSignalHandlersInstallationMutex);
if (fatalSignalHandlersInstalled) {
return;
}
fatalSignalHandlersInstalled = true;
struct sigaction sa;
sigemptyset(&sa.sa_mask);
// Since we'll be in an exiting situation it's possible there's memory
// corruption, so make our own stack just in case.
sa.sa_flags = SA_ONSTACK | SA_SIGINFO;
sa.sa_handler = ::fatalSignalHandler;
for (auto* handler = kSignalHandlers; handler->name != nullptr; handler++) {
if (sigaction(handler->signum, &sa, &handler->previous)) {
std::string str("Failed to add ");
str += handler->name;
str += " handler!";
perror(str.c_str());
}
}
sa.sa_sigaction = ::stacktraceSignalHandler;
if (sigaction(SIGUSR2, &sa, &::previousSigusr2)) {
perror("Failed to add SIGUSR2 handler!");
}
}
void uninstallFatalSignalHandlers() {
std::lock_guard<std::mutex> locker(fatalSignalHandlersInstallationMutex);
if (!fatalSignalHandlersInstalled) {
return;
}
fatalSignalHandlersInstalled = false;
for (auto* handler = kSignalHandlers; handler->name != nullptr; handler++) {
if (sigaction(handler->signum, &handler->previous, nullptr)) {
std::string str("Failed to remove ");
str += handler->name;
str += " handler!";
perror(str.c_str());
} else {
handler->previous = {};
}
}
if (sigaction(SIGUSR2, &::previousSigusr2, nullptr)) {
perror("Failed to add SIGUSR2 handler!");
} else {
::previousSigusr2 = {};
}
}
#endif // defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
} // namespace
#if defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
#if defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
C10_DEFINE_bool(
caffe2_print_stacktraces,
false,
"If set, prints stacktraces when a fatal signal is raised.");
#endif
namespace caffe2 {
SignalHandler::SignalHandler(
SignalHandler::Action SIGINT_action,
SignalHandler::Action SIGHUP_action)
: SIGINT_action_(SIGINT_action),
SIGHUP_action_(SIGHUP_action),
my_sigint_count_(sigintCount),
my_sighup_count_(sighupCount) {
hookupHandler();
C2FatalSignalHandler::C2FatalSignalHandler() {}
C2FatalSignalHandler& C2FatalSignalHandler::getInstance() {
// Leaky singleton to avoid module destructor race.
static C2FatalSignalHandler* handler = new C2FatalSignalHandler();
return *handler;
}
SignalHandler::~SignalHandler() {
unhookHandler();
void C2FatalSignalHandler::fatalSignalHandlerPostProcess() {
printBlobSizes();
}
// Return true iff a SIGINT has been received since the last time this
// function was called.
bool SignalHandler::GotSIGINT() {
uint64_t count = sigintCount;
bool result = (count != my_sigint_count_);
my_sigint_count_ = count;
return result;
}
// Return true iff a SIGHUP has been received since the last time this
// function was called.
bool SignalHandler::GotSIGHUP() {
uint64_t count = sighupCount;
bool result = (count != my_sighup_count_);
my_sighup_count_ = count;
return result;
}
SignalHandler::Action SignalHandler::CheckForSignals() {
if (GotSIGHUP()) {
return SIGHUP_action_;
}
if (GotSIGINT()) {
return SIGINT_action_;
}
return SignalHandler::Action::NONE;
}
#if defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
void setPrintStackTracesOnFatalSignal(bool print) {
if (print) {
installFatalSignalHandlers();
} else {
uninstallFatalSignalHandlers();
}
C2FatalSignalHandler::getInstance().setPrintStackTracesOnFatalSignal(print);
}
bool printStackTracesOnFatalSignal() {
std::lock_guard<std::mutex> locker(fatalSignalHandlersInstallationMutex);
return fatalSignalHandlersInstalled;
return C2FatalSignalHandler::getInstance().printStackTracesOnFatalSignal();
}
namespace internal {
@ -381,32 +54,5 @@ REGISTER_CAFFE2_INIT_FUNCTION(
" caffe2_print_stacktraces is set.");
} // namespace internal
#endif // defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
} // namespace caffe2
#else // defined(CAFFE2_SUPPORTS_SIGNAL_HANDLER)
// TODO: Currently we do not support signal handling in non-Linux yet - below is
// a minimal implementation that makes things compile.
namespace caffe2 {
SignalHandler::SignalHandler(
SignalHandler::Action SIGINT_action,
SignalHandler::Action SIGHUP_action) {
SIGINT_action_ = SIGINT_action;
SIGHUP_action_ = SIGHUP_action;
my_sigint_count_ = 0;
my_sighup_count_ = 0;
}
SignalHandler::~SignalHandler() {}
bool SignalHandler::GotSIGINT() {
return false;
}
bool SignalHandler::GotSIGHUP() {
return false;
}
SignalHandler::Action SignalHandler::CheckForSignals() {
return SignalHandler::Action::NONE;
}
} // namespace caffe2
#endif // defined(CAFFE2_SUPPORTS_SIGNAL_HANDLER)
#endif // defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)

View File

@ -1,45 +1,24 @@
#pragma once
#include "caffe2/core/common.h"
#if defined(__APPLE__)
#define CAFFE2_SUPPORTS_SIGNAL_HANDLER
#elif defined(__linux__) && !defined(CAFFE2_DISABLE_SIGNAL_HANDLERS)
#define CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS
#define CAFFE2_SUPPORTS_SIGNAL_HANDLER
#endif
#include <c10/util/signal_handler.h>
namespace caffe2 {
class TORCH_API SignalHandler {
#if defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
class TORCH_API C2FatalSignalHandler : public c10::FatalSignalHandler {
public:
enum class Action {
NONE,
STOP
};
// Constructor. Specify what action to take when a signal is received.
SignalHandler(Action SIGINT_action,
Action SIGHUP_action);
~SignalHandler();
Action CheckForSignals();
void fatalSignalHandlerPostProcess() override;
static C2FatalSignalHandler& getInstance();
private:
bool GotSIGINT();
bool GotSIGHUP();
Action SIGINT_action_;
Action SIGHUP_action_;
unsigned long my_sigint_count_;
unsigned long my_sighup_count_;
explicit C2FatalSignalHandler();
};
#if defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
// This works by setting up certain fatal signal handlers. Previous fatal
// signal handlers will still be called when the signal is raised. Defaults
// to being off.
TORCH_API void setPrintStackTracesOnFatalSignal(bool print);
TORCH_API bool printStackTracesOnFatalSignal();
#endif // defined(CAFFE2_SUPPORTS_SIGNAL_HANDLER)
#endif // defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLER)
} // namespace caffe2
} // namespace caffe2

View File

@ -168,6 +168,7 @@ HOIST_CONV_PACKED_PARAMS: MobileOptimizerType
def fork(*args: Any, **kwargs: Any) -> Future: ...
def wait(fut: Future) -> Any: ...
def _collect_all(futures: List[Future]) -> Future: ...
def _set_print_stack_traces_on_fatal_signal(print: _bool) -> None: ...
def unify_type_list(types: List[JitType]) -> JitType: ...
def _freeze_module(module: ScriptModule,

View File

@ -93,6 +93,7 @@
#include <torch/csrc/jit/tensorexpr/tensorexpr_init.h>
#include <c10/macros/Export.h>
#include <c10/util/signal_handler.h>
#include <caffe2/serialize/inline_container.h>
#include <ATen/core/function_schema.h>
@ -1384,6 +1385,13 @@ void initJITBindings(PyObject* module) {
toIValue(std::move(obj), type);
});
#if defined(C10_SUPPORTS_FATAL_SIGNAL_HANDLERS)
m.def("_set_print_stack_traces_on_fatal_signal", [](bool print) {
c10::FatalSignalHandler::getInstance().setPrintStackTracesOnFatalSignal(
print);
});
#endif // defined(C10_SUPPORTS_SIGNAL_HANDLER)
initPythonCustomClassBindings(module);
initPythonIRBindings(module);
tracer::initPythonTracerBindings(module);

View File

@ -484,8 +484,7 @@ class MultiProcessTestCase(TestCase):
if sys.platform != 'win32' and sys.platform != 'darwin':
# Register signal handler to dump stack traces on FATALs.
# Windows and MacOS do not support the signal handlers.
import caffe2.python._import_c_extension as C
C.set_print_stack_traces_on_fatal_signal(True) # type: ignore[attr-defined]
torch._C._set_print_stack_traces_on_fatal_signal(True) # type: ignore[attr-defined]
# self.id() == e.g. '__main__.TestDistributed.test_get_rank'
# We're retrieving a corresponding test and executing it.