mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/15363 Didn't define C10_MOBILE in the numa file move diff: D13380559 move CAFFE2_MOBILE/ANDROID/IOS to c10 ``` codemod -m -d caffe2 --extensions h,hpp,cc,cpp,mm "CAFFE2_MOBILE" "C10_MOBILE" codemod -m -d caffe2 --extensions h,hpp,cc,cpp,mm "CAFFE2_ANDROID" "C10_ANDROID" codemod -m -d caffe2 --extensions h,hpp,cc,cpp,mm "CAFFE2_IOS" "C10_IOS" ``` i-am-not-moving-c2-to-c10 Reviewed By: marcinkwiatkowski Differential Revision: D13490020 fbshipit-source-id: c4f01cacbefc0f16d5de94155c26c92fd5d780e4
461 lines
13 KiB
C++
461 lines
13 KiB
C++
#include "caffe2/utils/signal_handler.h"
|
|
#include "caffe2/core/logging.h"
|
|
|
|
#if defined(CAFFE2_SUPPORTS_SIGNAL_HANDLER)
|
|
|
|
// Normal signal handler implementation.
|
|
#include <cxxabi.h>
|
|
#include <dirent.h>
|
|
#include <dlfcn.h>
|
|
#include <pthread.h>
|
|
#include <sys/syscall.h>
|
|
#include <sys/types.h>
|
|
#include <unistd.h>
|
|
#include <unwind.h>
|
|
|
|
#include <atomic>
|
|
#include <csignal>
|
|
#include <cstdio>
|
|
#include <cstdlib>
|
|
#include <mutex>
|
|
#include <unordered_set>
|
|
|
|
#include "caffe2/core/init.h"
|
|
#include "caffe2/core/workspace.h"
|
|
|
|
#if C10_ANDROID
|
|
#ifndef SYS_gettid
|
|
#define SYS_gettid __NR_gettid
|
|
#endif
|
|
#ifndef SYS_tgkill
|
|
#define SYS_tgkill __NR_tgkill
|
|
#endif
|
|
#endif
|
|
|
|
namespace {
|
|
|
|
// Dispositions that were in effect for SIGHUP / SIGINT before hookupHandler()
// replaced them. handleSignal() chains to them and unhookHandler() restores
// them.
struct sigaction previousSighup;
struct sigaction previousSigint;
// Number of SIGINT / SIGHUP deliveries observed by handleSignal().
std::atomic<int> sigintCount(0);
std::atomic<int> sighupCount(0);
// Count of hookupHandler() calls not yet balanced by unhookHandler().
std::atomic<int> hookedUpCount(0);

// Returns true when `handler` is the address of a real function. SIG_DFL and
// SIG_IGN are sentinel values, not functions, and must never be called
// (SIG_IGN is non-null, so a plain null check is not sufficient).
bool isCallableHandler(void (*handler)(int)) {
  return handler != SIG_DFL && handler != SIG_IGN;
}

// Signal handler installed for SIGHUP and SIGINT: bumps the matching counter
// and then forwards the signal to the previously installed handler, if that
// handler is an actual function. Any other signal number is ignored.
void handleSignal(int signal) {
  switch (signal) {
    // TODO: what if the previous handler uses sa_sigaction?
    case SIGHUP:
      sighupCount += 1;
      if (isCallableHandler(previousSighup.sa_handler)) {
        previousSighup.sa_handler(signal);
      }
      break;
    case SIGINT:
      sigintCount += 1;
      if (isCallableHandler(previousSigint.sa_handler)) {
        previousSigint.sa_handler(signal);
      }
      break;
  }
}
|
|
|
|
void hookupHandler() {
|
|
if (hookedUpCount++) {
|
|
return;
|
|
}
|
|
struct sigaction sa;
|
|
// Setup the handler
|
|
sa.sa_handler = &handleSignal;
|
|
// Restart the system call, if at all possible
|
|
sa.sa_flags = SA_RESTART;
|
|
// Block every signal during the handler
|
|
sigfillset(&sa.sa_mask);
|
|
// Intercept SIGHUP and SIGINT
|
|
if (sigaction(SIGHUP, &sa, &previousSighup) == -1) {
|
|
LOG(FATAL) << "Cannot install SIGHUP handler.";
|
|
}
|
|
if (sigaction(SIGINT, &sa, &previousSigint) == -1) {
|
|
LOG(FATAL) << "Cannot install SIGINT handler.";
|
|
}
|
|
}
|
|
|
|
// Set the signal handlers to the default.
|
|
void unhookHandler() {
|
|
if (--hookedUpCount > 0) {
|
|
return;
|
|
}
|
|
struct sigaction sa;
|
|
// Setup the sighub handler
|
|
sa.sa_handler = SIG_DFL;
|
|
// Restart the system call, if at all possible
|
|
sa.sa_flags = SA_RESTART;
|
|
// Block every signal during the handler
|
|
sigfillset(&sa.sa_mask);
|
|
// Intercept SIGHUP and SIGINT
|
|
if (sigaction(SIGHUP, &previousSighup, nullptr) == -1) {
|
|
LOG(FATAL) << "Cannot uninstall SIGHUP handler.";
|
|
}
|
|
if (sigaction(SIGINT, &previousSigint, nullptr) == -1) {
|
|
LOG(FATAL) << "Cannot uninstall SIGINT handler.";
|
|
}
|
|
}
|
|
|
|
#if defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
|
|
// Guards fatalSignalHandlersInstalled.
std::mutex fatalSignalHandlersInstallationMutex;
bool fatalSignalHandlersInstalled = false;

// The SIGUSR2 disposition we replaced; kept so that SIGUSR2 signals we did
// not send ourselves can still be forwarded to it.
struct sigaction previousSigusr2;

// Decides what the SIGUSR2 handler does: once set, SIGUSR2 is intercepted to
// print a stack trace; while clear, it falls back to the previous handler.
std::atomic<bool> fatalSignalReceived{false};

// Filled in when a fatal signal arrives so that the threads printing their
// backtraces can report which signal triggered the dump.
const char* fatalSignalName = "<UNKNOWN>";
int fatalSignum = -1;

// Used by the fatal signal handler to wait until another thread has finished
// writing its stack trace (pthread_join is not an option because there is no
// way to convert from a tid to a pthread_t).
pthread_cond_t writingCond = PTHREAD_COND_INITIALIZER;
pthread_mutex_t writingMutex = PTHREAD_MUTEX_INITIALIZER;
|
|
|
|
// Table of the fatal signals we intercept. The final entry, whose name is
// null, is a sentinel that marks the end of the table.
struct {
  // Printable name of the signal.
  const char* name;
  // Signal number.
  int signum;
  // Disposition that was active before ours was installed.
  struct sigaction previous;
} kSignalHandlers[] = {
    {"SIGABRT", SIGABRT, {}},
    {"SIGINT", SIGINT, {}},
    {"SIGILL", SIGILL, {}},
    {"SIGFPE", SIGFPE, {}},
    {"SIGBUS", SIGBUS, {}},
    {"SIGSEGV", SIGSEGV, {}},
    {nullptr, 0, {}},
};
|
|
|
|
struct sigaction* getPreviousSigaction(int signum) {
|
|
for (auto handler = kSignalHandlers; handler->name != nullptr; handler++) {
|
|
if (handler->signum == signum) {
|
|
return &handler->previous;
|
|
}
|
|
}
|
|
return nullptr;
|
|
}
|
|
|
|
const char* getSignalName(int signum) {
|
|
for (auto handler = kSignalHandlers; handler->name != nullptr; handler++) {
|
|
if (handler->signum == signum) {
|
|
return handler->name;
|
|
}
|
|
}
|
|
return nullptr;
|
|
}
|
|
|
|
// Callback for _Unwind_Backtrace: appends the instruction pointer of the
// frame described by `context` to the std::vector<uintptr_t> that `userInfo`
// points to, and asks the unwinder to keep going.
_Unwind_Reason_Code unwinder(struct _Unwind_Context* context, void* userInfo) {
  auto* frames = static_cast<std::vector<uintptr_t>*>(userInfo);
  const uintptr_t instructionPointer = _Unwind_GetIP(context);
  frames->push_back(instructionPointer);
  return _URC_NO_REASON;
}
|
|
|
|
// Collects the instruction pointers of the calling thread's stack frames by
// running _Unwind_Backtrace with unwinder() as the per-frame callback.
std::vector<uintptr_t> getBacktrace() {
  std::vector<uintptr_t> frames;
  _Unwind_Backtrace(&unwinder, static_cast<void*>(&frames));
  return frames;
}
|
|
|
|
void printBlobSizes() {
|
|
::caffe2::Workspace::ForEach(
|
|
[&](::caffe2::Workspace* ws) { ws->PrintBlobSizes(); });
|
|
}
|
|
|
|
void printStacktrace() {
|
|
std::vector<uintptr_t> pcs = getBacktrace();
|
|
Dl_info info;
|
|
size_t i = 0;
|
|
for (uintptr_t pcAddr : pcs) {
|
|
const void* pc = reinterpret_cast<const void*>(pcAddr);
|
|
const char* path = nullptr;
|
|
const char* name = "???";
|
|
char* demangled = nullptr;
|
|
int offset = -1;
|
|
|
|
std::cerr << "[" << i << "] ";
|
|
if (dladdr(pc, &info)) {
|
|
path = info.dli_fname;
|
|
name = info.dli_sname ?: "???";
|
|
offset = reinterpret_cast<uintptr_t>(pc) -
|
|
reinterpret_cast<uintptr_t>(info.dli_saddr);
|
|
|
|
int status;
|
|
demangled = abi::__cxa_demangle(name, nullptr, nullptr, &status);
|
|
if (status == 0) {
|
|
name = demangled;
|
|
}
|
|
}
|
|
std::cerr << name;
|
|
if (offset >= 0) {
|
|
std::cerr << "+" << reinterpret_cast<void*>(offset);
|
|
}
|
|
std::cerr << "(" << pc << ")";
|
|
if (path) {
|
|
std::cerr << " in " << path;
|
|
}
|
|
std::cerr << std::endl;
|
|
if (demangled) {
|
|
free(demangled);
|
|
}
|
|
i += 1;
|
|
}
|
|
}
|
|
|
|
// Forwards a signal to the handler described by `action`.
//
// action: previously saved disposition to forward to.
// signum, info, ctx: the arguments our own handler was invoked with.
//
// A disposition with SA_SIGINFO set is called through sa_sigaction with all
// three arguments; otherwise sa_handler is called with `signum` only. Nothing
// is called when the saved disposition is SIG_DFL or SIG_IGN (or a null
// sa_sigaction), because those are sentinel values rather than functions.
void callPreviousSignalHandler(
    struct sigaction* action,
    int signum,
    siginfo_t* info,
    void* ctx) {
  if ((action->sa_flags & SA_SIGINFO) == SA_SIGINFO) {
    if (action->sa_sigaction) {
      action->sa_sigaction(signum, info, ctx);
    }
    return;
  }
  if (action->sa_handler == SIG_DFL || action->sa_handler == SIG_IGN) {
    return;
  }
  action->sa_handler(signum);
}
|
|
|
|
// needsLock signals whether we need to lock our writing mutex.
|
|
void stacktraceSignalHandler(bool needsLock) {
|
|
if (needsLock) {
|
|
pthread_mutex_lock(&writingMutex);
|
|
}
|
|
pid_t tid = syscall(SYS_gettid);
|
|
std::cerr << fatalSignalName << "(" << fatalSignum << "), Thread " << tid
|
|
<< ": " << std::endl;
|
|
printStacktrace();
|
|
std::cerr << std::endl;
|
|
if (needsLock) {
|
|
pthread_mutex_unlock(&writingMutex);
|
|
pthread_cond_signal(&writingCond);
|
|
}
|
|
}
|
|
|
|
// Our fatal signal entry point
|
|
void fatalSignalHandler(int signum) {
|
|
// Check if this is a proper signal that we declared above.
|
|
const char* name = getSignalName(signum);
|
|
if (!name) {
|
|
return;
|
|
}
|
|
if (fatalSignalReceived) {
|
|
return;
|
|
}
|
|
// Set the flag so that our SIGUSR2 handler knows that we're aborting and
|
|
// that it should intercept any SIGUSR2 signal.
|
|
fatalSignalReceived = true;
|
|
// Set state for other threads.
|
|
fatalSignum = signum;
|
|
fatalSignalName = name;
|
|
// Linux doesn't have a nice userland API for enumerating threads so we
|
|
// need to use the proc pseudo-filesystem.
|
|
DIR* procDir = opendir("/proc/self/task");
|
|
if (procDir) {
|
|
pid_t pid = getpid();
|
|
pid_t currentTid = syscall(SYS_gettid);
|
|
struct dirent* entry;
|
|
pthread_mutex_lock(&writingMutex);
|
|
while ((entry = readdir(procDir)) != nullptr) {
|
|
if (entry->d_name[0] == '.') {
|
|
continue;
|
|
}
|
|
pid_t tid = atoi(entry->d_name);
|
|
// If we've found the current thread then we'll jump into the SIGUSR2
|
|
// handler before calling pthread_cond_wait thus deadlocking, so branch
|
|
// our directly to the backtrace handler instead of signaling it.
|
|
if (tid != currentTid) {
|
|
syscall(SYS_tgkill, pid, tid, SIGUSR2);
|
|
pthread_cond_wait(&writingCond, &writingMutex);
|
|
} else {
|
|
stacktraceSignalHandler(false);
|
|
}
|
|
}
|
|
pthread_mutex_unlock(&writingMutex);
|
|
} else {
|
|
perror("Failed to open /proc/self/task");
|
|
}
|
|
printBlobSizes();
|
|
sigaction(signum, getPreviousSigaction(signum), nullptr);
|
|
raise(signum);
|
|
}
|
|
|
|
// Our SIGUSR2 entry point
|
|
void stacktraceSignalHandler(int signum, siginfo_t* info, void* ctx) {
|
|
if (fatalSignalReceived) {
|
|
stacktraceSignalHandler(true);
|
|
} else {
|
|
// We don't want to actually change the signal handler as we want to
|
|
// remain the signal handler so that we may get the usr2 signal later.
|
|
callPreviousSignalHandler(&previousSigusr2, signum, info, ctx);
|
|
}
|
|
}
|
|
|
|
// Installs SIGABRT signal handler so that we get stack traces
|
|
// from every thread on SIGABRT caused exit. Also installs SIGUSR2 handler
|
|
// so that threads can communicate with each other (be sure if you use SIGUSR2)
|
|
// to install your handler before initing caffe2 (we properly fall back to
|
|
// the previous handler if we didn't initiate the SIGUSR2).
|
|
void installFatalSignalHandlers() {
|
|
std::lock_guard<std::mutex> locker(fatalSignalHandlersInstallationMutex);
|
|
if (fatalSignalHandlersInstalled) {
|
|
return;
|
|
}
|
|
fatalSignalHandlersInstalled = true;
|
|
struct sigaction sa;
|
|
sigemptyset(&sa.sa_mask);
|
|
// Since we'll be in an exiting situation it's possible there's memory
|
|
// corruption, so make our own stack just in case.
|
|
sa.sa_flags = SA_ONSTACK | SA_SIGINFO;
|
|
sa.sa_handler = ::fatalSignalHandler;
|
|
for (auto* handler = kSignalHandlers; handler->name != nullptr; handler++) {
|
|
if (sigaction(handler->signum, &sa, &handler->previous)) {
|
|
std::string str("Failed to add ");
|
|
str += handler->name;
|
|
str += " handler!";
|
|
perror(str.c_str());
|
|
}
|
|
}
|
|
sa.sa_sigaction = ::stacktraceSignalHandler;
|
|
if (sigaction(SIGUSR2, &sa, &::previousSigusr2)) {
|
|
perror("Failed to add SIGUSR2 handler!");
|
|
}
|
|
}
|
|
|
|
void uninstallFatalSignalHandlers() {
|
|
std::lock_guard<std::mutex> locker(fatalSignalHandlersInstallationMutex);
|
|
if (!fatalSignalHandlersInstalled) {
|
|
return;
|
|
}
|
|
fatalSignalHandlersInstalled = false;
|
|
for (auto* handler = kSignalHandlers; handler->name != nullptr; handler++) {
|
|
if (sigaction(handler->signum, &handler->previous, nullptr)) {
|
|
std::string str("Failed to remove ");
|
|
str += handler->name;
|
|
str += " handler!";
|
|
perror(str.c_str());
|
|
} else {
|
|
handler->previous = {};
|
|
}
|
|
}
|
|
if (sigaction(SIGUSR2, &::previousSigusr2, nullptr)) {
|
|
perror("Failed to add SIGUSR2 handler!");
|
|
} else {
|
|
::previousSigusr2 = {};
|
|
}
|
|
}
|
|
#endif // defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
|
|
|
|
} // namespace
|
|
|
|
#if defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
|
|
// Boolean flag, false by default. It is consulted by
// Caffe2InitFatalSignalHandler to decide whether the fatal signal handlers
// get installed during initialization.
C10_DEFINE_bool(caffe2_print_stacktraces, false,
    "If set, prints stacktraces when a fatal signal is raised.");
|
|
#endif
|
|
|
|
namespace caffe2 {
|
|
|
|
// Remembers which action to report for SIGINT and for SIGHUP, snapshots the
// current global signal counters so that only signals arriving from now on
// are reported, and makes sure the process-wide handlers are hooked up.
SignalHandler::SignalHandler(
    SignalHandler::Action SIGINT_action,
    SignalHandler::Action SIGHUP_action)
    : SIGINT_action_(SIGINT_action),
      SIGHUP_action_(SIGHUP_action),
      my_sigint_count_(sigintCount.load()),
      my_sighup_count_(sighupCount.load()) {
  hookupHandler();
}
|
|
|
|
// Releases this object's reference on the process-wide SIGHUP/SIGINT hooks.
SignalHandler::~SignalHandler() { unhookHandler(); }
|
|
|
|
// Return true iff a SIGINT has been received since the last time this
|
|
// function was called.
|
|
bool SignalHandler::GotSIGINT() {
|
|
uint64_t count = sigintCount;
|
|
bool result = (count != my_sigint_count_);
|
|
my_sigint_count_ = count;
|
|
return result;
|
|
}
|
|
|
|
// Return true iff a SIGHUP has been received since the last time this
|
|
// function was called.
|
|
bool SignalHandler::GotSIGHUP() {
|
|
uint64_t count = sighupCount;
|
|
bool result = (count != my_sighup_count_);
|
|
my_sighup_count_ = count;
|
|
return result;
|
|
}
|
|
|
|
// Returns the action configured for the first pending signal, checking SIGHUP
// before SIGINT, or Action::NONE when neither has arrived. SIGINT is only
// polled when no SIGHUP was pending, so a SIGINT that arrived together with a
// SIGHUP is reported by a later call.
SignalHandler::Action SignalHandler::CheckForSignals() {
  return GotSIGHUP() ? SIGHUP_action_
      : GotSIGINT()  ? SIGINT_action_
                     : SignalHandler::Action::NONE;
}
|
|
|
|
#if defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
|
|
void setPrintStackTracesOnFatalSignal(bool print) {
|
|
if (print) {
|
|
installFatalSignalHandlers();
|
|
} else {
|
|
uninstallFatalSignalHandlers();
|
|
}
|
|
}
|
|
bool printStackTracesOnFatalSignal() {
|
|
std::lock_guard<std::mutex> locker(fatalSignalHandlersInstallationMutex);
|
|
return fatalSignalHandlersInstalled;
|
|
}
|
|
|
|
namespace internal {
|
|
bool Caffe2InitFatalSignalHandler(int*, char***) {
|
|
if (FLAGS_caffe2_print_stacktraces) {
|
|
setPrintStackTracesOnFatalSignal(true);
|
|
}
|
|
return true;
|
|
}
|
|
|
|
REGISTER_CAFFE2_INIT_FUNCTION(
|
|
Caffe2InitFatalSignalHandler,
|
|
&Caffe2InitFatalSignalHandler,
|
|
"Inits signal handlers for fatal signals so we can see what if"
|
|
" caffe2_print_stacktraces is set.");
|
|
|
|
} // namepsace internal
|
|
#endif // defined(CAFFE2_SUPPORTS_FATAL_SIGNAL_HANDLERS)
|
|
} // namespace caffe2
|
|
|
|
#else // defined(CAFFE2_SUPPORTS_SIGNAL_HANDLER)
|
|
|
|
// TODO: Signal handling is not yet supported on non-Linux platforms - below is
// a minimal implementation that makes things compile.
|
|
namespace caffe2 {
|
|
SignalHandler::SignalHandler(
|
|
SignalHandler::Action SIGINT_action,
|
|
SignalHandler::Action SIGHUP_action) {}
|
|
SignalHandler::~SignalHandler() {}
|
|
bool SignalHandler::GotSIGINT() {
|
|
return false;
|
|
}
|
|
bool SignalHandler::GotSIGHUP() {
|
|
return false;
|
|
}
|
|
SignalHandler::Action SignalHandler::CheckForSignals() {
|
|
return SignalHandler::Action::NONE;
|
|
}
|
|
} // namespace caffe2
|
|
|
|
#endif // defined(CAFFE2_SUPPORTS_SIGNAL_HANDLER)
|