mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-21 05:34:18 +08:00
Improve and expose cpp_backtrace to python binding (#84896)
We can now get cpp stack trace by calling torch.utils.get_cpp_backtrace() Sample output when calling from a torch_dispatch stack: ``` <omitting python frames> frame #23: torch::handle_torch_function_no_python_arg_parser(c10::ArrayRef<pybind11::handle>, _object*, _object*, char const*, _object*, char const*, torch::TorchFunctionName) (0x7f69330bab90 in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/utils/python_arg_parser.cpp:323) frame #24: <unknown function> (0x7f6932a09e79 in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/autograd/python_variable.cpp:2252) frame #25: <unknown function> (0x7f69261aee33 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/PythonFallbackKernel.cpp:56) frame #26: <unknown function> (0x7f69261afef9 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/BoxedKernel_impl.h:19) frame #27: c10::BoxedKernel::callBoxed(c10::OperatorHandle const&, c10::DispatchKeySet, std::vector<c10::IValue, std::allocator<c10::IValue> >*) const (0x7f6932aadced in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/BoxedKernel_impl.h:41) frame #28: <unknown function> (0x7f6926fae9b9 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/impl/boxing.h:227) frame #29: at::Tensor c10::Dispatcher::redispatch<at::Tensor, at::Tensor const&>(c10::TypedOperatorHandle<at::Tensor (at::Tensor const&)> const&, c10::DispatchKeySet, at::Tensor const&) const (0x7f6926e821f5 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/KernelFunction_impl.h:106) frame #30: at::_ops::alias::redispatch(c10::DispatchKeySet, at::Tensor const&) (0x7f6927142c31 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/dispatch/Dispatcher.h:438) frame #31: <unknown function> (0x7f692ae4f8be in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/autograd/generated/ADInplaceOrViewType_1.cpp:1361) frame #32: <unknown function> (0x7f692ae4f9b1 in 
/fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/autograd/generated/ADInplaceOrViewType_1.cpp:1362) frame #33: <unknown function> (0x7f692aef77e9 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h:13) frame #34: <unknown function> (0x7f6926fae7d8 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/KernelFunction_impl.h:50) frame #35: at::Tensor c10::Dispatcher::redispatch<at::Tensor, at::Tensor const&>(c10::TypedOperatorHandle<at::Tensor (at::Tensor const&)> const&, c10::DispatchKeySet, at::Tensor const&) const (0x7f6926e821c9 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/KernelFunction_impl.h:97) frame #36: at::_ops::alias::redispatch(c10::DispatchKeySet, at::Tensor const&) (0x7f6927142c31 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/dispatch/Dispatcher.h:438) frame #37: <unknown function> (0x7f6929ec654a in /fsx/users/bahuang/repos/pytorch_fsx/build/aten/src/ATen/RedispatchFunctions.h:10697) frame #38: <unknown function> (0x7f6929d9edae in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/autograd/generated/VariableType_1.cpp:2837) frame #39: <unknown function> (0x7f6929d9f043 in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/autograd/generated/VariableType_1.cpp:2838) frame #40: <unknown function> (0x7f6929e7d2f9 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/impl/WrapFunctionIntoFunctor.h:13) frame #41: <unknown function> (0x7f6929eb1344 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h:478) frame #42: <unknown function> (0x7f6929ea7b99 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h:490) frame #43: <unknown function> (0x7f6929e7d370 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h:563) frame #44: <unknown function> (0x7f6929e7d43a in 
/fsx/users/bahuang/repos/pytorch_fsx/c10/util/C++17.h:239) frame #45: <unknown function> (0x7f6929e7d48c in /fsx/users/bahuang/repos/pytorch_fsx/c10/util/C++17.h:364) frame #46: <unknown function> (0x7f6929e7d50a in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h:554) frame #47: c10::BoxedKernel::callBoxed(c10::OperatorHandle const&, c10::DispatchKeySet, std::vector<c10::IValue, std::allocator<c10::IValue> >*) const (0x7f6932aadced in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/BoxedKernel_impl.h:41) frame #48: c10::KernelFunction::callBoxed(c10::OperatorHandle const&, c10::DispatchKeySet, std::vector<c10::IValue, std::allocator<c10::IValue> >*) const (0x7f6932aadd26 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/KernelFunction_impl.h:43) frame #49: c10::Dispatcher::redispatchBoxed(c10::OperatorHandle const&, c10::DispatchKeySet, std::vector<c10::IValue, std::allocator<c10::IValue> >*) const (0x7f692603890a in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/dispatch/Dispatcher.h:652) frame #50: <unknown function> (0x7f69260387f9 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/dispatch/Dispatcher.h:388) frame #51: <unknown function> (0x7f69261af0ef in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/PythonFallbackKernel.cpp:96) frame #52: <unknown function> (0x7f69261aff2b in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/BoxedKernel_impl.h:25) frame #53: c10::BoxedKernel::callBoxed(c10::OperatorHandle const&, c10::DispatchKeySet, std::vector<c10::IValue, std::allocator<c10::IValue> >*) const (0x7f6932aadced in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/BoxedKernel_impl.h:41) frame #54: c10::KernelFunction::callBoxed(c10::OperatorHandle const&, c10::DispatchKeySet, std::vector<c10::IValue, std::allocator<c10::IValue> >*) const (0x7f6932aadd26 in 
/fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/boxing/KernelFunction_impl.h:43) frame #55: c10::Dispatcher::callBoxed(c10::OperatorHandle const&, std::vector<c10::IValue, std::allocator<c10::IValue> >*) const (0x7f6925fd6ab2 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/dispatch/Dispatcher.h:628) frame #56: <unknown function> (0x7f6925fd6690 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/dispatch/Dispatcher.h:376) frame #57: <unknown function> (0x7f692bf5b525 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/dispatch/Dispatcher.h:380) frame #58: <unknown function> (0x7f692bf59fac in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/jit/runtime/register_c10_ops.cpp:15) frame #59: <unknown function> (0x7f692bf5af41 in /usr/include/c++/7/bits/std_function.h:316) frame #60: std::function<void (std::vector<c10::IValue, std::allocator<c10::IValue> >&)>::operator()(std::vector<c10::IValue, std::allocator<c10::IValue> >&) const (0x7f6932ab9a0f in /usr/include/c++/7/bits/std_function.h:706) frame #61: <unknown function> (0x7f6932aad541 in /fsx/users/bahuang/repos/pytorch_fsx/aten/src/ATen/core/stack.h:41) frame #62: <unknown function> (0x7f6932ab3102 in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/jit/python/pybind_utils.h:1206 (discriminator 1)) frame #63: <unknown function> (0x7f6932ab3943 in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/jit/python/pybind_utils.h:1272) frame #64: <unknown function> (0x7f6932a46120 in /fsx/users/bahuang/repos/pytorch_fsx/torch/csrc/jit/python/init.cpp:1767) frame #65: <unknown function> (0x7f6932a997be in /fsx/users/bahuang/repos/pytorch_fsx/third_party/pybind11/include/pybind11/cast.h:1441) frame #66: <unknown function> (0x7f6932a8a985 in /fsx/users/bahuang/repos/pytorch_fsx/third_party/pybind11/include/pybind11/cast.h:1410) frame #67: <unknown function> (0x7f6932a66e1e in /fsx/users/bahuang/repos/pytorch_fsx/third_party/pybind11/include/pybind11/pybind11.h:249) frame #68: <unknown function> 
(0x7f6932a66ec2 in /fsx/users/bahuang/repos/pytorch_fsx/third_party/pybind11/include/pybind11/pybind11.h:224) frame #69: <unknown function> (0x7f6932473111 in /fsx/users/bahuang/repos/pytorch_fsx/third_party/pybind11/include/pybind11/pybind11.h:929) frame #104: __libc_start_main (0x7f693485dc87 in /build/glibc-uZu3wS/glibc-2.27/csu/../csu/libc-start.c:310) ``` Pull Request resolved: https://github.com/pytorch/pytorch/pull/84896 Approved by: https://github.com/ezyang
This commit is contained in:
committed by
PyTorch MergeBot
parent
52fd7e491b
commit
73fbca1ea6
@ -924,6 +924,7 @@ add-auto-load-safe-path /path/to/pytorch/.gdbinit
|
||||
|
||||
### C++ stacktraces
|
||||
Set `TORCH_SHOW_CPP_STACKTRACES=1` to get the C++ stacktrace when an error occurs in Python.
|
||||
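Set `TORCH_SHOW_CPP_STACKTRACES_WITH_LINENO=1` to get the C++ stacktrace with the source file and line number of each non-Python frame. The lookup is done by invoking `addr2line`, and it is not available on macOS or Android builds.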
|
||||
|
||||
## CUDA development tips
|
||||
|
||||
|
@ -1,6 +1,7 @@
|
||||
#include <c10/util/Backtrace.h>
|
||||
#include <c10/util/Optional.h>
|
||||
#include <c10/util/Type.h>
|
||||
#include <c10/util/env.h>
|
||||
#include <c10/util/irange.h>
|
||||
|
||||
#include <functional>
|
||||
@ -21,7 +22,14 @@
|
||||
#include <dlfcn.h>
|
||||
#include <unwind.h>
|
||||
#else
|
||||
#include <dlfcn.h>
|
||||
#include <execinfo.h>
|
||||
|
||||
#ifndef __APPLE__
|
||||
// link.h is not available on IOS and Mac builds
|
||||
#include <link.h>
|
||||
#endif
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@ -87,6 +95,46 @@ void dump_stack(
|
||||
#if SUPPORTS_BACKTRACE
|
||||
namespace {
|
||||
|
||||
#if !defined(C10_ANDROID) && !defined(__APPLE__)
|
||||
|
||||
// Converts an in-memory address into the address addr2line expects: the
// address relative to the load base (`l_addr`) of the object file that
// contains it. This file calls that relative address the "VMA" address.
//
// `dladdr1` returns 0 when the address cannot be matched to a loaded object
// and then does not fill in the link map. In that case the address is
// returned unchanged rather than dereferencing an unset pointer.
size_t ConvertToVMA(size_t addr) {
  Dl_info info;
  link_map* map = nullptr;
  const int matched = dladdr1(
      reinterpret_cast<void*>(addr),
      &info,
      reinterpret_cast<void**>(&map),
      RTLD_DL_LINKMAP);
  if (matched == 0 || map == nullptr) {
    return addr;
  }
  return addr - map->l_addr;
}
|
||||
|
||||
// Runs `cmd` via popen() in read mode and returns everything the command
// wrote to its standard output. Throws std::runtime_error if the pipe cannot
// be opened. The pipe is closed with pclose() when the function exits.
std::string exec(const char* cmd) {
  std::unique_ptr<FILE, decltype(&pclose)> stream(popen(cmd, "r"), pclose);
  if (stream == nullptr) {
    throw std::runtime_error("popen() failed!");
  }

  std::string output;
  std::array<char, 128> chunk;
  for (;;) {
    // fgets NUL-terminates what it reads, so each chunk appends as a C string.
    if (fgets(chunk.data(), chunk.size(), stream.get()) == nullptr) {
      break;
    }
    output += chunk.data();
  }
  return output;
}
|
||||
|
||||
// Returns a copy of `s` with trailing whitespace removed. Whitespace here is
// any of: space, newline, carriage return, tab, form feed, vertical tab.
// A string consisting only of whitespace (or an empty string) yields "".
std::string rstrip(const std::string& s) {
  const std::string WHITESPACE = " \n\r\t\f\v";
  const size_t last_kept = s.find_last_not_of(WHITESPACE);
  if (last_kept == std::string::npos) {
    return "";
  }
  return s.substr(0, last_kept + 1);
}
|
||||
|
||||
bool use_addr2line() {
|
||||
static bool _use_addr2line = []() {
|
||||
return c10::utils::check_env("TORCH_SHOW_CPP_STACKTRACES_WITH_LINENO") ==
|
||||
true;
|
||||
}();
|
||||
return _use_addr2line;
|
||||
}
|
||||
|
||||
#endif // !defined(C10_ANDROID) && !defined(__APPLE__)
|
||||
|
||||
struct FrameInformation {
|
||||
/// If available, the demangled name of the function at this frame, else
|
||||
/// whatever (possibly mangled) name we got from `backtrace()`.
|
||||
@ -99,6 +147,10 @@ struct FrameInformation {
|
||||
/// NOTE: In debugger parlance, the "object file" refers to the ELF file that
|
||||
/// the symbol originates from, i.e. either an executable or a library.
|
||||
std::string object_file;
|
||||
/// Source file name and line number
|
||||
std::string source_file_lineno;
|
||||
|
||||
bool is_python_frame;
|
||||
};
|
||||
|
||||
#ifndef C10_ANDROID
|
||||
@ -108,7 +160,8 @@ bool is_python_frame(const FrameInformation& frame) {
|
||||
}
|
||||
|
||||
c10::optional<FrameInformation> parse_frame_information(
|
||||
const std::string& frame_string) {
|
||||
const std::string& frame_string,
|
||||
void* frame_pointer) {
|
||||
FrameInformation frame;
|
||||
|
||||
// This is the function name in the CXX ABI mangled format, e.g. something
|
||||
@ -141,6 +194,7 @@ c10::optional<FrameInformation> parse_frame_information(
|
||||
frame.object_file = frame_string.substr(0, function_name_start - 1);
|
||||
frame.offset_into_function =
|
||||
frame_string.substr(offset_start, offset_end - offset_start);
|
||||
frame.is_python_frame = is_python_frame(frame);
|
||||
|
||||
// NOTE: We don't need to parse the return address because
|
||||
// we already have it from the call to `backtrace()`.
|
||||
@ -171,6 +225,30 @@ c10::optional<FrameInformation> parse_frame_information(
|
||||
}
|
||||
|
||||
frame.function_name = demangle(mangled_function_name.c_str());
|
||||
|
||||
#if !defined(__APPLE__)
|
||||
|
||||
if (use_addr2line() && !frame.is_python_frame) {
|
||||
Dl_info info;
|
||||
if (dladdr(frame_pointer, &info)) {
|
||||
char command[256];
|
||||
size_t VMA_addr = ConvertToVMA((size_t)frame_pointer);
|
||||
// Need to decrease the VMA address by 1 to get the correct line number
|
||||
// https://stackoverflow.com/questions/11579509/wrong-line-numbers-from-addr2line/63841497#63841497
|
||||
VMA_addr -= 1;
|
||||
snprintf(
|
||||
command,
|
||||
sizeof(command),
|
||||
"addr2line -e %s -C %zx",
|
||||
info.dli_fname,
|
||||
VMA_addr);
|
||||
|
||||
frame.source_file_lineno = rstrip(exec(command));
|
||||
}
|
||||
}
|
||||
|
||||
#endif // !defined(__APPLE__)
|
||||
|
||||
return frame;
|
||||
}
|
||||
#endif /* !defined(C10_ANDROID) */
|
||||
@ -283,9 +361,10 @@ std::string get_backtrace(
|
||||
bool has_skipped_python_frames = false;
|
||||
|
||||
for (const auto frame_number : c10::irange(callstack.size())) {
|
||||
const auto frame = parse_frame_information(symbols[frame_number]);
|
||||
const auto frame =
|
||||
parse_frame_information(symbols[frame_number], callstack[frame_number]);
|
||||
|
||||
if (skip_python_frames && frame && is_python_frame(*frame)) {
|
||||
if (skip_python_frames && frame && frame->is_python_frame) {
|
||||
if (!has_skipped_python_frames) {
|
||||
stream << "<omitting python frames>\n";
|
||||
has_skipped_python_frames = true;
|
||||
@ -297,10 +376,17 @@ std::string get_backtrace(
|
||||
stream << "frame #" << frame_number << ": ";
|
||||
|
||||
if (frame) {
|
||||
if (frame->source_file_lineno.empty()) {
|
||||
// <function_name> + <offset> (<return-address> in <object-file>)
|
||||
stream << frame->function_name << " + " << frame->offset_into_function
|
||||
<< " (" << callstack[frame_number] << " in " << frame->object_file
|
||||
<< ")\n";
|
||||
<< " (" << callstack[frame_number] << " in "
|
||||
<< frame->object_file << ")\n";
|
||||
|
||||
} else {
|
||||
// <function_name> (<return-address> in <filename>:<line-number>)
|
||||
stream << frame->function_name << " (" << callstack[frame_number]
|
||||
<< " in " << frame->source_file_lineno << ")\n";
|
||||
}
|
||||
} else {
|
||||
// In the edge-case where we couldn't parse the frame string, we can
|
||||
// just use it directly (it may have a different format).
|
||||
|
@ -843,6 +843,7 @@ def _remove_meta_from_tls_dispatch_include() -> None: ...
|
||||
# https://code.activestate.com/lists/python-dev/139675/
|
||||
def _to_dlpack(data: Tensor) -> Any: ... # THPModule_toDLPack
|
||||
def _from_dlpack(data: Any) -> Tensor: ... # THPModule_fromDLPack
|
||||
def _get_cpp_backtrace(frames_to_skip: _int, maximum_number_of_frames: _int) -> str: ...  # THModule_getCppBacktrace
|
||||
def set_flush_denormal(arg: _bool) -> _bool: ... # THPModule_setFlushDenormal
|
||||
def get_default_dtype() -> _dtype: ... # THPModule_getDefaultDtype
|
||||
def _get_default_device() -> str: ... # THPModule_getDefaultDevice
|
||||
|
@ -425,6 +425,19 @@ PyObject* THPModule_fromDLPack(PyObject* _unused, PyObject* data) {
|
||||
END_HANDLE_TH_ERRORS
|
||||
}
|
||||
|
||||
// Python entry point registered as `_get_cpp_backtrace` (METH_VARARGS).
//
// Expects two integer arguments, `frames_to_skip` and
// `maximum_number_of_frames`, and returns the string produced by
// `c10::get_backtrace` as a Python string. Returns nullptr with a Python
// error set if the arguments cannot be parsed or if either one is negative.
PyObject* THModule_getCppBacktrace(PyObject* _unused, PyObject* args) {
  HANDLE_TH_ERRORS
  // The "L" format unit stores into a `long long`, so the parse targets must
  // have exactly that type; they are converted to size_t only after
  // validation.
  long long frames_to_skip = 0;
  long long maximum_number_of_frames = 0;
  if (!PyArg_ParseTuple(
          args, "LL", &frames_to_skip, &maximum_number_of_frames)) {
    return nullptr;
  }
  // A negative value would otherwise wrap around to an enormous unsigned
  // count when converted to size_t.
  if (frames_to_skip < 0 || maximum_number_of_frames < 0) {
    PyErr_SetString(
        PyExc_ValueError,
        "frames_to_skip and maximum_number_of_frames must be non-negative");
    return nullptr;
  }
  // NOTE(review): the third argument is presumably `skip_python_frames`;
  // confirm against the declaration of c10::get_backtrace.
  return THPUtils_packString(c10::get_backtrace(
      static_cast<size_t>(frames_to_skip),
      static_cast<size_t>(maximum_number_of_frames),
      true));
  END_HANDLE_TH_ERRORS
}
|
||||
|
||||
PyObject* THPModule_setAllowTF32CuDNN(PyObject* _unused, PyObject* arg) {
|
||||
THPUtils_assert(
|
||||
PyBool_Check(arg),
|
||||
@ -866,6 +879,7 @@ static PyMethodDef TorchMethods[] = {
|
||||
nullptr},
|
||||
{"_to_dlpack", THPModule_toDLPack, METH_O, nullptr},
|
||||
{"_from_dlpack", THPModule_fromDLPack, METH_O, nullptr},
|
||||
{"_get_cpp_backtrace", THModule_getCppBacktrace, METH_VARARGS, nullptr},
|
||||
{"set_flush_denormal", THPModule_setFlushDenormal, METH_O, nullptr},
|
||||
{"get_default_dtype", THPModule_getDefaultDtype, METH_NOARGS, nullptr},
|
||||
{"_get_default_device", THPModule_getDefaultDevice, METH_NOARGS, nullptr},
|
||||
|
@ -3,6 +3,7 @@ import sys
|
||||
|
||||
from .throughput_benchmark import ThroughputBenchmark
|
||||
from ._crash_handler import enable_minidumps, disable_minidumps, enable_minidumps_on_exceptions
|
||||
from .cpp_backtrace import get_cpp_backtrace
|
||||
|
||||
# Set the module for a given object for nicer printing
|
||||
def set_module(obj, mod):
|
||||
|
11
torch/utils/cpp_backtrace.py
Normal file
11
torch/utils/cpp_backtrace.py
Normal file
@ -0,0 +1,11 @@
|
||||
from torch._C import _get_cpp_backtrace
|
||||
|
||||
def get_cpp_backtrace(frames_to_skip=0, maximum_number_of_frames=64) -> str:
    r"""
    Capture the C++ stack trace of the current thread as text.

    Args:
        frames_to_skip (int): how many frames to drop from the top of the stack
        maximum_number_of_frames (int): upper bound on the number of frames reported

    Returns:
        str: the stack trace as produced by ``torch._C._get_cpp_backtrace``
    """
    # The binding is registered with METH_VARARGS, so pass both positionally.
    backtrace = _get_cpp_backtrace(frames_to_skip, maximum_number_of_frames)
    return backtrace
|
Reference in New Issue
Block a user