mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-21 05:34:18 +08:00
DebugPlane: add dump_traceback handler (#128904)
This adds a `dump_traceback` handler so you can see the Python tracebacks of all running threads for a job. The handler uses a temporary file as a buffer when calling `faulthandler.dump_traceback`, and the GIL must be held while the traceback is dumped. Test plan: `python test/distributed/elastic/test_control_plane.py -v -k traceback`. Pull Request resolved: https://github.com/pytorch/pytorch/pull/128904 Approved by: https://github.com/c-p-i-o
This commit is contained in:
committed by
PyTorch MergeBot
parent
17abbafdfc
commit
59b4983dc0
@ -927,6 +927,7 @@ libtorch_python_distributed_sources = libtorch_python_distributed_core_sources +
|
||||
"torch/csrc/distributed/rpc/unpickled_python_call.cpp",
|
||||
"torch/csrc/distributed/rpc/unpickled_python_remote_call.cpp",
|
||||
"torch/csrc/jit/runtime/register_distributed_ops.cpp",
|
||||
"torch/csrc/distributed/c10d/control_plane/PythonHandlers.cpp",
|
||||
]
|
||||
|
||||
def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"):
|
||||
|
@ -92,6 +92,12 @@ class WorkerServerTest(TestCase):
|
||||
|
||||
server.shutdown()
|
||||
|
||||
def test_dump_traceback(self) -> None:
    """POST to the dump_traceback handler and check the reply.

    The request must succeed with HTTP 200 and the body must contain the
    frame line for this very test function.
    """
    expected_frame = b"in test_dump_traceback\n"
    with local_worker_server() as pool:
        response = pool.request("POST", "/handler/dump_traceback")
        self.assertEqual(response.status, 200)
        self.assertIn(expected_frame, response.data)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
run_tests()
|
||||
|
44
torch/csrc/distributed/c10d/control_plane/PythonHandlers.cpp
Normal file
44
torch/csrc/distributed/c10d/control_plane/PythonHandlers.cpp
Normal file
@ -0,0 +1,44 @@
|
||||
#include <torch/csrc/distributed/c10d/control_plane/Handlers.hpp>
|
||||
|
||||
#include <cstdio>
|
||||
#include <fstream>
|
||||
#include <string>
|
||||
|
||||
#include <c10/util/tempfile.h>
|
||||
#include <torch/csrc/distributed/c10d/exception.h>
|
||||
#include <torch/csrc/utils/pybind.h>
|
||||
|
||||
namespace c10d::control_plane {
|
||||
namespace {
|
||||
|
||||
// Registers the "dump_traceback" control-plane handler, which replies with a
// Python traceback dump as text/plain.
//
// faulthandler.dump_traceback writes to a file descriptor, so a temporary file
// is used as the buffer: the traceback is dumped into it while the GIL is
// held, then the file is read back into the response body.
//
// Throws std::runtime_error if the temporary file cannot be opened for writing
// or for reading. Exceptions raised by the faulthandler call are not caught
// here.
RegisterHandler tracebackHandler{
    "dump_traceback",
    [](const Request&, Response& res) {
      // Closes the C stream when it goes out of scope, including when the
      // Python call below throws, so the FILE handle is never leaked.
      struct FileCloser {
        explicit FileCloser(FILE* f) : file(f) {}
        ~FileCloser() {
          ::fclose(file);
        }
        FileCloser(const FileCloser&) = delete;
        FileCloser& operator=(const FileCloser&) = delete;
        FILE* file;
      };

      auto tmpfile = c10::make_tempfile("torch-dump_traceback");

      {
        auto cfile = ::fopen(tmpfile.name.c_str(), "w");
        if (!cfile) {
          throw std::runtime_error("failed to open file for writing");
        }
        // Declared before the GIL guard so the stream is closed only after
        // the Python objects are destroyed and the GIL is released.
        FileCloser closer{cfile};

        py::gil_scoped_acquire guard{};

        auto faulthandler = py::module::import("faulthandler");
        // Second positional argument is faulthandler's all_threads flag.
        faulthandler.attr("dump_traceback")(fileno(cfile), true);
      }

      // The write stream is closed at this point; read the dump back.
      std::ifstream file(tmpfile.name);
      if (!file) {
        throw std::runtime_error("failed to open file for reading");
      }

      std::string file_contents;
      for (std::string line; std::getline(file, line);) {
        file_contents += line;
        file_contents.push_back('\n');
      }

      res.setContent(std::move(file_contents), "text/plain");
    }};
|
||||
}
|
||||
} // namespace c10d::control_plane
|
Reference in New Issue
Block a user