mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-21 05:34:18 +08:00
DebugPlane: add dump_traceback handler (#128904)
This adds a `dump_traceback` handler so you can see the Python stack traces of all running threads for a job. It uses a temporary file as a buffer when calling `faulthandler.dump_traceback` and requires the GIL to be held during dumping. Test plan: `python test/distributed/elastic/test_control_plane.py -v -k traceback` Pull Request resolved: https://github.com/pytorch/pytorch/pull/128904 Approved by: https://github.com/c-p-i-o
This commit is contained in:
committed by
PyTorch MergeBot
parent
17abbafdfc
commit
59b4983dc0
@ -927,6 +927,7 @@ libtorch_python_distributed_sources = libtorch_python_distributed_core_sources +
|
|||||||
"torch/csrc/distributed/rpc/unpickled_python_call.cpp",
|
"torch/csrc/distributed/rpc/unpickled_python_call.cpp",
|
||||||
"torch/csrc/distributed/rpc/unpickled_python_remote_call.cpp",
|
"torch/csrc/distributed/rpc/unpickled_python_remote_call.cpp",
|
||||||
"torch/csrc/jit/runtime/register_distributed_ops.cpp",
|
"torch/csrc/jit/runtime/register_distributed_ops.cpp",
|
||||||
|
"torch/csrc/distributed/c10d/control_plane/PythonHandlers.cpp",
|
||||||
]
|
]
|
||||||
|
|
||||||
def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"):
|
def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"):
|
||||||
|
@ -92,6 +92,12 @@ class WorkerServerTest(TestCase):
|
|||||||
|
|
||||||
server.shutdown()
|
server.shutdown()
|
||||||
|
|
||||||
|
def test_dump_traceback(self) -> None:
    """POST to the dump_traceback handler and check the reply.

    The request must succeed with HTTP 200 and the returned body must
    contain the frame line for this very test function.
    """
    with local_worker_server() as client:
        response = client.request("POST", "/handler/dump_traceback")
        self.assertEqual(response.status, 200)
        self.assertIn(b"in test_dump_traceback\n", response.data)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
run_tests()
|
run_tests()
|
||||||
|
44
torch/csrc/distributed/c10d/control_plane/PythonHandlers.cpp
Normal file
44
torch/csrc/distributed/c10d/control_plane/PythonHandlers.cpp
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
#include <torch/csrc/distributed/c10d/control_plane/Handlers.hpp>
|
||||||
|
|
||||||
|
#include <cstdio>
|
||||||
|
#include <fstream>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
#include <c10/util/tempfile.h>
|
||||||
|
#include <torch/csrc/distributed/c10d/exception.h>
|
||||||
|
#include <torch/csrc/utils/pybind.h>
|
||||||
|
|
||||||
|
namespace c10d::control_plane {
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
// Registers the "dump_traceback" handler.
//
// The handler asks Python's `faulthandler` module to dump tracebacks into a
// temporary file (faulthandler writes to a file descriptor, so the file acts
// as a buffer), then reads the file back and returns its contents as a
// "text/plain" response body.
//
// Throws std::runtime_error if the temporary file cannot be opened for
// writing or re-opened for reading. Any exception raised while importing or
// calling `faulthandler` is propagated after the C stream has been closed.
RegisterHandler tracebackHandler{
    "dump_traceback",
    [](const Request&, Response& res) {
      auto tmpfile = c10::make_tempfile("torch-dump_traceback");

      auto cfile = ::fopen(tmpfile.name.c_str(), "w");
      if (!cfile) {
        throw std::runtime_error("failed to open file for writing");
      }

      try {
        // The GIL must be held while calling into the Python interpreter.
        py::gil_scoped_acquire guard{};

        auto faulthandler = py::module::import("faulthandler");
        // Positional arguments are (file, all_threads); `true` requests the
        // traceback of every thread rather than only the current one.
        faulthandler.attr("dump_traceback")(fileno(cfile), true);
      } catch (...) {
        // Do not leak the stream when the Python call fails.
        ::fclose(cfile);
        throw;
      }

      ::fclose(cfile);

      std::ifstream file(tmpfile.name);
      if (!file.is_open()) {
        // Surface the failure instead of silently returning an empty body.
        throw std::runtime_error("failed to open file for reading");
      }

      std::string str;
      std::string file_contents;
      while (std::getline(file, str)) {
        file_contents += str;
        file_contents.push_back('\n');
      }

      res.setContent(std::move(file_contents), "text/plain");
    }};
|
||||||
|
}
|
||||||
|
} // namespace c10d::control_plane
|
Reference in New Issue
Block a user