DebugPlane: add dump_traceback handler (#128904)

This adds a `dump_traceback` handler so you can see the Python tracebacks of all running threads for a job. It uses a temporary file as a buffer when calling `faulthandler.dump_traceback`, and the GIL must be held while the dump is written.

Test plan:

```
python test/distributed/elastic/test_control_plane.py -v -k traceback
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/128904
Approved by: https://github.com/c-p-i-o
This commit is contained in:
Tristan Rice
2024-06-18 03:40:14 +00:00
committed by PyTorch MergeBot
parent 17abbafdfc
commit 59b4983dc0
3 changed files with 51 additions and 0 deletions

View File

@ -927,6 +927,7 @@ libtorch_python_distributed_sources = libtorch_python_distributed_core_sources +
"torch/csrc/distributed/rpc/unpickled_python_call.cpp", "torch/csrc/distributed/rpc/unpickled_python_call.cpp",
"torch/csrc/distributed/rpc/unpickled_python_remote_call.cpp", "torch/csrc/distributed/rpc/unpickled_python_remote_call.cpp",
"torch/csrc/jit/runtime/register_distributed_ops.cpp", "torch/csrc/jit/runtime/register_distributed_ops.cpp",
"torch/csrc/distributed/c10d/control_plane/PythonHandlers.cpp",
] ]
def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"): def glob_libtorch_python_sources(gencode_pattern = ":generate-code[{}]"):

View File

@ -92,6 +92,12 @@ class WorkerServerTest(TestCase):
server.shutdown() server.shutdown()
def test_dump_traceback(self) -> None:
    """POST to the dump_traceback handler and check this test's own frame is in the reply."""
    endpoint = "/handler/dump_traceback"
    # faulthandler-style frame line for the currently running test method.
    expected_frame = b"in test_dump_traceback\n"
    with local_worker_server() as pool:
        response = pool.request("POST", endpoint)
        self.assertEqual(response.status, 200)
        self.assertIn(expected_frame, response.data)
if __name__ == "__main__": if __name__ == "__main__":
run_tests() run_tests()

View File

@ -0,0 +1,44 @@
#include <torch/csrc/distributed/c10d/control_plane/Handlers.hpp>
#include <cstdio>
#include <fstream>
#include <memory>
#include <stdexcept>
#include <string>
#include <c10/util/tempfile.h>
#include <torch/csrc/distributed/c10d/exception.h>
#include <torch/csrc/utils/pybind.h>
namespace c10d::control_plane {
namespace {

// Deleter that lets std::unique_ptr own a C stdio stream, so the stream is
// closed on every exit path, including when an exception propagates.
struct FileCloser {
  void operator()(FILE* stream) const noexcept {
    ::fclose(stream);
  }
};

// Registers the "dump_traceback" handler.
//
// The handler asks Python's `faulthandler.dump_traceback` to write the
// tracebacks of all threads into a temporary file, then returns the file's
// contents as a "text/plain" response body.
//
// Throws std::runtime_error if the temporary file cannot be opened for
// writing or re-opened for reading. Exceptions raised by the Python calls
// propagate to the caller; the stdio stream is still closed in that case.
RegisterHandler tracebackHandler{
    "dump_traceback",
    [](const Request&, Response& res) {
      auto tmpfile = c10::make_tempfile("torch-dump_traceback");

      // Owned by unique_ptr so a throwing Python call below cannot leak it.
      std::unique_ptr<FILE, FileCloser> cfile{
          ::fopen(tmpfile.name.c_str(), "w")};
      if (!cfile) {
        throw std::runtime_error("failed to open file for writing");
      }

      {
        // The GIL must be held while calling into the Python runtime.
        py::gil_scoped_acquire guard{};
        auto faulthandler = py::module::import("faulthandler");
        // Arguments: target file descriptor, all_threads=true.
        faulthandler.attr("dump_traceback")(fileno(cfile.get()), true);
      }

      // Close the write stream before re-opening the file for reading.
      cfile.reset();

      std::ifstream file(tmpfile.name);
      if (!file) {
        throw std::runtime_error("failed to open file for reading");
      }

      // Copy the dump line by line; every line is terminated with '\n'.
      std::string line;
      std::string file_contents;
      while (std::getline(file, line)) {
        file_contents += line;
        file_contents.push_back('\n');
      }

      res.setContent(std::move(file_contents), "text/plain");
    }};

} // namespace
} // namespace c10d::control_plane