From e8a9d088c694a49c414152ef6ea28b475e8f85f2 Mon Sep 17 00:00:00 2001
From: Nikita Shulga
Date: Fri, 29 Dec 2023 05:15:35 +0000
Subject: [PATCH] [DevX] Add tool and doc on partial debug builds (#116521)

Turned the command sequence mentioned in
https://dev-discuss.pytorch.org/t/how-to-get-a-fast-debug-build/1597
and in various discussions into a tool that I use almost daily to debug
crashes or correctness issues in the codebase.

Essentially, it allows one to turn this:
```
Process 87729 stopped
* thread #1, queue = 'com.apple.main-thread', stop reason = breakpoint 1.1
    frame #0: 0x00000001023d55a8 libtorch_python.dylib`at::indexing::impl::applySelect(at::Tensor const&, long long, c10::SymInt, long long, c10::Device const&, std::__1::optional> const&)
libtorch_python.dylib`at::indexing::impl::applySelect:
->  0x1023d55a8 <+0>:  sub sp, sp, #0xd0
    0x1023d55ac <+4>:  stp x24, x23, [sp, #0x90]
    0x1023d55b0 <+8>:  stp x22, x21, [sp, #0xa0]
    0x1023d55b4 <+12>: stp x20, x19, [sp, #0xb0]
```
into this:
```
Process 87741 stopped
* thread #1, queue = 'com.apple.main-thread', stop reason = breakpoint 1.1
    frame #0: 0x00000001024e2628 libtorch_python.dylib`at::indexing::impl::applySelect(self=0x00000001004ee8a8, dim=0, index=(data_ = 3), real_dim=0, (null)=0x000000016fdfe535, self_sizes= Has Value=true ) at TensorIndexing.h:239:7
   236      const at::Device& /*self_device*/,
   237      const c10::optional& self_sizes) {
   238    // See NOTE [nested tensor size for indexing]
-> 239    if (self_sizes.has_value()) {
   240      auto maybe_index = index.maybe_as_int();
   241      if (maybe_index.has_value()) {
   242        TORCH_CHECK_INDEX(
```
while retaining good performance for the rest of the codebase.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/116521
Approved by: https://github.com/atalman
---
 CONTRIBUTING.md             |  61 +++++++++++++++++++
 tools/build_with_debinfo.py | 115 ++++++++++++++++++++++++++++++++++++
 2 files changed, 176 insertions(+)
 create mode 100755 tools/build_with_debinfo.py

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 38ffa5977e0e..270e20a0b99b 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -41,6 +41,7 @@ aspects of contributing to PyTorch.
 - [Use a faster linker](#use-a-faster-linker)
 - [Use pre-compiled headers](#use-pre-compiled-headers)
 - [Workaround for header dependency bug in nvcc](#workaround-for-header-dependency-bug-in-nvcc)
+ - [Rebuild a few files with debug information](#rebuild-a-few-files-with-debug-information)
 - [C++ frontend development tips](#c-frontend-development-tips)
 - [GDB integration](#gdb-integration)
 - [C++ stacktraces](#c-stacktraces)
@@ -811,6 +812,66 @@ export CMAKE_CUDA_COMPILER_LAUNCHER="python;`pwd`/tools/nvcc_fix_deps.py;ccache"
 python setup.py develop
 ```
 
+### Rebuild a few files with debug information
+
+While debugging a problem, one often has to maintain a debug build in a separate folder.
+But often only a few files need to be rebuilt with debug info to get a symbolicated backtrace or to enable source debugging.
+One can easily solve this with the help of `tools/build_with_debinfo.py`.
+
+For example, suppose one wants to debug what is going on while a tensor index is selected, which can be achieved by setting a breakpoint at the `applySelect` function:
+```
+% lldb -o "b applySelect" -o "process launch" -- python3 -c "import torch;print(torch.rand(5)[3])"
+(lldb) target create "python"
+Current executable set to '/usr/bin/python3' (arm64).
+(lldb) settings set -- target.run-args "-c" "import torch;print(torch.rand(5)[3])"
+(lldb) b applySelect
+Breakpoint 1: no locations (pending).
+WARNING: Unable to resolve breakpoint to any actual locations.
+(lldb) process launch
+2 locations added to breakpoint 1
+Process 87729 stopped
+* thread #1, queue = 'com.apple.main-thread', stop reason = breakpoint 1.1
+    frame #0: 0x00000001023d55a8 libtorch_python.dylib`at::indexing::impl::applySelect(at::Tensor const&, long long, c10::SymInt, long long, c10::Device const&, std::__1::optional> const&)
+libtorch_python.dylib`at::indexing::impl::applySelect:
+->  0x1023d55a8 <+0>:  sub sp, sp, #0xd0
+    0x1023d55ac <+4>:  stp x24, x23, [sp, #0x90]
+    0x1023d55b0 <+8>:  stp x22, x21, [sp, #0xa0]
+    0x1023d55b4 <+12>: stp x20, x19, [sp, #0xb0]
+Target 0: (python) stopped.
+Process 87729 launched: '/usr/bin/python' (arm64)
+```
+This is not very informative, but it can easily be remedied by rebuilding `python_variable_indexing.cpp` with debug information:
+```
+% ./tools/build_with_debinfo.py torch/csrc/autograd/python_variable_indexing.cpp
+[1 / 2] Building caffe2/torch/CMakeFiles/torch_python.dir/csrc/autograd/python_variable_indexing.cpp.o
+[2 / 2] Building lib/libtorch_python.dylib
+```
+And afterwards:
+```
+% lldb -o "b applySelect" -o "process launch" -- python3 -c "import torch;print(torch.rand(5)[3])"
+(lldb) target create "python"
+Current executable set to '/usr/bin/python3' (arm64).
+(lldb) settings set -- target.run-args "-c" "import torch;print(torch.rand(5)[3])"
+(lldb) b applySelect
+Breakpoint 1: no locations (pending).
+WARNING: Unable to resolve breakpoint to any actual locations.
+(lldb) process launch
+2 locations added to breakpoint 1
+Process 87741 stopped
+* thread #1, queue = 'com.apple.main-thread', stop reason = breakpoint 1.1
+    frame #0: 0x00000001024e2628 libtorch_python.dylib`at::indexing::impl::applySelect(self=0x00000001004ee8a8, dim=0, index=(data_ = 3), real_dim=0, (null)=0x000000016fdfe535, self_sizes= Has Value=true ) at TensorIndexing.h:239:7
+   236      const at::Device& /*self_device*/,
+   237      const c10::optional& self_sizes) {
+   238    // See NOTE [nested tensor size for indexing]
+-> 239    if (self_sizes.has_value()) {
+   240      auto maybe_index = index.maybe_as_int();
+   241      if (maybe_index.has_value()) {
+   242        TORCH_CHECK_INDEX(
+Target 0: (python) stopped.
+Process 87741 launched: '/usr/bin/python3' (arm64)
+```
+This is much more useful, isn't it?
+
 ### C++ frontend development tips
 
 We have very extensive tests in the [test/cpp/api](test/cpp/api) folder. The
diff --git a/tools/build_with_debinfo.py b/tools/build_with_debinfo.py
new file mode 100755
index 000000000000..0f2bc9b7379c
--- /dev/null
+++ b/tools/build_with_debinfo.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+# Tool to quickly rebuild one or two files with debug info
+# Mimics the following behavior:
+# - touch file
+# - ninja -j1 -v -n torch_python | sed -e 's/-O[23]/-g/g' -e 's#\[[0-9]\+\/[0-9]\+\] \+##' |sh
+# - Copy libs from build/lib to torch/lib folder
+
+import subprocess
+import sys
+from pathlib import Path
+from typing import Any, List, Optional, Tuple
+
+PYTORCH_ROOTDIR = Path(__file__).resolve().parent.parent
+TORCH_DIR = PYTORCH_ROOTDIR / "torch"
+TORCH_LIB_DIR = TORCH_DIR / "lib"
+BUILD_DIR = PYTORCH_ROOTDIR / "build"
+BUILD_LIB_DIR = BUILD_DIR / "lib"
+
+
+def check_output(args: List[str], cwd: Optional[str] = None) -> str:
+    return subprocess.check_output(args, cwd=cwd).decode("utf-8")
+
+
+def parse_args() -> Any:
+    from argparse import ArgumentParser
+
+    parser = ArgumentParser(description="Incremental build PyTorch with debinfo")
+    parser.add_argument("--verbose", action="store_true")
+    parser.add_argument("files", nargs="?", action="append")
+    return parser.parse_args()
+
+
+def get_lib_extension() -> str:
+    if sys.platform == "linux":
+        return "so"
+    if sys.platform == "darwin":
+        return "dylib"
+    raise RuntimeError(f"Unsupported platform {sys.platform}")
+
+
+def create_symlinks() -> None:
+    """Creates symlinks from build/lib to torch/lib"""
+    if not TORCH_LIB_DIR.exists():
+        raise RuntimeError(f"Can't create symlinks as {TORCH_LIB_DIR} does not exist")
+    if not BUILD_LIB_DIR.exists():
+        raise RuntimeError(f"Can't create symlinks as {BUILD_LIB_DIR} does not exist")
+    for torch_lib in TORCH_LIB_DIR.glob(f"*.{get_lib_extension()}"):
+        if torch_lib.is_symlink():
+            continue
+        build_lib = BUILD_LIB_DIR / torch_lib.name
+        if not build_lib.exists():
+            raise RuntimeError(f"Can't find {build_lib} corresponding to {torch_lib}")
+        torch_lib.unlink()
+        torch_lib.symlink_to(build_lib)
+
+
+def has_build_ninja() -> bool:
+    return (BUILD_DIR / "build.ninja").exists()
+
+
+def is_devel_setup() -> bool:
+    output = check_output([sys.executable, "-c", "import torch;print(torch.__file__)"])
+    return output.strip() == str(TORCH_DIR / "__init__.py")
+
+
+def create_build_plan() -> List[Tuple[str, str]]:
+    output = check_output(
+        ["ninja", "-j1", "-v", "-n", "torch_python"], cwd=str(BUILD_DIR)
+    )
+    rc = []
+    for line in output.split("\n"):
+        if not line.startswith("["):
+            continue
+        line = line.split("]", 1)[1].strip()
+        if line.startswith(": &&") and line.endswith("&& :"):
+            line = line[4:-4]
+        line = line.replace("-O2", "-g").replace("-O3", "-g")
+        name = line.split("-o ", 1)[1].split(" ")[0]
+        rc.append((name, line))
+    return rc
+
+
+def main() -> None:
+    if sys.platform == "win32":
+        print("Not supported on Windows yet")
+        sys.exit(-95)
+    if not is_devel_setup():
+        print(
+            "Not a devel setup of PyTorch, please run `python3 setup.py develop --user` first"
+        )
+        sys.exit(-1)
+    if not has_build_ninja():
+        print("Only the ninja build system is supported at the moment")
+        sys.exit(-1)
+    args = parse_args()
+    for file in args.files:
+        if file is None:
+            continue
+        Path(file).touch()
+    build_plan = create_build_plan()
+    if len(build_plan) == 0:
+        return print("Nothing to do")
+    if len(build_plan) > 100:
+        print("More than 100 items need to be rebuilt, run `ninja torch_python` first")
+        sys.exit(-1)
+    for idx, (name, cmd) in enumerate(build_plan):
+        print(f"[{idx + 1} / {len(build_plan)}] Building {name}")
+        if args.verbose:
+            print(cmd)
+        subprocess.check_call(["sh", "-c", cmd], cwd=BUILD_DIR)
+    create_symlinks()
+
+
+if __name__ == "__main__":
+    main()
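
For readers who only want the gist of what the tool automates, below is a minimal, self-contained sketch of the same dry-run-and-rewrite idea for a single file. It assumes the script is run from the repository root with a ninja-based `build/` directory and the `torch_python` target; it deliberately skips the devel-setup checks and the `torch/lib` symlink step that `tools/build_with_debinfo.py` performs, and the file path at the bottom is just the example used in the walkthrough above.
```
#!/usr/bin/env python3
# Minimal sketch (assumes a ninja build dir at ./build and the torch_python target);
# see tools/build_with_debinfo.py for the complete tool.
import subprocess
from pathlib import Path

BUILD_DIR = Path("build")


def rebuild_with_debinfo(source: str) -> None:
    # Touch the source so ninja considers it out of date.
    Path(source).touch()
    # Ask ninja for a verbose dry run: it prints, but does not execute, the commands.
    plan = subprocess.check_output(
        ["ninja", "-j1", "-v", "-n", "torch_python"], cwd=BUILD_DIR
    ).decode()
    for line in plan.splitlines():
        # Only "[N/M] <command>" lines describe build steps.
        if not line.startswith("["):
            continue
        cmd = line.split("]", 1)[1].strip()
        # Unwrap ninja's ": && <command> && :" shell wrapper around link steps.
        if cmd.startswith(": &&") and cmd.endswith("&& :"):
            cmd = cmd[4:-4]
        # Swap optimization flags for debug info, then run the rewritten command.
        cmd = cmd.replace("-O2", "-g").replace("-O3", "-g")
        subprocess.check_call(["sh", "-c", cmd], cwd=BUILD_DIR)


if __name__ == "__main__":
    rebuild_with_debinfo("torch/csrc/autograd/python_variable_indexing.cpp")
```
The key point is that ninja's `-n -v` dry run prints the exact compile and link commands it would execute, so replacing `-O2`/`-O3` with `-g` and re-running only those commands produces debug info for just the touched translation units while the rest of the build keeps its optimized objects.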