[memory debugging] Extract frame information from inductor (#95753)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/95753 Approved by: https://github.com/Chillee
2025-10-20 21:14:14 +08:00 · 2023-03-15 15:14:10 -07:00
parent e74f70d212
commit 3162f71787
7 changed files with 127 additions and 12 deletions
--- a/test/inductor/test_torchinductor.py
+++ b/test/inductor/test_torchinductor.py
@ -7065,6 +7065,28 @@ if HAS_CUDA and not TEST_WITH_ASAN:
            self.assertTrue(torch.allclose(module(input), traced(input)))
        def test_memory_history_inductor(self):
            def called_inside_compile(x, w, b):
                a = x @ w + b
                return torch.sigmoid(a)
            @torch.compile
            def fn(x, w, b):
                x = called_inside_compile(x, w, b)
                return called_inside_compile(x, w, b)
            w = torch.rand(3, 3, device="cuda")
            b = torch.rand(3, device="cuda")
            x = torch.rand(3, device="cuda")
            try:
                torch.cuda.memory.empty_cache()
                torch.cuda.memory._record_memory_history(True)
                r = fn(x, w, b)
            finally:
                torch.cuda.memory._record_memory_history(False)
            snapshot = str(torch.cuda.memory._snapshot())
            self.assertTrue("called_inside_compile" in snapshot)
    copy_tests(CommonTemplate, CudaTests, "cuda")
    class CudaReproTests(TestCase):
--- a/torch/_inductor/codecache.py
+++ b/torch/_inductor/codecache.py
@ -15,6 +15,7 @@ import sys
 import sysconfig
 import tempfile
 import types
 from bisect import bisect_right
 from concurrent.futures import Future, ProcessPoolExecutor, ThreadPoolExecutor
 from ctypes import cdll
 from functools import partial
@ -635,10 +636,11 @@ class CppCodeCache:
 class PyCodeCache:
    cache = dict()
    linemaps = dict()
    clear = staticmethod(cache.clear)
    @classmethod
-    def load(cls, source_code, extra=""):
+    def load(cls, source_code, extra="", linemap=()):
        key, path = write(source_code, "py", extra)
        if key not in cls.cache:
            with open(path) as f:
@ -654,8 +656,36 @@ class PyCodeCache:
                exec(code, mod.__dict__, mod.__dict__)
                # another thread might set this first
                cls.cache.setdefault(key, mod)
                cls.linemaps[path] = linemap
        return cls.cache[key]
    @classmethod
    @functools.lru_cache(None)
    def stack_frames_for_code(cls, path, lineno):
        if path not in cls.linemaps:
            return None
        # [(starting_line, <fx node>), ...]
        linemap = cls.linemaps[path]
        p = bisect_right(linemap, lineno, key=lambda x: x[0])
        if p == 0:
            return None
        _, entry = linemap[p - 1]
        if not entry:
            return None
        def parse_stack_trace(stack_trace):
            # ideally fx stores stack traces as data rather than a string
            # but this is not along a performance critical path
            regex = r'File "(.+)", line (\d+), in (.+)\n'
            matches = re.findall(regex, stack_trace)
            return [
                {"filename": f, "line": int(l), "name": n}
                for f, l, n in reversed(matches)
            ]
        return parse_stack_trace(entry.stack_trace)
 class TritonCodeCache:
    @staticmethod
--- a/torch/_inductor/codegen/wrapper.py
+++ b/torch/_inductor/codegen/wrapper.py
@ -17,6 +17,7 @@ from ..utils import (
    cache_on_self,
    get_benchmark_name,
    has_triton,
    LineContext,
    sympy_dot,
    sympy_product,
    sympy_symbol,
@ -562,7 +563,7 @@ class WrapperCodeGen(CodeGen):
        self.add_benchmark_harness(result)
-        return result.getvalue()
+        return result.getvaluewithlinemap()
    def codegen_inputs(self, code: IndentedBuffer, graph_inputs: Dict[str, ir.Buffer]):
        """Assign all symbolic shapes to locals"""
@ -736,6 +737,9 @@ class WrapperCodeGen(CodeGen):
    def writeline(self, line):
        self.lines.append(line)
    def enter_context(self, ctx):
        self.lines.append(LineContext(ctx))
 class CppWrapperCodeGen(WrapperCodeGen):
    """
--- a/torch/_inductor/graph.py
+++ b/torch/_inductor/graph.py
@ -619,11 +619,11 @@ class GraphLowering(torch.fx.Interpreter):
    def compile_to_module(self):
        from .codecache import PyCodeCache
-        code = self.codegen()
+        code, linemap = self.codegen()
        if config.debug:
            print(code)
-        mod = PyCodeCache.load(code)
+        mod = PyCodeCache.load(code, linemap=linemap)
        for name, value in self.constants.items():
            setattr(mod, name, value)
--- a/torch/_inductor/scheduler.py
+++ b/torch/_inductor/scheduler.py
@ -674,6 +674,10 @@ class Scheduler:
        self.buffer_names_to_free = set()
        self.buffer_names_no_longer_needed = set()
        # fx graph node to the position it appears in the graph
        # for debug attribution
        self.origin_to_index = {}
    def debug_draw_graph(self):
        """Generate an image of the graph for debugging"""
        if os.environ.get("INDUCTOR_WRITE_SCHEDULER_GRAPH", None) == "1":
@ -1164,9 +1168,21 @@ class Scheduler:
            self.backends[device] = self.create_backend(device)
        return self.backends[device]
    def enter_context(self, node):
        def get_order(n):
            if n not in self.origin_to_index:
                self.origin_to_index.update({n: i for i, n in enumerate(n.graph.nodes)})
            return self.origin_to_index[n]
        origins = [(get_order(e), e) for n in node.get_nodes() for e in n.node.origins]
        if origins:
            _, last = max(origins)
            V.graph.wrapper_code.enter_context(last)
    @dynamo_timed
    def codegen(self):
        for node in self.nodes:
            self.enter_context(node)
            self.buffer_names_no_longer_needed.update(node.last_usage)
            if not isinstance(node, NopKernelSchedulerNode):
--- a/torch/_inductor/utils.py
+++ b/torch/_inductor/utils.py
@ -12,7 +12,7 @@ import tempfile
 import textwrap
 import time
 from io import StringIO
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, NamedTuple, Optional, Union
 from unittest import mock
 import sympy
@ -420,6 +420,10 @@ def get_dtype_size(dtype):
    return torch.empty((), dtype=dtype).element_size()
 class LineContext(NamedTuple):
    context: Any
 class IndentedBuffer:
    tabwidth = 4
@ -427,19 +431,27 @@ class IndentedBuffer:
        self._lines = []
        self._indent = initial_indent
-    def getvalue(
+    def getvaluewithlinemap(self):
        self,
    ):
        buf = StringIO()
        p = 1
        linemap = []
        for line in self._lines:
            if isinstance(line, DeferredLineBase):
                line = line()
                if line is None:
                    continue
            elif isinstance(line, LineContext):
                linemap.append((p, line.context))
                continue
            assert isinstance(line, str)
            buf.write(line)
            buf.write("\n")
-        return buf.getvalue()
+            p += 1 + line.count("\n")
        return buf.getvalue(), linemap
    def getvalue(self):
        v, _ = self.getvaluewithlinemap()
        return v
    def getrawvalue(self):
        buf = StringIO()
@ -448,6 +460,8 @@ class IndentedBuffer:
                line = line()
                if line is None:
                    continue
            elif isinstance(line, LineContext):
                continue
            assert isinstance(line, str)
            # backslash implies line continuation
            if line.endswith("\\"):
@ -467,7 +481,9 @@ class IndentedBuffer:
        return " " * (self._indent * self.tabwidth)
    def writeline(self, line):
-        if isinstance(line, DeferredLineBase):
+        if isinstance(line, LineContext):
            self._lines.append(line)
        elif isinstance(line, DeferredLineBase):
            self._lines.append(line.with_prefix(self.prefix()))
        elif line.strip():
            self._lines.append(f"{self.prefix()}{line}")
@ -493,12 +509,15 @@ class IndentedBuffer:
        if isinstance(other_code, IndentedBuffer):
            dedent = float("inf")
            for line in other_code._lines:
-                if line:
+                if not isinstance(line, LineContext) and line:
                    dedent = min(dedent, len(line) - len(line.lstrip()))
            if math.isinf(dedent):
                dedent = 0
            for line in other_code._lines:
-                IndentedBuffer.writeline(self, line[dedent:])
+                if isinstance(line, LineContext):
                    self._lines.append(line)
                else:
                    IndentedBuffer.writeline(self, line[dedent:])
        else:
            other_code = textwrap.dedent(other_code)
            if strip:
--- a/torch/csrc/profiler/python/combined_traceback.cpp
+++ b/torch/csrc/profiler/python/combined_traceback.cpp
@ -56,6 +56,16 @@ struct PythonTraceback : public CapturedTraceback::Python {
    py::str name_s = "name";
    py::str filename_s = "filename";
    auto torch = py::module::import("torch");
    py::object stack_frames_for_code;
    if (py::hasattr(torch, "_inductor")) {
      py::object inductor = torch.attr("_inductor");
      if (py::hasattr(inductor, "codecache")) {
        stack_frames_for_code = inductor.attr("codecache")
                                    .attr("PyCodeCache")
                                    .attr("stack_frames_for_code");
      }
    }
    for (const auto& f : to_symbolize) {
      auto f_code = (PyCodeObject*)f.code;
      py::handle filename = f_code->co_filename;
@ -67,6 +77,20 @@ struct PythonTraceback : public CapturedTraceback::Python {
          py::cast<std::string>(filename),
          py::cast<std::string>(funcname),
          (uint64_t)lineno});
      // find all the additional frames associated with inductor generated
      // code
      if (stack_frames_for_code.ptr()) {
        py::object extra = stack_frames_for_code(filename, lineno);
        if (!extra.is_none()) {
          for (py::handle h : extra) {
            result.tracebacks.back().push_back(result.all_frames.size());
            result.all_frames.emplace_back(unwind::Frame{
                py::cast<std::string>(h[filename_s]),
                py::cast<std::string>(h[name_s]),
                py::cast<uint64_t>(h[line_s])});
          }
        }
      }
    }
  }
 };