[nnc] Insert alloc/free at global scope (#61725)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/61725

Emitting alloc/free inside a loop isn't really an optimization, and furthermore
it breaks an attempted optimization in the LLVM backend: we use alloca for
small allocations, which is efficient since alloca lives on the stack, but there is
no corresponding free, so each iteration leaks stack space.  I hit this while building an
rfactor buffer inside a very deeply nested loop.
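
For intuition, here is a hand-written sketch of the statement shapes involved before
and after this change. It is illustrative only: not actual NNC IR printer output, and
the buffer/loop names are made up.

# Illustrative only: approximate NNC statement shapes for a temp buffer that is
# written and read inside a loop (names and syntax are approximations).

BEFORE = """
for (int i = 0; i < 100; i++) {
  Allocate(tmp);     // the LLVM backend lowers small buffers to alloca,
  tmp[0] = a[0];     // so this becomes a fresh stack slot every iteration
  b[0] = tmp[0];
}                    // no matching Free, so deeply nested loops keep growing the stack
"""

AFTER = """
Allocate(tmp);       // allocation hoisted to the outermost (global) scope
for (int i = 0; i < 100; i++) {
  tmp[0] = a[0];
  b[0] = tmp[0];
}
Free(tmp);           // single matching free at the end
"""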
ghstack-source-id: 133627310

Test Plan:
Unit test which simulates use of a temp buffer in a deeply nested
loop.

Reviewed By: navahgar

Differential Revision: D29533364

fbshipit-source-id: c321f4cb05304cfb9146afe32edc4567b623412e
Authored by Bert Maher on 2021-07-16 08:35:22 -07:00; committed by Facebook GitHub Bot
parent 4c3d9cfe03
commit b963607d50
4 changed files with 44 additions and 28 deletions


@@ -397,6 +397,25 @@ graph(%a : Float(1, 3, 1, strides=[3, 1, 1], requires_grad=0, device=cpu)):
    def test_forgot_kernel_arena(self):
        self.assertRaises(RuntimeError, lambda: torch._C._te.VarHandle("n", torch._C._te.Dtype.Int))

    @unittest.skipIf(not LLVM_ENABLED, "LLVM backend not enabled")
    def test_alloc_in_loop(self):
        with kernel_arena_scope():
            a, tmp, b = [
                te.Placeholder(name, te.Dtype.Float, [te.ExprHandle.int(1)])
                for name in ["a", "tmp", "b"]]
            t0, t100 = [te.ExprHandle.int(n) for n in [0, 100]]
            body = te.Block([
                tmp.store([t0], a.load([t0])),
                b.store([t0], tmp.load([t0]))
            ])
            for _ in range(4):
                i = te.VarHandle("i", te.Dtype.Int)
                body = te.For.make(i, t0, t100, body)
            nest = te.LoopNest(body, [b.data()])
            nest.prepare_for_codegen()
            f = te.construct_codegen("llvm", nest.simplify(), [a, b])
            ta, tb = [torch.ones(1) for _ in range(2)]
            f.call([ta.data_ptr(), tb.data_ptr()])

if __name__ == '__main__':
    run_tests()