mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
[nnc] Insert alloc/free at global scope (#61725)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/61725 Alloc/free inside a loop isn't really an optimization, and furthermore it breaks some attempted optimization in the llvm backend: we use alloca for small allocations, which is efficient since alloca is on the stack, but there's no corresponding free, so we leak tons of stack. I hit this while building an rfactor buffer inside a very deeply nested loop. ghstack-source-id: 133627310 Test Plan: Unit test which simulates use of a temp buffer in a deeply nested loop. Reviewed By: navahgar Differential Revision: D29533364 fbshipit-source-id: c321f4cb05304cfb9146afe32edc4567b623412e
This commit is contained in:
committed by
Facebook GitHub Bot
parent
4c3d9cfe03
commit
b963607d50
@ -397,6 +397,25 @@ graph(%a : Float(1, 3, 1, strides=[3, 1, 1], requires_grad=0, device=cpu)):
|
||||
def test_forgot_kernel_arena(self):
|
||||
self.assertRaises(RuntimeError, lambda: torch._C._te.VarHandle("n", torch._C._te.Dtype.Int))
|
||||
|
||||
@unittest.skipIf(not LLVM_ENABLED, "LLVM backend not enabled")
|
||||
def test_alloc_in_loop(self):
|
||||
with kernel_arena_scope():
|
||||
a, tmp, b = [
|
||||
te.Placeholder(name, te.Dtype.Float, [te.ExprHandle.int(1)])
|
||||
for name in ["a", "tmp", "b"]]
|
||||
t0, t100 = [te.ExprHandle.int(n) for n in [0, 100]]
|
||||
body = te.Block([
|
||||
tmp.store([t0], a.load([t0])),
|
||||
b.store([t0], tmp.load([t0]))
|
||||
])
|
||||
for _ in range(4):
|
||||
i = te.VarHandle("i", te.Dtype.Int)
|
||||
body = te.For.make(i, t0, t100, body)
|
||||
nest = te.LoopNest(body, [b.data()])
|
||||
nest.prepare_for_codegen()
|
||||
f = te.construct_codegen("llvm", nest.simplify(), [a, b])
|
||||
ta, tb = [torch.ones(1) for _ in range(2)]
|
||||
f.call([ta.data_ptr(), tb.data_ptr()])
|
||||
|
||||
# Script entry point: delegate to torch's common test runner.
if __name__ == '__main__':
    run_tests()
|
||||
|
Reference in New Issue
Block a user