Avoid making node a successor/predecessor of itself (#161205)

This fixes an assertion failure we were running into in memory planning about the graph not being acyclic. The repro is very long, so it is hard to turn into a local test, but this change fixes the repro I am looking at.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/161205
Approved by: https://github.com/IvanKobzarev, https://github.com/bdhirsh
This commit is contained in:
eellison
2025-08-21 14:26:19 -07:00
committed by PyTorch MergeBot
parent ff4f5dd8ed
commit a85711d565

View File

@ -286,6 +286,11 @@ def assign_memory_planning_info_for_scheduler_nodes(
for index, node in enumerate(nodes):
size_alloc = sum(buffer.mpi_buffer.size_alloc for buffer in node.get_outputs())
succ_nodes = node_to_succ_nodes[node]
pred_nodes = node_to_pred_nodes[node]
# make sure we do not make node a successor or predecessor of itself
succ_nodes.discard(node)
pred_nodes.discard(node)
node.mpi_node = MemoryPlanningInfoForNode(
index=index,
@ -684,6 +689,7 @@ def validate_graph_acyclic(nodes: list[BaseSchedulerNode]) -> None:
path.append(node)
for pred_node in node.mpi_node.pred_nodes:
assert pred_node != node
dfs_visit(pred_node)
path.pop()