mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-21 05:34:18 +08:00
Summary: Fixes https://github.com/pytorch/pytorch/issues/43405.

This pull request adds a feature that prints all relevant tracebacks when `detect_anomaly` mode detects a `nan` produced in nested backward operations. The way I did it is to assign a node as the parent of all nodes it produces during its backward calculation. Then, if one of the children produces a `nan`, the tracebacks from the parent and grandparents (if any) are printed as well. The parent is stored in the `parent_node_` member of the `Node` class and is accessible in C++ via `node->parent()` and in Python via `node.parent_function`. A node has a parent iff:

1. it is created by a backward operation, and
2. it is created while anomaly mode and grad mode are both enabled.

An example of this feature:

```python
import torch

def example():
    x = torch.tensor(1.0, requires_grad=True)
    y = torch.tensor(1e-8, requires_grad=True)  # small to induce nan in n-th backward
    a = x * y
    b = x * y
    z1 = a / b  # can produce nan in n-th backward as long as https://github.com/pytorch/pytorch/issues/43414 is unsolved
    z = z1 * z1
    gy , = torch.autograd.grad( z , (y,), create_graph=True)
    gy2, = torch.autograd.grad(gy , (y,), create_graph=True)
    gy3, = torch.autograd.grad(gy2, (y,), create_graph=True)
    gy4, = torch.autograd.grad(gy3, (y,), create_graph=True)
    return gy4

with torch.autograd.detect_anomaly():
    gy4 = example()
```

with output:

```
example.py:16: UserWarning: Anomaly Detection has been enabled. This mode will increase the runtime and should only be enabled for debugging.
  with torch.autograd.detect_anomaly():
/home/mfkasim/anaconda2/envs/base3/lib/python3.8/site-packages/torch/autograd/__init__.py:190: UserWarning: Error detected in DivBackward0. Traceback of forward call that caused the error:
  File "example.py", line 17, in <module>
    gy4 = example()
  File "example.py", line 12, in example
    gy3, = torch.autograd.grad(gy2, (y,), create_graph=True)
  File "/home/mfkasim/anaconda2/envs/base3/lib/python3.8/site-packages/torch/autograd/__init__.py", line 190, in grad
    return Variable._execution_engine.run_backward(
 (Triggered internally at ../torch/csrc/autograd/python_anomaly_mode.cpp:61.)
  return Variable._execution_engine.run_backward(
/home/mfkasim/anaconda2/envs/base3/lib/python3.8/site-packages/torch/autograd/__init__.py:190: UserWarning: Traceback of forward call that induces the previous calculation:
  File "example.py", line 17, in <module>
    gy4 = example()
  File "example.py", line 11, in example
    gy2, = torch.autograd.grad(gy , (y,), create_graph=True)
  File "/home/mfkasim/anaconda2/envs/base3/lib/python3.8/site-packages/torch/autograd/__init__.py", line 190, in grad
    return Variable._execution_engine.run_backward(
 (Triggered internally at ../torch/csrc/autograd/python_anomaly_mode.cpp:65.)
  return Variable._execution_engine.run_backward(
/home/mfkasim/anaconda2/envs/base3/lib/python3.8/site-packages/torch/autograd/__init__.py:190: UserWarning: Traceback of forward call that induces the previous calculation:
  File "example.py", line 17, in <module>
    gy4 = example()
  File "example.py", line 8, in example
    z1 = a / b  # can produce nan in n-th backward as long as https://github.com/pytorch/pytorch/issues/43414 is unsolved
 (Triggered internally at ../torch/csrc/autograd/python_anomaly_mode.cpp:65.)
  return Variable._execution_engine.run_backward(
Traceback (most recent call last):
  File "example.py", line 17, in <module>
    gy4 = example()
  File "example.py", line 13, in example
    gy4, = torch.autograd.grad(gy3, (y,), create_graph=True)
  File "/home/mfkasim/anaconda2/envs/base3/lib/python3.8/site-packages/torch/autograd/__init__.py", line 190, in grad
    return Variable._execution_engine.run_backward(
RuntimeError: Function 'DivBackward0' returned nan values in its 1th output.
```

cc & thanks to albanD

Pull Request resolved: https://github.com/pytorch/pytorch/pull/43626

Reviewed By: malfet

Differential Revision: D23397499

Pulled By: albanD

fbshipit-source-id: aa7435ec2a7f0d23a7a02ab7db751c198faf3b7d
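For reference, here is a hedged sketch of how the new parent links could be inspected from C++. The commit message only states that `node->parent()` and `node.parent_function` exist; the return type assumed below (a `std::shared_ptr<Node>` that is empty at the root of the chain) and the helper `print_parent_chain` are illustrative assumptions, not part of this change.

```cpp
#include <iostream>
#include <memory>
#include <torch/csrc/autograd/function.h>

// Walk the chain of parent nodes described above, printing the name of each
// ancestor. `node` is assumed to come from a graph built while anomaly mode
// and grad mode were both enabled, so the parent links are populated.
void print_parent_chain(const std::shared_ptr<torch::autograd::Node>& node) {
  for (auto p = node->parent(); p != nullptr; p = p->parent()) {
    std::cout << "created during the backward of: " << p->name() << "\n";
  }
}
```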
99 lines
3.0 KiB
C++
#include <torch/csrc/autograd/function.h>

#include <torch/csrc/autograd/engine.h>
#include <torch/csrc/autograd/variable.h>

#include <ATen/ATen.h>

#include <algorithm>
#include <cstdint>
#include <memory>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
namespace torch { namespace autograd {

// The current evaluating node. This is useful to assign the current node as a
// parent of new nodes created during the evaluation of this node in anomaly
// mode.
static thread_local std::shared_ptr<Node> current_evaluating_node = nullptr;

NodeGuard::NodeGuard(std::shared_ptr<Node> node) {
  last_evaluating_node_ = std::move(current_evaluating_node);
  current_evaluating_node = std::move(node);
}
NodeGuard::~NodeGuard() {
  // restore the previous evaluating node
  current_evaluating_node = std::move(last_evaluating_node_);
}
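The intended use of the guard is easiest to see at a call site. Below is a minimal sketch of how a caller such as the autograd engine might wrap the evaluation of one node; the function `evaluate_one_node` and its surrounding plumbing are illustrative, not the actual engine code.

```cpp
#include <torch/csrc/autograd/function.h>

using torch::autograd::Node;
using torch::autograd::NodeGuard;
using torch::autograd::variable_list;

// While `fn` runs, current_evaluating_node points at it, so any Node that
// fn's apply() constructs can record `fn` as its parent via assign_parent().
// The guard restores the previous value on scope exit, which keeps nested
// (backward-of-backward) evaluations correct.
variable_list evaluate_one_node(const std::shared_ptr<Node>& fn,
                                variable_list inputs) {
  NodeGuard guard(fn);
  return (*fn)(std::move(inputs));
}
```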
void Node::assign_parent() {
  metadata()->assign_parent(current_evaluating_node);
}
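The commit message above spells out when this is called: only for nodes created by a backward computation while both anomaly mode and grad mode are enabled. A rough sketch of that condition at a hypothetical construction site follows; the helper name and its placement are assumptions, not the actual engine/codegen path.

```cpp
#include <c10/core/GradMode.h>
#include <torch/csrc/autograd/anomaly_mode.h>
#include <torch/csrc/autograd/function.h>

// Illustrative helper: called right after a new backward node is constructed
// during the evaluation of another node.
void maybe_assign_parent(const std::shared_ptr<torch::autograd::Node>& new_node) {
  if (torch::autograd::AnomalyMode::is_enabled() && c10::GradMode::is_enabled()) {
    // Records current_evaluating_node in the new node's anomaly metadata.
    new_node->assign_parent();
  }
}
```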

auto Node::name() const -> std::string {
  return c10::demangle(typeid(*this).name());
}

AnomalyMetadata* Node::metadata() noexcept {
  if (!anomaly_metadata_) {
    anomaly_metadata_ = Engine::get_default_engine().make_anomaly_metadata();
  }
  return anomaly_metadata_.get();
}

static void gatherFunctions(
    Node* func,
    std::vector<std::shared_ptr<Node>>& stack) {
  func->release_variables();

  for (auto& edge : func->next_edges()) {
    if (edge.function.use_count() == 1) {
      stack.emplace_back(std::move(edge.function));
    } else {
      edge.function.reset();
    }
  }
}

/*
 * Fix for #5534: prevent stack overflow on deletion of deep computation graph
 *
 * Sometimes one can end up with a very big computation graph of Nodes
 * and Edges. Each std::shared_ptr<Node> contains a list of Edge, and
 * each Edge contains a std::shared_ptr<Node>. Deleting a
 * std::shared_ptr<Node> can trigger the recursive deletion of other
 * std::shared_ptr<Node>'s: this can stack overflow if the graph
 * is deep enough. Here is an example of such a graph:
 *
 * shared_ptr<Node> -> Edge -> shared_ptr<Node> -> Edge -> ... -> shared_ptr<Node>
 *
 * The solution here is to detect when we are decrementing away the last
 * reference to a Node, and when doing so to buffer up the Node's
 * that will be recursively decremented. We can then decrement (and free)
 * the original Node without causing a recursive cascade, before
 * draining the buffer applying the same behavior. This is, in effect,
 * converting recursion to a loop, using a heap buffer in place of the
 * recursive call stack.
 */
void deleteNode(Node* function) {
  // To avoid stack overflow on large computational graphs,
  // we need to track reference decrementing and freeing
  // on the heap.
  function->release_variables();
  std::vector<std::shared_ptr<Node>> stack;
  gatherFunctions(function, stack);
  delete function;

  while (!stack.empty()) {
    auto func = std::move(stack.back());
    stack.pop_back();
    gatherFunctions(func.get(), stack);
    // Reference count is decremented on the loop backedge.
  }
}

}} // namespace torch::autograd
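The same recursion-to-loop idea, reduced to a standalone toy (an illustration of the pattern, not PyTorch code): each link is detached before its owner dies, so no destructor ever recurses, no matter how long the chain is.

```cpp
#include <memory>

// Toy analogue of shared_ptr<Node> -> Edge -> shared_ptr<Node> -> ...
struct Chain {
  std::shared_ptr<Chain> next;
};

// Destroy the chain without a recursive destructor cascade: move the child
// out first, then let the current head die childless, then advance.
void delete_chain_iteratively(std::shared_ptr<Chain> head) {
  while (head) {
    std::shared_ptr<Chain> next = std::move(head->next); // detach the child
    head = std::move(next); // old head is freed here with no child attached
  }
}
```

The toy assumes the chain is uniquely owned by the caller. The real code has the extra wrinkle that a node may still be referenced elsewhere, which is why `gatherFunctions` checks `use_count() == 1` before taking ownership of an edge's target and otherwise just drops this graph's reference.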