mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-22 06:11:27 +08:00
This makes it possible to know, at any point during the backward pass, what is running and where the currently running Node was created:

```python
import torch
from torch.utils._python_dispatch import TorchDispatchMode
from torch.autograd import detect_anomaly


class MyMode(TorchDispatchMode):
    def __torch_dispatch__(self, func, types, args, kwargs=None):
        node = torch._C._current_autograd_node()
        print(f"Running {func} from within {node}")
        if node is not None:
            print("The Node was created at:")
            print("\n ".join(node.metadata["traceback_"]))
        return func(*args, **kwargs or {})

with MyMode(), detect_anomaly():
    print("FW")
    a = torch.rand(10, requires_grad=True)
    b = a.mul(2)
    b = b.div(3)
    b = b.sum()
    print("BW")
    b.backward()
```

Gives

```
$ python foo.py
foo.py:15: UserWarning: Anomaly Detection has been enabled. This mode will increase the runtime and should only be enabled for debugging.
  with MyMode(), detect_anomaly():
FW
Running aten.rand.default from within None
Running aten.mul.Tensor from within None
Running aten.div.Tensor from within None
Running aten.sum.default from within None
BW
Running aten.ones_like.default from within None
Running aten.expand.default from within <SumBackward0 object at 0x7fa40c0c6dc0>
The Node was created at:
  File "foo.py", line 20, in <module>
    b = b.sum()
Running aten.isnan.default from within <SumBackward0 object at 0x7fa40c0c6500>
The Node was created at:
  File "foo.py", line 20, in <module>
    b = b.sum()
Running aten.any.default from within <SumBackward0 object at 0x7fa32b23a780>
The Node was created at:
  File "foo.py", line 20, in <module>
    b = b.sum()
Running aten._local_scalar_dense.default from within <SumBackward0 object at 0x7fa40c0c9190>
The Node was created at:
  File "foo.py", line 20, in <module>
    b = b.sum()
Running aten.div.Tensor from within <DivBackward0 object at 0x7fa40c0c9190>
The Node was created at:
  File "foo.py", line 19, in <module>
    b = b.div(3)
Running aten.isnan.default from within <DivBackward0 object at 0x7fa40c0c9190>
The Node was created at:
  File "foo.py", line 19, in <module>
    b = b.div(3)
Running aten.any.default from within <DivBackward0 object at 0x7fa40c0c9190>
The Node was created at:
  File "foo.py", line 19, in <module>
    b = b.div(3)
Running aten._local_scalar_dense.default from within <DivBackward0 object at 0x7fa40c0c9190>
The Node was created at:
  File "foo.py", line 19, in <module>
    b = b.div(3)
Running aten.mul.Tensor from within <MulBackward0 object at 0x7fa40c0c9190>
The Node was created at:
  File "foo.py", line 18, in <module>
    b = a.mul(2)
Running aten.isnan.default from within <MulBackward0 object at 0x7fa40c0c9190>
The Node was created at:
  File "foo.py", line 18, in <module>
    b = a.mul(2)
Running aten.any.default from within <MulBackward0 object at 0x7fa40c0c9190>
The Node was created at:
  File "foo.py", line 18, in <module>
    b = a.mul(2)
Running aten._local_scalar_dense.default from within <MulBackward0 object at 0x7fa40c0c9190>
The Node was created at:
  File "foo.py", line 18, in <module>
    b = a.mul(2)
Running aten.detach.default from within <AccumulateGrad object at 0x7fa40c0c9730>
The Node was created at:
  File "foo.py", line 18, in <module>
    b = a.mul(2)
Running aten.detach.default from within <AccumulateGrad object at 0x7fa40c0c94b0>
The Node was created at:
  File "foo.py", line 18, in <module>
    b = a.mul(2)
```

Pull Request resolved: https://github.com/pytorch/pytorch/pull/90867
Approved by: https://github.com/soulitzer
120 lines
3.5 KiB
C++
120 lines
3.5 KiB
C++
#include <torch/csrc/autograd/function.h>
|
|
|
|
#include <c10/util/ThreadLocal.h>
|
|
#include <torch/csrc/autograd/engine.h>
|
|
#include <torch/csrc/autograd/variable.h>
|
|
|
|
#include <ATen/ATen.h>
|
|
|
|
#include <algorithm>
|
|
#include <cstdint>
|
|
#include <memory>
|
|
#include <stdexcept>
|
|
#include <string>
|
|
#include <utility>
|
|
#include <vector>
|
|
|
|
namespace torch {
|
|
namespace autograd {
|
|
|
|
// The current evaluating node. This is useful to assign the current node as a
|
|
// parent of new nodes created during the evaluation of this node in anomaly
|
|
// mode.
|
|
C10_DEFINE_TLS_static(std::shared_ptr<Node>, tls_current_evaluating_node);
|
|
#define current_evaluating_node (tls_current_evaluating_node.get())
|
|
|
|
// Installs `node` as the current evaluating node and remembers the node that
// was current before, so that the destructor can put it back.
NodeGuard::NodeGuard(std::shared_ptr<Node> node) {
  // std::exchange moves the old value out and moves `node` in, in that order.
  last_evaluating_node_ =
      std::exchange(current_evaluating_node, std::move(node));
}
|
|
// Undoes the constructor: the node that was current before this guard was
// created becomes the current evaluating node again.
NodeGuard::~NodeGuard() {
  current_evaluating_node = std::move(last_evaluating_node_);
}
|
|
|
|
std::shared_ptr<Node> get_current_node() {
|
|
return current_evaluating_node;
|
|
}
|
|
|
|
void Node::assign_parent() {
|
|
metadata()->assign_parent(current_evaluating_node);
|
|
}
|
|
|
|
auto Node::name() const -> std::string {
|
|
return c10::demangle(typeid(*this).name());
|
|
}
|
|
|
|
// Returns this node's anomaly metadata, asking the default engine to create
// it the first time it is requested. The returned pointer is non-owning; the
// object stays owned by anomaly_metadata_.
AnomalyMetadata* Node::metadata() noexcept {
  if (anomaly_metadata_) {
    return anomaly_metadata_.get();
  }
  anomaly_metadata_ = Engine::get_default_engine().make_anomaly_metadata();
  return anomaly_metadata_.get();
}
|
|
|
|
// Releases the variables held by `func` and detaches all of its outgoing
// edges. A next node whose only remaining owner is the edge itself is moved
// onto `stack`, so the caller can destroy it iteratively; every other edge
// just drops its reference.
static void gatherFunctions(
    Node* func,
    std::vector<std::shared_ptr<Node>>& stack) {
  func->release_variables();

  for (auto& edge : func->next_edges()) {
    auto& next_fn = edge.function;
    if (next_fn.use_count() != 1) {
      // Either empty or still owned elsewhere: only give up our reference.
      next_fn.reset();
      continue;
    }
    // Sole owner: hand the node to the caller instead of destroying it here.
    stack.emplace_back(std::move(next_fn));
  }
}
|
|
|
|
/*
|
|
* Fix for #5534: prevent stack overflow on deletion of deep computation graph
|
|
*
|
|
* Sometimes one can end up with a very big computation graph of Nodes
|
|
* and Edges. Each std::shared_ptr<Node> contains a list of Edge, and
|
|
* each Edge contains a std::shared_ptr<Node>. Deleting a
|
|
* std::shared_ptr<Node> can trigger the recursive deletion of other
|
|
* std::shared_ptr<Node>'s: this can stack overflow if the graph
|
|
* is deep enough. Here is an example of such a graph:
|
|
*
|
|
* shared_ptr<Node> -> Edge -> shared_ptr<Node> -> Edge -> ... ->
|
|
* shared_ptr<Node>
|
|
*
|
|
* The solution here is to detect when we are decrementing away the last
|
|
* reference to a Node, and when doing so to buffer up the Node's
|
|
* that will be recursively decremented. We can then decrement (and free)
|
|
* the original Node without causing a recursive cascade, before
|
|
* draining the buffer applying the same behavior. This is, in effect,
|
|
* converting recursion to a loop, using a heap buffer in place of the
|
|
* recursive call stack.
|
|
*/
|
|
void deleteNode(Node* function) {
|
|
// To avoid stack overflow on large computational graphs,
|
|
// we need to track reference decrementing and freeing
|
|
// on the heap.
|
|
function->release_variables();
|
|
std::vector<std::shared_ptr<Node>> stack;
|
|
gatherFunctions(function, stack);
|
|
delete function;
|
|
|
|
while (!stack.empty()) {
|
|
auto func = std::move(stack.back());
|
|
stack.pop_back();
|
|
gatherFunctions(func.get(), stack);
|
|
// Reference count is decremented on the loop backedge.
|
|
}
|
|
}
|
|
|
|
namespace {
// File-local on/off switch for the autograd function extension; starts off.
bool kAutogradFunctionExtensionEnabled = false;
} // namespace

// Reports the current value of the file-local extension switch.
bool isAutogradFunctionExtensionEnabled() {
  const bool is_enabled = kAutogradFunctionExtensionEnabled;
  return is_enabled;
}

// Overwrites the file-local extension switch with `enabled`.
void setAutogradFunctionExtensionEnabled(bool enabled) {
  kAutogradFunctionExtensionEnabled = enabled;
}
|
|
|
|
} // namespace autograd
|
|
} // namespace torch
|