Files
pytorch/torch/csrc/autograd/python_engine.cpp
Edward Z. Yang 9465c0e0b5 Add a lint rule for torch/csrc/util/pybind.h include (#82552)
We define specializations for pybind11 defined templates
(in particular, PYBIND11_DECLARE_HOLDER_TYPE) and consequently
it is important that these specializations *always* be #include'd
when making use of pybind11 templates whose behavior depends on
these specializations, otherwise we can cause an ODR violation.

The easiest way to ensure that all the specializations are always
loaded is to designate a header (in this case, torch/csrc/util/pybind.h)
that ensures the specializations are defined, and then add a lint
to ensure this header is included whenever pybind11 headers are
included.

The existing grep linter didn't have enough knobs to do this
conveniently, so I added some features.  I'm open to suggestions
for how to structure the features better.  The main changes:

- Added an --allowlist-pattern flag, which turns off the grep lint
  if some other line exists.  This is used to stop the grep
  lint from complaining about pybind11 includes if the util
  include already exists.

- Added --match-first-only flag, which lets grep only match against
  the first matching line.  This is because, even if there are multiple
  includes that are problematic, I only need to fix one of them.
  We don't /really/ need this, but when I was running lintrunner -a
  to fixup the preexisting codebase it was annoying without this,
  as the lintrunner overall driver fails if there are multiple edits
  on the same file.

I excluded any files that didn't otherwise have a dependency on
torch/ATen, this was mostly caffe2 and the valgrind wrapper compat
bindings.

Note the grep replacement is kind of crappy, but clang-tidy lint
cleaned it up in most cases.

See also https://github.com/pybind/pybind11/issues/4099

Signed-off-by: Edward Z. Yang <ezyang@fb.com>
Pull Request resolved: https://github.com/pytorch/pytorch/pull/82552
Approved by: https://github.com/albanD
2022-08-01 17:16:58 +00:00

460 lines
15 KiB
C++

#include <torch/csrc/autograd/python_engine.h>
#include <ATen/BatchedTensorImpl.h>
#include <ATen/VmapMode.h>
#include <c10/util/irange.h>
#include <pybind11/pybind11.h>
#include <torch/csrc/DynamicTypes.h>
#include <torch/csrc/THP.h>
#include <torch/csrc/autograd/edge.h>
#include <torch/csrc/autograd/engine.h>
#include <torch/csrc/autograd/function.h>
#include <torch/csrc/autograd/functions/basic_ops.h>
#include <torch/csrc/autograd/python_anomaly_mode.h>
#include <torch/csrc/autograd/python_function.h>
#include <torch/csrc/autograd/python_saved_variable_hooks.h>
#include <torch/csrc/utils/pybind.h>
#include <torch/csrc/utils/pycfunction_helpers.h>
#ifndef _WIN32
#include <pthread.h>
#endif
#include <memory> // for unique_ptr
#include <unordered_set>
using namespace torch::autograd;
struct THPEngine {
PyObject_HEAD
};
static bool _reinitialize_engine = false;
namespace torch {
namespace autograd {
namespace python {
PythonEngine::PythonEngine() = default;
Engine& PythonEngine::get_python_engine() {
static PythonEngine engine;
// This is "probably" thread-safe because the flag is set in a fork handler
// before any threads are created, and this function is only called with the
// GIL held. However, using fork + threads is playing with fire so this is
// more of a "best effort" thing. For example, if the fork occurs while the
// backwards threads hold a lock, we'll probably deadlock in the engine
// destructor.
if (_reinitialize_engine) {
engine.release_workers();
engine.~PythonEngine();
new (&engine) torch::autograd::python::PythonEngine();
_reinitialize_engine = false;
}
return engine;
}
PythonEngine::~PythonEngine() {
Engine::stop();
}
#if PY_MAJOR_VERSION == 3 && PY_MINOR_VERSION >= 9
#define IS_PYTHON_3_9_PLUS
#endif
void PythonEngine::thread_init(
int device,
const std::shared_ptr<ReadyQueue>& ready_queue,
bool should_increment) {
// Increment thread usage count before acquiring the GIL
if (should_increment) {
increment_non_reentrant_thread_count();
}
// Create a PyThreadState, but release the GIL. This lets
// pybind11::gil_scoped_acquire calls inside thread_main acquire the GIL
// without having to create a new PyThreadState each time.
#if defined(IS_PYTHON_3_9_PLUS) || defined(USE_DEPLOY)
auto gil = std::make_unique<pybind11::gil_scoped_acquire>();
#else
pybind11::gil_scoped_acquire gil;
#endif
pybind11::gil_scoped_release no_gil;
Engine::thread_init(device, ready_queue, false);
if (should_increment) {
// Decrement the count during shutdown if we incremented earlier.
decrement_non_reentrant_thread_count();
}
#if defined(IS_PYTHON_3_9_PLUS) || defined(USE_DEPLOY)
// Do not call PyEval_RestoreThread, PyThreadState_[Clear|DeleteCurrent] if
// runtime is finalizing
if (!Py_IsInitialized()) {
no_gil.disarm();
// TODO: call disarm rather than leak gil_scoped_acquired once
// PyThreadState_Clear can safely be called from finalize NOTE: deploy.cpp
// calls `PyInterpreterState_Delete` to destruct PyThreadState, so avoid
// use-after-free here.
gil.release();
}
#endif
}
void PythonEngine::thread_on_exception(
std::shared_ptr<GraphTask> graph_task,
const std::shared_ptr<Node>& fn,
std::exception& e) {
auto python_err = dynamic_cast<python_error*>(&e);
if (python_err) {
python_err->persist();
}
Engine::thread_on_exception(graph_task, fn, e);
}
std::unique_ptr<AnomalyMetadata> PythonEngine::make_anomaly_metadata() {
return std::unique_ptr<AnomalyMetadata>(new PyAnomalyMetadata());
}
std::unique_ptr<SavedVariableHooks> PythonEngine::
get_default_saved_variable_hooks() {
return PyDefaultSavedVariableHooks::get_hooks();
}
variable_list PythonEngine::execute(
const edge_list& roots,
const variable_list& inputs,
bool keep_graph,
bool create_graph,
bool accumulate_grad,
const edge_list& outputs) {
TORCH_CHECK(
!PyGILState_Check(),
"The autograd engine was called while holding the GIL. If you are using the C++ "
"API, the autograd engine is an expensive operation that does not require the "
"GIL to be held so you should release it with 'pybind11::gil_scoped_release no_gil;'"
". If you are not using the C++ API, please report a bug to the pytorch team.")
try {
return Engine::execute(
roots, inputs, keep_graph, create_graph, accumulate_grad, outputs);
} catch (python_error& e) {
e.restore();
throw;
}
}
c10::intrusive_ptr<at::ivalue::Future> PythonEngine::execute_with_graph_task(
const std::shared_ptr<GraphTask>& graph_task,
std::shared_ptr<Node> graph_root,
InputBuffer&& input_buffer) {
try {
return Engine::execute_with_graph_task(
graph_task, graph_root, std::move(input_buffer));
} catch (python_error& e) {
pybind11::gil_scoped_acquire gil;
if (!PyErr_Occurred()) {
// Set the error indicator only if it is not set already.
e.restore();
}
throw;
}
}
} // namespace python
} // namespace autograd
} // namespace torch
PyObject* THPEngineClass = nullptr;
// Implementation of torch._C._EngineBase.run_backward
PyObject* THPEngine_run_backward(
PyObject* self,
PyObject* args,
PyObject* kwargs) {
HANDLE_TH_ERRORS
PyObject* tensors = nullptr;
PyObject* grad_tensors = nullptr;
unsigned char keep_graph = 0;
unsigned char create_graph = 0;
PyObject* inputs = nullptr;
unsigned char allow_unreachable = 0;
unsigned char accumulate_grad =
0; // Indicate whether to accumulate grad into leaf Tensors or capture
const char* accepted_kwargs[] = {// NOLINT
"tensors",
"grad_tensors",
"keep_graph",
"create_graph",
"inputs",
"allow_unreachable",
"accumulate_grad",
nullptr};
if (!PyArg_ParseTupleAndKeywords(
args,
kwargs,
"OObb|Obb",
(char**)accepted_kwargs,
&tensors,
&grad_tensors,
&keep_graph,
&create_graph,
&inputs,
&allow_unreachable,
&accumulate_grad))
return nullptr;
THPUtils_assert(
PyTuple_Check(tensors),
"tensors argument is expected to "
"be a tuple, but got %s",
THPUtils_typename(tensors));
THPUtils_assert(
PyTuple_Check(grad_tensors),
"grad_tensors argument is "
"expected to be a tuple, but got %s",
THPUtils_typename(grad_tensors));
Py_ssize_t num_tensors = PyTuple_GET_SIZE(tensors);
Py_ssize_t num_gradients = PyTuple_GET_SIZE(grad_tensors);
THPUtils_assert(
num_tensors == num_gradients,
"got %ld tensors and %ld "
"gradients",
num_tensors,
num_gradients);
// The user either called autograd.backward(...) or autograd.grad(...) to get
// here
bool backward_api_called = accumulate_grad;
TORCH_CHECK(
!backward_api_called || at::impl::VmapMode::current_vmap_level() == 0,
"backward() called inside torch.vmap. This is not supported, "
"please call backward() outside torch.vmap or instead use "
"torch.autograd.grad inside torch.vmap");
edge_list roots;
roots.reserve(num_tensors);
variable_list grads;
grads.reserve(num_tensors);
for (const auto i : c10::irange(num_tensors)) {
PyObject* _tensor = PyTuple_GET_ITEM(tensors, i);
THPUtils_assert(
THPVariable_Check(_tensor),
"element %d of tensors "
"tuple is not a Tensor",
i);
const auto& variable = THPVariable_Unpack(_tensor);
TORCH_CHECK(
!isBatchedTensor(variable),
"torch.autograd.grad(outputs, inputs, grad_outputs) called inside ",
"torch.vmap. We do not support the case where any outputs are ",
"vmapped tensors (output ",
i,
" is being vmapped over). Please "
"call autograd.grad() outside torch.vmap or file a bug report "
"with your use case.")
auto gradient_edge = torch::autograd::impl::gradient_edge(variable);
THPUtils_assert(
gradient_edge.function,
"element %d of tensors does not require grad and does not have a grad_fn",
i);
roots.push_back(std::move(gradient_edge));
PyObject* grad = PyTuple_GET_ITEM(grad_tensors, i);
if (THPVariable_Check(grad)) {
const Variable& grad_var = THPVariable_Unpack(grad);
if (grad_var.has_names()) {
TORCH_WARN(
"Autograd was passed a named grad tensor with dims ",
grad_var.names(),
". Autograd does not yet support named tensor semantics, so all names ",
"will be ignored. In practice all computed gradients will still be correct "
"according to regular tensor semantics.");
}
grads.push_back(grad_var);
} else {
THPUtils_assert(
grad == Py_None,
"element %d of gradients tuple is not a Tensor or None",
i);
THPUtils_assert(
!variable.requires_grad(),
"element %d of gradients tuple is None, but the corresponding Tensor requires grad");
}
}
std::vector<Edge> output_edges;
if (inputs != nullptr) {
int num_inputs = PyTuple_GET_SIZE(inputs);
output_edges.reserve(num_inputs);
for (const auto i : c10::irange(num_inputs)) {
PyObject* input = PyTuple_GET_ITEM(inputs, i);
THPUtils_assert(
THPVariable_Check(input),
"all inputs have to be Tensors, but got %s",
THPUtils_typename(input));
const auto& tensor = THPVariable_Unpack(input);
TORCH_CHECK(
!isBatchedTensor(tensor),
"torch.autograd.grad(outputs, inputs, grad_outputs) called inside ",
"torch.vmap. We do not support the case where any inputs are ",
"vmapped tensors (input ",
i,
" is being vmapped over). Please "
"call autograd.grad() outside torch.vmap or file a bug report "
"with your use case.")
const auto output_nr = tensor.output_nr();
auto grad_fn = tensor.grad_fn();
if (!grad_fn) {
grad_fn = torch::autograd::impl::try_get_grad_accumulator(tensor);
}
if (accumulate_grad) {
tensor.retain_grad();
}
THPUtils_assert(
tensor.requires_grad(),
"One of the differentiated Tensors does not require grad");
if (!grad_fn) {
// NOTE [ Autograd Unreachable Input ]
// Since input has no grad_accumulator, its guaranteed to be
// unreachable. We initialize an edge pointing to a non-nullptr Node so
// nodes in the graph (e.g., mul when an operand is scalar) that have
// edges pointing to nullptr don't get erroneously assigned `needed =
// True` in exec_info.
output_edges.emplace_back(std::make_shared<Identity>(), 0);
} else {
output_edges.emplace_back(grad_fn, output_nr);
}
}
}
variable_list outputs;
{
pybind11::gil_scoped_release no_gil;
auto& engine = python::PythonEngine::get_python_engine();
outputs = engine.execute(
roots, grads, keep_graph, create_graph, accumulate_grad, output_edges);
}
if (!backward_api_called && inputs != nullptr) {
int num_inputs = PyTuple_GET_SIZE(inputs);
THPObjectPtr py_outputs{PyTuple_New(num_inputs)};
if (!py_outputs)
return nullptr;
for (const auto i : c10::irange(num_inputs)) {
THPUtils_assert(
allow_unreachable || outputs[i].defined(),
"One of the "
"differentiated Tensors appears to not have been used "
"in the graph. Set allow_unused=True if this is the "
"desired behavior.");
PyTuple_SET_ITEM(py_outputs.get(), i, THPVariable_Wrap(outputs[i]));
}
return py_outputs.release();
} else {
Py_RETURN_NONE;
}
END_HANDLE_TH_ERRORS
}
PyObject* THPEngine_queue_callback(PyObject* self, PyObject* _callback) {
HANDLE_TH_ERRORS
auto& engine = python::PythonEngine::get_python_engine();
std::shared_ptr<PyObject> callback(_callback, [](PyObject* obj) {
pybind11::gil_scoped_acquire gil;
Py_DECREF(obj);
});
Py_INCREF(_callback);
engine.queue_callback([callback]() {
pybind11::gil_scoped_acquire gil;
THPObjectPtr result{PyObject_CallFunctionObjArgs(callback.get(), nullptr)};
if (!result)
throw python_error();
});
Py_RETURN_NONE;
END_HANDLE_TH_ERRORS
}
PyObject* THPEngine_is_checkpoint_valid(PyObject* self, PyObject* noargs) {
HANDLE_TH_ERRORS
auto& engine = python::PythonEngine::get_python_engine();
if (engine.is_checkpoint_valid()) {
Py_RETURN_TRUE;
} else {
Py_RETURN_FALSE;
}
END_HANDLE_TH_ERRORS
}
PyObject* THPEngine_new(PyTypeObject* type, PyObject* args, PyObject* kwargs) {
return type->tp_alloc(type, 0);
}
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays,cppcoreguidelines-avoid-non-const-global-variables)
static struct PyMethodDef THPEngine_methods[] = {
{(char*)"run_backward",
castPyCFunctionWithKeywords(THPEngine_run_backward),
METH_VARARGS | METH_KEYWORDS,
nullptr},
{(char*)"queue_callback", THPEngine_queue_callback, METH_O, nullptr},
{(char*)"is_checkpoint_valid",
THPEngine_is_checkpoint_valid,
METH_NOARGS,
nullptr},
{nullptr}};
PyTypeObject THPEngineType = {
PyVarObject_HEAD_INIT(nullptr, 0) "torch._C._EngineBase", /* tp_name */
sizeof(THPEngine), /* tp_basicsize */
0, /* tp_itemsize */
nullptr, /* tp_dealloc */
0, /* tp_vectorcall_offset */
nullptr, /* tp_getattr */
nullptr, /* tp_setattr */
nullptr, /* tp_reserved */
nullptr, /* tp_repr */
nullptr, /* tp_as_number */
nullptr, /* tp_as_sequence */
nullptr, /* tp_as_mapping */
nullptr, /* tp_hash */
nullptr, /* tp_call */
nullptr, /* tp_str */
nullptr, /* tp_getattro */
nullptr, /* tp_setattro */
nullptr, /* tp_as_buffer */
Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE, /* tp_flags */
nullptr, /* tp_doc */
nullptr, /* tp_traverse */
nullptr, /* tp_clear */
nullptr, /* tp_richcompare */
0, /* tp_weaklistoffset */
nullptr, /* tp_iter */
nullptr, /* tp_iternext */
THPEngine_methods, /* tp_methods */
nullptr, /* tp_members */
nullptr, /* tp_getset */
nullptr, /* tp_base */
nullptr, /* tp_dict */
nullptr, /* tp_descr_get */
nullptr, /* tp_descr_set */
0, /* tp_dictoffset */
nullptr, /* tp_init */
nullptr, /* tp_alloc */
THPEngine_new /* tp_new */
};
static void child_atfork() {
_reinitialize_engine = true;
}
bool THPEngine_initModule(PyObject* module) {
#ifndef _WIN32
if (pthread_atfork(nullptr, nullptr, child_atfork) != 0) {
throw std::runtime_error("unable to set pthread_atfork handler");
}
#endif
if (PyType_Ready(&THPEngineType) < 0)
return false;
Py_INCREF(&THPEngineType);
PyModule_AddObject(module, "_ImperativeEngine", (PyObject*)&THPEngineType);
set_default_engine_stub(python::PythonEngine::get_python_engine);
return true;
}