Use pybind11::gil_scoped_* functions instead of AutoGIL/AutoNoGIL (#30274)

Summary: Reland of https://github.com/pytorch/pytorch/pull/29095

Pull Request resolved: https://github.com/pytorch/pytorch/pull/30274

Differential Revision: D18762293

Pulled By: ezyang

fbshipit-source-id: d3d50c2dd12bcb678ab25fa708eb6587cc4b66f9
2025-11-03 07:24:58 +08:00 · 2019-12-02 12:18:20 -08:00
parent 6deb41c88d
commit 1111a6b810
42 changed files with 195 additions and 164 deletions
--- a/tools/autograd/gen_python_functions.py
+++ b/tools/autograd/gen_python_functions.py
@ -541,7 +541,7 @@ def create_python_bindings(python_functions, has_self, is_module=False):
        else:
            raise RuntimeError('could not dispatch, neither namespace function nor Tensor method')
-        env['AutoNoGIL'] = 'AutoNoGIL no_gil;' if not declaration['with_gil'] else ''
+        env['AutoNoGIL'] = 'pybind11::gil_scoped_release no_gil;' if not declaration['with_gil'] else ''
        # Use the simple_return_type (Tensor) rather than the fancy return type
        # (Tensor &).  This is important because the dispatch functions take
--- a/tools/autograd/templates/python_nn_functions.h
+++ b/tools/autograd/templates/python_nn_functions.h
@ -3,6 +3,7 @@
 // ${generated_comment}
 #include <Python.h>
+#include <pybind11/pybind11.h>
 namespace torch { namespace autograd {
--- a/tools/autograd/templates/python_nn_functions_dispatch.h
+++ b/tools/autograd/templates/python_nn_functions_dispatch.h
@ -2,8 +2,6 @@
 // ${generated_comment}
-#include "torch/csrc/utils/auto_gil.h"
 #include <ATen/ATen.h>
 // Contains inline wrappers around ATen functions that release the GIL and
--- a/tools/autograd/templates/python_torch_functions.cpp
+++ b/tools/autograd/templates/python_torch_functions.cpp
@ -76,24 +76,24 @@ static void check_out_type_matches(Tensor result,
 }
 inline Tensor dispatch_arange(Scalar end, Tensor result) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  return at::arange_out(result, end);
 }
 inline Tensor dispatch_arange(Scalar end, const TensorOptions& options) {
  torch::utils::maybe_initialize_cuda(options);
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  return torch::arange(end, options);
 }
 inline Tensor dispatch_arange(Scalar start, Scalar end, Scalar step, Tensor result) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  return at::arange_out(result, start, end, step);
 }
 inline Tensor dispatch_arange(Scalar start, Scalar end, Scalar step, const TensorOptions& options) {
  torch::utils::maybe_initialize_cuda(options);
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  return torch::arange(start, end, step, options);
 }
@ -152,14 +152,14 @@ static PyObject * THPVariable_arange(PyObject* self, PyObject* args, PyObject* k
 }
 inline Tensor dispatch_range(Scalar start, Scalar end, Scalar step, Tensor result) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  OptionalDeviceGuard device_guard(device_of(result));
  return at::range_out(result, start, end, step);
 }
 inline Tensor dispatch_range(Scalar start, Scalar end, Scalar step, const TensorOptions& options) {
  torch::utils::maybe_initialize_cuda(options);
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  DeviceGuard device_guard(options.device());
  return torch::range(start, end, step, options);
 }
@ -196,39 +196,39 @@ static PyObject * THPVariable_range(PyObject* self, PyObject* args, PyObject* kw
 }
 inline Tensor dispatch_randint(int64_t high, IntArrayRef size, Generator * generator, Tensor result) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  return at::randint_out(result, high, size, generator);
 }
 inline Tensor dispatch_randint(int64_t high, IntArrayRef size, Generator * generator, const TensorOptions & options) {
  torch::utils::maybe_initialize_cuda(options);
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  return torch::randint(high, size, generator, options);
 }
 inline Tensor dispatch_randint(int64_t high, IntArrayRef size, Tensor result) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  return at::randint_out(result, high, size);
 }
 inline Tensor dispatch_randint(int64_t high, IntArrayRef size, const TensorOptions & options) {
  torch::utils::maybe_initialize_cuda(options);
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  return torch::randint(high, size, options);
 }
 inline Tensor dispatch_randint(int64_t low, int64_t high, IntArrayRef size, Generator * generator, Tensor result) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  return at::randint_out(result, low, high, size, generator);
 }
 inline Tensor dispatch_randint(int64_t low, int64_t high, IntArrayRef size, Generator * generator, const TensorOptions & options) {
  torch::utils::maybe_initialize_cuda(options);
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  return torch::randint(low, high, size, generator, options);
 }
 inline Tensor dispatch_randint(int64_t low, int64_t high, IntArrayRef size, Tensor result) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  return at::randint_out(result, low, high, size);
 }
 inline Tensor dispatch_randint(int64_t low, int64_t high, IntArrayRef size, const TensorOptions & options) {
  torch::utils::maybe_initialize_cuda(options);
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  return torch::randint(low, high, size, options);
 }
@ -309,19 +309,19 @@ static PyObject * THPVariable_from_numpy(PyObject* module, PyObject* arg)
 }
 static Tensor dispatch_nonzero(const Tensor & self) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  OptionalDeviceGuard device_guard(device_of(self));
  return self.nonzero();
 }
 static Tensor dispatch_nonzero(const Tensor & self, Tensor out) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  OptionalDeviceGuard device_guard(device_of(self));
  return at::nonzero_out(out, self);
 }
 static std::vector<Tensor> dispatch_nonzero_numpy(const Tensor & self) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  OptionalDeviceGuard device_guard(device_of(self));
  return self.nonzero_numpy();
 }
--- a/tools/autograd/templates/python_torch_functions_dispatch.h
+++ b/tools/autograd/templates/python_torch_functions_dispatch.h
@ -5,10 +5,10 @@
 #include "torch/csrc/autograd/generated/VariableType.h"
 #include "torch/csrc/autograd/generated/variable_factories.h"
 #include "torch/csrc/tensor/python_tensor.h"
-#include "torch/csrc/utils/auto_gil.h"
 #include "torch/csrc/utils/cuda_lazy_init.h"
 #include <ATen/ATen.h>
+#include <pybind11/pybind11.h>
 // Contains inline wrappers around ATen functions that release the GIL and
 // switch to the correct CUDA device.
--- a/tools/autograd/templates/python_variable_methods.cpp
+++ b/tools/autograd/templates/python_variable_methods.cpp
@ -197,7 +197,7 @@ static PyObject * THPVariable_numel(PyObject* self, PyObject* args)
 }
 static Tensor dispatch_contiguous(const Tensor & self, at::MemoryFormat memory_format) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  OptionalDeviceGuard device_guard(device_of(self));
  return self.contiguous(memory_format);
 }
@ -235,7 +235,7 @@ static PyObject * THPVariable_contiguous(PyObject* self, PyObject* args, PyObjec
 }
 static Tensor dispatch_copy_(Tensor & self, const Tensor & other, bool non_blocking) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  OptionalDeviceGuard device_guard(device_of(self));
  return self.copy_(other, non_blocking);
 }
@ -255,7 +255,7 @@ static Tensor dispatch_copy_(Tensor & self, const Tensor & other, bool non_block
 }
 static double dispatch_to_CDouble(const Tensor & self) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  OptionalDeviceGuard device_guard(device_of(self));
  if (self.numel() != 1) {
    throw ValueError("only one element tensors can be converted to Python scalars");
@ -264,7 +264,7 @@ static double dispatch_to_CDouble(const Tensor & self) {
 }
 static std::complex<double> dispatch_to_CComplexDouble(const Tensor & self) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  OptionalDeviceGuard device_guard(device_of(self));
  if (self.numel() != 1) {
    throw ValueError("only one element tensors can be converted to Python scalars");
@ -273,7 +273,7 @@ static std::complex<double> dispatch_to_CComplexDouble(const Tensor & self) {
 }
 static int64_t dispatch_to_CLong(const Tensor & self) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  OptionalDeviceGuard device_guard(device_of(self));
  if (self.numel() != 1) {
    throw ValueError("only one element tensors can be converted to Python scalars");
@ -282,7 +282,7 @@ static int64_t dispatch_to_CLong(const Tensor & self) {
 }
 static bool dispatch_to_Bool(const Tensor & self) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  OptionalDeviceGuard device_guard(device_of(self));
  if (self.numel() != 1) {
    throw ValueError("only one element tensors can be converted to Python scalars");
@ -328,7 +328,7 @@ static PyObject * THPVariable_index_scalar(PyObject* self, PyObject* args) {
 }
 static Tensor dispatch_invert(const Tensor & self) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  OptionalDeviceGuard device_guard(device_of(self));
  return self.bitwise_not();
 }
@ -344,7 +344,7 @@ static PyObject * THPVariable_invert(PyObject* self, PyObject* args) {
 }
 static Tensor dispatch_to(const Tensor & self, Device device, bool non_blocking, bool copy, c10::optional<c10::MemoryFormat> optional_memory_format) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  // NOTE: this is where we record aten::to in the graph during tracing. However, the behavior of aten::to
  // is different with respect to TensorOptions fields that are not present: aten::to inherits fields that
  // are missing from the self argument while the tracer assumes that they should be populated with the
@ -354,12 +354,12 @@ static Tensor dispatch_to(const Tensor & self, Device device, bool non_blocking,
 }
 static Tensor dispatch_to(const Tensor & self, ScalarType dtype, bool non_blocking, bool copy, c10::optional<c10::MemoryFormat> optional_memory_format) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  return self.to(dtype, non_blocking, copy, optional_memory_format);
 }
 static Tensor dispatch_to(const Tensor & self, Device device, ScalarType dtype, bool non_blocking, bool copy, c10::optional<c10::MemoryFormat> optional_memory_format) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  return self.to(device, dtype, non_blocking, copy, optional_memory_format);
 }
@ -378,13 +378,13 @@ static PyObject * THPVariable_cpu(PyObject* self, PyObject* args, PyObject* kwar
 }
 static Tensor dispatch_nonzero(const Tensor & self) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  OptionalDeviceGuard device_guard(device_of(self));
  return self.nonzero();
 }
 static std::vector<Tensor> dispatch_nonzero_numpy(const Tensor & self) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  OptionalDeviceGuard device_guard(device_of(self));
  return self.nonzero_numpy();
 }
--- a/tools/autograd/templates/python_variable_methods_dispatch.h
+++ b/tools/autograd/templates/python_variable_methods_dispatch.h
@ -6,6 +6,7 @@
 #include <torch/csrc/utils/cuda_lazy_init.h>
 #include <ATen/ATen.h>
+#include <pybind11/pybind11.h>
 // Contains inline wrappers around ATen functions that release the GIL and
 // switch to the correct CUDA device.
--- a/tools/build_variables.py
+++ b/tools/build_variables.py
@ -75,7 +75,6 @@ libtorch_sources = [
    "torch/csrc/distributed/rpc/script_resp.cpp",
    "torch/csrc/distributed/rpc/types.cpp",
    "torch/csrc/distributed/rpc/utils.cpp",
    "torch/csrc/Exceptions.cpp",
    "torch/csrc/jit/autodiff.cpp",
    "torch/csrc/jit/attributes.cpp",
    "torch/csrc/jit/argument_spec.cpp",
@ -271,6 +270,7 @@ def add_torch_libs():
        "torch/csrc/Device.cpp",
        "torch/csrc/Dtype.cpp",
        "torch/csrc/DynamicTypes.cpp",
        "torch/csrc/Exceptions.cpp",
        "torch/csrc/Generator.cpp",
        "torch/csrc/Layout.cpp",
        "torch/csrc/MemoryFormat.cpp",
--- a/torch/csrc/Exceptions.cpp
+++ b/torch/csrc/Exceptions.cpp
@ -154,7 +154,7 @@ PyWarningHandler::~PyWarningHandler() noexcept(false) {
  c10::Warning::set_warning_handler(prev_handler_);
  if(warning_buffer_.size() > 0) {
-    AutoGIL gil;
+    pybind11::gil_scoped_acquire gil;
    PyObject *ptype, *pvalue, *ptraceback;
    PyErr_Fetch(&ptype, &pvalue, &ptraceback);
--- a/torch/csrc/Exceptions.h
+++ b/torch/csrc/Exceptions.h
@ -7,6 +7,7 @@
 #include <mutex>
 #include <c10/util/Exception.h>
+#include <pybind11/pybind11.h>
 #include <torch/csrc/THP_export.h>
 #include <torch/csrc/utils/auto_gil.h>
 #include <torch/csrc/jit/script/jit_exception.h>
@ -124,7 +125,7 @@ struct python_error : public std::exception {
        value(other.value),
        traceback(other.traceback),
        message(other.message) {
-    AutoGIL gil;
+    pybind11::gil_scoped_acquire gil;
    Py_XINCREF(type);
    Py_XINCREF(value);
    Py_XINCREF(traceback);
@ -142,7 +143,7 @@ struct python_error : public std::exception {
  ~python_error() override {
    if (type || value || traceback) {
-      AutoGIL gil;
+      pybind11::gil_scoped_acquire gil;
      Py_XDECREF(type);
      Py_XDECREF(value);
      Py_XDECREF(traceback);
@ -155,7 +156,7 @@ struct python_error : public std::exception {
  void build_message() {
    // Ensure we have the GIL.
-    AutoGIL gil;
+    pybind11::gil_scoped_acquire gil;
    // No errors should be set when we enter the function since PyErr_Fetch
    // clears the error indicator.
@ -194,7 +195,7 @@ struct python_error : public std::exception {
  inline void persist() {
    if (type) return; // Don't overwrite exceptions
    // PyErr_Fetch overwrites the pointers
-    AutoGIL gil;
+    pybind11::gil_scoped_acquire gil;
    Py_XDECREF(type);
    Py_XDECREF(value);
    Py_XDECREF(traceback);
@ -206,7 +207,7 @@ struct python_error : public std::exception {
  inline void restore() {
    if (!type) return;
    // PyErr_Restore steals references
-    AutoGIL gil;
+    pybind11::gil_scoped_acquire gil;
    Py_XINCREF(type);
    Py_XINCREF(value);
    Py_XINCREF(traceback);
--- a/torch/csrc/README.md
+++ b/torch/csrc/README.md
@ -9,8 +9,8 @@ There are a number of utilities for easing integration with Python which
 are worth knowing about, which we briefly describe here.  But the most
 important gotchas:
-* DO NOT forget to take out the GIL with `AutoGil` before calling Python
+* DO NOT forget to take out the GIL with `pybind11::gil_scoped_acquire`
-  API or bringing a `THPObjectPtr` into scope.
+  before calling Python API or bringing a `THPObjectPtr` into scope.
 * Make sure you include `Python.h` first in your header files, before
  any system headers; otherwise, you will get `error: "_XOPEN_SOURCE" redefined`
@ -96,16 +96,16 @@ at::Tensor foo(at::Tensor x) {
 ```
-### `utils/auto_gil.h`
+### GIL
 Whenever you make any calls to the Python API, you must have taken out
-the Python GIL, as none of these calls are thread safe.  `AutoGIL` is
+the Python GIL, as none of these calls are thread safe.
-a RAII struct which handles taking and releasing the GIL.  Use it like
+`pybind11::gil_scoped_acquire` is a RAII struct which handles taking and
-this:
+releasing the GIL.  Use it like this:
 ```
 void iWantToUsePython() {
-  AutoGil gil;
+  pybind11::gil_scoped_acquire gil;
  ...
 }
 ```
--- a/torch/csrc/autograd/python_anomaly_mode.cpp
+++ b/torch/csrc/autograd/python_anomaly_mode.cpp
@ -1,5 +1,6 @@
 #include <torch/csrc/autograd/python_anomaly_mode.h>
 #include <c10/util/Exception.h>
+#include <pybind11/pybind11.h>
 #include <torch/csrc/Exceptions.h>
 #include <torch/csrc/python_headers.h>
 #include <torch/csrc/utils/auto_gil.h>
@ -11,7 +12,7 @@
 namespace torch { namespace autograd {
 void PyAnomalyMetadata::store_stack() {
-  AutoGIL gil;
+  pybind11::gil_scoped_acquire gil;
  THPObjectPtr mod(PyImport_ImportModule("traceback"));
  if (!mod) {
    throw python_error();
@ -28,7 +29,7 @@ void PyAnomalyMetadata::store_stack() {
 }
 void PyAnomalyMetadata::print_stack() {
-  AutoGIL gil;
+  pybind11::gil_scoped_acquire gil;
  if (!PyDict_Check(dict())) {
    throw std::runtime_error("Anomaly metadata is not a python dictionary.");
  }
--- a/torch/csrc/autograd/python_anomaly_mode.h
+++ b/torch/csrc/autograd/python_anomaly_mode.h
@ -1,5 +1,6 @@
 #pragma once
+#include <pybind11/pybind11.h>
 #include <torch/csrc/autograd/anomaly_mode.h>
 #include <torch/csrc/python_headers.h>
 #include <torch/csrc/utils/auto_gil.h>
@ -10,11 +11,11 @@ struct PyAnomalyMetadata : public AnomalyMetadata {
  static constexpr char* ANOMALY_TRACE_KEY = "traceback_";
  PyAnomalyMetadata() {
-    AutoGIL gil;
+    pybind11::gil_scoped_acquire gil;
    dict_ = PyDict_New();
  }
  ~PyAnomalyMetadata() override {
-    AutoGIL gil;
+    pybind11::gil_scoped_acquire gil;
    Py_DECREF(dict_);
  }
  void store_stack() override;
--- a/torch/csrc/autograd/python_cpp_function.cpp
+++ b/torch/csrc/autograd/python_cpp_function.cpp
@ -10,7 +10,7 @@
 #include <torch/csrc/autograd/python_variable.h>
 #include <torch/csrc/autograd/python_hook.h>
 #include <torch/csrc/autograd/python_anomaly_mode.h>
-#include <torch/csrc/utils/auto_gil.h>
+#include <pybind11/pybind11.h>
 #include <torch/csrc/utils/python_strings.h>
 #include <torch/csrc/DynamicTypes.h>
 #include <torch/csrc/Exceptions.h>
@ -48,7 +48,7 @@ PyObject* THPCppFunction_call(PyObject* self, PyObject* args, PyObject *kwargs)
  variable_list output;
  HANDLE_TH_ERRORS {
-    AutoNoGIL nogil;
+    pybind11::gil_scoped_release nogil;
    output = (*((THPCppFunction*)self)->cdata)(std::move(vars));
  }
  END_HANDLE_TH_ERRORS
--- a/torch/csrc/autograd/python_engine.cpp
+++ b/torch/csrc/autograd/python_engine.cpp
@ -8,7 +8,7 @@
 #include <torch/csrc/autograd/function.h>
 #include <torch/csrc/autograd/python_anomaly_mode.h>
 #include <torch/csrc/autograd/python_function.h>
-#include <torch/csrc/utils/auto_gil.h>
+#include <pybind11/pybind11.h>
 #include <ATen/core/EnableNamedTensor.h>
 #ifndef _WIN32
@ -33,11 +33,11 @@ static Engine& get_python_engine() {
 namespace torch { namespace autograd { namespace python {
 void PythonEngine::thread_init(int device) {
-  // Create a PyThreadState, but release the GIL. This lets AutoGIL calls
+  // Create a PyThreadState, but release the GIL. This lets pybind11::gil_scoped_acquire calls
  // inside thread_main acquire the GIL without having to create a new
  // PyThreadState each time.
-  AutoGIL gil;
+  pybind11::gil_scoped_acquire gil;
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  Engine::thread_init(device);
 }
@ -76,7 +76,7 @@ variable_list PythonEngine::execute_with_graph_task(
  try {
    return Engine::execute_with_graph_task(graph_task, graph_root);
  } catch (python_error& e) {
-    AutoGIL gil;
+    pybind11::gil_scoped_acquire gil;
    if (!PyErr_Occurred()) {
      // Set the error indicator only if it is not set already.
      e.restore();
@ -194,7 +194,7 @@ PyObject *THPEngine_run_backward(THPEngine *self, PyObject *args, PyObject *kwar
  variable_list outputs;
  {
-    AutoNoGIL no_gil;
+    pybind11::gil_scoped_release no_gil;
    outputs = engine.execute(roots, grads, keep_graph, create_graph, output_edges);
  }
@ -219,10 +219,10 @@ PyObject *THPEngine_run_backward(THPEngine *self, PyObject *args, PyObject *kwar
 PyObject* THPEngine_queue_callback(PyObject *self, PyObject *_callback) {
  HANDLE_TH_ERRORS
  _maybe_reinitialize_engine_after_fork();
-  std::shared_ptr<PyObject> callback(_callback, [](PyObject *obj) { AutoGIL gil; Py_DECREF(obj); });
+  std::shared_ptr<PyObject> callback(_callback, [](PyObject *obj) { pybind11::gil_scoped_acquire gil; Py_DECREF(obj); });
  Py_INCREF(_callback);
  engine.queue_callback([callback]() {
-    AutoGIL gil;
+    pybind11::gil_scoped_acquire gil;
    THPObjectPtr result {PyObject_CallFunctionObjArgs(callback.get(), nullptr)};
    if (!result) throw python_error();
  });
--- a/torch/csrc/autograd/python_function.cpp
+++ b/torch/csrc/autograd/python_function.cpp
@ -6,6 +6,7 @@
 #include <unordered_set>
 #include <exception>
 #include <ATen/ATen.h>
+#include <pybind11/pybind11.h>
 #include <torch/csrc/THP.h>
 #include <torch/csrc/autograd/grad_mode.h>
@ -20,7 +21,6 @@
 #include <torch/csrc/jit/ir.h>
 #include <torch/csrc/jit/python_tracer.h>
 #include <torch/csrc/DynamicTypes.h>
-#include <torch/csrc/utils/auto_gil.h>
 #include <torch/csrc/Exceptions.h>
 #include <exception>
@ -45,7 +45,7 @@ PyObject *THPFunctionClass = nullptr;
 namespace torch { namespace autograd {
 auto PyNode::legacy_apply(const variable_list& inputs) -> variable_list {
-  AutoGIL gil;
+  pybind11::gil_scoped_acquire gil;
  THPObjectPtr pyInputs(PyTuple_New(inputs.size()));
  if (!pyInputs) throw python_error();
@ -92,7 +92,7 @@ auto PyNode::legacy_apply(const variable_list& inputs) -> variable_list {
 // it's used by engine.cpp.  This is responsible for forwarding a call from
 // C++'s Node::apply to a Python method "apply".
 auto PyNode::apply(variable_list&& inputs) -> variable_list {
-  AutoGIL gil;
+  pybind11::gil_scoped_acquire gil;
  at::OptionalDeviceGuard _device_guard;
  THPFunction* py_fn = (THPFunction*)obj;
@ -187,7 +187,7 @@ auto PyNode::apply(variable_list&& inputs) -> variable_list {
 }
 auto PyNode::is_traceable() -> bool {
-  AutoGIL gil;
+  pybind11::gil_scoped_acquire gil;
  THPObjectPtr forward_class {PyObject_GetAttrString(obj, "_forward_cls")};
  if (!forward_class) throw python_error();
  THPObjectPtr traceable_py_bool {PyObject_GetAttrString(forward_class, "is_traceable")};
@ -196,14 +196,14 @@ auto PyNode::is_traceable() -> bool {
 }
 auto PyNode::release_variables() -> void {
-  AutoGIL gil;
+  pybind11::gil_scoped_acquire gil;
  auto f = (THPFunction*) obj;
  f->saved_variables.clear();
  f->has_freed_buffers = 1;
 }
 auto PyNode::name() const -> std::string {
-  AutoGIL gil;
+  pybind11::gil_scoped_acquire gil;
  auto f = (THPFunction*) obj;
  auto name = std::string(Py_TYPE(f)->tp_name);
  // Python API functions are not const-correct
--- a/torch/csrc/autograd/python_function.h
+++ b/torch/csrc/autograd/python_function.h
@ -38,7 +38,7 @@ struct PyNode : public Node {
    // Can't use THPObjectPtr as a field in this class; destructor won't take
    // out GIL!  When I forgot to do this by hand
    // TestAutograd.test_inplace_view_python called me out about it.
-    AutoGIL g;
+    pybind11::gil_scoped_acquire g;
    Py_DECREF(obj);
  }
 };
--- a/torch/csrc/autograd/python_hook.cpp
+++ b/torch/csrc/autograd/python_hook.cpp
@ -2,9 +2,9 @@
 #include <sstream>
+#include <pybind11/pybind11.h>
 #include <torch/csrc/THP.h>
 #include <torch/csrc/autograd/python_variable.h>
-#include <torch/csrc/utils/auto_gil.h>
 #include <torch/csrc/utils/object_ptr.h>
 #include <torch/csrc/utils/python_strings.h>
 #include <torch/csrc/Exceptions.h>
@ -29,13 +29,13 @@ PyFunctionPreHook::PyFunctionPreHook(PyObject* dict, int value_idx)
 }
 PyFunctionPreHook::~PyFunctionPreHook() {
-  AutoGIL gil;
+  pybind11::gil_scoped_acquire gil;
  Py_DECREF(dict);
 }
 auto PyFunctionPreHook::operator()(const variable_list& values) -> variable_list
 {
-  AutoGIL gil;
+  pybind11::gil_scoped_acquire gil;
  THPObjectPtr value(THPVariable_Wrap(values.at(value_idx)));
  if (!value) throw python_error();
@ -60,7 +60,7 @@ PyFunctionPostHook::PyFunctionPostHook(PyObject* dict) : dict(dict) {
 }
 PyFunctionPostHook::~PyFunctionPostHook() {
-  AutoGIL gil;
+  pybind11::gil_scoped_acquire gil;
  Py_DECREF(dict);
 }
@ -68,7 +68,7 @@ auto PyFunctionPostHook::operator()(
    const variable_list& _outputs, /* grad_inputs */
    const variable_list& _inputs /* grad_outputs */) -> variable_list
 {
-  AutoGIL gil;
+  pybind11::gil_scoped_acquire gil;
  THPObjectPtr outputs(wrap_variables(_outputs));
  THPObjectPtr inputs(wrap_variables(_inputs));
--- a/torch/csrc/autograd/python_variable.cpp
+++ b/torch/csrc/autograd/python_variable.cpp
@ -17,7 +17,7 @@
 #include <torch/csrc/autograd/utils/error_messages.h>
 #include <torch/csrc/autograd/utils/wrap_outputs.h>
 #include <torch/csrc/tensor/python_tensor.h>
-#include <torch/csrc/utils/auto_gil.h>
+#include <pybind11/pybind11.h>
 #include <torch/csrc/utils/cuda_lazy_init.h>
 #include <torch/csrc/utils/pybind.h>
 #include <torch/csrc/utils/python_strings.h>
--- a/torch/csrc/autograd/python_variable_indexing.cpp
+++ b/torch/csrc/autograd/python_variable_indexing.cpp
@ -240,14 +240,14 @@ static std::vector<Tensor> typeConvertIndices(const Variable& self, const variab
 }
 static Variable dispatch_index(const Variable& self, const variable_list& indices) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  std::vector<Tensor> converted_indices = typeConvertIndices(self, indices);
  OptionalDeviceGuard device_guard(device_of(self));
  return self.index(converted_indices);
 }
 static Variable dispatch_index_put_(Variable& self, const variable_list& indices, const Variable& value) {
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  std::vector<Tensor> converted_indices = typeConvertIndices(self, indices);
  OptionalDeviceGuard device_guard(device_of(self));
  return self.index_put_(converted_indices, value);
--- a/torch/csrc/cuda/Event.cpp
+++ b/torch/csrc/cuda/Event.cpp
@ -1,3 +1,4 @@
+#include <pybind11/pybind11.h>
 #include <torch/csrc/cuda/Event.h>
 #include <torch/csrc/cuda/Module.h>
 #include <torch/csrc/cuda/Stream.h>
@ -106,7 +107,10 @@ static PyObject * THCPEvent_record(THCPEvent *self, THCPStream *stream) {
 static PyObject * THCPEvent_wait(THCPEvent *self, THCPStream *stream) {
  HANDLE_TH_ERRORS
-  with_no_gil([&] { self->cuda_event.block(stream->cuda_stream); });
+  {
+    pybind11::gil_scoped_release no_gil;
+    self->cuda_event.block(stream->cuda_stream);
+  }
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
 }
@ -125,7 +129,10 @@ static PyObject * THCPEvent_elapsed_time(THCPEvent *self, THCPEvent *other) {
 static PyObject * THCPEvent_synchronize(THCPEvent *self, PyObject *noargs) {
  HANDLE_TH_ERRORS
-  with_no_gil([&] { self->cuda_event.synchronize(); });
+  {
+    pybind11::gil_scoped_release no_gil;
+    self->cuda_event.synchronize();
+  }
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
 }
--- a/torch/csrc/cuda/Module.cpp
+++ b/torch/csrc/cuda/Module.cpp
@ -217,7 +217,7 @@ PyObject * THCPModule_cudaLockMutex(PyObject *module, PyObject *noargs)
    if (mutex->try_lock())
      break;
    {
-      AutoNoGIL no_gil;
+      pybind11::gil_scoped_release no_gil;
      std::this_thread::sleep_for(std::chrono::microseconds(10));
    }
  }
--- a/torch/csrc/cuda/Stream.cpp
+++ b/torch/csrc/cuda/Stream.cpp
@ -1,3 +1,4 @@
+#include <pybind11/pybind11.h>
 #include <torch/csrc/cuda/Stream.h>
 #include <torch/csrc/cuda/Module.h>
 #include <torch/csrc/Device.h>
@ -85,7 +86,10 @@ static PyObject * THCPStream_query(THCPStream *self, PyObject *noargs) {
 static PyObject * THCPStream_synchronize(THCPStream *self, PyObject *noargs) {
  HANDLE_TH_ERRORS
-  with_no_gil([&] { self->cuda_stream.synchronize(); });
+  {
+    pybind11::gil_scoped_release no_gil;
+    self->cuda_stream.synchronize();
+  }
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
 }
--- a/torch/csrc/cuda/python_comm.cpp
+++ b/torch/csrc/cuda/python_comm.cpp
@ -2,7 +2,7 @@
 #include <torch/csrc/cuda/comm.h>
 #include <torch/csrc/cuda/Stream.h>
 #include <torch/csrc/cuda/THCP.h>
-#include <torch/csrc/utils/auto_gil.h>
+#include <pybind11/pybind11.h>
 #include <ATen/core/functional.h>
 #include <ATen/ATen.h>
@ -45,7 +45,7 @@ void initCommMethods(PyObject *module) {
              streams = THPUtils_PySequence_to_CUDAStreamList(handle.ptr());
            }
            // Note: We're holding the GIL up to here.
-            AutoNoGIL no_gil;
+            pybind11::gil_scoped_release no_gil;
            return scatter(tensor, devices, chunk_sizes, dim, streams);
          },
          py::arg("tensor"),
--- a/torch/csrc/cuda/python_nccl.cpp
+++ b/torch/csrc/cuda/python_nccl.cpp
@ -1,5 +1,6 @@
 #include <torch/csrc/cuda/python_nccl.h>
+#include <pybind11/pybind11.h>
 #include <torch/csrc/cuda/nccl.h>
 #include <torch/csrc/DynamicTypes.h>
 #include <torch/csrc/Exceptions.h>
@ -56,7 +57,10 @@ static void destroy_nccl_comm(PyObject* capsule) {
  HANDLE_TH_ERRORS
  ncclComm_t comm = unpack_nccl_comm(capsule);
-  with_no_gil([&] { ncclCommDestroy(comm); });
+  {
+    pybind11::gil_scoped_release no_gil;
+    ncclCommDestroy(comm);
+  }
  END_HANDLE_TH_ERRORS_RET()
 }
@ -118,8 +122,10 @@ PyObject* THCPModule_nccl_init_rank(PyObject* self, PyObject* args) {
  ncclUniqueId commId;
  memcpy(&commId, id, NCCL_UNIQUE_ID_BYTES);
  ncclComm_t comm;
-  with_no_gil(
+  {
-      [&] { NCCL_CHECK(ncclCommInitRank(&comm, nranks, commId, rank)); });
+    pybind11::gil_scoped_release no_gil;
+    NCCL_CHECK(ncclCommInitRank(&comm, nranks, commId, rank));
+  }
  return PyCapsule_New(comm, COMM_CAPSULE_NAME, &destroy_nccl_comm);
  END_HANDLE_TH_ERRORS
 }
@ -153,9 +159,10 @@ PyObject* THCPModule_nccl_reduce(PyObject* self, PyObject* args) {
  std::vector<c10::optional<at::cuda::CUDAStream>> streams = unpack_streams(_streams, inputs.size());
  auto user_comms = unpack_comms(_comms, inputs.size());
-  with_no_gil([&] {
+  {
+    pybind11::gil_scoped_release no_gil;
    torch::cuda::nccl::reduce(inputs, outputs, root, op, streams, user_comms);
-  });
+  }
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
@ -184,7 +191,8 @@ PyObject* THCPModule_nccl_all_reduce(PyObject* self, PyObject* args) {
  auto streams = unpack_streams(_streams, inputs.size());
  auto user_comms = unpack_comms(_comms, inputs.size());
-  with_no_gil([&] {
+  {
+    pybind11::gil_scoped_release no_gil;
    check_inputs(inputs, outputs, 1, 1);
    size_t len = inputs.size();
@ -210,7 +218,7 @@ PyObject* THCPModule_nccl_all_reduce(PyObject* self, PyObject* args) {
          comms[i],
          stream));
    }
-  });
+  }
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
@ -236,8 +244,10 @@ PyObject* THCPModule_nccl_broadcast(PyObject* self, PyObject* args) {
  auto streams = unpack_streams(_streams, inputs.size());
  auto user_comms = unpack_comms(_comms, inputs.size());
-  with_no_gil(
+  {
-      [&] { torch::cuda::nccl::broadcast(inputs, streams, user_comms); });
+    pybind11::gil_scoped_release no_gil;
+    torch::cuda::nccl::broadcast(inputs, streams, user_comms);
+  }
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
@ -263,7 +273,8 @@ PyObject* THCPModule_nccl_all_gather(PyObject* self, PyObject* args) {
  auto streams = unpack_streams(_streams, inputs.size());
  auto user_comms = unpack_comms(_comms, inputs.size());
-  with_no_gil([&] {
+  {
+    pybind11::gil_scoped_release no_gil;
    size_t len = inputs.size();
    check_inputs(inputs, outputs, len, 1);
@ -298,7 +309,7 @@ PyObject* THCPModule_nccl_all_gather(PyObject* self, PyObject* args) {
          stream));
 #endif
    }
-  });
+  }
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
@ -325,7 +336,8 @@ PyObject* THCPModule_nccl_reduce_scatter(PyObject* self, PyObject* args) {
  auto streams = unpack_streams(_streams, inputs.size());
  auto user_comms = unpack_comms(_comms, inputs.size());
-  with_no_gil([&] {
+  {
+    pybind11::gil_scoped_release no_gil;
    size_t len = inputs.size();
    check_inputs(inputs, outputs, 1, len);
@ -351,7 +363,7 @@ PyObject* THCPModule_nccl_reduce_scatter(PyObject* self, PyObject* args) {
          comms[i],
          stream));
    }
-  });
+  }
  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
--- a/torch/csrc/distributed/rpc/py_rref.cpp
+++ b/torch/csrc/distributed/rpc/py_rref.cpp
@ -60,7 +60,7 @@ py::object PyRRef::toHere() {
      {
        // acquiring GIL as torch::jit::toPyObject creates new py::object
        // without grabbing the GIL.
-        AutoGIL ag;
+        pybind11::gil_scoped_acquire ag;
        return torch::jit::toPyObject(std::move(value));
      }
    }
@ -81,7 +81,7 @@ py::object PyRRef::localValue() {
    {
      // acquiring GIL as the return statement construct a new py::object from
      // a const reference.
-      AutoGIL ag;
+      pybind11::gil_scoped_acquire ag;
      return value;
    }
  } else {
@ -90,7 +90,7 @@ py::object PyRRef::localValue() {
    {
      // acquiring GIL as torch::jit::toPyObject creates new py::object without
      // grabbing the GIL.
-      AutoGIL ag;
+      pybind11::gil_scoped_acquire ag;
      return torch::jit::toPyObject(std::move(value));
    }
  }
--- a/torch/csrc/distributed/rpc/python_functions.cpp
+++ b/torch/csrc/distributed/rpc/python_functions.cpp
@ -106,7 +106,7 @@ py::object toPyObjInternal(RpcCommandBase& rpc, MessageType messageType) {
      Stack stack;
      stack.push_back(ret.value());
      {
-        AutoGIL ag;
+        pybind11::gil_scoped_acquire ag;
        // The createPyObjectForStack does not acquire GIL, but creating a new
        // py::object requires GIL.
        return torch::jit::createPyObjectForStack(std::move(stack));
--- a/torch/csrc/distributed/rpc/python_rpc_handler.cpp
+++ b/torch/csrc/distributed/rpc/python_rpc_handler.cpp
@ -19,7 +19,7 @@ py::object getFunction(const py::object& module, const char* name) {
 } // namespace
 PythonRpcHandler::PythonRpcHandler() {
-  AutoGIL ag;
+  pybind11::gil_scoped_acquire ag;
  py::object module = py::module::import("torch.distributed.rpc.internal");
  pyRunFunction_ = getFunction(module, "_run_function");
  pyLoadReturnValue_ = getFunction(module, "_load_return_value");
@ -28,7 +28,7 @@ PythonRpcHandler::PythonRpcHandler() {
 }
 void PythonRpcHandler::cleanup() {
-  AutoGIL ag;
+  pybind11::gil_scoped_acquire ag;
  pyRunFunction_ = py::none();
  pyLoadReturnValue_ = py::none();
  pySerialize_ = py::none();
@ -44,7 +44,7 @@ std::vector<char> PythonRpcHandler::generatePythonUDFResult(
    const std::vector<char>& pickledPayload,
    const std::vector<torch::Tensor>& requestTensorTable,
    std::vector<torch::Tensor>& responseTensorTable) {
-  AutoGIL ag;
+  pybind11::gil_scoped_acquire ag;
  auto pargs = py::bytes(pickledPayload.data(), pickledPayload.size());
  py::tuple pres = pySerialize_(pyRunFunction_(pargs, requestTensorTable));
  const auto& presStr = pres[0].cast<std::string>();
@ -56,33 +56,33 @@ std::vector<char> PythonRpcHandler::generatePythonUDFResult(
 py::object PythonRpcHandler::loadPythonUDFResult(
    const std::vector<char>& pickledPayload,
    const std::vector<torch::Tensor>& tensorTable) {
-  AutoGIL ag;
+  pybind11::gil_scoped_acquire ag;
  auto pargs = py::bytes(pickledPayload.data(), pickledPayload.size());
  return pyLoadReturnValue_(pargs, tensorTable);
 }
 py::object PythonRpcHandler::runPythonUDF(
    const SerializedPyObj& serializedObj) {
-  AutoGIL ag;
+  pybind11::gil_scoped_acquire ag;
  return pyRunFunction_(
      py::bytes(serializedObj.payload_), serializedObj.tensors_);
 }
 SerializedPyObj PythonRpcHandler::serialize(const py::object& obj) {
-  AutoGIL ag;
+  pybind11::gil_scoped_acquire ag;
  py::tuple t = pySerialize_(obj);
  return SerializedPyObj(
      t[0].cast<std::string>(), t[1].cast<std::vector<torch::Tensor>>());
 }
 py::object PythonRpcHandler::deserialize(const SerializedPyObj& serializedObj) {
-  AutoGIL ag;
+  pybind11::gil_scoped_acquire ag;
  return pyLoadReturnValue_(
      py::bytes(serializedObj.payload_), serializedObj.tensors_);
 }
 void PythonRpcHandler::handleException(const py::object& obj) {
-  AutoGIL ag;
+  pybind11::gil_scoped_acquire ag;
  pyHandleException_(obj);
 }
--- a/torch/csrc/distributed/rpc/rref_context.cpp
+++ b/torch/csrc/distributed/rpc/rref_context.cpp
@ -36,7 +36,7 @@ RRefContext::RRefContext(std::shared_ptr<RpcAgent> agent)
 RRefContext::~RRefContext() {
  if (!owners_.empty()) {
-    AutoGIL ag;
+    pybind11::gil_scoped_acquire ag;
    owners_.clear();
  }
 }
@ -375,7 +375,7 @@ void RRefContext::delForkOfOwner(const RRefId& rrefId, const ForkId& forkId) {
    }
  }
  if (deletedRRef && deletedRRef->isPyObj()) {
-    AutoGIL ag;
+    pybind11::gil_scoped_acquire ag;
    deletedRRef.reset();
  }
 }
--- a/torch/csrc/jit/init.cpp
+++ b/torch/csrc/jit/init.cpp
@ -1,4 +1,3 @@
-#include <torch/csrc/utils/auto_gil.h>
 #include <torch/csrc/utils/pybind.h>
 #include <torch/csrc/jit/argument_spec.h>
@ -53,7 +52,6 @@
 #include <torch/csrc/jit/script/module.h>
 #include <torch/csrc/jit/script/python_tree_views.h>
 #include <torch/csrc/jit/tracer.h>
-#include <torch/csrc/utils/auto_gil.h>
 #include <c10/macros/Export.h>
 #include <caffe2/serialize/inline_container.h>
@ -285,7 +283,7 @@ void initJITBindings(PyObject* module) {
            // happen to initialize the autograd engine in these tests, the
            // newly spawned worker threads will try to initialize their
            // PyThreadState*, and they need the GIL for this.
-            AutoNoGIL _no_gil;
+            pybind11::gil_scoped_release _no_gil;
            return runJITCPPTests(runCuda);
          },
          py::arg("run_cuda"))
--- a/torch/csrc/jit/pybind_utils.h
+++ b/torch/csrc/jit/pybind_utils.h
@ -1,8 +1,10 @@
 #pragma once
+#include <ATen/core/EnableNamedTensor.h>
 #include <ATen/core/ivalue.h>
 #include <ATen/core/jit_type.h>
 #include <ATen/core/stack.h>
+#include <pybind11/pybind11.h>
 #include <torch/csrc/Device.h>
 #include <torch/csrc/Dtype.h>
 #include <torch/csrc/Layout.h>
@ -15,10 +17,9 @@
 #include <torch/csrc/jit/script/module_python.h>
 #include <torch/csrc/jit/script/schema_matching.h>
 #include <torch/csrc/jit/tracer.h>
-#include <torch/csrc/utils/auto_gil.h>
 #include <torch/csrc/utils/pybind.h>
 #include <torch/csrc/utils/six.h>
-#include <ATen/core/EnableNamedTensor.h>
+#include <torch/csrc/utils/auto_gil.h>
 #include <ATen/core/function_schema.h>
 #include <c10/util/Exception.h>
@ -862,7 +863,7 @@ inline py::object runAndInsertCall(
      callee.getSchema(), std::move(args), std::move(kwargs), std::move(self));
  auto tracing_state = tracer::getTracingState();
  if (!tracing_state) {
-    AutoNoGIL no_gil_guard;
+    pybind11::gil_scoped_release no_gil_guard;
    // If we're not tracing, just run the callee as normal.
    callee.run(stack);
  } else {
@ -893,7 +894,7 @@ inline py::object runAndInsertCall(
    // Actually run the callee. Pause the tracer so that we don't double-add the
    // callee nodes.
    {
-      AutoNoGIL no_gil_guard;
+      pybind11::gil_scoped_release no_gil_guard;
      ResourceGuard guard(tracer::pauseTracing());
      callee.run(stack);
    }
--- a/torch/csrc/jit/python_interpreter.cpp
+++ b/torch/csrc/jit/python_interpreter.cpp
@ -14,11 +14,11 @@
 #include <typeinfo>
+#include <pybind11/pybind11.h>
 #include <torch/csrc/Exceptions.h>
 #include <torch/csrc/autograd/python_engine.h>
 #include <torch/csrc/autograd/python_variable.h>
 #include <torch/csrc/jit/pybind.h>
-#include <torch/csrc/utils/auto_gil.h>
 namespace py = pybind11;
@ -29,7 +29,7 @@ namespace {
 // Note: const_cast is used twice below to acquire a handle to a pyobject.
 Operation createPythonOperation(const Node* op_) {
-  AutoGIL gil;
+  pybind11::gil_scoped_acquire gil;
  const ConcretePythonOp* op = static_cast<const ConcretePythonOp*>(op_);
  const py::function func = py::reinterpret_borrow<const py::function>(
      // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
@ -44,7 +44,7 @@ Operation createPythonOperation(const Node* op_) {
  AT_ASSERT(op->outputs().size() == 1);
  return [=](Stack& stack) {
-    AutoGIL gil;
+    pybind11::gil_scoped_acquire gil;
    py::tuple py_inputs(op->cconv.size());
    size_t i = 0;
    size_t next_scalar = 0;
--- a/torch/csrc/jit/python_ir.cpp
+++ b/torch/csrc/jit/python_ir.cpp
@ -1,5 +1,6 @@
 #include <torch/csrc/jit/python_ir.h>
+#include <pybind11/pybind11.h>
 #include <torch/csrc/jit/argument_spec.h>
 #include <torch/csrc/jit/export.h>
 #include <torch/csrc/jit/ir.h>
@ -9,7 +10,6 @@
 #include <torch/csrc/jit/pybind.h>
 #include <torch/csrc/jit/python_tracer.h>
 #include <torch/csrc/python_headers.h>
-#include <torch/csrc/utils/auto_gil.h>
 #include <torch/csrc/utils/pybind.h>
 #include <torch/csrc/utils/python_strings.h>
@ -24,7 +24,7 @@ Symbol ConcretePythonOp::Kind = prim::PythonOp;
 using c10::Type;
 std::string getPythonName(const PyObject* obj_) {
-  AutoGIL gil;
+  pybind11::gil_scoped_acquire gil;
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
  PyObject* obj = const_cast<PyObject*>(obj_);
  auto v = py::getattr(obj, "__name__", py::str("<python_value>"));
@ -33,7 +33,7 @@ std::string getPythonName(const PyObject* obj_) {
 }
 std::ostream& printPyObject(std::ostream& out, const THPObjectPtr& obj) {
-  AutoGIL gil;
+  pybind11::gil_scoped_acquire gil;
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
  auto pyobj = py::handle(const_cast<PyObject*>(obj.get()));
  if (py::isinstance<py::tuple>(pyobj)) {
@ -125,7 +125,7 @@ Node* findNode(Block* block, Symbol kind, bool recurse = true) {
 }
 std::string ConcretePythonOp::name() const {
-  AutoGIL gil;
+  pybind11::gil_scoped_acquire gil;
  if (auto autograd = autogradFunction()) {
    return getPythonName(autograd->get());
  } else {
@ -149,7 +149,7 @@ void ConcretePythonOp::cloneFrom(Node* other_) {
 // was originally SomeFunction.apply
 // used in ONNX for discovering symbolics
 c10::optional<THPObjectPtr> ConcretePythonOp::autogradFunction() const {
-  AutoGIL gil;
+  pybind11::gil_scoped_acquire gil;
  // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
  py::handle obj = const_cast<PyObject*>(pyobj.get());
--- a/torch/csrc/jit/python_tracer.cpp
+++ b/torch/csrc/jit/python_tracer.cpp
@ -28,7 +28,7 @@ SourceRange getPythonInterpreterSourceRange() {
  size_t source_line = 0;
  std::stringstream stack_trace;
-  AutoGIL gil;
+  pybind11::gil_scoped_acquire gil;
  PyFrameObject* frame = PyEval_GetFrame();
  while (nullptr != frame) {
@ -57,9 +57,10 @@ std::pair<std::shared_ptr<Graph>, Stack> createGraphByTracing(
    script::Module* self) {
  C10_LOG_API_USAGE_ONCE("torch.tracer");
-  auto lookup_fn_adapter = [var_name_lookup_fn](const Variable& var) -> std::string {
+  auto lookup_fn_adapter =
-      AutoGIL ag;
+      [var_name_lookup_fn](const Variable& var) -> std::string {
-      return py::cast<std::string>(var_name_lookup_fn(var));
+    pybind11::gil_scoped_acquire ag;
+    return py::cast<std::string>(var_name_lookup_fn(var));
  };
  auto outs = tracer::trace(
@ -114,7 +115,7 @@ void pythonRecordSourceLocation(Node* n) {
 }
 void pythonWarn(const std::string& reason) {
-  AutoGIL gil;
+  pybind11::gil_scoped_acquire gil;
  auto warn_class = py::module::import("torch.jit").attr("TracerWarning");
  PyErr_WarnEx(warn_class.ptr(), reason.c_str(), 1);
 }
@ -175,7 +176,7 @@ void initPythonTracerBindings(PyObject* module) {
    AT_ASSERT(tracing_state);
    tracing_state->lookup_var_name_fn =
        [func](const Variable& var) -> std::string {
-      AutoGIL ag;
+      pybind11::gil_scoped_acquire ag;
      return py::cast<std::string>(func(var));
    };
  });
--- a/torch/csrc/jit/script/init.cpp
+++ b/torch/csrc/jit/script/init.cpp
@ -82,7 +82,7 @@ struct PythonResolver : public Resolver {
      const std::string& name,
      Function& m,
      const SourceRange& loc) override {
-    AutoGIL ag;
+    pybind11::gil_scoped_acquire ag;
    py::object obj = rcb_(name);
    if (obj.is(py::none())) {
      return nullptr;
@ -101,7 +101,7 @@ struct PythonResolver : public Resolver {
    if (classType_ && name == classname_) {
      return classType_;
    }
-    AutoGIL ag;
+    pybind11::gil_scoped_acquire ag;
    py::object obj = rcb_(name);
    if (obj.is(py::none())) {
      return nullptr;
--- a/torch/csrc/utils/auto_gil.h
+++ b/torch/csrc/utils/auto_gil.h
@ -2,12 +2,16 @@
 // RAII structs to acquire and release Python's global interpreter lock (GIL)
+#include <c10/util/Deprecated.h>
 #include <torch/csrc/python_headers.h>
+// TODO: Deprecate these structs after we land this diff
+// (to avoid -Werror failures)
 // Acquires the GIL on construction
-struct AutoGIL {
+struct /* C10_DEPRECATED_MESSAGE(
-  AutoGIL() : gstate(PyGILState_Ensure()) {
+    "Use pybind11::gil_scoped_acquire instead") */ AutoGIL {
-  }
+  AutoGIL() : gstate(PyGILState_Ensure()) {}
  ~AutoGIL() {
    PyGILState_Release(gstate);
  }
@ -16,9 +20,9 @@ struct AutoGIL {
 };
 // Releases the GIL on construction
-struct AutoNoGIL {
+struct /* C10_DEPRECATED_MESSAGE(
-  AutoNoGIL() : save(PyEval_SaveThread()) {
+    "Use pybind11::gil_scoped_release instead") */ AutoNoGIL {
-  }
+  AutoNoGIL() : save(PyEval_SaveThread()) {}
  ~AutoNoGIL() {
    PyEval_RestoreThread(save);
  }
@ -27,8 +31,10 @@ struct AutoNoGIL {
 };
 // Runs the function without the GIL
-template<typename F>
+template <typename F>
-inline void with_no_gil(F f) {
+/* C10_DEPRECATED */ inline void with_no_gil(F f) {
+  // TODO: The deprecation here triggers a deprecated use warning
+  // on some versions of compilers; need to avoid this
  AutoNoGIL no_gil;
  f();
 }
--- a/torch/csrc/utils/cuda_lazy_init.cpp
+++ b/torch/csrc/utils/cuda_lazy_init.cpp
@ -11,7 +11,7 @@ namespace utils {
 static bool run_yet = false;
 void cuda_lazy_init() {
-  AutoGIL g;
+  pybind11::gil_scoped_acquire g;
  // Protected by the GIL.  We don't use call_once because under ASAN it
  // has a buggy implementation that deadlocks if an instance throws an
  // exception.  In any case, call_once isn't necessary, because we
--- a/torch/csrc/utils/init.cpp
+++ b/torch/csrc/utils/init.cpp
@ -41,7 +41,7 @@ void initThroughputBenchmarkBindings(PyObject* module) {
        // The benchmark always runs without the GIL. GIL will be used where
        // needed. This will happen only in the nn.Module mode when manipulating
        // inputs and running actual inference
-        AutoNoGIL no_gil_guard;
+        pybind11::gil_scoped_release no_gil_guard;
        return self.benchmark(config);
      });
--- a/torch/csrc/utils/tensor_list.cpp
+++ b/torch/csrc/utils/tensor_list.cpp
@ -1,7 +1,7 @@
 #include <torch/csrc/utils/tensor_list.h>
+#include <pybind11/pybind11.h>
 #include <torch/csrc/Exceptions.h>
-#include <torch/csrc/utils/auto_gil.h>
 #include <torch/csrc/utils/python_scalars.h>
 using namespace at;
@ -31,9 +31,8 @@ static PyObject* recursive_to_list(
 PyObject* tensor_to_list(const Tensor& tensor) {
  Tensor data = tensor;
  if (data.type().backend() != Backend::CPU) {
-    with_no_gil([&]() {
+    pybind11::gil_scoped_release no_gil;
-      data = data.toBackend(Backend::CPU);
+    data = data.toBackend(Backend::CPU);
-    });
  }
  return recursive_to_list(
      (char*)data.data_ptr(), data.sizes(), data.strides(), 0,
--- a/torch/csrc/utils/tensor_new.cpp
+++ b/torch/csrc/utils/tensor_new.cpp
@ -1,11 +1,11 @@
 #include <torch/csrc/python_headers.h>
 #include <torch/csrc/utils/tensor_new.h>
+#include <pybind11/pybind11.h>
 #include <torch/csrc/DynamicTypes.h>
 #include <torch/csrc/Exceptions.h>
 #include <torch/csrc/Size.h>
 #include <torch/csrc/autograd/variable.h>
-#include <torch/csrc/utils/auto_gil.h>
 #include <torch/csrc/utils/cuda_lazy_init.h>
 #include <torch/csrc/utils/numpy_stub.h>
 #include <torch/csrc/utils/python_arg_parser.h>
@ -87,25 +87,25 @@ void maybe_initialize_cuda(const Device device) {
 Tensor dispatch_zeros(c10::TensorTypeId type_id, at::ScalarType scalar_type, const optional<Device>& device, IntArrayRef sizes) {
  maybe_initialize_cuda(type_id);
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  return torch::zeros(sizes, options(type_id, scalar_type, device));
 }
 Tensor dispatch_ones(c10::TensorTypeId type_id, at::ScalarType scalar_type, const optional<Device>& device, IntArrayRef sizes) {
  maybe_initialize_cuda(type_id);
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  return torch::ones(sizes, options(type_id, scalar_type, device));
 }
 Tensor dispatch_full(c10::TensorTypeId type_id, at::ScalarType scalar_type, Scalar fill_value, const optional<Device>& device, IntArrayRef sizes) {
  maybe_initialize_cuda(type_id);
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  return torch::full(sizes, fill_value, options(type_id, scalar_type, device));
 }
 Tensor new_with_sizes(c10::TensorTypeId type_id, at::ScalarType scalar_type, const optional<Device>& device, IntArrayRef sizes) {
  maybe_initialize_cuda(type_id);
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  return torch::empty(sizes, options(type_id, scalar_type, device));
 }
@ -247,7 +247,7 @@ Tensor internal_new_from_data(
    // are defined per-layout-type (e.g. tensor vs sparse_coo_tensor).
    const auto& inferred_scalar_type = type_inference ? var.scalar_type() : scalar_type;
    auto device = device_opt.has_value() ? *device_opt : (type_inference ? var.device() : at::Device(computeDeviceType(type_id)));
-    AutoNoGIL no_gil;
+    pybind11::gil_scoped_release no_gil;
    maybe_initialize_cuda(device);
    return var.to(device, inferred_scalar_type, /*non_blocking=*/false, /*copy=*/copy_variables);
  }
@ -258,7 +258,7 @@ Tensor internal_new_from_data(
    auto tensor = tensor_from_cuda_array_interface(data);
    const auto& inferred_scalar_type = type_inference ? tensor.scalar_type() : scalar_type;
    auto device = device_opt.has_value() ? *device_opt : at::Device(computeDeviceType(type_id));
-    AutoNoGIL no_gil;
+    pybind11::gil_scoped_release no_gil;
    maybe_initialize_cuda(device);
    return tensor.to(device, inferred_scalar_type, /*non_blocking=*/false, /*copy=*/copy_numpy);
  }
@ -268,7 +268,7 @@ Tensor internal_new_from_data(
    auto tensor = tensor_from_numpy(data);
    const auto& inferred_scalar_type = type_inference ? tensor.scalar_type() : scalar_type;
    auto device = device_opt.has_value() ? *device_opt : at::Device(computeDeviceType(type_id));
-    AutoNoGIL no_gil;
+    pybind11::gil_scoped_release no_gil;
    maybe_initialize_cuda(device);
    return tensor.to(device, inferred_scalar_type, /*non_blocking=*/false, /*copy=*/copy_numpy);
  }
@ -288,7 +288,7 @@ Tensor internal_new_from_data(
        inferred_scalar_type, tensor.dtype().itemsize(), data);
  }
  auto device = device_opt.has_value() ? *device_opt : at::Device(computeDeviceType(type_id));
-  AutoNoGIL no_gil;
+  pybind11::gil_scoped_release no_gil;
  maybe_initialize_cuda(device);
  // However, it is VERY important that we trace the to() call here (even
  // though the reason this is important is a hack).  Without *some* factory
--- a/torch/csrc/utils/tensor_numpy.cpp
+++ b/torch/csrc/utils/tensor_numpy.cpp
@ -168,8 +168,8 @@ at::Tensor tensor_from_numpy(PyObject* obj) {
      sizes,
      strides,
      [obj](void* data) {
-          AutoGIL gil;
+        pybind11::gil_scoped_acquire gil;
-          Py_DECREF(obj);
+        Py_DECREF(obj);
      },
      at::device(kCPU).dtype(numpy_dtype_to_aten(PyArray_TYPE(array)))
  );
@ -320,8 +320,8 @@ at::Tensor tensor_from_cuda_array_interface(PyObject* obj) {
      sizes,
      strides,
      [obj](void* data) {
-          AutoGIL gil;
+        pybind11::gil_scoped_acquire gil;
-          Py_DECREF(obj);
+        Py_DECREF(obj);
      },
      at::device(kCUDA).dtype(dtype)
  );
--- a/torch/csrc/utils/throughput_benchmark.cpp
+++ b/torch/csrc/utils/throughput_benchmark.cpp
@ -1,7 +1,7 @@
 #include <torch/csrc/utils/throughput_benchmark.h>
+#include <pybind11/pybind11.h>
 #include <torch/csrc/jit/pybind_utils.h>
-#include <torch/csrc/utils/auto_gil.h>
 namespace torch {
 namespace throughput_benchmark {
@ -21,7 +21,7 @@ py::object ThroughputBenchmark::runOnce(py::args&& args, py::kwargs&& kwargs)  {
  if (script_module_.initialized()) {
    c10::IValue result;
    {
-      AutoNoGIL no_gil_guard;
+      pybind11::gil_scoped_release no_gil_guard;
      result = script_module_.runOnce(std::move(args), std::move(kwargs));
    }
    return jit::toPyObject(std::move(result));
@ -82,7 +82,7 @@ ScriptModuleOutput ScriptModuleBenchmark::runOnce(
 template <>
 void ModuleBenchmark::runOnce(ModuleInput&& input) const {
  CHECK(initialized_);
-  AutoGIL gil_guard;
+  pybind11::gil_scoped_acquire gil_guard;
  model_(*input.args, **input.kwargs);
 }
@ -90,7 +90,7 @@ template <>
 ModuleOutput ModuleBenchmark::runOnce(py::args&& args, py::kwargs&& kwargs)
    const {
  CHECK(initialized_);
-  AutoGIL gil_guard;
+  pybind11::gil_scoped_acquire gil_guard;
  return model_(*args, **kwargs);
 }
@ -111,7 +111,7 @@ void ModuleBenchmark::addInput(py::args&& args, py::kwargs&& kwargs) {
 template <>
 ModuleInput cloneInput<ModuleInput>(const ModuleInput& input) {
-  AutoGIL gil_guard;
+  pybind11::gil_scoped_acquire gil_guard;
  py::args args = input.args;
  py::kwargs kwargs = input.kwargs;
  return {std::move(args), std::move(kwargs)};