Revert "[BE] Make PyObjectSlot use a global PyInterpreter and remove (#158427)"

This reverts commit eb7365072315be2bc4259114e25e269801441748. Reverted https://github.com/pytorch/pytorch/pull/158427 on behalf of https://github.com/ZainRizvi because it is being reverted as part of reverting the stack for https://github.com/pytorch/pytorch/pull/158288 ([comment](https://github.com/pytorch/pytorch/pull/158427#issuecomment-3099815367))
2025-10-20 12:54:11 +08:00 · 2025-07-21 23:14:55 +00:00
parent 1227ed6674
commit 15a50dcf1c
19 changed files with 2884 additions and 3282 deletions
--- a/build_variables.bzl
+++ b/build_variables.bzl
@ -864,7 +864,6 @@ libtorch_python_core_sources = [
    "torch/csrc/QScheme.cpp",
    "torch/csrc/Module.cpp",
    "torch/csrc/PyInterpreter.cpp",
-    "torch/csrc/PyInterpreterHooks.cpp",
    "torch/csrc/python_dimname.cpp",
    "torch/csrc/Size.cpp",
    "torch/csrc/Storage.cpp",
--- a/c10/core/impl/PyInterpreter.h
+++ b/c10/core/impl/PyInterpreter.h
@ -240,4 +240,24 @@ struct C10_API PyInterpreter {
  void disarm() noexcept;
 };

+// PyInterpreterStatus describes what the state of its interpreter tag
+// is, relative to the thread currently holding the GIL.
+enum class PyInterpreterStatus {
+  // We just allocated the Tensor, it hasn't escaped to other threads,
+  // we know that it definitely hasn't been tagged to be associated
+  // with an interpreter.
+  DEFINITELY_UNINITIALIZED,
+  // We queried the interpreter field and it looked uninitialized.  But
+  // another thread may have raced with us to tag it with some other
+  // interpreter id.  So we will have to do a compare-exchange (CEX) to
+  // make sure we can actually nab it.
+  MAYBE_UNINITIALIZED,
+  // We queried the interpreter field and it was tagged to belong to us.
+  // This means we have sole write access (as we hold the GIL for this
+  // interpreter)
+  TAGGED_BY_US,
+  // Someone else tagged this.  We can't use this TensorImpl from Python.
+  TAGGED_BY_OTHER,
+};
+
 } // namespace c10::impl
--- a/c10/core/impl/PyInterpreterHooks.cpp
+++ b/c10/core/impl/PyInterpreterHooks.cpp
@ -1,32 +0,0 @@
-#include <c10/core/impl/PyInterpreterHooks.h>
-
-namespace c10::impl {
-
-// Define the registry
-C10_DEFINE_REGISTRY(
-    PyInterpreterHooksRegistry,
-    PyInterpreterHooksInterface,
-    PyInterpreterHooksArgs)
-
-const PyInterpreterHooksInterface& getPyInterpreterHooks() {
-  auto create_impl = [] {
-#if !defined C10_MOBILE
-    auto hooks = PyInterpreterHooksRegistry()->Create(
-        "PyInterpreterHooks", PyInterpreterHooksArgs{});
-    if (hooks) {
-      return hooks;
-    }
-#endif
-    // Return stub implementation that will throw errors when methods are called
-    return std::make_unique<PyInterpreterHooksInterface>();
-  };
-  static auto hooks = create_impl();
-  return *hooks;
-}
-
-// Main function to get global PyInterpreter
-PyInterpreter* getGlobalPyInterpreter() {
-  return getPyInterpreterHooks().getPyInterpreter();
-}
-
-} // namespace c10::impl
--- a/c10/core/impl/PyInterpreterHooks.h
+++ b/c10/core/impl/PyInterpreterHooks.h
@ -1,39 +0,0 @@
-#pragma once
-
-#include <c10/core/impl/PyInterpreter.h>
-#include <c10/macros/Export.h>
-#include <c10/util/Registry.h>
-#include <memory>
-
-namespace c10::impl {
-
-// Minimal interface for PyInterpreter hooks
-struct C10_API PyInterpreterHooksInterface {
-  virtual ~PyInterpreterHooksInterface() = default;
-
-  // Get the PyInterpreter instance
-  // Stub implementation throws error when Python is not available
-  virtual PyInterpreter* getPyInterpreter() const {
-    TORCH_CHECK(
-        false,
-        "PyTorch was compiled without Python support. "
-        "Cannot access Python interpreter from C++.");
-  }
-};
-
-struct C10_API PyInterpreterHooksArgs{};
-
-C10_DECLARE_REGISTRY(
-    PyInterpreterHooksRegistry,
-    PyInterpreterHooksInterface,
-    PyInterpreterHooksArgs);
-
-#define REGISTER_PYTHON_HOOKS(clsname) \
-  C10_REGISTER_CLASS(PyInterpreterHooksRegistry, clsname, clsname)
-
-// Get the global PyInterpreter hooks instance
-C10_API const PyInterpreterHooksInterface& getPyInterpreterHooks();
-
-C10_API PyInterpreter* getGlobalPyInterpreter();
-
-} // namespace c10::impl
--- a/c10/core/impl/PyObjectSlot.cpp
+++ b/c10/core/impl/PyObjectSlot.cpp
@ -34,6 +34,11 @@ PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
      reinterpret_cast<uintptr_t>(pyobj_) & ~0x1ULL);
 }

+void PyObjectSlot::unchecked_clear_pyobj(PyInterpreter* interpreter) {
+  TORCH_INTERNAL_ASSERT_DEBUG_ONLY(interpreter == pyobj_interpreter_.load());
+  pyobj_ = nullptr;
+}
+
 PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
  auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
  if (interpreter) {
--- a/c10/core/impl/PyObjectSlot.h
+++ b/c10/core/impl/PyObjectSlot.h
@ -2,7 +2,6 @@

 #include <c10/core/impl/HermeticPyObjectTLS.h>
 #include <c10/core/impl/PyInterpreter.h>
-#include <c10/core/impl/PyInterpreterHooks.h>
 #include <c10/util/python_stub.h>
 #include <optional>

@ -25,9 +24,11 @@ struct C10_API PyObjectSlot {
  //
  // NB: THIS FUNCTION CAN RAISE AN EXCEPTION.  Make sure to clean up after
  // PyObject if necessary!
-  void init_pyobj(PyObject* pyobj) {
-    pyobj_interpreter_.store(
-        getGlobalPyInterpreter(), std::memory_order_relaxed);
+  void init_pyobj(
+      PyInterpreter* self_interpreter,
+      PyObject* pyobj,
+      PyInterpreterStatus status) {
+    pyobj_interpreter_.store(self_interpreter, std::memory_order_relaxed);
    pyobj_ = pyobj;
  }

@ -52,10 +53,9 @@ struct C10_API PyObjectSlot {
  //
  // NB: this lives in header so that we can avoid actually creating the
  // std::optional
-
-  // @todo alban: I'm not too sure what's going on here, we can probably delete
-  // it but it's worthwhile making sure
-  std::optional<PyObject*> check_pyobj(bool ignore_hermetic_tls = false) const {
+  std::optional<PyObject*> check_pyobj(
+      PyInterpreter* self_interpreter,
+      bool ignore_hermetic_tls = false) const {
    impl::PyInterpreter* interpreter =
        pyobj_interpreter_.load(std::memory_order_acquire);
    if (interpreter == nullptr) {
@ -69,6 +69,10 @@ struct C10_API PyObjectSlot {
    }
  }

+  // Clear the PyObject field for an interpreter, in situations where we
+  // statically know the tensor is tagged with our interpreter.
+  void unchecked_clear_pyobj(PyInterpreter* interpreter);
+
  PyInterpreter& load_pyobj_interpreter() const;

  bool owns_pyobj();
--- a/functorch/csrc/dim/dim.cpp
+++ b/functorch/csrc/dim/dim.cpp
--- a/torch/_dynamo/trace_rules.py
+++ b/torch/_dynamo/trace_rules.py
@ -583,6 +583,7 @@ torch_c_binding_in_graph_functions = dict.fromkeys(
        "torch._C._dispatch_has_kernel",
        "torch._C._dispatch_is_alias_key",
        "torch._C._dispatch_is_included_in_alias",
+        "torch._C._dispatch_is_main_interpreter",
        "torch._C._dispatch_isTensorSubclassLike",
        "torch._C._dispatch_key_for_device",
        "torch._C._dispatch_key_name",
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@ -407,10 +407,10 @@ static PyObject* THPModule_swap_tensor_impl(PyObject* _unused, PyObject* args) {
  // associated with the TensorImpl. Swap this field as well.
  std::optional<PyObject*> mb_obj_a =
      a->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
-          /*ignore_hermetic_tls=*/false);
+          getPyInterpreter(), /*ignore_hermetic_tls=*/false);
  std::optional<PyObject*> mb_obj_b =
      b->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
-          /*ignore_hermetic_tls=*/false);
+          getPyInterpreter(), /*ignore_hermetic_tls=*/false);
  TORCH_INTERNAL_ASSERT(
      mb_obj_a.has_value() && mb_obj_b.has_value(),
      "Both tensors should have PyObjects tagged by the current python interpreter");
@ -420,8 +420,10 @@ static PyObject* THPModule_swap_tensor_impl(PyObject* _unused, PyObject* args) {
  a->cdata = b->cdata;
  b->cdata = tmp;

-  a->cdata->unsafeGetTensorImpl()->pyobj_slot()->init_pyobj(a_);
-  b->cdata->unsafeGetTensorImpl()->pyobj_slot()->init_pyobj(b_);
+  a->cdata->unsafeGetTensorImpl()->pyobj_slot()->init_pyobj(
+      getPyInterpreter(), a_, c10::impl::PyInterpreterStatus::TAGGED_BY_US);
+  b->cdata->unsafeGetTensorImpl()->pyobj_slot()->init_pyobj(
+      getPyInterpreter(), b_, c10::impl::PyInterpreterStatus::TAGGED_BY_US);

  Py_RETURN_NONE;
  END_HANDLE_TH_ERRORS
--- a/torch/csrc/PyInterpreter.cpp
+++ b/torch/csrc/PyInterpreter.cpp
@ -586,7 +586,7 @@ static void set_tensor_attr_with_capsule(
    py::capsule& capsule,
    const char* attr_name) {
  std::optional<PyObject*> mb_obj = tensor->pyobj_slot()->check_pyobj(
-      /*ignore_hermetic_tls=*/false);
+      getPyInterpreter(), /*ignore_hermetic_tls=*/false);
  TORCH_CHECK(
      mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value");
  auto obj = mb_obj.value();
@ -987,3 +987,7 @@ py::handle getTorchApiFunction(const c10::OperatorHandle& op) {
 c10::impl::PyInterpreter* getPyInterpreter() {
  return torch::detail::self_interpreter.get();
 }
+
+bool isMainPyInterpreter() {
+  return torch::detail::self_interpreter.is_main_interpreter();
+}
--- a/torch/csrc/PyInterpreter.h
+++ b/torch/csrc/PyInterpreter.h
@ -10,4 +10,4 @@ TORCH_PYTHON_API py::handle getTorchApiFunction(const c10::OperatorHandle& op);

 // TODO: Move these to a proper namespace
 TORCH_PYTHON_API c10::impl::PyInterpreter* getPyInterpreter();
-TORCH_PYTHON_API void initializeGlobalPyInterpreter();
+TORCH_PYTHON_API bool isMainPyInterpreter();
--- a/torch/csrc/PyInterpreterHooks.cpp
+++ b/torch/csrc/PyInterpreterHooks.cpp
@ -1,20 +0,0 @@
-#include <torch/csrc/PyInterpreter.h>
-#include <torch/csrc/PyInterpreterHooks.h>
-
-namespace torch::detail {
-
-PyInterpreterHooks::PyInterpreterHooks(c10::impl::PyInterpreterHooksArgs) {}
-
-c10::impl::PyInterpreter* PyInterpreterHooks::getPyInterpreter() const {
-  // Delegate to the existing implementation
-  return ::getPyInterpreter();
-}
-
-} // namespace torch::detail
-
-// Sigh, the registry doesn't support namespaces :(
-using c10::impl::PyInterpreterHooksRegistry;
-using c10::impl::RegistererPyInterpreterHooksRegistry;
-using PyInterpreterHooks = torch::detail::PyInterpreterHooks;
-// Register the implementation
-REGISTER_PYTHON_HOOKS(PyInterpreterHooks);
--- a/torch/csrc/PyInterpreterHooks.h
+++ b/torch/csrc/PyInterpreterHooks.h
@ -1,15 +0,0 @@
-#pragma once
-
-#include <c10/core/impl/PyInterpreterHooks.h>
-
-namespace torch::detail {
-
-// Concrete implementation of PyInterpreterHooks
-class PyInterpreterHooks : public c10::impl::PyInterpreterHooksInterface {
- public:
-  explicit PyInterpreterHooks(c10::impl::PyInterpreterHooksArgs);
-
-  c10::impl::PyInterpreter* getPyInterpreter() const override;
-};
-
-} // namespace torch::detail
--- a/torch/csrc/Storage.cpp
+++ b/torch/csrc/Storage.cpp
@ -35,6 +35,7 @@ PyTypeObject* THPStorageClass = nullptr;
 PyObject* THPStorage_NewWithStorage(
    PyTypeObject* type,
    c10::Storage _storage,
+    c10::impl::PyInterpreterStatus status,
    bool allow_preexisting_pyobj) {
  TORCH_CHECK(
      PyType_IsSubtype(type, &THPStorageType),
@ -42,7 +43,7 @@ PyObject* THPStorage_NewWithStorage(
      "Storage is not possible. Make sure your class inherits from Storage.");

  auto maybe_pyobj = _storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj(
-      /*ignore_hermetic_tls=*/false);
+      getPyInterpreter(), /*ignore_hermetic_tls=*/false);
  if (maybe_pyobj.has_value() && maybe_pyobj.value()) {
    TORCH_CHECK(
        allow_preexisting_pyobj,
@ -77,7 +78,8 @@ PyObject* THPStorage_NewWithStorage(
  if (!c10::impl::HermeticPyObjectTLS::get_state()) {
    s->is_hermetic = false;
    const auto& storage = THPStorage_Unpack(s);
-    storage.unsafeGetStorageImpl()->pyobj_slot()->init_pyobj(obj);
+    storage.unsafeGetStorageImpl()->pyobj_slot()->init_pyobj(
+        getPyInterpreter(), obj, status);
  } else {
    s->is_hermetic = true;
  }
@ -89,12 +91,17 @@ PyObject* THPStorage_NewWithStorage(
 PyObject* THPStorage_Wrap(c10::Storage storage) {
  c10::StorageImpl* storage_impl = storage.unsafeGetStorageImpl();
  if (c10::impl::HermeticPyObjectTLS::get_state()) {
-    return THPStorage_NewWithStorage(THPStorageClass, std::move(storage));
+    return THPStorage_NewWithStorage(
+        THPStorageClass,
+        std::move(storage),
+        c10::impl::PyInterpreterStatus::DEFINITELY_UNINITIALIZED);
  }
  c10::impl::PyObjectSlot* pyobj_slot = storage_impl->pyobj_slot();

  std::optional<PyObject*> maybe_pyobj = pyobj_slot->check_pyobj(
-      /*ignore_hermetic_tls=*/false);
+      getPyInterpreter(), /*ignore_hermetic_tls=*/false);
+  c10::impl::PyInterpreterStatus status =
+      c10::impl::PyInterpreterStatus::TAGGED_BY_US;
  if (maybe_pyobj.has_value()) {
    auto obj = *maybe_pyobj;
    if (obj) {
@ -113,8 +120,15 @@ PyObject* THPStorage_Wrap(c10::Storage storage) {
        return obj;
      }
    }
+    status = c10::impl::PyInterpreterStatus::TAGGED_BY_US;
+  } else {
+    if (storage.use_count() <= 1) {
+      status = c10::impl::PyInterpreterStatus::DEFINITELY_UNINITIALIZED;
+    } else {
+      status = c10::impl::PyInterpreterStatus::MAYBE_UNINITIALIZED;
+    }
  }
-  return THPStorage_NewWithStorage(THPStorageClass, std::move(storage));
+  return THPStorage_NewWithStorage(THPStorageClass, std::move(storage), status);
 }

 static bool THPStorage_isPreservable(THPStorage* self) {
@ -128,7 +142,8 @@ static bool THPStorage_isPreservable(THPStorage* self) {
  }

  if (storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj(
-          /*ignore_hermetic_tls=*/true) != (PyObject*)self) {
+          getPyInterpreter(), /*ignore_hermetic_tls=*/true) !=
+      (PyObject*)self) {
    return false;
  }
  if (storage.use_count() <= 1) {
@ -146,10 +161,11 @@ static bool THPStorage_tryPreserve(THPStorage* self) {
  c10::StorageImpl* storage_impl = storage.unsafeGetStorageImpl();

  auto maybe_pyobj = storage_impl->pyobj_slot()->check_pyobj(
+      getPyInterpreter(),
      /*ignore_hermetic_tls=*/true);
  // NOTE: It is possible to just set the PyObjectSlot here, but the point is
-  // that we should have already set PyObjectSlot when the storage PyObject
-  // was created.
+  // that we should have already set PyObjectSlot when the storage PyObject was
+  // created.
  TORCH_INTERNAL_ASSERT(
      maybe_pyobj.has_value(),
      "Trying to preserve a Python storage whose PyObjectSlot does not have a PyObject");
@ -357,7 +373,8 @@ static PyObject* THPStorage_pynew(
            at::DataPtr(),
            allocator,
            /*resizable=*/true,
-            device_opt));
+            device_opt),
+        c10::impl::PyInterpreterStatus::DEFINITELY_UNINITIALIZED);

    // torch.Storage(size, *, ...)
  } else if (r.idx == 1) {
@ -370,7 +387,8 @@ static PyObject* THPStorage_pynew(
            at::DataPtr(),
            allocator,
            /*resizable=*/true,
-            device_opt));
+            device_opt),
+        c10::impl::PyInterpreterStatus::DEFINITELY_UNINITIALIZED);

    // torch.Storage(sequence, *, ...)
  } else if (r.idx == 2) {
@ -394,7 +412,8 @@ static PyObject* THPStorage_pynew(
            at::DataPtr(),
            allocator,
            /*resizable=*/true,
-            device_opt));
+            device_opt),
+        c10::impl::PyInterpreterStatus::DEFINITELY_UNINITIALIZED);
    THPObjectPtr item;
    try {
      const auto& storage = THPStorage_Unpack(self);
@ -490,8 +509,10 @@ static PyObject* THPStorage_get(THPStorage* self, PyObject* index) {
        /* resizable */ false,
        device_opt);

-    PyObject* _ret =
-        THPStorage_NewWithStorage(Py_TYPE(self), std::move(new_storage_impl));
+    PyObject* _ret = THPStorage_NewWithStorage(
+        Py_TYPE(self),
+        std::move(new_storage_impl),
+        c10::impl::PyInterpreterStatus::DEFINITELY_UNINITIALIZED);

    return _ret;
  }
--- a/torch/csrc/Storage.h
+++ b/torch/csrc/Storage.h
@ -19,6 +19,7 @@ TORCH_PYTHON_API PyObject* THPStorage_Wrap(c10::Storage storage);
 TORCH_PYTHON_API PyObject* THPStorage_NewWithStorage(
    PyTypeObject* type,
    c10::Storage _storage,
+    c10::impl::PyInterpreterStatus status,
    bool allow_preexisting_pyobj = false);
 TORCH_PYTHON_API extern PyTypeObject* THPStorageClass;

--- a/torch/csrc/StorageMethods.cpp
+++ b/torch/csrc/StorageMethods.cpp
@ -390,7 +390,10 @@ static PyObject* THPStorage_fromFile(
    storage->set_nbytes(actual_nbytes);
  }

-  return THPStorage_NewWithStorage(THPStorageClass, std::move(storage));
+  return THPStorage_NewWithStorage(
+      THPStorageClass,
+      std::move(storage),
+      c10::impl::PyInterpreterStatus::TAGGED_BY_US);
  END_HANDLE_TH_ERRORS
 }

--- a/torch/csrc/StorageSharing.cpp
+++ b/torch/csrc/StorageSharing.cpp
@ -86,7 +86,8 @@ static PyObject* THPStorage_pyNewFilenameStorage(
          THManagedMapAllocator::makeDataPtr(
              "", handle.c_str(), flags, static_cast<size_t>(size)),
          /*allocator=*/nullptr,
-          /*resizable=*/false));
+          /*resizable=*/false),
+      c10::impl::PyInterpreterStatus::TAGGED_BY_US);
  END_HANDLE_TH_ERRORS
 }

@ -181,7 +182,8 @@ static PyObject* THPStorage_newSharedFilename(
          THManagedMapAllocator::makeDataPtr(
              manager_handle, object_handle, flags, size),
          /*allocator=*/nullptr,
-          /*resizable=*/false));
+          /*resizable=*/false),
+      c10::impl::PyInterpreterStatus::TAGGED_BY_US);
  END_HANDLE_TH_ERRORS
 }

@ -195,7 +197,9 @@ static PyObject* THPStorage_pyNewFdStorage(PyObject* _unused, PyObject* args) {
    return nullptr;
  }
  return THPStorage_NewWithStorage(
-      THPStorageClass, at::new_shm_fd_storage(size));
+      THPStorageClass,
+      at::new_shm_fd_storage(size),
+      c10::impl::PyInterpreterStatus::TAGGED_BY_US);
  END_HANDLE_TH_ERRORS
 }

@ -274,7 +278,8 @@ static PyObject* THPStorage_newSharedFd(PyObject* _unused, PyObject* args) {
          at::MapAllocator::makeDataPtr(
              at::WITH_FD, "", fd, flags, size, nullptr),
          /*allocator=*/nullptr,
-          /*resizable=*/false));
+          /*resizable=*/false),
+      c10::impl::PyInterpreterStatus::TAGGED_BY_US);
  END_HANDLE_TH_ERRORS
 }

@ -555,7 +560,10 @@ static PyObject* THPStorage_newSharedCuda(PyObject* _unused, PyObject* args) {
  base->set_resizable(false);
  base->set_received_cuda(true);

-  return THPStorage_NewWithStorage(THPStorageClass, std::move(base));
+  return THPStorage_NewWithStorage(
+      THPStorageClass,
+      std::move(base),
+      c10::impl::PyInterpreterStatus::TAGGED_BY_US);
 #else
  TORCH_CHECK(false, "CUDA is not available");
 #endif
--- a/torch/csrc/autograd/python_variable.cpp
+++ b/torch/csrc/autograd/python_variable.cpp
@ -209,6 +209,7 @@ PyObject* ParameterClass = nullptr;
 static PyObject* THPVariable_NewWithVar(
    PyTypeObject* type,
    const at::TensorBase& _var,
+    c10::impl::PyInterpreterStatus status,
    bool allow_preexisting_pyobj = false);

 // clang-tidy gets confused by static const
@ -260,12 +261,16 @@ PyObject* THPVariable_Wrap(const at::TensorBase& var) {
  }

  if (c10::impl::HermeticPyObjectTLS::get_state()) {
-    return THPVariable_NewWithVar((PyTypeObject*)THPVariableClass, var);
+    return THPVariable_NewWithVar(
+        (PyTypeObject*)THPVariableClass,
+        var,
+        c10::impl::PyInterpreterStatus::DEFINITELY_UNINITIALIZED);
  }

  std::optional<PyObject*> mb_obj =
      var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
-          /*ignore_hermetic_tls=*/false);
+          getPyInterpreter(), /*ignore_hermetic_tls=*/false);
+  c10::impl::PyInterpreterStatus status{};
  if (mb_obj.has_value()) {
    auto obj = *mb_obj;
    if (obj) {
@ -290,17 +295,27 @@ PyObject* THPVariable_Wrap(const at::TensorBase& var) {
    // (https://github.com/pytorch/pytorch/pull/56017).  Prior to this PR
    // being a thing, the PyObject field will get cleared when all references
    // to the Python object are removed.
+    status = c10::impl::PyInterpreterStatus::TAGGED_BY_US;
+  } else {
+    // Assumption: if a Tensor has been shared across threads, this induces
+    // a refcount bump.  Therefore, if the use count is 1, we are the sole
+    // thread with access to this tensor and no race is possible.
+    if (var.use_count() <= 1) {
+      status = c10::impl::PyInterpreterStatus::DEFINITELY_UNINITIALIZED;
+    } else {
+      status = c10::impl::PyInterpreterStatus::MAYBE_UNINITIALIZED;
+    }
  }

  if (C10_LIKELY(var.device().type() != c10::kXLA)) {
-    return THPVariable_NewWithVar((PyTypeObject*)THPVariableClass, var);
+    return THPVariable_NewWithVar((PyTypeObject*)THPVariableClass, var, status);
  }

  if (auto clazz = getPythonTensorClass(var.device())) {
-    return THPVariable_NewWithVar((PyTypeObject*)clazz, var);
+    return THPVariable_NewWithVar((PyTypeObject*)clazz, var, status);
  }

-  return THPVariable_NewWithVar((PyTypeObject*)THPVariableClass, var);
+  return THPVariable_NewWithVar((PyTypeObject*)THPVariableClass, var, status);
 }

 static bool isResurrectable(THPVariable* self) {
@ -329,7 +344,8 @@ static bool isResurrectable(THPVariable* self) {
  }
  // Check if this is hermetic. If it is, no resurrection.
  if (tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
-          /*ignore_hermetic_tls=*/false) != (PyObject*)self) {
+          getPyInterpreter(), /*ignore_hermetic_tls=*/false) !=
+      (PyObject*)self) {
    return false;
  }
  return true;
@ -355,6 +371,7 @@ static bool THPVariable_tryResurrect(THPVariable* self) {

  c10::TensorImpl* tensor_impl = tensor.unsafeGetTensorImpl();
  auto maybe_pyobj = tensor_impl->pyobj_slot()->check_pyobj(
+      getPyInterpreter(),
      /*ignore_hermetic_tls=*/false);

  TORCH_INTERNAL_ASSERT(
@ -570,7 +587,10 @@ static PyObject* THPVariable_as_subclass(
  // stack
  torch_dispatch_mode::StashTorchDispatchStackGuard td_g;
  c10::impl::DisablePythonDispatcher dpd_g;
-  return THPVariable_NewWithVar((PyTypeObject*)cls, self.alias());
+  return THPVariable_NewWithVar(
+      (PyTypeObject*)cls,
+      self.alias(),
+      c10::impl::PyInterpreterStatus::DEFINITELY_UNINITIALIZED);
  END_HANDLE_TH_ERRORS
 }

@ -622,7 +642,10 @@ static PyObject* THPVariable_make_subclass(
    data.unsafeGetTensorImpl()->_change_backend_component_keys(r.device(6));
  }

-  return THPVariable_NewWithVar((PyTypeObject*)cls, data);
+  return THPVariable_NewWithVar(
+      (PyTypeObject*)cls,
+      data,
+      c10::impl::PyInterpreterStatus::DEFINITELY_UNINITIALIZED);
  END_HANDLE_TH_ERRORS
 }

@ -767,7 +790,10 @@ static PyObject* THPVariable_make_wrapper_subclass(
    tensor.unsafeGetTensorImpl()->set_python_custom_layout(true);
  }

-  return THPVariable_NewWithVar((PyTypeObject*)cls, tensor);
+  return THPVariable_NewWithVar(
+      (PyTypeObject*)cls,
+      tensor,
+      c10::impl::PyInterpreterStatus::DEFINITELY_UNINITIALIZED);
  END_HANDLE_TH_ERRORS
 }

@ -1795,6 +1821,7 @@ PyObject* THPVariable_pynew(
  return THPVariable_NewWithVar(
      type,
      tensor,
+      c10::impl::PyInterpreterStatus::MAYBE_UNINITIALIZED,
      /*allow_preexisting_pyobj=*/true);
  END_HANDLE_TH_ERRORS
 }
@ -1847,7 +1874,8 @@ static int THPVariable_subclass_clear(THPVariable* self) {

    if (!self->cdata.unsafeIsBorrowed() &&
        tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
-            /*ignore_hermetic_tls=*/false) == (PyObject*)self) {
+            getPyInterpreter(), /*ignore_hermetic_tls=*/false) ==
+            (PyObject*)self) {
      // TODO: empirically, on OS X this assert appears to be untrue
      // In test_py_tensors_multi_async_call - ProcessGroupRpcTestWithSpawn
      // distributed/rpc/test_process_group_agent.py
@ -2019,10 +2047,17 @@ static void THPVariable_subclass_dealloc(PyObject* self) {
  Py_DECREF(type);
 }

-// Creates a new Python object for a Variable.
+// Creates a new Python object for a Variable.  The status parameter
+// specifies what the interpreter tag status on the object is; for
+// example, if you ran check_pyobj, the optional it returned tells you
+// whether the tensor was already tagged, so you can pass
+// TAGGED_BY_US or MAYBE_UNINITIALIZED; in other cases, you know where
+// var came from and can directly assert that it's DEFINITELY_UNINITIALIZED.
+// It's ALWAYS safe (albeit slower) to call this with MAYBE_UNINITIALIZED.
 static PyObject* THPVariable_NewWithVar(
    PyTypeObject* type,
    const at::TensorBase& _var,
+    c10::impl::PyInterpreterStatus status,
    bool allow_preexisting_pyobj) {
  // Make sure that the reinterpret into a THPVariable* will be valid
  TORCH_CHECK(
@ -2033,7 +2068,7 @@ static PyObject* THPVariable_NewWithVar(
  // This function overwrite the Tensor's pyobj field without extra checks
  // Make sure it is not set otherwise we would leak memory
  auto mb_obj = _var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
-      /*ignore_hermetic_tls=*/false);
+      getPyInterpreter(), /*ignore_hermetic_tls=*/false);

  // Under some circumstances, we may attempt to create a new Python
  // object for a variable that already has a Python object.  The most common
@ -2115,7 +2150,8 @@ static PyObject* THPVariable_NewWithVar(
      // Normal codepath
      v->cdata = MaybeOwned<Variable>::owned(Variable(_var));
      const auto& var = THPVariable_Unpack(v);
-      var.unsafeGetTensorImpl()->pyobj_slot()->init_pyobj(obj);
+      var.unsafeGetTensorImpl()->pyobj_slot()->init_pyobj(
+          getPyInterpreter(), obj, status);
      if (check_has_torch_dispatch(obj)) {
        var.unsafeGetTensorImpl()->set_python_dispatch(true);
      }
--- a/torch/csrc/utils/python_dispatch.cpp
+++ b/torch/csrc/utils/python_dispatch.cpp
@ -209,10 +209,12 @@ class PythonKernelHolder : public c10::OperatorKernel {
  }
 };

-// @todo sahanp: Afait only register is used in the codebase. This can be
-// removed / simplified
 static torch::_RegisterOrVerify register_or_verify() {
-  return torch::_RegisterOrVerify::REGISTER;
+  if (isMainPyInterpreter()) {
+    return torch::_RegisterOrVerify::REGISTER;
+  } else {
+    return torch::_RegisterOrVerify::VERIFY;
+  }
 }

 static py::object ophandle_call_boxed(
@ -285,6 +287,7 @@ void initDispatchBindings(PyObject* module) {
      .def(
          "reset",
          [](const py::object& self) {
+            TORCH_INTERNAL_ASSERT(isMainPyInterpreter());
            self.cast<torch::Library&>().reset();
            return;
          },
@ -294,6 +297,7 @@ void initDispatchBindings(PyObject* module) {
      .def(
          "def_",
          [](py::object self, const char* schema, const char* alias) {
+            TORCH_INTERNAL_ASSERT(isMainPyInterpreter());
            self.cast<torch::Library&>().def(
                torch::schema(schema, parseAliasAnalysisKind(alias)));
            return self;
@ -307,6 +311,7 @@ void initDispatchBindings(PyObject* module) {
      .def(
          "def_legacy",
          [](py::object self, const char* schema) {
+            TORCH_INTERNAL_ASSERT(isMainPyInterpreter());
            self.cast<torch::Library&>().def(torch::jit::parseSchema(schema));
            return self;
          },
@ -326,6 +331,7 @@ void initDispatchBindings(PyObject* module) {
             const char* name,
             const char* dispatch,
             const char* debug) {
+            TORCH_INTERNAL_ASSERT(isMainPyInterpreter());
            self.cast<torch::Library&>().def(
                name, dispatch_str(dispatch, [](const at::Tensor& a) {
                        return a;
@ -343,6 +349,7 @@ void initDispatchBindings(PyObject* module) {
             const char* dispatch,
             const char* alias,
             const char* debug) {
+            TORCH_INTERNAL_ASSERT(isMainPyInterpreter());
            self.cast<torch::Library&>().def(
                torch::schema(schema, parseAliasAnalysisKind(alias)),
                dispatch_str(dispatch, [](const at::Tensor& a) {
@ -363,6 +370,7 @@ void initDispatchBindings(PyObject* module) {
             const char* name,
             const char* dispatch,
             const char* debug) {
+            TORCH_INTERNAL_ASSERT(isMainPyInterpreter());
            self.cast<torch::Library&>().impl(
                name, dispatch_str(dispatch, [](const at::Tensor& a) {
                        return a;
@ -457,6 +465,7 @@ void initDispatchBindings(PyObject* module) {
      .def(
          "fallback_fallthrough",
          [](py::object self, const char* dispatch) {
+            TORCH_INTERNAL_ASSERT(isMainPyInterpreter());
            self.cast<torch::Library&>().fallback(
                dispatch_str(dispatch, CppFunction::makeFallthrough()));
            return self;
@ -471,6 +480,7 @@ void initDispatchBindings(PyObject* module) {
             bool with_keyset) {
            HANDLE_TH_ERRORS
            auto& lib = self.cast<torch::Library&>();
+            TORCH_INTERNAL_ASSERT(isMainPyInterpreter());
            if (func.is(py::module::import("torch.library")
                            .attr("fallthrough_kernel"))) {
              lib.fallback(
@ -903,6 +913,8 @@ void initDispatchBindings(PyObject* module) {
        handle.setReportErrorCallback_(std::move(callback_obj));
      });

+  m.def(
+      "_dispatch_is_main_interpreter", []() { return isMainPyInterpreter(); });
  m.def("_dispatch_pystub", [](const char* name, const char* overload) {
    return c10::Dispatcher::singleton().getPyStub(
        c10::OperatorName(name, overload));