From d1993c27ae59842c887d549a3f8936fbcd769498 Mon Sep 17 00:00:00 2001
From: Sahan Paliskara <sahanp@meta.com>
Date: Wed, 17 Sep 2025 16:40:52 +0000
Subject: [PATCH] [BE] Make PyObjectSlot use a global PyInterpreter (#162659)

This pr gets rid of the pyobj_interpreter_ variable from PyObjectSlot and saves a word in the process

Gonna ask for review from @huydhn as there are some changes to CI.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/162659
Approved by: https://github.com/albanD, https://github.com/huydhn
---
 .github/workflows/pull.yml              |  2 +
 .github/workflows/slow.yml              |  2 +
 c10/core/TensorImpl.h                   |  2 +-
 c10/core/impl/PyInterpreterHooks.h      |  7 ++--
 c10/core/impl/PyObjectSlot.cpp          | 10 ++---
 c10/core/impl/PyObjectSlot.h            | 50 +++++++------------------
 functorch/csrc/dim/dim.cpp              |  3 +-
 torch/csrc/Module.cpp                   |  6 +--
 torch/csrc/PyInterpreter.cpp            |  6 +--
 torch/csrc/Storage.cpp                  | 14 +++----
 torch/csrc/autograd/python_variable.cpp | 17 ++++-----
 11 files changed, 45 insertions(+), 74 deletions(-)

diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index 3f13fbf27688..ff6e9ed10711 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -127,6 +127,8 @@ jobs:
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
+      # More memory is needed to build with asan
+      runner: linux.2xlarge.memory
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build-environment: linux-jammy-py3.10-clang18-asan
       docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml
index 19b402f85457..9675ee4169f4 100644
--- a/.github/workflows/slow.yml
+++ b/.github/workflows/slow.yml
@@ -140,6 +140,8 @@ jobs:
     uses: ./.github/workflows/_linux-build.yml
     needs: get-label-type
     with:
+      # More memory is needed to build with asan
+      runner: linux.2xlarge.memory
       runner_prefix: "${{ needs.get-label-type.outputs.label-type }}"
       build-environment: linux-jammy-py3.10-clang18-asan
       docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan
diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h
index 972181327b1f..98867da60a7f 100644
--- a/c10/core/TensorImpl.h
+++ b/c10/core/TensorImpl.h
@@ -3269,7 +3269,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl {
     is_le<sizeof(autograd_meta_),         16,  FieldNameEnum::autograd_meta_>();
     is_le<sizeof(extra_meta_),            16,  FieldNameEnum::extra_meta_>();
     are_equal<sizeof(version_counter_),    8,  FieldNameEnum::version_counter_>();
-    are_equal<sizeof(pyobj_slot_),   16,  FieldNameEnum::pyobj_slot_>();
+    are_equal<sizeof(pyobj_slot_),         8,  FieldNameEnum::pyobj_slot_>();
     are_equal<sizeof(sizes_and_strides_), 88,  FieldNameEnum::sizes_and_strides_>();
     are_equal<sizeof(storage_offset_),     8,  FieldNameEnum::storage_offset_>();
     are_equal<sizeof(numel_),              8,  FieldNameEnum::numel_>();
diff --git a/c10/core/impl/PyInterpreterHooks.h b/c10/core/impl/PyInterpreterHooks.h
index 32a17ad9a8a0..4fe025d2e778 100644
--- a/c10/core/impl/PyInterpreterHooks.h
+++ b/c10/core/impl/PyInterpreterHooks.h
@@ -13,11 +13,10 @@ struct C10_API PyInterpreterHooksInterface {
 
   // Get the PyInterpreter instance
   // Stub implementation throws error when Python is not available
+  // We return nullptr rather than throwing an error since there are bits of c10
+  // that expect an empty PyObjectSlot when python is not available.
   virtual PyInterpreter* getPyInterpreter() const {
-    TORCH_CHECK(
-        false,
-        "PyTorch was compiled without Python support. "
-        "Cannot access Python interpreter from C++.");
+    return nullptr;
   }
 };
 
diff --git a/c10/core/impl/PyObjectSlot.cpp b/c10/core/impl/PyObjectSlot.cpp
index 0f1bfb211074..7476ac1d4c39 100644
--- a/c10/core/impl/PyObjectSlot.cpp
+++ b/c10/core/impl/PyObjectSlot.cpp
@@ -2,7 +2,7 @@
 
 namespace c10::impl {
 
-PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {}
+PyObjectSlot::PyObjectSlot() : pyobj_(nullptr) {}
 
 PyObjectSlot::~PyObjectSlot() {
   maybe_destroy_pyobj();
@@ -10,9 +10,9 @@ PyObjectSlot::~PyObjectSlot() {
 
 void PyObjectSlot::maybe_destroy_pyobj() {
   if (owns_pyobj()) {
-    TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr);
+    TORCH_INTERNAL_ASSERT(getGlobalPyInterpreter() != nullptr);
     TORCH_INTERNAL_ASSERT(pyobj_ != nullptr);
-    (*pyobj_interpreter_.load(std::memory_order_acquire))
+    (*getGlobalPyInterpreter())
         ->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true);
     // NB: this destructor can only be entered when there are no
     // references to this C++ object (obviously), NOR any references
@@ -25,7 +25,7 @@ void PyObjectSlot::maybe_destroy_pyobj() {
 }
 
 PyInterpreter* PyObjectSlot::pyobj_interpreter() {
-  return pyobj_interpreter_.load(std::memory_order_acquire);
+  return getGlobalPyInterpreter();
 }
 
 PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
@@ -35,7 +35,7 @@ PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const {
 }
 
 PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const {
-  auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire);
+  auto interpreter = getGlobalPyInterpreter();
   if (interpreter) {
     return *interpreter;
   }
diff --git a/c10/core/impl/PyObjectSlot.h b/c10/core/impl/PyObjectSlot.h
index 58b2490eba00..e7d78f8360c3 100644
--- a/c10/core/impl/PyObjectSlot.h
+++ b/c10/core/impl/PyObjectSlot.h
@@ -6,10 +6,17 @@
 #include <c10/util/python_stub.h>
 #include <optional>
 
-#include <atomic>
-
 namespace c10::impl {
 
+// Function pointer type for getting the global interpreter
+using GetPyInterpreterFn = PyInterpreter* (*)();
+
+// Global function pointer (set by csrc initialization)
+C10_API extern GetPyInterpreterFn g_get_pyinterpreter_fn;
+
+// Helper function to get the global interpreter
+C10_API PyInterpreter* getGlobalPyInterpreter();
+
 struct C10_API PyObjectSlot {
  public:
   PyObjectSlot();
@@ -26,8 +33,6 @@ struct C10_API PyObjectSlot {
   // NB: THIS FUNCTION CAN RAISE AN EXCEPTION.  Make sure to clean up after
   // PyObject if necessary!
   void init_pyobj(PyObject* pyobj) {
-    pyobj_interpreter_.store(
-        getGlobalPyInterpreter(), std::memory_order_relaxed);
     pyobj_ = pyobj;
   }
 
@@ -55,18 +60,15 @@ struct C10_API PyObjectSlot {
 
   // @todo alban: I'm not too sure what's going on here, we can probably delete
   // it but it's worthwhile making sure
-  std::optional<PyObject*> check_pyobj(bool ignore_hermetic_tls = false) const {
-    impl::PyInterpreter* interpreter =
-        pyobj_interpreter_.load(std::memory_order_acquire);
-    if (interpreter == nullptr) {
+  std::optional<PyObject*> check_pyobj() const {
+    impl::PyInterpreter* interpreter = getGlobalPyInterpreter();
+    if (interpreter == nullptr || pyobj_ == nullptr) {
       return std::nullopt;
     }
-
-    if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) {
+    if (c10::impl::HermeticPyObjectTLS::get_state()) {
       return std::nullopt;
-    } else {
-      return _unchecked_untagged_pyobj();
     }
+    return _unchecked_untagged_pyobj();
   }
 
   PyInterpreter& load_pyobj_interpreter() const;
@@ -76,30 +78,6 @@ struct C10_API PyObjectSlot {
   void set_owns_pyobj(bool b);
 
  private:
-  // This field contains the interpreter tag for this object.  See
-  // Note [Python interpreter tag] for general context
-  //
-  // Note [Memory ordering on Python interpreter tag]
-  // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-  // What memory_order do we need when accessing this atomic?  We don't
-  // need a single total modification order (as provided by
-  // memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only
-  // transition from -1 to some positive integer and never changes afterwards.
-  // Because there is only one modification, it trivially already has a total
-  // modification order (e.g., we don't need fences or locked instructions on
-  // x86)
-  //
-  // In fact, one could make a reasonable argument that relaxed reads are OK,
-  // due to the presence of external locking (GIL) to ensure that interactions
-  // with other data structures are still correctly synchronized, so that
-  // we fall in the "Single-Location Data Structures" case as described in
-  // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf
-  // However, on x86, it doesn't matter if I use acquire or relaxed on the load
-  // as I get the same assembly in both cases.  So I just use the more
-  // conservative acquire (which will impede compiler optimizations but I don't
-  // care)
-  std::atomic<PyInterpreter*> pyobj_interpreter_;
-
   // This field contains a reference to a PyObject representing this Tensor.
   // If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new
   // PyObject for it and set this field.  This field does not have to be
diff --git a/functorch/csrc/dim/dim.cpp b/functorch/csrc/dim/dim.cpp
index 8f1e561e2051..5258ba52f99c 100644
--- a/functorch/csrc/dim/dim.cpp
+++ b/functorch/csrc/dim/dim.cpp
@@ -1187,8 +1187,7 @@ int64_t _Tensor_ndim(mpy::handle h) {
 mpy::handle handle_from_tensor(Arena& A, TensorRef t) {
   // fast case: tensor is live in python
   std::optional<PyObject*> mb_obj =
-      t->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
-          /*ignore_hermetic_tls=*/false);
+      t->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj();
   if (mb_obj.has_value() &&
       !t->unsafeGetTensorImpl()->pyobj_slot()->owns_pyobj()) {
     return *mb_obj;
diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp
index ac2b03d2651c..3a3e8bfef047 100644
--- a/torch/csrc/Module.cpp
+++ b/torch/csrc/Module.cpp
@@ -403,11 +403,9 @@ static PyObject* THPModule_swap_tensor_impl(PyObject* _unused, PyObject* args) {
   // The TensorImpls contain PyObjectSlots that have a reference to the PyObject
   // associated with the TensorImpl. Swap this field as well.
   std::optional<PyObject*> mb_obj_a =
-      a->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
-          /*ignore_hermetic_tls=*/false);
+      a->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj();
   std::optional<PyObject*> mb_obj_b =
-      b->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
-          /*ignore_hermetic_tls=*/false);
+      b->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj();
   TORCH_INTERNAL_ASSERT(
       mb_obj_a.has_value() && mb_obj_b.has_value(),
       "Both tensors should have PyObjects tagged by the current python interpreter");
diff --git a/torch/csrc/PyInterpreter.cpp b/torch/csrc/PyInterpreter.cpp
index e6016a7721e8..993f8b8216a6 100644
--- a/torch/csrc/PyInterpreter.cpp
+++ b/torch/csrc/PyInterpreter.cpp
@@ -614,8 +614,7 @@ static void set_tensor_attr_with_capsule(
     const c10::TensorImpl* tensor,
     py::capsule& capsule,
     const char* attr_name) {
-  std::optional<PyObject*> mb_obj = tensor->pyobj_slot()->check_pyobj(
-      /*ignore_hermetic_tls=*/false);
+  std::optional<PyObject*> mb_obj = tensor->pyobj_slot()->check_pyobj();
   TORCH_CHECK(
       mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value");
   auto obj = mb_obj.value();
@@ -642,8 +641,7 @@ static c10::ArrayRef<T> get_set_cached_attr(
     const c10::TensorImpl* tensor,
     const char* base_attr_name,
     const py::object& obj) {
-  std::optional<PyObject*> mb_obj =
-      tensor->pyobj_slot()->check_pyobj(getPyInterpreter());
+  std::optional<PyObject*> mb_obj = tensor->pyobj_slot()->check_pyobj();
   TORCH_CHECK(
       mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value");
   auto tensor_obj = mb_obj.value();
diff --git a/torch/csrc/Storage.cpp b/torch/csrc/Storage.cpp
index 08112b41aaae..f6638bbd10c1 100644
--- a/torch/csrc/Storage.cpp
+++ b/torch/csrc/Storage.cpp
@@ -41,8 +41,8 @@ PyObject* THPStorage_NewWithStorage(
       "Creating a Storage subclass from a class that does not inherit from ",
       "Storage is not possible. Make sure your class inherits from Storage.");
 
-  auto maybe_pyobj = _storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj(
-      /*ignore_hermetic_tls=*/false);
+  auto maybe_pyobj =
+      _storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj();
   if (maybe_pyobj.has_value() && maybe_pyobj.value()) {
     TORCH_CHECK(
         allow_preexisting_pyobj,
@@ -93,8 +93,7 @@ PyObject* THPStorage_Wrap(c10::Storage storage) {
   }
   c10::impl::PyObjectSlot* pyobj_slot = storage_impl->pyobj_slot();
 
-  std::optional<PyObject*> maybe_pyobj = pyobj_slot->check_pyobj(
-      /*ignore_hermetic_tls=*/false);
+  std::optional<PyObject*> maybe_pyobj = pyobj_slot->check_pyobj();
   if (maybe_pyobj.has_value()) {
     auto obj = *maybe_pyobj;
     if (obj) {
@@ -127,8 +126,8 @@ static bool THPStorage_isPreservable(THPStorage* self) {
     return false;
   }
 
-  if (storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj(
-          /*ignore_hermetic_tls=*/true) != (PyObject*)self) {
+  if (storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj() !=
+      (PyObject*)self) {
     return false;
   }
   if (storage.use_count() <= 1) {
@@ -145,8 +144,7 @@ static bool THPStorage_tryPreserve(THPStorage* self) {
   const auto& storage = THPStorage_Unpack(self);
   c10::StorageImpl* storage_impl = storage.unsafeGetStorageImpl();
 
-  auto maybe_pyobj = storage_impl->pyobj_slot()->check_pyobj(
-      /*ignore_hermetic_tls=*/true);
+  auto maybe_pyobj = storage_impl->pyobj_slot()->check_pyobj();
   // NOTE: It is possible to just set the PyObjectSlot here, but the point is
   // that we should have already set PyObjectSlot when the storage PyObject
   // was created.
diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp
index fcb011167b33..c6f65453e871 100644
--- a/torch/csrc/autograd/python_variable.cpp
+++ b/torch/csrc/autograd/python_variable.cpp
@@ -265,8 +265,7 @@ PyObject* THPVariable_Wrap(const at::TensorBase& var) {
   }
 
   std::optional<PyObject*> mb_obj =
-      var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
-          /*ignore_hermetic_tls=*/false);
+      var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj();
   if (mb_obj.has_value()) {
     auto obj = *mb_obj;
     if (obj) {
@@ -329,8 +328,8 @@ static bool isResurrectable(THPVariable* self) {
     return false;
   }
   // Check if this is hermetic. If it is, no resurrection.
-  if (tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
-          /*ignore_hermetic_tls=*/false) != (PyObject*)self) {
+  if (tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj() !=
+      (PyObject*)self) {
     return false;
   }
   return true;
@@ -355,8 +354,7 @@ static bool THPVariable_tryResurrect(THPVariable* self) {
       !tensor.unsafeGetTensorImpl()->pyobj_slot()->owns_pyobj());
 
   c10::TensorImpl* tensor_impl = tensor.unsafeGetTensorImpl();
-  auto maybe_pyobj = tensor_impl->pyobj_slot()->check_pyobj(
-      /*ignore_hermetic_tls=*/false);
+  auto maybe_pyobj = tensor_impl->pyobj_slot()->check_pyobj();
 
   TORCH_INTERNAL_ASSERT(
       maybe_pyobj.has_value(),
@@ -1933,8 +1931,8 @@ static int THPVariable_subclass_clear(THPVariable* self) {
     //        because Tensor asked us to (it's already destructing).
 
     if (!self->cdata.unsafeIsBorrowed() &&
-        tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
-            /*ignore_hermetic_tls=*/false) == (PyObject*)self) {
+        tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj() ==
+            (PyObject*)self) {
       // TODO: empirically, on OS X this assert appears to be untrue
       // In test_py_tensors_multi_async_call - ProcessGroupRpcTestWithSpawn
       // distributed/rpc/test_process_group_agent.py
@@ -2120,8 +2118,7 @@ static PyObject* THPVariable_NewWithVar(
 
   // This function overwrite the Tensor's pyobj field without extra checks
   // Make sure it is not set otherwise we would leak memory
-  auto mb_obj = _var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(
-      /*ignore_hermetic_tls=*/false);
+  auto mb_obj = _var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj();
 
   // Under some circumstances, we may attempt to create a new Python
   // object for a variable that already has a Python object.  The most common