From d1993c27ae59842c887d549a3f8936fbcd769498 Mon Sep 17 00:00:00 2001 From: Sahan Paliskara Date: Wed, 17 Sep 2025 16:40:52 +0000 Subject: [PATCH] [BE] Make PyObjectSlot use a global PyInterpreter (#162659) This pr gets rid of the pyobj_interpreter_ variable from PyObjectSlot and saves a word in the process Gonna ask for review from @huydhn as there are some changes to CI. Pull Request resolved: https://github.com/pytorch/pytorch/pull/162659 Approved by: https://github.com/albanD, https://github.com/huydhn --- .github/workflows/pull.yml | 2 + .github/workflows/slow.yml | 2 + c10/core/TensorImpl.h | 2 +- c10/core/impl/PyInterpreterHooks.h | 7 ++-- c10/core/impl/PyObjectSlot.cpp | 10 ++--- c10/core/impl/PyObjectSlot.h | 50 +++++++------------------ functorch/csrc/dim/dim.cpp | 3 +- torch/csrc/Module.cpp | 6 +-- torch/csrc/PyInterpreter.cpp | 6 +-- torch/csrc/Storage.cpp | 14 +++---- torch/csrc/autograd/python_variable.cpp | 17 ++++----- 11 files changed, 45 insertions(+), 74 deletions(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 3f13fbf27688..ff6e9ed10711 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -127,6 +127,8 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: + # More memory is needed to build with asan + runner: linux.2xlarge.memory runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.10-clang18-asan docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index 19b402f85457..9675ee4169f4 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -140,6 +140,8 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: + # More memory is needed to build with asan + runner: linux.2xlarge.memory runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.10-clang18-asan docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 972181327b1f..98867da60a7f 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -3269,7 +3269,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl { is_le(); is_le(); are_equal(); - are_equal(); + are_equal(); are_equal(); are_equal(); are_equal(); diff --git a/c10/core/impl/PyInterpreterHooks.h b/c10/core/impl/PyInterpreterHooks.h index 32a17ad9a8a0..4fe025d2e778 100644 --- a/c10/core/impl/PyInterpreterHooks.h +++ b/c10/core/impl/PyInterpreterHooks.h @@ -13,11 +13,10 @@ struct C10_API PyInterpreterHooksInterface { // Get the PyInterpreter instance // Stub implementation throws error when Python is not available + // We return nullptr rather than throwing an error since there are bits of c10 + // that expect an empty PyObjectSlot when python is not available. virtual PyInterpreter* getPyInterpreter() const { - TORCH_CHECK( - false, - "PyTorch was compiled without Python support. " - "Cannot access Python interpreter from C++."); + return nullptr; } }; diff --git a/c10/core/impl/PyObjectSlot.cpp b/c10/core/impl/PyObjectSlot.cpp index 0f1bfb211074..7476ac1d4c39 100644 --- a/c10/core/impl/PyObjectSlot.cpp +++ b/c10/core/impl/PyObjectSlot.cpp @@ -2,7 +2,7 @@ namespace c10::impl { -PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {} +PyObjectSlot::PyObjectSlot() : pyobj_(nullptr) {} PyObjectSlot::~PyObjectSlot() { maybe_destroy_pyobj(); @@ -10,9 +10,9 @@ PyObjectSlot::~PyObjectSlot() { void PyObjectSlot::maybe_destroy_pyobj() { if (owns_pyobj()) { - TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr); + TORCH_INTERNAL_ASSERT(getGlobalPyInterpreter() != nullptr); TORCH_INTERNAL_ASSERT(pyobj_ != nullptr); - (*pyobj_interpreter_.load(std::memory_order_acquire)) + (*getGlobalPyInterpreter()) ->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true); // NB: this destructor can only be entered when there are no // references to this C++ object (obviously), NOR any references @@ -25,7 +25,7 @@ void PyObjectSlot::maybe_destroy_pyobj() { } PyInterpreter* PyObjectSlot::pyobj_interpreter() { - return pyobj_interpreter_.load(std::memory_order_acquire); + return getGlobalPyInterpreter(); } PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const { @@ -35,7 +35,7 @@ PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const { } PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const { - auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire); + auto interpreter = getGlobalPyInterpreter(); if (interpreter) { return *interpreter; } diff --git a/c10/core/impl/PyObjectSlot.h b/c10/core/impl/PyObjectSlot.h index 58b2490eba00..e7d78f8360c3 100644 --- a/c10/core/impl/PyObjectSlot.h +++ b/c10/core/impl/PyObjectSlot.h @@ -6,10 +6,17 @@ #include #include -#include - namespace c10::impl { +// Function pointer type for getting the global interpreter +using GetPyInterpreterFn = PyInterpreter* (*)(); + +// Global function pointer (set by csrc initialization) +C10_API extern GetPyInterpreterFn g_get_pyinterpreter_fn; + +// Helper function to get the global interpreter +C10_API PyInterpreter* getGlobalPyInterpreter(); + struct C10_API PyObjectSlot { public: PyObjectSlot(); @@ -26,8 +33,6 @@ struct C10_API PyObjectSlot { // NB: THIS FUNCTION CAN RAISE AN EXCEPTION. Make sure to clean up after // PyObject if necessary! void init_pyobj(PyObject* pyobj) { - pyobj_interpreter_.store( - getGlobalPyInterpreter(), std::memory_order_relaxed); pyobj_ = pyobj; } @@ -55,18 +60,15 @@ struct C10_API PyObjectSlot { // @todo alban: I'm not too sure what's going on here, we can probably delete // it but it's worthwhile making sure - std::optional check_pyobj(bool ignore_hermetic_tls = false) const { - impl::PyInterpreter* interpreter = - pyobj_interpreter_.load(std::memory_order_acquire); - if (interpreter == nullptr) { + std::optional check_pyobj() const { + impl::PyInterpreter* interpreter = getGlobalPyInterpreter(); + if (interpreter == nullptr || pyobj_ == nullptr) { return std::nullopt; } - - if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) { + if (c10::impl::HermeticPyObjectTLS::get_state()) { return std::nullopt; - } else { - return _unchecked_untagged_pyobj(); } + return _unchecked_untagged_pyobj(); } PyInterpreter& load_pyobj_interpreter() const; @@ -76,30 +78,6 @@ struct C10_API PyObjectSlot { void set_owns_pyobj(bool b); private: - // This field contains the interpreter tag for this object. See - // Note [Python interpreter tag] for general context - // - // Note [Memory ordering on Python interpreter tag] - // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - // What memory_order do we need when accessing this atomic? We don't - // need a single total modification order (as provided by - // memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only - // transition from -1 to some positive integer and never changes afterwards. - // Because there is only one modification, it trivially already has a total - // modification order (e.g., we don't need fences or locked instructions on - // x86) - // - // In fact, one could make a reasonable argument that relaxed reads are OK, - // due to the presence of external locking (GIL) to ensure that interactions - // with other data structures are still correctly synchronized, so that - // we fall in the "Single-Location Data Structures" case as described in - // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf - // However, on x86, it doesn't matter if I use acquire or relaxed on the load - // as I get the same assembly in both cases. So I just use the more - // conservative acquire (which will impede compiler optimizations but I don't - // care) - std::atomic pyobj_interpreter_; - // This field contains a reference to a PyObject representing this Tensor. // If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new // PyObject for it and set this field. This field does not have to be diff --git a/functorch/csrc/dim/dim.cpp b/functorch/csrc/dim/dim.cpp index 8f1e561e2051..5258ba52f99c 100644 --- a/functorch/csrc/dim/dim.cpp +++ b/functorch/csrc/dim/dim.cpp @@ -1187,8 +1187,7 @@ int64_t _Tensor_ndim(mpy::handle h) { mpy::handle handle_from_tensor(Arena& A, TensorRef t) { // fast case: tensor is live in python std::optional mb_obj = - t->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/false); + t->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(); if (mb_obj.has_value() && !t->unsafeGetTensorImpl()->pyobj_slot()->owns_pyobj()) { return *mb_obj; diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index ac2b03d2651c..3a3e8bfef047 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -403,11 +403,9 @@ static PyObject* THPModule_swap_tensor_impl(PyObject* _unused, PyObject* args) { // The TensorImpls contain PyObjectSlots that have a reference to the PyObject // associated with the TensorImpl. Swap this field as well. std::optional mb_obj_a = - a->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/false); + a->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(); std::optional mb_obj_b = - b->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/false); + b->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(); TORCH_INTERNAL_ASSERT( mb_obj_a.has_value() && mb_obj_b.has_value(), "Both tensors should have PyObjects tagged by the current python interpreter"); diff --git a/torch/csrc/PyInterpreter.cpp b/torch/csrc/PyInterpreter.cpp index e6016a7721e8..993f8b8216a6 100644 --- a/torch/csrc/PyInterpreter.cpp +++ b/torch/csrc/PyInterpreter.cpp @@ -614,8 +614,7 @@ static void set_tensor_attr_with_capsule( const c10::TensorImpl* tensor, py::capsule& capsule, const char* attr_name) { - std::optional mb_obj = tensor->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/false); + std::optional mb_obj = tensor->pyobj_slot()->check_pyobj(); TORCH_CHECK( mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value"); auto obj = mb_obj.value(); @@ -642,8 +641,7 @@ static c10::ArrayRef get_set_cached_attr( const c10::TensorImpl* tensor, const char* base_attr_name, const py::object& obj) { - std::optional mb_obj = - tensor->pyobj_slot()->check_pyobj(getPyInterpreter()); + std::optional mb_obj = tensor->pyobj_slot()->check_pyobj(); TORCH_CHECK( mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value"); auto tensor_obj = mb_obj.value(); diff --git a/torch/csrc/Storage.cpp b/torch/csrc/Storage.cpp index 08112b41aaae..f6638bbd10c1 100644 --- a/torch/csrc/Storage.cpp +++ b/torch/csrc/Storage.cpp @@ -41,8 +41,8 @@ PyObject* THPStorage_NewWithStorage( "Creating a Storage subclass from a class that does not inherit from ", "Storage is not possible. Make sure your class inherits from Storage."); - auto maybe_pyobj = _storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/false); + auto maybe_pyobj = + _storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj(); if (maybe_pyobj.has_value() && maybe_pyobj.value()) { TORCH_CHECK( allow_preexisting_pyobj, @@ -93,8 +93,7 @@ PyObject* THPStorage_Wrap(c10::Storage storage) { } c10::impl::PyObjectSlot* pyobj_slot = storage_impl->pyobj_slot(); - std::optional maybe_pyobj = pyobj_slot->check_pyobj( - /*ignore_hermetic_tls=*/false); + std::optional maybe_pyobj = pyobj_slot->check_pyobj(); if (maybe_pyobj.has_value()) { auto obj = *maybe_pyobj; if (obj) { @@ -127,8 +126,8 @@ static bool THPStorage_isPreservable(THPStorage* self) { return false; } - if (storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/true) != (PyObject*)self) { + if (storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj() != + (PyObject*)self) { return false; } if (storage.use_count() <= 1) { @@ -145,8 +144,7 @@ static bool THPStorage_tryPreserve(THPStorage* self) { const auto& storage = THPStorage_Unpack(self); c10::StorageImpl* storage_impl = storage.unsafeGetStorageImpl(); - auto maybe_pyobj = storage_impl->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/true); + auto maybe_pyobj = storage_impl->pyobj_slot()->check_pyobj(); // NOTE: It is possible to just set the PyObjectSlot here, but the point is // that we should have already set PyObjectSlot when the storage PyObject // was created. diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index fcb011167b33..c6f65453e871 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -265,8 +265,7 @@ PyObject* THPVariable_Wrap(const at::TensorBase& var) { } std::optional mb_obj = - var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/false); + var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(); if (mb_obj.has_value()) { auto obj = *mb_obj; if (obj) { @@ -329,8 +328,8 @@ static bool isResurrectable(THPVariable* self) { return false; } // Check if this is hermetic. If it is, no resurrection. - if (tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/false) != (PyObject*)self) { + if (tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj() != + (PyObject*)self) { return false; } return true; @@ -355,8 +354,7 @@ static bool THPVariable_tryResurrect(THPVariable* self) { !tensor.unsafeGetTensorImpl()->pyobj_slot()->owns_pyobj()); c10::TensorImpl* tensor_impl = tensor.unsafeGetTensorImpl(); - auto maybe_pyobj = tensor_impl->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/false); + auto maybe_pyobj = tensor_impl->pyobj_slot()->check_pyobj(); TORCH_INTERNAL_ASSERT( maybe_pyobj.has_value(), @@ -1933,8 +1931,8 @@ static int THPVariable_subclass_clear(THPVariable* self) { // because Tensor asked us to (it's already destructing). if (!self->cdata.unsafeIsBorrowed() && - tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/false) == (PyObject*)self) { + tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj() == + (PyObject*)self) { // TODO: empirically, on OS X this assert appears to be untrue // In test_py_tensors_multi_async_call - ProcessGroupRpcTestWithSpawn // distributed/rpc/test_process_group_agent.py @@ -2120,8 +2118,7 @@ static PyObject* THPVariable_NewWithVar( // This function overwrite the Tensor's pyobj field without extra checks // Make sure it is not set otherwise we would leak memory - auto mb_obj = _var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( - /*ignore_hermetic_tls=*/false); + auto mb_obj = _var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(); // Under some circumstances, we may attempt to create a new Python // object for a variable that already has a Python object. The most common