diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 67a1e786fc90..209a47ad4dfc 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -127,8 +127,6 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - # More memory is needed to build with asan - runner: linux.2xlarge.memory runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.10-clang18-asan docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index 9675ee4169f4..19b402f85457 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -140,8 +140,6 @@ jobs: uses: ./.github/workflows/_linux-build.yml needs: get-label-type with: - # More memory is needed to build with asan - runner: linux.2xlarge.memory runner_prefix: "${{ needs.get-label-type.outputs.label-type }}" build-environment: linux-jammy-py3.10-clang18-asan docker-image-name: ci-image:pytorch-linux-jammy-py3-clang18-asan diff --git a/c10/core/TensorImpl.h b/c10/core/TensorImpl.h index 2269765cb718..972181327b1f 100644 --- a/c10/core/TensorImpl.h +++ b/c10/core/TensorImpl.h @@ -3244,7 +3244,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl { are_equal(); are_equal(); are_equal(); - are_equal(); + are_equal(); is_le(); are_equal(); are_equal(); @@ -3269,7 +3269,7 @@ class C10_TensorImpl_Size_Check_Dummy_Class : private TensorImpl { is_le(); is_le(); are_equal(); - are_equal(); + are_equal(); are_equal(); are_equal(); are_equal(); diff --git a/c10/core/impl/PyInterpreterHooks.h b/c10/core/impl/PyInterpreterHooks.h index 4fe025d2e778..32a17ad9a8a0 100644 --- a/c10/core/impl/PyInterpreterHooks.h +++ b/c10/core/impl/PyInterpreterHooks.h @@ -13,10 +13,11 @@ struct C10_API PyInterpreterHooksInterface { // Get the PyInterpreter instance // Stub implementation throws error when Python is not available - // We return nullptr rather than 
throwing an error since there are bits of c10 - // that expect an empty PyObjectSlot when python is not available. virtual PyInterpreter* getPyInterpreter() const { - return nullptr; + TORCH_CHECK( + false, + "PyTorch was compiled without Python support. " + "Cannot access Python interpreter from C++."); } }; diff --git a/c10/core/impl/PyObjectSlot.cpp b/c10/core/impl/PyObjectSlot.cpp index 7476ac1d4c39..0f1bfb211074 100644 --- a/c10/core/impl/PyObjectSlot.cpp +++ b/c10/core/impl/PyObjectSlot.cpp @@ -2,7 +2,7 @@ namespace c10::impl { -PyObjectSlot::PyObjectSlot() : pyobj_(nullptr) {} +PyObjectSlot::PyObjectSlot() : pyobj_interpreter_(nullptr), pyobj_(nullptr) {} PyObjectSlot::~PyObjectSlot() { maybe_destroy_pyobj(); @@ -10,9 +10,9 @@ PyObjectSlot::~PyObjectSlot() { void PyObjectSlot::maybe_destroy_pyobj() { if (owns_pyobj()) { - TORCH_INTERNAL_ASSERT(getGlobalPyInterpreter() != nullptr); + TORCH_INTERNAL_ASSERT(pyobj_interpreter_ != nullptr); TORCH_INTERNAL_ASSERT(pyobj_ != nullptr); - (*getGlobalPyInterpreter()) + (*pyobj_interpreter_.load(std::memory_order_acquire)) ->decref(_unchecked_untagged_pyobj(), /*has_pyobj_slot*/ true); // NB: this destructor can only be entered when there are no // references to this C++ object (obviously), NOR any references @@ -25,7 +25,7 @@ void PyObjectSlot::maybe_destroy_pyobj() { } PyInterpreter* PyObjectSlot::pyobj_interpreter() { - return getGlobalPyInterpreter(); + return pyobj_interpreter_.load(std::memory_order_acquire); } PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const { @@ -35,7 +35,7 @@ PyObject* PyObjectSlot::_unchecked_untagged_pyobj() const { } PyInterpreter& PyObjectSlot::load_pyobj_interpreter() const { - auto interpreter = getGlobalPyInterpreter(); + auto interpreter = pyobj_interpreter_.load(std::memory_order_acquire); if (interpreter) { return *interpreter; } diff --git a/c10/core/impl/PyObjectSlot.h b/c10/core/impl/PyObjectSlot.h index e7d78f8360c3..58b2490eba00 100644 --- a/c10/core/impl/PyObjectSlot.h 
+++ b/c10/core/impl/PyObjectSlot.h @@ -6,17 +6,10 @@ #include #include +#include + namespace c10::impl { -// Function pointer type for getting the global interpreter -using GetPyInterpreterFn = PyInterpreter* (*)(); - -// Global function pointer (set by csrc initialization) -C10_API extern GetPyInterpreterFn g_get_pyinterpreter_fn; - -// Helper function to get the global interpreter -C10_API PyInterpreter* getGlobalPyInterpreter(); - struct C10_API PyObjectSlot { public: PyObjectSlot(); @@ -33,6 +26,8 @@ struct C10_API PyObjectSlot { // NB: THIS FUNCTION CAN RAISE AN EXCEPTION. Make sure to clean up after // PyObject if necessary! void init_pyobj(PyObject* pyobj) { + pyobj_interpreter_.store( + getGlobalPyInterpreter(), std::memory_order_relaxed); pyobj_ = pyobj; } @@ -60,15 +55,18 @@ struct C10_API PyObjectSlot { // @todo alban: I'm not too sure what's going on here, we can probably delete // it but it's worthwhile making sure - std::optional check_pyobj() const { - impl::PyInterpreter* interpreter = getGlobalPyInterpreter(); - if (interpreter == nullptr || pyobj_ == nullptr) { + std::optional check_pyobj(bool ignore_hermetic_tls = false) const { + impl::PyInterpreter* interpreter = + pyobj_interpreter_.load(std::memory_order_acquire); + if (interpreter == nullptr) { return std::nullopt; } - if (c10::impl::HermeticPyObjectTLS::get_state()) { + + if (!ignore_hermetic_tls && c10::impl::HermeticPyObjectTLS::get_state()) { return std::nullopt; + } else { + return _unchecked_untagged_pyobj(); } - return _unchecked_untagged_pyobj(); } PyInterpreter& load_pyobj_interpreter() const; @@ -78,6 +76,30 @@ struct C10_API PyObjectSlot { void set_owns_pyobj(bool b); private: + // This field contains the interpreter tag for this object. See + // Note [Python interpreter tag] for general context + // + // Note [Memory ordering on Python interpreter tag] + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + // What memory_order do we need when accessing this atomic? 
We don't + // need a single total modification order (as provided by + // memory_order_seq_cst) as pyobj_interpreter_ is monotonic: it can only + // transition from -1 to some positive integer and never changes afterwards. + // Because there is only one modification, it trivially already has a total + // modification order (e.g., we don't need fences or locked instructions on + // x86) + // + // In fact, one could make a reasonable argument that relaxed reads are OK, + // due to the presence of external locking (GIL) to ensure that interactions + // with other data structures are still correctly synchronized, so that + // we fall in the "Single-Location Data Structures" case as described in + // http://www.open-std.org/jtc1/sc22/wg21/docs/papers/2020/p2055r0.pdf + // However, on x86, it doesn't matter if I use acquire or relaxed on the load + // as I get the same assembly in both cases. So I just use the more + // conservative acquire (which will impede compiler optimizations but I don't + // care) + std::atomic pyobj_interpreter_; + // This field contains a reference to a PyObject representing this Tensor. // If pyobj is nullptr, when we transfer Tensor to Python, we allocate a new // PyObject for it and set this field. This field does not have to be diff --git a/torch/csrc/Module.cpp b/torch/csrc/Module.cpp index da2628537899..567c1264fa14 100644 --- a/torch/csrc/Module.cpp +++ b/torch/csrc/Module.cpp @@ -410,9 +410,11 @@ static PyObject* THPModule_swap_tensor_impl(PyObject* _unused, PyObject* args) { // The TensorImpls contain PyObjectSlots that have a reference to the PyObject // associated with the TensorImpl. Swap this field as well. 
std::optional mb_obj_a = - a->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(); + a->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( + /*ignore_hermetic_tls=*/false); std::optional mb_obj_b = - b->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(); + b->cdata->unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( + /*ignore_hermetic_tls=*/false); TORCH_INTERNAL_ASSERT( mb_obj_a.has_value() && mb_obj_b.has_value(), "Both tensors should have PyObjects tagged by the current python interpreter"); diff --git a/torch/csrc/PyInterpreter.cpp b/torch/csrc/PyInterpreter.cpp index 993f8b8216a6..e6016a7721e8 100644 --- a/torch/csrc/PyInterpreter.cpp +++ b/torch/csrc/PyInterpreter.cpp @@ -614,7 +614,8 @@ static void set_tensor_attr_with_capsule( const c10::TensorImpl* tensor, py::capsule& capsule, const char* attr_name) { - std::optional mb_obj = tensor->pyobj_slot()->check_pyobj(); + std::optional mb_obj = tensor->pyobj_slot()->check_pyobj( + /*ignore_hermetic_tls=*/false); TORCH_CHECK( mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value"); auto obj = mb_obj.value(); @@ -641,7 +642,8 @@ static c10::ArrayRef get_set_cached_attr( const c10::TensorImpl* tensor, const char* base_attr_name, const py::object& obj) { - std::optional mb_obj = tensor->pyobj_slot()->check_pyobj(); + std::optional mb_obj = + tensor->pyobj_slot()->check_pyobj(/*ignore_hermetic_tls=*/false); TORCH_CHECK( mb_obj.has_value(), "Tensor subclass's PyInterpreter has no value"); auto tensor_obj = mb_obj.value(); diff --git a/torch/csrc/Storage.cpp b/torch/csrc/Storage.cpp index 2527a18dbc7f..d9775a7f7866 100644 --- a/torch/csrc/Storage.cpp +++ b/torch/csrc/Storage.cpp @@ -41,8 +41,8 @@ PyObject* THPStorage_NewWithStorage( "Creating a Storage subclass from a class that does not inherit from ", "Storage is not possible. 
Make sure your class inherits from Storage."); - auto maybe_pyobj = - _storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj(); + auto maybe_pyobj = _storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj( + /*ignore_hermetic_tls=*/false); if (maybe_pyobj.has_value() && maybe_pyobj.value()) { TORCH_CHECK( allow_preexisting_pyobj, @@ -93,7 +93,8 @@ PyObject* THPStorage_Wrap(c10::Storage storage) { } c10::impl::PyObjectSlot* pyobj_slot = storage_impl->pyobj_slot(); - std::optional maybe_pyobj = pyobj_slot->check_pyobj(); + std::optional maybe_pyobj = pyobj_slot->check_pyobj( + /*ignore_hermetic_tls=*/false); if (maybe_pyobj.has_value()) { auto obj = *maybe_pyobj; if (obj) { @@ -126,8 +127,8 @@ static bool THPStorage_isPreservable(THPStorage* self) { return false; } - if (storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj() != - (PyObject*)self) { + if (storage.unsafeGetStorageImpl()->pyobj_slot()->check_pyobj( + /*ignore_hermetic_tls=*/true) != (PyObject*)self) { return false; } if (storage.use_count() <= 1) { @@ -144,7 +145,8 @@ static bool THPStorage_tryPreserve(THPStorage* self) { const auto& storage = THPStorage_Unpack(self); c10::StorageImpl* storage_impl = storage.unsafeGetStorageImpl(); - auto maybe_pyobj = storage_impl->pyobj_slot()->check_pyobj(); + auto maybe_pyobj = storage_impl->pyobj_slot()->check_pyobj( + /*ignore_hermetic_tls=*/true); // NOTE: It is possible to just set the PyObjectSlot here, but the point is // that we should have already set PyObjectSlot when the storage PyObject // was created. 
diff --git a/torch/csrc/autograd/python_variable.cpp b/torch/csrc/autograd/python_variable.cpp index 0efef58d85f2..91f4bffbd103 100644 --- a/torch/csrc/autograd/python_variable.cpp +++ b/torch/csrc/autograd/python_variable.cpp @@ -265,7 +265,8 @@ PyObject* THPVariable_Wrap(const at::TensorBase& var) { } std::optional mb_obj = - var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(); + var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( + /*ignore_hermetic_tls=*/false); if (mb_obj.has_value()) { auto obj = *mb_obj; if (obj) { @@ -328,8 +329,8 @@ static bool isResurrectable(THPVariable* self) { return false; } // Check if this is hermetic. If it is, no resurrection. - if (tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj() != - (PyObject*)self) { + if (tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( + /*ignore_hermetic_tls=*/false) != (PyObject*)self) { return false; } return true; @@ -354,7 +355,8 @@ static bool THPVariable_tryResurrect(THPVariable* self) { !tensor.unsafeGetTensorImpl()->pyobj_slot()->owns_pyobj()); c10::TensorImpl* tensor_impl = tensor.unsafeGetTensorImpl(); - auto maybe_pyobj = tensor_impl->pyobj_slot()->check_pyobj(); + auto maybe_pyobj = tensor_impl->pyobj_slot()->check_pyobj( + /*ignore_hermetic_tls=*/false); TORCH_INTERNAL_ASSERT( maybe_pyobj.has_value(), @@ -2220,8 +2222,8 @@ static int THPVariable_subclass_clear(THPVariable* self) { // because Tensor asked us to (it's already destructing). 
if (!self->cdata.unsafeIsBorrowed() && - tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj() == - (PyObject*)self) { + tensor.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( + /*ignore_hermetic_tls=*/false) == (PyObject*)self) { // TODO: empirically, on OS X this assert appears to be untrue // In test_py_tensors_multi_async_call - ProcessGroupRpcTestWithSpawn // distributed/rpc/test_process_group_agent.py @@ -2407,7 +2409,8 @@ static PyObject* THPVariable_NewWithVar( // This function overwrite the Tensor's pyobj field without extra checks // Make sure it is not set otherwise we would leak memory - auto mb_obj = _var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj(); + auto mb_obj = _var.unsafeGetTensorImpl()->pyobj_slot()->check_pyobj( + /*ignore_hermetic_tls=*/false); // Under some circumstances, we may attempt to create a new Python // object for a variable that already has a Python object. The most common