Mirror of https://github.com/pytorch/pytorch.git (synced 2025-11-19 18:14:54 +08:00)
Shrink binary size (#168080)
Summary: Shrink binary size to reduce relocation overflows. The most important
change is to split `intrusive_ptr::reset_()` into two functions and mark the
bigger one as `C10_NOINLINE`.

Differential Revision: D87308588

Pull Request resolved: https://github.com/pytorch/pytorch/pull/168080
Approved by: https://github.com/albanD, https://github.com/Skylion007, https://github.com/malfet, https://github.com/ezyang
Committed by: PyTorch MergeBot
Parent: 65f08eeec1
Commit: d48cae96a6
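The commit's main size lever is a hot/cold split of `intrusive_ptr::reset_()`: the null check stays small enough to inline at every call site, while the large decrement-and-destroy path is outlined behind `C10_NOINLINE` so its machine code is emitted once per instantiation instead of at every inlined copy. A minimal sketch of the pattern, with a hypothetical `Counted`/`Handle` pair standing in for the real classes:

#include <atomic>
#include <cstdint>

#define NOINLINE __attribute__((noinline)) // C10_NOINLINE expands similarly on GCC/Clang

struct Counted {
  std::atomic<uint64_t> refcount{1};
};

struct Handle {
  Counted* target_ = nullptr;

  // Hot path: tiny, inlined at every call site.
  void reset_() noexcept {
    if (target_ != nullptr) {
      reset_not_null_(target_); // cold path, outlined below
    }
  }

  // Cold path: emitted once. Taking the pointer as a parameter spares the
  // outlined function a reload of the member through `this`, which is also
  // what the real patch does.
  NOINLINE static void reset_not_null_(Counted* target) noexcept {
    if (target->refcount.fetch_sub(1, std::memory_order_acq_rel) == 1) {
      delete target;
    }
  }
};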
--- a/c10/core/StorageImpl.cpp
+++ b/c10/core/StorageImpl.cpp
@@ -48,7 +48,7 @@ void warnDeprecatedDataPtr() {
   TORCH_CHECK(false, "Cannot access data pointer of Storage that is invalid.");
 }
 
-void StorageImpl::incref_pyobject() const {
+void StorageImpl::incref_pyobject() const noexcept {
   // Because intrusive_ptr incref uses relaxed memory order, we need to
   // do an acquire fence to ensure that the kHasPyObject bit was
   // observed before the load of the PyObject* below.
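The comment in the hunk above describes a standard fence/relaxed-RMW pairing: the refcount increment itself is relaxed, so an explicit acquire fence is needed before reading the PyObject pointer that another thread published when it set the kHasPyObject bit. A toy illustration of the idiom, with assumed stand-in names rather than the real pyobj_slot_ layout:

#include <atomic>
#include <cstdint>

std::atomic<uint64_t> combined{1};  // packed refcounts plus the kHasPyObject bit
std::atomic<void*> pyobj{nullptr};  // stand-in for the cached PyObject*
constexpr uint64_t kHasPyObject = 1ull << 63;

// Writer: publish the PyObject*, then set the flag with release semantics.
void publish(void* obj) {
  pyobj.store(obj, std::memory_order_relaxed);
  combined.fetch_or(kHasPyObject, std::memory_order_release);
}

// Reader: the incref is a relaxed RMW, which by itself orders nothing, so an
// acquire fence is issued once the flag bit has been observed; the fence
// pairs with the release fetch_or above and makes the PyObject* visible.
void* incref_and_read() {
  uint64_t old = combined.fetch_add(1, std::memory_order_relaxed);
  if (old & kHasPyObject) {
    std::atomic_thread_fence(std::memory_order_acquire);
    return pyobj.load(std::memory_order_relaxed);
  }
  return nullptr;
}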
@@ -59,12 +59,12 @@ void StorageImpl::incref_pyobject() const {
   (*pyobj_slot_.pyobj_interpreter())->incref(obj);
 }
 
-void StorageImpl::decref_pyobject() const {
+void StorageImpl::decref_pyobject() const noexcept {
   PyObject* obj = pyobj_slot_.load_pyobj();
   (*pyobj_slot_.pyobj_interpreter())->decref(obj);
 }
 
-bool StorageImpl::try_incref_pyobject() const {
+bool StorageImpl::try_incref_pyobject() const noexcept {
   c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter();
   if (C10_UNLIKELY(!interp)) {
     return false;
--- a/c10/core/StorageImpl.h
+++ b/c10/core/StorageImpl.h
@@ -105,11 +105,11 @@ struct C10_API StorageImpl : public c10::intrusive_ptr_target {
     data_ptr_.clear();
   }
 
-  void incref_pyobject() const override final;
+  void incref_pyobject() const noexcept override final;
 
-  void decref_pyobject() const override final;
+  void decref_pyobject() const noexcept override final;
 
-  bool try_incref_pyobject() const override final;
+  bool try_incref_pyobject() const noexcept override final;
 
   size_t nbytes() const {
     // OK to do this instead of maybe_as_int as nbytes is guaranteed positive
--- a/c10/core/TensorImpl.cpp
+++ b/c10/core/TensorImpl.cpp
@@ -988,7 +988,7 @@ void TensorImpl::empty_tensor_restride_symint(MemoryFormat memory_format) {
   }
 }
 
-void TensorImpl::incref_pyobject() const {
+void TensorImpl::incref_pyobject() const noexcept {
   // Because intrusive_ptr incref uses relaxed memory order, we need to
   // do an acquire fence to ensure that the kHasPyObject bit was
   // observed before the load of the PyObject* below.
@@ -999,12 +999,12 @@ void TensorImpl::incref_pyobject() const {
   (*pyobj_slot_.pyobj_interpreter())->incref(obj);
 }
 
-void TensorImpl::decref_pyobject() const {
+void TensorImpl::decref_pyobject() const noexcept {
   PyObject* obj = pyobj_slot_.load_pyobj();
   (*pyobj_slot_.pyobj_interpreter())->decref(obj);
 }
 
-bool TensorImpl::try_incref_pyobject() const {
+bool TensorImpl::try_incref_pyobject() const noexcept {
   c10::impl::PyInterpreter* interp = pyobj_slot_.pyobj_interpreter();
   if (C10_UNLIKELY(!interp)) {
     return false;
--- a/c10/core/TensorImpl.h
+++ b/c10/core/TensorImpl.h
@@ -2178,11 +2178,11 @@ struct C10_API TensorImpl : public c10::intrusive_ptr_target {
     return &pyobj_slot_;
   }
 
-  void incref_pyobject() const override final;
+  void incref_pyobject() const noexcept override final;
 
-  void decref_pyobject() const override final;
+  void decref_pyobject() const noexcept override final;
 
-  bool try_incref_pyobject() const override final;
+  bool try_incref_pyobject() const noexcept override final;
 
  private:
   // See NOTE [std::optional operator usage in CUDA]
--- a/c10/util/intrusive_ptr.h
+++ b/c10/util/intrusive_ptr.h
@@ -68,6 +68,10 @@ inline bool has_pyobject(uint64_t combined_refcount) {
   return (combined_refcount & kHasPyObject) != 0;
 }
 
+inline bool is_uniquely_owned(uint64_t combined_refcount) {
+  return (combined_refcount & ~detail::kHasPyObject) == detail::kUniqueRef;
+}
+
 // The only requirement for refcount increment is that it happens-before
 // decrement, so no additional memory ordering is needed.
 inline uint64_t atomic_combined_refcount_increment(
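The new `detail::is_uniquely_owned` helper works because the strong and weak counts live in one 64-bit word together with a PyObject flag bit: masking the flag off and comparing against the bit pattern of "strong == 1, weak == 1" tests unique ownership with a single load. A sketch under an assumed layout (low 32 bits strong count, high bits weak count, one flag bit; the real kUniqueRef/kHasPyObject constants may differ):

#include <cstdint>

// Assumed, illustrative layout: bits 0-31 strong count, bits 32-62 weak
// count, bit 63 the has-PyObject flag.
constexpr uint64_t kReferenceCountOne = 1ull;
constexpr uint64_t kWeakReferenceCountOne = 1ull << 32;
constexpr uint64_t kHasPyObject = 1ull << 63;
constexpr uint64_t kUniqueRef = kReferenceCountOne | kWeakReferenceCountOne;

constexpr bool is_uniquely_owned(uint64_t combined_refcount) {
  // Ignore the PyObject bit; a uniquely owned object has exactly one strong
  // ref plus the implicit weak ref that accompanies any live strong count.
  return (combined_refcount & ~kHasPyObject) == kUniqueRef;
}

static_assert(is_uniquely_owned(kUniqueRef));
static_assert(is_uniquely_owned(kUniqueRef | kHasPyObject));
static_assert(!is_uniquely_owned(kUniqueRef + kReferenceCountOne));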
@@ -287,9 +291,9 @@ class C10_API intrusive_ptr_target {
    * These two methods are called when the refcount transitions between one
    * and two and the object has a PyObject wrapper.
    */
-  virtual void incref_pyobject() const {}
-  virtual void decref_pyobject() const {}
-  virtual bool try_incref_pyobject() const {
+  virtual void incref_pyobject() const noexcept {}
+  virtual void decref_pyobject() const noexcept {}
+  virtual bool try_incref_pyobject() const noexcept {
     return false;
   }
 
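Note the ordering constraint that ties this hunk to the StorageImpl/TensorImpl hunks above: C++ allows a `noexcept` function to override a potentially-throwing virtual, but not the reverse, so the base-class defaults in `intrusive_ptr_target` must gain `noexcept` in the same change as (or before) the derived overrides. A minimal illustration:

struct Base {
  virtual void f() noexcept {}
  virtual void g() {}
  virtual ~Base() = default;
};

struct Derived : Base {
  void f() noexcept override {} // OK: matches the base's non-throwing spec
  void g() noexcept override {} // OK: an override may strengthen to noexcept
  // void f() override {}       // ill-formed: an override may not be "more
  //                            // throwing" than the virtual it overrides
};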
@@ -363,7 +367,7 @@ class intrusive_ptr final {
   template <typename, typename...>
   friend class pybind11::class_;
 
-  void retain_() {
+  void retain_() noexcept {
     if (target_ != NullType::singleton()) {
       uint64_t combined = detail::atomic_combined_refcount_increment(
           target_->combined_refcount_, detail::kReferenceCountOne);
@@ -377,9 +381,7 @@ class intrusive_ptr final {
       // PyObject. In other words, we need to ensure that the PyObject stays
       // alive now that we have a C++ reference to this object in addition to
       // the PyObject itself.
-      if (C10_UNLIKELY(
-              detail::has_pyobject(combined) &&
-              detail::refcount(combined) == 2)) {
+      if (detail::has_pyobject(combined) && detail::refcount(combined) == 2) {
         target_->incref_pyobject();
       }
     } else {
@@ -392,51 +394,60 @@ class intrusive_ptr final {
 
   void reset_() noexcept {
     if (target_ != NullType::singleton()) {
-      if (is_uniquely_owned()) {
-        // Both counts are 1, so there are no weak references and
-        // we are releasing the last strong reference. No other
-        // threads can observe the effects of this target_ deletion
-        // call (e.g. calling use_count()) without a data race.
-        target_->combined_refcount_.store(0, std::memory_order_relaxed);
-        delete target_;
-        return;
-      }
-
-      auto combined_refcount = detail::atomic_combined_refcount_decrement(
-          target_->combined_refcount_, detail::kReferenceCountOne);
-      uint32_t new_refcount = detail::refcount(combined_refcount);
-      bool has_pyobject = detail::has_pyobject(combined_refcount);
-      if (new_refcount == 0) {
-        bool should_delete = detail::weakcount(combined_refcount) == 1;
-        // See comment above about weakcount. As long as refcount>0,
-        // weakcount is one larger than the actual number of weak references.
-        // So we need to decrement it here.
-        if (!should_delete) {
-          // justification for const_cast: release_resources is basically a
-          // destructor and a destructor always mutates the object, even for
-          // const objects.
-          // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
-          const_cast<std::remove_const_t<TTarget>*>(target_)
-              ->release_resources();
-          should_delete = detail::atomic_weakcount_decrement(
-              target_->combined_refcount_) == 0;
-        }
-        if (should_delete) {
-          delete target_;
-        }
-      } else if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
-        // If the refcount transitioned from 2 to 1, we need to decref the
-        // PyObject. In other words, we don't want to keep the PyObject alive if
-        // there are no C++ references to this object other than the PyObject
-        // itself.
-        if (C10_UNLIKELY(has_pyobject && new_refcount == 1)) {
-          target_->decref_pyobject();
-        }
-      } else {
-        TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
-            !has_pyobject,
-            "TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set.");
-      }
+      reset_not_null_(target_);
     }
   }
+
+  // C10_NOINLINE to keep binary size a bit smaller. We pass TTarget* here
+  // to avoid an extra pointer dereference in the call from reset_().
+  C10_NOINLINE static void reset_not_null_(TTarget* target) noexcept {
+    if (detail::is_uniquely_owned(
+            target->combined_refcount_.load(std::memory_order_acquire))) {
+      // Both counts are 1, so there are no weak references and
+      // we are releasing the last strong reference. No other
+      // threads can observe the effects of this target deletion
+      // call (e.g. calling use_count()) without a data race.
+      target->combined_refcount_.store(0, std::memory_order_relaxed);
+      delete target;
+      return;
+    }
+
+    auto combined_refcount = detail::atomic_combined_refcount_decrement(
+        target->combined_refcount_, detail::kReferenceCountOne);
+    uint32_t new_refcount = detail::refcount(combined_refcount);
+    bool has_pyobject = detail::has_pyobject(combined_refcount);
+    if (new_refcount == 0) {
+      if (detail::weakcount(combined_refcount) == 1) {
+        delete target;
+        return;
+      }
+      // See comment above about weakcount. As long as refcount>0,
+      // weakcount is one larger than the actual number of weak references.
+      // So we need to decrement it here.
+      release_resources_and_decrement_weakrefs_(target);
+    } else if constexpr (detail::TargetTraits<TTarget>::can_have_pyobject) {
+      // If the refcount transitioned from 2 to 1, we need to decref the
+      // PyObject. In other words, we don't want to keep the PyObject alive if
+      // there are no C++ references to this object other than the PyObject
+      // itself.
+      if (has_pyobject && new_refcount == 1) {
+        target->decref_pyobject();
+      }
+    } else {
+      TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+          !has_pyobject,
+          "TargetTraits indicates that type cannot have PyObject, but refcount has PyObject bit set.");
+    }
+  }
+
+  C10_NOINLINE static void release_resources_and_decrement_weakrefs_(
+      TTarget* target) noexcept {
+    // justification for const_cast: release_resources is basically a
+    // destructor and a destructor always mutates the object, even for
+    // const objects.
+    const_cast<std::remove_const_t<TTarget>*>(target)->release_resources();
+    if (detail::atomic_weakcount_decrement(target->combined_refcount_) == 0) {
+      delete target;
+    }
+  }
 
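`C10_NOINLINE` is what keeps `reset_not_null_` and `release_resources_and_decrement_weakrefs_` out of every caller of `reset_()`: each `intrusive_ptr<T>` instantiation then carries one copy of the cold path instead of an inlined copy at every reset and destructor site, cutting text size and, per the summary, relocation pressure. The macro maps to compiler-specific attributes, roughly like the sketch below (the actual definition lives in c10/macros/Macros.h and handles more compilers):

#if defined(_MSC_VER)
#define C10_NOINLINE __declspec(noinline)
#elif defined(__GNUC__) || defined(__clang__)
#define C10_NOINLINE __attribute__((noinline))
#else
#define C10_NOINLINE
#endif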
@@ -607,9 +618,8 @@ class intrusive_ptr final {
    */
   bool is_uniquely_owned() const noexcept {
     TORCH_INTERNAL_ASSERT_DEBUG_ONLY(target_ != NullType::singleton());
-    uint64_t combined =
-        target_->combined_refcount_.load(std::memory_order_acquire);
-    return (combined & ~detail::kHasPyObject) == detail::kUniqueRef;
+    return detail::is_uniquely_owned(
+        target_->combined_refcount_.load(std::memory_order_acquire));
   }
 
   /**
@@ -1174,9 +1184,7 @@ inline void incref(intrusive_ptr_target* self) {
       self->combined_refcount_, detail::kReferenceCountOne);
 
 #ifndef C10_MOBILE
-  if (C10_UNLIKELY(
-          detail::has_pyobject(combined) &&
-          detail::refcount(combined) == 2)) {
+  if (detail::has_pyobject(combined) && detail::refcount(combined) == 2) {
     self->incref_pyobject();
   }
 #else
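The dropped `C10_UNLIKELY` wrappers here and in `retain_()` are branch-prediction hints; removing them simplifies the condition to one line and lets the compiler lay the block out normally as part of the same size-focused cleanup. The macro is conventionally built on `__builtin_expect`, roughly:

#if defined(__GNUC__) || defined(__clang__)
#define C10_LIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 1))
#define C10_UNLIKELY(expr) (__builtin_expect(static_cast<bool>(expr), 0))
#else
#define C10_LIKELY(expr) (expr)
#define C10_UNLIKELY(expr) (expr)
#endif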