Relax fences for intrusive ptr's refcnt (#162072)
Summary: Relax the memory fences used by intrusive_ptr's refcount operations. lock() needs acquire ordering only when the compare-exchange succeeds and can use relaxed ordering when it fails. In addition, the expired() call and the subsequent refcount read in lock() were merged to remove one extra atomic load. incref does not need any fences because the caller must already hold a valid reference; use_count() follows the same reasoning and can be a relaxed load. decref only needs a release fence to ensure every prior write has completed, plus an acquire fence when the refcount reaches zero so that no read observes stale data before the object is destructed. However, a microbenchmark showed that this release-plus-acquire-fence decref performs no noticeably better than the existing acq-rel decref, so decref is kept as-is. This change should have no material impact on x86, but on Arm64 (and other CPUs with weak memory models) it should improve performance.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/162072
Approved by: https://github.com/swolchok, https://github.com/yfeldblum
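As a rough illustration of the scheme described above, here is a minimal sketch using a hypothetical Counted type (not the actual c10 implementation): the increment is relaxed because the caller already holds a reference, the decrement stays acq-rel as the PR ultimately keeps it, and use_count() is a purely advisory relaxed read.

#include <atomic>
#include <cstdint>

// Hypothetical stand-in for an intrusive refcounted object; illustrative only.
struct Counted {
  std::atomic<uint32_t> refcount{1};

  // Increment: the caller already holds a valid reference, so the only
  // requirement is that it happens-before the matching decrement. Relaxed suffices.
  void incref() {
    refcount.fetch_add(1, std::memory_order_relaxed);
  }

  // Decrement: kept acq-rel, as the PR does. Release publishes all prior writes
  // to the object; acquire lets the final decrementer see them before destruction.
  // Returns true when the last reference is dropped.
  bool decref() {
    return refcount.fetch_sub(1, std::memory_order_acq_rel) == 1;
  }

  // Advisory read with no synchronization guarantees; relaxed is enough.
  uint32_t use_count() const {
    return refcount.load(std::memory_order_relaxed);
  }
};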
commit 36338fc7f2
parent e0c910149c
committed by PyTorch MergeBot
@@ -196,20 +196,25 @@ TTarget* assign_ptr_(TTarget* rhs) {
   }
 }
 
-// Increment needs to be acquire-release to make use_count() and
-// unique() reliable.
+// The only requirement for refcount increment is that it happens-before
+// decrement, so no additional memory ordering is needed.
 inline uint32_t atomic_refcount_increment(std::atomic<uint32_t>& refcount) {
-  return refcount.fetch_add(1, std::memory_order_acq_rel) + 1;
+  return refcount.fetch_add(1, std::memory_order_relaxed) + 1;
 }
 
 // weak_use_count() is only used for testing, so we don't need it to
 // be reliable. Relaxed should be fine.
 inline uint32_t atomic_weakcount_increment(std::atomic<uint32_t>& weakcount) {
   return weakcount.fetch_add(1, std::memory_order_relaxed) + 1;
 }
 
-// Both decrements need to be acquire-release for correctness. See
-// e.g. std::shared_ptr implementation.
+// The requirement is that all modifications to the managed object happen-before
+// invocation of the managed object destructor, and that allocation of the
+// managed object storage happens-before deallocation of the storage.
+//
+// To get this ordering, all non-final decrements must synchronize-with the
+// final decrement. So all non-final decrements have to store-release while the
+// final decrement has to load-acquire, either directly or with the help of
+// fences. But it's easiest just to have all decrements be acq-rel. And it turns
+// out, on modern architectures and chips, it's also fastest.
 inline uint32_t atomic_refcount_decrement(std::atomic<uint32_t>& refcount) {
   return refcount.fetch_sub(1, std::memory_order_acq_rel) - 1;
 }
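As a side note on the comment above: the release-plus-fence decrement that the commit message benchmarked (but did not adopt, since it was not measurably faster than acq-rel) would look roughly like the following sketch; the free-standing helper is hypothetical and not part of the actual file.

#include <atomic>
#include <cstdint>

// Hypothetical alternative decrement: release on every decrement, with an
// explicit acquire fence only on the path that will destroy the object.
inline bool decref_release_then_fence(std::atomic<uint32_t>& refcount) {
  if (refcount.fetch_sub(1, std::memory_order_release) == 1) {
    // Synchronizes with the release decrements of all other owners before the
    // object is destructed, so the destructor never sees stale data.
    std::atomic_thread_fence(std::memory_order_acquire);
    return true; // last reference dropped
  }
  return false;
}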
@@ -332,7 +337,7 @@ class intrusive_ptr final {
   intrusive_ptr() noexcept
       : intrusive_ptr(NullType::singleton(), raw::DontIncreaseRefcount{}) {}
 
-  intrusive_ptr(std::nullptr_t) noexcept
+  /* implicit */ intrusive_ptr(std::nullptr_t) noexcept
       : intrusive_ptr(NullType::singleton(), raw::DontIncreaseRefcount{}) {}
 
   // This constructor will not increase the ref counter for you.
@@ -445,14 +450,14 @@ class intrusive_ptr final {
     if (target_ == NullType::singleton()) {
       return 0;
     }
-    return target_->refcount_.load(std::memory_order_acquire);
+    return target_->refcount_.load(std::memory_order_relaxed);
   }
 
   uint32_t weak_use_count() const noexcept {
     if (target_ == NullType::singleton()) {
       return 0;
     }
-    return target_->weakcount_.load(std::memory_order_acquire);
+    return target_->weakcount_.load(std::memory_order_relaxed);
   }
 
   bool unique() const noexcept {
@@ -851,14 +856,14 @@ class weak_intrusive_ptr final {
       return 0;
     }
     return target_->refcount_.load(
-        std::memory_order_acquire); // refcount, not weakcount!
+        std::memory_order_relaxed); // refcount, not weakcount!
   }
 
   uint32_t weak_use_count() const noexcept {
     if (target_ == NullType::singleton()) {
       return 0;
     }
-    return target_->weakcount_.load(std::memory_order_acquire);
+    return target_->weakcount_.load(std::memory_order_relaxed);
   }
 
   bool expired() const noexcept {
@@ -866,18 +871,22 @@ class weak_intrusive_ptr final {
   }
 
   intrusive_ptr<TTarget, NullType> lock() const noexcept {
-    if (expired()) {
+    if (target_ == NullType::singleton()) {
       return intrusive_ptr<TTarget, NullType>();
     } else {
-      auto refcount = target_->refcount_.load(std::memory_order_seq_cst);
+      auto refcount = target_->refcount_.load(std::memory_order_relaxed);
       do {
         if (refcount == 0) {
           // Object already destructed, no strong references left anymore.
           // Return nullptr.
           return intrusive_ptr<TTarget, NullType>();
         }
-      } while (
-          !target_->refcount_.compare_exchange_weak(refcount, refcount + 1));
+      } while (!target_->refcount_.compare_exchange_weak(
+          refcount,
+          refcount + 1,
+          std::memory_order_acquire,
+          std::memory_order_relaxed));
       return intrusive_ptr<TTarget, NullType>(
           target_, raw::DontIncreaseRefcount{});
     }
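For reference, the increment-if-nonzero pattern from the lock() hunk above can be sketched against a bare std::atomic. The helper below is hypothetical, not the real weak_intrusive_ptr code: acquire ordering is needed only on the successful compare-exchange, which must synchronize with the final strong decrement, while the initial load and failed attempts can stay relaxed.

#include <atomic>
#include <cstdint>

// Try to take a strong reference unless the strong count already hit zero.
inline bool try_lock_refcount(std::atomic<uint32_t>& refcount) {
  uint32_t count = refcount.load(std::memory_order_relaxed);
  do {
    if (count == 0) {
      return false; // object already destructed; no strong references remain
    }
  } while (!refcount.compare_exchange_weak(
      count,
      count + 1,
      std::memory_order_acquire,    // success: synchronize with the final decref
      std::memory_order_relaxed));  // failure: retry with the freshly loaded value
  return true;
}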