[CCA][Memory Snapshot] Create TraceEntryRingBuffer class for alloc_trace logic (#130741)

Summary:
Move the alloc_trace logic into a separate class to reduce the risk of deadlocks when mixing with CCA's lock. Switch to a std::mutex instead of a std::recursive_mutex.

This lets us re-use the logic in the TraceEntryRingBuffer class in later diffs.
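
For illustration, a minimal stand-alone sketch of the semantics the new class implements, using an int payload instead of TraceEntry (names loosely mirror the diff; this is not the verbatim source): once the buffer is full, new inserts overwrite the oldest slot, and reads splice the two halves back into chronological order.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <mutex>
#include <vector>

// Simplified stand-in for TraceEntryRingBuffer with int entries.
class IntRingBuffer {
 public:
  void setMaxEntries(std::size_t size) {
    std::lock_guard<std::mutex> lk(lock_);
    max_entries_ = std::max(std::size_t(1), size);
  }
  void insert(int v) {
    std::lock_guard<std::mutex> lk(lock_);
    if (buf_.size() < max_entries_) {
      buf_.push_back(v); // still filling up: append
    } else {
      buf_[next_++] = v; // full: overwrite the oldest slot
      if (next_ == max_entries_) {
        next_ = 0; // wrap the write cursor
      }
    }
  }
  std::vector<int> entries() {
    std::lock_guard<std::mutex> lk(lock_);
    std::vector<int> out;
    out.reserve(buf_.size());
    auto mid = buf_.begin() + static_cast<std::ptrdiff_t>(next_);
    out.insert(out.end(), mid, buf_.end());   // oldest entries first
    out.insert(out.end(), buf_.begin(), mid); // then the newest
    return out;
  }

 private:
  std::mutex lock_;
  std::size_t max_entries_ = 1;
  std::size_t next_ = 0;
  std::vector<int> buf_;
};

int main() {
  IntRingBuffer rb;
  rb.setMaxEntries(3);
  for (int v = 1; v <= 5; ++v) {
    rb.insert(v); // 4 and 5 overwrite 1 and 2
  }
  assert((rb.entries() == std::vector<int>{3, 4, 5}));
}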

Test Plan: CI, resnet run, and FBR model.

Differential Revision: D59690408

Pulled By: aaronenyeshi

Pull Request resolved: https://github.com/pytorch/pytorch/pull/130741
Approved by: https://github.com/davidberard98
Author: Aaron Enye Shi
Date: 2024-07-16 15:01:48 +00:00
Committed by: PyTorch MergeBot
parent e11c41035c
commit aa4ad711ef

@@ -792,6 +792,69 @@ cudaError_t cudaMallocMaybeCapturing(void** p, size_t size) {
}
}
class TraceEntryRingBuffer {
public:
TraceEntryRingBuffer() {
// alloc_trace is a pointer because we need to intentionally
// leak it: on deallocation it can hold references to Python
// state which will already be destroyed when we are in exit handlers
// NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer)
alloc_trace = new std::vector<TraceEntry>();
}
void setMaxEntries(size_t size) {
std::lock_guard<std::mutex> lk(alloc_trace_lock);
alloc_trace_max_entries_ = std::max(size_t(1), size);
}
void insertTraceEntries(const TraceEntry& te) {
std::lock_guard<std::mutex> lk(alloc_trace_lock);
if (alloc_trace->size() < alloc_trace_max_entries_) {
alloc_trace->emplace_back(te);
} else {
(*alloc_trace)[alloc_trace_next++] = te;
if (alloc_trace_next == alloc_trace_max_entries_) {
alloc_trace_next = 0;
}
}
}
void getTraceEntries(std::vector<TraceEntry>& result) {
std::lock_guard<std::mutex> lk(alloc_trace_lock);
result.reserve(alloc_trace->size());
result.insert(
result.end(),
alloc_trace->begin() +
static_cast<std::vector<TraceEntry>::difference_type>(
alloc_trace_next),
alloc_trace->end());
result.insert(
result.end(),
alloc_trace->begin(),
alloc_trace->begin() +
static_cast<std::vector<TraceEntry>::difference_type>(
alloc_trace_next));
}
void clear() {
std::lock_guard<std::mutex> lk(alloc_trace_lock);
alloc_trace_next = 0;
alloc_trace->clear();
}
private:
size_t alloc_trace_max_entries_ = 1;
// Both alloc_trace and alloc_trace_next need to be used
// under alloc_trace_lock.
std::mutex alloc_trace_lock;
size_t alloc_trace_next = 0;
std::vector<TraceEntry>*
alloc_trace; // pointer because we need to intentionally leak this:
// on deallocation it can hold references to Python state which
// will already be destroyed when we are in exit handlers
};
} // anonymous namespace
} // namespace Native
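
The class above deliberately never frees alloc_trace. A self-contained sketch of that intentional-leak pattern (PyObjectLike is a hypothetical stand-in for the Python frame references the real entries can hold):

#include <string>
#include <vector>

// Trace state that may reference interpreter-owned objects is
// heap-allocated and never freed, so static-destruction order at
// process exit can never run destructors that would touch
// already-torn-down Python state.
struct PyObjectLike { // hypothetical stand-in for a held Python frame
  std::string repr;
};

std::vector<PyObjectLike>* g_trace = new std::vector<PyObjectLike>();
// No matching delete anywhere: leaking is the point.

int main() {
  g_trace->push_back({"alloc frame #1"});
  return 0; // g_trace deliberately outlives main
}
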
@@ -902,18 +965,9 @@ class DeviceCachingAllocator {
std::atomic<CreateContextFn> context_recorder_;
RecordContext record_context_ = RecordContext::NEVER;
size_t alloc_trace_max_entries_ = 1;
// Both alloc_trace and alloc_trace_next needs to be used
// under alloc_trace_lock.
// TODO: reduce risk of deadlock and remove recursive lock by
// wrapping this into a class instance.
std::recursive_mutex alloc_trace_lock;
size_t alloc_trace_next = 0;
std::vector<TraceEntry>*
alloc_trace; // pointer because we need to intentionally leak this on
// deallocation it can hold references to Python state which
// will already be destroyed when we are in exit handlers
// Ring buffer of TraceEntry records for memory snapshots
TraceEntryRingBuffer alloc_buffer;
// Members specific to CUDA graphs
@@ -939,9 +993,7 @@ class DeviceCachingAllocator {
public:
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
DeviceCachingAllocator()
: large_blocks(/*small=*/false),
small_blocks(/*small=*/true),
alloc_trace(new std::vector<TraceEntry>()) {
: large_blocks(/*small=*/false), small_blocks(/*small=*/true) {
stats.max_split_size =
static_cast<int64_t>(CUDAAllocatorConfig::max_split_size());
context_recorder_.store(nullptr);
@@ -950,18 +1002,16 @@ class DeviceCachingAllocator {
void recordHistory(
bool enabled,
CreateContextFn context_recorder,
size_t alloc_trace_max_entries,
size_t alloc_buffer_max_entries,
RecordContext when) {
std::unique_lock<std::recursive_mutex> lock(mutex);
TORCH_CHECK(when == RecordContext::NEVER || context_recorder);
record_history = enabled;
context_recorder_.store(record_history ? context_recorder : nullptr);
alloc_trace_max_entries_ = std::max(size_t(1), alloc_trace_max_entries);
alloc_buffer.setMaxEntries(alloc_buffer_max_entries);
record_context_ = enabled ? when : RecordContext::NEVER;
if (!enabled) {
std::lock_guard<std::recursive_mutex> lk(alloc_trace_lock);
alloc_trace_next = 0;
alloc_trace->clear();
alloc_buffer.clear();
}
}
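
The hunk above shows the nesting that lets the trace state drop its recursive_mutex: the allocator's own lock is always taken first, and the buffer's std::mutex is only taken inside leaf calls that never re-enter the allocator. A minimal sketch of that one-way lock ordering, with simplified stand-ins for the real classes:

#include <mutex>

// Lock order is always Allocator::outer -> Buffer::inner. Because the
// buffer never calls back into the allocator, the inner lock is never
// re-acquired by the same thread, so a plain std::mutex suffices.
struct Buffer {
  std::mutex inner; // analogue of TraceEntryRingBuffer::alloc_trace_lock
  void insert() {
    std::lock_guard<std::mutex> lk(inner); // leaf call: inner lock only
  }
};

struct Allocator {
  std::recursive_mutex outer; // analogue of DeviceCachingAllocator::mutex
  Buffer buf;
  void record() {
    std::lock_guard<std::recursive_mutex> lk(outer); // outer lock first
    buf.insert(); // inner lock second; never the other way around
  }
};

int main() {
  Allocator a;
  a.record();
}
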
@@ -1792,23 +1842,7 @@ class DeviceCachingAllocator {
const std::function<time_t(approx_time_t)>& tsc_to_us) {
std::lock_guard<std::recursive_mutex> lock(mutex);
std::vector<TraceEntry> result;
{
std::lock_guard<std::recursive_mutex> lk(alloc_trace_lock);
result.reserve(alloc_trace->size());
result.insert(
result.end(),
alloc_trace->begin() +
static_cast<std::vector<TraceEntry>::difference_type>(
alloc_trace_next),
alloc_trace->end());
result.insert(
result.end(),
alloc_trace->begin(),
alloc_trace->begin() +
static_cast<std::vector<TraceEntry>::difference_type>(
alloc_trace_next));
}
alloc_buffer.getTraceEntries(result);
// Convert all the timestamps from tsc to epoch time in microseconds.
for (auto& te : result) {
@@ -2910,15 +2944,7 @@ class DeviceCachingAllocator {
}
if (record_history) {
std::lock_guard<std::recursive_mutex> lk(alloc_trace_lock);
if (alloc_trace->size() < alloc_trace_max_entries_) {
alloc_trace->emplace_back(te);
} else {
(*alloc_trace)[alloc_trace_next++] = te;
if (alloc_trace_next == alloc_trace_max_entries_) {
alloc_trace_next = 0;
}
}
alloc_buffer.insertTraceEntries(te);
}
}
};
@@ -3062,11 +3088,11 @@ class NativeCachingAllocator : public CUDAAllocator {
void recordHistory(
bool enabled,
CreateContextFn context_recorder,
size_t alloc_trace_max_entries,
size_t alloc_buffer_max_entries,
RecordContext when) override {
for (auto& allocator : device_allocator) {
allocator->recordHistory(
enabled, context_recorder, alloc_trace_max_entries, when);
enabled, context_recorder, alloc_buffer_max_entries, when);
}
}