#include <c10/core/impl/alloc_cpu.h>
#include <c10/mobile/CPUProfilingAllocator.h>
#include <c10/util/Exception.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/irange.h>

#include <algorithm>
#include <cstdint>
#include <limits>
#include <map>
#include <memory>
#include <set>
#include <vector>

namespace c10 {

namespace {
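
// Active planner/allocator for the current thread. These pointers are
// installed and cleared by the RAII guards defined at the end of this file
// (WithProfileAllocationsGuard, WithValidateAllocationPlanGuard and
// WithProfilingAllocatorGuard), so profiling state does not leak across
// threads.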
thread_local AllocationPlanner* allocation_planner{nullptr};
thread_local CPUProfilingAllocator* profiling_allocator{nullptr};

struct MemBlock {
  uint64_t start_offset, end_offset;
  MemBlock(uint64_t s, uint64_t e) : start_offset(s), end_offset(e) {}
  bool operator<(const MemBlock& other) const {
    return start_offset < other.start_offset;
  }
};

enum class EventType { Allocate = 0, Free, Invalid };

struct MemEvent {
  uint64_t time;
  uint64_t allocation_id;
  uint64_t size;
  EventType type{EventType::Invalid};
  MemEvent(uint64_t t, uint64_t id, uint64_t s, EventType e)
      : time(t), allocation_id(id), size(s), type(e) {}
};

bool overlaps(const MemBlock& a, const MemBlock& b) {
  // Two blocks don't overlap if one ends before the other begins:
  // |---a--------|--------------b--------|
  // start_a    end_a <= start_b        end_b
  return !(
      (a.end_offset <= b.start_offset) || (b.end_offset <= a.start_offset));
}
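
// Replays the (time-sorted) allocation/free events against the offsets
// produced by the planner and verifies that no two simultaneously live,
// plan-managed blocks overlap. Allocations whose offset is the uint64_t max
// sentinel are not managed by the plan and are skipped.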
bool validate_allocation_plan(
    const std::vector<MemEvent>& alloc_events,
    const std::vector<uint64_t>& allocation_offsets) {
  std::set<MemBlock> allocations;
  for (const auto& event : alloc_events) {
    auto alloc_id = event.allocation_id;
    // Skip allocations not managed by AllocationPlan
    if (allocation_offsets[alloc_id] == std::numeric_limits<uint64_t>::max()) {
      continue;
    }
    auto start_offset = allocation_offsets[alloc_id];
    auto end_offset = allocation_offsets[alloc_id] + event.size;
    MemBlock mem_block(start_offset, end_offset);
    if (event.type == EventType::Allocate) {
      auto it = allocations.lower_bound(mem_block);
      if (it != allocations.end()) {
        auto next_block = *it;
        if (overlaps(next_block, mem_block)) {
          return false;
        }
      }
      if (it != allocations.begin()) {
        auto prev_block = *(--it);
        if (overlaps(prev_block, mem_block)) {
          return false;
        }
      }
      allocations.emplace(mem_block);
    } else if (event.type == EventType::Free) {
      auto it = allocations.find(mem_block);
      // Check that the block exists before dereferencing the iterator.
      TORCH_CHECK(
          it != allocations.end(),
          "ProfilingAllocator: Allocate event "
          "must have preceded deallocate event.");
      TORCH_CHECK(
          it->end_offset == end_offset,
          "End offset of allocation being freed must match the one recorded.");
      allocations.erase(it);
    } else {
      TORCH_CHECK(false, "ProfilingAllocator: Invalid event type.");
    }
  }
  return true;
}
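
// Expands the recorded (size, lifetime) pairs into a flat event stream.
// Allocation i produces an Allocate event at time i and a Free event at
// time allocation_lifetimes[i] (the value of the allocation counter when the
// free was recorded). Allocations that were never freed inside the profiled
// scope carry the uint64_t max sentinel and are skipped, i.e. they are not
// managed by the AllocationPlan.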
std::vector<MemEvent> create_and_sort_mem_events(
    const std::vector<uint64_t>& allocation_sizes,
    const std::vector<uint64_t>& allocation_lifetimes) {
  std::vector<MemEvent> events;
  for (uint64_t i = 0; i < allocation_sizes.size(); ++i) {
    // If an observed allocation is freed outside the scope of
    // observation, then it is not managed by the AllocationPlan.
    if (allocation_lifetimes[i] == std::numeric_limits<uint64_t>::max()) {
      continue;
    }
    events.emplace_back(i, i, allocation_sizes[i], EventType::Allocate);
    events.emplace_back(
        allocation_lifetimes[i], i, allocation_sizes[i], EventType::Free);
  }
  std::sort(
      events.begin(),
      events.end(),
      [](const MemEvent& a, const MemEvent& b) -> bool {
        return a.time < b.time;
      });
  return events;
}
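
// Best-fit greedy planner: walks the sorted event stream, carving allocations
// out of the smallest free chunk that fits (or growing the arena when none
// fits) and coalescing adjacent free chunks on every free. Returns one offset
// per recorded allocation; allocations that are not managed by the plan keep
// the uint64_t max sentinel.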
std::vector<uint64_t> formulate_greedy_allocation_plan(
    const std::vector<uint64_t>& allocation_sizes,
    const std::vector<uint64_t>& allocation_lifetimes) {
  // Step 1. Construct all allocation/free events.
  //         Sort these events by timestamp.
  // Step 2. Iterate through all events.
  //   2.1 If allocate event:
  //       Find all candidates in the free_size_to_offset map.
  //       Greedily pick the first one.
  //       Remove the entry from the free_size_to_offset map.
  //       new_offset = offset + request_size
  //       new_size = size - request_size
  //       Add the new entry to both maps.
  //   2.2 If free event:
  //       Check if the returned offset merges with another chunk.
  //       If so, merge until no more merging is possible.
  //       If the returned offset does not merge, then
  //       just return it as a chunk.

  // lower_bound on this map will get all candidates of
  // the right size for allocation.
  std::map<uint64_t, uint64_t> free_size_to_offset;
  // These provide fast lookup when we want to insert a freed block
  // back, especially when we want to merge blocks.
  ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator>
      free_start_offset_to_size_iter;
  ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator>
      free_end_offset_to_size_iter;
  // Upon free: end_ptr = offset + size.
  // If end_ptr exists, merge the freed allocation with that block.
  // Also find the corresponding entry in free_size_to_offset,
  // remove it and re-insert it with the new size and offset.
  // If end_ptr does not exist, then just insert offset,size
  // in the map and, correspondingly, size,offset in the other map.
  // Merging should always be done recursively until no more mergeable
  // chunks can be found.
  // After the last free we should have only one entry left in these maps.

  std::vector<uint64_t> allocation_offsets(
      allocation_sizes.size(), std::numeric_limits<uint64_t>::max());
  auto mem_events =
      create_and_sort_mem_events(allocation_sizes, allocation_lifetimes);
  uint64_t max_offset{0};
  for (const auto& mem_event : mem_events) {
    uint64_t alloc_offset = 0;
    uint64_t new_offset = 0, new_size = 0;
    if (mem_event.type == EventType::Allocate) {
      auto it = free_size_to_offset.lower_bound(mem_event.size);
      if (it == free_size_to_offset.end()) {
        // If there is no contiguous block of the size requested,
        // allocate a new one.
        alloc_offset = max_offset;
        max_offset += mem_event.size;
      } else {
        // If we have found a block of the size we want
        // 1. change the block by allocating out of it.
        //    1.1 Erase the entire block
        //    1.2 Erase the reverse map entries
        // 2. If the block still has space left, insert the remainder back in
        //    the map, including the reverse map entries.
        alloc_offset = it->second;
        new_offset = alloc_offset + mem_event.size;
        new_size = it->first - mem_event.size;
        // Erase the reverse map entries while `it` is still valid, then
        // erase `it` itself from free_size_to_offset.
        free_start_offset_to_size_iter.erase(alloc_offset);
        free_end_offset_to_size_iter.erase(alloc_offset + it->first);
        free_size_to_offset.erase(it);
        if (new_size > 0) {
          auto ref_it = free_size_to_offset.emplace(new_size, new_offset).first;
          free_start_offset_to_size_iter.emplace(new_offset, ref_it);
          free_end_offset_to_size_iter.emplace(new_offset + new_size, ref_it);
        }
      }
      allocation_offsets[mem_event.allocation_id] = alloc_offset;
    } else {
      // 1. Check if there is a free block that starts where the freed block
      //    ends (lookup in free_start_offset_to_size_iter). If so, remove it
      //    and grow the block being freed.
      // 2. Similarly, check if there is a free block that ends where the
      //    freed block starts (lookup in free_end_offset_to_size_iter). If
      //    so, remove it and grow the block being freed, adjusting its start
      //    offset.
      // 3. Insert the (possibly merged) freed block into the maps.
      auto freed_offset = allocation_offsets[mem_event.allocation_id];
      auto freed_size = mem_event.size;
      auto end_offset = freed_offset + freed_size;
      // Merge when another free block exists at the end of this block.
      auto end_it = free_start_offset_to_size_iter.find(end_offset);
      if (end_it != free_start_offset_to_size_iter.end()) {
        auto merge_block_iter = end_it->second;
        auto merge_block_size = merge_block_iter->first;
        freed_size += merge_block_size;
        free_size_to_offset.erase(merge_block_iter);
        free_start_offset_to_size_iter.erase(end_it);
        // If the block is being merged then also remove it from
        // free_end_offset_to_size_iter.
        free_end_offset_to_size_iter.erase(end_offset + merge_block_size);
      }
      // Merge when the freed block sits at the end of another free block.
      auto start_it = free_end_offset_to_size_iter.find(freed_offset);
      if (start_it != free_end_offset_to_size_iter.end()) {
        auto merge_block_iter = start_it->second;
        auto merge_block_size = merge_block_iter->first;
        freed_size += merge_block_size;
        freed_offset -= merge_block_size;
        free_size_to_offset.erase(merge_block_iter);
        free_end_offset_to_size_iter.erase(start_it);
        // If the block is being merged then also remove it from
        // free_start_offset_to_size_iter.
        free_start_offset_to_size_iter.erase(freed_offset);
      }
      auto freed_block_it =
          free_size_to_offset.emplace(freed_size, freed_offset).first;
      free_start_offset_to_size_iter.emplace(freed_offset, freed_block_it);
      free_end_offset_to_size_iter.emplace(
          freed_offset + freed_size, freed_block_it);
    }
  }
  TORCH_CHECK(
      validate_allocation_plan(mem_events, allocation_offsets),
      "ProfilingAllocator: Allocation plan invalid.");
  return allocation_offsets;
}

} // namespace

void AllocationPlan::clear() {
  allocation_sizes.clear();
  allocation_lifetimes.clear();
  allocation_offsets.clear();
}
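
// AllocationPlanner has two modes:
// - Recording (validation_mode_ == false): every allocation appends its size
//   to the plan and gets a sequential id; a free stamps the current
//   allocation counter into the corresponding lifetime slot.
// - Validation (validation_mode_ == true): allocations and frees are checked
//   against a previously formulated plan instead of being recorded, and any
//   mismatch clears validation_success.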
void AllocationPlanner::record_allocation(
    const uint64_t size,
    const void* ptr) {
  if (validation_mode_) {
    validation_success = validation_success && validate_allocation(size, ptr);
    return;
  }
  allocation_plan_->allocation_sizes.push_back(size);
  allocation_plan_->allocation_lifetimes.push_back(
      std::numeric_limits<uint64_t>::max());
  allocation_ptr_to_id_[ptr] = allocation_id_;
  allocation_id_++;
}

void AllocationPlanner::record_free(const void* ptr) {
  if (validation_mode_) {
    validation_success = validation_success && validate_free(ptr);
    return;
  }
  auto it = allocation_ptr_to_id_.find(ptr);
  if (it == allocation_ptr_to_id_.end()) {
    // The pointer being freed was allocated outside of
    // WithProfileAllocationsGuard.
    return;
  }
  auto id = it->second;
  TORCH_CHECK(
      id < allocation_plan_->allocation_lifetimes.size(),
      "Allocation must have been recorded during record_allocation.");
  allocation_plan_->allocation_lifetimes[id] = allocation_id_;
}
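
// Validation checks that a rerun makes exactly the same sequence of requests
// as the recorded run: the i-th allocation must request the recorded size,
// and each free must happen at the same point in the allocation sequence
// (i.e. with the same value of the allocation counter) as when it was
// recorded.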
bool AllocationPlanner::validate_allocation(
    const uint64_t size,
    const void* ptr) {
  if (allocation_id_ >= allocation_plan_->allocation_sizes.size()) {
    TORCH_WARN(
        "Allocation request does not match plan: ",
        "Allocation id: ",
        allocation_id_,
        " exceeds the number of recorded allocations: ",
        allocation_plan_->allocation_sizes.size());
    return false;
  }
  if (allocation_plan_->allocation_sizes[allocation_id_] != size) {
    TORCH_WARN(
        "Allocation request does not match plan: ",
        "Allocation id: ",
        allocation_id_,
        ", Number of recorded allocations: ",
        allocation_plan_->allocation_sizes.size(),
        ", Recorded size of the requested allocation: ",
        allocation_plan_->allocation_sizes[allocation_id_],
        ", but got: ",
        size);
    return false;
  }
  allocation_ptr_to_id_[ptr] = allocation_id_;
  allocation_id_++;
  return true;
}

bool AllocationPlanner::validate_free(const void* ptr) {
  auto it = allocation_ptr_to_id_.find(ptr);
  if (it == allocation_ptr_to_id_.end()) {
    // An allocation that was made outside the validation scope is being
    // freed here.
    return true;
  }
  auto id = it->second;
  TORCH_CHECK(
      id < allocation_plan_->allocation_lifetimes.size(),
      "Allocation must have been recorded during validate_allocation.");
  auto lifetime_id = allocation_plan_->allocation_lifetimes[id];
  return (lifetime_id == allocation_id_);
}
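
// Turns the recorded sizes/lifetimes into concrete offsets via the greedy
// planner above, and records the total arena size as the high-water mark
// over all plan-managed allocations.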
void AllocationPlanner::formulate_plan() {
  allocation_plan_->allocation_offsets = formulate_greedy_allocation_plan(
      allocation_plan_->allocation_sizes,
      allocation_plan_->allocation_lifetimes);
  allocation_plan_->total_size = 0;
  for (const auto i : c10::irange(allocation_plan_->allocation_sizes.size())) {
    if (allocation_plan_->allocation_lifetimes[i] ==
        std::numeric_limits<uint64_t>::max()) {
      continue;
    }
    auto limit = allocation_plan_->allocation_offsets[i] +
        allocation_plan_->allocation_sizes[i];
    allocation_plan_->total_size =
        std::max(allocation_plan_->total_size, limit);
  }
}

void AllocationPlanner::clear() {
  allocation_plan_->clear();
  allocation_ptr_to_id_.clear();
}
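
// Installs a plan and, if needed, grows the backing blob to the plan's
// total_size. The blob is kept (and only ever grown) across
// set_plan/unset_plan calls, so it can be reused by subsequent plans.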
void CPUProfilingAllocator::set_plan(const AllocationPlan* plan) {
  TORCH_CHECK(plan != nullptr, "Allocation plan is nullptr.");
  plan_ = plan;
  allocation_id_ = 0;
  allocation_ptr_to_id_.clear();
  if (current_size_ < plan->total_size) {
    // Free existing memory and reallocate for larger size.
    c10::free_cpu(blob_);
    blob_ = c10::alloc_cpu(plan->total_size);
    current_size_ = plan->total_size;
  }
}

void CPUProfilingAllocator::unset_plan() {
  allocation_id_ = 0;
  allocation_ptr_to_id_.clear();
  plan_ = nullptr;
}
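
// Requests that follow the plan are served out of the preallocated blob at
// the planned offset; allocations the plan does not manage (lifetime
// sentinel) fall back to c10::alloc_cpu/free_cpu. Requests must arrive in
// exactly the recorded order, which is enforced by the checks below.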
void* CPUProfilingAllocator::allocate(const size_t bytes) {
  TORCH_CHECK(
      bytes == plan_->allocation_sizes[allocation_id_],
      "Got an allocation request that does not match the plan.");
  if (plan_->allocation_lifetimes[allocation_id_] ==
      std::numeric_limits<uint64_t>::max()) {
    // This allocation is not managed by ProfilingAllocator.
    allocation_id_++;
    return c10::alloc_cpu(bytes);
  }
  void* ptr = reinterpret_cast<uint8_t*>(blob_) +
      plan_->allocation_offsets[allocation_id_];
  allocation_ptr_to_id_[ptr] = allocation_id_;
  allocation_id_++;
  return ptr;
}

void CPUProfilingAllocator::free(void* const ptr) {
  auto it = allocation_ptr_to_id_.find(ptr);
  if (it == allocation_ptr_to_id_.end()) {
    // Either
    // 1. an allocation that was made outside the validation scope is being
    //    freed here, or
    // 2. an allocation that is not managed by the profiling allocator is
    //    being freed. Example of the second type:
    //      Tensor out;
    //      for (....) {
    //        {
    //          // CPUProfilingAllocator is active in this scope
    //          out = ...some op (this also frees the memory previously held
    //          by out)
    //        }
    //        out is used..
    //      }
    c10::free_cpu(ptr);
    return;
  }
  auto id = it->second;
  TORCH_CHECK(
      id < plan_->allocation_lifetimes.size(),
      "Freeing an allocation that is not according to the plan.");
  auto lifetime_id = plan_->allocation_lifetimes[id];
  TORCH_CHECK(
      lifetime_id == allocation_id_,
      "Lifetime of allocation does not match: allocation_id ",
      id,
      ", expected: ",
      lifetime_id,
      ", got: ",
      allocation_id_);
}

CPUProfilingAllocator::~CPUProfilingAllocator() {
  c10::free_cpu(blob_);
}
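
// Typical usage of the pieces above (an illustrative sketch only; the real
// call sites live outside this file, and run_model() stands in for whatever
// workload is being profiled):
//
//   AllocationPlan plan;
//   {
//     // 1. Record one run to build the plan.
//     WithProfileAllocationsGuard profile_guard(&plan);
//     run_model();
//   }
//   bool plan_is_valid = false;
//   {
//     // 2. Optionally re-run to check that the workload replays the same
//     //    allocation sequence.
//     WithValidateAllocationPlanGuard validate_guard(&plan, &plan_is_valid);
//     run_model();
//   }
//   CPUProfilingAllocator allocator;
//   {
//     // 3. Serve subsequent runs out of the preplanned arena.
//     WithProfilingAllocatorGuard allocator_guard(&allocator, &plan);
//     run_model();
//   }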
WithProfileAllocationsGuard::WithProfileAllocationsGuard(AllocationPlan* plan) {
  // Nesting of allocation profiling does not seem meaningful.
  TORCH_CHECK(
      allocation_planner == nullptr,
      "Nesting profiling allocations is not supported.");
  planner_ = std::make_unique<AllocationPlanner>(plan);
  planner_->clear();
  allocation_planner = planner_.get();
}

WithProfileAllocationsGuard::~WithProfileAllocationsGuard() {
  planner_->formulate_plan();
  allocation_planner = nullptr;
}

WithValidateAllocationPlanGuard::WithValidateAllocationPlanGuard(
    AllocationPlan* plan,
    bool* success) {
  // Nesting of allocation profiling does not seem meaningful.
  TORCH_CHECK(
      allocation_planner == nullptr,
      "Nesting profiling allocations is not supported.");
  planner_ = std::make_unique<AllocationPlanner>(plan, true);
  success_ = success;
  allocation_planner = planner_.get();
}

WithValidateAllocationPlanGuard::~WithValidateAllocationPlanGuard() {
  *success_ = planner_->validation_success;
  allocation_planner = nullptr;
}

AllocationPlanner* GetThreadLocalAllocationPlanner() {
  return allocation_planner;
}

WithProfilingAllocatorGuard::WithProfilingAllocatorGuard(
    CPUProfilingAllocator* allocator,
    const AllocationPlan* plan) {
  // Nesting of profiling allocator is not supported.
  TORCH_CHECK(
      profiling_allocator == nullptr,
      "Nesting profiling allocators is not supported.");
  profiling_allocator = allocator;
  profiling_allocator->set_plan(plan);
}

WithProfilingAllocatorGuard::~WithProfilingAllocatorGuard() {
  profiling_allocator->unset_plan();
  profiling_allocator = nullptr;
}

CPUProfilingAllocator* GetThreadLocalProfilingAllocator() {
  return profiling_allocator;
}

} // namespace c10