#include <c10/core/impl/alloc_cpu.h>
#include <c10/mobile/CPUProfilingAllocator.h>
#include <c10/util/Exception.h>
#include <c10/util/flat_hash_map.h>
#include <c10/util/irange.h>

#include <algorithm>
#include <cstdint>
#include <limits>
#include <map>
#include <memory>
#include <set>
#include <vector>

namespace c10 {

namespace {
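
// Active planner/allocator for the current thread. These pointers are
// installed and cleared by the RAII guards defined at the end of this file
// (WithProfileAllocationsGuard, WithValidateAllocationPlanGuard and
// WithProfilingAllocatorGuard), so profiling state does not leak across
// threads.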
thread_local AllocationPlanner* allocation_planner{nullptr};
thread_local CPUProfilingAllocator* profiling_allocator{nullptr};

struct MemBlock {
  uint64_t start_offset, end_offset;
  MemBlock(uint64_t s, uint64_t e) : start_offset(s), end_offset(e) {}
  bool operator<(const MemBlock& other) const {
    return start_offset < other.start_offset;
  }
};

enum class EventType { Allocate = 0, Free, Invalid };

struct MemEvent {
  uint64_t time;
  uint64_t allocation_id;
  uint64_t size;
  EventType type{EventType::Invalid};
  MemEvent(uint64_t t, uint64_t id, uint64_t s, EventType e)
      : time(t), allocation_id(id), size(s), type(e) {}
};

bool overlaps(const MemBlock& a, const MemBlock& b) {
  // Two blocks don't overlap if one ends before the other begins:
  // |---a--------|--------------b--------|
  // start_a    end_a <= start_b        end_b
  return !(
      (a.end_offset <= b.start_offset) || (b.end_offset <= a.start_offset));
}
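
// Replays the (time-sorted) allocation/free events against the offsets
// produced by the planner and verifies that no two simultaneously live,
// plan-managed blocks overlap. Allocations whose offset is the uint64_t max
// sentinel are not managed by the plan and are skipped.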
bool validate_allocation_plan(
    const std::vector<MemEvent>& alloc_events,
    const std::vector<uint64_t>& allocation_offsets) {
  std::set<MemBlock> allocations;
  for (const auto& event : alloc_events) {
    auto alloc_id = event.allocation_id;
    // Skip allocations not managed by AllocationPlan
    if (allocation_offsets[alloc_id] == std::numeric_limits<uint64_t>::max()) {
      continue;
    }
    auto start_offset = allocation_offsets[alloc_id];
    auto end_offset = allocation_offsets[alloc_id] + event.size;
    MemBlock mem_block(start_offset, end_offset);
    if (event.type == EventType::Allocate) {
      auto it = allocations.lower_bound(mem_block);
      if (it != allocations.end()) {
        auto next_block = *it;
        if (overlaps(next_block, mem_block)) {
          return false;
        }
      }
      if (it != allocations.begin()) {
        auto prev_block = *(--it);
        if (overlaps(prev_block, mem_block)) {
          return false;
        }
      }
      allocations.emplace(mem_block);
    } else if (event.type == EventType::Free) {
      auto it = allocations.find(mem_block);
      // Check that the block exists before dereferencing the iterator.
      TORCH_CHECK(
          it != allocations.end(),
          "ProfilingAllocator: Allocate event "
          "must have preceded deallocate event.");
      TORCH_CHECK(
          it->end_offset == end_offset,
          "End offset of allocation being freed must match the one recorded.");
      allocations.erase(it);
    } else {
      TORCH_CHECK(false, "ProfilingAllocator: Invalid event type.");
    }
  }
  return true;
}
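
// Expands the recorded (size, lifetime) pairs into a flat event stream.
// Allocation i produces an Allocate event at time i and a Free event at
// time allocation_lifetimes[i] (the value of the allocation counter when the
// free was recorded). Allocations that were never freed inside the profiled
// scope carry the uint64_t max sentinel and are skipped, i.e. they are not
// managed by the AllocationPlan.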
std::vector<MemEvent> create_and_sort_mem_events(
    const std::vector<uint64_t>& allocation_sizes,
    const std::vector<uint64_t>& allocation_lifetimes) {
  std::vector<MemEvent> events;
  for (uint64_t i = 0; i < allocation_sizes.size(); ++i) {
    // If an observed allocation is freed outside the scope of
    // observation, then it is not managed by the AllocationPlan.
    if (allocation_lifetimes[i] == std::numeric_limits<uint64_t>::max()) {
      continue;
    }
    events.emplace_back(i, i, allocation_sizes[i], EventType::Allocate);
    events.emplace_back(
        allocation_lifetimes[i], i, allocation_sizes[i], EventType::Free);
  }
  std::sort(
      events.begin(),
      events.end(),
      [](const MemEvent& a, const MemEvent& b) -> bool {
        return a.time < b.time;
      });
  return events;
}
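
// Best-fit greedy planner: walks the sorted event stream, carving allocations
// out of the smallest free chunk that fits (or growing the arena when none
// fits) and coalescing adjacent free chunks on every free. Returns one offset
// per recorded allocation; allocations that are not managed by the plan keep
// the uint64_t max sentinel.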
std::vector<uint64_t> formulate_greedy_allocation_plan(
    const std::vector<uint64_t>& allocation_sizes,
    const std::vector<uint64_t>& allocation_lifetimes) {
  // Step 1. Construct all allocation/free events.
  //         Sort these events by timestamp.
  // Step 2. Iterate through all events.
  //   2.1 If allocate event:
  //       Find all candidates in the free_size_to_offset map.
  //       Greedily pick the first one.
  //       Remove the entry from the free_size_to_offset map.
  //       new_offset = offset + request_size
  //       new_size = size - request_size
  //       Add the new entry to both maps.
  //   2.2 If free event:
  //       Check if the returned offset merges with another chunk.
  //       If so, merge until no more merging is possible.
  //       If the returned offset does not merge, then
  //       just return it as a chunk.

  // lower_bound on this map will get all candidates of
  // the right size for allocation.
  std::map<uint64_t, uint64_t> free_size_to_offset;
  // These provide fast lookup when we want to insert a freed block
  // back, especially when we want to merge blocks.
  ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator>
      free_start_offset_to_size_iter;
  ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator>
      free_end_offset_to_size_iter;
  // Upon free: end_ptr = offset + size.
  // If end_ptr exists, merge the freed allocation with that block.
  // Also find the corresponding entry in free_size_to_offset,
  // remove it and re-insert it with the new size and offset.
  // If end_ptr does not exist, then just insert offset,size
  // in the map and, correspondingly, size,offset in the other map.
  // Merging should always be done recursively until no more mergeable
  // chunks can be found.
  // After the last free we should have only one entry left in these maps.

  std::vector<uint64_t> allocation_offsets(
      allocation_sizes.size(), std::numeric_limits<uint64_t>::max());
  auto mem_events =
      create_and_sort_mem_events(allocation_sizes, allocation_lifetimes);
  uint64_t max_offset{0};
  for (const auto& mem_event : mem_events) {
    uint64_t alloc_offset = 0;
    uint64_t new_offset = 0, new_size = 0;
    if (mem_event.type == EventType::Allocate) {
      auto it = free_size_to_offset.lower_bound(mem_event.size);
      if (it == free_size_to_offset.end()) {
        // If there is no contiguous block of the size requested,
        // allocate a new one.
        alloc_offset = max_offset;
        max_offset += mem_event.size;
      } else {
        // If we have found a block of the size we want
        // 1. change the block by allocating out of it.
        //    1.1 Erase the entire block
        //    1.2 Erase the reverse map entries
        // 2. If the block still has space left, insert the remainder back in
        //    the map, including the reverse map entries.
        alloc_offset = it->second;
        new_offset = alloc_offset + mem_event.size;
        new_size = it->first - mem_event.size;
        // Erase the reverse map entries while `it` is still valid, then
        // erase `it` itself from free_size_to_offset.
        free_start_offset_to_size_iter.erase(alloc_offset);
        free_end_offset_to_size_iter.erase(alloc_offset + it->first);
        free_size_to_offset.erase(it);
        if (new_size > 0) {
          auto ref_it = free_size_to_offset.emplace(new_size, new_offset).first;
          free_start_offset_to_size_iter.emplace(new_offset, ref_it);
          free_end_offset_to_size_iter.emplace(new_offset + new_size, ref_it);
        }
      }
      allocation_offsets[mem_event.allocation_id] = alloc_offset;
    } else {
      // 1. Check if there is a free block that starts where the freed block
      //    ends (lookup in free_start_offset_to_size_iter). If so, remove it
      //    and grow the block being freed.
      // 2. Similarly, check if there is a free block that ends where the
      //    freed block starts (lookup in free_end_offset_to_size_iter). If
      //    so, remove it and grow the block being freed, adjusting its start
      //    offset.
      // 3. Insert the (possibly merged) freed block into the maps.
      auto freed_offset = allocation_offsets[mem_event.allocation_id];
      auto freed_size = mem_event.size;
      auto end_offset = freed_offset + freed_size;
      // Merge when another free block exists at the end of this block.
      auto end_it = free_start_offset_to_size_iter.find(end_offset);
      if (end_it != free_start_offset_to_size_iter.end()) {
        auto merge_block_iter = end_it->second;
        auto merge_block_size = merge_block_iter->first;
        freed_size += merge_block_size;
        free_size_to_offset.erase(merge_block_iter);
        free_start_offset_to_size_iter.erase(end_it);
        // If the block is being merged then also remove it from
        // free_end_offset_to_size_iter.
        free_end_offset_to_size_iter.erase(end_offset + merge_block_size);
      }
      // Merge when the freed block sits at the end of another free block.
      auto start_it = free_end_offset_to_size_iter.find(freed_offset);
      if (start_it != free_end_offset_to_size_iter.end()) {
        auto merge_block_iter = start_it->second;
        auto merge_block_size = merge_block_iter->first;
        freed_size += merge_block_size;
        freed_offset -= merge_block_size;
        free_size_to_offset.erase(merge_block_iter);
        free_end_offset_to_size_iter.erase(start_it);
        // If the block is being merged then also remove it from
        // free_start_offset_to_size_iter.
        free_start_offset_to_size_iter.erase(freed_offset);
      }
      auto freed_block_it =
          free_size_to_offset.emplace(freed_size, freed_offset).first;
      free_start_offset_to_size_iter.emplace(freed_offset, freed_block_it);
      free_end_offset_to_size_iter.emplace(
          freed_offset + freed_size, freed_block_it);
    }
  }
  TORCH_CHECK(
      validate_allocation_plan(mem_events, allocation_offsets),
      "ProfilingAllocator: Allocation plan invalid.");
  return allocation_offsets;
}

} // namespace

void AllocationPlan::clear() {
  allocation_sizes.clear();
  allocation_lifetimes.clear();
  allocation_offsets.clear();
}
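
// AllocationPlanner has two modes:
// - Recording (validation_mode_ == false): every allocation appends its size
//   to the plan and gets a sequential id; a free stamps the current
//   allocation counter into the corresponding lifetime slot.
// - Validation (validation_mode_ == true): allocations and frees are checked
//   against a previously formulated plan instead of being recorded, and any
//   mismatch clears validation_success.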
void AllocationPlanner::record_allocation(
    const uint64_t size,
    const void* ptr) {
  if (validation_mode_) {
    validation_success = validation_success && validate_allocation(size, ptr);
    return;
  }
  allocation_plan_->allocation_sizes.push_back(size);
  allocation_plan_->allocation_lifetimes.push_back(
      std::numeric_limits<uint64_t>::max());
  allocation_ptr_to_id_[ptr] = allocation_id_;
  allocation_id_++;
}

void AllocationPlanner::record_free(const void* ptr) {
  if (validation_mode_) {
    validation_success = validation_success && validate_free(ptr);
    return;
  }
  auto it = allocation_ptr_to_id_.find(ptr);
  if (it == allocation_ptr_to_id_.end()) {
    // The pointer being freed was allocated outside of
    // WithProfileAllocationsGuard.
    return;
  }
  auto id = it->second;
  TORCH_CHECK(
      id < allocation_plan_->allocation_lifetimes.size(),
      "Allocation must have been recorded during record_allocation.");
  allocation_plan_->allocation_lifetimes[id] = allocation_id_;
}
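
// Validation checks that a rerun makes exactly the same sequence of requests
// as the recorded run: the i-th allocation must request the recorded size,
// and each free must happen at the same point in the allocation sequence
// (i.e. with the same value of the allocation counter) as when it was
// recorded.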
bool AllocationPlanner::validate_allocation(
    const uint64_t size,
    const void* ptr) {
  if (allocation_id_ >= allocation_plan_->allocation_sizes.size()) {
    TORCH_WARN(
        "Allocation request does not match plan: ",
        "Allocation id: ",
        allocation_id_,
        " exceeds the number of recorded allocations: ",
        allocation_plan_->allocation_sizes.size());
    return false;
  }
  if (allocation_plan_->allocation_sizes[allocation_id_] != size) {
    TORCH_WARN(
        "Allocation request does not match plan: ",
        "Allocation id: ",
        allocation_id_,
        ", Number of recorded allocations: ",
        allocation_plan_->allocation_sizes.size(),
        ", Recorded size of the requested allocation: ",
        allocation_plan_->allocation_sizes[allocation_id_],
        ", but got: ",
        size);
    return false;
  }
  allocation_ptr_to_id_[ptr] = allocation_id_;
  allocation_id_++;
  return true;
}

bool AllocationPlanner::validate_free(const void* ptr) {
  auto it = allocation_ptr_to_id_.find(ptr);
  if (it == allocation_ptr_to_id_.end()) {
    // An allocation that was made outside the validation scope is being
    // freed here.
    return true;
  }
  auto id = it->second;
  TORCH_CHECK(
      id < allocation_plan_->allocation_lifetimes.size(),
      "Allocation must have been recorded during validate_allocation.");
  auto lifetime_id = allocation_plan_->allocation_lifetimes[id];
  return (lifetime_id == allocation_id_);
}
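
// Turns the recorded sizes/lifetimes into concrete offsets via the greedy
// planner above, and records the total arena size as the high-water mark
// over all plan-managed allocations.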
void AllocationPlanner::formulate_plan() {
  allocation_plan_->allocation_offsets = formulate_greedy_allocation_plan(
      allocation_plan_->allocation_sizes,
      allocation_plan_->allocation_lifetimes);
  allocation_plan_->total_size = 0;
  for (const auto i : c10::irange(allocation_plan_->allocation_sizes.size())) {
    if (allocation_plan_->allocation_lifetimes[i] ==
        std::numeric_limits<uint64_t>::max()) {
      continue;
    }
    auto limit = allocation_plan_->allocation_offsets[i] +
        allocation_plan_->allocation_sizes[i];
    allocation_plan_->total_size =
        std::max(allocation_plan_->total_size, limit);
  }
}

void AllocationPlanner::clear() {
  allocation_plan_->clear();
  allocation_ptr_to_id_.clear();
}
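
// Installs a plan and, if needed, grows the backing blob to the plan's
// total_size. The blob is kept (and only ever grown) across
// set_plan/unset_plan calls, so it can be reused by subsequent plans.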
void CPUProfilingAllocator::set_plan(const AllocationPlan* plan) {
  TORCH_CHECK(plan != nullptr, "Allocation plan is nullptr.");
  plan_ = plan;
  allocation_id_ = 0;
  allocation_ptr_to_id_.clear();
  if (current_size_ < plan->total_size) {
    // Free existing memory and reallocate for larger size.
    c10::free_cpu(blob_);
    blob_ = c10::alloc_cpu(plan->total_size);
    current_size_ = plan->total_size;
  }
}

void CPUProfilingAllocator::unset_plan() {
  allocation_id_ = 0;
  allocation_ptr_to_id_.clear();
  plan_ = nullptr;
}
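
// Requests that follow the plan are served out of the preallocated blob at
// the planned offset; allocations the plan does not manage (lifetime
// sentinel) fall back to c10::alloc_cpu/free_cpu. Requests must arrive in
// exactly the recorded order, which is enforced by the checks below.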
void* CPUProfilingAllocator::allocate(const size_t bytes) {
  TORCH_CHECK(
      bytes == plan_->allocation_sizes[allocation_id_],
      "Got an allocation request that does not match the plan.");
  if (plan_->allocation_lifetimes[allocation_id_] ==
      std::numeric_limits<uint64_t>::max()) {
    // This allocation is not managed by ProfilingAllocator.
    allocation_id_++;
    return c10::alloc_cpu(bytes);
  }
  void* ptr = reinterpret_cast<uint8_t*>(blob_) +
      plan_->allocation_offsets[allocation_id_];
  allocation_ptr_to_id_[ptr] = allocation_id_;
  allocation_id_++;
  return ptr;
}

void CPUProfilingAllocator::free(void* const ptr) {
  auto it = allocation_ptr_to_id_.find(ptr);
  if (it == allocation_ptr_to_id_.end()) {
    // Either
    // 1. an allocation that was made outside the validation scope is being
    //    freed here, or
    // 2. an allocation that is not managed by the profiling allocator is
    //    being freed. Example of the second type:
    //      Tensor out;
    //      for (....) {
    //        {
    //          // CPUProfilingAllocator is active in this scope
    //          out = ...some op (this also frees the memory previously held
    //          by out)
    //        }
    //        out is used..
    //      }
    c10::free_cpu(ptr);
    return;
  }
  auto id = it->second;
  TORCH_CHECK(
      id < plan_->allocation_lifetimes.size(),
      "Freeing an allocation that is not according to the plan.");
  auto lifetime_id = plan_->allocation_lifetimes[id];
  TORCH_CHECK(
      lifetime_id == allocation_id_,
      "Lifetime of allocation does not match: allocation_id ",
      id,
      ", expected: ",
      lifetime_id,
      ", got: ",
      allocation_id_);
}

CPUProfilingAllocator::~CPUProfilingAllocator() {
  c10::free_cpu(blob_);
}
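
// Typical usage of the pieces above (an illustrative sketch only; the real
// call sites live outside this file, and run_model() stands in for whatever
// workload is being profiled):
//
//   AllocationPlan plan;
//   {
//     // 1. Record one run to build the plan.
//     WithProfileAllocationsGuard profile_guard(&plan);
//     run_model();
//   }
//   bool plan_is_valid = false;
//   {
//     // 2. Optionally re-run to check that the workload replays the same
//     //    allocation sequence.
//     WithValidateAllocationPlanGuard validate_guard(&plan, &plan_is_valid);
//     run_model();
//   }
//   CPUProfilingAllocator allocator;
//   {
//     // 3. Serve subsequent runs out of the preplanned arena.
//     WithProfilingAllocatorGuard allocator_guard(&allocator, &plan);
//     run_model();
//   }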
WithProfileAllocationsGuard::WithProfileAllocationsGuard(AllocationPlan* plan) {
  // Nesting of allocation profiling does not seem meaningful.
  TORCH_CHECK(
      allocation_planner == nullptr,
      "Nesting profiling allocations is not supported.");
  planner_ = std::make_unique<AllocationPlanner>(plan);
  planner_->clear();
  allocation_planner = planner_.get();
}

WithProfileAllocationsGuard::~WithProfileAllocationsGuard() {
  planner_->formulate_plan();
  allocation_planner = nullptr;
}

WithValidateAllocationPlanGuard::WithValidateAllocationPlanGuard(
    AllocationPlan* plan,
    bool* success) {
  // Nesting of allocation profiling does not seem meaningful.
  TORCH_CHECK(
      allocation_planner == nullptr,
      "Nesting profiling allocations is not supported.");
  planner_ = std::make_unique<AllocationPlanner>(plan, true);
  success_ = success;
  allocation_planner = planner_.get();
}

WithValidateAllocationPlanGuard::~WithValidateAllocationPlanGuard() {
  *success_ = planner_->validation_success;
  allocation_planner = nullptr;
}

AllocationPlanner* GetThreadLocalAllocationPlanner() {
  return allocation_planner;
}

WithProfilingAllocatorGuard::WithProfilingAllocatorGuard(
    CPUProfilingAllocator* allocator,
    const AllocationPlan* plan) {
  // Nesting of profiling allocator is not supported.
  TORCH_CHECK(
      profiling_allocator == nullptr,
      "Nesting profiling allocators is not supported.");
  profiling_allocator = allocator;
  profiling_allocator->set_plan(plan);
}

WithProfilingAllocatorGuard::~WithProfilingAllocatorGuard() {
  profiling_allocator->unset_plan();
  profiling_allocator = nullptr;
}

CPUProfilingAllocator* GetThreadLocalProfilingAllocator() {
  return profiling_allocator;
}

} // namespace c10