Profiling allocator for mobile. (#43951)

Summary:
Pull Request resolved: https://github.com/pytorch/pytorch/pull/43951

AllocationPlan: Stores the sequence of allocations, their sizes,
                and the lifetime of each allocation. Along with this
                it also stores total_size, the size of a single memory
                blob required to satisfy all of the allocations.
                It also stores the offset into that blob, of size
                total_size, assigned to each allocation.
                Thus the allocation plan contains:
                - allocation sizes
                - allocation lifetimes
                - allocation offsets
                - total size
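                For illustration only (the numbers below are made up,
                not taken from a real run), a recorded plan might
                contain:

                    allocation_sizes     = {64, 32, 128, 48}
                    allocation_lifetimes = {2, 4, 4, 4}   // id of the first allocation after each one is freed
                    allocation_offsets   = {0, 64, 96, 0} // offsets into the single blob
                    total_size           = 224            // allocation 3 reuses the space freed by allocation 0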
AllocationPlanner: Takes a pointer to an AllocationPlan and fills
                   it in with the plan, i.e. the sizes, lifetimes,
                   offsets and total size.
                   This is done via WithProfileAllocationsGuard, which
                   takes an AllocationPlan*, constructs an
                   AllocationPlanner and sets the thread-local
                   allocation_planner to it.
                   MobileCPUAllocator profiles allocations via
                   allocation_planner.
                   Within WithValidateAllocationPlanGuard, allocations
                   are validated against the previously recorded plan.
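                   As a rough sketch of that flow (module and inputs
                   are placeholders, mirroring the usage comments in
                   CPUProfilingAllocator.h):

                       c10::AllocationPlan plan;
                       {
                         c10::WithProfileAllocationsGuard profile_guard(&plan);
                         module.forward(inputs); // allocations are recorded into plan
                       }
                       bool validation_success = false;
                       {
                         c10::WithValidateAllocationPlanGuard validation_guard(&plan, &validation_success);
                         module.forward(inputs); // allocations are checked against plan
                       }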
CPUProfilingAllocator:
The application owns the CPUProfilingAllocator.
Using WithProfilingAllocatorGuard, it passes both the CPUProfilingAllocator
and the AllocationPlan created earlier. The CPUProfilingAllocator then
manages allocations and frees according to the plan. Allocations that
are not managed by the CPUProfilingAllocator are routed through
c10::alloc_cpu and c10::free_cpu.
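Replaying the plan then looks roughly like this (a sketch, not taken
verbatim from the tests; module and inputs are placeholders):

    c10::CPUProfilingAllocator profiling_allocator;
    {
      c10::WithProfilingAllocatorGuard guard(&profiling_allocator, &plan);
      module.forward(inputs); // managed allocations come out of one pre-sized blob
    }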

Test Plan:
cpu_profiling_allocator_test on mobile.

Imported from OSS

Reviewed By: dreiss

Differential Revision: D23451019

fbshipit-source-id: 98bf1dbcfa8fcfb83d505ac01095e84a3f5b778d
Author: Kimish Patel
Date: 2020-10-06 09:07:22 -07:00
Committed by: Facebook GitHub Bot
Parent: b1373a74e0
Commit: a09e1098e7
5 changed files, 744 insertions(+), 1 deletion(-)


@@ -79,11 +79,13 @@ list(APPEND ATen_VULKAN_TEST_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/vulkan_test.cpp)
list(APPEND ATen_MOBILE_TEST_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpu_profiling_allocator_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/cpu_caching_allocator_test.cpp)
list(APPEND ATen_VEC256_TEST_SRCS
${CMAKE_CURRENT_SOURCE_DIR}/vec256_test.cpp
${CMAKE_CURRENT_SOURCE_DIR}/vec256_test_all_types.cpp
${CMAKE_CURRENT_SOURCE_DIR}/vec256_test_all_types.cpp
)
# Caffe2 specific tests


@@ -0,0 +1,167 @@
#include <gtest/gtest.h>
#include <c10/mobile/CPUProfilingAllocator.h>
#include <ATen/ATen.h>
at::Tensor run_with_control_flow(
at::Tensor input,
at::Tensor conv_weight,
at::Tensor linear_weight,
bool cond,
std::vector<void*>& pointers,
bool record = false,
bool validate = false) {
if (cond) {
input = input * 2;
}
void* input_ptr = input.data_ptr();
auto conv_out = at::conv2d(input, conv_weight);
void* conv_out_ptr = conv_out.data_ptr();
auto conv_out_flat = conv_out.view({conv_out.size(0), -1});
auto output = at::linear(conv_out_flat, linear_weight);
if (record) {
pointers.push_back(input_ptr);
pointers.push_back(conv_out_ptr);
}
if (validate) {
TORCH_CHECK(input_ptr == pointers[0]);
TORCH_CHECK(conv_out_ptr == pointers[1]);
}
return output;
}
TEST(CPUAllocationPlanTest, with_control_flow) {
at::Tensor a = at::rand({23, 16, 16, 16});
at::Tensor conv_weight = at::rand({16, 16, 3, 3});
// output shape
// 23, 16, 14, 14
// Flattened shape = 23, 3136
at::Tensor linear_weight = at::rand({32, 3136});
at::Tensor output;
std::vector<void*> pointers;
auto valid_allocation_plan = [&]() {
c10::AllocationPlan plan;
{
c10::WithProfileAllocationsGuard profile_guard(&plan);
output = run_with_control_flow(
a, conv_weight, linear_weight, true, pointers);
}
};
ASSERT_NO_THROW(valid_allocation_plan());
auto validate_allocation_plan =
[&](bool record_mode, bool validation_mode) -> bool {
c10::AllocationPlan plan;
{
c10::WithProfileAllocationsGuard profile_guard(&plan);
output =
run_with_control_flow(a, conv_weight, linear_weight, record_mode, pointers);
}
bool success{true};
for (uint64_t i = 0; i < 10; ++i) {
bool validation_success;
{
c10::WithValidateAllocationPlanGuard
validation_guard(&plan, &validation_success);
output = run_with_control_flow(
a, conv_weight, linear_weight, validation_mode, pointers);
}
success = success && validation_success;
}
return success;
};
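// The plan is recorded with cond == record_mode and then validated with
// cond == validation_mode; a mismatch changes the allocation pattern, so
// validation is expected to fail for the two mixed cases below.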
ASSERT_FALSE(validate_allocation_plan(false, true));
ASSERT_FALSE(validate_allocation_plan(true, false));
ASSERT_TRUE(validate_allocation_plan(true, true));
ASSERT_TRUE(validate_allocation_plan(false, false));
}
TEST(CPUAllocationPlanTest, with_profiling_alloc) {
at::Tensor a = at::rand({23, 16, 16, 16});
at::Tensor conv_weight = at::rand({16, 16, 3, 3});
// output shape
// 23, 16, 14, 14
// Flattened shape = 23, 3136
at::Tensor linear_weight = at::rand({32, 3136});
at::Tensor output;
std::vector<void*> pointers;
auto valid_allocation_plan = [&]() {
c10::AllocationPlan plan;
{
c10::WithProfileAllocationsGuard profile_guard(&plan);
output = run_with_control_flow(
a, conv_weight, linear_weight, false, pointers);
}
};
ASSERT_NO_THROW(valid_allocation_plan());
auto validate_allocation_plan =
[&](bool record_mode,
bool validation_mode,
bool validate_pointers) {
pointers.clear();
c10::AllocationPlan plan;
{
c10::WithProfileAllocationsGuard profile_guard(&plan);
output = run_with_control_flow(
a,
conv_weight,
linear_weight,
record_mode,
pointers,
false,
false);
}
c10::CPUProfilingAllocator profiling_allocator;
{
c10::WithProfilingAllocatorGuard
profiling_allocator_guard(&profiling_allocator, &plan);
output = run_with_control_flow(
a,
conv_weight,
linear_weight,
validation_mode,
pointers,
validate_pointers,
false);
}
for (uint64_t i = 0; i < 10; ++i) {
{
c10::WithProfilingAllocatorGuard
profiling_allocator_guard(&profiling_allocator, &plan);
output = run_with_control_flow(
a,
conv_weight,
linear_weight,
validation_mode,
pointers,
false,
validate_pointers);
}
}
};
// When the control flow conditions are the same between profiling and
// evaluation, the profiling allocator should not throw.
ASSERT_NO_THROW(validate_allocation_plan(true, true, false));
ASSERT_NO_THROW(validate_allocation_plan(false, false, false));
// Furthermore, the profiling allocator should return the same pointers
// for the intermediate tensors.
ASSERT_NO_THROW(validate_allocation_plan(true, true, true));
ASSERT_NO_THROW(validate_allocation_plan(false, false, true));
// When the control flow conditions differ between profiling and evaluation,
// the profiling allocator should throw.
ASSERT_THROW(validate_allocation_plan(true, false, false), c10::Error);
ASSERT_THROW(validate_allocation_plan(false, true, false), c10::Error);
}
int main(int argc, char* argv[]) {
// At the moment the profiling allocator is only exposed to the mobile CPU allocator.
#ifdef C10_MOBILE
::testing::InitGoogleTest(&argc, argv);
at::manual_seed(42);
return RUN_ALL_TESTS();
#endif /* C10_MOBILE */
}


@@ -1,6 +1,7 @@
#include <c10/core/CPUAllocator.h>
#include <c10/core/DeviceType.h>
#include <c10/mobile/CPUCachingAllocator.h>
#include <c10/mobile/CPUProfilingAllocator.h>
// TODO: rename flags to C10
C10_DEFINE_bool(
@@ -156,13 +157,20 @@ class DefaultMobileCPUAllocator final : public at::Allocator {
// TODO: enable with better TLS support on mobile
// profiledCPUMemoryReporter().Delete(pointer);
auto allocator_ptr = GetThreadLocalCachingAllocator();
auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator();
if (allocator_ptr != nullptr) {
allocator_ptr->free(pointer);
} else if (profiling_allocator_ptr != nullptr) {
profiling_allocator_ptr->free(pointer);
} else {
c10::free_cpu(pointer);
// This adds extra cost to the default free path even when the
// caching allocator is not enabled.
CPUCachingAllocator::record_free(pointer);
auto allocation_planner = GetThreadLocalAllocationPlanner();
if (allocation_planner != nullptr) {
allocation_planner->record_free(pointer);
}
}
}
@@ -179,10 +187,17 @@ class DefaultMobileCPUAllocator final : public at::Allocator {
auto alloc_size = PreGuardBytes + nbytes + PostGuardBytes;
void* data;
auto allocator_ptr = GetThreadLocalCachingAllocator();
auto profiling_allocator_ptr = GetThreadLocalProfilingAllocator();
if (allocator_ptr != nullptr) {
data = allocator_ptr->allocate(alloc_size);
} else if (profiling_allocator_ptr != nullptr) {
data = profiling_allocator_ptr->allocate(alloc_size);
} else {
data = c10::alloc_cpu(alloc_size);
auto allocation_planner = GetThreadLocalAllocationPlanner();
if (allocation_planner != nullptr) {
allocation_planner->record_allocation(alloc_size, data);
}
}
// profiledCPUMemoryReporter().New(data, alloc_size);
return {


@@ -0,0 +1,410 @@
#include <climits>
#include <c10/mobile/CPUProfilingAllocator.h>
namespace c10 {
namespace {
thread_local AllocationPlanner* allocation_planner{nullptr};
thread_local CPUProfilingAllocator* profiling_allocator{nullptr};
struct MemBlock {
uint64_t start_offset, end_offset;
MemBlock(uint64_t s, uint64_t e) : start_offset(s), end_offset(e) {}
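// Note: a < b only when a ends at or before b starts, so two blocks compare
// equivalent exactly when they overlap. Inserting an overlapping block into a
// std::set<MemBlock> therefore fails, which is what validate_allocation_plan
// below relies on.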
bool operator<(const MemBlock& other) const {
return end_offset <= other.start_offset;
}
};
bool validate_allocation_plan(
const std::vector<uint64_t>& allocation_sizes,
const std::vector<uint64_t>& allocation_offsets) {
std::set<MemBlock> allocations;
for (uint64_t i = 0; i < allocation_sizes.size(); ++i) {
// Skip allocations not managed by AllocationPlan
if (allocation_offsets[i] == std::numeric_limits<uint64_t>::max()) {
continue;
}
auto start_offset = allocation_offsets[i];
auto end_offset = allocation_offsets[i] + allocation_sizes[i];
if (!allocations.emplace(start_offset, end_offset).second) {
return false;
}
}
return true;
}
enum class EventType {
Allocate = 0,
Free,
Invalid
};
struct MemEvent {
uint64_t time;
uint64_t allocation_id;
uint64_t size;
EventType type{EventType::Invalid};
MemEvent(uint64_t t, uint64_t id, uint64_t s, EventType e) :
time(t), allocation_id(id), size(s), type(e) {}
};
std::vector<MemEvent> create_and_sort_mem_events(
const std::vector<uint64_t>& allocation_sizes,
const std::vector<uint64_t>& allocation_lifetimes) {
std::vector<MemEvent> events;
for (uint64_t i = 0; i < allocation_sizes.size(); ++i) {
// If an observed allocation is freed outside the scope of
// observation, it is not managed by the AllocationPlan.
if (allocation_lifetimes[i] == std::numeric_limits<uint64_t>::max()) {
continue;
}
events.emplace_back(i, i, allocation_sizes[i], EventType::Allocate);
events.emplace_back(allocation_lifetimes[i], i, allocation_sizes[i], EventType::Free);
}
std::sort(
events.begin(),
events.end(),
[](const MemEvent& a,
const MemEvent& b) -> bool {return a.time < b.time;});
return events;
}
std::vector<uint64_t> formulate_greedy_allocation_plan(
const std::vector<uint64_t>& allocation_sizes,
const std::vector<uint64_t>& allocation_lifetimes) {
// Step 1. Construct all allocation/free events.
// Sort these events by timestamp.
// Step 2. Iterate through all events.
// 2.1 If allocate event:
// Find all candidates in the free_size_to_offset map.
// Greedily pick the first one.
// Remove the entry from free_size_to_offset map.
// new_offset = offset + request_size
// new_size = size - request_size
// Add new entry to both maps
// 2.2 If free event.
// Check if the returned offset merges with another chunk.
// If so merge until no more merging is possible.
// If returned offset does not merge, then
// just return it as a chunk.
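// Illustrative walk-through of the two steps above (sizes are made up,
// not taken from a real trace):
//   alloc A(64)  -> no free chunk, placed at offset 0,  max_offset = 64
//   alloc B(32)  -> no free chunk, placed at offset 64, max_offset = 96
//   free  A      -> free chunk [0, 64)
//   alloc C(128) -> 64-byte chunk too small, placed at offset 96, max_offset = 224
//   alloc D(48)  -> reuses chunk [0, 64) at offset 0, remainder [48, 64) stays free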
// lower_bound on this map will get all candidates of
// the right size for allocation.
std::map<uint64_t, uint64_t> free_size_to_offset;
// This provides fast lookup when we want to insert freed block
// back, especially when we want to merge blocks.
ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator> free_start_offset_to_size_iter;
ska::flat_hash_map<uint64_t, std::map<uint64_t, uint64_t>::iterator> free_end_offset_to_size_iter;
// Upon free, end_ptr = offset + size.
// If end_ptr exists, merge the freed allocation: also find the
// corresponding entry in free_size_to_offset, remove it and re-insert
// it with the new size and offset.
// If end_ptr does not exist, then just insert offset, size into the
// map and, correspondingly, size, offset into the other map.
// Merging should always be done recursively until no more mergeable
// chunks can be found.
// After the last free we should have only one entry left in these maps.
ska::flat_hash_map<uint64_t, uint64_t> allocated_offset_to_size;
std::vector<uint64_t> allocation_offsets(
allocation_sizes.size(), std::numeric_limits<uint64_t>::max());
auto mem_events = create_and_sort_mem_events(allocation_sizes, allocation_lifetimes);
uint64_t max_offset{0};
for (const auto& mem_event : mem_events) {
uint64_t alloc_offset;
uint64_t new_offset, new_size;
if (mem_event.type == EventType::Allocate) {
auto it = free_size_to_offset.lower_bound(mem_event.size);
if (it == free_size_to_offset.end()) {
// If there is no contiguous free block of the requested size,
// allocate a new one.
alloc_offset = max_offset;
max_offset += mem_event.size;
allocated_offset_to_size.emplace(alloc_offset, mem_event.size);
} else {
// If we have found a block of the size we want
// 1. change the block by allocating out of it.
// 1.1 Erase the entire block
// 1.2 Erase the reverse map entries
// 2. If block still has space left insert the remainder back in map.
// Including reverse map entries.
// 3. Insert the allocated block in allocated_offset_to_size.
alloc_offset = it->second;
new_offset = alloc_offset + mem_event.size;
new_size = it->first - mem_event.size;
free_size_to_offset.erase(it);
free_start_offset_to_size_iter.erase(alloc_offset);
free_end_offset_to_size_iter.erase(alloc_offset + it->first);
if (new_size > 0) {
auto ref_it = free_size_to_offset.emplace(new_offset, new_size).first;
free_start_offset_to_size_iter.emplace(new_offset, ref_it);
free_end_offset_to_size_iter.emplace(new_offset + new_size, ref_it);
}
allocated_offset_to_size.emplace(alloc_offset, mem_event.size);
}
allocation_offsets[mem_event.allocation_id] = alloc_offset;
} else {
// 1. Check if the freed block is adjacent to an existing free block
// at its end boundary. This is done by checking
// free_end_offset_to_size_iter.
// If we find such a block, remove it and adjust size of
// the block being freed.
// 2. Similarly check if freed block is adjacent to an existing
// free block at start boundary. This is done by checking
// free_start_offset_to_size_iter.
// If we find such a block, remove it and adjust size of
// the block being freed.
// 3. Insert the freed block in the map.
auto freed_offset = allocation_offsets[mem_event.allocation_id];
auto freed_size = mem_event.size;
auto end_offset = freed_offset + freed_size;
// Merge when another free block exists at the end of this block.
auto end_it = free_end_offset_to_size_iter.find(end_offset);
if (end_it != free_end_offset_to_size_iter.end()) {
auto size_to_end_offset_iter = end_it->second;
freed_size += size_to_end_offset_iter->first;
free_size_to_offset.erase(size_to_end_offset_iter);
free_end_offset_to_size_iter.erase(end_it);
}
// Merge when the freed block sits at the end of another free block.
auto start_it = free_start_offset_to_size_iter.find(freed_offset);
if (start_it != free_start_offset_to_size_iter.end()) {
auto size_to_start_offset_iter = start_it->second;
freed_size += size_to_start_offset_iter->first;
freed_offset -= size_to_start_offset_iter->first;
free_size_to_offset.erase(size_to_start_offset_iter);
free_start_offset_to_size_iter.erase(start_it);
}
allocated_offset_to_size.erase(freed_offset);
auto freed_block_it =
free_size_to_offset.emplace(freed_size, freed_offset).first;
free_start_offset_to_size_iter.emplace(freed_offset, freed_block_it);
free_end_offset_to_size_iter.emplace(
freed_offset + freed_size, freed_block_it);
}
}
TORCH_CHECK(validate_allocation_plan(allocation_sizes, allocation_offsets),
"Allocation plan invalid.");
return allocation_offsets;
}
} // namespace
void AllocationPlan::clear() {
allocation_sizes.clear();
allocation_lifetimes.clear();
allocation_offsets.clear();
}
void AllocationPlanner::record_allocation(
const uint64_t size, const void* ptr) {
if (validation_mode_) {
validation_success = validation_success && validate_allocation(size, ptr);
return;
}
allocation_plan_->allocation_sizes.push_back(size);
allocation_plan_->allocation_lifetimes.push_back(
std::numeric_limits<uint64_t>::max());
allocation_ptr_to_id_.emplace(ptr, allocation_id_);
allocation_id_++;
}
void AllocationPlanner::record_free(const void* ptr) {
if (validation_mode_) {
validation_success = validation_success && validate_free(ptr);
return;
}
auto it = allocation_ptr_to_id_.find(ptr);
if (it == allocation_ptr_to_id_.end()) {
// The free being recorded is for memory allocated outside of WithProfileAllocationsGuard.
return;
}
auto id = it->second;
TORCH_CHECK(id < allocation_plan_->allocation_lifetimes.size(),
"Allocation must have been recorded during record_allocation.");
allocation_plan_->allocation_lifetimes[id] = allocation_id_;
}
bool AllocationPlanner::validate_allocation(
const uint64_t size, const void* ptr) {
if (allocation_id_ >= allocation_plan_->allocation_sizes.size() ||
allocation_plan_->allocation_sizes[allocation_id_] != size) {
TORCH_WARN(
"Allocation request does not match plan:",
"Allocation id:",
allocation_id_,
", Number of recorded allocations:",
allocation_plan_->allocation_sizes.size(),
", Recorded size of the requested allocation:",
allocation_plan_->allocation_sizes[allocation_id_],
", but got:",
size);
return false;
}
allocation_ptr_to_id_.emplace(ptr, allocation_id_);
allocation_id_++;
return true;
}
bool AllocationPlanner::validate_free(const void* ptr) {
auto it = allocation_ptr_to_id_.find(ptr);
if (it == allocation_ptr_to_id_.end()) {
// Allocation that was made outside the validation scope is being freed here
return true;
}
auto id = (*it).second;
TORCH_CHECK(id < allocation_plan_->allocation_lifetimes.size(),
"Allocation must have been recorded during validate_allocation.");
auto lifetime_id = allocation_plan_->allocation_lifetimes[id];
return (lifetime_id == allocation_id_);
}
void AllocationPlanner::formulate_plan() {
allocation_plan_->allocation_offsets =
formulate_greedy_allocation_plan(
allocation_plan_->allocation_sizes, allocation_plan_->allocation_lifetimes);
allocation_plan_->total_size = 0;
for (auto i = 0; i < allocation_plan_->allocation_sizes.size(); ++i) {
if (allocation_plan_->allocation_lifetimes[i] ==
std::numeric_limits<uint64_t>::max()) {
continue;
}
auto limit = allocation_plan_->allocation_offsets[i] + allocation_plan_->allocation_sizes[i];
allocation_plan_->total_size = std::max(allocation_plan_->total_size, limit);
}
}
void AllocationPlanner::clear() {
allocation_plan_->clear();
allocation_ptr_to_id_.clear();
}
void CPUProfilingAllocator::set_plan(const AllocationPlan* plan) {
TORCH_CHECK(plan != nullptr, "Allocation plan is nullptr.");
plan_ = plan;
allocation_id_ = 0;
allocation_ptr_to_id_.clear();
if (current_size_ < plan->total_size) {
// Free existing memory and reallocate for larger size.
c10::free_cpu(blob_);
blob_ = c10::alloc_cpu(plan->total_size);
current_size_ = plan->total_size;
}
}
void CPUProfilingAllocator::unset_plan() {
allocation_id_ = 0;
allocation_ptr_to_id_.clear();
plan_ = nullptr;
}
void* CPUProfilingAllocator::allocate(const size_t bytes) {
TORCH_CHECK(bytes == plan_->allocation_sizes[allocation_id_],
"Got an allocation request that does not match the plan.");
if (plan_->allocation_lifetimes[allocation_id_] ==
std::numeric_limits<uint64_t>::max()) {
// This allocation is not managed by ProfilingAllocator.
allocation_id_++;
return c10::alloc_cpu(bytes);
}
void* ptr =
reinterpret_cast<uint8_t*>(blob_) +
plan_->allocation_offsets[allocation_id_];
TORCH_CHECK(allocation_ptr_to_id_.emplace(ptr, allocation_id_).second);
allocation_id_++;
return ptr;
}
void CPUProfilingAllocator::free(void* const ptr) {
auto it = allocation_ptr_to_id_.find(ptr);
if (it == allocation_ptr_to_id_.end()) {
// Either
// 1. memory allocated outside the scope of this profiling allocator is being freed here,
// or
// 2. memory that is not managed by the profiling allocator is being freed.
// Example of the second type:
// Tensor out;
// for (....) {
// {
// WithProfilingAllocatorGuard guard(...);
// out = ...some op (this also frees the previous memory held by out)
// }
// out is used..
// }
c10::free_cpu(ptr);
return;
}
auto id = it->second;
TORCH_CHECK(id < plan_->allocation_lifetimes.size(),
"Freeing an allocation that is not according to the plan.");
auto lifetime_id = plan_->allocation_lifetimes[id];
TORCH_CHECK(
lifetime_id == allocation_id_,
"Allocation lifetimes do not match: allocation_id ",
id,
", expected:",
lifetime_id,
", got:",
allocation_id_);
}
CPUProfilingAllocator::~CPUProfilingAllocator() {
c10::free_cpu(blob_);
}
WithProfileAllocationsGuard::WithProfileAllocationsGuard(
AllocationPlan* plan) {
// Nesting of allocation profiling does not seem meaningful.
TORCH_CHECK(allocation_planner == nullptr,
"Nesting profiling allocations is not supported.");
planner_ = std::make_unique<AllocationPlanner>(plan);
planner_->clear();
allocation_planner = planner_.get();
}
WithProfileAllocationsGuard::~WithProfileAllocationsGuard() {
planner_->formulate_plan();
allocation_planner = nullptr;
}
WithValidateAllocationPlanGuard::WithValidateAllocationPlanGuard(
AllocationPlan* plan, bool* success) {
// Nesting of allocation profiling does not seem meaningful.
TORCH_CHECK(allocation_planner == nullptr,
"Nesting profiling allocations is not supported.");
planner_ = std::make_unique<AllocationPlanner>(plan, true);
success_ = success;
allocation_planner = planner_.get();
}
WithValidateAllocationPlanGuard::~WithValidateAllocationPlanGuard() {
*success_ = planner_->validation_success;
allocation_planner = nullptr;
}
AllocationPlanner* GetThreadLocalAllocationPlanner() {
return allocation_planner;
}
WithProfilingAllocatorGuard::WithProfilingAllocatorGuard(
CPUProfilingAllocator* allocator, const AllocationPlan* plan) {
// Nesting of profiling allocator is not supported.
TORCH_CHECK(profiling_allocator == nullptr,
"Nesting profiling allocators is not supported.");
profiling_allocator = allocator;
profiling_allocator->set_plan(plan);
}
WithProfilingAllocatorGuard::~WithProfilingAllocatorGuard() {
profiling_allocator->unset_plan();
profiling_allocator = nullptr;
}
CPUProfilingAllocator* GetThreadLocalProfilingAllocator() {
return profiling_allocator;
}
} // namespace c10


@@ -0,0 +1,149 @@
#pragma once
#include <algorithm>
#include <deque>
#include <memory>
#include <mutex>
#include <c10/core/CPUAllocator.h>
#include <c10/util/Exception.h>
#include <c10/util/SmallVector.h>
#include <c10/util/flat_hash_map.h>
namespace c10 {
/*
* Given a sequence of allocations in a thread, AllocationPlan records:
* 1. the size of each allocation
* 2. the lifetime of each allocation
* 3. allocation offsets: the memory offset of each allocation within a single blob of memory
* 4. the total size of the memory blob required to satisfy all the allocations
*/
class C10_API AllocationPlan {
private:
// Records size of each allocation by their sequential allocation ids.
std::vector<uint64_t> allocation_sizes;
// This maps one allocation id (X) to another allocation id (Y).
// Allocation X is alive until allocation Y. From allocation Y onwards
// allocation X is not referenced.
// Thus Y is the id of the first allocation after X is freed.
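// For example, a lifetime of 5 means the allocation was freed after
// allocation id 4 was recorded and before allocation id 5 (if any) was
// recorded.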
// NB: When an allocation is recorded, along with recording its size,
// we also set the lifetime to be numeric_limits::max()
// This is to track allocations that are made during the scope of
// profiling but were not freed until after the scope ended.
// Such allocations are not managed by profiling allocator.
std::vector<uint64_t> allocation_lifetimes;
// Maps an allocation to some offset in a blob of memory.
std::vector<uint64_t> allocation_offsets;
uint64_t total_size{0};
void clear();
friend class AllocationPlanner;
friend class CPUProfilingAllocator;
};
/*
* AllocationPlanner records allocation sizes and lifetimes while profiling
* and then formulates the plan (offsets and total size). The map of memory
* ptr to allocation id that it keeps is auxiliary information, used only to
* establish the lifetime of allocations.
*/
class C10_API AllocationPlanner {
private:
AllocationPlan* allocation_plan_{nullptr};
// Maps allocated ptr to its allocation id.
// This is used when freeing the memory to lookup the allocation id
// in order to establish the lifetime of a particular allocation.
ska::flat_hash_map<const void*, uint64_t> allocation_ptr_to_id_;
uint64_t allocation_id_{0};
bool validation_mode_{false};
bool validate_allocation(const uint64_t size, const void* ptr);
bool validate_free(const void* ptr);
public:
bool validation_success{true};
AllocationPlanner() = delete;
AllocationPlanner(AllocationPlan* plan, bool validate = false) :
allocation_plan_(plan), validation_mode_(validate) {}
void record_allocation(const uint64_t size, const void* ptr);
void record_free(const void* ptr);
void formulate_plan();
void clear();
};
// NOT THREAD SAFE profiling allocator.
class C10_API CPUProfilingAllocator {
private:
const AllocationPlan* plan_{nullptr};
uint64_t allocation_id_{0};
uint64_t current_size_{0};
void* blob_{nullptr};
ska::flat_hash_map<const void*, uint64_t> allocation_ptr_to_id_;
public:
~CPUProfilingAllocator();
void set_plan(const AllocationPlan* plan);
void unset_plan();
void* allocate(const size_t bytes);
void free(void* const ptr);
};
/*
* Usage: Profile allocations made by one run of the model.
* AllocationPlan plan;
* {
* WithProfileAllocationsGuard profile_guard(&plan);
* module.forward(...);
* }
* plan now contains the allocation plan.
*/
class C10_API WithProfileAllocationsGuard {
public:
WithProfileAllocationsGuard(AllocationPlan* plan);
~WithProfileAllocationsGuard();
private:
std::unique_ptr<AllocationPlanner> planner_;
};
/*
* Usage: Validate the allocation plan made with WithProfileAllocationsGuard.
* bool plan_validation_success = false;
* bool success = true;
* for (some number of representative inputs) {
*   {
*     WithValidateAllocationPlanGuard validation_guard(&plan, &plan_validation_success);
*     module.forward(...);
*   }
*   success = success && plan_validation_success;
* }
* success == true means allocations conform to the plan;
* otherwise the allocation pattern changed for some inputs.
*/
class C10_API WithValidateAllocationPlanGuard {
public:
WithValidateAllocationPlanGuard(AllocationPlan* plan, bool* success);
~WithValidateAllocationPlanGuard();
private:
std::unique_ptr<AllocationPlanner> planner_;
bool* success_;
};
AllocationPlanner* GetThreadLocalAllocationPlanner();
/*
* Usage: Allocate tensors according to the allocation plan.
* First make allocation plan.
* See WithProfileAllocationsGuard usage.
* Second validate allocation plan.
* See WithValidateAllocationPlanGuard usage.
* CPUProfilingAllocator profiling_allocator;
* {
* WithProfilingAllocatorGuard allocator_guard(&profiling_allocator, &plan);
* module.forward(...);
* }
*/
class C10_API WithProfilingAllocatorGuard {
public:
WithProfilingAllocatorGuard(
CPUProfilingAllocator* allocator, const AllocationPlan* plan);
~WithProfilingAllocatorGuard();
};
CPUProfilingAllocator* GetThreadLocalProfilingAllocator();
} // namespace c10