diff --git a/build_variables.bzl b/build_variables.bzl
index b390e819440d..91ccf243c7d3 100644
--- a/build_variables.bzl
+++ b/build_variables.bzl
@@ -610,6 +610,8 @@ libtorch_nativert_sources = [
     "torch/nativert/detail/ITree.cpp",
     "torch/nativert/kernels/C10Kernel.cpp",
     "torch/nativert/kernels/AutoFunctionalizeKernel.cpp",
+    "torch/nativert/executor/memory/GreedyBySize.cpp",
+    "torch/nativert/executor/memory/Bump.cpp",
 ]
 
 torch_mobile_tracer_sources = [
diff --git a/test/cpp/nativert/CMakeLists.txt b/test/cpp/nativert/CMakeLists.txt
index 026dcc7b788f..9f2ad858dfd1 100644
--- a/test/cpp/nativert/CMakeLists.txt
+++ b/test/cpp/nativert/CMakeLists.txt
@@ -18,6 +18,8 @@ set(NATIVERT_TEST_SRCS
   ${TORCH_ROOT}/torch/nativert/detail/ITree.cpp
   ${TORCH_ROOT}/torch/nativert/executor/ExecutionFrame.cpp
   ${TORCH_ROOT}/torch/nativert/kernels/C10Kernel.cpp
+  ${TORCH_ROOT}/torch/nativert/executor/memory/GreedyBySize.cpp
+  ${TORCH_ROOT}/torch/nativert/executor/memory/Bump.cpp
 )
 
 add_executable(test_nativert
diff --git a/test/cpp/nativert/test_layout_planner_algorithm.cpp b/test/cpp/nativert/test_layout_planner_algorithm.cpp
new file mode 100644
index 000000000000..4fb1096b8fbd
--- /dev/null
+++ b/test/cpp/nativert/test_layout_planner_algorithm.cpp
@@ -0,0 +1,63 @@
+#include <gtest/gtest.h>
+#include <c10/util/Enumerate.h>
+
+#include <torch/nativert/executor/memory/Bump.h>
+#include <torch/nativert/executor/memory/GreedyBySize.h>
+
+using namespace ::testing;
+using namespace torch::nativert;
+
+std::vector<AllocationSpec> create_test_allocation_specs() {
+  std::vector<AllocationSpec> specs;
+
+  const std::vector<std::tuple<size_t, size_t, size_t>> test_cases = {
+      {0, 1, 32},
+      {1, 4, 28},
+      {2, 5, 36},
+      {3, 5, 16},
+      {4, 5, 8},
+      {5, 7, 64},
+      {6, 8, 10},
+      {7, 8, 40},
+  };
+
+  for (const auto& [l_start, l_end, size] : test_cases) {
+    specs.push_back(AllocationSpec{
+        .lifetime = AllocationLifetime(l_start, l_end), .size = size});
+  };
+
+  return specs;
+}
+
+// figure 6 -- https://arxiv.org/pdf/2001.03288
+TEST(LayoutPlannerAlgorithmTests, TestGreedyBySize) {
+  auto result = GreedyBySizeAllocationPlanner(create_test_allocation_specs());
+
+  EXPECT_EQ(result.total_size, 124);
+
+  auto& allocations = result.allocations;
+
+  EXPECT_EQ(allocations[0].offset, 0);
+  EXPECT_EQ(allocations[1].offset, 32);
+  EXPECT_EQ(allocations[2].offset, 64);
+  EXPECT_EQ(allocations[3].offset, 100);
+  EXPECT_EQ(allocations[4].offset, 116);
+  EXPECT_EQ(allocations[5].offset, 0);
+  EXPECT_EQ(allocations[6].offset, 104);
+  EXPECT_EQ(allocations[7].offset, 64);
+}
+
+TEST(LayoutPlannerAlgorithmTests, TestBump) {
+  auto specs = create_test_allocation_specs();
+  auto result = BumpAllocationPlanner(create_test_allocation_specs());
+
+  auto& allocations = result.allocations;
+
+  auto offset = 0;
+  for (auto&& [i, spec] : c10::enumerate(specs)) {
+    EXPECT_EQ(allocations[i].offset, offset);
+    offset += spec.size;
+  }
+
+  EXPECT_EQ(result.total_size, offset);
+}
diff --git a/torch/nativert/executor/memory/Bump.cpp b/torch/nativert/executor/memory/Bump.cpp
new file mode 100644
index 000000000000..34b84317ec3d
--- /dev/null
+++ b/torch/nativert/executor/memory/Bump.cpp
@@ -0,0 +1,24 @@
+#include <torch/nativert/executor/memory/Bump.h>
+
+namespace torch::nativert {
+
+LayoutPlan BumpAllocationPlanner(
+    const std::vector<AllocationSpec>& allocation_specs) {
+  LayoutPlan plan;
+
+  auto& allocations = plan.allocations;
+  auto& total_size = plan.total_size;
+
+  allocations.reserve(allocation_specs.size());
+  for (const auto& spec : allocation_specs) {
+    allocations.push_back(Allocation{
+        .size = spec.size,
+        .offset = total_size,
+    });
+    total_size += spec.size;
+  }
+
+  return plan;
+}
+
+} // namespace torch::nativert
diff --git a/torch/nativert/executor/memory/Bump.h b/torch/nativert/executor/memory/Bump.h
new file mode 100644
index 000000000000..d424e2bb6924
--- /dev/null
+++ b/torch/nativert/executor/memory/Bump.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include <torch/nativert/executor/memory/LayoutPlannerAlgorithm.h>
+
+namespace torch::nativert {
+
+// lay out all tensors contiguously in memory
+// this doesn't take into account lifetimes,
+// it literally just puts them all next to each other
+LayoutPlan BumpAllocationPlanner(
+    const std::vector<AllocationSpec>& allocation_specs);
+
+} // namespace torch::nativert
diff --git a/torch/nativert/executor/memory/GreedyBySize.cpp b/torch/nativert/executor/memory/GreedyBySize.cpp
new file mode 100644
index 000000000000..2d59a16e4f18
--- /dev/null
+++ b/torch/nativert/executor/memory/GreedyBySize.cpp
@@ -0,0 +1,166 @@
+#include <algorithm>
+#include <iomanip>
+#include <limits>
+
+#include <c10/util/Enumerate.h>
+#include <c10/util/Logging.h>
+#include <c10/util/irange.h>
+
+#include <torch/nativert/executor/memory/GreedyBySize.h>
+
+namespace {
+
+using namespace torch::nativert;
+
+// we need to track the original order in which allocations were made
+// since they will be re-sorted between iterations
+struct GreedyAllocation : public Allocation {
+  explicit GreedyAllocation(
+      Allocation allocation,
+      size_t allocation_idx,
+      size_t input_spec_idx)
+      : Allocation(allocation),
+        allocation_index(allocation_idx),
+        input_spec_index(input_spec_idx) {}
+  // we need to maintain the allocation ordering s.t., we can look up
+  // previous allocations directly from descending_allocation_specs_
+  // even after allocations has been re-sorted, which happens after
+  // each allocation is complete.
+  //
+  // i.e., this index represents the index of the spec that was used
+  // to create this allocation inside descending_allocation_specs_
+  // AFTER the sorting was completed.
+  size_t allocation_index{0};
+  // index of the spec associated with this allocation
+  // in the event that the specs get re-ordered
+  // in the process of creating allocations
+  // e.g.,
+  // allocation_specs[sX, sY, sZ]
+  //                   ^   ^   ^
+  //            values[vX, vY, vZ]
+  //
+  // means that an allocation created from sY
+  // will have an input_spec_index of 1
+  //
+  // this allows us to return to the original
+  // ordering before returning the allocations
+  size_t input_spec_index{0};
+};
+
+struct AllocationSpecWithIndex {
+  const AllocationSpec* spec;
+  size_t index;
+};
+
+// associate specs with their original (unsorted) index
+// and then sort them in descending order by byte size
+std::vector<AllocationSpecWithIndex> prepare_allocation_specs(
+    const std::vector<AllocationSpec>& allocation_specs) {
+  std::vector<AllocationSpecWithIndex> specs;
+  specs.reserve(allocation_specs.size());
+
+  for (const auto i : c10::irange(allocation_specs.size())) {
+    specs.push_back({&allocation_specs[i], i});
+  }
+
+  std::sort(specs.begin(), specs.end(), [](auto& lhs, auto& rhs) {
+    return lhs.spec->size > rhs.spec->size;
+  });
+
+  return specs;
+}
+
+} // namespace
+
+namespace torch::nativert {
+
+// https://arxiv.org/pdf/2001.03288
+LayoutPlan GreedyBySizeAllocationPlanner(
+    const std::vector<AllocationSpec>& allocation_specs) {
+  LayoutPlan plan;
+
+  auto descending_allocation_specs = prepare_allocation_specs(allocation_specs);
+
+  std::vector<GreedyAllocation> allocations;
+  allocations.reserve(allocation_specs.size());
+
+  auto get_next_offset = [&](const AllocationSpec& spec) -> size_t {
+    size_t prev_offset = 0;
+    std::optional<size_t> best_offset = std::nullopt;
+    size_t smallest_gap = std::numeric_limits<size_t>::max();
+
+    for (const auto& alloc : allocations) {
+      if (auto* allocated_spec =
+              descending_allocation_specs.at(alloc.allocation_index).spec;
+          allocated_spec->not_overlapping_with(spec)) {
+        continue;
+      }
+
+      if (alloc.offset > prev_offset) {
+        if (size_t gap = alloc.offset - prev_offset;
+            gap >= spec.size && gap < smallest_gap) {
+          smallest_gap = gap;
+          best_offset = prev_offset;
+        }
+      }
+
+      prev_offset = std::max(prev_offset, alloc.offset + alloc.size);
+    }
+
+    return best_offset.value_or(prev_offset);
+  };
+
+  size_t total_allocation_size = 0;
+  for (const auto&& [allocation_index, spec_with_original_index] :
+       c10::enumerate(descending_allocation_specs)) {
+    auto& spec = spec_with_original_index.spec;
+
+    auto new_allocation = GreedyAllocation(
+        Allocation{.size = spec->size, .offset = get_next_offset(*spec)},
+        allocation_index,
+        spec_with_original_index.index);
+
+    total_allocation_size += new_allocation.size;
+    plan.total_size =
+        std::max(plan.total_size, new_allocation.offset + new_allocation.size);
+
+    VLOG(1) << "allocation with interval " << spec->lifetime.start << "-->"
+            << spec->lifetime.end << " placed at offset "
+            << new_allocation.offset;
+
+    // insert new allocation while maintaining relative-offset ordering
+    // the algorithm is already quadratic because of get_next_offset
+    // so this is negligible
+
+    auto it = std::lower_bound(
+        allocations.begin(),
+        allocations.end(),
+        new_allocation,
+        [](auto& lhs, auto& rhs) { return lhs.offset < rhs.offset; });
+    allocations.insert(it, new_allocation);
+  }
+
+  // sort allocations so their ordering is consistent with the input specs
+  std::sort(allocations.begin(), allocations.end(), [](auto& lhs, auto& rhs) {
+    return lhs.input_spec_index < rhs.input_spec_index;
+  });
+
+  plan.allocations.reserve(allocations.size());
+  std::move(
+      allocations.begin(),
+      allocations.end(),
+      std::back_inserter(plan.allocations));
+
+  if (plan.total_size > 0) {
+    VLOG(1) << std::fixed << std::setprecision(2)
+            << "greedy-by-size bytes saved over strictly increasing: "
+            << (1.0 - ((float)plan.total_size / (float)total_allocation_size)) *
+            100
+            << "% (" << total_allocation_size << " - " << plan.total_size
+            << " = " << (total_allocation_size - plan.total_size) << " bytes)";
+  }
+
+  return plan;
+}
+
+} // namespace torch::nativert
diff --git a/torch/nativert/executor/memory/GreedyBySize.h b/torch/nativert/executor/memory/GreedyBySize.h
new file mode 100644
index 000000000000..0d5a61132cf9
--- /dev/null
+++ b/torch/nativert/executor/memory/GreedyBySize.h
@@ -0,0 +1,10 @@
+#pragma once
+
+#include <torch/nativert/executor/memory/LayoutPlannerAlgorithm.h>
+
+namespace torch::nativert {
+
+LayoutPlan GreedyBySizeAllocationPlanner(
+    const std::vector<AllocationSpec>& allocation_specs);
+
+} // namespace torch::nativert