mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
[nativert] move layout planner algorithms to libtorch (#156508)
Summary: tt Test Plan: ci Rollback Plan: Differential Revision: D76832891 Pull Request resolved: https://github.com/pytorch/pytorch/pull/156508 Approved by: https://github.com/zhxchen17
This commit is contained in:
@ -610,6 +610,8 @@ libtorch_nativert_sources = [
|
|||||||
"torch/nativert/detail/ITree.cpp",
|
"torch/nativert/detail/ITree.cpp",
|
||||||
"torch/nativert/kernels/C10Kernel.cpp",
|
"torch/nativert/kernels/C10Kernel.cpp",
|
||||||
"torch/nativert/kernels/AutoFunctionalizeKernel.cpp",
|
"torch/nativert/kernels/AutoFunctionalizeKernel.cpp",
|
||||||
|
"torch/nativert/executor/memory/GreedyBySize.cpp",
|
||||||
|
"torch/nativert/executor/memory/Bump.cpp",
|
||||||
]
|
]
|
||||||
|
|
||||||
torch_mobile_tracer_sources = [
|
torch_mobile_tracer_sources = [
|
||||||
|
@ -18,6 +18,8 @@ set(NATIVERT_TEST_SRCS
|
|||||||
${TORCH_ROOT}/torch/nativert/detail/ITree.cpp
|
${TORCH_ROOT}/torch/nativert/detail/ITree.cpp
|
||||||
${TORCH_ROOT}/torch/nativert/executor/ExecutionFrame.cpp
|
${TORCH_ROOT}/torch/nativert/executor/ExecutionFrame.cpp
|
||||||
${TORCH_ROOT}/torch/nativert/kernels/C10Kernel.cpp
|
${TORCH_ROOT}/torch/nativert/kernels/C10Kernel.cpp
|
||||||
|
${TORCH_ROOT}/torch/nativert/executor/memory/GreedyBySize.cpp
|
||||||
|
${TORCH_ROOT}/torch/nativert/executor/memory/Bump.cpp
|
||||||
)
|
)
|
||||||
|
|
||||||
add_executable(test_nativert
|
add_executable(test_nativert
|
||||||
|
63
test/cpp/nativert/test_layout_planner_algorithm.cpp
Normal file
63
test/cpp/nativert/test_layout_planner_algorithm.cpp
Normal file
@ -0,0 +1,63 @@
|
|||||||
|
#include <c10/util/Enumerate.h>
|
||||||
|
#include <gtest/gtest.h>
|
||||||
|
|
||||||
|
#include <torch/nativert/executor/memory/Bump.h>
|
||||||
|
#include <torch/nativert/executor/memory/GreedyBySize.h>
|
||||||
|
|
||||||
|
using namespace ::testing;
|
||||||
|
using namespace torch::nativert;
|
||||||
|
|
||||||
|
std::vector<AllocationSpec> create_test_allocation_specs() {
|
||||||
|
std::vector<AllocationSpec> specs;
|
||||||
|
|
||||||
|
const std::vector<std::tuple<size_t, size_t, size_t>> test_cases = {
|
||||||
|
{0, 1, 32},
|
||||||
|
{1, 4, 28},
|
||||||
|
{2, 5, 36},
|
||||||
|
{3, 5, 16},
|
||||||
|
{4, 5, 8},
|
||||||
|
{5, 7, 64},
|
||||||
|
{6, 8, 10},
|
||||||
|
{7, 8, 40},
|
||||||
|
};
|
||||||
|
|
||||||
|
for (const auto& [l_start, l_end, size] : test_cases) {
|
||||||
|
specs.push_back(AllocationSpec{
|
||||||
|
.lifetime = AllocationLifetime(l_start, l_end), .size = size});
|
||||||
|
};
|
||||||
|
|
||||||
|
return specs;
|
||||||
|
}
|
||||||
|
|
||||||
|
// figure 6 -- https://arxiv.org/pdf/2001.03288
|
||||||
|
TEST(LayoutPlannerAlgorithmTests, TestGreedyBySize) {
|
||||||
|
auto result = GreedyBySizeAllocationPlanner(create_test_allocation_specs());
|
||||||
|
|
||||||
|
EXPECT_EQ(result.total_size, 124);
|
||||||
|
|
||||||
|
auto& allocations = result.allocations;
|
||||||
|
|
||||||
|
EXPECT_EQ(allocations[0].offset, 0);
|
||||||
|
EXPECT_EQ(allocations[1].offset, 32);
|
||||||
|
EXPECT_EQ(allocations[2].offset, 64);
|
||||||
|
EXPECT_EQ(allocations[3].offset, 100);
|
||||||
|
EXPECT_EQ(allocations[4].offset, 116);
|
||||||
|
EXPECT_EQ(allocations[5].offset, 0);
|
||||||
|
EXPECT_EQ(allocations[6].offset, 104);
|
||||||
|
EXPECT_EQ(allocations[7].offset, 64);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Checks the bump planner: allocation i must start exactly where allocation
// i-1 ended, and the plan's total size must equal the sum of all spec sizes.
TEST(LayoutPlannerAlgorithmTests, TestBump) {
  auto specs = create_test_allocation_specs();
  // plan the same specs we iterate over below (no second fixture build)
  const auto result = BumpAllocationPlanner(specs);

  const auto& allocations = result.allocations;

  // bail out instead of indexing out of bounds if the planner returned the
  // wrong number of allocations
  ASSERT_EQ(allocations.size(), specs.size());

  // size_t so the running sum matches the type of the sizes it accumulates
  size_t offset = 0;
  for (auto&& [i, spec] : c10::enumerate(specs)) {
    EXPECT_EQ(allocations[i].offset, offset);
    offset += spec.size;
  }

  EXPECT_EQ(result.total_size, offset);
}
|
24
torch/nativert/executor/memory/Bump.cpp
Normal file
24
torch/nativert/executor/memory/Bump.cpp
Normal file
@ -0,0 +1,24 @@
|
|||||||
|
#include <torch/nativert/executor/memory/Bump.h>
|
||||||
|
|
||||||
|
namespace torch::nativert {
|
||||||
|
|
||||||
|
// Places each spec immediately after the previous one, in input order.
// Lifetimes are not consulted, so the resulting plan never reuses memory.
LayoutPlan BumpAllocationPlanner(
    const std::vector<AllocationSpec>& allocation_specs) {
  LayoutPlan plan;
  plan.allocations.reserve(allocation_specs.size());

  for (const auto& spec : allocation_specs) {
    // the running total doubles as the offset of the next allocation
    plan.allocations.push_back(Allocation{
        .size = spec.size,
        .offset = plan.total_size,
    });
    plan.total_size += spec.size;
  }

  return plan;
}
|
||||||
|
|
||||||
|
} // namespace torch::nativert
|
13
torch/nativert/executor/memory/Bump.h
Normal file
13
torch/nativert/executor/memory/Bump.h
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <torch/nativert/executor/memory/LayoutPlannerAlgorithm.h>
|
||||||
|
|
||||||
|
namespace torch::nativert {
|
||||||
|
|
||||||
|
// Lays out every allocation back-to-back in one buffer, in input order.
|
||||||
|
// Lifetimes are not taken into account, so memory is never reused:
|
||||||
|
// each allocation simply starts where the previous one ended.
//
// @param allocation_specs  specs to place; only their sizes are used.
// @return a plan with one allocation per spec (same order as the input)
//         whose total size has grown by the sum of all spec sizes.
|
||||||
|
LayoutPlan BumpAllocationPlanner(
|
||||||
|
const std::vector<AllocationSpec>& allocation_specs);
|
||||||
|
|
||||||
|
} // namespace torch::nativert
|
166
torch/nativert/executor/memory/GreedyBySize.cpp
Normal file
166
torch/nativert/executor/memory/GreedyBySize.cpp
Normal file
@ -0,0 +1,166 @@
|
|||||||
|
#include <iomanip>
|
||||||
|
#include <limits>
|
||||||
|
#include <optional>
|
||||||
|
|
||||||
|
#include <c10/util/Enumerate.h>
|
||||||
|
#include <c10/util/Logging.h>
|
||||||
|
#include <c10/util/irange.h>
|
||||||
|
|
||||||
|
#include <torch/nativert/executor/memory/GreedyBySize.h>
|
||||||
|
|
||||||
|
namespace {
|
||||||
|
|
||||||
|
using namespace torch::nativert;
|
||||||
|
|
||||||
|
// we need to track the original order in which allocations were made
|
||||||
|
// since they will be re-sorted between iterations
|
||||||
|
struct GreedyAllocation : public Allocation {
|
||||||
|
explicit GreedyAllocation(
|
||||||
|
Allocation allocation,
|
||||||
|
size_t allocation_idx,
|
||||||
|
size_t input_spec_idx)
|
||||||
|
: Allocation(allocation),
|
||||||
|
allocation_index(allocation_idx),
|
||||||
|
input_spec_index(input_spec_idx) {}
|
||||||
|
// we need to maintain the allocation ordering s.t., we can look up
|
||||||
|
// previous allocations directly from descending_allocation_specs_
|
||||||
|
// even after allocations has been re-sorted, which happens after
|
||||||
|
// each allocation is complete.
|
||||||
|
//
|
||||||
|
// i.e., this index represents the index of the spec that was used
|
||||||
|
// to create this allocation inside descending_allocation_specs_
|
||||||
|
// AFTER the sorting was completed.
|
||||||
|
size_t allocation_index{0};
|
||||||
|
// index of the spec associated with this allocation
|
||||||
|
// in the event that the specs get re-ordered
|
||||||
|
// in the process of creating allocations
|
||||||
|
// e.g.,
|
||||||
|
// allocation_specs[sX, sY, sZ]
|
||||||
|
// ^ ^ ^
|
||||||
|
// values[vX, vY, vZ]
|
||||||
|
//
|
||||||
|
// means that an allocation created from sY
|
||||||
|
// will have an input_spec_index of 1
|
||||||
|
//
|
||||||
|
// this allows us to return to the original
|
||||||
|
// ordering before returning the allocations
|
||||||
|
size_t input_spec_index{0};
|
||||||
|
};
|
||||||
|
|
||||||
|
// Pairs a spec with its position in the caller's input vector so that the
// position survives sorting the specs by size.
struct AllocationSpecWithIndex {
|
||||||
|
// non-owning pointer to an element of the caller's spec vector
const AllocationSpec* spec;
|
||||||
|
// index of *spec in the caller's original (unsorted) spec vector
size_t index;
|
||||||
|
};
|
||||||
|
|
||||||
|
// associate specs with their original (unsorted) index
|
||||||
|
// and then sort them in descending order by byte size
|
||||||
|
std::vector<AllocationSpecWithIndex> prepare_allocation_specs(
|
||||||
|
const std::vector<AllocationSpec>& allocation_specs) {
|
||||||
|
std::vector<AllocationSpecWithIndex> specs;
|
||||||
|
specs.reserve(allocation_specs.size());
|
||||||
|
|
||||||
|
for (const auto i : c10::irange(allocation_specs.size())) {
|
||||||
|
specs.push_back({&allocation_specs[i], i});
|
||||||
|
}
|
||||||
|
|
||||||
|
std::sort(specs.begin(), specs.end(), [](auto& lhs, auto& rhs) {
|
||||||
|
return lhs.spec->size > rhs.spec->size;
|
||||||
|
});
|
||||||
|
|
||||||
|
return specs;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace
|
||||||
|
|
||||||
|
namespace torch::nativert {
|
||||||
|
|
||||||
|
// https://arxiv.org/pdf/2001.03288
|
||||||
|
LayoutPlan GreedyBySizeAllocationPlanner(
|
||||||
|
const std::vector<AllocationSpec>& allocation_specs) {
|
||||||
|
LayoutPlan plan;
|
||||||
|
|
||||||
|
auto descending_allocation_specs = prepare_allocation_specs(allocation_specs);
|
||||||
|
|
||||||
|
std::vector<GreedyAllocation> allocations;
|
||||||
|
allocations.reserve(allocation_specs.size());
|
||||||
|
|
||||||
|
auto get_next_offset = [&](const AllocationSpec& spec) -> size_t {
|
||||||
|
size_t prev_offset = 0;
|
||||||
|
std::optional<size_t> best_offset = std::nullopt;
|
||||||
|
size_t smallest_gap = std::numeric_limits<size_t>::max();
|
||||||
|
|
||||||
|
for (const auto& alloc : allocations) {
|
||||||
|
if (auto* allocated_spec =
|
||||||
|
descending_allocation_specs.at(alloc.allocation_index).spec;
|
||||||
|
allocated_spec->not_overlapping_with(spec)) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (alloc.offset > prev_offset) {
|
||||||
|
if (size_t gap = alloc.offset - prev_offset;
|
||||||
|
gap >= spec.size && gap < smallest_gap) {
|
||||||
|
smallest_gap = gap;
|
||||||
|
best_offset = prev_offset;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
prev_offset = std::max(prev_offset, alloc.offset + alloc.size);
|
||||||
|
}
|
||||||
|
|
||||||
|
return best_offset.value_or(prev_offset);
|
||||||
|
};
|
||||||
|
|
||||||
|
size_t total_allocation_size = 0;
|
||||||
|
for (const auto&& [allocation_index, spec_with_original_index] :
|
||||||
|
c10::enumerate(descending_allocation_specs)) {
|
||||||
|
auto& spec = spec_with_original_index.spec;
|
||||||
|
|
||||||
|
auto new_allocation = GreedyAllocation(
|
||||||
|
Allocation{.size = spec->size, .offset = get_next_offset(*spec)},
|
||||||
|
allocation_index,
|
||||||
|
spec_with_original_index.index);
|
||||||
|
|
||||||
|
total_allocation_size += new_allocation.size;
|
||||||
|
plan.total_size =
|
||||||
|
std::max(plan.total_size, new_allocation.offset + new_allocation.size);
|
||||||
|
|
||||||
|
VLOG(1) << "allocation with interval " << spec->lifetime.start << "-->"
|
||||||
|
<< spec->lifetime.end << " placed at offset "
|
||||||
|
<< new_allocation.offset;
|
||||||
|
|
||||||
|
// insert new allocation while maintaining relative-offset ordering
|
||||||
|
// the algorithm is already quadratic because of get_next_offset
|
||||||
|
// so this is negligible
|
||||||
|
|
||||||
|
auto it = std::lower_bound(
|
||||||
|
allocations.begin(),
|
||||||
|
allocations.end(),
|
||||||
|
new_allocation,
|
||||||
|
[](auto& lhs, auto& rhs) { return lhs.offset < rhs.offset; });
|
||||||
|
allocations.insert(it, new_allocation);
|
||||||
|
}
|
||||||
|
|
||||||
|
// sort allocations so their ordering is consistent with the input specs
|
||||||
|
std::sort(allocations.begin(), allocations.end(), [](auto& lhs, auto& rhs) {
|
||||||
|
return lhs.input_spec_index < rhs.input_spec_index;
|
||||||
|
});
|
||||||
|
|
||||||
|
plan.allocations.reserve(allocations.size());
|
||||||
|
std::move(
|
||||||
|
allocations.begin(),
|
||||||
|
allocations.end(),
|
||||||
|
std::back_inserter(plan.allocations));
|
||||||
|
|
||||||
|
if (plan.total_size > 0) {
|
||||||
|
VLOG(1) << std::fixed << std::setprecision(2)
|
||||||
|
<< "greedy-by-size bytes saved over strictly increasing: "
|
||||||
|
<< (1.0 - ((float)plan.total_size / (float)total_allocation_size)) *
|
||||||
|
100
|
||||||
|
<< "% (" << total_allocation_size << " - " << plan.total_size
|
||||||
|
<< " = " << (total_allocation_size - plan.total_size) << " bytes)";
|
||||||
|
}
|
||||||
|
|
||||||
|
return plan;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace torch::nativert
|
10
torch/nativert/executor/memory/GreedyBySize.h
Normal file
10
torch/nativert/executor/memory/GreedyBySize.h
Normal file
@ -0,0 +1,10 @@
|
|||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <torch/nativert/executor/memory/LayoutPlannerAlgorithm.h>
|
||||||
|
|
||||||
|
namespace torch::nativert {
|
||||||
|
|
||||||
|
// Greedy-by-size layout planner (https://arxiv.org/pdf/2001.03288).
//
// Places specs from largest to smallest, putting each one in the tightest
// gap that fits among already-placed allocations with overlapping
// lifetimes, or after the last of them when no gap fits.
//
// @param allocation_specs  specs to place (size and lifetime are used).
// @return a plan with one allocation per spec, in the same order as the
//         input, and a total size covering the furthest allocation end.
LayoutPlan GreedyBySizeAllocationPlanner(
|
||||||
|
const std::vector<AllocationSpec>& allocation_specs);
|
||||||
|
|
||||||
|
} // namespace torch::nativert
|
Reference in New Issue
Block a user