[static-runtime] one more attempt to improve crash log readability (#96903)
Summary:
* add human-readable type and ivalue printout
* fix internal linter warnings

Test Plan: the error message now looks like, e.g.:
```
E0315 16:27:32.409082 422313 ExceptionTracer.cpp:222] exception stack complete
terminate called after throwing an instance of 'c10::Error'
what(): List[int] is not a subtype of List[int]; schema arg name: 'split_sizes', ivalue: [1, 1]
```

Differential Revision: D44112297

Pull Request resolved: https://github.com/pytorch/pytorch/pull/96903
Approved by: https://github.com/davidberard98
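For context on where the `ivalue: [1, 1]` part of that message comes from: the commit adds a small helper that streams a `c10::IValue` into an `std::ostringstream` (see `iValueToString` in the diff below). A minimal standalone sketch of the same technique, written generically over any type with a stream-insertion operator:

```
#include <sstream>
#include <string>

// Generic sketch of the iValueToString helper added in this commit: any
// value with an operator<< can be rendered for an error message this way.
// c10::IValue provides such an operator, which is what the real helper uses.
template <typename T>
std::string toDebugString(const T& val) {
  std::ostringstream oss;
  oss << val;
  return oss.str();
}
```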
Committed by PyTorch MergeBot
parent 44d7bbfe22 | commit 57bb5b159d

torch/csrc/jit/runtime/static/.clang-tidy (new file, 20 lines)
diff --git a/torch/csrc/jit/runtime/static/.clang-tidy b/torch/csrc/jit/runtime/static/.clang-tidy
new file mode 100644
--- /dev/null
+++ b/torch/csrc/jit/runtime/static/.clang-tidy
@@ -0,0 +1,20 @@
+# NOTE there must be no spaces before the '-', so put the comma after.
+# When making changes, be sure to verify the output of the following command to ensure
+# the desired checks are enabled (run from the directory containing a .clang-tidy file):
+# ~/fbsource/tools/lint/clangtidy/clang-tidy-platform010 -list-checks
+# NOTE: Please don't disable inheritance from the parent to make sure that common checks get propagated.
+
+# Remove noisy checks which are not necessary.
+---
+InheritParentConfig: true
+Checks: '
+-cppcoreguidelines-avoid-c-arrays,
+-modernize-avoid-c-arrays,
+-modernize-loop-convert,
+-readability-container-size-empty,
+'
+WarningsAsErrors: '
+-facebook-hte-BadMemberName,
+-facebook-hte-NullableReturn,
+'
+...
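For reference: `InheritParentConfig: true` makes clang-tidy merge this file with the `.clang-tidy` found in parent directories rather than replacing it, and a leading `-` in `Checks` disables the named check for this directory. Individual lines can still opt out locally with a `NOLINTNEXTLINE` comment, the same mechanism this commit touches in the header file near the end of the diff; a minimal illustration:

```
#include <memory>

// Line-level suppression: the comment silences the named check on the
// following line only, so the rest of the file stays covered by clang-tidy.
// NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays)
static std::unique_ptr<int[]> buffer;
```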
diff --git a/torch/csrc/jit/runtime/static/impl.cpp b/torch/csrc/jit/runtime/static/impl.cpp
--- a/torch/csrc/jit/runtime/static/impl.cpp
+++ b/torch/csrc/jit/runtime/static/impl.cpp
@@ -27,6 +27,7 @@
 #include <torch/csrc/jit/runtime/static/passes.h>
 #include <torch/csrc/jit/runtime/vararg_functions.h>
 #include <algorithm>
+#include <cstdint>
 
 #ifndef AT_PER_OPERATOR_HEADERS
 #include <ATen/NativeFunctions.h>
@@ -56,6 +57,12 @@ namespace jit {
 
 namespace {
 
+std::string iValueToString(const c10::IValue& val) {
+  std::ostringstream oss;
+  oss << val;
+  return oss.str();
+}
+
 bool allArgsAreTensors(const Node* node) {
   const auto& inputs = node->inputs();
   return std::all_of(inputs.begin(), inputs.end(), [](const Value* value) {
@@ -395,7 +402,7 @@ ManagedTensorRanges::ManagedTensorRanges(
   const c10::FastSet<const Value*> graph_inputs(
       block.inputs().begin(), block.inputs().end());
 
-  const auto num_nodes = nodes.size();
+  const auto num_nodes = static_cast<uint32_t>(nodes.size());
   for (const auto i : c10::irange(num_nodes)) {
     auto* node = nodes[i];
     for (auto* input : node->inputs()) {
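This hunk is the first of many in this commit that apply the same mechanical fix: hoist the container size into a named `uint32_t` before the loop so that `c10::irange` yields a 32-bit unsigned index, avoiding the implicit `size_t` narrowing the linter flags at each indexing site. A minimal sketch of the pattern, assuming only `c10/util/irange.h`:

```
#include <cstdint>
#include <vector>

#include <c10/util/irange.h>

// The recurring pattern in this diff: cast the size once, then iterate with
// c10::irange so the induction variable has the same narrow unsigned type.
void process(const std::vector<int>& items) {
  const auto num_items = static_cast<uint32_t>(items.size());
  for (const auto i : c10::irange(num_items)) {
    (void)items[i]; // i is uint32_t here, matching 32-bit index storage
  }
}
```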
@@ -613,7 +620,7 @@ size_t StaticModule::prepareBlockInfo(
     c10::FastMap<const Value*, uint32_t>& value_to_index) {
   block_infos_.emplace(block, BlockInfo(start_idx, *block));
 
-  const auto num_inputs = block->inputs().size();
+  const auto num_inputs = static_cast<uint32_t>(block->inputs().size());
   for (const auto i : c10::irange(num_inputs)) {
     value_to_index.emplace(block->inputs()[i], start_idx + i);
   }
@@ -634,7 +641,7 @@ size_t StaticModule::prepareBlockInfo(
       cur_idx,
       " would overflow 2-byte index storage");
 
-  const auto num_outputs = node->outputs().size();
+  const auto num_outputs = static_cast<uint32_t>(node->outputs().size());
   for (const auto i : c10::irange(num_outputs)) {
     value_to_index.emplace(node->outputs()[i], cur_idx + i);
   }
@@ -720,8 +727,9 @@ size_t StaticModule::prepareStaticNodeInfos(
       node_idx +=
           prepareStaticNodeInfos(sub_block, value_to_index, alias_db, node_idx);
     }
-    ProcessedNodeInputs input_indices(node->inputs().size());
-    for (const auto input_idx : c10::irange(node->inputs().size())) {
+    const auto num_outputs = static_cast<uint32_t>(node->inputs().size());
+    ProcessedNodeInputs input_indices(num_outputs);
+    for (const auto input_idx : c10::irange<uint32_t>(num_outputs)) {
      auto* input = node->inputs()[input_idx];
      auto input_ivalue_idx = value_to_index.at(input);
      TORCH_CHECK(
@@ -782,7 +790,8 @@ void BlockInfo::prepare_for_memory_planner(
       continue;
     }
     auto outputs = pnode.node()->outputs();
-    for (const auto i : c10::irange(outputs.size())) {
+    const auto num_outputs = static_cast<uint32_t>(outputs.size());
+    for (const auto i : c10::irange(num_outputs)) {
       const Value* out_v = outputs[i];
       // Types are stored in the underlying TorchScript IR
       bool is_tensor_type = out_v->type()->castRaw<TensorType>();
@@ -934,18 +943,18 @@ void check_type(const Argument& schema_arg, const IValue& arg) {
   }
   TORCH_CHECK(
       arg.type()->isSubtypeOf(schema_arg.type()),
-      c10::to_string(arg.type()),
+      arg.type()->annotation_str(),
       " is not a subtype of ",
-      c10::to_string(schema_arg.type()),
-      "; schema arg name: ",
-      schema_arg.name());
+      schema_arg.type()->annotation_str(),
+      "; schema arg name: '",
+      schema_arg.name(),
+      "', ivalue: ",
+      iValueToString(arg));
 }
 } // namespace
 
 template <typename IValueList>
-void BlockRunner::set_inputs(
-    IValueList&& args,
-    const std::unordered_map<std::string, c10::IValue>& kwargs) {
+void BlockRunner::set_inputs(IValueList&& args, const KeywordArgs& kwargs) {
   const auto& schema = static_module_.schema();
   if (first_input_is_self_) {
     Input(0) = static_module_.module()._ivalue();
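The readability win in `check_type` comes from two changes: both sides of the subtype check are now rendered with `annotation_str()`, which prints a `c10::Type` as its Python-style annotation (e.g. `List[int]`), and the offending value itself is appended via the new `iValueToString` helper. A condensed sketch of the resulting check, using the same calls as the hunk above:

```
#include <sstream>

#include <ATen/core/ivalue.h>
#include <ATen/core/jit_type.h>
#include <c10/util/Exception.h>

// Condensed sketch of the improved check: readable type names on both sides
// of the comparison, plus the actual runtime value, as in the commit's
// example output "List[int] is not a subtype of List[int]; ... ivalue: [1, 1]".
void checkSubtype(const c10::TypePtr& expected, const c10::IValue& arg) {
  std::ostringstream oss;
  oss << arg; // same streaming used by iValueToString
  TORCH_CHECK(
      arg.type()->isSubtypeOf(expected),
      arg.type()->annotation_str(),
      " is not a subtype of ",
      expected->annotation_str(),
      "; ivalue: ",
      oss.str());
}
```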
@@ -972,8 +981,8 @@ void BlockRunner::set_inputs(
         " inputs for schema ",
         schema ? schema->name() : "(not available)");
 
-    for (size_t i = 0; i < args.size(); ++i) {
-      set_arg(i, std::forward<IValueList>(args));
+    for (const auto i_arg : c10::irange(args.size())) {
+      set_arg(i_arg, std::forward<IValueList>(args));
     }
     return;
   }
@@ -989,27 +998,28 @@ void BlockRunner::set_inputs(
       std::to_string(schema_args.size() - 1),
       " for schema ",
       schema->name());
-  for (size_t i = 0; i < schema_args.size() - 1; ++i) {
-    // Start at 1 since the schema always contains `self`.
-    const auto& schema_arg = schema_args[i + 1];
-
-    if (i < args.size()) {
-      check_type(schema_arg, args[i]);
-      set_arg(i, std::forward<IValueList>(args));
+  for (const auto i_arg : c10::irange(1, schema_args.size())) {
+    // Start at 1 since the schema always contains `self`.
+    const auto& schema_arg = schema_args[i_arg];
+
+    if (i_arg - 1 < args.size()) {
+      check_type(schema_arg, std::forward<IValueList>(args)[i_arg - 1]);
+      set_arg(i_arg - 1, std::forward<IValueList>(args));
       continue;
     }
 
     auto it = kwargs.find(schema_arg.name());
     if (it != kwargs.end()) {
       check_type(schema_arg, it->second);
-      set_arg(i, it->second);
+      set_arg(i_arg - 1, it->second);
       ++consumed_kwargs;
       continue;
     }
 
     auto maybe_default_val = schema_arg.default_value();
     if (maybe_default_val) {
-      set_arg(i, *maybe_default_val);
+      set_arg(i_arg - 1, *maybe_default_val);
       continue;
     }
 
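The loop rewrite above changes the induction variable from a zero-based position in `args` to the schema index `i_arg`, which starts at 1 because `schema_args[0]` is always `self`; every access into the caller's argument list therefore uses `i_arg - 1`. A small standalone sketch of that index mapping (the names are illustrative, not the PyTorch API):

```
#include <cassert>
#include <string>
#include <vector>

// Illustrative index mapping: the schema lists `self` first, the caller's
// positional args do not, so schema index i_arg pairs with args[i_arg - 1].
int main() {
  const std::vector<std::string> schema_args = {"self", "split_sizes", "dim"};
  const std::vector<std::string> args = {"[1, 1]", "0"}; // no self
  for (size_t i_arg = 1; i_arg < schema_args.size(); ++i_arg) {
    assert(i_arg - 1 < args.size());
    // schema_args[i_arg] describes args[i_arg - 1]
  }
  return 0;
}
```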
@@ -1017,6 +1027,8 @@ void BlockRunner::set_inputs(
         false,
         "Static runtime is missing required kwarg ",
         schema_arg.name(),
+        " i_arg: ",
+        std::to_string(i_arg),
         " for schema ",
         schema->name());
   }
@@ -1045,7 +1057,8 @@ namespace {
 
 void destroyNodeOutputs(ProcessedNode& p_node) {
   const auto borrows_outputs = borrowsOutputs(p_node.node()->kind());
-  for (const auto i : c10::irange(p_node.num_outputs())) {
+  const auto num_outputs = static_cast<uint32_t>(p_node.num_outputs());
+  for (const auto i : c10::irange<uint32_t>(num_outputs)) {
     auto& output = p_node.Output(i);
     if (doesNotHeapAllocateWhenStoredInIValue(*output.type())) {
       continue;
@@ -1179,7 +1192,8 @@ void BlockRunner::verify_and_correct_memory_overlap(ProcessedNode& n) {
     n.verify_and_correct_memory_overlap();
   } else {
     bool overlap_detected_with_fast_check = false;
-    for (size_t i = 0; i < n.outputs().size(); i++) {
+    const auto n_outputs = static_cast<uint32_t>(n.outputs().size());
+    for (auto i : c10::irange(n_outputs)) {
       auto& output = n.Output(i);
       if (output.isTensor()) {
         overlap_detected_with_fast_check |=
@@ -1403,6 +1417,8 @@ std::string generate_latency_json(const std::string& label, double millis) {
   json["value"] = millis;
   return "PyTorchObserver " + folly::toJson(json);
 #else
+  (void)label;
+  (void)millis;
   return "";
 #endif
 }
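The two added `(void)` casts are the standard idiom for parameters that are only used in one branch of a preprocessor conditional: in builds where the `#else` branch is compiled, the casts mark `label` and `millis` as intentionally unused so `-Wunused-parameter` and the linter stay quiet. A standalone sketch, with `HAS_OBSERVER` as a hypothetical stand-in for the real build flag:

```
#include <string>

// Sketch of the unused-parameter idiom: when the feature is compiled out,
// casting each parameter to void documents that it is intentionally ignored.
std::string latencyJson(const std::string& label, double millis) {
#ifdef HAS_OBSERVER // hypothetical flag standing in for the real build gate
  return "PyTorchObserver {\"" + label + "\": " + std::to_string(millis) + "}";
#else
  (void)label;
  (void)millis;
  return "";
#endif
}
```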
@@ -1412,8 +1428,8 @@ std::string generate_latency_json(const std::string& label, double millis) {
 void BlockRunner::benchmark(
     const std::vector<std::vector<c10::IValue>>& args_list,
     const std::vector<KeywordArgs>& kwargs_list,
-    const int warmup_runs,
-    const int main_runs,
+    const uint32_t warmup_runs,
+    const uint32_t main_runs,
     bool print_per_node_time,
     bool generate_ai_pep_output) {
   TORCH_CHECK(kwargs_list.empty() || args_list.size() == kwargs_list.size());
@@ -1427,7 +1443,8 @@ void BlockRunner::benchmark(
       benchmark_individual_ops(args_list, kwargs_list, warmup_runs, main_runs);
 
   if (print_per_node_time) {
-    for (const auto i : c10::irange(nodes_.size())) {
+    const auto num_nodes = static_cast<uint32_t>(nodes_.size());
+    for (const auto i : c10::irange(num_nodes)) {
       const Node* node = nodes_[i].node();
       std::cout << "Node #" << i << ": " << results.time_per_node[i]
                 << " ms/iter, ";
@@ -1512,13 +1529,13 @@ void BlockRunner::benchmark(
   std::cout << "Total number of 'out' variant nodes/total number of nodes: "
             << results.out_nodes_count << "/" << results.total_nodes_count
             << " ("
-            << 100.0 * (results.out_nodes_count) /
+            << 100.0 * static_cast<float>(results.out_nodes_count) /
                 static_cast<float>(results.total_nodes_count)
             << "%)" << std::endl;
   std::cout << "Total number of nodes not covered by SR/total number of nodes: "
             << unsupported_nodes_count << "/" << results.total_nodes_count
             << " ("
-            << 100.0 * (unsupported_nodes_count) /
+            << 100.0 * static_cast<float>(unsupported_nodes_count) /
                 static_cast<float>(results.total_nodes_count)
             << "%)" << std::endl;
 
@@ -1534,16 +1551,17 @@ void BlockRunner::benchmark(
 float BlockRunner::benchmark_model(
     const std::vector<std::vector<c10::IValue>>& args_list,
     const std::vector<KeywordArgs>& kwargs_list,
-    const int warmup_runs,
-    const int main_runs) {
-  TORCH_CHECK(warmup_runs >= 0 && main_runs >= 1);
+    const unsigned int warmup_runs,
+    const unsigned int main_runs) {
+  TORCH_CHECK(main_runs >= 1);
   TORCH_CHECK(kwargs_list.empty() || args_list.size() == kwargs_list.size());
 
   const bool is_kwargs_empty = kwargs_list.empty();
   const KeywordArgs empty_kwargs;
-  for (const auto i : c10::irange(warmup_runs)) {
-    (void)i; // Suppress unused variable warning
-    for (const auto j : c10::irange(args_list.size())) {
+  for (const auto _n_run : c10::irange(warmup_runs)) {
+    (void)_n_run; // Suppress unused variable warning
+    const auto num_args = static_cast<uint32_t>(args_list.size());
+    for (const auto j : c10::irange(num_args)) {
       operator()(args_list[j], is_kwargs_empty ? empty_kwargs : kwargs_list[j]);
       if (manage_output_tensors_enabled_) {
         deallocateOutputTensors();
@@ -1551,9 +1569,10 @@ float BlockRunner::benchmark_model(
     }
   }
   caffe2::Timer timer;
-  for (const auto i : c10::irange(main_runs)) {
-    (void)i; // Suppress unused variable warning
-    for (const auto j : c10::irange(args_list.size())) {
+  for (const auto _n_run : c10::irange(main_runs)) {
+    (void)_n_run; // Suppress unused variable warning
+    const auto num_args = static_cast<uint32_t>(args_list.size());
+    for (const auto j : c10::irange(num_args)) {
       operator()(args_list[j], is_kwargs_empty ? empty_kwargs : kwargs_list[j]);
       if (manage_output_tensors_enabled_) {
         deallocateOutputTensors();
@@ -1561,15 +1580,18 @@ float BlockRunner::benchmark_model(
     }
   }
   float millis = timer.MilliSeconds();
-  return millis / (static_cast<float>(main_runs) * args_list.size());
+  return millis /
+      (static_cast<float>(main_runs) * static_cast<float>(args_list.size()));
 }
 
 bool display_ivalue(const IValue& iv) {
   if (iv.isTensor()) {
     std::cout << "Tensor " << iv.toTensor().toString() << " {";
-    for (const auto i : c10::irange(iv.toTensor().sizes().size())) {
+    const auto dims = iv.toTensor().sizes();
+    const auto n_dims = static_cast<uint32_t>(dims.size());
+    for (const auto i : c10::irange(n_dims)) {
       std::cout << iv.toTensor().sizes()[i];
-      if (iv.toTensor().sizes().size() > i + 1) {
+      if (n_dims > i + 1) {
         std::cout << ", ";
       }
     }
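Beyond the index-type change, this hunk also hoists the repeated `iv.toTensor().sizes()` calls into the locals `dims` and `n_dims`, so the accessor chain is evaluated once rather than on every iteration and comparison. A standalone sketch of the same shape-printing loop:

```
#include <cstdint>
#include <iostream>
#include <vector>

// Standalone sketch of the hoisting applied above: fetch the dimension list
// once, cache its length, and reuse both inside the loop.
void printShape(const std::vector<int64_t>& dims) {
  const auto n_dims = static_cast<uint32_t>(dims.size());
  std::cout << '{';
  for (uint32_t i = 0; i < n_dims; ++i) {
    std::cout << dims[i];
    if (n_dims > i + 1) {
      std::cout << ", ";
    }
  }
  std::cout << "}\n";
}
```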
@@ -1599,14 +1621,16 @@ bool display_ivalue(const IValue& iv) {
 
 void display_pnode_info(const ProcessedNode& pnode) {
   pnode.node()->print(std::cout, 0, nullptr, false);
-  for (const auto i : c10::irange(pnode.num_inputs())) {
+  const auto num_inputs = static_cast<uint32_t>(pnode.num_inputs());
+  for (const auto i : c10::irange(num_inputs)) {
     std::cout << "\ti" << i << ": ";
     if (!display_ivalue(pnode.Input(i))) {
       std::cout << *(pnode.node()->inputs()[i]->type()) << '\n';
     }
   }
   const auto outputs = pnode.outputs();
-  for (const auto i : c10::irange(outputs.size())) {
+  const auto num_outputs = static_cast<uint32_t>(outputs.size());
+  for (const auto i : c10::irange(num_outputs)) {
     std::cout << "\to" << i << ": ";
     if (!display_ivalue(outputs[i])) {
       std::cout << *(pnode.node()->outputs()[i]->type()) << '\n';
@@ -1636,8 +1660,8 @@ void BlockRunner::display_nodes(
 BlockRunner::IndividualMetrics BlockRunner::benchmark_individual_ops(
     const std::vector<std::vector<c10::IValue>>& args_list,
     const std::vector<KeywordArgs>& kwargs_list,
-    const int warmup_runs,
-    const int main_runs) {
+    const uint32_t warmup_runs,
+    const uint32_t main_runs) {
   TORCH_CHECK(kwargs_list.empty() || args_list.size() == kwargs_list.size());
   TORCH_CHECK(warmup_runs >= 1 && main_runs >= 1);
 
@@ -1646,7 +1670,8 @@ BlockRunner::IndividualMetrics BlockRunner::benchmark_individual_ops(
   if (args_list.empty()) {
     // When the given input is empty, compute the op statistics from the given
     // graph without executing it.
-    for (const auto i : c10::irange(nodes_.size())) {
+    const auto num_nodes = static_cast<uint32_t>(nodes_.size());
+    for (const auto i : c10::irange(num_nodes)) {
       const Node* node = nodes_[i].node();
       std::string kind(node->kind().toQualString());
       // TODO: Collect op statistics from sub-blocks here.
@@ -1697,9 +1722,10 @@ BlockRunner::IndividualMetrics BlockRunner::benchmark_individual_ops(
   results.first_iter_time = timer.MilliSeconds();
 
   // warmup runs
-  for (const auto i : c10::irange(warmup_runs - 1)) {
-    (void)i; // Suppress unused variable warning
-    for (const auto j : c10::irange(args_list.size())) {
+  for (const auto _n_run : c10::irange(warmup_runs)) {
+    (void)_n_run; // Suppress unused variable warning
+    const auto num_args = static_cast<uint32_t>(args_list.size());
+    for (const auto j : c10::irange(num_args)) {
       operator()(args_list[j], is_kwargs_empty ? empty_kwargs : kwargs_list[j]);
       if (manage_output_tensors) {
         deallocateOutputTensors();
@@ -1710,8 +1736,8 @@ BlockRunner::IndividualMetrics BlockRunner::benchmark_individual_ops(
   // main runs
   for (const auto i : c10::irange(main_runs)) {
     (void)i; // Suppress unused variable warning
-
-    for (const auto j : c10::irange(args_list.size())) {
+    const auto num_args = static_cast<uint32_t>(args_list.size());
+    for (const auto j : c10::irange(num_args)) {
       set_inputs(args_list[j], is_kwargs_empty ? empty_kwargs : kwargs_list[j]);
 
       timer.Start();
@@ -1720,8 +1746,8 @@ BlockRunner::IndividualMetrics BlockRunner::benchmark_individual_ops(
     }
     float millis = timer.MilliSeconds();
     results.memory_alloc_time += millis;
-
-    for (const auto k : c10::irange(nodes_.size())) {
+    const auto num_nodes = static_cast<uint32_t>(nodes_.size());
+    for (const auto k : c10::irange<uint32_t>(num_nodes)) {
       timer.Start();
       nodes_[k].run();
       millis = timer.MilliSeconds();
@@ -1759,8 +1785,9 @@ BlockRunner::IndividualMetrics BlockRunner::benchmark_individual_ops(
 
   // post processing
   const float num_total_iters =
-      (static_cast<float>(main_runs) * args_list.size());
-  for (const auto i : c10::irange(nodes_.size())) {
+      (static_cast<float>(main_runs) * static_cast<float>(args_list.size()));
+  const auto num_nodes = static_cast<uint32_t>(nodes_.size());
+  for (const auto i : c10::irange(num_nodes)) {
     const Node* node = nodes_[i].node();
     std::string kind = std::string(node->kind().toQualString());
     results.time_per_node[i] /= num_total_iters;
@@ -1789,7 +1816,8 @@ bool BlockRunner::check_for_memory_leak(
     bool output_returned,
     bool recurse_on_sub_blocks) {
   // check for inputs
-  for (const auto i : c10::irange(block_info_.num_inputs())) {
+  const auto num_inputs = static_cast<uint32_t>(block_info_.num_inputs());
+  for (const auto i : c10::irange(num_inputs)) {
     TORCH_CHECK(
         values_[i + block_info_.block_inputs_idx()].isNone(),
         "Input ",
@@ -1797,9 +1825,11 @@ bool BlockRunner::check_for_memory_leak(
       " was not cleaned up");
   }
   c10::FastSet<const IValue*> output_ivalues(outputs_.begin(), outputs_.end());
-  for (const auto n : c10::irange(nodes_.size())) {
+  const auto num_nodes = static_cast<uint32_t>(nodes_.size());
+  for (const auto n : c10::irange(num_nodes)) {
     auto& pnode = nodes_[n];
-    for (const auto i : c10::irange(pnode.num_outputs())) {
+    const auto num_outputs = static_cast<uint32_t>(pnode.num_outputs());
+    for (const auto i : c10::irange(num_outputs)) {
       const IValue* ival = &pnode.Output(i);
       const Value* val = pnode.node()->output(i);
       // subtlety: isManagedOutputTensorValue may give a false
@@ -1875,9 +1905,11 @@ bool BlockRunner::checkOutputTensorMemoryLeaks() {
   if (!static_module_.opts().manage_output_tensors || !planner_) {
     return true;
   }
-  for (const auto n : c10::irange(nodes_.size())) {
+  const auto num_nodes = static_cast<uint32_t>(nodes_.size());
+  for (const auto n : c10::irange(num_nodes)) {
     auto& pnode = nodes_[n];
-    for (const auto i : c10::irange(pnode.num_outputs())) {
+    const auto num_outputs = static_cast<uint32_t>(pnode.num_outputs());
+    for (const auto i : c10::irange(num_outputs)) {
       const IValue* ival = &pnode.Output(i);
       const Value* val = pnode.node()->output(i);
       if (!isManagedOutputTensorValue(val) || !ival->isTensor()) {
@@ -1924,7 +1956,8 @@ void BlockRunner::disableManageOutputTensors() {
   // Reset all IValues and destruct planner_ so that it can be reconstructed in
   // the next run.
   for (auto& n : nodes_) {
-    for (const auto i : c10::irange(n.outputs().size())) {
+    const auto num_outputs = static_cast<uint32_t>(n.outputs().size());
+    for (const auto i : c10::irange(num_outputs)) {
       n.Output(i) = IValue();
     }
   }
@@ -1964,7 +1997,7 @@ ProcessedFunction::ProcessedFunction(
     f_ = [node_op = op.getOperation(node),
           has_var_args = hasVarArgs(node)](ProcessedNode* pnode) mutable {
       std::vector<IValue> stack;
-      const size_t size = pnode->num_inputs();
+      const auto size = static_cast<uint32_t>(pnode->num_inputs());
       stack.reserve(size + has_var_args);
       for (const auto i : c10::irange(size)) {
         stack.emplace_back(pnode->Input(i));
@@ -1974,8 +2007,9 @@ ProcessedFunction::ProcessedFunction(
         stack.emplace_back(static_cast<int>(size));
       }
       node_op(stack);
-      TORCH_DCHECK_EQ(stack.size(), pnode->num_outputs());
-      for (const auto i : c10::irange(pnode->num_outputs())) {
+      const auto num_outputs = static_cast<uint32_t>(pnode->num_outputs());
+      TORCH_DCHECK_EQ(stack.size(), num_outputs);
+      for (const auto i : c10::irange(num_outputs)) {
         pnode->Output(i) = std::move(stack[i]);
       }
     };
@@ -2005,8 +2039,10 @@ StaticNodeInfo::StaticNodeInfo(
 
 std::vector<IValue> ProcessedNode::inputs_ivalue_vec() const {
   std::vector<IValue> result;
-  result.reserve(inputs_.size());
-  for (const auto idx : c10::irange(num_inputs())) {
+  const auto num_inputs = static_cast<uint32_t>(inputs_.size());
+  result.reserve(num_inputs);
+
+  for (const auto idx : c10::irange(num_inputs)) {
     result.emplace_back(Input(idx));
   }
   return result;
@@ -2081,12 +2117,13 @@ bool ProcessedNode::verify_no_memory_overlap(bool force_check) const {
 }
 
 bool ProcessedNode::verify_outputs_dont_overlap_each_other() const {
-  for (const auto i : c10::irange(num_outputs())) {
+  const auto n_outputs = static_cast<uint32_t>(num_outputs());
+  for (const auto i : c10::irange(n_outputs)) {
     if (!Output(i).isTensor()) {
       continue;
     }
     const auto& out0_t = Output(i).toTensor();
-    for (const auto j : c10::irange(i + 1, num_outputs())) {
+    for (const auto j : c10::irange(i + 1, n_outputs)) {
       if (!Output(j).isTensor()) {
         continue;
       }
@@ -2117,14 +2154,15 @@ bool ProcessedNode::verify_inputs_dont_overlap_outputs(bool force_check) const {
                << ", num_outputs_: " << num_outputs();
    return true;
  }
 
-  for (const auto i : c10::irange(inputs_.size())) {
+  const auto n_inputs = static_cast<uint32_t>(inputs_.size());
+  const auto n_outputs = static_cast<uint32_t>(num_outputs());
+  for (const auto i : c10::irange<uint32_t>(n_inputs)) {
     const IValue* in = &Input(i);
     if (!in->isTensor()) {
       continue;
     }
     const auto& in_t = in->toTensor();
-    for (const auto j : c10::irange(num_outputs())) {
+    for (const auto j : c10::irange(n_outputs)) {
       const IValue& out = Output(j);
       if (!out.isTensor()) {
         continue;
@@ -2155,13 +2193,15 @@ bool ProcessedNode::check_and_correct_overlap_with(
 }
 
 void ProcessedNode::verify_and_correct_memory_overlap() {
-  for (const auto i : c10::irange(inputs_.size())) {
+  const auto n_inputs = static_cast<uint32_t>(inputs_.size());
+  const auto n_outputs = static_cast<uint32_t>(num_outputs());
+  for (const auto i : c10::irange(n_inputs)) {
    const IValue& in = Input(i);
    if (!in.isTensor()) {
      continue;
    }
    const auto& in_t = in.toTensor();
-    for (const auto j : c10::irange(num_outputs())) {
+    for (const auto j : c10::irange(n_outputs)) {
      auto& output = Output(j);
      if (output.isTensor()) {
        check_and_correct_overlap_with(in_t, output);
 
diff --git a/torch/csrc/jit/runtime/static/impl.h b/torch/csrc/jit/runtime/static/impl.h
--- a/torch/csrc/jit/runtime/static/impl.h
+++ b/torch/csrc/jit/runtime/static/impl.h
@@ -1,5 +1,4 @@
 #pragma once
-
 #include <ATen/core/ivalue.h>
 #include <ATen/core/symbol.h>
 #include <c10/core/CPUAllocator.h>
@@ -567,8 +566,8 @@ class TORCH_API BlockRunner {
   void benchmark(
       const std::vector<std::vector<c10::IValue>>& args_list,
       const std::vector<KeywordArgs>& kwargs_list,
-      const int warmup_runs,
-      const int main_runs,
+      const uint32_t warmup_runs,
+      const uint32_t main_runs,
       bool print_per_node_time = false,
       bool generate_ai_pep_output = false);
 
@@ -592,8 +591,8 @@ class TORCH_API BlockRunner {
   IndividualMetrics benchmark_individual_ops(
       const std::vector<std::vector<c10::IValue>>& args_list,
       const std::vector<KeywordArgs>& kwargs_list,
-      const int warmup_runs,
-      const int main_runs);
+      const uint32_t warmup_runs,
+      const uint32_t main_runs);
 
   // Input is readwrite
   IValue& Input(uint32_t i) {
@@ -704,9 +703,7 @@ class TORCH_API BlockRunner {
 
   // helper method for copying input args/kwargs into inputs_
   template <typename IValueList>
-  void set_inputs(
-      IValueList&& args,
-      const std::unordered_map<std::string, c10::IValue>& kwargs);
+  void set_inputs(IValueList&& args, const KeywordArgs& kwargs);
 
   // Set Input(idx) to args[idx]. Invoked by set_inputs. Copies or moves
   // depending on overload.
@@ -737,8 +734,8 @@ class TORCH_API BlockRunner {
   float benchmark_model(
       const std::vector<std::vector<c10::IValue>>& args_list,
       const std::vector<KeywordArgs>& kwargs_list,
-      const int warmup_runs,
-      const int main_runs);
+      const uint32_t warmup_runs,
+      const uint32_t main_runs);
 
   void display_nodes(
       const std::vector<c10::IValue>& args,
@@ -1073,8 +1070,8 @@ class TORCH_API StaticRuntime {
   void benchmark(
       const std::vector<std::vector<c10::IValue>>& args_list,
       const std::vector<KeywordArgs>& kwargs_list,
-      const int warmup_runs,
-      const int main_runs,
+      const uint32_t warmup_runs,
+      const uint32_t main_runs,
       bool print_per_node_time = false,
       bool generate_ai_pep_output = false) {
     block_->benchmark(
@@ -1125,7 +1122,7 @@ class TORCH_API StaticRuntime {
-    // NOLINTNEXTLINE(modernize-avoid-c-arrays)
+    // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays)
     std::unique_ptr<IValue[]> array_ = nullptr;
-    size_t size_;
+    size_t size_ = 0;
   };
 
   std::unique_ptr<BlockRunner> block_;
 