// pytorch/torch/csrc/jit/runtime/static/impl.cpp
#include <torch/csrc/jit/runtime/static/impl.h>
#include <ATen/MemoryOverlap.h>
#include <ATen/core/symbol.h>
#include <ATen/record_function.h>
#include <c10/core/CPUAllocator.h>
#include <c10/core/InferenceMode.h>
#include <c10/macros/Macros.h>
#include <c10/util/MaybeOwned.h>
#include <c10/util/irange.h>
#include <caffe2/core/timer.h>
#include <torch/csrc/jit/ir/alias_analysis.h>
#include <torch/csrc/jit/jit_log.h>
#include <torch/csrc/jit/passes/add_if_then_else.h>
#include <torch/csrc/jit/passes/canonicalize.h>
#include <torch/csrc/jit/passes/dead_code_elimination.h>
#include <torch/csrc/jit/passes/eliminate_no_ops.h>
#include <torch/csrc/jit/passes/freeze_module.h>
#include <torch/csrc/jit/passes/remove_mutation.h>
#include <torch/csrc/jit/passes/subgraph_rewrite.h>
#include <torch/csrc/jit/passes/variadic_ops.h>
#include <torch/csrc/jit/runtime/graph_iterator.h>
#include <torch/csrc/jit/runtime/static/fusion.h>
#include <torch/csrc/jit/runtime/static/memory_planner.h>
#include <torch/csrc/jit/runtime/static/ops.h>
#include <torch/csrc/jit/runtime/static/passes.h>
#include <torch/csrc/jit/runtime/vararg_functions.h>
#include <algorithm>
#include <cstdint>
#include <iostream>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/NativeFunctions.h>
#else
#include <ATen/ops/clone_native.h>
#endif
#include <iterator>
#include <limits>
#include <sstream>
#include <stdexcept>
#ifdef FBCODE_CAFFE2
#include <common/logging/logging.h>
#include <folly/dynamic.h>
#include <folly/json.h>
#endif
// Used in tests only.
C10_DEFINE_bool(
static_runtime_disable_debug_memory_overlap_check,
false,
"If true, disable the memory overlap check in debug mode in ProcessedNode::run()");
namespace torch::jit {
namespace {
std::string iValueToString(const c10::IValue& val) {
std::ostringstream oss;
oss << val;
return oss.str();
}
bool allArgsAreTensors(const Node* node) {
const auto& inputs = node->inputs();
return std::all_of(inputs.begin(), inputs.end(), [](const Value* value) {
return value->type()->kind() == TypeKind::TensorType;
});
}
} // namespace
// A manually curated set of ops that are disallowed in static runtime.
// These are rarely-used ops. Disallowing them typically eliminates
// corner cases in graph optimizations, allowing for more aggressive
// optimizations and better performance.
static bool isUnsupportedOp(const Node* node) {
auto kind = node->kind();
if (kind != aten::__is__ && kind != aten::__isnot__) {
return false;
}
// We can't support aten::__is__ (and __isnot__) with tensor arguments.
// Consider the following graph:
// def forward(x):
// y = x.detach()
// return x is y
// We have a graph optimization that removes the `detach` node since it is
// a no-op during inference. But this affects the result - we get true
// instead of false! There are many other graph passes affected by this
// issue.
return allArgsAreTensors(node);
}
namespace {
bool canEnableStaticRuntimeImpl(const Block* block) {
if (block == nullptr) {
return false;
}
bool can_support = true;
for (auto* node : block->nodes()) {
for (auto* subblock : node->blocks()) {
// The ordering prevents && from short-circuiting, which we want:
// it's useful to see *all* of the unsupported ops.
can_support = canEnableStaticRuntimeImpl(subblock) && can_support;
}
const auto kind = node->kind();
if (kind == prim::Constant) {
continue;
}
// Check whether an Operator can be retrieved for this Node.
const Operator* op = node->maybeOperator();
if (isUnsupportedOp(node) || (!op && !nativeOpIsRegistered(kind))) {
can_support = false;
LOG(WARNING) << "Found unsupported op: " << kind.toQualString();
}
}
return can_support;
}
} // namespace
// Graph must be frozen. canEnableStaticRuntime will return false
// if there's any prim::CallMethod ops left in the graph.
bool canEnableStaticRuntime(const std::shared_ptr<torch::jit::Graph>& graph) {
return canEnableStaticRuntimeImpl(graph->block());
}
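// A hedged usage sketch (not part of this file): a caller might gate static
// runtime on this check. Assumes `module` is a frozen torch::jit::Module
// whose forward takes a single 4x4 tensor.
//
// @code
// auto graph = module.get_method("forward").graph();
// if (canEnableStaticRuntime(graph)) {
//   torch::jit::StaticModule smod(module, /*is_frozen=*/true);
//   auto out = smod({torch::randn({4, 4})}, /*kwargs=*/{});
// }
// @endcode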
namespace {
auto sr_metadata_registerer = torch::class_<StaticRuntimeMetadata>(
"StaticRuntime",
"StaticRuntimeMetadata");
} // namespace
std::string dumpValueSet(
const c10::FastSet<const Value*>& value_set,
const char* set_name) {
std::ostringstream oss;
oss << set_name << ": {";
for (const auto* val : value_set) {
oss << "%" << val->debugName() << ", ";
}
oss << "}";
return oss.str();
}
namespace {
void OptimizeGraph(
std::shared_ptr<torch::jit::Graph>& graph,
const StaticModuleOptions& opts,
std::vector<IValue> sample_inputs) {
GRAPH_DUMP("Before optimizations: ", graph);
if (opts.enable_tensorexpr_fusion) {
if (sample_inputs.empty()) {
VLOG(1) << "Cannot perform TensorExpr fusion - sample_inputs is empty";
} else {
VLOG(1) << "Performing TensorExpr fusion";
performTensorExprFusion(graph, std::move(sample_inputs));
}
}
Inline(*graph);
ConstantPropagation(graph);
Canonicalize(graph);
ConstantPropagation(graph);
RemoveTensorMutation(graph);
ConstantPropagation(graph);
EliminateNoOpSlice(graph);
EliminateDeadCode(graph);
FuseInferenceOpsForSparseNN(graph);
UseVariadicCat(graph);
UseVariadicStack(graph);
EliminateTrivialEquallySplit(graph);
EliminateExtraPermuteOps(graph);
if (opts.enable_out_variant) {
UseVariadicOp(
graph,
fromQualString("fb::sigrid_transforms_torch_bind"),
fromQualString("fb::variadic_sigrid_transforms_torch_bind"));
UseVariadicOp(
graph,
fromQualString("torcharrow::inference_wrapper_run_flat"),
fromQualString("torcharrow::variadic_inference_wrapper_run_flat"));
// These fused ops only have out variants - we can't do the fusion when
// out variants are disabled.
FuseSignLog1P(graph);
FuseClampNaNToNum(graph);
#ifdef FBCODE_CAFFE2
if (opts.use_copy_variants && !opts.enable_tensorexpr_fusion) {
ReplaceWithCopy(graph);
} else {
ReplacePermuteWithCopy(graph);
}
if (opts.use_maybe_copy_variants && !opts.enable_tensorexpr_fusion) {
ReplaceWithMaybeCopy(graph);
}
FuseListUnpack(graph);
RemoveUnnecessaryOutputs(graph);
PrepackWeights(graph);
#endif
}
ConstantPropagation(graph);
RemoveImmutableInputDictLookups(graph);
UseVariadicTupleUnpack(graph);
UseVariadicGroupedAccessor(graph);
EliminateNoOps(
graph, /* custom_ops */ {fromQualString("fb::scale_gradient")});
AddIfThenElseOp(graph);
UseSplitAndSqueeze(graph);
UseInPlaceGetRealInputsFromOptionalInputsV2(graph);
GRAPH_DUMP("Final graph after optimizations: ", graph);
}
bool IsSelfInGraphInput(std::shared_ptr<torch::jit::Graph>& graph) {
return !graph->inputs().empty() && graph->inputs().at(0)->type()->is_module();
}
// Removes the unused self argument (input 0) from the graph, if possible.
bool removeSelfFromGraphInput(std::shared_ptr<torch::jit::Graph>& graph) {
if (graph->inputs().at(0)->type()->is_module()) {
if (graph->inputs().at(0)->hasUses()) {
return false;
}
graph->eraseInput(0);
}
return true;
}
std::vector<Value*> valueVecFromFastSet(const c10::FastSet<const Value*>& s) {
std::vector<Value*> result;
result.reserve(s.size());
for (auto* v : s) {
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
result.emplace_back(const_cast<Value*>(v));
}
return result;
}
bool mayContainAlias(const AliasDb& db, const Value* v1, const Value* v2) {
// AliasDb is not const-correct here, so we have to const_cast
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
return db.mayContainAlias(const_cast<Value*>(v1), const_cast<Value*>(v2));
}
bool mayContainAlias(
const AliasDb& db,
const Value* a,
const c10::FastSet<const Value*>& b) {
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
return db.mayContainAlias(const_cast<Value*>(a), valueVecFromFastSet(b));
}
bool escapesScope(const AliasDb& db, const Value* a) {
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
return db.escapesScope({const_cast<Value*>(a)});
}
void PrepareGraphForStaticModule(
std::shared_ptr<torch::jit::Graph> graph,
const StaticModuleOptions& opts,
std::vector<IValue> sample_inputs) {
TORCH_CHECK(canEnableStaticRuntime(graph));
OptimizeGraph(graph, opts, std::move(sample_inputs));
// Static runtime moves its outputs out of the runtime
// by default. In some rare cases, this is not actually safe to
// do - for example, if the value is a constant, static runtime
// needs to hold onto a copy. Rather than adding special logic
// to handle this rare case, we use this pass to detect it and
// create an owned reference that can be safely moved out of the
// runtime.
CreateOwnedRefsForSpecialValues(*graph);
// We assume that each sub-block has at least one output. If we
// detect any that have 0, force the sub-block to return None.
ForceNonEmptyOutputs(*graph);
}
std::pair<std::shared_ptr<Graph>, std::optional<Module>> PrepareForStaticModule(
const torch::jit::Module& m,
bool is_frozen,
const StaticModuleOptions& opts,
std::vector<IValue> sample_inputs) {
LOG(INFO) << "StaticModuleOptions: enable_out_variant "
<< opts.enable_out_variant << ", optimize_memory "
<< opts.optimize_memory << ", manage_output_tensors "
<< opts.manage_output_tensors << ", use_copy_variants "
<< opts.use_copy_variants << ", use_maybe_copy_variants "
<< opts.use_maybe_copy_variants << ", enable_tensorexpr_fusion "
<< opts.enable_tensorexpr_fusion;
Module module = m.copy();
if (!is_frozen) {
module.eval();
module = freeze_module(module);
}
auto graph = module.get_method("forward").graph();
if (!sample_inputs.empty() && IsSelfInGraphInput(graph)) {
sample_inputs.insert(sample_inputs.begin(), m._ivalue());
}
PrepareGraphForStaticModule(graph, opts, std::move(sample_inputs));
return std::make_pair(graph, module);
}
std::pair<std::shared_ptr<Graph>, std::optional<Module>> PrepareForStaticModule(
std::shared_ptr<torch::jit::Graph> graph,
const StaticModuleOptions& opts,
std::vector<IValue> sample_inputs) {
PrepareGraphForStaticModule(graph, opts, std::move(sample_inputs));
return std::make_pair(graph, c10::nullopt);
}
} // namespace
void ValueGroup::init(const Block& block, const AliasDb& db) {
external_aliases_.clear();
output_aliases_.clear();
// Build `external_aliases` by walking the nodes forward from the graph's
// inputs, collecting values created by the nodes that may alias those
// inputs.
external_aliases_.insert(block.inputs().begin(), block.inputs().end());
for (const auto* node : block.nodes()) {
if (node->kind() == prim::Constant) {
for (const auto* output : node->outputs()) {
external_aliases_.insert(output);
}
}
}
for (const auto* node : block.nodes()) {
if (node->kind() == prim::Constant) {
// Constants are already in `external_aliases`.
continue;
}
for (const auto* v : node->outputs()) {
if (escapesScope(db, v) || mayContainAlias(db, v, external_aliases_)) {
external_aliases_.insert(v);
}
}
}
// Build `output_aliases` by walking the nodes in reverse, starting from the
// output values and following the data flow backward.
output_aliases_.insert(block.outputs().begin(), block.outputs().end());
for (const auto* node : block.nodes().reverse()) {
if (node->kind() == prim::Constant) {
// Constants cannot create any aliases.
continue;
}
for (const auto* v : node->outputs()) {
if (mayContainAlias(db, v, output_aliases_)) {
output_aliases_.insert(v);
}
}
}
}
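// An illustrative sketch of the two passes above. For a graph like
//
// @code
// graph(%x : Tensor):
//   %c : int = prim::Constant[value=1]()
//   %y : Tensor = aten::slice(%x, ...)  # %y may alias %x
//   %z : Tensor = aten::add(%y, %c, %c) # fresh allocation
//   return (%z)
// @endcode
//
// the forward pass puts %x, %c, and %y into `external_aliases_` (inputs,
// constants, and values that may alias them), and the reverse pass puts %z
// into `output_aliases_`. Values in neither set are the ones the memory
// planner is free to manage.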
namespace {
bool isTensorList(const Value* value) {
auto* type = value->type()->castRaw<ListType>();
if (!type) {
return false;
}
return type->getElementType()->kind() == c10::TypeKind::TensorType;
}
bool containTensorsOnly(at::ArrayRef<Value*> values) {
// Returns true only if all values are tensors or tensor lists.
return std::all_of(values.begin(), values.end(), [](const Value* value) {
return value->type()->kind() == c10::TypeKind::TensorType ||
isTensorList(value);
});
}
bool isPureFunction(const Node* node) {
auto* schema = node->maybeSchema();
return schema &&
schema->aliasAnalysis() == c10::AliasAnalysisKind::PURE_FUNCTION;
}
} // namespace
ManagedTensorRanges::ManagedTensorRanges(
Block& block,
const AliasDb& alias_db,
const c10::FastSet<const Value*>& managed_tensor_values) {
const std::vector<Node*> nodes(block.nodes().begin(), block.nodes().end());
const c10::FastSet<const Value*> graph_inputs(
block.inputs().begin(), block.inputs().end());
const auto num_nodes = static_cast<uint32_t>(nodes.size());
for (const auto i : c10::irange(num_nodes)) {
auto* node = nodes[i];
for (auto* input : node->inputs()) {
auto* lifetime = getLifetime(input);
if (!lifetime) {
continue;
}
DCHECK(lifetime->end <= i);
lifetime->end = i;
}
for (auto* output : node->outputs()) {
if (!alias_db.isMutableType(output)) {
continue;
}
value_lifetimes_.emplace(output, Lifetime(i, i));
}
}
for (auto* graph_output : block.outputs()) {
auto* lifetime = getLifetime(graph_output);
if (!lifetime) {
continue;
}
lifetime->end = num_nodes;
}
// Handle aliases. Aliases may extend a Value*'s lifetime. If a node
// has an input and output that may alias each other, set the input's
// lifetime end to max(input.lifetime_end, output.lifetime_end). Iterate
// backwards to handle chains of aliases.
for (const auto* node : block.nodes().reverse()) {
if (isPureFunction(node)) {
// If the node is a pure function, it doesn't create any aliases,
// so we can safely skip it.
continue;
}
auto inputs = collectValuesWithTrackedLifetimes(node->inputs());
auto outputs = collectValuesWithTrackedLifetimes(node->outputs());
for (auto* input : inputs) {
auto* input_lifetime = getLifetime(input);
DCHECK(input_lifetime != nullptr);
for (auto* output : outputs) {
if (mayContainAlias(alias_db, input, output)) {
auto* output_lifetime = getLifetime(output);
DCHECK(output_lifetime != nullptr);
input_lifetime->end =
std::max(output_lifetime->end, input_lifetime->end);
}
}
}
}
for (auto* managed_tensor : managed_tensor_values) {
auto* lifetime = getLifetime(managed_tensor);
DCHECK(lifetime && lifetime->end <= num_nodes);
// NOLINTNEXTLINE(cppcoreguidelines-init-variables)
Node* freeing_node;
if (lifetime->end == num_nodes) {
freeing_node = block.return_node();
} else {
freeing_node = nodes[lifetime->end];
}
node_to_newly_free_tensors_[freeing_node].emplace_back(managed_tensor);
}
}
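// A worked sketch of the lifetime computation above, assuming three nodes
// indexed 0..2 whose values are all mutable tensor types:
//
// @code
// %a = node0(%in)  // %a gets Lifetime(0, 0)
// %b = node1(%a)   // %b gets Lifetime(1, 1); %a's end is extended to 1
// %c = node2(%b)   // %c gets Lifetime(2, 2); %b's end is extended to 2
// return (%c)      // %c's end is extended to num_nodes (3)
// @endcode
//
// If node1's schema allowed %b to alias %a, the reverse pass would extend
// %a's lifetime end to %b's, so the planner never reuses %a's storage while
// %b may still be live.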
bool ManagedTensorRanges::nodeFreesManagedTensors(Node* node) const {
auto it = node_to_newly_free_tensors_.find(node);
return it != node_to_newly_free_tensors_.end() && !it->second.empty();
}
const std::vector<const Value*>& ManagedTensorRanges::
availableTensorValuesAfterNode(Node* node) const {
return node_to_newly_free_tensors_.at(node);
}
bool ManagedTensorRanges::lifetimesOverlap(const Value* v1, const Value* v2)
const {
const auto* v1_lifetime = getLifetime(v1);
const auto* v2_lifetime = getLifetime(v2);
if (!v1_lifetime || !v2_lifetime) {
return false;
}
if (v1_lifetime->start < v2_lifetime->start) {
return v1_lifetime->end >= v2_lifetime->start;
}
return v2_lifetime->end >= v1_lifetime->start;
}
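// For example, Lifetime(0, 2) and Lifetime(2, 4) overlap (they share node
// index 2), while Lifetime(0, 1) and Lifetime(2, 4) do not; only in the
// latter case may the memory planner assign both values to the same storage.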
const ManagedTensorRanges::Lifetime* ManagedTensorRanges::getLifetime(
const Value* value) const {
auto it = value_lifetimes_.find(value);
if (it != value_lifetimes_.end()) {
return &it->second;
}
return nullptr;
}
ManagedTensorRanges::Lifetime* ManagedTensorRanges::getLifetime(
const Value* value) {
// const_cast is safe here, this is just a way to avoid code duplication
// between the const/non-const versions of getLifetime.
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
const auto* const_this = const_cast<const ManagedTensorRanges*>(this);
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
return const_cast<ManagedTensorRanges::Lifetime*>(
const_this->getLifetime(value));
}
std::vector<const Value*> ManagedTensorRanges::
collectValuesWithTrackedLifetimes(at::ArrayRef<const Value*> values) {
std::vector<const Value*> mutable_values;
mutable_values.reserve(values.size());
std::copy_if(
values.begin(),
values.end(),
std::back_inserter(mutable_values),
[this](const Value* value) { return getLifetime(value) != nullptr; });
return mutable_values;
}
StaticModule::StaticModule(
std::shared_ptr<torch::jit::Graph> g,
const StaticModuleOptions& opts,
std::vector<IValue> sample_inputs)
: StaticModule(
PrepareForStaticModule(g->copy(), opts, std::move(sample_inputs)),
opts) {}
StaticModule::StaticModule(
const torch::jit::Module& m,
bool is_frozen,
const StaticModuleOptions& opts,
std::vector<IValue> sample_inputs)
: StaticModule(
PrepareForStaticModule(m, is_frozen, opts, std::move(sample_inputs)),
opts) {}
StaticModule::StaticModule(
std::pair<std::shared_ptr<torch::jit::Graph>, std::optional<Module>>
graph_and_module,
const StaticModuleOptions& opts)
: opts_(opts),
graph_(std::move(graph_and_module.first)),
module_(std::move(graph_and_module.second)),
num_inputs_(graph_->inputs().size()) {
sr_metadata_ = c10::make_intrusive<jit::StaticRuntimeMetadata>(opts_);
// recursively attach metadata to prim::fork nodes
attachNodeMetadata(graph_->block());
// check opt flags
if (opts.manage_output_tensors) {
TORCH_CHECK(
opts_.enable_out_variant,
"When manage_output_tensors is true, enable_out_variant must be set to true");
}
if (opts_.optimize_memory) {
TORCH_CHECK(
opts_.enable_out_variant,
"When optimize_memory is true, enable_out_variant must be set to true");
}
// handle schema
if (module_.has_value()) {
Method method = module_->get_method("forward");
schema_ = method.function().getSchema();
const auto num_schema_args = schema_->arguments().size();
DCHECK(num_schema_args > 0);
if (removeSelfFromGraphInput(graph_)) {
module_ = c10::nullopt;
num_inputs_ = num_schema_args - 1;
}
}
{
size_t nodes_size = 0, constants_size = 0;
for (Node* node : graph_->nodes()) {
++(node->kind() == prim::Constant ? constants_size : nodes_size);
}
constants_.reserve(constants_size);
functions_.reserve(nodes_size);
}
// Create ProcessedFunction instances first to freeze their addresses to pass
// to ProcessedNode.
AliasDb alias_db(graph_, /*isFrozen=*/false);
GRAPH_DEBUG("AliasDb: ", alias_db.toString());
// Maps each Value* in the graph to its index in the values_ array that will
// eventually be created by StaticRuntime.
c10::FastMap<const Value*, uint32_t> value_to_index;
prepareFunctionsAndConstants(graph_->block(), alias_db, value_to_index);
const auto constants_index_offset = 0;
const auto values_index_offset = constants_index_offset + constants().size();
value_buffer_size_ = values_index_offset;
value_buffer_size_ +=
prepareBlockInfo(graph_->block(), values_index_offset, value_to_index);
prepareStaticNodeInfos(graph_->block(), value_to_index, alias_db);
for (auto& block_and_info : block_infos_) {
auto& block_info = block_and_info.second;
block_info.prepare_for_memory_planner(alias_db, opts);
}
}
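// The resulting values table layout (a sketch; sizes are illustrative):
//
// @code
// values_[0 .. C-1]    constants, C = constants().size()
// values_[C .. C+I-1]  block inputs, I = number of graph inputs
// values_[C+I .. ]     node outputs, in traversal order
// @endcode
//
// ProcessedNodes address this table with 16-bit offsets, which is why the
// preparation code checks every index against (1 << 16).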
size_t StaticModule::prepareBlockInfo(
Block* block,
const size_t start_idx,
c10::FastMap<const Value*, uint32_t>& value_to_index) {
block_infos_.emplace(block, BlockInfo(start_idx, *block));
const auto num_inputs = static_cast<uint32_t>(block->inputs().size());
for (const auto i : c10::irange(num_inputs)) {
value_to_index.emplace(block->inputs()[i], start_idx + i);
}
auto cur_idx = start_idx + num_inputs;
for (auto* node : block->nodes()) {
for (auto* sub_block : node->blocks()) {
cur_idx += prepareBlockInfo(sub_block, cur_idx, value_to_index);
}
if (node->kind() == prim::Constant) {
continue;
}
TORCH_CHECK(
cur_idx < (1 << 16),
"outputs offset in values table",
cur_idx,
" would overflow 2-byte index storage");
const auto num_outputs = static_cast<uint32_t>(node->outputs().size());
for (const auto i : c10::irange(num_outputs)) {
value_to_index.emplace(node->outputs()[i], cur_idx + i);
}
cur_idx += num_outputs;
}
std::vector<uint16_t> output_indices;
output_indices.reserve(block->outputs().size());
for (auto* output : block->outputs()) {
const auto output_idx = value_to_index.at(output);
TORCH_CHECK(
output_idx < (1 << 16),
"outputs offset in values table",
output_idx,
" would overflow 2-byte index storage");
output_indices.push_back(output_idx);
}
block_infos_.at(block).set_output_indices(std::move(output_indices));
return cur_idx - start_idx;
}
void StaticModule::attachNodeMetadata(Block* block) {
for (auto* node : block->nodes()) {
if (node->kind() == prim::fork) {
node->ival_(getStaticRuntimeMetadataSymbol(), IValue(sr_metadata_));
}
for (auto* sub_block : node->blocks()) {
attachNodeMetadata(sub_block);
}
}
}
void StaticModule::prepareFunctionsAndConstants(
Block* block,
const AliasDb& alias_db,
c10::FastMap<const Value*, uint32_t>& value_to_index) {
for (auto* node : block->nodes()) {
for (auto* sub_block : node->blocks()) {
prepareFunctionsAndConstants(sub_block, alias_db, value_to_index);
}
if (node->kind() == prim::Constant) {
auto* v = node->output();
TORCH_CHECK(
v->type()->kind() != FunctionType::Kind,
"got ",
typeKindToString(v->type()->kind()),
" instead of ",
typeKindToString(FunctionType::Kind));
value_to_index.emplace(v, constants_.size());
constants_.emplace_back(toIValue(v).value());
continue;
}
// see [Check and correct bad schema alias info at runtime]
bool check_outputs_for_overlap =
!alias_db.mayContainAlias(node->inputs(), node->outputs()) &&
containTensorsOnly(node->outputs());
// new ProcessedFunction
functions_.emplace_back(
node, opts_.enable_out_variant, check_outputs_for_overlap);
}
}
size_t StaticModule::prepareStaticNodeInfos(
Block* block,
const c10::FastMap<const Value*, uint32_t>& value_to_index,
const AliasDb& alias_db,
size_t node_idx) {
const auto node_start = node_idx;
auto& block_info = block_infos_.at(block);
std::vector<StaticNodeInfo> nodes;
c10::FastMap<Node*, bool> node_has_out_variant;
for (auto* node : block->nodes()) {
if (node->kind() == prim::Constant) {
continue;
}
for (auto* sub_block : node->blocks()) {
node_idx +=
prepareStaticNodeInfos(sub_block, value_to_index, alias_db, node_idx);
}
const auto num_inputs = static_cast<uint32_t>(node->inputs().size());
ProcessedNodeInputs input_indices(num_inputs);
for (const auto input_idx : c10::irange<uint32_t>(num_inputs)) {
auto* input = node->inputs()[input_idx];
auto input_ivalue_idx = value_to_index.at(input);
TORCH_CHECK(
input_ivalue_idx < (1 << 16),
"input index in values table ",
input_ivalue_idx,
" would overflow 2-byte index storage");
input_indices[input_idx] = input_ivalue_idx;
}
ProcessedFunction* fn = &functions_[node_idx];
// Create a new StaticNodeInfo for this node.
const auto node_output_idx = node->outputs().empty()
// The index is unused if there are no outputs, so just create a
// placeholder value.
? std::numeric_limits<uint16_t>::max()
: value_to_index.at(node->output(0));
nodes.emplace_back(node, fn, std::move(input_indices), node_output_idx);
node_has_out_variant.emplace(node, nodes.back().has_out_variant());
++node_idx;
}
block_info.set_nodes(std::move(nodes), node_has_out_variant);
block_info.init_value_group(alias_db);
return node_idx - node_start;
}
#ifdef FBCODE_CAFFE2
thread_local SROperatorObserver* tlsOpObserver = nullptr;
void SROperatorObserver::setCurrentThreadObserver(
SROperatorObserver* observer) {
tlsOpObserver = observer;
}
SROperatorObserver* SROperatorObserver::getCurrentThreadObserver() {
return tlsOpObserver;
}
void SROperatorObserver::onStart(const Node* node) {
if (tlsOpObserver != nullptr && tlsOpObserver->startCb != nullptr) {
tlsOpObserver->startCb(node);
}
}
void SROperatorObserver::onEnd(const Node* node) {
if (tlsOpObserver != nullptr && tlsOpObserver->endCb != nullptr) {
tlsOpObserver->endCb(node);
}
}
#endif // FBCODE_CAFFE2
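// A hedged usage sketch for SROperatorObserver (FBCODE-only; assumes
// `startCb`/`endCb` are assignable callbacks taking a `const Node*`, as the
// calls above imply):
//
// @code
// SROperatorObserver observer;
// observer.startCb = [](const Node* node) {
//   LOG(INFO) << "running " << node->kind().toQualString();
// };
// SROperatorObserver::setCurrentThreadObserver(&observer);
// // ... run the static runtime on this thread ...
// SROperatorObserver::setCurrentThreadObserver(nullptr);
// @endcode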
BlockInfo::BlockInfo(uint32_t input_idx, Block& block)
: input_idx_(input_idx), block_(block) {}
void BlockInfo::set_nodes(
std::vector<StaticNodeInfo> nodes,
const c10::FastMap<Node*, bool>& node_has_out_variant) {
nodes_ = std::move(nodes);
for (auto& node : nodes_) {
if (node.num_outputs() == 1 &&
isOptimizableContainerType(node.node(), node_has_out_variant)) {
node_is_optimizable_container_type_.emplace(node.node());
}
}
}
void BlockInfo::prepare_for_memory_planner(
const AliasDb& alias_db,
const StaticModuleOptions& opts) {
if (!opts.enable_out_variant) {
return;
}
// Never manage graph outputs so that we can do std::move(output_ivalue).
// This does not affect performance if the graph returns a collection object.
c10::FastSet<const Value*> graph_output_values(
block_.outputs().begin(), block_.outputs().end());
// collect register indices of outputs of ops with out variant
for (StaticNodeInfo& pnode : nodes_) {
if (!pnode.has_out_variant()) {
continue;
}
auto outputs = pnode.node()->outputs();
const auto num_outputs = static_cast<uint32_t>(outputs.size());
for (const auto i : c10::irange(num_outputs)) {
const Value* out_v = outputs[i];
// Types are stored in the underlying TorchScript IR
bool is_tensor_type = out_v->type()->castRaw<TensorType>();
if (opts.manage_output_tensors && is_tensor_type &&
graph_output_values.find(out_v) == graph_output_values.end() &&
value_group_.isOutputAlias(out_v)) {
managed_output_tensor_values_.insert(out_v);
continue;
}
if (value_group_.isAlwaysAlive(out_v)) {
continue;
}
if (is_tensor_type) {
managed_tensor_values_.insert(out_v);
} else if (node_is_optimizable_container_type(pnode.node())) {
// We "leak" certain container types because their allocations
// take a long time
leaked_values_.insert(out_v);
}
}
}
for (const Value* output : block_.outputs()) {
managed_tensor_values_.erase(output);
}
GRAPH_DEBUG("managed_tensor_values: ", dumpValueSet(managed_tensor_values_));
GRAPH_DEBUG(
"managed_output_tensor_values_: ",
dumpValueSet(managed_output_tensor_values_));
managed_tensor_ranges_ =
ManagedTensorRanges(block_, alias_db, managed_tensor_values_);
}
const StaticModuleOptions& StaticModule::opts() const {
return opts_;
}
size_t StaticModule::num_outputs() const {
return graph_->outputs().size();
}
size_t StaticModule::num_inputs() const {
return num_inputs_;
}
StaticRuntime& StaticModule::runtime() {
if (!cached_runtime_) {
cached_runtime_ = std::make_unique<StaticRuntime>(*this);
}
return *cached_runtime_;
}
Node* StaticModule::findNodeWithKindForTesting(const std::string& kind) const {
for (auto& block_and_info : block_infos_) {
auto& block_info = block_and_info.second;
for (auto& pnode : block_info.nodes()) {
if (pnode.node()->kind().toQualString() == kind) {
return pnode.node();
}
}
}
return nullptr;
}
c10::IValue StaticModule::operator()(
const std::vector<c10::IValue>& args,
const KeywordArgs& kwargs) {
return runtime()(args, kwargs);
}
c10::IValue StaticModule::operator()(
std::vector<c10::IValue>&& args,
const KeywordArgs& kwargs) {
return runtime()(std::move(args), kwargs);
}
BlockRunner::BlockRunner(
const StaticModule& sm,
IValue* values,
Block* block,
torch::jit::TaskLauncher* launcher,
bool is_root_block)
: static_module_(sm),
block_info_(static_module_.block_info(block)),
is_root_block_(is_root_block),
first_input_is_self_(
is_root_block_ && static_module_.first_input_is_self()),
inputs_begin_(block_info_.block_inputs_idx()),
// TODO(T108633124): Turn on manage output tensors for sub-blocks.
manage_output_tensors_enabled_(
is_root_block_ && sm.opts().manage_output_tensors),
values_(values) {
nodes_.reserve(block_info_.nodes().size());
for (auto& pre_pnode : block_info_.nodes()) {
nodes_.emplace_back(pre_pnode, values_);
}
for (auto index : block_info_.block_output_indices()) {
outputs_.emplace_back(&values_[index]);
}
for (auto& pnode : nodes_) {
auto* node = pnode.node();
// Attach the async TaskLauncher to the ProcessedNode.
pnode.set_metadata(launcher);
auto blocks = node->blocks();
const auto num_blocks = blocks.size();
if (num_blocks == 0) {
continue;
}
DCHECK(node->kind() == prim::If || node->kind() == prim::Loop);
std::vector<BlockRunner> block_runners;
block_runners.reserve(num_blocks);
for (auto* b : blocks) {
block_runners.emplace_back(sm, values_, b, launcher);
}
pnode.set_metadata(std::move(block_runners));
}
}
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
BlockRunner::BlockRunner(BlockRunner&&) noexcept = default;
BlockRunner::~BlockRunner() = default;
void BlockRunner::set_arg(const size_t idx, std::vector<IValue>&& args) {
DCHECK(idx < args.size());
Input(idx + first_input_is_self_) = std::move(args[idx]);
}
void BlockRunner::set_arg(const size_t idx, const std::vector<IValue>& args) {
DCHECK(idx < args.size());
Input(idx + first_input_is_self_) = args[idx];
}
void BlockRunner::set_arg(const size_t idx, const IValue& arg) {
Input(idx + first_input_is_self_) = arg;
}
namespace {
void check_type(const Argument& schema_arg, const IValue& arg) {
// Fast path for most common case
if (arg.isTensor() &&
schema_arg.type()->kind() == c10::TypeKind::TensorType) {
return;
}
TORCH_CHECK(
arg.type()->isSubtypeOf(schema_arg.type()),
arg.type()->annotation_str(),
" is not a subtype of ",
schema_arg.type()->annotation_str(),
"; schema arg name: '",
schema_arg.name(),
"', ivalue: ",
iValueToString(arg));
}
} // namespace
template <typename IValueList>
void BlockRunner::set_inputs(IValueList&& args, const KeywordArgs& kwargs) {
const auto& schema = static_module_.schema();
if (first_input_is_self_) {
Input(0) = static_module_.module()._ivalue();
}
if (!is_root_block_ || C10_UNLIKELY(!schema)) {
TORCH_CHECK(
kwargs.empty(),
"BlockRunner got kwargs; is_root_block: ",
std::to_string(is_root_block_),
"schema: ",
schema ? schema->name() : "(not available)");
const auto total_num_inputs = args.size() + first_input_is_self_;
TORCH_CHECK(
total_num_inputs == block_info_.num_inputs(),
"Block runner got ",
std::to_string(total_num_inputs),
" inputs; ",
" first_input_is_self: ",
std::to_string(first_input_is_self_),
"; SR block expects ",
std::to_string(block_info_.num_inputs()),
" inputs for schema ",
schema ? schema->name() : "(not available)");
for (const auto i_arg : c10::irange(args.size())) {
set_arg(i_arg, std::forward<IValueList>(args));
}
return;
}
const auto& schema_args = schema->arguments();
size_t consumed_kwargs = 0;
DCHECK(!schema_args.empty());
TORCH_CHECK(
args.size() < schema_args.size(),
"Static runtime got ",
std::to_string(args.size()),
" arguments, expects ",
std::to_string(schema_args.size() - 1),
" for schema ",
schema->name());
for (const auto i_arg : c10::irange(1, schema_args.size())) {
// Start at 1 since the schema always contains `self`.
const auto& schema_arg = schema_args[i_arg];
if (i_arg - 1 < args.size()) {
check_type(schema_arg, std::forward<IValueList>(args)[i_arg - 1]);
set_arg(i_arg - 1, std::forward<IValueList>(args));
continue;
}
auto it = kwargs.find(schema_arg.name());
if (it != kwargs.end()) {
check_type(schema_arg, it->second);
set_arg(i_arg - 1, it->second);
++consumed_kwargs;
continue;
}
auto maybe_default_val = schema_arg.default_value();
if (maybe_default_val) {
set_arg(i_arg - 1, *maybe_default_val);
continue;
}
TORCH_CHECK(
false,
"Static runtime is missing required kwarg ",
schema_arg.name(),
" i_arg: ",
std::to_string(i_arg),
" for schema ",
schema->name());
}
TORCH_CHECK(
consumed_kwargs == kwargs.size(),
"kwargs size mismatch (consumed ",
std::to_string(consumed_kwargs),
", expected ",
std::to_string(kwargs.size()),
" for schema ",
schema->name());
}
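// A sketch of the binding rules above. For a schema like
//
// @code
// forward(self, Tensor a, Tensor b, int c=1) -> Tensor
// @endcode
//
// a call with args {a_tensor} and kwargs {{"b", b_tensor}} binds `a`
// positionally, consumes `b` from kwargs, and falls back to the default
// value 1 for `c`. A leftover kwarg or a missing required argument trips
// the TORCH_CHECKs above.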
void BlockRunner::create_memory_planner() {
if (!planner_) {
planner_ = std::make_unique<StandardMemoryPlanner>(
this,
block_info_,
static_module_.opts().enable_out_variant,
manage_output_tensors_enabled_,
static_module_.opts().optimize_memory);
}
}
namespace {
void destroyNodeOutputs(ProcessedNode& p_node) {
const auto borrows_outputs = borrowsOutputs(p_node.node()->kind());
const auto num_outputs = static_cast<uint32_t>(p_node.num_outputs());
for (const auto i : c10::irange<uint32_t>(num_outputs)) {
auto& output = p_node.Output(i);
if (doesNotHeapAllocateWhenStoredInIValue(*output.type())) {
continue;
}
if (borrows_outputs) {
// NB: No need to incref here. This codepath is only hit if the run didn't
// finish, so we shouldn't be returning anything to the client.
c10::MaybeOwnedTraits<IValue>::destroyBorrow(output);
} else {
output = IValue();
}
}
}
} // namespace
void BlockRunner::clean_up_intermediate_ivalues() noexcept {
// We have to iterate in reverse order here due to borrowed
// IValues - we don't want to destroy a value until all of its
// borrows are cleaned up!
for (auto it = nodes_.rbegin(); it != nodes_.rend(); ++it) {
destroyNodeOutputs(*it);
}
}
void BlockRunner::resetMemory() noexcept {
planner_.reset();
// We must clean up intermediate values before inputs in case
// there are borrowed inputs and static runtime owns the only
// reference (e.g. the inputs were std::move'd into the runtime)
clean_up_intermediate_ivalues();
clean_up_input_ivalues();
}
c10::IValue BlockRunner::move_outputs_to_tuple(uint32_t num_outputs) {
switch (num_outputs) {
case 1:
return c10::ivalue::Tuple::create(IValue(std::move(*outputs_[0])));
case 2:
return c10::ivalue::Tuple::create(
IValue(std::move(*outputs_[0])), IValue(std::move(*outputs_[1])));
case 3:
return c10::ivalue::Tuple::create(
IValue(std::move(*outputs_[0])),
IValue(std::move(*outputs_[1])),
IValue(std::move(*outputs_[2])));
default: {
std::vector<c10::IValue> outputs;
outputs.reserve(num_outputs);
for (const auto i : c10::irange(num_outputs)) {
// Move the value out; otherwise we would have to clean up outputs_[i] explicitly.
outputs.emplace_back(std::move(*outputs_[i]));
}
return c10::ivalue::Tuple::create(std::move(outputs));
}
}
}
/// [Check and correct bad schema alias info at runtime]
/// Static runtime relies on the operator schema's alias info to be correct for
/// memory planning. Because it's hard to enforce the alias info to be correct,
/// we need to do runtime detection for accidental aliases that do not comply
/// with the schema. Only aliases of managed tensors are problematic. To avoid
/// runtime crashes, we can add runtime detection and force the op to comply
/// with its schema by cloning the alias. Because all managed tensors' data_ptrs
/// are part of the internal buffer that the MemoryPlanner allocates, we can
/// check aliases by checking the memory overlap with this internal buffer. But
/// a tensor's storage can be resized during inference, so we need another way to
/// handle the resized case.
///
/// There are two ways for incorrect schema to break memory planning. Let's look
/// at two examples:
///
/// Example 1:
/// @code
/// def forward(x):
/// a = x + x
/// b = bad_op(a) # b ends up aliasing a incorrectly
/// return (b)
/// @endcode
/// bad_op: its schema says it returns a new Tensor, but it actually returns an
/// alias. In this case, the memory planner would recognize `a` as a managed
/// tensor and clean up its memory before returning `b`. But `b` is actually an
/// alias of `a`, when `a`'s data_ptr get reset, `b`'s data_ptr gets reset too.
///
/// Example 2:
/// @code
/// def forward(x):
/// a = x + x
/// a2 = bad_op(a) # a2 ends up aliasing a incorrectly
/// b = a + a
/// c = b * b # c shares storage with a
/// d = c + 2 # d shares storage with b
/// e = a2 * a2
/// return (d, e)
/// @endcode
/// With the memory reuse algorithm, `c` could end up sharing storage with `a`,
/// but because of bad_op, `a2` now aliases `a`. `c` overwrites `a` and
/// therefore `a2`, leading to the wrong results. We solve this problem with two
/// steps. Note this doesn't happen with the current memory reuse algorithm
/// because of the way it's implemented. Things could change with a different
/// implementation.
///
/// Step 1, annotate the ProcessedNodes with a flag `check_memory_overlap_` set
/// to true if its outputs do not alias its inputs as indicated by the AliasDb
/// and all of its outputs are Tensors. Then at runtime, we check that the
/// nodes' output tensors do not overlap with the internal buffer that the
/// MemoryPlanner allocates. For latency concerns, we only run this check for
/// fallback ops. The schemas of native ops and out variants are vetted and
/// enforced with static runtime unit tests. For the first iteration, we do a
/// full memory overlap check with
/// ProcessedNode::verify_and_correct_memory_overlap() because the internal
/// buffer doesn't exist yet.
///
/// Step 2, if a managed tensor gets resized during inference, it gets a new
/// data_ptr which is not from the buffer. We can tackle this corner case by
/// delaying the deallocation of the managed tensors to after the outputs are no
/// longer used (essentially merging the internal/output buffers into one).
/// Before the merging is implemented, we add another flag `overlap_detected_`
/// to flag any node with overlap detected in Step 1 and do a full memory
/// overlap check if the fast check (by checking memory overlap with internal
/// buffer) fails. There is still a corner case that fails with the added flag.
/// If a resize is triggered at the same time as the op creates an alias, the
/// current checks would fail to detect the alias.
void BlockRunner::verify_and_correct_memory_overlap(ProcessedNode& n) {
// The slow check can be removed once the internal/output buffers are merged
if (C10_UNLIKELY(n.check_outputs_for_memory_overlap())) {
if (C10_UNLIKELY(!planner_)) {
// slow check, for first iter only
n.verify_and_correct_memory_overlap();
} else {
bool overlap_detected_with_fast_check = false;
const auto n_outputs = static_cast<uint32_t>(n.outputs().size());
for (auto i : c10::irange(n_outputs)) {
auto& output = n.Output(i);
if (output.isTensor()) {
overlap_detected_with_fast_check |=
fast_check_and_correct_overlap_with(n, output);
} else if (output.isTensorList()) {
auto tensor_list = output.toListRef();
for (auto& ival : tensor_list) {
overlap_detected_with_fast_check |=
fast_check_and_correct_overlap_with(
n,
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
const_cast<c10::IValue&>(ival));
}
}
}
if (n.outputs_memory_overlap_detected() &&
!overlap_detected_with_fast_check) {
// slow check. Only run when the fast check fails.
n.verify_and_correct_memory_overlap();
}
}
}
}
bool BlockRunner::fast_check_and_correct_overlap_with(
ProcessedNode& n,
c10::IValue& tensor_ival) {
auto& tensor = tensor_ival.toTensor();
if (planner_->overlapWithInternalBuffer(tensor.data_ptr())) {
DLOG(INFO) << "Detected alias for node: " << PrintNode(n.node());
tensor_ival = at::native::clone(tensor, c10::nullopt);
n.set_outputs_memory_overlap_detected();
return true;
}
return false;
}
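// A sketch of the scenario the fast check above guards against (Example 1
// from the note; `bad_op` is hypothetical):
//
// @code
// %a = aten::add(%x, %x) // managed; data_ptr points into the arena
// %b = my_ns::bad_op(%a) // schema claims a fresh tensor, actually an alias
// @endcode
//
// After bad_op runs, %b's data_ptr still points into the planner's internal
// buffer, so overlapWithInternalBuffer() fires and %b is replaced with a
// clone that owns its own storage.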
BlockRunner::Deallocator::~Deallocator() {
// Assume cleanup cannot throw.
cleanupImpl();
#ifndef NDEBUG
block_runner_.check_for_memory_leak(/*output_returned*/ false);
#endif
}
void BlockRunner::Deallocator::cleanupImpl() {
// MemoryPlanner is created after the first invocation of `run()`. This
// is done intentionally because MemoryPlanner uses `Tensor` sizes of
// the previous `run()` for memory planning of subsequent runs
if (C10_LIKELY(finished_)) {
block_runner_.create_memory_planner();
}
if (C10_LIKELY(block_runner_.planner_)) {
block_runner_.planner_->deallocate();
} else {
// This is the first run, and it didn't finish, so we can't use a
// `MemoryPlanner` to deallocate stuff. Just reset everything manually.
block_runner_.resetMemory();
}
// clean up owning refs of input tensors
block_runner_.clean_up_input_ivalues();
if (C10_UNLIKELY(!finished_)) {
block_runner_.deallocateOutputTensors();
}
}
template <typename IValueList>
c10::IValue BlockRunner::run_impl(
IValueList&& args,
const KeywordArgs& kwargs) {
// We assume inference workloads, so we do not need
// autograd. Enabling this is a significant win on dispatcher
// overhead because it saves a round of dispatch for at least some
// functions, such as resize_ and resize_as_.
c10::InferenceMode mode;
{
auto on_exit = Deallocator(*this);
if (planner_) {
DCHECK(!manage_output_tensors_enabled_ || checkOutputTensorMemoryLeaks());
planner_->allocate();
}
set_inputs(std::forward<IValueList>(args), kwargs);
for (auto& n : nodes_) {
// LOG(INFO) << "Running node: " << PrintNode(n.node());
n.run();
// Check for incorrect schema alias info.
verify_and_correct_memory_overlap(n);
}
on_exit.setFinished();
}
// No need to keep references to the outputs in static runtime anymore.
if (block_info_.num_outputs() > 1) {
return move_outputs_to_tuple(block_info_.num_outputs());
}
DCHECK(check_for_memory_leak(/*output_returned*/ false));
// Move the value out; otherwise we would have to clean up outputs_[0] explicitly.
return std::move(*outputs_[0]);
}
template <typename IValueList>
c10::IValue BlockRunner::run_impl_record_functions(
IValueList&& args,
const KeywordArgs& kwargs) {
auto step_callbacks =
at::getStepCallbacksUnlessEmpty(at::RecordScope::STATIC_RUNTIME_MODEL);
if (C10_UNLIKELY(step_callbacks.has_value())) {
at::RecordFunction guard(std::move(*step_callbacks));
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(guard.isActive());
guard.needsInputs()
? guard.before(
"forward", c10::ArrayRef<const IValue>(args.data(), args.size()))
: guard.before("forward");
return run_impl(std::forward<IValueList>(args), kwargs);
}
return run_impl(std::forward<IValueList>(args), kwargs);
}
template <typename IValueList>
c10::intrusive_ptr<c10::ivalue::Future> BlockRunner::run_impl_async(
IValueList&& args,
const KeywordArgs& kwargs) {
// Run the graph inline in the caller thread. Async ops are executed on the
// TaskLauncher attached to the ProcessedNode metadata.
c10::IValue output = run_impl(std::forward<IValueList>(args), kwargs);
// If the output is already a Future, return it directly.
if (output.isFuture()) {
return output.toFuture();
}
// Otherwise, wrap the output in a Future, mark it completed, and return it.
TypePtr return_type;
if (block_info_.num_outputs() > 1) {
return_type = TupleType::create(
fmap(outputs(), [](const IValue* v) { return v->type(); }));
} else {
return_type = outputs().at(0)->type();
}
c10::intrusive_ptr<Future> future = c10::make_intrusive<Future>(return_type);
future->markCompleted(output);
return future;
}
template <typename IValueList>
c10::intrusive_ptr<c10::ivalue::Future> BlockRunner::
run_impl_record_functions_async(
IValueList&& args,
const KeywordArgs& kwargs) {
auto step_callbacks =
at::getStepCallbacksUnlessEmpty(at::RecordScope::STATIC_RUNTIME_MODEL);
if (C10_UNLIKELY(step_callbacks.has_value())) {
at::RecordFunction guard(std::move(*step_callbacks));
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(guard.isActive());
guard.needsInputs()
? guard.before(
"forward", c10::ArrayRef<const IValue>(args.data(), args.size()))
: guard.before("forward");
return run_impl_async(std::forward<IValueList>(args), kwargs);
}
return run_impl_async(std::forward<IValueList>(args), kwargs);
}
c10::IValue BlockRunner::operator()(
const std::vector<c10::IValue>& args,
const KeywordArgs& kwargs) {
#ifdef PYTORCH_DISABLE_NET_PROFILING
return run_impl(args, kwargs);
#else
return run_impl_record_functions(args, kwargs);
#endif
}
c10::IValue BlockRunner::operator()(
std::vector<c10::IValue>&& args,
const KeywordArgs& kwargs) {
#ifdef PYTORCH_DISABLE_NET_PROFILING
return run_impl(std::move(args), kwargs);
#else
return run_impl_record_functions(std::move(args), kwargs);
#endif
}
c10::intrusive_ptr<c10::ivalue::Future> BlockRunner::runAsync(
const std::vector<c10::IValue>& args,
const KeywordArgs& kwargs) {
#ifdef PYTORCH_DISABLE_NET_PROFILING
return run_impl_async(args, kwargs);
#else
return run_impl_record_functions_async(args, kwargs);
#endif
}
c10::intrusive_ptr<c10::ivalue::Future> BlockRunner::runAsync(
std::vector<c10::IValue>&& args,
const KeywordArgs& kwargs) {
#ifdef PYTORCH_DISABLE_NET_PROFILING
return run_impl_async(std::move(args), kwargs);
#else
return run_impl_record_functions_async(std::move(args), kwargs);
#endif
}
namespace {
std::string generate_latency_json(const std::string& label, double millis) {
#ifdef FBCODE_CAFFE2
folly::dynamic json = folly::dynamic::object();
json["type"] = label;
json["metric"] = "latency";
json["unit"] = "ms";
json["value"] = millis;
return "PyTorchObserver " + folly::toJson(json);
#else
(void)label;
(void)millis;
return "";
#endif
}
} // namespace
void BlockRunner::benchmark(
const std::vector<std::vector<c10::IValue>>& args_list,
const std::vector<KeywordArgs>& kwargs_list,
const uint32_t warmup_runs,
const uint32_t main_runs,
bool print_per_node_time,
bool generate_ai_pep_output) {
TORCH_CHECK(kwargs_list.empty() || args_list.size() == kwargs_list.size());
std::cout << "Input size: " << args_list.size() << std::endl;
float time_per_iter =
benchmark_model(args_list, kwargs_list, warmup_runs, main_runs);
std::cout << "Static runtime ms per iter: " << time_per_iter
<< ". Iters per second: " << 1000.0 / time_per_iter << std::endl;
IndividualMetrics results =
benchmark_individual_ops(args_list, kwargs_list, warmup_runs, main_runs);
if (print_per_node_time) {
const auto num_nodes = static_cast<uint32_t>(nodes_.size());
for (const auto i : c10::irange(num_nodes)) {
const Node* node = nodes_[i].node();
std::cout << "Node #" << i << ": " << results.time_per_node[i]
<< " ms/iter, ";
node->print(std::cout, 0, nullptr, false);
}
}
std::vector<std::pair<std::string, double>> time_per_node_type_vec{
results.time_per_node_type.begin(), results.time_per_node_type.end()};
if (args_list.empty()) {
std::sort(
time_per_node_type_vec.begin(),
time_per_node_type_vec.end(),
[&results](auto& left, auto& right) {
return results.instances_per_node_type[left.first] >
results.instances_per_node_type[right.first];
});
} else {
std::sort(
time_per_node_type_vec.begin(),
time_per_node_type_vec.end(),
[](auto& left, auto& right) { return left.second > right.second; });
}
std::cout << "Time per node type:" << std::endl;
for (const auto& p : time_per_node_type_vec) {
const std::string& kind = p.first;
const double ms = p.second;
std::cout << std::setw(15) << ms << " ms. " << std::setw(10)
<< results.percent_per_node_type[kind] << "%. " << kind << " ("
<< results.instances_per_node_type[kind] << " nodes";
if (results.out_nodes.count(kind)) {
std::cout << ", out variant)" << std::endl;
} else if (results.native_nodes.count(kind)) {
std::cout << ", native)" << std::endl;
} else {
std::cout << ")" << std::endl;
}
if (generate_ai_pep_output) {
LOG(INFO) << generate_latency_json(kind, ms);
}
}
if (generate_ai_pep_output) {
LOG(INFO) << generate_latency_json(
"static_runtime_first_iter", results.first_iter_time);
}
std::cout << std::setw(15) << results.total_time << " ms. in Total"
<< std::endl;
std::cout << "BlockRunner setup time: " << results.setup_time << " ms"
<< std::endl;
std::cout << "Memory allocation time: " << results.memory_alloc_time
<< " ms\n";
std::cout << "Memory deallocation time: " << results.memory_dealloc_time
<< " ms" << std::endl;
std::cout << "Outputs deallocation time: " << results.output_dealloc_time
<< " ms" << std::endl;
std::cout << "First iter time: " << results.first_iter_time << " ms"
<< std::endl;
std::cout << "Number of operators: " << nodes_.size() << std::endl;
if (planner_) {
std::cout << "Total number of managed tensors: "
<< planner_->total_num_managed_tensors() << std::endl;
std::cout << "Total number of managed output tensors: "
<< planner_->total_num_managed_output_tensors() << std::endl;
std::cout << "Total number of unmanaged values: "
<< planner_->total_num_unmanaged() << std::endl;
std::cout << "Number of unmanaged values requiring cleanup: "
<< planner_->num_unmanaged_non_scalars() << std::endl;
std::cout << "Number of unmanaged values not requiring cleanup: "
<< planner_->num_unmanaged_scalars() << std::endl;
std::cout << "Total memory managed: " << planner_->total_managed()
<< " bytes" << std::endl;
if (static_module_.opts().optimize_memory) {
std::cout << "Total number of reused tensors: "
<< planner_->total_reused_tensors() << std::endl;
}
}
auto unsupported_nodes_count = results.total_nodes_count -
results.out_nodes_count - results.native_nodes.size();
std::cout << "Total number of 'out' variant nodes/total number of nodes: "
<< results.out_nodes_count << "/" << results.total_nodes_count
<< " ("
<< 100.0 * static_cast<float>(results.out_nodes_count) /
static_cast<float>(results.total_nodes_count)
<< "%)" << std::endl;
std::cout << "Total number of nodes not covered by SR/total number of nodes: "
<< unsupported_nodes_count << "/" << results.total_nodes_count
<< " ("
<< 100.0 * static_cast<float>(unsupported_nodes_count) /
static_cast<float>(results.total_nodes_count)
<< "%)" << std::endl;
check_for_memory_leak();
#ifndef NDEBUG
if (!args_list.empty()) {
KeywordArgs empty_kwargs;
display_nodes(
args_list[0], !kwargs_list.empty() ? kwargs_list[0] : empty_kwargs);
}
#endif
}
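// A hedged usage sketch (assumes a BlockRunner obtained from a
// StaticModule's runtime and a model taking one 8x8 tensor):
//
// @code
// std::vector<std::vector<c10::IValue>> args_list = {{torch::randn({8, 8})}};
// runner.benchmark(
//     args_list,
//     /*kwargs_list=*/{},
//     /*warmup_runs=*/10,
//     /*main_runs=*/100,
//     /*print_per_node_time=*/true,
//     /*generate_ai_pep_output=*/false);
// @endcode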
float BlockRunner::benchmark_model(
const std::vector<std::vector<c10::IValue>>& args_list,
const std::vector<KeywordArgs>& kwargs_list,
const unsigned int warmup_runs,
const unsigned int main_runs) {
TORCH_CHECK(main_runs >= 1);
TORCH_CHECK(kwargs_list.empty() || args_list.size() == kwargs_list.size());
const bool is_kwargs_empty = kwargs_list.empty();
const KeywordArgs empty_kwargs;
for (const auto _n_run : c10::irange(warmup_runs)) {
(void)_n_run; // Suppress unused variable warning
const auto num_args = static_cast<uint32_t>(args_list.size());
for (const auto j : c10::irange(num_args)) {
operator()(args_list[j], is_kwargs_empty ? empty_kwargs : kwargs_list[j]);
if (manage_output_tensors_enabled_) {
deallocateOutputTensors();
}
}
}
caffe2::Timer timer;
for (const auto _n_run : c10::irange(main_runs)) {
(void)_n_run; // Suppress unused variable warning
const auto num_args = static_cast<uint32_t>(args_list.size());
for (const auto j : c10::irange(num_args)) {
operator()(args_list[j], is_kwargs_empty ? empty_kwargs : kwargs_list[j]);
if (manage_output_tensors_enabled_) {
deallocateOutputTensors();
}
}
}
float millis = timer.MilliSeconds();
return millis /
(static_cast<float>(main_runs) * static_cast<float>(args_list.size()));
}
static bool display_ivalue(const IValue& iv) {
if (iv.isTensor()) {
std::cout << "Tensor " << iv.toTensor().toString() << " {";
const auto dims = iv.toTensor().sizes();
const auto n_dims = static_cast<uint32_t>(dims.size());
for (const auto i : c10::irange(n_dims)) {
std::cout << iv.toTensor().sizes()[i];
if (n_dims > i + 1) {
std::cout << ", ";
}
}
std::cout << "}\n";
return true;
} else if (iv.isTensorList()) {
std::cout << "TensorList {" << iv.toTensorList().size() << "}\n";
return true;
} else if (iv.isGenericDict()) {
std::cout << "Dict {" << iv.toGenericDict().size() << "}\n";
return true;
} else if (iv.isTuple()) {
std::cout << "Tuple {" << iv.toTupleRef().elements().size() << "}\n";
return true;
} else if (iv.isInt()) {
std::cout << "int {" << iv.toInt() << "}\n";
return true;
} else if (iv.isBool()) {
std::cout << "bool {" << iv.toBool() << "}\n";
return true;
} else if (iv.isDouble()) {
std::cout << "double {" << iv.toDouble() << "}\n";
return true;
}
return false;
}
static void display_pnode_info(const ProcessedNode& pnode) {
pnode.node()->print(std::cout, 0, nullptr, false);
const auto num_inputs = static_cast<uint32_t>(pnode.num_inputs());
for (const auto i : c10::irange(num_inputs)) {
std::cout << "\ti" << i << ": ";
if (!display_ivalue(pnode.Input(i))) {
std::cout << *(pnode.node()->inputs()[i]->type()) << '\n';
}
}
const auto outputs = pnode.outputs();
const auto num_outputs = static_cast<uint32_t>(outputs.size());
for (const auto i : c10::irange(num_outputs)) {
std::cout << "\to" << i << ": ";
if (!display_ivalue(outputs[i])) {
std::cout << *(pnode.node()->outputs()[i]->type()) << '\n';
}
}
}
void BlockRunner::display_nodes(
const std::vector<c10::IValue>& args,
const KeywordArgs& kwargs) {
c10::InferenceMode mode;
auto on_exit = Deallocator(*this);
if (planner_) {
planner_->allocate();
}
set_inputs(args, kwargs);
for (auto& node : nodes_) {
node.run();
display_pnode_info(node);
}
on_exit.setFinished();
}
BlockRunner::IndividualMetrics BlockRunner::benchmark_individual_ops(
const std::vector<std::vector<c10::IValue>>& args_list,
const std::vector<KeywordArgs>& kwargs_list,
const uint32_t warmup_runs,
const uint32_t main_runs) {
TORCH_CHECK(kwargs_list.empty() || args_list.size() == kwargs_list.size());
TORCH_CHECK(warmup_runs >= 1 && main_runs >= 1);
IndividualMetrics results;
results.time_per_node.resize(nodes_.size(), 0);
if (args_list.empty()) {
// When the given input is empty, compute the op statistics from the given
// graph without executing it.
const auto num_nodes = static_cast<uint32_t>(nodes_.size());
for (const auto i : c10::irange(num_nodes)) {
const Node* node = nodes_[i].node();
std::string kind(node->kind().toQualString());
// TODO: Collect op statistics from sub-blocks here.
results.time_per_node[i] = 0;
results.time_per_node_type[kind] = 0;
results.instances_per_node_type[kind]++;
if (nodes_[i].has_out_variant()) {
results.out_nodes.insert(kind);
results.out_nodes_count++;
} else if (nodes_[i].has_native()) {
results.native_nodes.insert(kind);
}
results.total_time += results.time_per_node[i];
}
results.total_nodes_count = nodes_.size();
results.memory_alloc_time = 0;
results.memory_dealloc_time = 0;
results.output_dealloc_time = 0;
for (const auto& p : results.time_per_node_type) {
const std::string& kind = p.first;
results.percent_per_node_type[kind] = 0;
}
return results;
}
const bool is_kwargs_empty = kwargs_list.empty();
const KeywordArgs empty_kwargs;
bool manage_output_tensors = static_module_.opts().manage_output_tensors;
// See the comment on the use of InferenceMode above for an explanation.
c10::InferenceMode mode;
// setup time
caffe2::Timer timer;
set_inputs(args_list[0], is_kwargs_empty ? empty_kwargs : kwargs_list[0]);
results.setup_time = timer.MilliSeconds();
// The first iteration profiles each node's output Tensors' sizes and
// initializes the memory planner with the profile information. Following
// iterations just use the already established memory planning.
timer.Start();
operator()(args_list[0], is_kwargs_empty ? empty_kwargs : kwargs_list[0]);
if (manage_output_tensors) {
deallocateOutputTensors();
}
results.first_iter_time = timer.MilliSeconds();
// warmup runs
for (const auto _n_run : c10::irange(warmup_runs)) {
(void)_n_run; // Suppress unused variable warning
const auto num_args = static_cast<uint32_t>(args_list.size());
for (const auto j : c10::irange(num_args)) {
operator()(args_list[j], is_kwargs_empty ? empty_kwargs : kwargs_list[j]);
if (manage_output_tensors) {
deallocateOutputTensors();
}
}
}
// main runs
for (const auto i : c10::irange(main_runs)) {
(void)i; // Suppress unused variable warning
const auto num_args = static_cast<uint32_t>(args_list.size());
for (const auto j : c10::irange(num_args)) {
set_inputs(args_list[j], is_kwargs_empty ? empty_kwargs : kwargs_list[j]);
timer.Start();
if (planner_) {
planner_->allocate();
}
float millis = timer.MilliSeconds();
results.memory_alloc_time += millis;
const auto num_nodes = static_cast<uint32_t>(nodes_.size());
for (const auto k : c10::irange<uint32_t>(num_nodes)) {
timer.Start();
nodes_[k].run();
millis = timer.MilliSeconds();
results.time_per_node[k] += millis;
verify_and_correct_memory_overlap(nodes_[k]);
}
timer.Start();
create_memory_planner();
planner_->deallocate();
// clean up owning refs of input tensors
clean_up_input_ivalues();
if (manage_output_tensors) {
deallocateOutputTensors();
}
millis = timer.MilliSeconds();
results.memory_dealloc_time += millis;
timer.Start();
// No need to keep references to the outputs in static runtime anymore.
c10::IValue output;
if (static_module_.num_outputs() > 1) {
output = move_outputs_to_tuple(static_module_.num_outputs());
}
DCHECK(check_for_memory_leak(/*output_returned*/ false));
// Move the value out; otherwise we would have to clean up outputs_[0] explicitly.
output = std::move(*outputs_[0]);
// release outputs explicitly to measure the time it takes
output = IValue();
millis = timer.MilliSeconds();
results.output_dealloc_time += millis;
}
}
// post processing
const float num_total_iters =
(static_cast<float>(main_runs) * static_cast<float>(args_list.size()));
const auto num_nodes = static_cast<uint32_t>(nodes_.size());
for (const auto i : c10::irange(num_nodes)) {
const Node* node = nodes_[i].node();
std::string kind = std::string(node->kind().toQualString());
results.time_per_node[i] /= num_total_iters;
results.time_per_node_type[kind] += results.time_per_node[i];
results.instances_per_node_type[kind]++;
if (nodes_[i].has_out_variant()) {
results.out_nodes.insert(kind);
results.out_nodes_count++;
} else if (nodes_[i].has_native()) {
results.native_nodes.insert(kind);
}
results.total_time += results.time_per_node[i];
}
results.total_nodes_count = nodes_.size();
results.memory_alloc_time /= num_total_iters;
results.memory_dealloc_time /= num_total_iters;
results.output_dealloc_time /= num_total_iters;
for (const auto& p : results.time_per_node_type) {
const std::string& kind = p.first;
results.percent_per_node_type[kind] = p.second / results.total_time * 100;
}
return results;
}
bool BlockRunner::check_for_memory_leak(
bool output_returned,
bool recurse_on_sub_blocks) {
// check for inputs
const auto num_inputs = static_cast<uint32_t>(block_info_.num_inputs());
for (const auto i : c10::irange(num_inputs)) {
TORCH_CHECK(
values_[i + block_info_.block_inputs_idx()].isNone(),
"Input ",
i,
" was not cleaned up");
}
c10::FastSet<const IValue*> output_ivalues(outputs_.begin(), outputs_.end());
const auto num_nodes = static_cast<uint32_t>(nodes_.size());
for (const auto n : c10::irange(num_nodes)) {
auto& pnode = nodes_[n];
const auto num_outputs = static_cast<uint32_t>(pnode.num_outputs());
for (const auto i : c10::irange(num_outputs)) {
const IValue* ival = &pnode.Output(i);
const Value* val = pnode.node()->output(i);
// subtlety: isManagedOutputTensorValue may give a false
// negative here if an output is an alias of this value, so
// check the actual tensor!
if (planner_ &&
(isManagedOutputTensor(*ival) || isManagedOutputTensorValue(val))) {
// `ival` contains a managed output tensor that the runtime doesn't
// reclaim at the end of an iteration, but the client does so
// by explicitly calling
// `BlockRunner::deallocateOutputTensors`.
continue;
}
      const std::string error_msg = "Output " + std::to_string(i) + ", %" +
          val->debugName() + " of node " + std::to_string(n) +
          " which has kind " + pnode.node()->kind().toQualString() +
          " was not cleaned up";
if (output_ivalues.count(ival) == 0) {
// check for intermediates
if (!ival->isNone()) {
TORCH_CHECK(
ival->isTensor() ||
block_info_.node_is_optimizable_container_type(
pnode.node()) ||
doesNotHeapAllocateWhenStoredInIValue(*val->type()),
error_msg);
if (ival->isTensor()) {
const auto& t = ival->toTensor();
if (t.defined()) {
auto* storage_impl = t.storage().unsafeGetStorageImpl();
TORCH_CHECK(
storage_impl->data() == nullptr ||
(planner_ &&
planner_->isManagedStorageImpl(storage_impl)),
error_msg);
}
}
}
} else {
// check for outputs
if (output_returned) {
TORCH_CHECK(ival->isNone(), error_msg);
}
}
}
auto* metadata = pnode.metadata();
if (recurse_on_sub_blocks && metadata) {
auto& block_runners = metadata->block_runners();
for (auto& block_runner : block_runners) {
block_runner.check_for_memory_leak(
output_returned, recurse_on_sub_blocks);
}
}
}
VLOG(1) << "Finished checking for memory leak";
return true;
}
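// When manage_output_tensors is enabled, output tensors live in a buffer owned
// by the memory planner and survive past the end of an iteration; the client
// must call this before the next run to release them.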
void BlockRunner::deallocateOutputTensors() {
if (!static_module_.opts().manage_output_tensors) {
TORCH_CHECK(
!planner_ || planner_->numOutputBufferBytes() == 0,
"manage_output_tensors is disabled, but output tensor buffer is not empty.");
return;
}
if (planner_) {
planner_->deallocateOutputTensors();
DCHECK(checkOutputTensorMemoryLeaks());
}
}
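// Debug companion to deallocateOutputTensors: checks that every managed output
// tensor's storage has actually been freed. Returns true on success so it can
// be wrapped in a DCHECK.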
bool BlockRunner::checkOutputTensorMemoryLeaks() {
if (!static_module_.opts().manage_output_tensors || !planner_) {
return true;
}
const auto num_nodes = static_cast<uint32_t>(nodes_.size());
for (const auto n : c10::irange(num_nodes)) {
auto& pnode = nodes_[n];
const auto num_outputs = static_cast<uint32_t>(pnode.num_outputs());
for (const auto i : c10::irange(num_outputs)) {
const IValue* ival = &pnode.Output(i);
const Value* val = pnode.node()->output(i);
if (!isManagedOutputTensorValue(val) || !ival->isTensor()) {
        // ival cannot be a tensor if it's being managed by ops like
        // to_maybe_copy_out; see ReplaceWithMaybeCopy for details.
continue;
}
const auto& t = ival->toTensor();
if (t.defined()) {
auto* storage_impl = t.storage().unsafeGetStorageImpl();
        const std::string error_msg = "Output " + std::to_string(i) + ", %" +
            val->debugName() + " of node " + std::to_string(n) +
            " was not cleaned up";
TORCH_CHECK(storage_impl->data() == nullptr, error_msg);
}
}
}
VLOG(1) << "Finished checking for memory leak from output tensors";
return true;
}
bool BlockRunner::isManagedOutputTensor(const IValue& ivalue) const {
return planner_ && planner_->isManagedOutputTensor(ivalue);
}
bool BlockRunner::isManagedOutputTensorValue(const Value* value) const {
  // It's possible that manage_output_tensors was disabled after
  // managed_output_tensor_values was initialized, so we have to check
  // the flag here.
if (!planner_ || !manage_output_tensors_enabled_) {
return false;
}
const auto& managed_outputs = block_info_.managed_output_tensor_values();
return managed_outputs.find(value) != managed_outputs.end();
}
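// Turns off output tensor management for subsequent runs. All node outputs are
// reset and the planner is destroyed so that it gets rebuilt, without output
// management, on the next run.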
void BlockRunner::disableManageOutputTensors() {
if (!manage_output_tensors_enabled_) {
return;
}
manage_output_tensors_enabled_ = false;
if (!planner_) {
return;
}
// Reset all IValues and destruct planner_ so that it can be reconstructed in
// the next run.
for (auto& n : nodes_) {
const auto num_outputs = static_cast<uint32_t>(n.outputs().size());
for (const auto i : c10::irange(num_outputs)) {
n.Output(i) = IValue();
}
}
planner_.reset();
}
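// Selects the implementation for a node when the runtime is built, in order of
// preference: (1) an out variant, which can write into planner-managed memory;
// (2) a native implementation, which avoids the JIT interpreter's boxed
// calling convention; (3) a fallback that invokes the regular JIT operation
// through a temporary stack.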
ProcessedFunction::ProcessedFunction(
Node* node,
bool enable_out_variant,
bool check_memory_overlap)
: check_memory_overlap_(check_memory_overlap),
num_outputs_(node->outputs().size()) {
if (enable_out_variant) {
f_ = getOutOfPlaceOperation(node);
if (f_) {
kind_ = ProcessedFunction::Kind::kOutVariant;
// do not check memory overlap for out variants
check_memory_overlap_ = false;
VLOG(1) << "Switch to out variant for node: " << PrintNode(node);
return;
}
}
{
f_ = getNativeOperation(node);
if (f_) {
kind_ = ProcessedFunction::Kind::kNativeFunction;
#ifdef NDEBUG
// skip this check in opt mode because these ops are better vetted
check_memory_overlap_ = false;
#endif
VLOG(1) << "Switch to native impl for node: " << PrintNode(node);
return;
}
}
{
const Operator& op = node->getOperator();
f_ = [node_op = op.getOperation(node),
has_var_args = hasVarArgs(node)](ProcessedNode* pnode) mutable {
std::vector<IValue> stack;
const auto size = static_cast<uint32_t>(pnode->num_inputs());
stack.reserve(size + has_var_args);
for (const auto i : c10::irange(size)) {
stack.emplace_back(pnode->Input(i));
}
// Need to store the number of inputs in stack for variadic ops.
if (has_var_args) {
stack.emplace_back(static_cast<int>(size));
}
node_op(stack);
const auto num_outputs = static_cast<uint32_t>(pnode->num_outputs());
TORCH_DCHECK_EQ(stack.size(), num_outputs);
for (const auto i : c10::irange(num_outputs)) {
pnode->Output(i) = std::move(stack[i]);
}
};
kind_ = ProcessedFunction::Kind::kInterpreterFallback;
VLOG(1) << "Fallback interpreter for node: " << PrintNode(node);
}
}
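// StaticNodeInfo bundles everything a ProcessedNode needs: the IR node, the
// function selected above, the packed input indices, and the offset of the
// node's first output in the shared value buffer.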
StaticNodeInfo::StaticNodeInfo(
Node* node,
ProcessedFunction* fn,
ProcessedNodeInputs inputs,
uint16_t outputs_offset)
: node_(node),
fn_(fn),
inputs_(std::move(inputs)),
outputs_offset_(outputs_offset) {
TORCH_CHECK(
num_outputs() == node->outputs().size(),
"Node ",
node->kind().toQualString(),
" has ",
std::to_string(num_outputs()),
" outputs, expected ",
std::to_string(node->outputs().size()));
}
std::vector<IValue> ProcessedNode::inputs_ivalue_vec() const {
std::vector<IValue> result;
const auto num_inputs = static_cast<uint32_t>(inputs_.size());
result.reserve(num_inputs);
for (const auto idx : c10::irange(num_inputs)) {
result.emplace_back(Input(idx));
}
return result;
}
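// Runs the node's selected function. When per-op profiling is enabled, a
// RecordFunction guard reports the op (with inputs only if some callback asks
// for them); in debug builds the memory overlap invariants are re-verified
// after the op runs.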
void ProcessedNode::run() {
#ifdef FBCODE_CAFFE2
SROperatorObserver::onStart(node());
#endif
#ifndef PYTORCH_DISABLE_PER_OP_PROFILING
auto step_callbacks =
at::getStepCallbacksUnlessEmpty(at::RecordScope::STATIC_RUNTIME_OP);
if (C10_UNLIKELY(step_callbacks.has_value())) {
at::RecordFunction guard(std::move(*step_callbacks));
TORCH_INTERNAL_ASSERT_DEBUG_ONLY(guard.isActive());
if (guard.needsInputs()) {
const auto inputs = inputs_ivalue_vec();
guard.before(
get_op_name(),
c10::ArrayRef<const IValue>(inputs.data(), inputs.size()));
} else {
guard.before(get_op_name());
}
if (has_out_variant()) {
guard._setStaticRuntimeOutVariant();
}
fn_->run(this);
} else {
fn_->run(this);
}
#else
fn_->run(this);
#endif
#ifndef NDEBUG
if (FLAGS_static_runtime_disable_debug_memory_overlap_check) {
// run check but do not enforce
verify_no_memory_overlap();
} else {
DCHECK(verify_no_memory_overlap());
}
#endif
#ifdef FBCODE_CAFFE2
SROperatorObserver::onEnd(node());
#endif
}
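// Treats full and partial overlap as failures; MemOverlapStatus::TooHard
// (layouts too complex to analyze) is allowed through with a log line rather
// than treated as an overlap.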
static bool checkNoMemoryOverlap(const at::Tensor& a, const at::Tensor& b) {
at::MemOverlapStatus status = at::get_overlap_status(a, b);
if (status == at::MemOverlapStatus::Full ||
status == at::MemOverlapStatus::Partial) {
return false;
}
if (status == at::MemOverlapStatus::TooHard) {
VLOG(1) << "Detected TOO_HARD memory overlap status";
}
return true;
}
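// Ops whose outputs intentionally alias their inputs (e.g.
// static_runtime::select_tensor forwards one of its input tensors) are
// exempted from the overlap check unless force_check is set.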
bool ProcessedNode::verify_no_memory_overlap(bool force_check) const {
const static std::array<c10::Symbol, 7> special_case_ops = {
fromQualString("prim::TypeCheck"),
fromQualString("prim::IfThenElse"),
fromQualString("static_runtime::select_tensor"),
fromQualString("static_runtime::VarTupleUnpack"),
fromQualString("static_runtime::dict_unpack"),
fromQualString("static_runtime::fused_split_and_squeeze"),
fromQualString("static_runtime::create_owned_ref")};
if (!force_check &&
std::find(
begin(special_case_ops), end(special_case_ops), node()->kind()) !=
end(special_case_ops)) {
return true;
}
return verify_outputs_dont_overlap_each_other() &&
verify_inputs_dont_overlap_outputs(force_check);
}
bool ProcessedNode::verify_outputs_dont_overlap_each_other() const {
const auto n_outputs = static_cast<uint32_t>(num_outputs());
for (const auto i : c10::irange(n_outputs)) {
if (!Output(i).isTensor()) {
continue;
}
const auto& out0_t = Output(i).toTensor();
for (const auto j : c10::irange(i + 1, n_outputs)) {
if (!Output(j).isTensor()) {
continue;
}
const auto& out1_t = Output(j).toTensor();
if (!checkNoMemoryOverlap(out0_t, out1_t)) {
LOG(INFO) << "Node output " << i << " overlaps with output " << j
<< ", " << PrintNode(node_);
return false;
}
}
}
return true;
}
bool ProcessedNode::verify_inputs_dont_overlap_outputs(bool force_check) const {
  auto schema = node()->maybeSchema();
  if (!schema) {
    VLOG(2) << "Detected that op schema is null";
    return true;
  }
  // skip memory overlap check for mutable or view ops with only one output
  const bool skip_check =
      (schema->is_mutable() || !fn_->checkMemoryOverlap()) &&
      num_outputs() == 1;
  if (!force_check && skip_check) {
    VLOG(2) << "schema->is_mutable: " << schema->is_mutable()
            << ", fn_->checkMemoryOverlap: " << fn_->checkMemoryOverlap()
            << ", num_outputs_: " << num_outputs();
    return true;
  }
const auto n_inputs = static_cast<uint32_t>(inputs_.size());
const auto n_outputs = static_cast<uint32_t>(num_outputs());
for (const auto i : c10::irange<uint32_t>(n_inputs)) {
const IValue* in = &Input(i);
if (!in->isTensor()) {
continue;
}
const auto& in_t = in->toTensor();
for (const auto j : c10::irange(n_outputs)) {
const IValue& out = Output(j);
if (!out.isTensor()) {
continue;
}
const auto& out_t = out.toTensor();
if (!checkNoMemoryOverlap(in_t, out_t)) {
LOG(INFO) << "Node input " << i << " overlaps with output " << j << ", "
<< PrintNode(node_);
LOG(INFO) << *schema;
return false;
}
}
}
return true;
}
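// If output_ival aliases input, replaces it with a clone so that the memory
// planner reusing the input's buffer cannot corrupt the output. Returns true
// iff a correction was made.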
bool ProcessedNode::check_and_correct_overlap_with(
const at::Tensor& input,
c10::IValue& output_ival) {
auto& tensor = output_ival.toTensor();
if (!checkNoMemoryOverlap(input, tensor)) {
DLOG(INFO) << "Detected alias for node: " << PrintNode(node());
output_ival = at::native::clone(tensor, c10::nullopt);
set_outputs_memory_overlap_detected();
return true;
}
return false;
}
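// Checks every tensor input against every tensor (and tensor-list) output and
// clones any output that aliases an input. Unlike the verify_* checks above,
// this is a correction pass used during normal execution, since fallback ops
// can legitimately produce aliases that would break memory planning.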
void ProcessedNode::verify_and_correct_memory_overlap() {
const auto n_inputs = static_cast<uint32_t>(inputs_.size());
const auto n_outputs = static_cast<uint32_t>(num_outputs());
for (const auto i : c10::irange(n_inputs)) {
const IValue& in = Input(i);
if (!in.isTensor()) {
continue;
}
const auto& in_t = in.toTensor();
for (const auto j : c10::irange(n_outputs)) {
auto& output = Output(j);
if (output.isTensor()) {
check_and_correct_overlap_with(in_t, output);
} else if (output.isTensorList()) {
auto tensors = output.toListRef();
for (const auto& ival : tensors) {
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
check_and_correct_overlap_with(in_t, const_cast<c10::IValue&>(ival));
}
#ifdef FBCODE_CAFFE2
if (outputs_memory_overlap_detected()) {
LOG_EVERY_MS(WARNING, 60000)
<< "Detected alias for node: " << PrintNode(node());
}
#endif
}
}
}
}
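// StaticRuntime holds the value buffer for a single concurrent invocation
// (constants are copied in up front) and wraps the module's root block in a
// BlockRunner. A minimal usage sketch (assumes `mod` is a TorchScript module
// that static runtime supports):
//
//   torch::jit::StaticModule smod(mod);
//   torch::jit::StaticRuntime runtime(smod);
//   c10::IValue out = runtime({at::randn({2, 2})}, /*kwargs=*/{});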
StaticRuntime::StaticRuntime(const StaticModule& sm)
: values_(sm.value_buffer_size()) {
std::copy(sm.constants().begin(), sm.constants().end(), values_.data());
// default task launcher set to inter-op thread pool
async_task_launcher_ = at::launch;
block_ = std::make_unique<BlockRunner>(
sm,
values_.data(),
sm.root_block(),
&async_task_launcher_,
true /*is_root_block*/);
}
c10::IValue StaticRuntime::operator()(
const std::vector<c10::IValue>& args,
const KeywordArgs& kwargs) {
return (*block_)(args, kwargs);
}
c10::IValue StaticRuntime::operator()(
std::vector<c10::IValue>&& args,
const KeywordArgs& kwargs) {
return (*block_)(std::move(args), kwargs);
}
c10::intrusive_ptr<c10::ivalue::Future> StaticRuntime::runAsync(
const std::vector<c10::IValue>& args,
const KeywordArgs& kwargs,
torch::jit::TaskLauncher taskLauncher) {
async_task_launcher_ = std::move(taskLauncher);
return block_->runAsync(args, kwargs);
}
c10::intrusive_ptr<c10::ivalue::Future> StaticRuntime::runAsync(
std::vector<c10::IValue>&& args,
const KeywordArgs& kwargs,
torch::jit::TaskLauncher taskLauncher) {
async_task_launcher_ = std::move(taskLauncher);
return block_->runAsync(std::move(args), kwargs);
}
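// A sketch of async usage under the same assumptions as above; the returned
// Future completes once the root block finishes on the launcher's threads:
//
//   auto future = runtime.runAsync({at::randn({2, 2})}, /*kwargs=*/{}, at::launch);
//   future->wait();
//   c10::IValue out = future->value();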
bool StaticRuntime::check_for_memory_leak(bool output_returned) {
return block_->check_for_memory_leak(
output_returned, /* recurse_on_sub_blocks */ true);
}
bool StaticRuntime::checkOutputTensorMemoryLeaks() {
return block_->checkOutputTensorMemoryLeaks();
}
void StaticRuntime::deallocateOutputTensors() {
block_->deallocateOutputTensors();
}
bool StaticRuntime::isManagedOutputTensor(const IValue& ivalue) const {
return block_->isManagedOutputTensor(ivalue);
}
void StaticRuntime::disableManageOutputTensors() {
block_->disableManageOutputTensors();
}
const MemoryPlanner* StaticRuntime::get_memory_planner() const {
return block_->get_memory_planner();
}
} // namespace torch::jit