#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef AT_PER_OPERATOR_HEADERS #include #else #include #endif #include #include #include #include #ifdef FBCODE_CAFFE2 #include #include #include #endif // used in test only C10_DEFINE_bool( static_runtime_disable_debug_memory_overlap_check, false, "If true, disable the memory overlap check in debug mode in ProcessedNode::run()"); namespace torch::jit { namespace { #ifndef STRIP_ERROR_MESSAGES std::string iValueToString(const c10::IValue& val) { std::ostringstream oss; oss << val; return oss.str(); } #endif bool allArgsAreTensors(const Node* node) { const auto& inputs = node->inputs(); return std::all_of(inputs.begin(), inputs.end(), [](const Value* value) { return value->type()->kind() == TypeKind::TensorType; }); } } // namespace // A manually curated set of ops that are disallowed in static runtime. // These are rarely-used ops. Disallowing them typically eliminates // corner cases in graph optimizations, allowing for more aggressive // optimizations and better performance. static bool isUnsupportedOp(const Node* node) { auto kind = node->kind(); if (kind != aten::__is__ && kind != aten::__isnot__) { return false; } // We can't support aten::__is__ (and __isnot__) with tensor arguments. // Consider the following graph: // def forward(x): // y = x.detach() // return x is y // We have a graph optimization that removes the `detach` node since it is // a no-op during inference. But this affects the result - we get true // instead of false! There are many other graph passes affected by this // issue. return allArgsAreTensors(node); } namespace { bool canEnableStaticRuntimeImpl(const Block* block) { if (block == nullptr) { return false; } bool can_support = true; for (auto* node : block->nodes()) { for (auto* subblock : node->blocks()) { // The ordering prevents && from short circuiting, which we want - // it's useful to see *all* the unsupported ops. can_support = canEnableStaticRuntimeImpl(subblock) && can_support; } const auto kind = node->kind(); if (kind == prim::Constant) { continue; } // check if can get op from Node const Operator* op = node->maybeOperator(); if (isUnsupportedOp(node) || (!op && !nativeOpIsRegistered(kind))) { can_support = false; LOG(WARNING) << "Found unsupported op: " << kind.toQualString(); } } return can_support; } } // namespace // Graph must be frozen. canEnableStaticRuntime will return false // if there's any prim::CallMethod ops left in the graph. 
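//
// Typical (illustrative) usage sketch. The helper `load_scripted_model()`, the
// input tensor, and the use of default StaticModule options are assumptions,
// not part of this file's API; the freeze/eval sequence mirrors
// PrepareForStaticModule below.
//
//   torch::jit::Module module = load_scripted_model();  // hypothetical helper
//   module.eval();
//   Module frozen = freeze_module(module.copy());
//   auto graph = frozen.get_method("forward").graph();
//   if (canEnableStaticRuntime(graph)) {
//     StaticModule smod(frozen, /*is_frozen=*/true);
//     c10::IValue out = smod(/*args=*/{input_tensor}, /*kwargs=*/{});
//   }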
bool canEnableStaticRuntime(const std::shared_ptr& graph) { return canEnableStaticRuntimeImpl(graph->block()); } namespace { auto sr_metadata_registerer = torch::class_( "StaticRuntime", "StaticRuntimeMetadata"); } // namespace std::string dumpValueSet( const c10::FastSet& value_set, const char* set_name) { std::ostringstream oss; oss << set_name << ": {"; for (const auto* val : value_set) { oss << "%" << val->debugName() << ", "; } oss << "}"; return oss.str(); } namespace { void OptimizeGraph( std::shared_ptr& graph, const StaticModuleOptions& opts, std::vector sample_inputs) { GRAPH_DUMP("Before optimizations: ", graph); if (opts.enable_tensorexpr_fusion) { if (sample_inputs.empty()) { VLOG(1) << "Cannot perform TensorExpr fusion - sample_inputs is empty"; } else { VLOG(1) << "Performing TensorExpr fusion"; performTensorExprFusion(graph, std::move(sample_inputs)); } } Inline(*graph); ConstantPropagation(graph); Canonicalize(graph); ConstantPropagation(graph); RemoveTensorMutation(graph); ConstantPropagation(graph); EliminateNoOpSlice(graph); EliminateDeadCode(graph); FuseInferenceOpsForSparseNN(graph); UseVariadicCat(graph); UseVariadicStack(graph); EliminateTrivialEquallySplit(graph); EliminateExtraPermuteOps(graph); if (opts.enable_out_variant) { UseVariadicOp( graph, fromQualString("fb::sigrid_transforms_torch_bind"), fromQualString("fb::variadic_sigrid_transforms_torch_bind")); UseVariadicOp( graph, fromQualString("torcharrow::inference_wrapper_run_flat"), fromQualString("torcharrow::variadic_inference_wrapper_run_flat")); // These fused ops only have out variants - we can't do the fusion when // out variants are disabled. FuseSignLog1P(graph); FuseClampNaNToNum(graph); #ifdef FBCODE_CAFFE2 if (opts.use_copy_variants && !opts.enable_tensorexpr_fusion) { ReplaceWithCopy(graph); } else { ReplacePermuteWithCopy(graph); } if (opts.use_maybe_copy_variants && !opts.enable_tensorexpr_fusion) { ReplaceWithMaybeCopy(graph); } FuseListUnpack(graph); RemoveUnnecessaryOutputs(graph); PrepackWeights(graph); #endif } ConstantPropagation(graph); RemoveImmutableInputDictLookups(graph); UseVariadicTupleUnpack(graph); UseVariadicGroupedAccessor(graph); EliminateNoOps( graph, /* custom_ops */ {fromQualString("fb::scale_gradient")}); AddIfThenElseOp(graph); UseSplitAndSqueeze(graph); UseInPlaceGetRealInputsFromOptionalInputsV2(graph); GRAPH_DUMP("Final graph after optimizations: ", graph); } bool IsSelfInGraphInput(std::shared_ptr& graph) { return !graph->inputs().empty() && graph->inputs().at(0)->type()->is_module(); } // remove unused input 0 from graph bool removeSelfFromGraphInput(std::shared_ptr& graph) { if (graph->inputs().at(0)->type()->is_module()) { if (graph->inputs().at(0)->hasUses()) { return false; } graph->eraseInput(0); } return true; } std::vector valueVecFromFastSet(const c10::FastSet& s) { std::vector result; result.reserve(s.size()); for (auto* v : s) { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) result.emplace_back(const_cast(v)); } return result; } bool mayContainAlias(const AliasDb& db, const Value* v1, const Value* v2) { // AliasDb is not const-correct here, so we have to const_cast // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) return db.mayContainAlias(const_cast(v1), const_cast(v2)); } bool mayContainAlias( const AliasDb& db, const Value* a, const c10::FastSet& b) { // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) return db.mayContainAlias(const_cast(a), valueVecFromFastSet(b)); } bool escapesScope(const AliasDb& db, const Value* a) { // 
NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) return db.escapesScope({const_cast(a)}); } void PrepareGraphForStaticModule( std::shared_ptr graph, const StaticModuleOptions& opts, std::vector sample_inputs) { TORCH_CHECK(canEnableStaticRuntime(graph)); OptimizeGraph(graph, opts, std::move(sample_inputs)); // Static runtime moves its outputs out of the runtime // by default. In some rare cases, this is not actually safe to // do - for example, if the value is a constant, static runtime // needs to hold onto a copy. Rather than adding special logic // to handle this rare case, we use this pass to detect it and // create an owned reference that can be safely moved out of the // runtime. CreateOwnedRefsForSpecialValues(*graph); // We assume that each sub-block has at least one output. If we // detect any that have 0, force the sub-block to return None. ForceNonEmptyOutputs(*graph); } std::pair, std::optional> PrepareForStaticModule( const torch::jit::Module& m, bool is_frozen, const StaticModuleOptions& opts, std::vector sample_inputs) { LOG(INFO) << "StaticModuleOptions: enable_out_variant " << opts.enable_out_variant << ", optimize_memory " << opts.optimize_memory << ", manage_output_tensors " << opts.manage_output_tensors << ", use_copy_variants " << opts.use_copy_variants << ", use_maybe_copy_variants " << opts.use_maybe_copy_variants << ", enable_tensorexpr_fusion " << opts.enable_tensorexpr_fusion; Module module = m.copy(); if (!is_frozen) { module.eval(); module = freeze_module(module); } Method method = module.get_method("forward"); auto graph = module.get_method("forward").graph(); if (!sample_inputs.empty() && IsSelfInGraphInput(graph)) { sample_inputs.insert(sample_inputs.begin(), m._ivalue()); } PrepareGraphForStaticModule(graph, opts, std::move(sample_inputs)); return std::make_pair(graph, module); } std::pair, std::optional> PrepareForStaticModule( std::shared_ptr graph, const StaticModuleOptions& opts, std::vector sample_inputs) { PrepareGraphForStaticModule(graph, opts, std::move(sample_inputs)); return std::make_pair(graph, c10::nullopt); } } // namespace void ValueGroup::init(const Block& block, const AliasDb& db) { external_aliases_.clear(); output_aliases_.clear(); // Build `external_aliases` as we look through nodes forwardly from // the graph's inputs and add aliases of the inputs being created by the // nodes. external_aliases_.insert(block.inputs().begin(), block.inputs().end()); for (const auto* node : block.nodes()) { if (node->kind() == prim::Constant) { for (const auto* output : node->outputs()) { external_aliases_.insert(output); } } } for (const auto* node : block.nodes()) { if (node->kind() == prim::Constant) { // Constants are already in `external_aliases`. continue; } for (const auto* v : node->outputs()) { if (escapesScope(db, v) || mayContainAlias(db, v, external_aliases_)) { external_aliases_.insert(v); } } } // Build `output_aliases` as we look through nodes reversely so that we can // start from the output values, and follow the flows backwardly from there. output_aliases_.insert(block.outputs().begin(), block.outputs().end()); for (const auto* node : block.nodes().reverse()) { if (node->kind() == prim::Constant) { // Constants cannot create any aliases. 
continue; } for (const auto* v : node->outputs()) { if (mayContainAlias(db, v, output_aliases_)) { output_aliases_.insert(v); } } } } namespace { bool isTensorList(const Value* value) { auto* type = value->type()->castRaw(); if (!type) { return false; } return type->getElementType()->kind() == c10::TypeKind::TensorType; } bool containTensorsOnly(at::ArrayRef values) { // return true only if all outputs are tensors return std::all_of(values.begin(), values.end(), [](const Value* value) { return value->type()->kind() == c10::TypeKind::TensorType || isTensorList(value); }); } bool isPureFunction(const Node* node) { auto* schema = node->maybeSchema(); return schema && schema->aliasAnalysis() == c10::AliasAnalysisKind::PURE_FUNCTION; } } // namespace ManagedTensorRanges::ManagedTensorRanges( Block& block, const AliasDb& alias_db, const c10::FastSet& managed_tensor_values) { const std::vector nodes(block.nodes().begin(), block.nodes().end()); const c10::FastSet graph_inputs( block.inputs().begin(), block.inputs().end()); const auto num_nodes = static_cast(nodes.size()); for (const auto i : c10::irange(num_nodes)) { auto* node = nodes[i]; for (auto* input : node->inputs()) { auto* lifetime = getLifetime(input); if (!lifetime) { continue; } DCHECK(lifetime->end <= i); lifetime->end = i; } for (auto* output : node->outputs()) { if (!alias_db.isMutableType(output)) { continue; } value_lifetimes_.emplace(output, Lifetime(i, i)); } } for (auto* graph_output : block.outputs()) { auto* lifetime = getLifetime(graph_output); if (!lifetime) { continue; } lifetime->end = num_nodes; } // Handle aliases. Aliases may extend a Value*'s lifetime. If a node // has an input and output that may alias each other, set the input's // lifetime end to max(input.lifetime_end, output.lifetime_end). Iterate // backwards to handle chains of aliases. for (const auto* node : block.nodes().reverse()) { if (isPureFunction(node)) { // If the node is a pure function, it doesn't create any aliases, // so we can safely skip it. 
continue; } auto inputs = collectValuesWithTrackedLifetimes(node->inputs()); auto outputs = collectValuesWithTrackedLifetimes(node->outputs()); for (auto* input : inputs) { auto* input_lifetime = getLifetime(input); DCHECK(input_lifetime != nullptr); for (auto* output : outputs) { if (mayContainAlias(alias_db, input, output)) { auto* output_lifetime = getLifetime(output); DCHECK(output_lifetime != nullptr); input_lifetime->end = std::max(output_lifetime->end, input_lifetime->end); } } } } for (auto* managed_tensor : managed_tensor_values) { auto* lifetime = getLifetime(managed_tensor); DCHECK(lifetime && lifetime->end <= num_nodes); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) Node* freeing_node; if (lifetime->end == num_nodes) { freeing_node = block.return_node(); } else { freeing_node = nodes[lifetime->end]; } node_to_newly_free_tensors_[freeing_node].emplace_back(managed_tensor); } } bool ManagedTensorRanges::nodeFreesManagedTensors(Node* node) const { auto it = node_to_newly_free_tensors_.find(node); return it != node_to_newly_free_tensors_.end() && !it->second.empty(); } const std::vector& ManagedTensorRanges:: availableTensorValuesAfterNode(Node* node) const { return node_to_newly_free_tensors_.at(node); } bool ManagedTensorRanges::lifetimesOverlap(const Value* v1, const Value* v2) const { const auto* v1_lifetime = getLifetime(v1); const auto* v2_lifetime = getLifetime(v2); if (!v1_lifetime || !v2_lifetime) { return false; } if (v1_lifetime->start < v2_lifetime->start) { return v1_lifetime->end >= v2_lifetime->start; } return v2_lifetime->end >= v1_lifetime->start; } const ManagedTensorRanges::Lifetime* ManagedTensorRanges::getLifetime( const Value* value) const { auto it = value_lifetimes_.find(value); if (it != value_lifetimes_.end()) { return &it->second; } return nullptr; } ManagedTensorRanges::Lifetime* ManagedTensorRanges::getLifetime( const Value* value) { // const_cast is safe here, this is just a way to avoid code duplication // between the const/non-const versions of getLifetime. 
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) const auto* const_this = const_cast(this); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) return const_cast( const_this->getLifetime(value)); } std::vector ManagedTensorRanges:: collectValuesWithTrackedLifetimes(at::ArrayRef values) { std::vector mutable_values; mutable_values.reserve(values.size()); std::copy_if( values.begin(), values.end(), std::back_inserter(mutable_values), [this](const Value* value) { return getLifetime(value) != nullptr; }); return mutable_values; } StaticModule::StaticModule( std::shared_ptr g, const StaticModuleOptions& opts, std::vector sample_inputs) : StaticModule( PrepareForStaticModule(g->copy(), opts, std::move(sample_inputs)), opts) {} StaticModule::StaticModule( const torch::jit::Module& m, bool is_frozen, const StaticModuleOptions& opts, std::vector sample_inputs) : StaticModule( PrepareForStaticModule(m, is_frozen, opts, std::move(sample_inputs)), opts) {} StaticModule::StaticModule( std::pair, std::optional> graph_and_module, const StaticModuleOptions& opts) : opts_(opts), graph_(std::move(graph_and_module.first)), module_(std::move(graph_and_module.second)), num_inputs_(graph_->inputs().size()) { sr_metadata_ = c10::make_intrusive(opts_); // recursively attach metadata to prim::fork nodes attachNodeMetadata(graph_->block()); // check opt flags if (opts.manage_output_tensors) { TORCH_CHECK( opts_.enable_out_variant, "When manage_output_tensors is true, enable_out_variant must be set to true"); } if (opts_.optimize_memory) { TORCH_CHECK( opts_.enable_out_variant, "When optimize_memory is true, enable_out_variant must be set to true"); } // handle schema if (module_.has_value()) { Method method = module_->get_method("forward"); schema_ = method.function().getSchema(); const auto num_schema_args = schema_->arguments().size(); DCHECK(num_schema_args > 0); if (removeSelfFromGraphInput(graph_)) { module_ = c10::nullopt; num_inputs_ = num_schema_args - 1; } } { size_t nodes_size = 0, constants_size = 0; for (Node* node : graph_->nodes()) { ++(node->kind() == prim::Constant ? constants_size : nodes_size); } constants_.reserve(constants_size); functions_.reserve(nodes_size); } // Create ProcessedFunction instances first to freeze their addresses to pass // to ProcessedNode. AliasDb alias_db(graph_, /*isFrozen=*/false); GRAPH_DEBUG("AliasDb: ", alias_db.toString()); // Maps each Value* in the graph to its index in the values_ array that will // eventually be created by StaticRuntime. 
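  // As a concrete (illustrative) example of the layout this produces: all
  // prim::Constant outputs get the lowest indices, followed by each block's
  // inputs and then each node's outputs in program order. So for a
  // hypothetical graph with two constants and
  //   graph(%x, %y):
  //     %z = aten::add(%x, %y, %const1)
  //     return (%z)
  // the values table is laid out as [const0, const1, %x, %y, %z], and %z is
  // referred to everywhere below by index 4.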
c10::FastMap value_to_index; prepareFunctionsAndConstants(graph_->block(), alias_db, value_to_index); const auto constants_index_offset = 0; const auto values_index_offset = constants_index_offset + constants().size(); value_buffer_size_ = values_index_offset; value_buffer_size_ += prepareBlockInfo(graph_->block(), values_index_offset, value_to_index); prepareStaticNodeInfos(graph_->block(), value_to_index, alias_db); for (auto& block_and_info : block_infos_) { auto& block_info = block_and_info.second; block_info.prepare_for_memory_planner(alias_db, opts); } } size_t StaticModule::prepareBlockInfo( Block* block, const size_t start_idx, c10::FastMap& value_to_index) { block_infos_.emplace(block, BlockInfo(start_idx, *block)); const auto num_inputs = static_cast(block->inputs().size()); for (const auto i : c10::irange(num_inputs)) { value_to_index.emplace(block->inputs()[i], start_idx + i); } auto cur_idx = start_idx + num_inputs; for (auto* node : block->nodes()) { for (auto* sub_block : node->blocks()) { cur_idx += prepareBlockInfo(sub_block, cur_idx, value_to_index); } if (node->kind() == prim::Constant) { continue; } TORCH_CHECK( cur_idx < (1 << 16), "outputs offset in values table", cur_idx, " would overflow 2-byte index storage"); const auto num_outputs = static_cast(node->outputs().size()); for (const auto i : c10::irange(num_outputs)) { value_to_index.emplace(node->outputs()[i], cur_idx + i); } cur_idx += num_outputs; } std::vector output_indices; output_indices.reserve(block->outputs().size()); for (auto* output : block->outputs()) { const auto output_idx = value_to_index.at(output); TORCH_CHECK( output_idx < (1 << 16), "outputs offset in values table", output_idx, " would overflow 2-byte index storage"); output_indices.push_back(output_idx); } block_infos_.at(block).set_output_indices(std::move(output_indices)); return cur_idx - start_idx; } void StaticModule::attachNodeMetadata(Block* block) { for (auto* node : block->nodes()) { if (node->kind() == prim::fork) { node->ival_(getStaticRuntimeMetadataSymbol(), IValue(sr_metadata_)); } for (auto* sub_block : node->blocks()) { attachNodeMetadata(sub_block); } } } void StaticModule::prepareFunctionsAndConstants( Block* block, const AliasDb& alias_db, c10::FastMap& value_to_index) { for (auto* node : block->nodes()) { for (auto* sub_block : node->blocks()) { prepareFunctionsAndConstants(sub_block, alias_db, value_to_index); } if (node->kind() == prim::Constant) { auto* v = node->output(); TORCH_CHECK( v->type()->kind() != FunctionType::Kind, "got ", typeKindToString(v->type()->kind()), " instead of ", typeKindToString(FunctionType::Kind)); value_to_index.emplace(v, constants_.size()); constants_.emplace_back(toIValue(v).value()); continue; } // see [Check and correct bad schema alias info at runtime] bool check_outputs_for_overlap = !alias_db.mayContainAlias(node->inputs(), node->outputs()) && containTensorsOnly(node->outputs()); // new ProcessedFunction functions_.emplace_back( node, opts_.enable_out_variant, check_outputs_for_overlap); } } size_t StaticModule::prepareStaticNodeInfos( Block* block, const c10::FastMap& value_to_index, const AliasDb& alias_db, size_t node_idx) { const auto node_start = node_idx; auto& block_info = block_infos_.at(block); std::vector nodes; c10::FastMap node_has_out_variant; for (auto* node : block->nodes()) { if (node->kind() == prim::Constant) { continue; } for (auto* sub_block : node->blocks()) { node_idx += prepareStaticNodeInfos(sub_block, value_to_index, alias_db, node_idx); } const auto num_outputs 
= static_cast(node->inputs().size()); ProcessedNodeInputs input_indices(num_outputs); for (const auto input_idx : c10::irange(num_outputs)) { auto* input = node->inputs()[input_idx]; auto input_ivalue_idx = value_to_index.at(input); TORCH_CHECK( input_ivalue_idx < (1 << 16), "input index in values table ", input_ivalue_idx, " would overflow 2-byte index storage"); input_indices[input_idx] = input_ivalue_idx; } ProcessedFunction* fn = &functions_[node_idx]; // create a new ProcessedNode const auto node_output_idx = node->outputs().empty() // The index is unused if there are no outputs, so just create a // placeholder value. ? std::numeric_limits::max() : value_to_index.at(node->output(0)); nodes.emplace_back(node, fn, std::move(input_indices), node_output_idx); node_has_out_variant.emplace(node, nodes.back().has_out_variant()); ++node_idx; } block_info.set_nodes(std::move(nodes), node_has_out_variant); block_info.init_value_group(alias_db); return node_idx - node_start; } #ifdef FBCODE_CAFFE2 thread_local SROperatorObserver* tlsOpObserver = nullptr; void SROperatorObserver::setCurrentThreadObserver( SROperatorObserver* observer) { tlsOpObserver = observer; } SROperatorObserver* SROperatorObserver::getCurrentThreadObserver() { return tlsOpObserver; } void SROperatorObserver::onStart(const Node* node) { if (tlsOpObserver != nullptr && tlsOpObserver->startCb != nullptr) { tlsOpObserver->startCb(node); } } void SROperatorObserver::onEnd(const Node* node) { if (tlsOpObserver != nullptr && tlsOpObserver->endCb != nullptr) { tlsOpObserver->endCb(node); } } #endif // FBCODE_CAFFE2 BlockInfo::BlockInfo(uint32_t input_idx, Block& block) : input_idx_(input_idx), block_(block) {} void BlockInfo::set_nodes( std::vector nodes, const c10::FastMap& node_has_out_variant) { nodes_ = std::move(nodes); for (auto& node : nodes_) { if (node.num_outputs() == 1 && isOptimizableContainerType(node.node(), node_has_out_variant)) { node_is_optimizable_container_type_.emplace(node.node()); } } } void BlockInfo::prepare_for_memory_planner( const AliasDb& alias_db, const StaticModuleOptions& opts) { if (!opts.enable_out_variant) { return; } // Never manage graph outputs so that we can do std::move(output_ivalue). // This does not affect performance if the graph returns a collection object. 
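  // Descriptive summary of the classification performed by the loop below
  // (mirrors the code, added for readability): for every output of a node
  // with an out variant,
  //   - Tensors that alias a block output (without being one) become managed
  //     *output* tensors when manage_output_tensors is enabled;
  //   - values the ValueGroup marks as always alive are skipped;
  //   - remaining Tensors become managed tensors, allocated from the
  //     MemoryPlanner's arena and recycled across iterations;
  //   - outputs of optimizable container ops are "leaked" instead, because
  //     re-allocating those containers every iteration is the expensive part.
  // Block outputs themselves are erased from the managed set at the end.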
c10::FastSet graph_output_values( block_.outputs().begin(), block_.outputs().end()); // collect register indices of outputs of ops with out variant for (StaticNodeInfo& pnode : nodes_) { if (!pnode.has_out_variant()) { continue; } auto outputs = pnode.node()->outputs(); const auto num_outputs = static_cast(outputs.size()); for (const auto i : c10::irange(num_outputs)) { const Value* out_v = outputs[i]; // Types are stored in the underlying TorchScript IR bool is_tensor_type = out_v->type()->castRaw(); if (opts.manage_output_tensors && is_tensor_type && graph_output_values.find(out_v) == graph_output_values.end() && value_group_.isOutputAlias(out_v)) { managed_output_tensor_values_.insert(out_v); continue; } if (value_group_.isAlwaysAlive(out_v)) { continue; } if (is_tensor_type) { managed_tensor_values_.insert(out_v); } else if (node_is_optimizable_container_type(pnode.node())) { // We "leak" certain container types because their allocations // take a long time leaked_values_.insert(out_v); } } } for (const Value* output : block_.outputs()) { managed_tensor_values_.erase(output); } GRAPH_DEBUG("managed_tensor_values: ", dumpValueSet(managed_tensor_values_)); GRAPH_DEBUG( "managed_output_tensor_values_: ", dumpValueSet(managed_output_tensor_values_)); managed_tensor_ranges_ = ManagedTensorRanges(block_, alias_db, managed_tensor_values_); } const StaticModuleOptions& StaticModule::opts() const { return opts_; } size_t StaticModule::num_outputs() const { return graph_->outputs().size(); } size_t StaticModule::num_inputs() const { return num_inputs_; } StaticRuntime& StaticModule::runtime() { if (!cached_runtime_) { cached_runtime_ = std::make_unique(*this); } return *cached_runtime_; } Node* StaticModule::findNodeWithKindForTesting(const std::string& kind) const { for (auto& block_and_info : block_infos_) { auto& block_info = block_and_info.second; for (auto& pnode : block_info.nodes()) { if (pnode.node()->kind().toQualString() == kind) { return pnode.node(); } } } return nullptr; } c10::IValue StaticModule::operator()( const std::vector& args, const KeywordArgs& kwargs) { return runtime()(args, kwargs); } c10::IValue StaticModule::operator()( std::vector&& args, const KeywordArgs& kwargs) { return runtime()(std::move(args), kwargs); } BlockRunner::BlockRunner( const StaticModule& sm, IValue* values, Block* block, torch::jit::TaskLauncher* launcher, bool is_root_block) : static_module_(sm), block_info_(static_module_.block_info(block)), is_root_block_(is_root_block), first_input_is_self_( is_root_block_ && static_module_.first_input_is_self()), inputs_begin_(block_info_.block_inputs_idx()), // TODO(T108633124): Turn on manage output tensors for sub-blocks. 
manage_output_tensors_enabled_( is_root_block_ && sm.opts().manage_output_tensors), values_(values) { nodes_.reserve(block_info_.nodes().size()); for (auto& pre_pnode : block_info_.nodes()) { nodes_.emplace_back(pre_pnode, values_); } for (auto index : block_info_.block_output_indices()) { outputs_.emplace_back(&values_[index]); } for (auto& pnode : nodes_) { auto* node = pnode.node(); // attach the async taskLauncher to processedNodes pnode.set_metadata(launcher); auto blocks = node->blocks(); const auto num_blocks = blocks.size(); if (num_blocks == 0) { continue; } DCHECK(node->kind() == prim::If || node->kind() == prim::Loop); std::vector block_runners; block_runners.reserve(num_blocks); for (auto* b : blocks) { block_runners.emplace_back(sm, values_, b, launcher); } pnode.set_metadata(std::move(block_runners)); } } // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) BlockRunner::BlockRunner(BlockRunner&&) noexcept = default; BlockRunner::~BlockRunner() = default; void BlockRunner::set_arg(const size_t idx, std::vector&& args) { DCHECK(idx < args.size()); Input(idx + first_input_is_self_) = std::move(args[idx]); } void BlockRunner::set_arg(const size_t idx, const std::vector& args) { DCHECK(idx < args.size()); Input(idx + first_input_is_self_) = args[idx]; } void BlockRunner::set_arg(const size_t idx, const IValue& arg) { Input(idx + first_input_is_self_) = arg; } namespace { void check_type(const Argument& schema_arg, const IValue& arg) { // Fast path for most common case if (arg.isTensor() && schema_arg.type()->kind() == c10::TypeKind::TensorType) { return; } TORCH_CHECK( arg.type()->isSubtypeOf(schema_arg.type()), arg.type()->annotation_str(), " is not a subtype of ", schema_arg.type()->annotation_str(), "; schema arg name: '", schema_arg.name(), "', ivalue: ", iValueToString(arg)); } } // namespace template void BlockRunner::set_inputs(IValueList&& args, const KeywordArgs& kwargs) { const auto& schema = static_module_.schema(); if (first_input_is_self_) { Input(0) = static_module_.module()._ivalue(); } if (!is_root_block_ || C10_UNLIKELY(!schema)) { TORCH_CHECK( kwargs.empty(), "BlockRunner got kwargs; is_root_block: ", std::to_string(is_root_block_), "schema: ", schema ? schema->name() : "(not available)"); const auto total_num_inputs = args.size() + first_input_is_self_; TORCH_CHECK( total_num_inputs == block_info_.num_inputs(), "Block runner got ", std::to_string(total_num_inputs), " inputs; ", " first_input_is_self: ", std::to_string(first_input_is_self_), "; SR block expects ", std::to_string(block_info_.num_inputs()), " inputs for schema ", schema ? schema->name() : "(not available)"); for (const auto i_arg : c10::irange(args.size())) { set_arg(i_arg, std::forward(args)); } return; } const auto& schema_args = schema->arguments(); size_t consumed_kwargs = 0; DCHECK(!schema_args.empty()); TORCH_CHECK( args.size() < schema_args.size(), "Static runtime got ", std::to_string(args.size()), " arguments, expects ", std::to_string(schema_args.size() - 1), " for schema ", schema->name()); for (const auto i_arg : c10::irange(1, schema_args.size())) { // Start at 1 since the schema always contains `self`. 
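    // Worked example of the binding below (names are illustrative): for a
    // schema forward(self, x, y=1.0) invoked as set_inputs({x_val}, {{"y", 2.0}}),
    //   i_arg == 1 binds x from the positional args;
    //   i_arg == 2 is past the positional range, finds "y" in kwargs
    //   (type-checked and counted in consumed_kwargs), and would otherwise
    //   fall back to the schema default 1.0 or fail with the
    //   "missing required kwarg" error.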
const auto& schema_arg = schema_args[i_arg]; if (i_arg - 1 < args.size()) { check_type(schema_arg, std::forward(args)[i_arg - 1]); set_arg(i_arg - 1, std::forward(args)); continue; } auto it = kwargs.find(schema_arg.name()); if (it != kwargs.end()) { check_type(schema_arg, it->second); set_arg(i_arg - 1, it->second); ++consumed_kwargs; continue; } auto maybe_default_val = schema_arg.default_value(); if (maybe_default_val) { set_arg(i_arg - 1, *maybe_default_val); continue; } TORCH_CHECK( false, "Static runtime is missing required kwarg ", schema_arg.name(), " i_arg: ", std::to_string(i_arg), " for schema ", schema->name()); } TORCH_CHECK( consumed_kwargs == kwargs.size(), "kwargs size mismatch (consumed ", std::to_string(consumed_kwargs), ", expected ", std::to_string(kwargs.size()), " for schema ", schema->name()); } void BlockRunner::create_memory_planner() { if (!planner_) { planner_ = std::make_unique( this, block_info_, static_module_.opts().enable_out_variant, manage_output_tensors_enabled_, static_module_.opts().optimize_memory); } } namespace { void destroyNodeOutputs(ProcessedNode& p_node) { const auto borrows_outputs = borrowsOutputs(p_node.node()->kind()); const auto num_outputs = static_cast(p_node.num_outputs()); for (const auto i : c10::irange(num_outputs)) { auto& output = p_node.Output(i); if (doesNotHeapAllocateWhenStoredInIValue(*output.type())) { continue; } if (borrows_outputs) { // NB: No need to incref here. This codepath is only hit if the run didn't // finish, so we shouldn't be returning anything to the client. c10::MaybeOwnedTraits::destroyBorrow(output); } else { output = IValue(); } } } } // namespace void BlockRunner::clean_up_intermediate_ivalues() noexcept { // We have to iterate in reverse order here due to borrowed // IValues - we don't want to destroy a value until all of its // borrows are cleaned up! for (auto it = nodes_.rbegin(); it != nodes_.rend(); ++it) { destroyNodeOutputs(*it); } } void BlockRunner::resetMemory() noexcept { planner_.reset(); // We must clean up intermediate values before inputs in case // there are borrowed inputs and static runtime owns the only // reference (e.g. the inputs were std::move'd into the runtime) clean_up_intermediate_ivalues(); clean_up_input_ivalues(); } c10::IValue BlockRunner::move_outputs_to_tuple(uint32_t num_outputs) { switch (num_outputs) { case 1: return c10::ivalue::Tuple::create(IValue(std::move(*outputs_[0]))); case 2: return c10::ivalue::Tuple::create( IValue(std::move(*outputs_[0])), IValue(std::move(*outputs_[1]))); case 3: return c10::ivalue::Tuple::create( IValue(std::move(*outputs_[0])), IValue(std::move(*outputs_[1])), IValue(std::move(*outputs_[2]))); default: { std::vector outputs; outputs.reserve(num_outputs); for (const auto i : c10::irange(num_outputs)) { // use move here. Otherwise, clean up outputs_[i] explicitly outputs.emplace_back(std::move(*outputs_[i])); } return c10::ivalue::Tuple::create(std::move(outputs)); } } } /// [Check and correct bad schema alias info at runtime] /// Static runtime relies on the operator schema's alias info to be correct for /// memory planning. Because it's hard to enforce the alias info to be correct, /// we need to do runtime detection for accidental aliases that do not comply /// with the schema. Only aliases of managed tensors are problematic. To avoid /// runtime crashes, we can add runtime detection and force the op to comply /// with its schema by cloning the alias. 
/// Because all managed tensors' data_ptrs
/// are part of the internal buffer that the MemoryPlanner allocates, we can
/// check aliases by checking the memory overlap with this internal buffer. But
/// a tensor's storage can be resized during inference, so we need another way
/// to handle the resized case.
///
/// There are two ways for an incorrect schema to break memory planning. Let's
/// look at two examples:
///
/// Example 1:
/// @code
///   def forward(x):
///     a = x + x
///     b = bad_op(a)  # b ends up aliasing a incorrectly
///     return (b)
/// @endcode
/// bad_op: its schema says it returns a new Tensor, but it actually returns an
/// alias. In this case, the memory planner would recognize `a` as a managed
/// tensor and clean up its memory before returning `b`. But `b` is actually an
/// alias of `a`; when `a`'s data_ptr gets reset, `b`'s data_ptr gets reset too.
///
/// Example 2:
/// @code
///   def forward(x):
///     a = x + x
///     a2 = bad_op(a)  # a2 ends up aliasing a incorrectly
///     b = a + a
///     c = b * b  # c shares storage with a
///     d = c + 2  # d shares storage with b
///     e = a2 * a2
///     return (d, e)
/// @endcode
/// With the memory reuse algorithm, `c` could end up sharing storage with `a`,
/// but because of bad_op, `a2` now aliases `a`. `c` overwrites `a` and
/// therefore `a2`, leading to the wrong results. We solve this problem in two
/// steps. Note this doesn't happen with the current memory reuse algorithm
/// because of the way it's implemented. Things could change with a different
/// implementation.
///
/// Step 1: annotate the ProcessedNodes with a flag `check_memory_overlap_` set
/// to true if their outputs do not alias their inputs as indicated by the
/// AliasDb and all of their outputs are Tensors. Then at runtime, we check that
/// the nodes' output tensors do not overlap with the internal buffer that the
/// MemoryPlanner allocates. For latency concerns, we only run this check for
/// fallback ops. The schemas of native ops and out variants are vetted and
/// enforced with static runtime unit tests. For the first iteration, we do a
/// full memory overlap check with
/// ProcessedNode::verify_and_correct_memory_overlap() because the internal
/// buffer doesn't exist yet.
///
/// Step 2: if a managed tensor gets resized during inference, it gets a new
/// data_ptr which is not from the buffer. We can tackle this corner case by
/// delaying the deallocation of the managed tensors to after the outputs are no
/// longer used (essentially merging the internal/output buffers into one).
/// Before the merging is implemented, we add another flag `overlap_detected_`
/// to flag any node with overlap detected in Step 1 and do a full memory
/// overlap check if the fast check (by checking memory overlap with the
/// internal buffer) fails. There is still a corner case that fails even with
/// the added flag: if a resize is triggered at the same time as the op creates
/// an alias, the current checks would fail to detect the alias.
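///
/// To make the fast path concrete, here is a minimal sketch of what the check
/// in BlockRunner::fast_check_and_correct_overlap_with below amounts to (the
/// names `out`, `out_ivalue`, and `node` are illustrative; the sketch elides
/// the TensorList handling and logging):
/// @code
///   if (planner_->overlapWithInternalBuffer(out.data_ptr())) {
///     // The output's storage lives inside the arena of managed tensors, so
///     // it must be an undeclared alias: clone it into fresh storage and
///     // remember to run the full (slow) overlap check for this node.
///     out_ivalue = at::native::clone(out, c10::nullopt);
///     node.set_outputs_memory_overlap_detected();
///   }
/// @endcode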
void BlockRunner::verify_and_correct_memory_overlap(ProcessedNode& n) {
  // The slow check can be removed once the internal/output buffers are merged
  if (C10_UNLIKELY(n.check_outputs_for_memory_overlap())) {
    if (C10_UNLIKELY(!planner_)) {
      // slow check, for first iter only
      n.verify_and_correct_memory_overlap();
    } else {
      bool overlap_detected_with_fast_check = false;
      const auto n_outputs = static_cast<uint32_t>(n.outputs().size());
      for (auto i : c10::irange(n_outputs)) {
        auto& output = n.Output(i);
        if (output.isTensor()) {
          overlap_detected_with_fast_check |=
              fast_check_and_correct_overlap_with(n, output);
        } else if (output.isTensorList()) {
          auto tensor_list = output.toListRef();
          for (auto& ival : tensor_list) {
            overlap_detected_with_fast_check |=
                fast_check_and_correct_overlap_with(
                    n,
                    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
                    const_cast<c10::IValue&>(ival));
          }
        }
      }
      if (n.outputs_memory_overlap_detected() &&
          !overlap_detected_with_fast_check) {
        // slow check. Only run when the fast check fails.
        n.verify_and_correct_memory_overlap();
      }
    }
  }
}

bool BlockRunner::fast_check_and_correct_overlap_with(
    ProcessedNode& n,
    c10::IValue& tensor_ival) {
  auto& tensor = tensor_ival.toTensor();
  if (planner_->overlapWithInternalBuffer(tensor.data_ptr())) {
    DLOG(INFO) << "Detected alias for node: " << PrintNode(n.node());
    tensor_ival = at::native::clone(tensor, c10::nullopt);
    n.set_outputs_memory_overlap_detected();
    return true;
  }
  return false;
}

BlockRunner::Deallocator::~Deallocator() {
  // Assume cleanup cannot throw.
  cleanupImpl();
#ifndef NDEBUG
  block_runner_.check_for_memory_leak(/*output_returned*/ false);
#endif
}

void BlockRunner::Deallocator::cleanupImpl() {
  // MemoryPlanner is created after the first invocation of `run()`. This
  // is done intentionally because MemoryPlanner uses `Tensor` sizes of
  // the previous `run()` for memory planning of subsequent runs.
  if (C10_LIKELY(finished_)) {
    block_runner_.create_memory_planner();
  }
  if (C10_LIKELY(block_runner_.planner_)) {
    block_runner_.planner_->deallocate();
  } else {
    // This is the first run, and it didn't finish, so we can't use a
    // `MemoryPlanner` to deallocate stuff. Just reset everything manually.
    block_runner_.resetMemory();
  }
  // clean up owning refs of input tensors
  block_runner_.clean_up_input_ivalues();
  if (C10_UNLIKELY(!finished_)) {
    block_runner_.deallocateOutputTensors();
  }
}

template <typename IValueList>
c10::IValue BlockRunner::run_impl(
    IValueList&& args,
    const KeywordArgs& kwargs) {
  // We assume inference workloads, so we do not need
  // autograd. Enabling this is a significant win on dispatcher
  // overhead because it saves a round of dispatch for at least some
  // functions, such as resize_ and resize_as_.
  c10::InferenceMode mode;

  {
    auto on_exit = Deallocator(*this);

    if (planner_) {
      DCHECK(!manage_output_tensors_enabled_ || checkOutputTensorMemoryLeaks());
      planner_->allocate();
    }

    set_inputs(std::forward<IValueList>(args), kwargs);

    for (auto& n : nodes_) {
      // LOG(INFO) << "Running node: " << PrintNode(n.node());
      n.run();
      // Check for incorrect schema alias info.
      verify_and_correct_memory_overlap(n);
    }
    on_exit.setFinished();
  }

  // no need to keep references of outputs in static runtime anymore
  if (block_info_.num_outputs() > 1) {
    return move_outputs_to_tuple(block_info_.num_outputs());
  }

  DCHECK(check_for_memory_leak(/*output_returned*/ false));

  // use move here.
Otherwise, clean up outputs_[0] explicitly return std::move(*outputs_[0]); } template c10::IValue BlockRunner::run_impl_record_functions( IValueList&& args, const KeywordArgs& kwargs) { auto step_callbacks = at::getStepCallbacksUnlessEmpty(at::RecordScope::STATIC_RUNTIME_MODEL); if (C10_UNLIKELY(step_callbacks.has_value())) { at::RecordFunction guard(std::move(*step_callbacks)); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(guard.isActive()); guard.needsInputs() ? guard.before( "forward", c10::ArrayRef(args.data(), args.size())) : guard.before("forward"); return run_impl(std::forward(args), kwargs); } return run_impl(std::forward(args), kwargs); } template c10::intrusive_ptr BlockRunner::run_impl_async( IValueList&& args, const KeywordArgs& kwargs) { // run the graph inline in the caller thread. Async ops will be // executed on taskLauncher attached to the metadata of ProcessedNodes c10::IValue output = run_impl(std::forward(args), kwargs); // If the output is of type future, return it if (output.isFuture()) { return output.toFuture(); } // wrap the output into future, mark completed and return it TypePtr return_type; if (block_info_.num_outputs() > 1) { return_type = TupleType::create( fmap(outputs(), [](const IValue* v) { return v->type(); })); } else { return_type = outputs().at(0)->type(); } c10::intrusive_ptr future = c10::make_intrusive(return_type); future->markCompleted(output); return future; } template c10::intrusive_ptr BlockRunner:: run_impl_record_functions_async( IValueList&& args, const KeywordArgs& kwargs) { auto step_callbacks = at::getStepCallbacksUnlessEmpty(at::RecordScope::STATIC_RUNTIME_MODEL); if (C10_UNLIKELY(step_callbacks.has_value())) { at::RecordFunction guard(std::move(*step_callbacks)); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(guard.isActive()); guard.needsInputs() ? 
guard.before( "forward", c10::ArrayRef(args.data(), args.size())) : guard.before("forward"); return run_impl_async(std::forward(args), kwargs); } return run_impl_async(std::forward(args), kwargs); } c10::IValue BlockRunner::operator()( const std::vector& args, const KeywordArgs& kwargs) { #ifdef PYTORCH_DISABLE_NET_PROFILING return run_impl(args, kwargs); #else return run_impl_record_functions(args, kwargs); #endif } c10::IValue BlockRunner::operator()( std::vector&& args, const KeywordArgs& kwargs) { #ifdef PYTORCH_DISABLE_NET_PROFILING return run_impl(std::move(args), kwargs); #else return run_impl_record_functions(std::move(args), kwargs); #endif } c10::intrusive_ptr BlockRunner::runAsync( const std::vector& args, const KeywordArgs& kwargs) { #ifdef PYTORCH_DISABLE_NET_PROFILING return run_impl_async(args, kwargs); #else return run_impl_record_functions_async(args, kwargs); #endif } c10::intrusive_ptr BlockRunner::runAsync( std::vector&& args, const KeywordArgs& kwargs) { #ifdef PYTORCH_DISABLE_NET_PROFILING return run_impl_async(std::move(args), kwargs); #else return run_impl_record_functions_async(std::move(args), kwargs); #endif } namespace { std::string generate_latency_json(const std::string& label, double millis) { #ifdef FBCODE_CAFFE2 folly::dynamic json = folly::dynamic::object(); json["type"] = label; json["metric"] = "latency"; json["unit"] = "ms"; json["value"] = millis; return "PyTorchObserver " + folly::toJson(json); #else (void)label; (void)millis; return ""; #endif } } // namespace void BlockRunner::benchmark( const std::vector>& args_list, const std::vector& kwargs_list, const uint32_t warmup_runs, const uint32_t main_runs, bool print_per_node_time, bool generate_ai_pep_output) { TORCH_CHECK(kwargs_list.empty() || args_list.size() == kwargs_list.size()); std::cout << "Input size: " << args_list.size() << std::endl; float time_per_iter = benchmark_model(args_list, kwargs_list, warmup_runs, main_runs); std::cout << "Static runtime ms per iter: " << time_per_iter << ". Iters per second: " << 1000.0 / time_per_iter << std::endl; IndividualMetrics results = benchmark_individual_ops(args_list, kwargs_list, warmup_runs, main_runs); if (print_per_node_time) { const auto num_nodes = static_cast(nodes_.size()); for (const auto i : c10::irange(num_nodes)) { const Node* node = nodes_[i].node(); std::cout << "Node #" << i << ": " << results.time_per_node[i] << " ms/iter, "; node->print(std::cout, 0, nullptr, false); } } std::vector> time_per_node_type_vec{ results.time_per_node_type.begin(), results.time_per_node_type.end()}; if (args_list.empty()) { std::sort( time_per_node_type_vec.begin(), time_per_node_type_vec.end(), [&results](auto& left, auto& right) { return results.instances_per_node_type[left.first] > results.instances_per_node_type[right.first]; }); } else { std::sort( time_per_node_type_vec.begin(), time_per_node_type_vec.end(), [](auto& left, auto& right) { return left.second > right.second; }); } std::cout << "Time per node type:" << std::endl; for (const auto& p : time_per_node_type_vec) { const std::string& kind = p.first; const double ms = p.second; std::cout << std::setw(15) << ms << " ms. " << std::setw(10) << results.percent_per_node_type[kind] << "%. 
" << kind << " (" << results.instances_per_node_type[kind] << " nodes"; if (results.out_nodes.count(kind)) { std::cout << ", out variant)" << std::endl; } else if (results.native_nodes.count(kind)) { std::cout << ", native)" << std::endl; } else { std::cout << ")" << std::endl; } if (generate_ai_pep_output) { LOG(INFO) << generate_latency_json(kind, ms); } } if (generate_ai_pep_output) { LOG(INFO) << generate_latency_json( "static_runtime_first_iter", results.first_iter_time); } std::cout << std::setw(15) << results.total_time << " ms. in Total" << std::endl; std::cout << "BlockRunner setup time: " << results.setup_time << " ms" << std::endl; std::cout << "Memory allocation time: " << results.memory_alloc_time << " ms\n"; std::cout << "Memory deallocation time: " << results.memory_dealloc_time << " ms" << std::endl; std::cout << "Outputs deallocation time: " << results.output_dealloc_time << " ms" << std::endl; std::cout << "First iter time: " << results.first_iter_time << " ms" << std::endl; std::cout << "Number of operators: " << nodes_.size() << std::endl; if (planner_) { std::cout << "Total number of managed tensors: " << planner_->total_num_managed_tensors() << std::endl; std::cout << "Total number of managed output tensors: " << planner_->total_num_managed_output_tensors() << std::endl; std::cout << "Total number of unmanaged values: " << planner_->total_num_unmanaged() << std::endl; std::cout << "Number of unmanaged values requiring cleanup: " << planner_->num_unmanaged_non_scalars() << std::endl; std::cout << "Number of unmanaged values not requiring cleanup: " << planner_->num_unmanaged_scalars() << std::endl; std::cout << "Total memory managed: " << planner_->total_managed() << " bytes" << std::endl; if (static_module_.opts().optimize_memory) { std::cout << "Total number of reused tensors: " << planner_->total_reused_tensors() << std::endl; } } auto unsupported_nodes_count = results.total_nodes_count - results.out_nodes_count - results.native_nodes.size(); std::cout << "Total number of 'out' variant nodes/total number of nodes: " << results.out_nodes_count << "/" << results.total_nodes_count << " (" << 100.0 * static_cast(results.out_nodes_count) / static_cast(results.total_nodes_count) << "%)" << std::endl; std::cout << "Total number of nodes not covered by SR/total number of nodes: " << unsupported_nodes_count << "/" << results.total_nodes_count << " (" << 100.0 * static_cast(unsupported_nodes_count) / static_cast(results.total_nodes_count) << "%)" << std::endl; check_for_memory_leak(); #ifndef NDEBUG KeywordArgs empty_kwargs; display_nodes( args_list[0], kwargs_list.size() > 0 ? kwargs_list[0] : empty_kwargs); #endif } float BlockRunner::benchmark_model( const std::vector>& args_list, const std::vector& kwargs_list, const unsigned int warmup_runs, const unsigned int main_runs) { TORCH_CHECK(main_runs >= 1); TORCH_CHECK(kwargs_list.empty() || args_list.size() == kwargs_list.size()); const bool is_kwargs_empty = kwargs_list.empty(); const KeywordArgs empty_kwargs; for (const auto _n_run : c10::irange(warmup_runs)) { (void)_n_run; // Suppress unused variable warning const auto num_args = static_cast(args_list.size()); for (const auto j : c10::irange(num_args)) { operator()(args_list[j], is_kwargs_empty ? 
empty_kwargs : kwargs_list[j]); if (manage_output_tensors_enabled_) { deallocateOutputTensors(); } } } caffe2::Timer timer; for (const auto _n_run : c10::irange(main_runs)) { (void)_n_run; // Suppress unused variable warning const auto num_args = static_cast(args_list.size()); for (const auto j : c10::irange(num_args)) { operator()(args_list[j], is_kwargs_empty ? empty_kwargs : kwargs_list[j]); if (manage_output_tensors_enabled_) { deallocateOutputTensors(); } } } float millis = timer.MilliSeconds(); return millis / (static_cast(main_runs) * static_cast(args_list.size())); } static bool display_ivalue(const IValue& iv) { if (iv.isTensor()) { std::cout << "Tensor " << iv.toTensor().toString() << " {"; const auto dims = iv.toTensor().sizes(); const auto n_dims = static_cast(dims.size()); for (const auto i : c10::irange(n_dims)) { std::cout << iv.toTensor().sizes()[i]; if (n_dims > i + 1) { std::cout << ", "; } } std::cout << "}\n"; return true; } else if (iv.isTensorList()) { std::cout << "TensorList {" << iv.toTensorList().size() << "}\n"; return true; } else if (iv.isGenericDict()) { std::cout << "Dict {" << iv.toGenericDict().size() << "}\n"; return true; } else if (iv.isTuple()) { std::cout << "Tuple {" << iv.toTupleRef().elements().size() << "}\n"; return true; } else if (iv.isInt()) { std::cout << "int {" << iv.toInt() << "}\n"; return true; } else if (iv.isBool()) { std::cout << "bool {" << iv.toBool() << "}\n"; return true; } else if (iv.isDouble()) { std::cout << "double {" << iv.toDouble() << "}\n"; return true; } return false; } static void display_pnode_info(const ProcessedNode& pnode) { pnode.node()->print(std::cout, 0, nullptr, false); const auto num_inputs = static_cast(pnode.num_inputs()); for (const auto i : c10::irange(num_inputs)) { std::cout << "\ti" << i << ": "; if (!display_ivalue(pnode.Input(i))) { std::cout << *(pnode.node()->inputs()[i]->type()) << '\n'; } } const auto outputs = pnode.outputs(); const auto num_outputs = static_cast(outputs.size()); for (const auto i : c10::irange(num_outputs)) { std::cout << "\to" << i << ": "; if (!display_ivalue(outputs[i])) { std::cout << *(pnode.node()->outputs()[i]->type()) << '\n'; } } } void BlockRunner::display_nodes( const std::vector& args, const KeywordArgs& kwargs) { c10::InferenceMode mode; auto on_exit = Deallocator(*this); if (planner_) { planner_->allocate(); } set_inputs(args, kwargs); for (auto& node : nodes_) { node.run(); display_pnode_info(node); } on_exit.setFinished(); } BlockRunner::IndividualMetrics BlockRunner::benchmark_individual_ops( const std::vector>& args_list, const std::vector& kwargs_list, const uint32_t warmup_runs, const uint32_t main_runs) { TORCH_CHECK(kwargs_list.empty() || args_list.size() == kwargs_list.size()); TORCH_CHECK(warmup_runs >= 1 && main_runs >= 1); IndividualMetrics results; results.time_per_node.resize(nodes_.size(), 0); if (args_list.empty()) { // When the given input is empty, compute the op statistics from the given // graph without executing it. const auto num_nodes = static_cast(nodes_.size()); for (const auto i : c10::irange(num_nodes)) { const Node* node = nodes_[i].node(); std::string kind(node->kind().toQualString()); // TODO: Collect op statistics from sub-blocks here. 
results.time_per_node[i] = 0; results.time_per_node_type[kind] = 0; results.instances_per_node_type[kind]++; if (nodes_[i].has_out_variant()) { results.out_nodes.insert(kind); results.out_nodes_count++; } else if (nodes_[i].has_native()) { results.native_nodes.insert(kind); } results.total_time += results.time_per_node[i]; } results.total_nodes_count = nodes_.size(); results.memory_alloc_time = 0; results.memory_dealloc_time = 0; results.output_dealloc_time = 0; for (const auto& p : results.time_per_node_type) { const std::string& kind = p.first; results.percent_per_node_type[kind] = 0; } return results; } const bool is_kwargs_empty = kwargs_list.empty(); const KeywordArgs empty_kwargs; bool manage_output_tensors = static_module_.opts().manage_output_tensors; // See comment on above use of InferenceMode for // explanation. c10::InferenceMode mode; // setup time caffe2::Timer timer; set_inputs(args_list[0], is_kwargs_empty ? empty_kwargs : kwargs_list[0]); results.setup_time = timer.MilliSeconds(); // The first iteration profiles each node's output Tensors' sizes and // initializes the memory planner with the profile information. Following // iterations just use the already established memory planning. timer.Start(); operator()(args_list[0], is_kwargs_empty ? empty_kwargs : kwargs_list[0]); if (manage_output_tensors) { deallocateOutputTensors(); } results.first_iter_time = timer.MilliSeconds(); // warmup runs for (const auto _n_run : c10::irange(warmup_runs)) { (void)_n_run; // Suppress unused variable warning const auto num_args = static_cast(args_list.size()); for (const auto j : c10::irange(num_args)) { operator()(args_list[j], is_kwargs_empty ? empty_kwargs : kwargs_list[j]); if (manage_output_tensors) { deallocateOutputTensors(); } } } // main runs for (const auto i : c10::irange(main_runs)) { (void)i; // Suppress unused variable warning const auto num_args = static_cast(args_list.size()); for (const auto j : c10::irange(num_args)) { set_inputs(args_list[j], is_kwargs_empty ? empty_kwargs : kwargs_list[j]); timer.Start(); if (planner_) { planner_->allocate(); } float millis = timer.MilliSeconds(); results.memory_alloc_time += millis; const auto num_nodes = static_cast(nodes_.size()); for (const auto k : c10::irange(num_nodes)) { timer.Start(); nodes_[k].run(); millis = timer.MilliSeconds(); results.time_per_node[k] += millis; verify_and_correct_memory_overlap(nodes_[k]); } timer.Start(); create_memory_planner(); planner_->deallocate(); // clean up owning refs of input tensors clean_up_input_ivalues(); if (manage_output_tensors) { deallocateOutputTensors(); } millis = timer.MilliSeconds(); results.memory_dealloc_time += millis; timer.Start(); // no need to keep references of outputs in static runtime anymore c10::IValue output; if (static_module_.num_outputs() > 1) { output = move_outputs_to_tuple(static_module_.num_outputs()); } DCHECK(check_for_memory_leak(/*output_returned*/ false)); // use move here. 
Otherwise, clean up outputs_[0] explicitly output = std::move(*outputs_[0]); // release outputs explicitly to measure the time it takes output = IValue(); millis = timer.MilliSeconds(); results.output_dealloc_time += millis; } } // post processing const float num_total_iters = (static_cast(main_runs) * static_cast(args_list.size())); const auto num_nodes = static_cast(nodes_.size()); for (const auto i : c10::irange(num_nodes)) { const Node* node = nodes_[i].node(); std::string kind = std::string(node->kind().toQualString()); results.time_per_node[i] /= num_total_iters; results.time_per_node_type[kind] += results.time_per_node[i]; results.instances_per_node_type[kind]++; if (nodes_[i].has_out_variant()) { results.out_nodes.insert(kind); results.out_nodes_count++; } else if (nodes_[i].has_native()) { results.native_nodes.insert(kind); } results.total_time += results.time_per_node[i]; } results.total_nodes_count = nodes_.size(); results.memory_alloc_time /= num_total_iters; results.memory_dealloc_time /= num_total_iters; results.output_dealloc_time /= num_total_iters; for (const auto& p : results.time_per_node_type) { const std::string& kind = p.first; results.percent_per_node_type[kind] = p.second / results.total_time * 100; } return results; } bool BlockRunner::check_for_memory_leak( bool output_returned, bool recurse_on_sub_blocks) { // check for inputs const auto num_inputs = static_cast(block_info_.num_inputs()); for (const auto i : c10::irange(num_inputs)) { TORCH_CHECK( values_[i + block_info_.block_inputs_idx()].isNone(), "Input ", i, " was not cleaned up"); } c10::FastSet output_ivalues(outputs_.begin(), outputs_.end()); const auto num_nodes = static_cast(nodes_.size()); for (const auto n : c10::irange(num_nodes)) { auto& pnode = nodes_[n]; const auto num_outputs = static_cast(pnode.num_outputs()); for (const auto i : c10::irange(num_outputs)) { const IValue* ival = &pnode.Output(i); const Value* val = pnode.node()->output(i); // subtlety: isManagedOutputTensorValue may give a false // negative here if an output is an alias of this value, so // check the actual tensor! if (planner_ && (isManagedOutputTensor(*ival) || isManagedOutputTensorValue(val))) { // `ival` contains a managed output tensor that the runtime doesn't // reclaim at the end of an iteration, but the client does so // by explicitly calling // `BlockRunner::deallocateOutputTensors`. 
continue; } const std::string error_msg = "Output " + std::to_string(i) + ", %" + val->debugName() + " of node " + std::to_string(n) + " which has kind " + pnode.node()->kind().toQualString() + " was not cleaned up"; if (output_ivalues.count(ival) == 0) { // check for intermediates if (!ival->isNone()) { TORCH_CHECK( ival->isTensor() || block_info_.node_is_optimizable_container_type( pnode.node()) || doesNotHeapAllocateWhenStoredInIValue(*val->type()), error_msg); if (ival->isTensor()) { const auto& t = ival->toTensor(); if (t.defined()) { auto* storage_impl = t.storage().unsafeGetStorageImpl(); TORCH_CHECK( storage_impl->data() == nullptr || (planner_ && planner_->isManagedStorageImpl(storage_impl)), error_msg); } } } } else { // check for outputs if (output_returned) { TORCH_CHECK(ival->isNone(), error_msg); } } } auto* metadata = pnode.metadata(); if (recurse_on_sub_blocks && metadata) { auto& block_runners = metadata->block_runners(); for (auto& block_runner : block_runners) { block_runner.check_for_memory_leak( output_returned, recurse_on_sub_blocks); } } } VLOG(1) << "Finished checking for memory leak"; return true; } void BlockRunner::deallocateOutputTensors() { if (!static_module_.opts().manage_output_tensors) { TORCH_CHECK( !planner_ || planner_->numOutputBufferBytes() == 0, "manage_output_tensors is disabled, but output tensor buffer is not empty."); return; } if (planner_) { planner_->deallocateOutputTensors(); DCHECK(checkOutputTensorMemoryLeaks()); } } bool BlockRunner::checkOutputTensorMemoryLeaks() { if (!static_module_.opts().manage_output_tensors || !planner_) { return true; } const auto num_nodes = static_cast(nodes_.size()); for (const auto n : c10::irange(num_nodes)) { auto& pnode = nodes_[n]; const auto num_outputs = static_cast(pnode.num_outputs()); for (const auto i : c10::irange(num_outputs)) { const IValue* ival = &pnode.Output(i); const Value* val = pnode.node()->output(i); if (!isManagedOutputTensorValue(val) || !ival->isTensor()) { // ival can not be a tensor if it's being managed by ops like // to_maybe_copy_out; see ReplaceWithMaybeCopy for details. continue; } const auto& t = ival->toTensor(); if (t.defined()) { auto* storage_impl = t.storage().unsafeGetStorageImpl(); const std::string error_msg = "Output " + std::to_string(i) + ", %" + val->debugName() + " of node " + std::to_string(n) + " was not cleaned up"; TORCH_CHECK(storage_impl->data() == nullptr, error_msg); } } } VLOG(1) << "Finished checking for memory leak from output tensors"; return true; } bool BlockRunner::isManagedOutputTensor(const IValue& ivalue) const { return planner_ && planner_->isManagedOutputTensor(ivalue); } bool BlockRunner::isManagedOutputTensorValue(const Value* value) const { // It's possible that manage_output_tensors_ was disabled after initializing // managed_output_tensor_values, so we have to check that flag here. if (!planner_ || !manage_output_tensors_enabled_) { return false; } const auto& managed_outputs = block_info_.managed_output_tensor_values(); return managed_outputs.find(value) != managed_outputs.end(); } void BlockRunner::disableManageOutputTensors() { if (!manage_output_tensors_enabled_) { return; } manage_output_tensors_enabled_ = false; if (!planner_) { return; } // Reset all IValues and destruct planner_ so that it can be reconstructed in // the next run. 
void BlockRunner::disableManageOutputTensors() {
  if (!manage_output_tensors_enabled_) {
    return;
  }
  manage_output_tensors_enabled_ = false;
  if (!planner_) {
    return;
  }
  // Reset all IValues and destruct planner_ so that it can be reconstructed in
  // the next run.
  for (auto& n : nodes_) {
    const auto num_outputs = static_cast<uint32_t>(n.outputs().size());
    for (const auto i : c10::irange(num_outputs)) {
      n.Output(i) = IValue();
    }
  }
  planner_.reset();
}

ProcessedFunction::ProcessedFunction(
    Node* node,
    bool enable_out_variant,
    bool check_memory_overlap)
    : check_memory_overlap_(check_memory_overlap),
      num_outputs_(node->outputs().size()) {
  if (enable_out_variant) {
    f_ = getOutOfPlaceOperation(node);
    if (f_) {
      kind_ = ProcessedFunction::Kind::kOutVariant;
      // do not check memory overlap for out variants
      check_memory_overlap_ = false;
      VLOG(1) << "Switch to out variant for node: " << PrintNode(node);
      return;
    }
  }
  {
    f_ = getNativeOperation(node);
    if (f_) {
      kind_ = ProcessedFunction::Kind::kNativeFunction;
#ifdef NDEBUG
      // skip this check in opt mode because these ops are better vetted
      check_memory_overlap_ = false;
#endif
      VLOG(1) << "Switch to native impl for node: " << PrintNode(node);
      return;
    }
  }
  {
    const Operator& op = node->getOperator();
    f_ = [node_op = op.getOperation(node),
          has_var_args = hasVarArgs(node)](ProcessedNode* pnode) mutable {
      std::vector<IValue> stack;
      const auto size = static_cast<size_t>(pnode->num_inputs());
      stack.reserve(size + has_var_args);
      for (const auto i : c10::irange(size)) {
        stack.emplace_back(pnode->Input(i));
      }
      // Need to store the number of inputs in stack for variadic ops.
      if (has_var_args) {
        stack.emplace_back(static_cast<int>(size));
      }
      node_op(stack);
      const auto num_outputs = static_cast<size_t>(pnode->num_outputs());
      TORCH_DCHECK_EQ(stack.size(), num_outputs);
      for (const auto i : c10::irange(num_outputs)) {
        pnode->Output(i) = std::move(stack[i]);
      }
    };
    kind_ = ProcessedFunction::Kind::kInterpreterFallback;
    VLOG(1) << "Fallback interpreter for node: " << PrintNode(node);
  }
}

StaticNodeInfo::StaticNodeInfo(
    Node* node,
    ProcessedFunction* fn,
    ProcessedNodeInputs inputs,
    uint16_t outputs_offset)
    : node_(node),
      fn_(fn),
      inputs_(std::move(inputs)),
      outputs_offset_(outputs_offset) {
  TORCH_CHECK(
      num_outputs() == node->outputs().size(),
      "Node ",
      node->kind().toQualString(),
      " has ",
      std::to_string(num_outputs()),
      " outputs, expected ",
      std::to_string(node->outputs().size()));
}

std::vector<IValue> ProcessedNode::inputs_ivalue_vec() const {
  std::vector<IValue> result;
  const auto num_inputs = static_cast<uint32_t>(inputs_.size());
  result.reserve(num_inputs);
  for (const auto idx : c10::irange(num_inputs)) {
    result.emplace_back(Input(idx));
  }
  return result;
}

void ProcessedNode::run() {
#ifdef FBCODE_CAFFE2
  SROperatorObserver::onStart(node());
#endif
#ifndef PYTORCH_DISABLE_PER_OP_PROFILING
  auto step_callbacks =
      at::getStepCallbacksUnlessEmpty(at::RecordScope::STATIC_RUNTIME_OP);
  if (C10_UNLIKELY(step_callbacks.has_value())) {
    at::RecordFunction guard(std::move(*step_callbacks));
    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(guard.isActive());
    if (guard.needsInputs()) {
      const auto inputs = inputs_ivalue_vec();
      guard.before(
          get_op_name(),
          c10::ArrayRef<const IValue>(inputs.data(), inputs.size()));
    } else {
      guard.before(get_op_name());
    }
    if (has_out_variant()) {
      guard._setStaticRuntimeOutVariant();
    }
    fn_->run(this);
  } else {
    fn_->run(this);
  }
#else
  fn_->run(this);
#endif
#ifndef NDEBUG
  if (FLAGS_static_runtime_disable_debug_memory_overlap_check) {
    // run check but do not enforce
    verify_no_memory_overlap();
  } else {
    DCHECK(verify_no_memory_overlap());
  }
#endif
#ifdef FBCODE_CAFFE2
  SROperatorObserver::onEnd(node());
#endif
}
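// Execution-strategy summary for ProcessedFunction (comment only, added for
// clarity): an out-variant implementation is preferred, then a native
// (non-out) implementation, and only if neither exists does the node fall
// back to the JIT interpreter operation. The fallback follows the usual
// interpreter stack protocol; for a hypothetical node with two inputs and one
// output the lambda above behaves roughly like this sketch:
//
//   std::vector<IValue> stack;            // will hold [in0, in1]
//   stack.emplace_back(pnode->Input(0));
//   stack.emplace_back(pnode->Input(1));
//   node_op(stack);                       // op pops inputs, pushes [out0]
//   pnode->Output(0) = std::move(stack[0]);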
static bool checkNoMemoryOverlap(const at::Tensor& a, const at::Tensor& b) {
  at::MemOverlapStatus status = at::get_overlap_status(a, b);
  if (status == at::MemOverlapStatus::Full ||
      status == at::MemOverlapStatus::Partial) {
    return false;
  }
  if (status == at::MemOverlapStatus::TooHard) {
    VLOG(1) << "Detected TOO_HARD memory overlap status";
  }
  return true;
}

bool ProcessedNode::verify_no_memory_overlap(bool force_check) const {
  const static std::array<c10::Symbol, 7> special_case_ops = {
      fromQualString("prim::TypeCheck"),
      fromQualString("prim::IfThenElse"),
      fromQualString("static_runtime::select_tensor"),
      fromQualString("static_runtime::VarTupleUnpack"),
      fromQualString("static_runtime::dict_unpack"),
      fromQualString("static_runtime::fused_split_and_squeeze"),
      fromQualString("static_runtime::create_owned_ref")};
  if (!force_check &&
      std::find(
          begin(special_case_ops), end(special_case_ops), node()->kind()) !=
          end(special_case_ops)) {
    return true;
  }

  return verify_outputs_dont_overlap_each_other() &&
      verify_inputs_dont_overlap_outputs(force_check);
}

bool ProcessedNode::verify_outputs_dont_overlap_each_other() const {
  const auto n_outputs = static_cast<uint32_t>(num_outputs());
  for (const auto i : c10::irange(n_outputs)) {
    if (!Output(i).isTensor()) {
      continue;
    }
    const auto& out0_t = Output(i).toTensor();
    for (const auto j : c10::irange(i + 1, n_outputs)) {
      if (!Output(j).isTensor()) {
        continue;
      }
      const auto& out1_t = Output(j).toTensor();
      if (!checkNoMemoryOverlap(out0_t, out1_t)) {
        LOG(INFO) << "Node output " << i << " overlaps with output " << j
                  << ", " << PrintNode(node_);
        return false;
      }
    }
  }
  return true;
}

bool ProcessedNode::verify_inputs_dont_overlap_outputs(bool force_check) const {
  auto schema = node()->maybeSchema();
  // skip memory overlap check for mutable or view ops with only one output
  bool skip_check = !schema ||
      ((schema->is_mutable() || !fn_->checkMemoryOverlap()) &&
       num_outputs() == 1);
  if (!schema || (!force_check && skip_check)) {
    if (!schema) {
      VLOG(2) << "Detected that op schema is null";
      return true;
    }
    VLOG(2) << "schema->is_mutable: " << schema->is_mutable()
            << ", fn_->checkMemoryOverlap: " << fn_->checkMemoryOverlap()
            << ", num_outputs_: " << num_outputs();
    return true;
  }

  const auto n_inputs = static_cast<uint32_t>(inputs_.size());
  const auto n_outputs = static_cast<uint32_t>(num_outputs());
  for (const auto i : c10::irange(n_inputs)) {
    const IValue* in = &Input(i);
    if (!in->isTensor()) {
      continue;
    }
    const auto& in_t = in->toTensor();
    for (const auto j : c10::irange(n_outputs)) {
      const IValue& out = Output(j);
      if (!out.isTensor()) {
        continue;
      }
      const auto& out_t = out.toTensor();
      if (!checkNoMemoryOverlap(in_t, out_t)) {
        LOG(INFO) << "Node input " << i << " overlaps with output " << j << ", "
                  << PrintNode(node_);
        LOG(INFO) << *schema;
        return false;
      }
    }
  }
  return true;
}

bool ProcessedNode::check_and_correct_overlap_with(
    const at::Tensor& input,
    c10::IValue& output_ival) {
  auto& tensor = output_ival.toTensor();
  if (!checkNoMemoryOverlap(input, tensor)) {
    DLOG(INFO) << "Detected alias for node: " << PrintNode(node());
    output_ival = at::native::clone(tensor, c10::nullopt);
    set_outputs_memory_overlap_detected();
    return true;
  }
  return false;
}
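// Illustration of what the overlap checks above catch (comment only; the
// tensors and the op producing them are hypothetical): if a node's
// implementation handed back a view of its input, e.g.
//
//   at::Tensor in = at::rand({4, 4});
//   at::Tensor out = in.view({16});   // shares storage with `in`
//
// then at::get_overlap_status(in, out) reports Partial (or Full) overlap, so
// checkNoMemoryOverlap() returns false and check_and_correct_overlap_with()
// replaces the output IValue with a clone that owns its own storage.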
void ProcessedNode::verify_and_correct_memory_overlap() {
  const auto n_inputs = static_cast<uint32_t>(inputs_.size());
  const auto n_outputs = static_cast<uint32_t>(num_outputs());
  for (const auto i : c10::irange(n_inputs)) {
    const IValue& in = Input(i);
    if (!in.isTensor()) {
      continue;
    }
    const auto& in_t = in.toTensor();
    for (const auto j : c10::irange(n_outputs)) {
      auto& output = Output(j);
      if (output.isTensor()) {
        check_and_correct_overlap_with(in_t, output);
      } else if (output.isTensorList()) {
        auto tensors = output.toListRef();
        for (const auto& ival : tensors) {
          // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
          check_and_correct_overlap_with(in_t, const_cast<c10::IValue&>(ival));
        }
#ifdef FBCODE_CAFFE2
        if (outputs_memory_overlap_detected()) {
          LOG_EVERY_MS(WARNING, 60000)
              << "Detected alias for node: " << PrintNode(node());
        }
#endif
      }
    }
  }
}

StaticRuntime::StaticRuntime(const StaticModule& sm)
    : values_(sm.value_buffer_size()) {
  std::copy(sm.constants().begin(), sm.constants().end(), values_.data());
  // default task launcher set to inter-op thread pool
  async_task_launcher_ = at::launch;
  block_ = std::make_unique<BlockRunner>(
      sm,
      values_.data(),
      sm.root_block(),
      &async_task_launcher_,
      true /*is_root_block*/);
}

c10::IValue StaticRuntime::operator()(
    const std::vector<c10::IValue>& args,
    const KeywordArgs& kwargs) {
  return (*block_)(args, kwargs);
}

c10::IValue StaticRuntime::operator()(
    std::vector<c10::IValue>&& args,
    const KeywordArgs& kwargs) {
  return (*block_)(std::move(args), kwargs);
}

c10::intrusive_ptr<c10::ivalue::Future> StaticRuntime::runAsync(
    const std::vector<c10::IValue>& args,
    const KeywordArgs& kwargs,
    torch::jit::TaskLauncher taskLauncher) {
  async_task_launcher_ = std::move(taskLauncher);
  return block_->runAsync(args, kwargs);
}

c10::intrusive_ptr<c10::ivalue::Future> StaticRuntime::runAsync(
    std::vector<c10::IValue>&& args,
    const KeywordArgs& kwargs,
    torch::jit::TaskLauncher taskLauncher) {
  async_task_launcher_ = std::move(taskLauncher);
  return block_->runAsync(std::move(args), kwargs);
}

bool StaticRuntime::check_for_memory_leak(bool output_returned) {
  return block_->check_for_memory_leak(
      output_returned, /* recurse_on_sub_blocks */ true);
}

bool StaticRuntime::checkOutputTensorMemoryLeaks() {
  return block_->checkOutputTensorMemoryLeaks();
}

void StaticRuntime::deallocateOutputTensors() {
  block_->deallocateOutputTensors();
}

bool StaticRuntime::isManagedOutputTensor(const IValue& ivalue) const {
  return block_->isManagedOutputTensor(ivalue);
}

void StaticRuntime::disableManageOutputTensors() {
  block_->disableManageOutputTensors();
}

const MemoryPlanner* StaticRuntime::get_memory_planner() const {
  return block_->get_memory_planner();
}

} // namespace torch::jit
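// Usage sketch for the StaticRuntime wrappers defined above (comment only,
// not part of the build): how the StaticModule `smodule` is constructed is
// outside this file, and the variable names are assumptions.
//
//   torch::jit::StaticRuntime runtime(smodule);
//   std::vector<c10::IValue> args{input_tensor};
//   c10::IValue out = runtime(args, {});                    // synchronous run
//   auto future = runtime.runAsync(args, {}, at::launch);   // asynchronous run
//   future->wait();
//   TORCH_CHECK(runtime.check_for_memory_leak(/*output_returned*/ true));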