[PyTorch] add c10/util/FbcodeMaps.h (#96359)

This allows us to use folly's F14 maps in fbcode and plain std maps in OSS for compatibility, extending the approach Static Runtime already uses.

Differential Revision: [D43926670](https://our.internmc.facebook.com/intern/diff/D43926670/)
Pull Request resolved: https://github.com/pytorch/pytorch/pull/96359
Approved by: https://github.com/ezyang
Authored by Scott Wolchok on 2023-03-08 21:29:40 -08:00; committed by PyTorch MergeBot
commit cc798f1a4f (parent cc699c56dc)
9 changed files with 84 additions and 65 deletions
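The change itself is mechanical: Static Runtime's local FastMap/FastSet aliases are deleted from its header, and call sites are qualified with the shared c10:: aliases instead. An illustrative before/after for one such call site (a sketch of the pattern, not an excerpt of any particular hunk):

    // Before: Static Runtime's own alias, previously defined in namespace torch::jit
    FastMap<const Value*, uint32_t> value_to_index;

    // After: the shared alias from the new c10/util/FbcodeMaps.h
    c10::FastMap<const Value*, uint32_t> value_to_index;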

c10/util/FbcodeMaps.h (new file, 29 lines added)

@@ -0,0 +1,29 @@
#ifndef C10_UTIL_FBCODEMAPS_H_
#define C10_UTIL_FBCODEMAPS_H_
// Map typedefs so that we can use folly's F14 maps in fbcode without
// taking a folly dependency.
#ifdef FBCODE_CAFFE2
#include <folly/container/F14Map.h>
#include <folly/container/F14Set.h>
#else
#include <unordered_map>
#include <unordered_set>
#endif
namespace c10 {
#ifdef FBCODE_CAFFE2
template <typename Key, typename Value>
using FastMap = folly::F14FastMap<Key, Value>;
template <typename Key>
using FastSet = folly::F14FastSet<Key>;
#else
template <typename Key, typename Value>
using FastMap = std::unordered_map<Key, Value>;
template <typename Key>
using FastSet = std::unordered_set<Key>;
#endif
} // namespace c10
#endif // C10_UTIL_FBCODEMAPS_H_
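A minimal usage sketch (hypothetical example, not part of this commit): code that includes the header uses the aliases with the same basic insert/find/size API as std::unordered_map and std::unordered_set, and the identical source compiles against folly's F14 containers in fbcode builds.

    #include <c10/util/FbcodeMaps.h>

    #include <string>
    #include <vector>

    // Count distinct names with the set alias; the loop is unchanged whether the
    // backing container is std::unordered_set or folly::F14FastSet.
    size_t countUnique(const std::vector<std::string>& names) {
      c10::FastSet<std::string> seen;
      for (const auto& name : names) {
        seen.insert(name);
      }
      return seen.size();
    }

    // Same idea for the map alias: operator[] works on both backends.
    void tally(c10::FastMap<std::string, int>& counts, const std::string& key) {
      ++counts[key];
    }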


@@ -133,7 +133,7 @@ auto sr_metadata_registerer = torch::class_<StaticRuntimeMetadata>(
} // namespace
std::string dumpValueSet(
const FastSet<const Value*>& value_set,
const c10::FastSet<const Value*>& value_set,
const char* set_name) {
std::ostringstream oss;
oss << set_name << ": {";
@@ -229,7 +229,7 @@ bool removeSelfFromGraphInput(std::shared_ptr<torch::jit::Graph>& graph) {
return true;
}
std::vector<Value*> valueVecFromFastSet(const FastSet<const Value*>& s) {
std::vector<Value*> valueVecFromFastSet(const c10::FastSet<const Value*>& s) {
std::vector<Value*> result;
result.reserve(s.size());
for (auto* v : s) {
@@ -248,7 +248,7 @@ bool mayContainAlias(const AliasDb& db, const Value* v1, const Value* v2) {
bool mayContainAlias(
const AliasDb& db,
const Value* a,
const FastSet<const Value*>& b) {
const c10::FastSet<const Value*>& b) {
// NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast)
return db.mayContainAlias(const_cast<Value*>(a), valueVecFromFastSet(b));
}
@@ -390,9 +390,9 @@ bool isPureFunction(const Node* node) {
ManagedTensorRanges::ManagedTensorRanges(
Block& block,
const AliasDb& alias_db,
const FastSet<const Value*>& managed_tensor_values) {
const c10::FastSet<const Value*>& managed_tensor_values) {
const std::vector<Node*> nodes(block.nodes().begin(), block.nodes().end());
const FastSet<const Value*> graph_inputs(
const c10::FastSet<const Value*> graph_inputs(
block.inputs().begin(), block.inputs().end());
const auto num_nodes = nodes.size();
@@ -589,7 +589,7 @@ StaticModule::StaticModule(
// Maps each Value* in the graph to its index in the values_ array that will
// eventually be created by StaticRuntime.
FastMap<const Value*, uint32_t> value_to_index;
c10::FastMap<const Value*, uint32_t> value_to_index;
prepareFunctionsAndConstants(graph_->block(), alias_db, value_to_index);
const auto constants_index_offset = 0;
@@ -610,7 +610,7 @@ StaticModule::StaticModule(
size_t StaticModule::prepareBlockInfo(
Block* block,
const size_t start_idx,
FastMap<const Value*, uint32_t>& value_to_index) {
c10::FastMap<const Value*, uint32_t>& value_to_index) {
block_infos_.emplace(block, BlockInfo(start_idx, *block));
const auto num_inputs = block->inputs().size();
@@ -671,7 +671,7 @@ void StaticModule::attachNodeMetadata(Block* block) {
void StaticModule::prepareFunctionsAndConstants(
Block* block,
const AliasDb& alias_db,
FastMap<const Value*, uint32_t>& value_to_index) {
c10::FastMap<const Value*, uint32_t>& value_to_index) {
for (auto* node : block->nodes()) {
for (auto* sub_block : node->blocks()) {
prepareFunctionsAndConstants(sub_block, alias_db, value_to_index);
@@ -702,14 +702,14 @@ void StaticModule::prepareFunctionsAndConstants(
size_t StaticModule::prepareStaticNodeInfos(
Block* block,
const FastMap<const Value*, uint32_t>& value_to_index,
const c10::FastMap<const Value*, uint32_t>& value_to_index,
const AliasDb& alias_db,
size_t node_idx) {
const auto node_start = node_idx;
auto& block_info = block_infos_.at(block);
std::vector<StaticNodeInfo> nodes;
FastMap<Node*, bool> node_has_out_variant;
c10::FastMap<Node*, bool> node_has_out_variant;
for (auto* node : block->nodes()) {
if (node->kind() == prim::Constant) {
@@ -754,7 +754,7 @@ size_t StaticModule::prepareStaticNodeInfos(
void BlockInfo::set_nodes(
std::vector<StaticNodeInfo> nodes,
const FastMap<Node*, bool>& node_has_out_variant) {
const c10::FastMap<Node*, bool>& node_has_out_variant) {
nodes_ = std::move(nodes);
for (auto& node : nodes_) {
@@ -773,7 +773,7 @@ void BlockInfo::prepare_for_memory_planner(
// Never manage graph outputs so that we can do std::move(output_ivalue).
// This does not affect performance if the graph returns a collection object.
FastSet<const Value*> graph_output_values(
c10::FastSet<const Value*> graph_output_values(
block_.outputs().begin(), block_.outputs().end());
// collect register indices of outputs of ops with out variant
@@ -1796,7 +1796,7 @@ bool BlockRunner::check_for_memory_leak(
i,
" was not cleaned up");
}
FastSet<const IValue*> output_ivalues(outputs_.begin(), outputs_.end());
c10::FastSet<const IValue*> output_ivalues(outputs_.begin(), outputs_.end());
for (const auto n : c10::irange(nodes_.size())) {
auto& pnode = nodes_[n];
for (const auto i : c10::irange(pnode.num_outputs())) {


@@ -5,6 +5,7 @@
#include <c10/core/CPUAllocator.h>
#include <c10/macros/Macros.h>
#include <c10/util/ArrayRef.h>
#include <c10/util/FbcodeMaps.h>
#include <c10/util/variant.h>
#include <torch/csrc/jit/api/module.h>
#include <torch/csrc/jit/ir/graph_node_list.h>
@@ -24,23 +25,11 @@
namespace torch {
namespace jit {
#ifdef FBCODE_CAFFE2
template <typename Key, typename Value>
using FastMap = folly::F14FastMap<Key, Value>;
template <typename Key>
using FastSet = folly::F14FastSet<Key>;
#else
template <typename Key, typename Value>
using FastMap = std::unordered_map<Key, Value>;
template <typename Key>
using FastSet = std::unordered_set<Key>;
#endif
TORCH_API bool canEnableStaticRuntime(
const std::shared_ptr<torch::jit::Graph>& graph);
TORCH_API std::string dumpValueSet(
const FastSet<const Value*>& value_set,
const c10::FastSet<const Value*>& value_set,
const char* set_name = "");
TORCH_API inline bool doesNotHeapAllocateWhenStoredInIValue(const Type& type) {
@@ -111,8 +100,8 @@ class ValueGroup {
}
private:
FastSet<const Value*> output_aliases_;
FastSet<const Value*> external_aliases_;
c10::FastSet<const Value*> output_aliases_;
c10::FastSet<const Value*> external_aliases_;
};
class TORCH_API ManagedTensorRanges {
@@ -121,7 +110,7 @@ class TORCH_API ManagedTensorRanges {
ManagedTensorRanges(
Block& block,
const AliasDb& alias_db,
const FastSet<const Value*>& managed_tensor_values);
const c10::FastSet<const Value*>& managed_tensor_values);
// If true, then this node is the last use of at least one
// managed tensor. availableTensorValuesAfterNode(node) will return a vector
@@ -154,9 +143,9 @@
// Maps Node* to the set of managed tensors that are now available
// for re-use after this node.
FastMap<Node*, std::vector<const Value*>> node_to_newly_free_tensors_{};
c10::FastMap<Node*, std::vector<const Value*>> node_to_newly_free_tensors_{};
// Maps each Value* to its lifetime (start node index, end node index)
FastMap<const Value*, Lifetime> value_lifetimes_{};
c10::FastMap<const Value*, Lifetime> value_lifetimes_{};
};
struct TORCH_API StaticModuleOptions {
@@ -277,7 +266,7 @@ class BlockInfo {
void set_nodes(
std::vector<StaticNodeInfo> nodes,
const FastMap<Node*, bool>& node_has_out_variant);
const c10::FastMap<Node*, bool>& node_has_out_variant);
const std::vector<StaticNodeInfo>& nodes() const {
return nodes_;
@@ -357,10 +346,10 @@ class BlockInfo {
ValueGroup value_group_;
FastSet<const Node*> node_is_optimizable_container_type_;
FastSet<const Value*> managed_tensor_values_;
FastSet<const Value*> managed_output_tensor_values_;
FastSet<const Value*> leaked_values_;
c10::FastSet<const Node*> node_is_optimizable_container_type_;
c10::FastSet<const Value*> managed_tensor_values_;
c10::FastSet<const Value*> managed_output_tensor_values_;
c10::FastSet<const Value*> leaked_values_;
ManagedTensorRanges managed_tensor_ranges_{};
@@ -481,12 +470,12 @@ class TORCH_API StaticModule {
size_t prepareBlockInfo(
Block* block,
const size_t start_idx,
FastMap<const Value*, uint32_t>& value_to_index);
c10::FastMap<const Value*, uint32_t>& value_to_index);
void prepareFunctionsAndConstants(
Block* block,
const AliasDb& alias_db,
FastMap<const Value*, uint32_t>& value_to_index);
c10::FastMap<const Value*, uint32_t>& value_to_index);
// Recursively traverse the graph and attach SR metadata
// to the prim::fork nodes as additional attributes
@@ -496,7 +485,7 @@ class TORCH_API StaticModule {
// Returns (number of nodes processed, number of blocks processed)
size_t prepareStaticNodeInfos(
Block* block,
const FastMap<const Value*, uint32_t>& value_to_index,
const c10::FastMap<const Value*, uint32_t>& value_to_index,
const AliasDb& alias_db,
size_t node_idx = 0);
@@ -531,7 +520,7 @@ class TORCH_API StaticModule {
// includes it anyways to be consistent with the JIT interpreter.
size_t num_inputs_;
// See `BlockInfo` definition. The blocks are stored in depth-first order.
FastMap<Block*, BlockInfo> block_infos_;
c10::FastMap<Block*, BlockInfo> block_infos_;
size_t value_buffer_size_ = 0;
};


@@ -24,10 +24,10 @@ bool isUnmanagedSpecialCase(const ProcessedNode& pnode, size_t output_idx) {
pnode.Output(output_idx).isNone();
}
FastMap<const Value*, at::Tensor*> tensorValueToTensor(
c10::FastMap<const Value*, at::Tensor*> tensorValueToTensor(
const std::vector<ProcessedNode>& nodes,
const FastSet<const Value*>& managed_tensor_values) {
FastMap<const Value*, at::Tensor*> tensor_value_to_tensor;
const c10::FastSet<const Value*>& managed_tensor_values) {
c10::FastMap<const Value*, at::Tensor*> tensor_value_to_tensor;
for (auto& pnode : nodes) {
auto* node = pnode.node();
for (const auto output_idx : c10::irange(node->outputs().size())) {
@@ -72,10 +72,10 @@ at::DataPtr allocate_buffer(size_t size) {
std::vector<StorageGroup> assignStorageToManagedTensors(
graph_node_list nodes,
const ManagedTensorRanges& ranges,
const FastMap<const Value*, at::Tensor*>& tensor_value_to_tensor) {
const c10::FastMap<const Value*, at::Tensor*>& tensor_value_to_tensor) {
std::vector<StorageGroup> managed_tensor_groups;
// This set maps each Value* to its assigned storage group.
FastMap<const Value*, size_t> storage_group_mapping;
c10::FastMap<const Value*, size_t> storage_group_mapping;
// On each iteration, this vector stores the set of storage groups that
// are available for re-use.
std::vector<size_t> free_storage_groups;
@@ -137,13 +137,13 @@ std::vector<StorageGroup> assignStorageToManagedTensors(
namespace {
bool setIncludes(const FastSet<const Value*>& set, const Value* v) {
bool setIncludes(const c10::FastSet<const Value*>& set, const Value* v) {
return set.find(v) != set.end();
}
std::vector<std::pair<size_t, at::Tensor*>> assignStorageToOutputTensors(
BlockRunner* block_runner,
const FastSet<const Value*>& managed_output_tensor_values) {
const c10::FastSet<const Value*>& managed_output_tensor_values) {
std::vector<std::pair<size_t, at::Tensor*>> managed_output_tensors;
for (auto& pnode : block_runner->nodes()) {
for (const auto i : c10::irange(pnode.outputs().size())) {
@@ -174,8 +174,8 @@ MemoryPlanner::MemoryPlanner(
const auto& leaked_values = block_info.leaked_values();
// collect unmanaged output ivalues
FastSet<IValue*> unmanaged_ivalues;
FastSet<IValue*> unmanaged_borrowed_ivalues;
c10::FastSet<IValue*> unmanaged_ivalues;
c10::FastSet<IValue*> unmanaged_borrowed_ivalues;
for (ProcessedNode& pnode : block_runner->nodes()) {
const auto borrows_outputs = borrowsOutputs(pnode.node()->kind());
for (const auto i : c10::irange(pnode.outputs().size())) {


@@ -42,7 +42,7 @@ class StorageGroup {
TORCH_API std::vector<StorageGroup> assignStorageToManagedTensors(
graph_node_list nodes,
const ManagedTensorRanges& ranges,
const FastMap<const Value*, at::Tensor*>& tensor_value_to_tensor);
const c10::FastMap<const Value*, at::Tensor*>& tensor_value_to_tensor);
// There are three types of ops in a processed graph in Static Runtime:
// 1. op with _out variant


@@ -393,7 +393,7 @@ bool disableUnsafeMathOp(const char* op_name) {
// not guarantee bit exactness vs the jit interpreter. Note aten::relu is not
// included even though it uses NNC because the results of relu should always
// match.
static const FastSet<std::string> fast_ops{
static const c10::FastSet<std::string> fast_ops{
"aten::add", "aten::tanh", "aten::sigmoid", "aten::logit"};
return fast_ops.count(op_name) > 0;
}
@@ -417,7 +417,7 @@ bool hasVarArgs(Node* n) {
bool canReuseInputsOutputs(
Node* n,
const FastMap<Node*, bool>& node_has_out_variant) {
const c10::FastMap<Node*, bool>& node_has_out_variant) {
auto it = node_has_out_variant.find(n);
if (it != node_has_out_variant.end()) {
return it->second;
@@ -430,7 +430,7 @@ bool canReuseInputsOutputs(
// This means the IValues will not change run to run
bool inputsCanRunOutOfPlace(
Node* n,
const FastMap<Node*, bool>& node_has_out_variant) {
const c10::FastMap<Node*, bool>& node_has_out_variant) {
for (auto* input : n->inputs()) {
if (!canReuseInputsOutputs(input->node(), node_has_out_variant)) {
return false;
@@ -441,7 +441,7 @@ bool inputsCanRunOutOfPlace(
bool isOptimizableContainerType(
Node* n,
const FastMap<Node*, bool>& node_has_out_variant) {
const c10::FastMap<Node*, bool>& node_has_out_variant) {
const auto& type = n->output()->type();
bool is_supported_type = false;
if (type->kind() == TypeKind::ListType) {
@@ -488,7 +488,7 @@ REGISTER_OPERATOR_FUNCTOR(
return nullptr;
}
const bool can_optimize =
isOptimizableContainerType(n, FastMap<Node*, bool>());
isOptimizableContainerType(n, c10::FastMap<Node*, bool>());
const auto& type = n->output()->type()->expectRef<ListType>();
const size_t size = n->inputs().size();
if (!can_optimize) {
@@ -543,7 +543,7 @@ REGISTER_OPERATOR_FUNCTOR(
return nullptr;
}
const bool can_optimize =
isOptimizableContainerType(n, FastMap<Node*, bool>());
isOptimizableContainerType(n, c10::FastMap<Node*, bool>());
const size_t size = n->inputs().size();
if (!can_optimize) {
return [size](ProcessedNode* p_node) {


@@ -148,10 +148,10 @@ bool nativeOpIsRegistered(const c10::Symbol& op_name);
bool canReuseInputsOutputs(
Node* n,
const FastMap<Node*, bool>& node_has_out_variant);
const c10::FastMap<Node*, bool>& node_has_out_variant);
bool isOptimizableContainerType(
Node* n,
const FastMap<Node*, bool>& node_has_out_variant);
const c10::FastMap<Node*, bool>& node_has_out_variant);
SROperator getOutOfPlaceOperation(Node* n);
SROperator getNativeOperation(Node* n);


@@ -668,7 +668,7 @@ void ReplaceWithMaybeCopy(
void ReplaceWithCopyImpl(
std::shared_ptr<Graph>& graph,
const FastMap<c10::Symbol, c10::Symbol>& supported,
const c10::FastMap<c10::Symbol, c10::Symbol>& supported,
const std::vector<std::pair<c10::FunctionSchema, c10::Symbol>>&
supported_schema,
const std::function<bool(Node*)>& f_extra_checks,
@@ -755,7 +755,7 @@ void ReplacePermuteWithCopy(
std::shared_ptr<Graph>& graph,
bool outputs_are_immutable) {
AliasDb db(graph);
const FastMap<c10::Symbol, c10::Symbol> supported = {
const c10::FastMap<c10::Symbol, c10::Symbol> supported = {
#ifdef FBCODE_CAFFE2
OP_PAIR("aten::permute", "static_runtime::permute_copy"),
#endif
@@ -777,7 +777,7 @@ void ReplaceWithCopy(
std::shared_ptr<Graph>& graph,
bool outputs_are_immutable) {
AliasDb db(graph);
const FastMap<c10::Symbol, c10::Symbol> supported = {
const c10::FastMap<c10::Symbol, c10::Symbol> supported = {
#ifdef FBCODE_CAFFE2
OP_PAIR("aten::permute", "static_runtime::permute_copy"),
OP_PAIR("fb::expand_dims", "static_runtime::expand_dims_copy"),
@@ -868,7 +868,7 @@ bool shouldNotFuseListUnpackSpecialCase(const Node* node) {
} // namespace
void FuseListUnpack(std::shared_ptr<torch::jit::Graph>& graph) {
const FastMap<c10::Symbol, c10::Symbol> unfused_to_fused = {
const c10::FastMap<c10::Symbol, c10::Symbol> unfused_to_fused = {
OP_PAIR(
"torcharrow::inference_wrapper_run_flat",
"static_runtime::fused_inference_wrapper_run_flat"),
@@ -1045,7 +1045,8 @@ void CreateOwnedRefsForSpecialValuesHelper(Graph& graph, Block* block) {
auto outputs = block->outputs();
// Create owned refs for inputs. Otherwise, the input cleanup process
// will destroy our outputs before we return.
FastSet<Value*> inputs = {block->inputs().begin(), block->inputs().end()};
c10::FastSet<Value*> inputs = {
block->inputs().begin(), block->inputs().end()};
for (const auto i : c10::irange(outputs.size())) {
auto* output = outputs[i];


@@ -93,8 +93,8 @@ std::mutex& getNNCCacheMutex() {
return nncCacheMutex;
}
FastMap<NodeKind, std::shared_ptr<TEWrapper>>& getNNCCache() {
static FastMap<NodeKind, std::shared_ptr<TEWrapper>> nncCache;
c10::FastMap<NodeKind, std::shared_ptr<TEWrapper>>& getNNCCache() {
static c10::FastMap<NodeKind, std::shared_ptr<TEWrapper>> nncCache;
return nncCache;
}