Summary: More clang-tidy cleanups in `torch/csrc`. This time:

1. `hicpp-use-equals-default` recommends `= default` instead of `{}` for constructors/destructors. This is better practice because it expresses the intent better (https://stackoverflow.com/questions/6502828/what-does-default-mean-after-a-class-function-declaration).
2. `readability-inconsistent-declaration-parameter-name` enforces that parameter names in the declaration match parameter names in the definition. This is generally useful and can prevent confusion and bugs.

Also updated my script a little bit.

apaszke ezyang

Pull Request resolved: https://github.com/pytorch/pytorch/pull/9737
Differential Revision: D9069069
Pulled By: goldsborough
fbshipit-source-id: f7b3f3a4eb4c9fadc30425a153566d3b613a41ae
#include "torch/csrc/jit/graph_executor.h"
|
|
|
|
#include "torch/csrc/jit/assertions.h"
|
|
#include "torch/csrc/autograd/grad_mode.h"
|
|
#include "torch/csrc/jit/argument_spec.h"
|
|
#include "torch/csrc/jit/autodiff.h"
|
|
#include "torch/csrc/jit/interpreter.h"
|
|
#include "torch/csrc/jit/ir.h"
|
|
#include "torch/csrc/jit/tracer.h"
|
|
#include "torch/csrc/jit/passes/batch_mm.h"
|
|
#include "torch/csrc/jit/passes/common_subexpression_elimination.h"
|
|
#include "torch/csrc/jit/passes/create_autodiff_subgraphs.h"
|
|
#include "torch/csrc/jit/passes/dead_code_elimination.h"
|
|
#include "torch/csrc/jit/passes/erase_number_types.h"
|
|
#include "torch/csrc/jit/passes/graph_fuser.h"
|
|
#include "torch/csrc/jit/passes/inplace_check.h"
|
|
#include "torch/csrc/jit/passes/peephole.h"
|
|
#include "torch/csrc/jit/passes/shape_analysis.h"
|
|
#include "torch/csrc/jit/passes/remove_expands.h"
|
|
#include "torch/csrc/jit/passes/decompose_addmm.h"
|
|
#include "torch/csrc/jit/passes/specialize_undef.h"
|
|
#include "torch/csrc/jit/passes/loop_unrolling.h"
|
|
#include "torch/csrc/jit/passes/lower_grad_of.h"
|
|
#include "torch/csrc/jit/symbolic_variable.h"
|
|
#include "torch/csrc/jit/ivalue.h"
|
|
|
|
#include "torch/csrc/autograd/edge.h"
|
|
#include "torch/csrc/autograd/function.h"
|
|
#include "torch/csrc/jit/script/compiler.h"
|
|
|
|
#include <cstdint>
|
|
#include <memory>
|
|
#include <mutex>
|
|
#include <unordered_map>
|
|
#include <utility>
|
|
#include <vector>
|
|
#include <iterator>
|
|
|
|

namespace torch { namespace jit {

namespace {

using tensor_list = std::vector<at::Tensor>;
using Variable = autograd::Variable;
using autograd::variable_list;

// This type is in ExecutionPlan to run its Gradient if it is
// specified. It has a list of inputs captured by ExecutionPlan that
// it concatenates with the incoming inputs to form the full set of inputs to the graph.
// See struct Gradient for a description of how the derivative graph
// is constructed and what variables are captured.
struct ExecutionPlanAutogradFunction : public autograd::Function {
  ExecutionPlanAutogradFunction(GraphExecutor graph, size_t capture_size)
      : graph(std::move(graph)) {
    is_var_capture.reserve(capture_size);
    var_captures.reserve(capture_size);
    ivalue_captures.reserve(capture_size);
  }

  virtual variable_list apply(variable_list&& inputs) override {
    Stack stack;
    stack.reserve(is_var_capture.size() + inputs.size());
    stack.insert(stack.end(), std::make_move_iterator(inputs.begin()),
                              std::make_move_iterator(inputs.end()));
    auto var_capture_it = var_captures.begin();
    auto ivalue_capture_it = ivalue_captures.begin();
    for (bool is_var : is_var_capture) {
      if (is_var) {
        stack.push_back(var_capture_it->unpack(this->shared_from_this()));
        ++var_capture_it;
      } else {
        stack.push_back(*ivalue_capture_it);
        ++ivalue_capture_it;
      }
    }
    graph.run(stack);
    return fmap(stack, [](IValue & val) {
      return autograd::Variable(std::move(val).toTensor());
    });
  }
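
  // capture() records one value that the backward graph will need later:
  // tensors are saved as SavedVariables (so the autograd engine can detect
  // later in-place modification), everything else is kept as a plain IValue.
  // is_var_capture remembers which list each capture went into, so apply()
  // can interleave them back into the stack in their original order.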
  void capture(const IValue & val) {
    const bool is_tensor = val.isTensor();
    is_var_capture.push_back(is_tensor);
    if (is_tensor) {
      var_captures.emplace_back(Variable(val.toTensor()), false);
    } else {
      ivalue_captures.push_back(val);
    }
  }

private:
  friend struct ExecutionPlan;
  GraphExecutor graph;

  // INVARIANT: is_var_capture.size() == var_captures.size() + ivalue_captures.size()
  std::vector<bool> is_var_capture;
  std::vector<autograd::SavedVariable> var_captures;
  std::vector<IValue> ivalue_captures;
};

// An optimized way of executing the subgraph computed directly on
// tensors rather than Variables.
// This will unwrap Variables, run the plan, and re-wrap the results.
// It can optionally also have a gradient which is hooked up
// to the output Variables if present.
struct ExecutionPlan {
  ExecutionPlan(std::shared_ptr<Graph>& graph)
      : f(graph),
        graph(graph),
        num_inputs(graph->inputs().size()),
        num_outputs(graph->outputs().size()) {}
  ExecutionPlan(std::shared_ptr<Graph>& graph, Gradient grad)
      : f(graph),
        graph(graph),
        grad(std::move(grad)),
        grad_executor(this->grad.df),
        num_inputs(graph->inputs().size()),
        num_outputs(graph->outputs().size()) {}

  void run(Stack & stack) const {
    if (grad) {
      return runWithGrad(stack);
    }
    InterpreterState(f).runOneStage(stack);
  }

  std::shared_ptr<Graph> get_graph() const {
    return graph;
  }

  ExecutionPlanState getDebugState() {
    ExecutionPlanState state;
    state.f = &f;
    state.graph = graph.get();
    if (grad) {
      state.grad = &grad;
      state.grad_executor = std::unique_ptr<GraphExecutorState>(
          new GraphExecutorState(grad_executor.getDebugState()));
    } else {
      state.grad = nullptr;
      state.grad_executor.reset();
    }
    return state;
  }

private:
  void detachVariables(Stack & stack) const {
    // It would be nice to use an ArrayRef here, but unfortunately those can only
    // return const references, so we need to do a bunch of indexing ourselves.
    const int64_t stack_size = stack.size();
    const int64_t stack_offset = stack_size - num_inputs;
    for (int64_t i = stack_offset; i < stack_size; ++i) {
      auto & v = stack[i];
      if (!v.isTensor()) continue;
      auto t = std::move(v).toTensor();
      v = IValue{t.defined() ? autograd::as_variable_ref(t).detach() : std::move(t)};
    }
  }
  // Capture (save) inputs that would be required to subsequently run backwards
  void captureInputs(ExecutionPlanAutogradFunction & grad_fn, at::ArrayRef<IValue> inputs) const {
    for (size_t offset : grad.df_input_captured_inputs) {
      grad_fn.capture(inputs[offset]);
    }
  }
  void captureOutputs(ExecutionPlanAutogradFunction & grad_fn, at::ArrayRef<IValue> outputs) const {
    for (size_t offset : grad.df_input_captured_outputs) {
      grad_fn.capture(outputs[offset]);
    }
  }
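
  // runWithGrad executes the forward graph f and wires its outputs into
  // autograd: it creates an ExecutionPlanAutogradFunction for the backward
  // graph df, connects df's outputs to the input Variables that require grad
  // (df_output_vjps), runs f on detached inputs, then attaches grad_fn to the
  // outputs listed in df_input_vjps and captures whatever values df will need
  // later. Temporary outputs that exist only to feed df are popped from the
  // stack before returning.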
  // XXX: keep in mind that stack can be larger than the inputs we need!
  void runWithGrad(Stack & stack) const {
    auto grad_fn = std::make_shared<ExecutionPlanAutogradFunction>(grad_executor,
        grad.df_input_captured_inputs.size() + grad.df_input_captured_outputs.size());

    {
      auto inputs = last(stack, num_inputs);
      // hook up the outputs of df to the gradient functions of the inputs that require gradients
      for(auto idx : grad.df_output_vjps) {
        auto v = Variable(inputs[idx].toTensor());
        grad_fn->add_next_edge(v.gradient_edge());
      }
      captureInputs(*grad_fn, inputs);
    }

    detachVariables(stack);
    InterpreterState(f).runOneStage(stack);

    {
      auto outputs = last(stack, num_outputs);
      // hook up the gradients for the output tensors that require gradients
      // to the inputs to our gradient function df.
      // TODO - XXX - if any output is the same tensor multiple times, views have to be
      // set up here. We need to refactor autograd until it is safe for
      // tensors to be constructed without all the viewing infrastructure.
      // This is currently intentionally not done here so we can get an idea of our
      // perf before introducing overhead for correctness.
      for(auto idx : grad.df_input_vjps) {
        // Note: we have to set this up in place, or we have to throw away and
        // reallocate variables that were already created in wrapTensors. We
        // should add an API for this.
        Variable output = outputs[idx].toTensor();
        autograd::create_gradient_edge(output, grad_fn);
        output.set_requires_grad(true);
      }
      captureOutputs(*grad_fn, outputs);
      // drop the temporary outputs so that we return the same number of
      // outputs as if we were not also calculating gradient
      const size_t num_temporary_outputs = num_outputs - grad.f_real_outputs;
      stack.erase(stack.end() - num_temporary_outputs, stack.end());
    }
  }

  Code f;
  // optimized graph for debugging and testing
  std::shared_ptr<Graph> graph;
  // description of gradient as a graph
  Gradient grad; // if (grad) is false when this is unused
  // executor for df, including code caches
  GraphExecutor grad_executor;

  const size_t num_inputs;
  const size_t num_outputs;
};

} // anonymous namespace

// A Graph can be created via tracing, or via a language-based frontend;
// GraphExecutor runs it. It can run the same graph on many different sizes
// and different requires_grad states, and handles specializations for each situation.
// GraphExecutor is completely unaware of tracing or module parameters to keep the
// tracing concerns separated.
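//
// A minimal usage sketch (illustrative only; the Graph and the input tensor
// are assumed to come from elsewhere, e.g. the tracer or the script compiler):
//
//   std::shared_ptr<Graph> g = ...;          // some traced or scripted graph
//   GraphExecutor executor(g, /*optimize=*/true);
//   Stack stack;
//   stack.push_back(IValue(input_tensor));   // one IValue per graph input
//   executor.run(stack);                     // outputs are left on top of the stack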
struct GraphExecutorImpl {

  GraphExecutorImpl(std::shared_ptr<Graph> graph, bool optimize, bool symbolically_differentiable)
      : graph(std::move(graph))
      , optimize(optimize)
      , num_inputs(this->graph->inputs().size())
      , num_outputs(this->graph->outputs().size())
      , symbolically_differentiable(symbolically_differentiable)
      , may_introduce_gradient(calcMayIntroduceGradient(this->graph->block())) {}
  GraphExecutorImpl(std::shared_ptr<Graph> graph, bool optimize)
      : GraphExecutorImpl(graph, optimize, isDifferentiable(*graph)) {
    for(auto input : graph->inputs()) {
      JIT_ASSERTM(input->type()->kind() != TypeKind::TupleType, "tuples cannot be inputs to the graph");
    }
    for(auto output : graph->outputs()) {
      JIT_ASSERTM(output->type()->kind() != TypeKind::TupleType, "tuples cannot be outputs of the graph");
    }
  }

  // entry point where execution begins
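  // Dispatch depends on the current mode and the inputs:
  //   - under the tracer: run the fallback and splice this graph into the trace
  //   - if optimization is off, or a gradient is needed but the graph is not
  //     symbolically differentiable: run the autograd fallback
  //   - otherwise: compile (and cache) an ExecutionPlan specialized to the inputs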
  void run(Stack & stack) {
    if(stack.size() < num_inputs) {
      std::stringstream ss;
      ss << "expected " << num_inputs << " inputs but got " << stack.size() << " inputs";
      throw std::runtime_error(ss.str());
    }
    auto inputs = last(stack, num_inputs);

    // The tracer has called a graph executor, so there is no need to
    // optimize, but we do need to splice the graph of this executor into the
    // trace. Otherwise we might unroll control-flow operations.
    if(tracer::isTracing()) {
      return runTraced(stack);
    }

    // this is the fallback pathway, when we cannot differentiate
    if(!optimize || (!symbolically_differentiable && needsGradient(inputs))) {
      return runFallback(stack);
    }

    // either we can symbolically differentiate, or we do not need a gradient.
    // go down the route where we treat the inputs as tensors
    // and fully optimize
    auto & implementation = getOrCompile(inputs);
    return implementation.run(stack);
  }

  std::shared_ptr<Graph> graphFor(const Stack& stack) const {
    auto inputs = last(stack, num_inputs);
    ArgumentSpec spec(autograd::GradMode::is_enabled(), inputs);

    if (!optimize || (!symbolically_differentiable && needsGradient(inputs))) {
      JIT_ASSERTM(autograd_fallback_graph, "No graph found for given inputs");
      return autograd_fallback_graph;
    }

    auto it = plan_cache.find(spec);
    JIT_ASSERTM(it != plan_cache.end(), "No graph found for given inputs");
    return it->second.get_graph();
  }

  GraphExecutorState getDebugState() {
    GraphExecutorState state;
    state.graph = graph.get();
    if (autograd_fallback) {
      state.autograd_fallback = &autograd_fallback;
      state.autograd_fallback_graph = autograd_fallback_graph.get();
    } else {
      state.autograd_fallback = nullptr;
      state.autograd_fallback_graph = nullptr;
    }
    for (auto & entry : plan_cache) {
      state.execution_plans.emplace(entry.first, entry.second.getDebugState());
    }
    return state;
  }

private:
  friend struct GraphExecutor;
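
  // runTraced is used when this executor is invoked while a trace is being
  // recorded: it runs the fallback path to compute the real outputs, then
  // inlines this graph (possibly a copy with shapes propagated) into the
  // tracing graph and points the output traces at the inlined values, so
  // that control flow inside this graph is not unrolled into the trace.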
  void runTraced(Stack & stack) {
    auto state = tracer::getTracingState();
    auto inputs = last(stack, num_inputs);
    auto input_values = fmap(inputs, [](const IValue & v) {
      return tracer::getValueTrace(v.toTensor());
    });

    ArgumentSpec spec(autograd::GradMode::is_enabled(), inputs);
    runFallback(stack);

    auto all_dynamic = [](const at::ArrayRef<Value*> xs) {
      for(Value* x : xs) {
        if(x->type()->kind() != TypeKind::DynamicType)
          return false;
      }
      return true;
    };
    // Traces always have types propagated through them, so we make sure to
    // also propagate types through the graph we are inserting here.
    // However, this->graph itself may already have been generated with
    // tracing and so we only do the type propagation if no concrete types have
    // been set.
    auto local_graph = this->graph;
    if(all_dynamic(local_graph->inputs()) && all_dynamic(local_graph->outputs())) {
      local_graph = this->graph->copy();
      PropagateInputShapes(*local_graph, spec);
    }
    auto output_values = script::inlineCallTo(*state->graph, *local_graph, input_values);

    auto outputs = last(stack, num_outputs);
    for (size_t i = 0; i < outputs.size(); ++i) {
      // We can't attach tracing states to scalars, so we have to skip them here
      // TODO: Should we reinterpret them as scalar tensors instead?
      if (!outputs[i].isTensor()) continue;
      tracer::setValueTrace(outputs[i].toTensor(), output_values[i]);
    }
  }

  void runFallback(Stack & stack) {
    auto & fb = getOrCreateAutogradFallback();
    InterpreterState(fb).runOneStage(stack);
  }

  static bool calcMayIntroduceGradient(Block* b) {
    for(Node* n : b->nodes()) {
      if(n->kind() == prim::PythonOp)
        return true;
      for(Block* bb : n->blocks()) {
        if(calcMayIntroduceGradient(bb))
          return true;
      }
    }
    return false;
  }
  bool needsGradient(at::ArrayRef<IValue> inputs) const {
    if (!autograd::GradMode::is_enabled()) {
      return false;
    }
    if (may_introduce_gradient)
      return true;
    for (const IValue & value : inputs) {
      if (!value.isTensor()) continue;
      auto t = value.toTensor();
      if (t.defined() && autograd::as_variable_ref(t).requires_grad())
        return true;
    }
    return false;
  }

  const Code & getOrCreateAutogradFallback() {
    std::lock_guard<std::mutex> lock(compile_mutex);
    if(autograd_fallback) {
      return autograd_fallback;
    }
    auto graph_ = graph->copy();
    runRequiredPasses(graph_);
    if(optimize) {
      if(!symbolically_differentiable) {
        EraseShapeInformation(*graph_);
        CreateAutodiffSubgraphs(*graph_);
      }
      runOptimization(graph_, /*graphMustSupportVariables=*/true);
    }
    autograd_fallback_graph = graph_;
    autograd_fallback = Code(graph_);
    return autograd_fallback;
  }
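  // Compiled plans are cached per ArgumentSpec: inputs with different
  // properties (as encoded by ArgumentSpec, together with whether grad mode
  // is enabled) get their own specialized ExecutionPlan.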
  const ExecutionPlan & getOrCompile(at::ArrayRef<IValue> inputs) {
    // outside lock guard, to minimize the time holding the lock on the fast path
    // ArgumentSpec even computes its hashCode here.
    ArgumentSpec spec(autograd::GradMode::is_enabled(), inputs);
    {
      std::lock_guard<std::mutex> lock(compile_mutex);
      auto it = plan_cache.find(spec);
      if(it != plan_cache.end())
        return it->second;
      auto plan = compileSpec(spec);
      auto r = plan_cache.emplace(std::move(spec), std::move(plan));
      return r.first->second;
    }
  }

  bool argumentSpecRequiresGradient(const ArgumentSpec & spec) {
    for(size_t i = 0; i < spec.size(); ++i) {
      if(spec.at(i).requires_grad())
        return true;
    }
    return false;
  }
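
  // compileSpec builds the ExecutionPlan for one ArgumentSpec: specialize the
  // graph to the spec, then either optimize it directly (when no input
  // requires grad) or symbolically differentiate it first and attach the
  // resulting Gradient so that runWithGrad can hook the outputs into autograd.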
  ExecutionPlan compileSpec(const ArgumentSpec & spec) {
    auto graph_ = graph->copy();

    specializeToSpec(graph_, spec);

    if(!argumentSpecRequiresGradient(spec)) {
      runOptimization(graph_, /*graphMustSupportVariables=*/false);
      return ExecutionPlan(graph_);
    }
    JIT_ASSERT(symbolically_differentiable);

    std::vector<bool> requires_grads;
    requires_grads.reserve(spec.size());
    for(size_t i = 0; i < spec.size(); i++)
      requires_grads.push_back(spec.at(i).requires_grad());

    Gradient gradient = differentiate(graph_, requires_grads);
    graph_ = gradient.f;
    runOptimization(graph_, /*graphMustSupportVariables=*/false);
    return ExecutionPlan(graph_, std::move(gradient));
  }

  // the unoptimized starting graph
  // this is never mutated
  std::shared_ptr<Graph> graph;

  // true - do everything we can to make this graph run fast
  // false - do not modify the graph at all and just use the interpreter
  // to run the graph. Useful for debugging correctness issues in the implementation.
  const bool optimize;
  const size_t num_inputs;
  const size_t num_outputs;

  // GraphExecutor optimizes more aggressively when we _know_ the graph will be
  // symbolically differentiable.
  bool symbolically_differentiable;

  // some ops, including Python operations, can introduce requires_grad=True
  // variables even when none of the inputs to this graph require grad; if
  // the graph includes those operators then needsGradient() must be true
  // regardless of the input state.
  bool may_introduce_gradient;

  // when this graph has some parts that are not symbolically differentiable,
  // but some input does require a derivative, we create and use autograd_fallback,
  // which wraps up the fully differentiable subgraphs, and then runs the outer
  // graph through autograd.
  // Since we can't optimize black box functions anyway, there is only one fallback path,
  // and it must work on all sizes (so no optimizations that inspect sizes can run on it).
  std::shared_ptr<Graph> autograd_fallback_graph;
  Code autograd_fallback;

  // optimizable code paths, used when we can differentiate or when no derivative is needed.
  // Spec describes input conditions, Plan describes how to execute them.
  std::unordered_map<ArgumentSpec, ExecutionPlan> plan_cache;

  // GraphExecutor can be accessed from multiple threads, so
  // any time we are checking or updating the autograd_fallback or
  // plan_cache, we must hold the compile mutex.
  // Along the fast path (no compilation) code should
  // hold this for as little time as possible.
  std::mutex compile_mutex;
};

GraphExecutor::GraphExecutor(std::shared_ptr<Graph> graph, bool optimize)
    : pImpl(new GraphExecutorImpl(std::move(graph), optimize)) {}

GraphExecutor::GraphExecutor(std::shared_ptr<Graph> graph, bool optimize, bool symbolically_differentiable)
    : pImpl(new GraphExecutorImpl(std::move(graph), optimize, symbolically_differentiable)) {}

void GraphExecutor::run(Stack & inputs) {
  return pImpl->run(inputs);
}

std::shared_ptr<Graph> GraphExecutor::graph() const {
  return pImpl->graph;
}

std::shared_ptr<Graph> GraphExecutor::graphFor(const Stack& inputs) const {
  return pImpl->graphFor(inputs);
}

GraphExecutorState GraphExecutor::getDebugState() {
  return pImpl->getDebugState();
}

void runRequiredPasses(const std::shared_ptr<Graph>& g) {
  LowerGradOf(*g);
  // implicitly inserted expand nodes are not necessarily always valid
  // when used inside script methods that might have unstable shapes.
  // We remove the implicitly created ones, and have shape analysis
  // add valid expand nodes when the shapes are stable.
  RemoveExpands(g);
}

void specializeToSpec(const std::shared_ptr<Graph>& graph, const ArgumentSpec& spec) {
  // clean up GradOf and AutogradAdd nodes
  // this must be first because later passes do not know what GradOfs are
  std::vector<bool> defined;
  for(size_t i = 0; i < spec.size(); ++i) {
    defined.push_back(spec.at(i).defined());
  }
  specializeUndef(*graph, defined);

  // required passes shared with autograd fallback
  runRequiredPasses(graph);

  // Decompose addmm nodes to add + mm, so expands can be inserted and
  // gradients accumulated on the backward pass
  //
  // In the future, if we need more passes like this, we should convert this
  // into a generic canonicalization pass.
  DecomposeAddmm(graph);
  // clean up dead constants from specialization
  EliminateDeadCode(graph);
  // calculate all input shapes
  PropagateInputShapes(*graph, spec);
}

void runOptimization(std::shared_ptr<Graph> & graph, bool graphMustSupportVariables) {

  // these optimizations must run in the presence of variables
  // and when shape information is not statically known.
  EliminateDeadCode(graph);
  CheckInplace(graph);
  EliminateCommonSubexpression(graph);

  if (!graphMustSupportVariables) {
    // These optimizations can introduce operators like FusionGroup that
    // do not work on variables.

    // They also may assume that concrete sizes/strides are available.
    UnrollLoops(graph);

    // TODO: create peephole optimizations that are safe to run
    // when we are using variables, and when we do not know sizes.
    PeepholeOptimize(graph);
    // TODO: remove mandatory size checking in BatchMM; otherwise
    // it works fine on variables.
    BatchMM(graph);
    FuseGraph(graph);
  }
}

}} // namespace torch::jit