Summary: Better safe than sorry: we will now throw if memory overlap is detected when using planned tensors while debug mode is enabled, which makes our planning unit tests more robust.
Test Plan: CI
Rollback Plan:
Differential Revision: D77327841
Pull Request resolved: https://github.com/pytorch/pytorch/pull/157290
Approved by: https://github.com/SherlockNoMad, https://github.com/zhxchen17
#pragma once

#include <cstddef>
#include <cstdint>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include <torch/nativert/executor/ExecutionFrame.h>
#include <torch/nativert/executor/ExecutionPlanner.h>
#include <torch/nativert/executor/ExecutorConfig.h>
#include <torch/nativert/executor/OpKernel.h>
#include <torch/nativert/graph/Graph.h>
#include <torch/nativert/graph/GraphSignature.h>

namespace torch::nativert {

struct ProfileMetrics {
  size_t primNodesCount{0};
  size_t staticDispatchNodesCount{0};
  size_t totalNodesCount{0};
  std::vector<float> timePerNode;
  std::unordered_map<std::string, float> timePerNodeType;
  std::unordered_map<std::string, float> percentPerNodeType;
  std::unordered_map<std::string, int> instancesPerNodeType;
  std::unordered_set<std::string> staticDispatchNodes;
  std::unordered_set<std::string> primNodes;
  float totalTime{0};
};
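
// A minimal, hypothetical sketch of how the metrics returned by
// benchmarkIndividualNodes() (declared below) might be consumed. `executor`,
// `frame`, and `inputs` are placeholders for illustration only and are not
// defined in this header:
//
//   ProfileMetrics metrics = executor.benchmarkIndividualNodes(
//       frame, inputs, /*warmup_runs=*/10, /*main_runs=*/100);
//   for (const auto& [nodeType, pct] : metrics.percentPerNodeType) {
//     std::cout << nodeType << ": " << pct << "% ("
//               << metrics.instancesPerNodeType[nodeType] << " instances)\n";
//   }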

/**
 * GraphExecutor is a lightweight abstraction to execute a graph with
 * execution frames without actually owning the graph or the weights. This is
 * introduced to decouple the state management of the top-level runtime from
 * the kernel executions so that subgraphs from higher-order ops can be
 * supported.
 */
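//
// A rough usage sketch, assuming a concrete subclass (e.g. a serial executor)
// and a helper `makeKernels()`; both names are illustrative assumptions, not
// declarations from this header:
//
//   std::vector<std::unique_ptr<OpKernel>> kernels = makeKernels(graph);
//   SerialGraphExecutor executor(graph, std::move(kernels), config);
//   std::vector<c10::IValue> outputs =
//       executor.execute(frame, std::move(inputs));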
class GraphExecutorBase {
 public:
  GraphExecutorBase(
      const Graph& graph,
      std::vector<std::unique_ptr<OpKernel>> nodeKernels,
      const ExecutorConfig& executorConfig);
  virtual ~GraphExecutorBase() = default;

  const Graph& graph() const {
    return graph_;
  }

  // This API only returns the flattened UserOutputs and is intended to be
  // used for the inference path.
  virtual std::vector<c10::IValue> execute(
      ExecutionFrame& frame,
      std::vector<c10::IValue> inputs) = 0;
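
  // Variant of execute() for callers that have already populated the frame's
  // inputs, hence no flattened input list is taken.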
  virtual std::vector<c10::IValue> executeWithPrefilledFrame(
      ExecutionFrame& frame) = 0;

  ProfileMetrics benchmarkIndividualNodes(
      ExecutionFrame& executionFrame,
      const std::vector<std::vector<c10::IValue>>& inputs,
      const uint32_t warmup_runs,
      const uint32_t main_runs);

  std::vector<std::unique_ptr<OpKernel>> stealKernels() {
    return std::move(nodeKernels_);
  }

  void setKernels(std::vector<std::unique_ptr<OpKernel>>&& kernels) {
    nodeKernels_ = std::move(kernels);
  }
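
  // A hypothetical round trip using the two accessors above; the rewrite
  // step is purely illustrative and not part of this header:
  //
  //   auto kernels = executor.stealKernels();   // executor gives up ownership
  //   rewriteKernels(kernels);                  // hypothetical transformation
  //   executor.setKernels(std::move(kernels));  // hand ownership back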

 protected:
  void fillUserInputs(ExecutionFrame& frame, std::vector<c10::IValue> inputs);

  const Graph& graph_;

  // cache of the constructed kernels to avoid reconstruction per execution
  std::vector<std::unique_ptr<OpKernel>> nodeKernels_;

  const ExecutorConfig& executorConfig_;

  std::unique_ptr<ExecutionPlan> execPlan_;
};

} // namespace torch::nativert