mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-21 05:34:18 +08:00
Re-landing #68111/#74596 ## Description v0.5 PR of this [RFC](https://github.com/pytorch/pytorch/issues/49444). On the basis of #50256, the below improvements are included: * The [v0.5 release branch](https://github.com/oneapi-src/oneDNN/releases/tag/graph-v0.5) of the oneDNN Graph API is used * The fuser now works with the profiling graph executor. We have inserted type check nodes to guard the profiled tensor properties. ### User API: The optimization pass is disabled by default. Users could enable it by: ``` torch.jit.enable_onednn_fusion(True) ``` `torch.jit.freeze` should be used after tracing (recommended) or scripting a model. ### Performance: [pytorch/benchmark](https://github.com/pytorch/benchmark) tool is used to compare the performance: * SkyLake 8180 (1 socket of 28 cores):  * SkyLake 8180 (single thread):  * By mapping hardswish to oneDNN Graph, it’s 8% faster than PyTorch JIT (NNC + OFI) ** We expect performance gain after mapping transpose, contiguous & view to oneDNN graph ops ### Directory structure of the integration code Fuser-related code is placed under: ``` torch/csrc/jit/codegen/onednn/ ``` Optimization pass registration is done in: ``` torch/csrc/jit/passes/onednn_graph_fuser.h ``` CMake for the integration code is in: ``` caffe2/CMakeLists.txt cmake/public/mkldnn.cmake cmake/Modules/FindMKLDNN.cmake ``` ## Limitations * In this PR, we only support PyTorch-oneDNN-Graph integration on the Linux platform. Support on Windows and macOS will be enabled as a next step. * We have only optimized the inference use-case. Pull Request resolved: https://github.com/pytorch/pytorch/pull/76622 Approved by: https://github.com/eellison
94 lines
2.7 KiB
C++
94 lines
2.7 KiB
C++
#pragma once
|
|
|
|
#include <atomic>
#include <unordered_map>

#include <oneapi/dnnl/dnnl_graph.hpp>

#include <torch/csrc/jit/codegen/onednn/LlgaTensorImpl.h>
#include <torch/csrc/jit/codegen/onednn/graph_helper.h>
#include <torch/csrc/jit/ir/ir.h>
#include <torch/csrc/jit/runtime/interpreter.h>
|
|
|
|
namespace torch {
|
|
namespace jit {
|
|
namespace fuser {
|
|
namespace onednn {
|
|
|
|
// Short aliases used throughout the LLGA (oneDNN Graph) kernel code.
using ArgSpec = LlgaTensorDesc; // describes one input/output tensor of the partition
using ArgSpecs = std::vector<ArgSpec>;
using RunArg = dnnl::graph::tensor; // runtime tensor handle handed to oneDNN Graph
using RunArgs = std::vector<RunArg>;
using TensorArgs = std::vector<at::Tensor>;
|
|
|
|
class LlgaKernel {
|
|
public:
|
|
explicit LlgaKernel(const Node* fusionNode);
|
|
|
|
void run(Stack& stack);
|
|
|
|
void initialize(const TensorArgs& inputs);
|
|
|
|
const std::string& debugName() const {
|
|
return debugName_;
|
|
}
|
|
|
|
private:
|
|
bool useOpaqueLayout(size_t offset) const;
|
|
|
|
// PyTorch copy constants inside the subgraph instead of referencing them.
|
|
// Constants inputs to the partition are no longer in the graph->inputs().
|
|
// Need use the tid retrieved from the partition to find the missing
|
|
// constant inputs.
|
|
void initializeConstantInputs();
|
|
|
|
ArgSpecs initializeInputSpecs(const TensorArgs& inputs);
|
|
|
|
ArgSpecs initializeOutputSpecs() const;
|
|
|
|
dnnl::graph::compiled_partition compile(
|
|
const dnnl::graph::partition& partition);
|
|
|
|
std::map<size_t, int64_t> initializeTensorIdToOccurence() const;
|
|
|
|
std::tuple<RunArgs, RunArgs> prepareRunArgs(
|
|
const TensorArgs& inputs,
|
|
TensorArgs& outputs) const;
|
|
|
|
static std::string genDebugName() {
|
|
static size_t debugId = 0;
|
|
return "LlgaPartition_" + std::to_string(debugId++);
|
|
}
|
|
|
|
static dnnl::graph::logical_tensor toLogicalTensor(const ArgSpec& s) {
|
|
return s.logical_tensor();
|
|
}
|
|
|
|
at::Device device_ = at::kCPU;
|
|
const Node* fusionNode_;
|
|
std::shared_ptr<Graph> graph_;
|
|
int64_t nGraphInputs_ = 0; // number of inputs to graph_ on the IR
|
|
int64_t nOutputs_ = 0;
|
|
std::map<size_t, Value*> tensorIdToValue_;
|
|
std::vector<int64_t> runArgsIdx_;
|
|
dnnl::graph::partition partition_;
|
|
// nPartitionInputs_ is the actual number of inputs to partition_ of graph_
|
|
// needed by the backend.
|
|
// nPartitionInputs_ = nGraphInputs_ + constantInputs_.size() since Constant
|
|
// inputs are copied to the inside of the subgraph
|
|
int64_t nPartitionInputs_;
|
|
dnnl::graph::compiled_partition compilation_;
|
|
std::set<size_t> initializedInputIds_;
|
|
std::vector<Value*> constantValues_;
|
|
TensorArgs constantInputs_;
|
|
ArgSpecs inputSpecs_;
|
|
ArgSpecs outputSpecs_;
|
|
std::vector<dnnl::graph::logical_tensor> constantLogicalTensors_;
|
|
std::string debugName_;
|
|
std::once_flag initialized_flag;
|
|
bool is_initialized_ = false;
|
|
};
|
|
|
|
} // namespace onednn
|
|
} // namespace fuser
|
|
} // namespace jit
|
|
} // namespace torch
|