Integrate XNNPACK with custom class for packing weights. (#34047)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/34047

This PR integrates the added XNNPACK conv2d and linear ops via custom class registration for packed weights. The packed struct is serializable.

Test Plan: python test/test_xnnpack_integration.py

Imported from OSS

Differential Revision: D20185657

fbshipit-source-id: fc7e692d8f913e493b293b02d92f4e78536d7698
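For orientation, a minimal sketch of the op surface this change adds, mirroring what the test plan exercises; a build with USE_XNNPACK=1 is assumed and the shapes are illustrative:

    import torch
    import torch.backends.xnnpack

    # Only available in builds compiled with USE_XNNPACK=1; see the
    # torch.backends.xnnpack module added below.
    assert torch.backends.xnnpack.enabled

    x = torch.rand(2, 8)
    w = torch.rand(4, 8)
    b = torch.rand(4)

    # Pack the weights once; the result is an XNNPackLinearOpContext custom
    # class, which the def_pickle registration below makes serializable.
    packed = torch.ops._xnnpack.linear_prepack(w, b)
    y = torch.ops._xnnpack.linear_packed(x, packed)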
Committed by: Facebook GitHub Bot
Parent: e23a9dc140
Commit: 4c30fc7238
@@ -141,6 +141,14 @@ const std::vector<at::QEngine>& Context::supportedQEngines() const {
  return supported_qengines;
}

bool Context::isXNNPACKAvailable() const {
#ifdef USE_XNNPACK
  return true;
#else
  return false;
#endif
}

bool Context::setFlushDenormal(bool on) {
  return at::cpu::set_flush_denormal(on);
}

@@ -109,6 +109,7 @@ class CAFFE2_API Context {
  at::QEngine qEngine() const;
  void setQEngine(at::QEngine e);
  const std::vector<at::QEngine>& supportedQEngines() const;
  bool isXNNPACKAvailable() const;

 private:
  void initCUDAIfNeeded(DeviceType p) {

@@ -778,10 +778,6 @@

- func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor

- func: _conv2d_prepack(Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1, float? output_min=None, float? output_max=None) -> Tensor

- func: _conv2d_packed(Tensor packed_weight, Tensor input) -> Tensor

- func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1, int groups=1) -> Tensor

- func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int pad=0) -> Tensor
@@ -1577,10 +1573,6 @@
- func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
  python_module: nn

- func: _linear_prepack(Tensor weight, Tensor? bias=None, float? output_min=None, float? output_max=None) -> Tensor

- func: _linear_packed(Tensor packed_weight, Tensor input) -> Tensor

- func: mkldnn_linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
  python_module: nn
  dispatch:

@@ -9,6 +9,53 @@
namespace at {
namespace native {
namespace xnnpack {

struct Deleter final {
  void operator()(const xnn_operator_t op) const {
    xnn_delete_operator(op);
  }
};

using Operator = std::unique_ptr<xnn_operator, Deleter>;

struct ContextLinear final {
  Operator op;
  int64_t output_channels;

  ContextLinear() = delete;

  ContextLinear(Operator&& o, int64_t o_channels) {
    op = std::move(o);
    output_channels = o_channels;
  }
  static constexpr float kMin = -std::numeric_limits<float>::infinity();
  static constexpr float kMax = std::numeric_limits<float>::infinity();
};

struct ContextConv2D final {
  Operator op;
  std::array<int64_t, 4> weight_size_;
  std::array<int64_t, 2> padding_;
  std::array<int64_t, 2> stride_;
  std::array<int64_t, 2> dilation_;

  ContextConv2D() = delete;

  ContextConv2D(
      Operator&& o,
      std::array<int64_t, 4> weight_size,
      std::array<int64_t, 2> padding,
      std::array<int64_t, 2> stride,
      std::array<int64_t, 2> dilation)
      : op(std::move(o)),
        weight_size_(weight_size),
        padding_(padding),
        stride_(stride),
        dilation_(dilation) {}
  static constexpr float kMin = -std::numeric_limits<float>::infinity();
  static constexpr float kMax = std::numeric_limits<float>::infinity();
};

namespace internal {

struct Layout final {
@@ -64,14 +111,6 @@ struct Layout final {
  };
};

struct Deleter final {
  void operator()(const xnn_operator_t op) const {
    xnn_delete_operator(op);
  }
};

using Operator = std::unique_ptr<xnn_operator, Deleter>;

bool available();

} // namespace internal

@@ -1,10 +1,10 @@
#ifdef USE_XNNPACK

#include <ATen/cpp_custom_type_hack.h>
#include <ATen/native/xnnpack/Common.h>
#include <ATen/native/ConvUtils.h>
#include <ATen/native/utils/ParamUtils.h>
#include <ATen/native/xnnpack/Common.h>
#include <ATen/native/xnnpack/Factory.h>
#include <ATen/native/xnnpack/Convolution.h>

namespace at {
namespace native {
@@ -12,18 +12,6 @@ namespace xnnpack {
namespace internal {
namespace convolution2d {

struct Context final {
  Operator convolution_op;

  std::vector<int64_t> weight_size;
  std::vector<int64_t> padding;
  std::vector<int64_t> stride;
  std::vector<int64_t> dilation;

  static constexpr float kMin = -std::numeric_limits<float>::infinity();
  static constexpr float kMax = std::numeric_limits<float>::infinity();
};

namespace {

// Supports NHWC and NCHW FP32 convolutions with any valid
@@ -79,72 +67,6 @@ bool available(
      true;
}

Context create(
    const Tensor& weight,
    const c10::optional<Tensor>& bias,
    const IntArrayRef padding_,
    const IntArrayRef stride_,
    const IntArrayRef dilation_,
    const int64_t groups,
    const float output_min,
    const float output_max) {
  const auto padding = expand_param_if_needed(padding_, "padding", 2);
  const auto stride = expand_param_if_needed(stride_, "stride", 2);
  const auto dilation = expand_param_if_needed(dilation_, "dilation", 2);
  const Tensor weight_nhwc = weight.contiguous(MemoryFormat::ChannelsLast);

  TORCH_CHECK(
      available(
          weight_nhwc,
          bias,
          padding,
          stride,
          dilation,
          groups,
          output_min,
          output_max),
      "XNNPACK Convolution not available! "
      "Reason: The provided (weight, bias, padding, stride, dilation, groups, output_min, output_max) "
      "parameters are either invalid individually or their combination is not supported by XNNPACK.");

  xnn_operator_t convolution_op{};

  const xnn_status create_status = xnn_create_convolution2d_nhwc_f32(
      padding[Layout::Parameter::height], // input_padding_top
      padding[Layout::Parameter::width], // input_padding_right
      padding[Layout::Parameter::height], // input_padding_bottom
      padding[Layout::Parameter::width], // input_padding_left
      weight_nhwc.size(Layout::Filter::height), // kernel_height
      weight_nhwc.size(Layout::Filter::width), // kernel_width
      stride[Layout::Parameter::height], // subsampling_height
      stride[Layout::Parameter::width], // subsampling_width
      dilation[Layout::Parameter::height], // dilation_height
      dilation[Layout::Parameter::width], // dilation_width
      groups, // groups
      weight_nhwc.size(Layout::Filter::input), // group_input_channels
      weight_nhwc.size(Layout::Filter::output) / groups, // group_output_channels
      weight_nhwc.size(Layout::Filter::input) * groups, // input_pixel_stride
      weight_nhwc.size(Layout::Filter::output), // output_pixel_stride
      weight_nhwc.data_ptr<float>(), // kernel
      (bias && bias->defined()) ? bias->data_ptr<float>() : nullptr, // bias
      output_min, // output_min
      output_max, // output_max
      0u, // flags
      &convolution_op); // operator

  TORCH_CHECK(
      xnn_status_success == create_status,
      "xnn_create_convolution2d_nhwc_f32 failed!");

  return Context{
      Operator(convolution_op),
      weight_nhwc.sizes().vec(),
      padding,
      stride,
      dilation,
  };
}

// TODO: Decouple and improve error handling and messages.
bool usable(const Tensor& input) {
  // Input
@@ -159,42 +81,43 @@ bool usable(const Tensor& input) {
}

Tensor run(
    const Context& context,
    const ContextConv2D& context,
    const Tensor& input) {
  using namespace internal;

  const Tensor input_nhwc = input.contiguous(MemoryFormat::ChannelsLast);
  const Tensor padded_input_nhwc = allocate_padded_if_needed(input_nhwc);

  TORCH_CHECK(
      usable(input_nhwc),
      usable(padded_input_nhwc),
      "XNNPACK Convolution not usable! "
      "Reason: The provided input tensor is either invalid or unsupported by XNNPACK.");

  Tensor output = empty_with_tail_padding(
      conv_output_size(
          input_nhwc.sizes(),
          context.weight_size,
          context.padding,
          context.stride,
          context.dilation),
      input_nhwc.options().dtype(),
          padded_input_nhwc.sizes(),
          context.weight_size_,
          context.padding_,
          context.stride_,
          context.dilation_),
      padded_input_nhwc.options().dtype(),
      MemoryFormat::ChannelsLast);

  const xnn_status setup_status = xnn_setup_convolution2d_nhwc_f32(
      context.convolution_op.get(), // operator
      input_nhwc.size(Layout::Activation4D::batch), // batch_size
      input_nhwc.size(Layout::Activation4D::height), // input_height
      input_nhwc.size(Layout::Activation4D::width), // input_width
      input_nhwc.data_ptr<float>(), // input
      output.data_ptr<float>(), // output
      nullptr); // threadpool
      context.op.get(), // operator
      padded_input_nhwc.size(Layout::Activation4D::batch), // batch_size
      padded_input_nhwc.size(Layout::Activation4D::height), // input_height
      padded_input_nhwc.size(Layout::Activation4D::width), // input_width
      padded_input_nhwc.data_ptr<float>(), // input
      output.data_ptr<float>(), // output
      nullptr); // threadpool

  TORCH_CHECK(
      xnn_status_success == setup_status,
      "xnn_setup_convolution2d_nhwc_f32 failed!");

  const xnn_status run_status = xnn_run_operator(
      context.convolution_op.get(), // operator
      context.op.get(), // operator
      nullptr); // threadpool

  TORCH_INTERNAL_ASSERT(
@@ -228,6 +151,101 @@ Tensor create_and_run(
}

} // namespace

ContextConv2D create(
    const Tensor& weight,
    const c10::optional<Tensor>& bias,
    const IntArrayRef padding,
    const IntArrayRef stride,
    const IntArrayRef dilation,
    const int64_t groups,
    const float output_min,
    const float output_max) {
  const auto padding_expanded = expand_param_if_needed(padding, "padding", 2);
  const auto stride_expanded = expand_param_if_needed(stride, "stride", 2);
  const auto dilation_expanded = expand_param_if_needed(dilation, "dilation", 2);
  const Tensor weight_nhwc = weight.contiguous(MemoryFormat::ChannelsLast);

  TORCH_CHECK(
      available(
          weight_nhwc,
          bias,
          padding_expanded,
          stride_expanded,
          dilation_expanded,
          groups,
          output_min,
          output_max),
      "xnnpack::convolution not available! "
      "Reason: The provided (weight, bias, padding, stride, dilation, groups, output_min, output_max) "
      "parameters are either invalid individually or their combination is not supported by XNNPACK.");

  xnn_operator_t convolution_op{};

  const xnn_status create_status = xnn_create_convolution2d_nhwc_f32(
      padding_expanded[Layout::Parameter::height], // input_padding_top
      padding_expanded[Layout::Parameter::width], // input_padding_right
      padding_expanded[Layout::Parameter::height], // input_padding_bottom
      padding_expanded[Layout::Parameter::width], // input_padding_left
      weight_nhwc.size(Layout::Filter::height), // kernel_height
      weight_nhwc.size(Layout::Filter::width), // kernel_width
      stride_expanded[Layout::Parameter::height], // subsampling_height
      stride_expanded[Layout::Parameter::width], // subsampling_width
      dilation_expanded[Layout::Parameter::height], // dilation_height
      dilation_expanded[Layout::Parameter::width], // dilation_width
      groups, // groups
      weight_nhwc.size(Layout::Filter::input), // group_input_channels
      weight_nhwc.size(Layout::Filter::output) / groups, // group_output_channels
      weight_nhwc.size(Layout::Filter::input) * groups, // input_pixel_stride
      weight_nhwc.size(Layout::Filter::output), // output_pixel_stride
      weight_nhwc.data_ptr<float>(), // kernel
      (bias && bias->defined()) ? bias->data_ptr<float>() : nullptr, // bias
      output_min, // output_min
      output_max, // output_max
      0u, // flags
      &convolution_op); // operator

  TORCH_CHECK(
      xnn_status_success == create_status,
      "xnn_create_convolution2d_nhwc_f32 failed!");

  return ContextConv2D{
      Operator(convolution_op),
      {weight_nhwc.sizes()[0], weight_nhwc.sizes()[1],
       weight_nhwc.sizes()[2], weight_nhwc.sizes()[3]},
      {padding_expanded[0], padding_expanded[1]},
      {stride_expanded[0], stride_expanded[1]},
      {dilation_expanded[0], dilation_expanded[1]}
  };
}

c10::intrusive_ptr<xnnpack::XNNPackConv2dOpContext> Conv2dPrePack::operator()(
    Tensor weight,
    c10::optional<Tensor> bias,
    std::vector<int64_t> stride,
    std::vector<int64_t> padding,
    std::vector<int64_t> dilation,
    int64_t groups
) {
  return xnnpack::XNNPackConv2dOpContext::create_context(
      std::move(weight),
      std::move(bias),
      std::move(padding),
      std::move(stride),
      std::move(dilation),
      groups,
      {},
      {});
}

Tensor Conv2dPacked::operator()(
    const Tensor& input,
    const c10::intrusive_ptr<xnnpack::XNNPackConv2dOpContext>& op_context) {
  return
      xnnpack::internal::convolution2d::run(
          (op_context.get())->get_context(), input);
}

} // namespace convolution2d
} // namespace internal

@@ -246,8 +264,8 @@ bool use_convolution2d(
      stride,
      dilation,
      groups,
      internal::convolution2d::Context::kMin,
      internal::convolution2d::Context::kMax) &&
      ContextConv2D::kMin,
      ContextConv2D::kMax) &&
    internal::convolution2d::usable(input);
}

@@ -267,50 +285,13 @@ Tensor convolution2d(
      stride,
      dilation,
      groups,
      internal::convolution2d::Context::kMin,
      internal::convolution2d::Context::kMax);
      ContextConv2D::kMin,
      ContextConv2D::kMax);
}

} // namespace xnnpack

at::Tensor _conv2d_prepack(
    const Tensor& weight,
    const Tensor& bias,
    const IntArrayRef stride,
    const IntArrayRef padding,
    const IntArrayRef dilation,
    const int64_t groups,
    const c10::optional<double> output_min,
    const c10::optional<double> output_max) {
  return cpp_custom_type_hack::create(
      std::make_unique<xnnpack::internal::convolution2d::Context>(
          xnnpack::internal::convolution2d::create(
              weight,
              bias,
              padding.vec(),
              stride.vec(),
              dilation.vec(),
              groups,
              output_min ? *output_min : xnnpack::internal::convolution2d::Context::kMin,
              output_max ? *output_max : xnnpack::internal::convolution2d::Context::kMax)),
      weight.options());
}

at::Tensor _conv2d_packed(
    const Tensor& packed_weight,
    const Tensor& input) {
  return xnnpack::internal::convolution2d::run(
      cpp_custom_type_hack::cast<xnnpack::internal::convolution2d::Context>(packed_weight),
      input);
}

} // namespace native
} // namespace at

namespace caffe2 {

CAFFE_KNOWN_TYPE(at::native::xnnpack::internal::convolution2d::Context);

} // namespace caffe2

#endif /* USE_XNNPACK */

aten/src/ATen/native/xnnpack/Convolution.h (new file, 49 lines)
@@ -0,0 +1,49 @@
#pragma once

#ifdef USE_XNNPACK

#include <ATen/Tensor.h>
#include <ATen/native/xnnpack/Common.h>
#include <ATen/native/xnnpack/OpContext.h>

namespace at {
namespace native {
namespace xnnpack {
namespace internal {
namespace convolution2d {

class Conv2dPrePack final : public torch::OperatorKernel {
 public:
  c10::intrusive_ptr<xnnpack::XNNPackConv2dOpContext> operator()(
      Tensor weight,
      c10::optional<Tensor> bias,
      std::vector<int64_t> padding,
      std::vector<int64_t> stride,
      std::vector<int64_t> dilation,
      int64_t groups);
};

class Conv2dPacked final : public torch::OperatorKernel {
 public:
  Tensor operator()(
      const Tensor& input,
      const c10::intrusive_ptr<xnnpack::XNNPackConv2dOpContext>& op_context);
};

ContextConv2D create(
    const Tensor& weight,
    const c10::optional<Tensor>& bias,
    const IntArrayRef padding,
    const IntArrayRef stride,
    const IntArrayRef dilation,
    const int64_t groups,
    const float output_min,
    const float output_max);

} // namespace convolution2d
} // namespace internal
} // namespace xnnpack
} // namespace native
} // namespace at

#endif /* USE_XNNPACK */
@@ -8,11 +8,16 @@ namespace native {
namespace xnnpack {
namespace internal {

GuardingAllocator<0u, XNN_EXTRA_BYTES>* get_guarding_allocator() {
  static GuardingAllocator<0u, XNN_EXTRA_BYTES> allocator;
  return &allocator;
}

Tensor empty_with_tail_padding(
    const IntArrayRef size,
    const caffe2::TypeMeta dtype,
    const c10::MemoryFormat memory_format) {
  static GuardingAllocator<0u, XNN_EXTRA_BYTES> allocator;
  auto* allocator_ptr = get_guarding_allocator();

  const int64_t nelements = prod_intlist(size);

@@ -21,8 +26,8 @@ Tensor empty_with_tail_padding(
      c10::Storage{
          dtype,
          nelements,
          allocator.allocate(nelements * dtype.itemsize()),
          &allocator,
          allocator_ptr->allocate(nelements * dtype.itemsize()),
          allocator_ptr,
          /*resizable=*/true,
      },
      DispatchKeySet{DispatchKey::CPUTensorId}));
@@ -30,6 +35,19 @@
  return tensor.resize_(size, memory_format);
}

Tensor allocate_padded_if_needed(const Tensor& input_contig) {
  const auto* allocator = input_contig.storage().allocator();
  const auto* guarding_allocator = get_guarding_allocator();
  if (allocator == guarding_allocator) {
    return input_contig;
  }
  Tensor padded_input =
      empty_with_tail_padding(input_contig.sizes(), input_contig.options().dtype(),
                              input_contig.suggest_memory_format());
  padded_input.copy_(input_contig);
  return padded_input;
}

} // namespace internal
} // namespace xnnpack
} // namespace native

@@ -9,6 +9,8 @@ namespace native {
namespace xnnpack {
namespace internal {

Tensor allocate_padded_if_needed(const Tensor& input_contig);

// TODO: Remove this function when at::native::empty() is modified to accept a
// custom memory allocator.

@@ -1,8 +1,8 @@
#ifdef USE_XNNPACK

#include <ATen/cpp_custom_type_hack.h>
#include <ATen/native/xnnpack/Common.h>
#include <ATen/native/xnnpack/Factory.h>
#include <ATen/native/xnnpack/Linear.h>

namespace at {
namespace native {
@@ -10,17 +10,6 @@ namespace xnnpack {
namespace internal {
namespace linear {

struct Context final {
  Operator linear_op;

  struct Output final {
    int64_t channels;
  } output;

  static constexpr float kMin = -std::numeric_limits<float>::infinity();
  static constexpr float kMax = std::numeric_limits<float>::infinity();
};

namespace {

// Supports NHWC and NCHW FP32 linear operators.
@@ -33,62 +22,19 @@ bool available(
    const float output_max) {
  // XNNPACK
  return xnnpack::internal::available() &&
      // Weight
      (2 == weight.ndimension()) &&
      (c10::DeviceType::CPU == weight.device().type()) &&
      (kFloat == weight.scalar_type()) &&
      // Bias
      ((bias && bias->defined()) ? ((1 == bias->ndimension()) &&
          (c10::DeviceType::CPU == bias->device().type()) &&
          (kFloat == bias->scalar_type()) &&
          (weight.size(Layout::Filter::output)) == bias->size(0))
        : true) &&
      // Output Min / Max
      (output_max > output_min) &&
      true;
}

Context create(
    const Tensor& weight,
    const c10::optional<Tensor>& bias,
    const float output_min,
    const float output_max) {
  const Tensor weight_contig = weight.contiguous();

  TORCH_CHECK(
      available(
          weight_contig,
          bias,
          output_min,
          output_max),
      "XNNPACK Linear not available! "
      "Reason: The provided (weight, bias, output_min, output_max) parameters are "
      "either invalid individually or their combination is not supported by XNNPACK.");

  xnn_operator_t linear_op{};

  const xnn_status create_status = xnn_create_fully_connected_nc_f32(
      weight_contig.size(Layout::Filter::input), // input_channels
      weight_contig.size(Layout::Filter::output), // output_channels
      weight_contig.size(Layout::Filter::input), // input_pixel_stride
      weight_contig.size(Layout::Filter::output), // output_pixel_stride
      weight_contig.data_ptr<float>(), // kernel
      (bias && bias->defined()) ? bias->data_ptr<float>() : nullptr, // bias
      output_min, // output_min
      output_max, // output_max
      0u, // flags
      &linear_op); // operator

  TORCH_CHECK(
      xnn_status_success == create_status,
      "xnn_create_fully_connected_nc_f32 failed!");

  return Context{
      Operator(linear_op),
      {
          weight_contig.size(Layout::Filter::output),
      }
  };
      // Weight
      (2 == weight.ndimension()) &&
      (c10::DeviceType::CPU == weight.device().type()) &&
      (kFloat == weight.scalar_type()) &&
      // Bias
      ((bias && bias->defined()) ? ((1 == bias->ndimension()) &&
          (c10::DeviceType::CPU == bias->device().type()) &&
          (kFloat == bias->scalar_type()) &&
          (weight.size(Layout::Filter::output)) == bias->size(0))
        : true) &&
      // Output Min / Max
      (output_max > output_min) &&
      true;
}

// TODO: Decouple and improve error handling and messages.
@@ -101,30 +47,30 @@ bool usable(const Tensor& input) {
}

Tensor run(
    const Context& context,
    const ContextLinear& context,
    const Tensor& input) {
  using namespace internal;

  const Tensor& input_contig = input.contiguous();
  const Tensor padded_input = allocate_padded_if_needed(input.contiguous());

  TORCH_CHECK(
      usable(input_contig),
      usable(padded_input),
      "XNNPACK Linear not usable! "
      "Reason: The provided input tensor is either invalid or unsupported by XNNPACK.");

  const IntArrayRef input_size = input_contig.sizes();
  const IntArrayRef input_size = padded_input.sizes();
  std::vector<int64_t> output_size(input_size.cbegin(), input_size.cend());
  output_size.back() = context.output.channels;
  output_size.back() = context.output_channels;

  Tensor output = empty_with_tail_padding(
      output_size,
      input_contig.options().dtype(),
      input_contig.suggest_memory_format());
      padded_input.options().dtype(),
      padded_input.suggest_memory_format());

  const xnn_status setup_status = xnn_setup_fully_connected_nc_f32(
      context.linear_op.get(), // operator
      Layout::ActivationND::batch(input_contig.sizes()), // Batch,
      input_contig.data_ptr<float>(), // input
      context.op.get(), // operator
      Layout::ActivationND::batch(padded_input.sizes()), // Batch,
      padded_input.data_ptr<float>(), // input
      output.data_ptr<float>(), // output
      nullptr); // threadpool

@@ -133,7 +79,7 @@ Tensor run(
      "xnn_setup_fully_connected_nc_f32 failed!");

  const xnn_status run_status = xnn_run_operator(
      context.linear_op.get(), // operator
      context.op.get(), // operator
      nullptr); // threadpool

  TORCH_INTERNAL_ASSERT(
@@ -159,6 +105,63 @@ Tensor create_and_run(
}

} // namespace

ContextLinear create(
    const Tensor& weight,
    const c10::optional<Tensor>& bias,
    const float output_min,
    const float output_max) {
  const Tensor weight_contig = weight.contiguous();

  TORCH_CHECK(
      available(
          weight_contig,
          bias,
          output_min,
          output_max),
      "XNNPACK Linear not available! "
      "Reason: The provided (weight, bias, output_min, output_max) parameters are "
      "either invalid individually or their combination is not supported by XNNPACK.");

  xnn_operator_t linear_op{};

  const xnn_status create_status = xnn_create_fully_connected_nc_f32(
      weight_contig.size(Layout::Filter::input), // input_channels
      weight_contig.size(Layout::Filter::output), // output_channels
      weight_contig.size(Layout::Filter::input), // input_pixel_stride
      weight_contig.size(Layout::Filter::output), // output_pixel_stride
      weight_contig.data_ptr<float>(), // kernel
      (bias && bias->defined()) ? bias->data_ptr<float>() : nullptr, // bias
      output_min, // output_min
      output_max, // output_max
      0u, // flags
      &linear_op); // operator

  TORCH_CHECK(
      xnn_status_success == create_status,
      "xnn_create_fully_connected_nc_f32 failed!");

  return ContextLinear(
      Operator(linear_op),
      weight_contig.size(Layout::Filter::output)
  );
}

c10::intrusive_ptr<xnnpack::XNNPackLinearOpContext>
LinearPrePack::operator()(
    Tensor weight,
    c10::optional<Tensor> bias) {
  return xnnpack::XNNPackLinearOpContext::create_context(
      std::move(weight), std::move(bias), {}, {});
}

Tensor LinearPacked::operator()(
    const Tensor& input,
    const c10::intrusive_ptr<xnnpack::XNNPackLinearOpContext>& op_context) {
  return
      xnnpack::internal::linear::run((op_context.get())->get_context(), input);
}

} // namespace linear
} // namespace internal

@@ -169,8 +172,8 @@ bool use_linear(
  return internal::linear::available(
      weight,
      bias,
      internal::linear::Context::kMin,
      internal::linear::Context::kMax) &&
      ContextLinear::kMin,
      ContextLinear::kMax) &&
    internal::linear::usable(input);
}

@@ -182,42 +185,13 @@ Tensor linear(
      input,
      weight,
      bias,
      internal::linear::Context::kMin,
      internal::linear::Context::kMax);
      ContextLinear::kMin,
      ContextLinear::kMax);
}

} // namespace xnnpack

Tensor _linear_prepack(
    const Tensor& weight,
    const Tensor& bias,
    const c10::optional<double> output_min,
    const c10::optional<double> output_max) {
  return cpp_custom_type_hack::create(
      std::make_unique<xnnpack::internal::linear::Context>(
          xnnpack::internal::linear::create(
              weight,
              bias,
              output_min ? *output_min : xnnpack::internal::linear::Context::kMin,
              output_max ? *output_max : xnnpack::internal::linear::Context::kMax)),
      weight.options());
}

Tensor _linear_packed(
    const Tensor& packed_weight,
    const Tensor& input) {
  return xnnpack::internal::linear::run(
      cpp_custom_type_hack::cast<xnnpack::internal::linear::Context>(packed_weight),
      input);
}

} // namespace native
} // namespace at

namespace caffe2 {

CAFFE_KNOWN_TYPE(at::native::xnnpack::internal::linear::Context);

} // namespace caffe2

#endif /* USE_XNNPACK */

aten/src/ATen/native/xnnpack/Linear.h (new file, 40 lines)
@@ -0,0 +1,40 @@
#pragma once

#ifdef USE_XNNPACK

#include <ATen/Tensor.h>
#include <ATen/native/xnnpack/Common.h>
#include <ATen/native/xnnpack/OpContext.h>

namespace at {
namespace native {
namespace xnnpack {
namespace internal {
namespace linear {

class LinearPrePack final : public torch::OperatorKernel {
 public:
  c10::intrusive_ptr<xnnpack::XNNPackLinearOpContext> operator()(
      Tensor weight,
      c10::optional<Tensor> bias);
};

class LinearPacked final : public torch::OperatorKernel {
 public:
  Tensor operator()(
      const Tensor& input,
      const c10::intrusive_ptr<xnnpack::XNNPackLinearOpContext>& op_context);
};

ContextLinear create(
    const Tensor& weight,
    const c10::optional<Tensor>& bias,
    const float output_min,
    const float output_max);
} // namespace linear
} // namespace internal
} // namespace xnnpack
} // namespace native
} // namespace at

#endif /* USE_XNNPACK */
aten/src/ATen/native/xnnpack/OpContext.cpp (new file, 64 lines)
@@ -0,0 +1,64 @@
#ifdef USE_XNNPACK
#include <ATen/native/xnnpack/Convolution.h>
#include <ATen/native/xnnpack/Linear.h>
#include <ATen/native/xnnpack/OpContext.h>

namespace at {
namespace native {
namespace xnnpack {

c10::intrusive_ptr<XNNPackLinearOpContext>
XNNPackLinearOpContext::create_context(
    at::Tensor&& weight,
    c10::optional<at::Tensor>&& bias,
    const c10::optional<double> output_min,
    const c10::optional<double> output_max) {
  auto linear_op_context =
      c10::make_intrusive<XNNPackLinearOpContext>(
          std::move(weight),
          std::move(bias),
          xnnpack::internal::linear::create(
              weight,
              bias,
              output_min ? *output_min : xnnpack::ContextLinear::kMin,
              output_max ? *output_max : xnnpack::ContextLinear::kMax)
      );
  return linear_op_context;
}

c10::intrusive_ptr<XNNPackConv2dOpContext>
XNNPackConv2dOpContext::create_context(at::Tensor&& weight,
    c10::optional<at::Tensor>&& bias,
    std::vector<int64_t>&& padding,
    std::vector<int64_t>&& stride,
    std::vector<int64_t>&& dilation,
    int64_t groups,
    const c10::optional<double> output_min,
    const c10::optional<double> output_max) {
  auto op_context =
      xnnpack::internal::convolution2d::create(
          weight,
          bias,
          padding,
          stride,
          dilation,
          groups,
          output_min ? *output_min : xnnpack::ContextConv2D::kMin,
          output_max ? *output_max : xnnpack::ContextConv2D::kMax);
  auto conv2d_op_context =
      c10::make_intrusive<XNNPackConv2dOpContext>(
          std::move(weight),
          std::move(bias),
          std::move(padding),
          std::move(stride),
          std::move(dilation),
          groups,
          std::move(op_context));
  return conv2d_op_context;
}

} // xnnpack
} // native
} // at

#endif /* USE_XNNPACK */
aten/src/ATen/native/xnnpack/OpContext.h (new file, 96 lines)
@@ -0,0 +1,96 @@
#pragma once

#ifdef USE_XNNPACK

#include <ATen/core/ivalue.h>
#include <ATen/native/xnnpack/Common.h>
#include <ATen/Tensor.h>

namespace at {
namespace native {
namespace xnnpack {

using SerializationTypeLinearPrePack = std::tuple<Tensor, c10::optional<Tensor>>;
using SerializationTypeConv2dPrePack =
    std::tuple<Tensor, c10::optional<Tensor>,
        std::vector<int64_t>, std::vector<int64_t>, std::vector<int64_t>, int64_t>;

class XNNPackLinearOpContext : public torch::jit::CustomClassHolder {
 private:
  Tensor orig_weight_;
  c10::optional<Tensor> orig_bias_;
  ContextLinear op_context_;

 public:
  XNNPackLinearOpContext(Tensor&& weight,
      c10::optional<Tensor>&& bias,
      ContextLinear&& op_context) :
    orig_weight_(std::move(weight)),
    orig_bias_(std::move(bias)),
    op_context_(std::move(op_context)) {}

  const ContextLinear& get_context() const {
    return op_context_;
  }

  SerializationTypeLinearPrePack unpack() {
    return std::make_tuple(orig_weight_, orig_bias_);
  }

  static c10::intrusive_ptr<XNNPackLinearOpContext> create_context(Tensor&& weight,
      c10::optional<Tensor>&& bias,
      const c10::optional<double> output_min,
      const c10::optional<double> output_max);
};

class XNNPackConv2dOpContext : public torch::jit::CustomClassHolder {
 private:
  Tensor orig_weight_;
  c10::optional<Tensor> orig_bias_;
  std::vector<int64_t> padding_;
  std::vector<int64_t> stride_;
  std::vector<int64_t> dilation_;
  int64_t groups_;
  ContextConv2D op_context_;

 public:
  XNNPackConv2dOpContext(Tensor&& weight,
      c10::optional<Tensor>&& bias,
      std::vector<int64_t>&& padding,
      std::vector<int64_t>&& stride,
      std::vector<int64_t>&& dilation,
      uint64_t groups,
      ContextConv2D&& op_context
  ) :
    orig_weight_(std::move(weight)),
    orig_bias_(std::move(bias)),
    padding_(std::move(padding)),
    stride_(std::move(stride)),
    dilation_(std::move(dilation)),
    groups_(groups),
    op_context_(std::move(op_context)) {}

  const ContextConv2D& get_context() const {
    return op_context_;
  }

  SerializationTypeConv2dPrePack unpack() {
    return std::make_tuple(orig_weight_, orig_bias_, padding_,
        stride_, dilation_, groups_);
  }

  static c10::intrusive_ptr<XNNPackConv2dOpContext> create_context(Tensor&& weight,
      c10::optional<Tensor>&& bias,
      std::vector<int64_t>&& padding,
      std::vector<int64_t>&& stride,
      std::vector<int64_t>&& dilation,
      int64_t groups,
      const c10::optional<double> output_min,
      const c10::optional<double> output_max);
};
} // xnnpack

} // native
} // at

#endif /* USE_XNNPACK */
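As a usage sketch of the conv2d path (assuming a USE_XNNPACK=1 build; shapes are illustrative), prepack captures exactly what XNNPackConv2dOpContext stores above, so the packed op only needs the input:

    import torch

    x = torch.rand(1, 3, 16, 16)
    w = torch.rand(8, 3, 3, 3)
    b = torch.rand(8)

    # stride, padding, dilation are int[2] and groups is an int, matching the
    # _xnnpack::conv2d_prepack schema registered in the next file.
    ctx = torch.ops._xnnpack.conv2d_prepack(w, b, (1, 1), (0, 0), (1, 1), 1)
    y = torch.ops._xnnpack.conv2d_packed(x, ctx)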
aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp (new file, 101 lines)
@@ -0,0 +1,101 @@
#ifdef USE_XNNPACK

#include <ATen/core/op_registration/op_registration.h>
#include <ATen/native/xnnpack/Convolution.h>
#include <ATen/native/xnnpack/Linear.h>
#include <ATen/native/xnnpack/OpContext.h>
#include <ATen/Tensor.h>
#include <torch/custom_class.h>

namespace at {
namespace native {
namespace xnnpack {

namespace {
torch::jit::class_<XNNPackLinearOpContext> register_xnnpack_linear_op_context_class() {
  static auto register_linear_op_context_class =
      torch::jit::class_<XNNPackLinearOpContext>("XNNPackLinearOpContext")
          .def_pickle(
              [](const c10::intrusive_ptr<XNNPackLinearOpContext>& op_context)
                  -> SerializationTypeLinearPrePack { // __getstate__
                Tensor weight;
                c10::optional<Tensor> bias;
                return op_context->unpack();
              },
              [](SerializationTypeLinearPrePack state)
                  -> c10::intrusive_ptr<
                      XNNPackLinearOpContext> { // __setstate__
                return XNNPackLinearOpContext::create_context(
                    std::move(std::get<0>(state)),
                    std::move(std::get<1>(state)),
                    {},
                    {}
                );
              }
          );
  return register_linear_op_context_class;
}

torch::jit::class_<XNNPackConv2dOpContext> register_xnnpack_conv2d_op_context_class() {
  static auto register_conv2d_op_context_class =
      torch::jit::class_<XNNPackConv2dOpContext>("XNNPackConv2dOpContext")
          .def_pickle(
              [](const c10::intrusive_ptr<XNNPackConv2dOpContext>& op_context)
                  -> SerializationTypeConv2dPrePack { // __getstate__
                Tensor weight;
                std::vector<int64_t> padding, stride, dilation;
                int64_t groups;
                c10::optional<Tensor> bias;
                return op_context->unpack();
              },
              [](SerializationTypeConv2dPrePack state)
                  -> c10::intrusive_ptr<
                      XNNPackConv2dOpContext> { // __setstate__
                return XNNPackConv2dOpContext::create_context(
                    std::move(std::get<0>(state)),
                    std::move(std::get<1>(state)),
                    std::move(std::get<2>(state)),
                    std::move(std::get<3>(state)),
                    std::move(std::get<4>(state)),
                    std::move(std::get<5>(state)),
                    {},
                    {}
                );
              }
          );
  return register_conv2d_op_context_class;
}

static auto xnnpack_linear_op_context_class = register_xnnpack_linear_op_context_class();
static auto xnnpack_conv2d_op_context_class = register_xnnpack_conv2d_op_context_class();

// Op registration
static auto registry =
    // Registering under the _xnnpack namespace for now. As we add more backends requiring similar functionality
    // we can refactor the code and use a better namespace.
    torch::RegisterOperators()
        .op("_xnnpack::linear_prepack(Tensor W, Tensor? B=None) -> __torch__.torch.classes.XNNPackLinearOpContext",
            torch::RegisterOperators::options().kernel<internal::linear::LinearPrePack>(
                DispatchKey::CPUTensorId))
        .op("_xnnpack::linear_packed(Tensor X, __torch__.torch.classes.XNNPackLinearOpContext W_prepack) -> Tensor Y",
            torch::RegisterOperators::options().kernel<internal::linear::LinearPacked>(
                DispatchKey::CPUTensorId))
        .op("_xnnpack::conv2d_prepack(Tensor W, Tensor? B, int[2] stride, "
            "int[2] padding, int[2] dilation, int groups) "
            "-> __torch__.torch.classes.XNNPackConv2dOpContext",
            torch::RegisterOperators::options().kernel<internal::convolution2d::Conv2dPrePack>(
                DispatchKey::CPUTensorId))
        .op("_xnnpack::conv2d_packed(Tensor X, "
            "__torch__.torch.classes.XNNPackConv2dOpContext W_prepack) -> Tensor Y",
            torch::RegisterOperators::options().kernel<internal::convolution2d::Conv2dPacked>(
                DispatchKey::CPUTensorId));
} // namespace

} // xnnpack
} // native
} // at

namespace {
}

#endif /* USE_XNNPACK */
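Because def_pickle supplies __getstate__/__setstate__ for both op context classes, a scripted module holding a packed context survives a torch.jit.save/load round trip; __setstate__ re-runs create_context, so the XNNPACK operator is rebuilt from the original tensors on load. A minimal sketch along the lines of the tests below (USE_XNNPACK=1 build assumed):

    import io
    import torch

    class LinearPrePacked(torch.nn.Module):
        def __init__(self, weight, bias):
            super(LinearPrePacked, self).__init__()
            self.packed = torch.ops._xnnpack.linear_prepack(weight, bias)

        def forward(self, x):
            return torch.ops._xnnpack.linear_packed(x, self.packed)

    m = torch.jit.script(LinearPrePacked(torch.rand(4, 8), torch.rand(4)))
    buffer = io.BytesIO()
    torch.jit.save(m, buffer)    # __getstate__ serializes the original tensors
    buffer.seek(0)
    m2 = torch.jit.load(buffer)  # __setstate__ repacks via create_context
    y = m2(torch.rand(2, 8))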
@@ -58,38 +58,6 @@ Tensor linear(

} // namespace xnnpack

at::Tensor _conv2d_prepack(
    const Tensor&,
    const Tensor&,
    const IntArrayRef,
    const IntArrayRef,
    const IntArrayRef,
    const int64_t,
    const c10::optional<double>,
    const c10::optional<double>) {
  TORCH_CHECK(false, xnnpack::internal::kError);
}

at::Tensor _conv2d_packed(
    const Tensor&,
    const Tensor&) {
  TORCH_CHECK(false, xnnpack::internal::kError);
}

Tensor _linear_prepack(
    const Tensor&,
    const Tensor&,
    const c10::optional<double>,
    const c10::optional<double>) {
  TORCH_CHECK(false, xnnpack::internal::kError);
}

Tensor _linear_packed(
    const Tensor&,
    const Tensor&) {
  TORCH_CHECK(false, xnnpack::internal::kError);
}

} // namespace native
} // namespace at

test/test_xnnpack_integration.py (new file, 364 lines)
@@ -0,0 +1,364 @@
from __future__ import division

import unittest

import torch
import torch.backends.xnnpack
from torch.nn import functional as F
import torch.testing._internal.hypothesis_utils as hu
from torch.testing._internal.common_utils import TestCase, run_tests
from hypothesis import given, assume
from hypothesis import strategies as st
import io


@unittest.skipUnless(torch.backends.xnnpack.enabled,
                     " XNNPACK must be enabled for these tests."
                     " Please build with USE_XNNPACK=1.")
class TestXNNPACKOps(TestCase):
    @given(batch_size=st.integers(0, 3),
           data_shape=hu.array_shapes(1, 3, 2, 64),
           weight_output_dim=st.integers(2, 64),
           use_bias=st.booleans())
    def test_linear(self, batch_size, data_shape, weight_output_dim, use_bias):
        data_shape = [batch_size] + list(data_shape)
        input_data = torch.rand(data_shape)
        weight = torch.rand((weight_output_dim, data_shape[-1]))
        if use_bias:
            bias = torch.rand((weight_output_dim))
        else:
            bias = None
        ref_result = F.linear(input_data, weight, bias)
        packed_weight_bias = torch.ops._xnnpack.linear_prepack(weight, bias)
        output_linear_xnnpack = torch.ops._xnnpack.linear_packed(input_data, packed_weight_bias)
        torch.testing.assert_allclose(ref_result, output_linear_xnnpack, rtol=1e-2, atol=1e-3)

    @given(batch_size=st.integers(0, 3),
           input_channels_per_group=st.integers(1, 32),
           height=st.integers(5, 64),
           width=st.integers(5, 64),
           output_channels_per_group=st.integers(1, 32),
           groups=st.integers(1, 16),
           kernel_h=st.integers(1, 7),
           kernel_w=st.integers(1, 7),
           stride_h=st.integers(1, 2),
           stride_w=st.integers(1, 2),
           pad_h=st.integers(0, 2),
           pad_w=st.integers(0, 2),
           dilation=st.integers(1, 2),
           use_bias=st.booleans())
    def test_conv2d(self,
                    batch_size,
                    input_channels_per_group,
                    height,
                    width,
                    output_channels_per_group,
                    groups,
                    kernel_h,
                    kernel_w,
                    stride_h,
                    stride_w,
                    pad_h,
                    pad_w,
                    dilation,
                    use_bias):
        input_channels = input_channels_per_group * groups
        output_channels = output_channels_per_group * groups
        kernels = (kernel_h, kernel_w)
        strides = (stride_h, stride_w)
        paddings = (pad_h, pad_w)
        dilations = (dilation, dilation)
        assume(height + 2 * paddings[0] >=
               dilations[0] * (kernels[0] - 1) + 1)
        assume(width + 2 * paddings[1] >=
               dilations[1] * (kernels[1] - 1) + 1)

        input_data = torch.rand((batch_size, input_channels, height, width))
        weight = torch.rand((output_channels, input_channels_per_group, kernel_h, kernel_w))
        bias = None
        if use_bias:
            bias = torch.rand((output_channels))

        ref_result = F.conv2d(input_data, weight, bias,
                              strides, paddings, dilations, groups)
        packed_weight_bias = torch.ops._xnnpack.conv2d_prepack(weight, bias,
                                                               strides, paddings, dilations, groups)
        xnnpack_result = torch.ops._xnnpack.conv2d_packed(input_data, packed_weight_bias)
        torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)


@unittest.skipUnless(torch.backends.xnnpack.enabled,
                     " XNNPACK must be enabled for these tests."
                     " Please build with USE_XNNPACK=1.")
class TestXNNPACKSerDes(TestCase):
    @given(batch_size=st.integers(0, 3),
           data_shape=hu.array_shapes(1, 3, 2, 64),
           weight_output_dim=st.integers(2, 64),
           use_bias=st.booleans())
    def test_linear(self, batch_size, data_shape, weight_output_dim, use_bias):
        class Linear(torch.nn.Module):
            def __init__(self, weight, bias=None):
                super(Linear, self).__init__()
                self.weight = weight
                self.bias = bias

            def forward(self, x):
                return F.linear(x, self.weight, self.bias)

        class LinearPrePacked(torch.nn.Module):
            def __init__(self, weight, bias=None):
                super(LinearPrePacked, self).__init__()
                self.packed_weight_bias = torch.ops._xnnpack.linear_prepack(weight, bias)

            def forward(self, x):
                return torch.ops._xnnpack.linear_packed(x, self.packed_weight_bias)

        data_shape = [batch_size] + list(data_shape)
        weight = torch.rand((weight_output_dim, data_shape[-1]))
        if use_bias:
            bias = torch.rand((weight_output_dim))
        else:
            bias = None
        scripted_linear = torch.jit.script(Linear(weight, bias))
        scripted_linear_prepacked = torch.jit.script(LinearPrePacked(weight, bias))
        input_data = torch.rand(data_shape)
        ref_result = scripted_linear(input_data)
        output_linear_xnnpack = scripted_linear_prepacked(input_data)
        torch.testing.assert_allclose(ref_result, output_linear_xnnpack, rtol=1e-2, atol=1e-3)

        # Serialize the modules and then deserialize
        input_data = torch.rand(data_shape)
        buffer = io.BytesIO()
        torch.jit.save(scripted_linear, buffer)
        buffer.seek(0)
        deserialized_linear = torch.jit.load(buffer)
        buffer = io.BytesIO()
        torch.jit.save(scripted_linear_prepacked, buffer)
        buffer.seek(0)
        deserialized_linear_prepacked = torch.jit.load(buffer)
        ref_result = deserialized_linear(input_data)
        output_linear_xnnpack = deserialized_linear_prepacked(input_data)
        torch.testing.assert_allclose(ref_result, output_linear_xnnpack, rtol=1e-2, atol=1e-3)

    @given(batch_size=st.integers(0, 3),
           input_channels_per_group=st.integers(1, 32),
           height=st.integers(5, 64),
           width=st.integers(5, 64),
           output_channels_per_group=st.integers(1, 32),
           groups=st.integers(1, 16),
           kernel_h=st.integers(1, 7),
           kernel_w=st.integers(1, 7),
           stride_h=st.integers(1, 2),
           stride_w=st.integers(1, 2),
           pad_h=st.integers(0, 2),
           pad_w=st.integers(0, 2),
           dilation=st.integers(1, 2),
           use_bias=st.booleans())
    def test_conv2d(self,
                    batch_size,
                    input_channels_per_group,
                    height,
                    width,
                    output_channels_per_group,
                    groups,
                    kernel_h,
                    kernel_w,
                    stride_h,
                    stride_w,
                    pad_h,
                    pad_w,
                    dilation,
                    use_bias):
        class Conv2D(torch.nn.Module):
            def __init__(self, weight, bias, strides, paddings, dilations, groups):
                super(Conv2D, self).__init__()
                self.weight = weight
                self.bias = bias
                self.strides = strides
                self.paddings = paddings
                self.dilations = dilations
                self.groups = groups

            def forward(self, x):
                return F.conv2d(x, self.weight, self.bias,
                                self.strides, self.paddings, self.dilations, self.groups)

        class Conv2DPrePacked(torch.nn.Module):
            def __init__(self, weight, bias, strides, paddings, dilations, groups):
                super(Conv2DPrePacked, self).__init__()
                self.packed_weight_bias = torch.ops._xnnpack.conv2d_prepack(weight, bias,
                                                                            strides, paddings, dilations, groups)

            def forward(self, x):
                return torch.ops._xnnpack.conv2d_packed(x, self.packed_weight_bias)

        input_channels = input_channels_per_group * groups
        output_channels = output_channels_per_group * groups
        kernels = (kernel_h, kernel_w)
        strides = (stride_h, stride_w)
        paddings = (pad_h, pad_w)
        dilations = (dilation, dilation)
        assume(height + 2 * paddings[0] >=
               dilations[0] * (kernels[0] - 1) + 1)
        assume(width + 2 * paddings[1] >=
               dilations[1] * (kernels[1] - 1) + 1)

        input_data = torch.rand((batch_size, input_channels, height, width))
        weight = torch.rand((output_channels, input_channels_per_group, kernel_h, kernel_w))
        bias = None
        if use_bias:
            bias = torch.rand((output_channels))

        scripted_conv2d = torch.jit.script(Conv2D(weight, bias,
                                                  strides, paddings, dilations, groups))
        scripted_conv2d_prepacked = torch.jit.script(Conv2DPrePacked(
            weight, bias, strides, paddings, dilations, groups))
        ref_result = scripted_conv2d(input_data)
        xnnpack_result = scripted_conv2d_prepacked(input_data)
        torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)

        # Serialize the modules and then deserialize
        input_data = torch.rand((batch_size, input_channels, height, width))
        buffer = io.BytesIO()
        torch.jit.save(scripted_conv2d, buffer)
        buffer.seek(0)
        deserialized_conv2d = torch.jit.load(buffer)
        buffer = io.BytesIO()
        torch.jit.save(scripted_conv2d_prepacked, buffer)
        buffer.seek(0)
        deserialized_conv2d_prepacked = torch.jit.load(buffer)
        ref_result = deserialized_conv2d(input_data)
        xnnpack_result = deserialized_conv2d_prepacked(input_data)
        torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)

    @given(batch_size=st.integers(0, 3),
           input_channels_per_group=st.integers(1, 32),
           height=st.integers(5, 64),
           width=st.integers(5, 64),
           output_channels_per_group=st.integers(1, 32),
           groups=st.integers(1, 16),
           kernel_h=st.integers(1, 7),
           kernel_w=st.integers(1, 7),
           stride_h=st.integers(1, 2),
           stride_w=st.integers(1, 2),
           pad_h=st.integers(0, 2),
           pad_w=st.integers(0, 2),
           dilation=st.integers(1, 2),
           linear_weight_output_dim=st.integers(2, 64),
           use_bias=st.booleans())
    def test_combined_model(self,
                            batch_size,
                            input_channels_per_group,
                            height,
                            width,
                            output_channels_per_group,
                            groups,
                            kernel_h,
                            kernel_w,
                            stride_h,
                            stride_w,
                            pad_h,
                            pad_w,
                            dilation,
                            linear_weight_output_dim,
                            use_bias):
        class M(torch.nn.Module):
            def __init__(self, conv_weight, conv_bias, linear_weight, linear_bias,
                         strides, paddings, dilations, groups):
                super(M, self).__init__()
                self.conv_weight = conv_weight
                self.conv_bias = conv_bias
                self.linear_weight = linear_weight
                self.linear_bias = linear_bias
                self.strides = strides
                self.paddings = paddings
                self.dilations = dilations
                self.groups = groups

            def forward(self, x):
                o = F.conv2d(x, self.conv_weight, self.conv_bias,
                             self.strides, self.paddings, self.dilations, self.groups)
                o = o.permute([0, 2, 3, 1])
                o = F.linear(o, self.linear_weight, self.linear_bias)
                return F.relu(o)

        class MPrePacked(torch.nn.Module):
            def __init__(self, conv_weight, conv_bias, linear_weight, linear_bias,
                         strides, paddings, dilations, groups):
                super(MPrePacked, self).__init__()
                self.conv2d_packed_weight_bias = \
                    torch.ops._xnnpack.conv2d_prepack(conv_weight, conv_bias,
                                                      strides, paddings, dilations, groups)
                self.linear_packed_weight_bias = \
                    torch.ops._xnnpack.linear_prepack(linear_weight, linear_bias)

            def forward(self, x):
                o = torch.ops._xnnpack.conv2d_packed(x, self.conv2d_packed_weight_bias)
                o = o.permute([0, 2, 3, 1])
                o = torch.ops._xnnpack.linear_packed(o, self.linear_packed_weight_bias)
                return F.relu(o)

        input_channels = input_channels_per_group * groups
        output_channels = output_channels_per_group * groups
        kernels = (kernel_h, kernel_w)
        strides = (stride_h, stride_w)
        paddings = (pad_h, pad_w)
        dilations = (dilation, dilation)
        assume(height + 2 * paddings[0] >=
               dilations[0] * (kernels[0] - 1) + 1)
        assume(width + 2 * paddings[1] >=
               dilations[1] * (kernels[1] - 1) + 1)

        input_data = torch.rand((batch_size, input_channels, height, width))
        conv_weight = torch.rand((output_channels, input_channels_per_group, kernel_h, kernel_w))
        conv_bias = None
        if use_bias:
            conv_bias = torch.rand((output_channels))

        # This is done just to find the output shape of the result
        # so that the shape of weight for the following linear layer
        # can be determined.
        result = F.conv2d(input_data, conv_weight, conv_bias,
                          strides, paddings, dilations, groups)
        linear_input_shape = result.shape[1]

        input_data = input_data.contiguous(memory_format=torch.channels_last)
        linear_weight = torch.rand((linear_weight_output_dim, linear_input_shape))
        linear_bias = None
        if use_bias:
            linear_bias = torch.rand((linear_weight_output_dim))

        scripted_m = torch.jit.script(M(conv_weight, conv_bias, linear_weight,
                                        linear_bias, strides, paddings, dilations, groups))
        scripted_m_prepacked = torch.jit.script(
            MPrePacked(
                conv_weight,
                conv_bias,
                linear_weight,
                linear_bias,
                strides,
                paddings,
                dilations,
                groups))
        ref_result = scripted_m(input_data)
        xnnpack_result = scripted_m_prepacked(input_data)
        torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)

        # Serialize the modules and then deserialize
        input_data = torch.rand((batch_size, input_channels, height, width))
        input_data = input_data.contiguous(memory_format=torch.channels_last)
        buffer = io.BytesIO()
        torch.jit.save(scripted_m, buffer)
        buffer.seek(0)
        deserialized_m = torch.jit.load(buffer)
        buffer = io.BytesIO()
        torch.jit.save(scripted_m_prepacked, buffer)
        buffer.seek(0)
        deserialized_m_prepacked = torch.jit.load(buffer)
        ref_result = deserialized_m(input_data)
        xnnpack_result = deserialized_m_prepacked(input_data)
        torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)


if __name__ == "__main__":
    run_tests()
torch/backends/xnnpack/__init__.py (new file, 25 lines)
@@ -0,0 +1,25 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import sys
import torch
import types

class _XNNPACKEnabled(object):
    def __get__(self, obj, objtype):
        return torch._C._is_xnnpack_enabled()

    def __set__(self, obj, val):
        raise RuntimeError("Assignment not supported")

class XNNPACKEngine(types.ModuleType):
    def __init__(self, m, name):
        super(XNNPACKEngine, self).__init__(name)
        self.m = m

    def __getattr__(self, attr):
        return self.m.__getattribute__(attr)

enabled = _XNNPACKEnabled()

# This is the sys.modules replacement trick, see
# https://stackoverflow.com/questions/2447353/getattr-on-a-module/7668273#7668273
sys.modules[__name__] = XNNPACKEngine(sys.modules[__name__], __name__)
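This module gives callers a way to feature-test the backend before touching the _xnnpack ops; a small sketch:

    import torch
    import torch.backends.xnnpack

    if torch.backends.xnnpack.enabled:
        # Safe to use the torch.ops._xnnpack.* prepack/packed ops here.
        pass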
@@ -516,6 +516,12 @@ PyObject *THPModule_supportedQEngines(PyObject */* unused */)
  return list.release();
}

PyObject *THPModule_isEnabledXNNPACK(PyObject * /* unused */)
{
  if (at::globalContext().isXNNPACKAvailable()) Py_RETURN_TRUE;
  else Py_RETURN_FALSE;
}

//NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays, modernize-avoid-c-arrays)
static PyMethodDef TorchMethods[] = {
  {"_initExtension", (PyCFunction)THPModule_initExtension, METH_O, nullptr},
@@ -556,6 +562,7 @@ static PyMethodDef TorchMethods[] = {
  {"_get_qengine", (PyCFunction)THPModule_qEngine, METH_NOARGS, nullptr},
  {"_set_qengine", (PyCFunction)THPModule_setQEngine, METH_O, nullptr},
  {"_supported_qengines", (PyCFunction)THPModule_supportedQEngines, METH_NOARGS, nullptr},
  {"_is_xnnpack_enabled", (PyCFunction)THPModule_isEnabledXNNPACK, METH_NOARGS, nullptr},
  {nullptr, nullptr, 0, nullptr}
};