Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 12:54:11 +08:00
Update XNNPACK Version (#139913)
Updates the XNNPACK submodule to 4ea82e595b36106653175dcb04b2aa532660d0d8.
Pull Request resolved: https://github.com/pytorch/pytorch/pull/139913
Approved by: https://github.com/digantdesai, https://github.com/huydhn
Committed by: PyTorch MergeBot
Parent: e429a3b72e
Commit: cca34be584
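Beyond the submodule bump, the hunks below move PyTorch's quantized add and mul, hardswish, and global average pooling from XNNPACK's per-operator API (xnn_create_* / xnn_reshape_* / xnn_setup_* / xnn_run_operator) to its subgraph-plus-runtime API, and add the new libmicrokernels-prod static library wherever XNNPACK libraries are listed. For orientation, a minimal self-contained sketch of the subgraph workflow for a float add follows; it assumes a current XNNPACK and <xnnpack.h>, elides the status checks the real code performs with TORCH_CHECK, and is not part of the diff:

#include <xnnpack.h>
#include <array>
#include <cmath>   // INFINITY
#include <vector>

int main() {
  if (xnn_initialize(/*allocator=*/nullptr) != xnn_status_success) return 1;

  // 1. Build a subgraph with three external values: two inputs, one output.
  xnn_subgraph_t subgraph = nullptr;
  xnn_create_subgraph(/*external_value_ids=*/3, /*flags=*/0, &subgraph);

  std::vector<size_t> shape = {1, 8};
  uint32_t a_id = XNN_INVALID_VALUE_ID, b_id = XNN_INVALID_VALUE_ID, y_id = XNN_INVALID_VALUE_ID;
  xnn_define_tensor_value(subgraph, xnn_datatype_fp32, shape.size(), shape.data(),
                          nullptr, /*external_id=*/0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &a_id);
  xnn_define_tensor_value(subgraph, xnn_datatype_fp32, shape.size(), shape.data(),
                          nullptr, /*external_id=*/1, XNN_VALUE_FLAG_EXTERNAL_INPUT, &b_id);
  xnn_define_tensor_value(subgraph, xnn_datatype_fp32, shape.size(), shape.data(),
                          nullptr, /*external_id=*/2, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &y_id);

  // 2. Add one node: y = clamp(a + b, output_min, output_max).
  struct xnn_binary_params params = {-INFINITY, INFINITY};
  xnn_define_binary(subgraph, xnn_binary_add, &params, a_id, b_id, y_id, /*flags=*/0);

  // 3. Lower the subgraph to a runtime, bind the external buffers, and invoke.
  //    A null threadpool runs single-threaded.
  xnn_runtime_t runtime = nullptr;
  xnn_create_runtime_v2(subgraph, /*threadpool=*/nullptr, /*flags=*/0, &runtime);

  std::array<float, 8> a{}, b{}, y{};
  std::array<xnn_external_value, 3> externals = {
      xnn_external_value{a_id, a.data()},
      xnn_external_value{b_id, b.data()},
      xnn_external_value{y_id, y.data()}};
  xnn_setup_runtime(runtime, externals.size(), externals.data());
  xnn_invoke_runtime(runtime);

  xnn_delete_runtime(runtime);
  xnn_delete_subgraph(subgraph);
  return 0;
}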
@ -14,7 +14,7 @@ mkdir -p ${ZIP_DIR}/src
 cp -R ${ARTIFACTS_DIR}/arm64/include ${ZIP_DIR}/install/
 # build a FAT binary
 cd ${ZIP_DIR}/install/lib
-target_libs=(libc10.a libclog.a libcpuinfo.a libeigen_blas.a libpthreadpool.a libpytorch_qnnpack.a libtorch_cpu.a libtorch.a libXNNPACK.a)
+target_libs=(libc10.a libclog.a libcpuinfo.a libeigen_blas.a libpthreadpool.a libpytorch_qnnpack.a libtorch_cpu.a libtorch.a libXNNPACK.a libmicrokernels-prod.a)
 for lib in ${target_libs[*]}
 do
   if [ -f "${ARTIFACTS_DIR}/x86_64/lib/${lib}" ] && [ -f "${ARTIFACTS_DIR}/arm64/lib/${lib}" ]; then
@ -112,6 +112,7 @@ if(ANDROID_ABI)
 import_static_lib(libc10)
 import_static_lib(libnnpack)
 import_static_lib(libXNNPACK)
+import_static_lib(libmicrokernels-prod)
 import_static_lib(libpytorch_qnnpack)
 import_static_lib(libpthreadpool)
 import_static_lib(libeigen_blas)
@ -129,6 +130,7 @@ if(ANDROID_ABI)
   libc10
   libnnpack
   libXNNPACK
+  libmicrokernels-prod
   libpytorch_qnnpack
   libpthreadpool
   libeigen_blas
@ -151,6 +153,7 @@ else()

   if(USE_XNNPACK)
     list(APPEND pytorch_jni_LIBS XNNPACK)
+    list(APPEND pytorch_jni_LIBS microkernels-prod)
   endif()

   if(USE_SYSTEM_PTHREADPOOL)
@ -234,62 +234,27 @@ Tensor qnnpack_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {

 #ifdef USE_XNNPACK
-C10_ALWAYS_INLINE
-enum xnn_status xnnp_create_add_nd(
-    int8_t azp,
-    float ascale,
-    int8_t bzp,
-    float bscale,
-    int8_t czp,
-    float cscale,
-    int8_t output_min,
-    int8_t output_max,
-    uint32_t flags,
-    xnn_operator_t* op) {
-  return xnn_create_add_nd_qs8(
-      azp, /* int8_t input1_zero_point */
-      ascale, /* float input1_scale */
-      bzp, /* int8_t input2_zero_point */
-      bscale, /* float input2_scale */
-      czp, /* int8_t output_zero_point */
-      cscale, /* float output_scale */
-      output_min, /* int8_t output_min */
-      output_max, /* int8_t output_max */
-      flags, /* uint32_t flags */
-      op); /* xnn_operator_t* add_op_out */
-}
+enum xnn_status xnnp_define_q_tensor(const Tensor& tensor, MemoryFormat format, uint32_t& id, xnn_subgraph_t subgraph_ptr, uint32_t external_id, uint32_t flags){
+  Tensor contig_tensor = tensor.contiguous(format);
+  const auto tensor_shape = xnnp_utils::get_mem_format_aware_shape(contig_tensor);
+  const int32_t zero_point = static_cast<int32_t>(contig_tensor.q_zero_point());
+  const float scale = static_cast<float>(contig_tensor.q_scale());

-C10_ALWAYS_INLINE
-enum xnn_status xnnp_reshape_add_nd(
-    xnn_operator_t op,
-    const std::vector<size_t>& a_shape,
-    const std::vector<size_t>& b_shape,
-    pthreadpool_t pt_pool) {
-  return xnn_reshape_add_nd_qs8(
-      op, /* xnn_operator_t add_op */
-      a_shape.size(), /* size_t num_input1_dims */
-      a_shape.data(), /* const size_t* input1_shape */
-      b_shape.size(), /* size_t num_input2_dims */
-      b_shape.data(), /* const size_t* input2_shape */
-      pt_pool); /* pthreadpool_t threadpool */
-}
-
-C10_ALWAYS_INLINE
-enum xnn_status xnnp_setup_add_nd(
-    xnn_operator_t op,
-    const int8_t* da,
-    const int8_t* db,
-    int8_t* dc,
-    pthreadpool_t pt_pool) {
-  return xnn_setup_add_nd_qs8(
-      op, /* xnn_operator_t add_op */
-      da, /* const int8_t* input1 */
-      db, /* const int8_t* input2 */
-      dc); /* int8_t* output */
+  return xnn_define_quantized_tensor_value(
+      subgraph_ptr,
+      xnn_datatype_qint8,
+      zero_point,
+      scale,
+      tensor.ndimension(),
+      tensor_shape.data(),
+      nullptr,
+      external_id,
+      flags,
+      &id);
 }

 template <typename scalar_t, bool ReLUFused = false>
 Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
   using underlying_t = typename scalar_t::underlying;
   const string func_name = "xnnp_add()";
   TORCH_CHECK(qa.ndimension() > 0, func_name, ": Got empty input tensor.");
   TORCH_CHECK(at::native::xnnpack::available(), func_name, ": XNNPACK is not available")
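The new xnnp_define_q_tensor helper registers a quantized (qint8) value in the subgraph together with its scale and zero point. For reference, the affine mapping those two parameters encode, as a small standalone sketch (standard quantization arithmetic, not code from the diff):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Affine quantization, as used by the qs8 values defined above:
//   real = (q - zero_point) * scale
int8_t quantize(float real, float scale, int32_t zero_point) {
  const long q = std::lround(real / scale) + zero_point;
  return static_cast<int8_t>(std::clamp<long>(q, -128, 127));  // int8 range
}

float dequantize(int8_t q, float scale, int32_t zero_point) {
  return (static_cast<int32_t>(q) - zero_point) * scale;
}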
@ -299,12 +264,6 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
   auto qa_mem_format = qa.suggest_memory_format();
   Tensor qa_contig = qa.contiguous(qa_mem_format);
   Tensor qb_contig = qb.contiguous(qa_mem_format);
-
-  const auto a_zero_point = qa_contig.q_zero_point();
-  const auto b_zero_point = qb_contig.q_zero_point();
-  const auto a_scale = qa_contig.q_scale();
-  const auto b_scale = qb_contig.q_scale();
-
   Tensor qy = at::native::empty_affine_quantized(
       at::infer_size_dimvector(qa_contig.sizes(), qb_contig.sizes()),
       qa.scalar_type(),
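The per-input zero points and scales disappear here because xnnp_define_q_tensor now reads them from each tensor. The output allocation is unchanged: its shape is still the broadcast of the two input shapes via at::infer_size_dimvector, illustrated by this hedged sketch (the shapes are made-up examples):

#include <ATen/ExpandUtils.h>

void example() {
  // Broadcasting aligns trailing dims and expands 1s, as in NumPy:
  // {2, 1, 4} with {3, 1} gives {2, 3, 4}.
  auto out_sizes = at::infer_size_dimvector({2, 1, 4}, {3, 1});
}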
@ -319,72 +278,108 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
     return qy;
   }

-  xnn_operator_t xnnp_op = nullptr;
-  xnnpack_operator xnnp_add_operator;
-
-  auto output_max = std::numeric_limits<underlying_t>::max();
-  auto output_min = std::numeric_limits<underlying_t>::min();
+  auto output_max = std::numeric_limits<float>::infinity();
+  auto output_min = -std::numeric_limits<float>::infinity();
   if (ReLUFused) {
-    /*
-     * FIXME: use activationLimits<T>()
-     * With <T>, MSVC runs into "error C3862: identifier activationLimits not found".
-     */
-    constexpr int64_t qmin = std::numeric_limits<underlying_t>::min();
-    constexpr int64_t qmax = std::numeric_limits<underlying_t>::max();
-    int64_t qvalue = static_cast<int64_t>(zero_point);
-    qvalue = std::max<int64_t>(qvalue, qmin);
-    output_min = static_cast<underlying_t>(std::min<int64_t>(qvalue, qmax));
+    output_min = 0;
   }

-  // Create an operator
-  auto status = xnnp_create_add_nd(
-      a_zero_point,
-      a_scale,
-      b_zero_point,
-      b_scale,
-      static_cast<underlying_t>(zero_point),
-      static_cast<float>(scale),
-      output_min,
-      output_max,
-      0,
-      &xnnp_op);
-  xnnp_add_operator = xnnpack_operator(xnnp_op);
+  // Create XNNPACK Subgraph
+  xnn_subgraph_t subgraph_ptr = nullptr;
+  auto status = xnn_create_subgraph(
+      /*external_value_ids=*/3,
+      /*flags=*/0,
+      &subgraph_ptr);
   TORCH_CHECK(
       status == xnn_status_success,
-      func_name, ": xnn create operator failed(", status,")!");
+      func_name, ": xnn create subgraph failed(", status,")!");
+  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
+      subgraph_ptr, &xnn_delete_subgraph);

-  const auto qa_shape = xnnp_utils::get_mem_format_aware_shape(qa_contig);
-  const auto qb_shape = xnnp_utils::get_mem_format_aware_shape(qb_contig);
+  uint32_t input0_id = XNN_INVALID_VALUE_ID, input1_id = XNN_INVALID_VALUE_ID, output_id = XNN_INVALID_VALUE_ID;

-  // Reshape the operator
-  status = xnnp_reshape_add_nd(
-      xnnp_add_operator.get(),
-      qa_shape,
-      qb_shape,
-      caffe2::pthreadpool_());
+  // Defining the quantized input 0
+  status = xnnp_define_q_tensor(
+      qa,
+      qa_mem_format,
+      input0_id,
+      subgraph_ptr,
+      0,
+      XNN_VALUE_FLAG_EXTERNAL_INPUT
+  );
+  TORCH_CHECK(
+      status == xnn_status_success && input0_id != XNN_INVALID_VALUE_ID,
+      func_name, ": xnn define input 0 failed(", status,")!");
+
+  // Defining the quantized input 1
+  status = xnnp_define_q_tensor(
+      qb,
+      qa_mem_format,
+      input1_id,
+      subgraph_ptr,
+      1,
+      XNN_VALUE_FLAG_EXTERNAL_INPUT
+  );
+  TORCH_CHECK(
+      status == xnn_status_success && input1_id != XNN_INVALID_VALUE_ID,
+      func_name, ": xnn define input 1 failed(", status,")!");
+
+  // Defining the quantized output
+  status = xnnp_define_q_tensor(
+      qy,
+      qa_mem_format,
+      output_id,
+      subgraph_ptr,
+      2,
+      XNN_VALUE_FLAG_EXTERNAL_OUTPUT
+  );
+  TORCH_CHECK(
+      status == xnn_status_success && output_id != XNN_INVALID_VALUE_ID,
+      func_name, ": xnn define output failed(", status,")!");
+
+  const struct xnn_binary_params binary_params = {output_min, output_max};
+  status = xnn_define_binary(
+      subgraph_ptr,
+      xnn_binary_add,
+      &binary_params,
+      input0_id,
+      input1_id,
+      output_id,
+      0);
   TORCH_CHECK(
       status == xnn_status_success,
-      func_name, ": xnn reshape operator failed(", status,")!");
+      func_name, ": xnn define binary add failed(", status,")!");

-  // Setup the operator
-  status = xnnp_setup_add_nd(
-      xnnp_add_operator.get(),
-      reinterpret_cast<const underlying_t*>(qa_contig.data_ptr<scalar_t>()),
-      reinterpret_cast<const underlying_t*>(qb_contig.data_ptr<scalar_t>()),
-      reinterpret_cast<underlying_t*>(qy.data_ptr<scalar_t>()),
-      caffe2::pthreadpool_());
+  // create runtime
+  xnn_runtime_t runtime_ptr = nullptr;
+  status = xnn_create_runtime_v2(subgraph_ptr, caffe2::pthreadpool_(), 0, &runtime_ptr);
   TORCH_CHECK(
       status == xnn_status_success,
-      func_name, ": xnn setup operator failed(", status,")!");
+      func_name, ": xnn create runtime failed(", status,")!");
+  TORCH_CHECK(
+      runtime_ptr != nullptr,
+      func_name, ": xnn create runtime failed because runtime_ptr is null");
+  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
+      runtime_ptr, &xnn_delete_runtime);

-  // Run the operator
-  status = xnn_run_operator(
-      xnnp_add_operator.get(), /* xnn_operator_t op */
-      caffe2::pthreadpool_()); /* pthreadpool_t threadpool */
+  std::array<xnn_external_value, 3> external = {
+      xnn_external_value{input0_id, reinterpret_cast<void*>(qa_contig.data_ptr<scalar_t>())},
+      xnn_external_value{input1_id, reinterpret_cast<void*>(qb_contig.data_ptr<scalar_t>())},
+      xnn_external_value{output_id, reinterpret_cast<void*>(qy.data_ptr<scalar_t>())}};
+
+  status = xnn_setup_runtime(
+      runtime_ptr,
+      external.size(),
+      external.data());
   TORCH_CHECK(
       status == xnn_status_success,
-      func_name, ": xnn run operator failed(", status,")");
+      func_name, ": xnn setup runtime failed(", status,")!");
+  status = xnn_invoke_runtime(runtime_ptr);
+  TORCH_CHECK(
+      status == xnn_status_success,
+      func_name, ": xnn invoke runtime failed(", status,")!");

   return qy;
 }
 #endif // USE_XNNPACK
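A detail worth noting in the rewritten body above: subgraph_ptr and runtime_ptr are handed to std::unique_ptr with function-pointer deleters right after creation, so any later TORCH_CHECK failure (which throws) still frees the XNNPACK objects during unwinding. The idiom in isolation, as a sketch:

#include <memory>
#include <xnnpack.h>

void example(xnn_subgraph_t subgraph_ptr, xnn_runtime_t runtime_ptr) {
  // Function-pointer deleters: if a later TORCH_CHECK throws, stack unwinding
  // still runs xnn_delete_subgraph / xnn_delete_runtime, so nothing leaks.
  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
      subgraph_ptr, &xnn_delete_subgraph);
  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
      runtime_ptr, &xnn_delete_runtime);
}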
@ -13,7 +13,6 @@
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/quantized/Quantizer.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <torch/library.h>

#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@ -56,14 +55,32 @@ Tensor _mul_out(Tensor& out, const Tensor& self, const Tensor& other) {
 }

 #ifdef USE_XNNPACK
+C10_ALWAYS_INLINE
+enum xnn_status xnnp_define_q_tensor(const Tensor& tensor, MemoryFormat format, uint32_t& id, xnn_subgraph_t subgraph_ptr, uint32_t external_id, uint32_t flags){
+  Tensor contig_tensor = tensor.contiguous(format);
+  const auto tensor_shape = xnnp_utils::get_mem_format_aware_shape(contig_tensor);
+  const int32_t zero_point = static_cast<int32_t>(contig_tensor.q_zero_point());
+  const float scale = static_cast<float>(contig_tensor.q_scale());
+
+  return xnn_define_quantized_tensor_value(
+      subgraph_ptr,
+      xnn_datatype_qint8,
+      zero_point,
+      scale,
+      tensor.ndimension(),
+      tensor_shape.data(),
+      nullptr,
+      external_id,
+      flags,
+      &id);
+}
+
 template <typename scalar_t, bool ReLUFused = false>
 Tensor _mul_out_xnnpack(
     const Tensor& self,
     const Tensor& other,
     double output_scale,
     int64_t output_zero_point) {
   using underlying_t = typename scalar_t::underlying;

   const string func_name = "xnnp_mul()";
   TORCH_CHECK(self.ndimension() > 0, func_name, ": Got empty input tensor.");
   TORCH_CHECK(
@ -89,96 +106,108 @@ Tensor _mul_out_xnnpack(
     return out;
   }

-  int64_t self_zero_point = self_contig.q_zero_point();
-  double self_scale = self_contig.q_scale();
-  int64_t other_zero_point = other_contig.q_zero_point();
-  double other_scale = other_contig.q_scale();
-
-  int64_t output_min = std::numeric_limits<underlying_t>::min();
-  int64_t output_max = std::numeric_limits<underlying_t>::max();
-
-  if(ReLUFused) {
-    /*
-     * FIXME: use activationLimits<T>()
-     * With <T>, MSVC runs into "error C3862: identifier activationLimits not
-     * found".
-     */
-    constexpr int64_t qmin = std::numeric_limits<underlying_t>::min();
-    constexpr int64_t qmax = std::numeric_limits<underlying_t>::max();
-    int64_t qvalue = static_cast<int64_t>(output_zero_point);
-    qvalue = std::max<int64_t>(qvalue, qmin);
-    output_min = static_cast<underlying_t>(std::min<int64_t>(qvalue, qmax));
+  auto output_max = std::numeric_limits<float>::infinity();
+  auto output_min = -std::numeric_limits<float>::infinity();
+  if (ReLUFused) {
+    output_min = 0;
   }

-  xnn_operator_t xnnp_op = nullptr;
-  xnnpack_operator xnnp_qmul_operator;
-
-  // create xnnpack multiply operator ...
-  auto status = xnn_create_multiply_nd_qs8(
-      self_zero_point,
-      self_scale,
-      other_zero_point,
-      other_scale,
-      static_cast<underlying_t>(output_zero_point),
-      static_cast<float>(output_scale),
-      output_min,
-      output_max,
-      0,
-      &xnnp_op);
-
+  // Create XNNPACK Subgraph
+  xnn_subgraph_t subgraph_ptr = nullptr;
+  auto status = xnn_create_subgraph(
+      /*external_value_ids=*/3,
+      /*flags=*/0,
+      &subgraph_ptr);
   TORCH_CHECK(
       status == xnn_status_success,
-      func_name,
-      ": xnn create operator failed(",
-      status,
-      ")!");
-  xnnp_qmul_operator = xnnpack_operator(xnnp_op);
+      func_name, ": xnn create subgraph failed(", status,")!");
+  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
+      subgraph_ptr, &xnn_delete_subgraph);

+  uint32_t input0_id = XNN_INVALID_VALUE_ID;
+  uint32_t input1_id = XNN_INVALID_VALUE_ID;
+  uint32_t output_id = XNN_INVALID_VALUE_ID;

-  const auto self_shape = xnnp_utils::get_mem_format_aware_shape(self_contig);
-  const auto other_shape = xnnp_utils::get_mem_format_aware_shape(other_contig);
-
-  // reshape operator
-  status = xnn_reshape_multiply_nd_qs8(
-      xnnp_qmul_operator.get(),
-      self_shape.size(),
-      self_shape.data(),
-      other_shape.size(),
-      other_shape.data(),
-      caffe2::pthreadpool_());
-
-  TORCH_CHECK(
-      status == xnn_status_success,
-      func_name,
-      ": xnn reshape operator failed(",
-      status,
-      ")!");
-
-  // set up operator
-  status = xnn_setup_multiply_nd_qs8(
-      xnnp_qmul_operator.get(),
-      reinterpret_cast<const underlying_t*>(self_contig.data_ptr<scalar_t>()),
-      reinterpret_cast<const underlying_t*>(other_contig.data_ptr<scalar_t>()),
-      reinterpret_cast<underlying_t*>(out.data_ptr<scalar_t>())
+  // Defining the quantized input 0
+  status = xnnp_define_q_tensor(
+      self,
+      qa_mem_format,
+      input0_id,
+      subgraph_ptr,
+      0,
+      XNN_VALUE_FLAG_EXTERNAL_INPUT
+  );
+  TORCH_CHECK(
+      status == xnn_status_success && input0_id != XNN_INVALID_VALUE_ID,
+      func_name, ": xnn define input 0 failed(", status,")!");
+
+  // Defining the quantized input 1
+  status = xnnp_define_q_tensor(
+      other,
+      qa_mem_format,
+      input1_id,
+      subgraph_ptr,
+      1,
+      XNN_VALUE_FLAG_EXTERNAL_INPUT
+  );
+  TORCH_CHECK(
+      status == xnn_status_success && input1_id != XNN_INVALID_VALUE_ID,
+      func_name, ": xnn define input 1 failed(", status,")!");
+
+  // Defining the quantized output
+  status = xnnp_define_q_tensor(
+      out,
+      qa_mem_format,
+      output_id,
+      subgraph_ptr,
+      2,
+      XNN_VALUE_FLAG_EXTERNAL_OUTPUT
+  );
+  TORCH_CHECK(
+      status == xnn_status_success && output_id != XNN_INVALID_VALUE_ID,
+      func_name, ": xnn define output failed(", status,")!");
+
+  const struct xnn_binary_params binary_params = {output_min, output_max};
+  status = xnn_define_binary(
+      subgraph_ptr,
+      xnn_binary_multiply,
+      &binary_params,
+      input0_id,
+      input1_id,
+      output_id,
+      0);
   TORCH_CHECK(
       status == xnn_status_success,
-      func_name,
-      ": xnn setup operator failed(",
-      status,
-      ")!");
+      func_name, ": xnn define binary add failed(", status,")!");

-  // Run the operator
-  status = xnn_run_operator(
-      xnnp_qmul_operator.get(), /* xnn_operator_t op */
-      caffe2::pthreadpool_()); /* pthreadpool_t threadpool */
+  // create runtime
+  xnn_runtime_t runtime_ptr = nullptr;
+  status = xnn_create_runtime_v2(subgraph_ptr, caffe2::pthreadpool_(), 0, &runtime_ptr);
   TORCH_CHECK(
       status == xnn_status_success,
-      func_name,
-      ": xnn run operator failed(",
-      status,
-      ")");
+      func_name, ": xnn create runtime failed(", status,")!");
+  TORCH_CHECK(
+      runtime_ptr != nullptr,
+      func_name, ": xnn create runtime failed because runtime_ptr is null");
+  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
+      runtime_ptr, &xnn_delete_runtime);

+  std::array<xnn_external_value, 3> external = {
+      xnn_external_value{input0_id, reinterpret_cast<void*>(self.data_ptr<scalar_t>())},
+      xnn_external_value{input1_id, reinterpret_cast<void*>(other.data_ptr<scalar_t>())},
+      xnn_external_value{output_id, reinterpret_cast<void*>(out.data_ptr<scalar_t>())}};
+
+  status = xnn_setup_runtime(
+      runtime_ptr,
+      external.size(),
+      external.data());
+  TORCH_CHECK(
+      status == xnn_status_success,
+      func_name, ": xnn setup runtime failed(", status,")!");
+  status = xnn_invoke_runtime(runtime_ptr);
+  TORCH_CHECK(
+      status == xnn_status_success,
+      func_name, ": xnn invoke runtime failed(", status,")!");

   return out;
 }
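Both the add and mul rewrites also change the clamp domain: the old code computed output_min/output_max in the quantized underlying_t domain and raised the minimum to the output zero point for fused ReLU, while xnn_binary_params takes real-valued bounds, so ReLU becomes output_min = 0. The two formulations agree under the affine mapping, as a small check shows (values are illustrative):

#include <cassert>
#include <cstdint>

int main() {
  const int32_t zero_point = 10;  // example output zero point
  const float scale = 0.05f;      // example output scale (always > 0)

  // dequant(q) = (q - zero_point) * scale, so
  //   q >= zero_point  <=>  dequant(q) >= 0.0f
  // i.e. clamping the quantized value at the zero point (old code) is the same
  // ReLU as clamping the real value at 0.0f (new xnn_binary_params).
  for (int q = -128; q <= 127; ++q) {
    const float real = (q - zero_point) * scale;
    assert((q >= zero_point) == (real >= 0.0f));
  }
  return 0;
}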
@ -19,46 +19,84 @@ bool use_hardswish(

 static Tensor& hardswish_impl(Tensor& input, Tensor& output) {
   using namespace internal;

-  xnn_operator_t hardswish_op{};
-  const xnn_status create_status = xnn_create_hardswish_nc_f32(
-      0, // flags
-      &hardswish_op);
-
+  // Create XNNPACK Subgraph
+  xnn_subgraph_t subgraph_ptr = nullptr;
+  xnn_status status = xnn_create_subgraph(
+      /*external_value_ids=*/2,
+      /*flags=*/0,
+      &subgraph_ptr);
   TORCH_CHECK(
-      xnn_status_success == create_status,
-      "xnn_create_hardswish_nc_f32 failed!");
-
-  Operator hardswish_scoped_op(hardswish_op);
-
-  const xnn_status reshape_status = xnn_reshape_hardswish_nc_f32(
-      hardswish_op,
-      input.numel(), // Batch
-      1, // channels
-      1, // input stride
-      1, // output stride
-      caffe2::pthreadpool_()); // threadpool
+      status == xnn_status_success,
+      "xnn create subgraph failed(", status,")!");
+  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
+      subgraph_ptr, &xnn_delete_subgraph);
+  uint32_t input_id = XNN_INVALID_VALUE_ID, output_id = XNN_INVALID_VALUE_ID;
+  std::vector<size_t> input_output_shape(input.sizes().begin(), input.sizes().end());
+
+  status = xnn_define_tensor_value(
+      subgraph_ptr,
+      xnn_datatype_fp32,
+      input_output_shape.size(),
+      input_output_shape.data(),
+      nullptr,
+      0,
+      XNN_VALUE_FLAG_EXTERNAL_INPUT,
+      &input_id
+  );
   TORCH_CHECK(
-      xnn_status_success == reshape_status,
-      "xnn_reshape_hardswish_nc_f32 failed!");
-
-  const xnn_status setup_status = xnn_setup_hardswish_nc_f32(
-      hardswish_op,
-      input.data_ptr<float>(),
-      output.data_ptr<float>());
+      status == xnn_status_success,
+      "defining xnn input failed(", status,")!");
+
+  status = xnn_define_tensor_value(
+      subgraph_ptr,
+      xnn_datatype_fp32,
+      input_output_shape.size(),
+      input_output_shape.data(),
+      nullptr,
+      1,
+      XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
+      &output_id
+  );
   TORCH_CHECK(
-      xnn_status_success == setup_status,
-      "xnn_setup_hardswish_nc_f32 failed!");
+      status == xnn_status_success,
+      "defining xnn output failed(", status,")!");

-  const xnn_status run_status = xnn_run_operator(
-      hardswish_op,
-      caffe2::pthreadpool_()); // threadpool
+  status = xnn_define_unary(
+      subgraph_ptr,
+      xnn_unary_hardswish,
+      nullptr,
+      input_id,
+      output_id,
+      0
+  );

-  TORCH_INTERNAL_ASSERT(
-      xnn_status_success == run_status,
-      "xnn_run_operator failed!");
+  // create runtime
+  xnn_runtime_t runtime_ptr = nullptr;
+  status = xnn_create_runtime_v2(subgraph_ptr, caffe2::pthreadpool_(), 0, &runtime_ptr);
+  TORCH_CHECK(
+      status == xnn_status_success,
+      "xnn create runtime failed(", status,")!");
+  TORCH_CHECK(
+      runtime_ptr != nullptr,
+      "xnn create runtime failed because runtime_ptr is null");
+  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
+      runtime_ptr, &xnn_delete_runtime);
+
+  std::array<xnn_external_value, 2> external = {
+      xnn_external_value{input_id, input.data_ptr<float>()},
+      xnn_external_value{output_id, output.data_ptr<float>()}};
+
+  status = xnn_setup_runtime(
+      runtime_ptr,
+      external.size(),
+      external.data());
+  TORCH_CHECK(
+      status == xnn_status_success,
+      "xnn setup runtime failed(", status,")!");
+  status = xnn_invoke_runtime(runtime_ptr);
+  TORCH_CHECK(
+      status == xnn_status_success,
+      "xnn invoke runtime failed(", status,")!");

   return output;
 }
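The hardswish create/reshape/setup/run sequence collapses into a single xnn_define_unary node with xnn_unary_hardswish. For reference, the function being computed, as a scalar sketch using the standard hardswish definition (not code from the diff):

#include <algorithm>

// hardswish(x) = x * relu6(x + 3) / 6
float hardswish_ref(float x) {
  const float relu6 = std::min(std::max(x + 3.0f, 0.0f), 6.0f);
  return x * relu6 / 6.0f;
}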
@ -7,6 +7,27 @@

 namespace at::native::xnnpack {

+inline std::vector<size_t> get_mem_format_aware_shape(const at::Tensor& in) {
+  const auto mem_format = in.suggest_memory_format();
+  const auto& sizes = in.sizes();
+  std::vector<size_t> ret(sizes.begin(), sizes.end());
+  if (mem_format == c10::MemoryFormat::ChannelsLast) {
+    // NCHW -> NHWC
+    // 0123 -> 0231
+    ret[1] = sizes[2]; /* H */
+    ret[2] = sizes[3]; /* W */
+    ret[3] = sizes[1]; /* C */
+  } else if (mem_format == c10::MemoryFormat::ChannelsLast3d) {
+    // NCDHW -> NDHWC
+    // 01234 -> 02341
+    ret[1] = sizes[2]; /* D */
+    ret[2] = sizes[3]; /* H */
+    ret[3] = sizes[4]; /* W */
+    ret[4] = sizes[1]; /* C */
+  }
+  return ret;
+}
+
 bool use_global_average_pool(const Tensor& input) {
   return xnnpack::available() && (1 <= input.ndimension()) &&
       (input.device().is_cpu()) && (kFloat == input.scalar_type()) &&
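This helper, previously local to the quantized ops, maps a tensor's logical NCHW (or NCDHW) sizes to the physical channels-last order that XNNPACK sees in memory. A hedged usage sketch, with example sizes and assuming the declaring header is included:

#include <ATen/ATen.h>

void example() {
  // Logical sizes stay {N, C, H, W} = {1, 8, 4, 4} for a channels-last tensor,
  // but memory runs N, H, W, C; that physical order is what XNNPACK needs.
  at::Tensor t = at::rand({1, 8, 4, 4}).contiguous(at::MemoryFormat::ChannelsLast);
  auto xnn_shape = at::native::xnnpack::get_mem_format_aware_shape(t);
  // xnn_shape == std::vector<size_t>{1, 4, 4, 8}
}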
@ -31,63 +52,91 @@ Tensor global_average_pool(const Tensor& input) {
       MemoryFormat::ChannelsLast,
       input_padded_contig_nhwc.opt_names());

-  xnn_operator_t global_average_pooling_op{};
-  const xnn_status create_status = xnn_create_global_average_pooling_nwc_f32(
-      -std::numeric_limits<float>::infinity(),
-      std::numeric_limits<float>::infinity(),
-      0 /* flags */,
-      &global_average_pooling_op);
-
+  // Create XNNPACK Subgraph
+  xnn_subgraph_t subgraph_ptr = nullptr;
+  xnn_status status = xnn_create_subgraph(
+      /*external_value_ids=*/2,
+      /*flags=*/0,
+      &subgraph_ptr);
   TORCH_CHECK(
-      xnn_status_success == create_status,
-      "xnn_create_global_average_pooling_nwc_f32 failed!");
+      status == xnn_status_success,
+      "xnn create subgraph failed(", status,")!");
+  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
+      subgraph_ptr, &xnn_delete_subgraph);
+  uint32_t input_id = XNN_INVALID_VALUE_ID, output_id = XNN_INVALID_VALUE_ID;

-  Operator global_avg_pool_scoped_op(global_average_pooling_op);
-
-  size_t workspace_size = 0;
-  size_t workspace_alignment = 0;
-
-  const xnn_status reshape_status = xnn_reshape_global_average_pooling_nwc_f32(
-      global_average_pooling_op,
-      input_padded_contig_nhwc.size(Layout::Activation4D::batch), // batch_size
-      input_padded_contig_nhwc.size(Layout::Activation4D::width) *
-          input_padded_contig_nhwc.size(Layout::Activation4D::height), // width
-      input_padded_contig_nhwc.size(Layout::Activation4D::channels), // channels
-      input_padded_contig_nhwc.size(
-          Layout::Activation4D::channels), // input stride
-      input_padded_contig_nhwc.size(
-          Layout::Activation4D::channels), // output stride
-      &workspace_size, // workspace_size
-      &workspace_alignment, // workspace_alignment
-      caffe2::pthreadpool_());
-
+  const auto& input_shape = get_mem_format_aware_shape(input_padded_contig_nhwc);
+  status = xnn_define_tensor_value(
+      subgraph_ptr,
+      xnn_datatype_fp32,
+      input_shape.size(),
+      input_shape.data(),
+      nullptr,
+      0,
+      XNN_VALUE_FLAG_EXTERNAL_INPUT,
+      &input_id
+  );
   TORCH_CHECK(
-      xnn_status_success == reshape_status,
-      "xnn_reshape_global_average_pooling_nwc_f32 failed!");
-
-  // Create Workspace pointer, which we will align and pad with 16 bytes
-  size_t xnnpack_buffer_padding = 16;
-  std::vector<char> workspace_vector(workspace_size + workspace_alignment + xnnpack_buffer_padding);
-  void* maybe_aligned_workspace = workspace_vector.data();
-  void* aligned_workspace =
-      (void*)((intptr_t)maybe_aligned_workspace + workspace_alignment - (intptr_t)maybe_aligned_workspace % workspace_alignment);
-
-  const xnn_status setup_status = xnn_setup_global_average_pooling_nwc_f32(
-      global_average_pooling_op,
-      aligned_workspace,
-      input_padded_contig_nhwc.data_ptr<float>(),
-      output.data_ptr<float>());
+      status == xnn_status_success,
+      "defining xnn input failed(", status,")!");

+  const auto& output_shape = get_mem_format_aware_shape(output);
+  status = xnn_define_tensor_value(
+      subgraph_ptr,
+      xnn_datatype_fp32,
+      output_shape.size(),
+      output_shape.data(),
+      nullptr,
+      1,
+      XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
+      &output_id
+  );
   TORCH_CHECK(
-      xnn_status_success == setup_status,
-      "xnn_setup_global_average_pooling_nwc_f32 failed!");
-
-  const xnn_status run_status =
-      xnn_run_operator(global_average_pooling_op, caffe2::pthreadpool_());
+      status == xnn_status_success,
+      "defining xnn output failed(", status,")!");

+  std::vector<size_t> reduce_dims{1, 2};
+  status = xnn_define_static_reduce(
+      subgraph_ptr,
+      xnn_reduce_mean,
+      reduce_dims.size(),
+      reduce_dims.data(),
+      input_id,
+      output_id,
+      0
+  );
   TORCH_CHECK(
-      xnn_status_success == run_status,
-      "xnn_setup_global_average_pooling_nwc_f32 failed!");
+      status == xnn_status_success,
+      "defining xnn static reduce failed(", status,")!");

+  // create runtime
+  xnn_runtime_t runtime_ptr = nullptr;
+  status = xnn_create_runtime_v2(subgraph_ptr, caffe2::pthreadpool_(), 0, &runtime_ptr);
+  TORCH_CHECK(
+      status == xnn_status_success,
+      "xnn create runtime failed(", status,")!");
+  TORCH_CHECK(
+      runtime_ptr != nullptr,
+      "xnn create runtime failed because runtime_ptr is null");
+  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
+      runtime_ptr, &xnn_delete_runtime);
+
+  std::array<xnn_external_value, 2> external = {
+      xnn_external_value{input_id, input_padded_contig_nhwc.data_ptr<float>()},
+      xnn_external_value{output_id, output.data_ptr<float>()}};
+
+  status = xnn_setup_runtime(
+      runtime_ptr,
+      external.size(),
+      external.data());
+  TORCH_CHECK(
+      status == xnn_status_success,
+      "xnn setup runtime failed(", status,")!");
+  status = xnn_invoke_runtime(runtime_ptr);
+  TORCH_CHECK(
+      status == xnn_status_success,
+      "xnn invoke runtime failed(", status,")!");

   return output.to(input.suggest_memory_format());
 }
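The pooling operator and its workspace bookkeeping are replaced by one static mean reduction: in the NHWC buffer XNNPACK reduces, dims 1 and 2 are H and W, hence reduce_dims{1, 2}. An ATen reference computation for comparison (a sketch, not from the diff):

#include <ATen/ATen.h>

void example() {
  // Global average pooling equals a mean over the spatial dims: {2, 3} in
  // logical NCHW, which are dims {1, 2} of the NHWC buffer XNNPACK reduces.
  at::Tensor x = at::rand({2, 3, 5, 7});                      // N, C, H, W
  at::Tensor ref = x.mean(/*dim=*/{2, 3}, /*keepdim=*/true);  // N, C, 1, 1
}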
@ -516,6 +516,9 @@ if(USE_XNNPACK AND NOT USE_SYSTEM_XNNPACK)
   # Disable I8MM For CI since clang 9 does not support neon i8mm.
   set(XNNPACK_ENABLE_ARM_I8MM OFF CACHE BOOL "")

+  # Disable avxvnni int8
+  set(XNNPACK_ENABLE_AVXVNNIINT8 OFF CACHE BOOL "")
+
   # Older MSVC versions don't support AVX512FP. TODO Minimum version support?
   IF(CMAKE_C_COMPILER_ID STREQUAL "MSVC")
     set(XNNPACK_ENABLE_AVX512FP16 OFF CACHE BOOL "")
@ -94,6 +94,7 @@ else()

   if(@USE_XNNPACK@)
     append_torchlib_if_found(XNNPACK)
+    append_torchlib_if_found(microkernels-prod)
   endif()

   append_torchlib_if_found(caffe2_protos protobuf-lite protobuf protoc)
@ -111,7 +111,7 @@ else
 end

 puts "Linking static libraries..."
-libs = ['libc10.a', 'libclog.a', 'libpthreadpool.a', 'libXNNPACK.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a']
+libs = ['libc10.a', 'libclog.a', 'libpthreadpool.a', 'libXNNPACK.a', 'libmicrokernels-prod.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a']
 frameworks = ['CoreML', 'Metal', 'MetalPerformanceShaders', 'Accelerate', 'UIKit']
 targets.each do |target|
   # NB: All these libraries and frameworks have already been linked by TestApp, adding them
@ -40,7 +40,7 @@ end

 # link static libraries
 target.frameworks_build_phases.clear
-libs = ['libc10.a', 'libclog.a', 'libpthreadpool.a', 'libXNNPACK.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a', 'libkineto.a']
+libs = ['libc10.a', 'libclog.a', 'libpthreadpool.a', 'libXNNPACK.a', 'libmicrokernels-prod.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a', 'libkineto.a']
 for lib in libs do
   path = "#{install_path}/lib/#{lib}"
   if File.exist?(path)
third_party/XNNPACK (vendored, 2 lines): Submodule third_party/XNNPACK updated: 87ee0b46b8...4ea82e595b
third_party/xnnpack.buck.bzl (vendored, 820 lines changed): file diff suppressed because it is too large
third_party/xnnpack_buck_shim.bzl (vendored, new file, 33 lines):
@ -0,0 +1,33 @@
+load(
+    "//xplat/third-party/XNNPACK/XNNPACK:build_srcs.bzl",
+    _LOGGING_SRCS = "LOGGING_SRCS",
+    _OPERATOR_SRCS = "OPERATOR_SRCS",
+    _SUBGRAPH_SRCS = "SUBGRAPH_SRCS",
+    _TABLE_SRCS = "TABLE_SRCS",
+    _XNNPACK_SRCS = "XNNPACK_SRCS",
+)
+load("//xplat/third-party/XNNPACK/XNNPACK/gen:microkernels.bzl", "prod_srcs_for_arch")
+load("//tools/build_defs:glob_defs.bzl", "subdir_glob")
+
+def define_xnnpack_build_src(xnnpack_build_src):
+    return ["XNNPACK/{}".format(src) for src in xnnpack_build_src]
+
+def prod_srcs_for_arch_wrapper(arch):
+    prod_srcs = prod_srcs_for_arch(arch)
+    return define_xnnpack_build_src(prod_srcs)
+
+def get_xnnpack_headers():
+    src_headers = subdir_glob([
+        ("XNNPACK/src", "**/*.h"),
+    ])
+    include_headers = subdir_glob([
+        ("XNNPACK/include", "*.h"),
+    ])
+
+    return src_headers | include_headers
+
+OPERATOR_SRCS = define_xnnpack_build_src(_OPERATOR_SRCS)
+SUBGRAPH_SRCS = define_xnnpack_build_src(_SUBGRAPH_SRCS)
+TABLE_SRCS = define_xnnpack_build_src(_TABLE_SRCS)
+XNNPACK_SRCS = define_xnnpack_build_src(_XNNPACK_SRCS)
+LOGGING_SRCS = define_xnnpack_build_src(_LOGGING_SRCS)
third_party/xnnpack_src_defs.bzl (vendored, 1929 lines changed): file diff suppressed because it is too large
third_party/xnnpack_wrapper_defs.bzl (vendored, 1596 lines changed): file diff suppressed because it is too large