Update XNNPACK Version (#139913)

Updating XNNPACK Version to 4ea82e595b36106653175dcb04b2aa532660d0d8

submodule update
Pull Request resolved: https://github.com/pytorch/pytorch/pull/139913
Approved by: https://github.com/digantdesai, https://github.com/huydhn
Author: Max Ren
Date: 2024-11-18 18:16:31 +00:00
Committed by: PyTorch MergeBot
Parent: e429a3b72e
Commit: cca34be584
15 changed files with 3429 additions and 1611 deletions


@@ -14,7 +14,7 @@ mkdir -p ${ZIP_DIR}/src
cp -R ${ARTIFACTS_DIR}/arm64/include ${ZIP_DIR}/install/
# build a FAT binary
cd ${ZIP_DIR}/install/lib
target_libs=(libc10.a libclog.a libcpuinfo.a libeigen_blas.a libpthreadpool.a libpytorch_qnnpack.a libtorch_cpu.a libtorch.a libXNNPACK.a)
target_libs=(libc10.a libclog.a libcpuinfo.a libeigen_blas.a libpthreadpool.a libpytorch_qnnpack.a libtorch_cpu.a libtorch.a libXNNPACK.a libmicrokernels-prod.a)
for lib in ${target_libs[*]}
do
if [ -f "${ARTIFACTS_DIR}/x86_64/lib/${lib}" ] && [ -f "${ARTIFACTS_DIR}/arm64/lib/${lib}" ]; then


@@ -112,6 +112,7 @@ if(ANDROID_ABI)
import_static_lib(libc10)
import_static_lib(libnnpack)
import_static_lib(libXNNPACK)
import_static_lib(libmicrokernels-prod)
import_static_lib(libpytorch_qnnpack)
import_static_lib(libpthreadpool)
import_static_lib(libeigen_blas)
@@ -129,6 +130,7 @@ if(ANDROID_ABI)
libc10
libnnpack
libXNNPACK
libmicrokernels-prod
libpytorch_qnnpack
libpthreadpool
libeigen_blas
@@ -151,6 +153,7 @@ else()
if(USE_XNNPACK)
list(APPEND pytorch_jni_LIBS XNNPACK)
list(APPEND pytorch_jni_LIBS microkernels-prod)
endif()
if(USE_SYSTEM_PTHREADPOOL)


@@ -234,62 +234,27 @@ Tensor qnnpack_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
#ifdef USE_XNNPACK
C10_ALWAYS_INLINE
enum xnn_status xnnp_create_add_nd(
int8_t azp,
float ascale,
int8_t bzp,
float bscale,
int8_t czp,
float cscale,
int8_t output_min,
int8_t output_max,
uint32_t flags,
xnn_operator_t* op) {
return xnn_create_add_nd_qs8(
azp, /* int8_t input1_zero_point */
ascale, /* float input1_scale */
bzp, /* int8_t input2_zero_point */
bscale, /* float input2_scale */
czp, /* int8_t output_zero_point */
cscale, /* float output_scale */
output_min, /* int8_t output_min */
output_max, /* int8_t output_max */
flags, /* uint32_t flags */
op); /* xnn_operator_t* add_op_out */
}
enum xnn_status xnnp_define_q_tensor(const Tensor& tensor, MemoryFormat format, uint32_t& id, xnn_subgraph_t subgraph_ptr, uint32_t external_id, uint32_t flags){
Tensor contig_tensor = tensor.contiguous(format);
const auto tensor_shape = xnnp_utils::get_mem_format_aware_shape(contig_tensor);
const int32_t zero_point = static_cast<int32_t>(contig_tensor.q_zero_point());
const float scale = static_cast<float>(contig_tensor.q_scale());
C10_ALWAYS_INLINE
enum xnn_status xnnp_reshape_add_nd(
xnn_operator_t op,
const std::vector<size_t>& a_shape,
const std::vector<size_t>& b_shape,
pthreadpool_t pt_pool) {
return xnn_reshape_add_nd_qs8(
op, /* xnn_operator_t add_op */
a_shape.size(), /* size_t num_input1_dims */
a_shape.data(), /* const size_t* input1_shape */
b_shape.size(), /* size_t num_input2_dims */
b_shape.data(), /* const size_t* input2_shape */
pt_pool); /* pthreadpool_t threadpool */
}
C10_ALWAYS_INLINE
enum xnn_status xnnp_setup_add_nd(
xnn_operator_t op,
const int8_t* da,
const int8_t* db,
int8_t* dc,
pthreadpool_t pt_pool) {
return xnn_setup_add_nd_qs8(
op, /* xnn_operator_t add_op */
da, /* const int8_t* input1 */
db, /* const int8_t* input2 */
dc); /* int8_t* output */
return xnn_define_quantized_tensor_value(
subgraph_ptr,
xnn_datatype_qint8,
zero_point,
scale,
tensor.ndimension(),
tensor_shape.data(),
nullptr,
external_id,
flags,
&id);
}
template <typename scalar_t, bool ReLUFused = false>
Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
using underlying_t = typename scalar_t::underlying;
const string func_name = "xnnp_add()";
TORCH_CHECK(qa.ndimension() > 0, func_name, ": Got empty input tensor.");
TORCH_CHECK(at::native::xnnpack::available(), func_name, ": XNNPACK is not available")
@@ -299,12 +264,6 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
auto qa_mem_format = qa.suggest_memory_format();
Tensor qa_contig = qa.contiguous(qa_mem_format);
Tensor qb_contig = qb.contiguous(qa_mem_format);
const auto a_zero_point = qa_contig.q_zero_point();
const auto b_zero_point = qb_contig.q_zero_point();
const auto a_scale = qa_contig.q_scale();
const auto b_scale = qb_contig.q_scale();
Tensor qy = at::native::empty_affine_quantized(
at::infer_size_dimvector(qa_contig.sizes(), qb_contig.sizes()),
qa.scalar_type(),
@@ -319,72 +278,108 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
return qy;
}
xnn_operator_t xnnp_op = nullptr;
xnnpack_operator xnnp_add_operator;
auto output_max = std::numeric_limits<underlying_t>::max();
auto output_min = std::numeric_limits<underlying_t>::min();
auto output_max = std::numeric_limits<float>::infinity();
auto output_min = -std::numeric_limits<float>::infinity();
if (ReLUFused) {
/*
* FIXME: use activationLimits<T>()
* With <T>, MSVC runs into "error C3862: identifier activationLimits not found".
*/
constexpr int64_t qmin = std::numeric_limits<underlying_t>::min();
constexpr int64_t qmax = std::numeric_limits<underlying_t>::max();
int64_t qvalue = static_cast<int64_t>(zero_point);
qvalue = std::max<int64_t>(qvalue, qmin);
output_min = static_cast<underlying_t>(std::min<int64_t>(qvalue, qmax));
output_min = 0;
}
// Create an operator
auto status = xnnp_create_add_nd(
a_zero_point,
a_scale,
b_zero_point,
b_scale,
static_cast<underlying_t>(zero_point),
static_cast<float>(scale),
output_min,
output_max,
0,
&xnnp_op);
xnnp_add_operator = xnnpack_operator(xnnp_op);
// Create XNNPACK Subgraph
xnn_subgraph_t subgraph_ptr = nullptr;
auto status = xnn_create_subgraph(
/*external_value_ids=*/3,
/*flags=*/0,
&subgraph_ptr);
TORCH_CHECK(
status == xnn_status_success,
func_name, ": xnn create operator failed(", status,")!");
func_name, ": xnn create subgraph failed(", status,")!");
std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
subgraph_ptr, &xnn_delete_subgraph);
const auto qa_shape = xnnp_utils::get_mem_format_aware_shape(qa_contig);
const auto qb_shape = xnnp_utils::get_mem_format_aware_shape(qb_contig);
uint32_t input0_id = XNN_INVALID_VALUE_ID, input1_id = XNN_INVALID_VALUE_ID, output_id = XNN_INVALID_VALUE_ID;
// Reshape the operator
status = xnnp_reshape_add_nd(
xnnp_add_operator.get(),
qa_shape,
qb_shape,
caffe2::pthreadpool_());
// Defining the quantized input 0
status = xnnp_define_q_tensor(
qa,
qa_mem_format,
input0_id,
subgraph_ptr,
0,
XNN_VALUE_FLAG_EXTERNAL_INPUT
);
TORCH_CHECK(
status == xnn_status_success && input0_id != XNN_INVALID_VALUE_ID,
func_name, ": xnn define input 0 failed(", status,")!");
// Defining the quantized input 1
status = xnnp_define_q_tensor(
qb,
qa_mem_format,
input1_id,
subgraph_ptr,
1,
XNN_VALUE_FLAG_EXTERNAL_INPUT
);
TORCH_CHECK(
status == xnn_status_success && input1_id != XNN_INVALID_VALUE_ID,
func_name, ": xnn define input 1 failed(", status,")!");
// Defining the quantized output
status = xnnp_define_q_tensor(
qy,
qa_mem_format,
output_id,
subgraph_ptr,
2,
XNN_VALUE_FLAG_EXTERNAL_OUTPUT
);
TORCH_CHECK(
status == xnn_status_success && output_id != XNN_INVALID_VALUE_ID,
func_name, ": xnn define output failed(", status,")!");
const struct xnn_binary_params binary_params = {output_min, output_max};
status = xnn_define_binary(
subgraph_ptr,
xnn_binary_add,
&binary_params,
input0_id,
input1_id,
output_id,
0);
TORCH_CHECK(
status == xnn_status_success,
func_name, ": xnn reshape operator failed(", status,")!");
func_name, ": xnn define binary add failed(", status,")!");
// Setup the operator
status = xnnp_setup_add_nd(
xnnp_add_operator.get(),
reinterpret_cast<const underlying_t*>(qa_contig.data_ptr<scalar_t>()),
reinterpret_cast<const underlying_t*>(qb_contig.data_ptr<scalar_t>()),
reinterpret_cast<underlying_t*>(qy.data_ptr<scalar_t>()),
caffe2::pthreadpool_());
// create runtime
xnn_runtime_t runtime_ptr = nullptr;
status = xnn_create_runtime_v2(subgraph_ptr, caffe2::pthreadpool_(), 0, &runtime_ptr);
TORCH_CHECK(
status == xnn_status_success,
func_name, ": xnn setup operator failed(", status,")!");
func_name, ": xnn create runtime failed(", status,")!");
TORCH_CHECK(
runtime_ptr != nullptr,
func_name, ": xnn create runtime failed because runtime_ptr is null");
std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
runtime_ptr, &xnn_delete_runtime);
// Run the operator
status = xnn_run_operator(
xnnp_add_operator.get(), /* xnn_operator_t op */
caffe2::pthreadpool_()); /* pthreadpool_t threadpool */
std::array<xnn_external_value, 3> external = {
xnn_external_value{input0_id, reinterpret_cast<void*>(qa_contig.data_ptr<scalar_t>())},
xnn_external_value{input1_id, reinterpret_cast<void*>(qb_contig.data_ptr<scalar_t>())},
xnn_external_value{output_id, reinterpret_cast<void*>(qy.data_ptr<scalar_t>())}};
status = xnn_setup_runtime(
runtime_ptr,
external.size(),
external.data());
TORCH_CHECK(
status == xnn_status_success,
func_name, ": xnn run operator failed(", status,")");
func_name, ": xnn setup runtime failed(", status,")!");
status = xnn_invoke_runtime(runtime_ptr);
TORCH_CHECK(
status == xnn_status_success,
func_name, ": xnn invoke runtime failed(", status,")!");
return qy;
}
#endif // USE_XNNPACK
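The hunk above replaces the old create/reshape/setup/run operator calls with XNNPACK's subgraph and runtime API. As a reference point for reviewers, the sketch below condenses that same flow into a self-contained example on plain fp32 buffers; the function name, shape, and fp32 datatype are illustrative only (they are not part of this commit), and it assumes xnn_initialize(nullptr) has already succeeded.

// Minimal sketch of the subgraph/runtime flow the new code follows, shown on
// fp32 buffers for brevity. Illustrative only, not the committed code.
#include <array>
#include <limits>
#include <memory>
#include <vector>
#include <xnnpack.h>

bool add_with_subgraph(const float* a, const float* b, float* out, size_t n) {
  // 1. Create a subgraph with three external values (two inputs, one output).
  xnn_subgraph_t subgraph_ptr = nullptr;
  if (xnn_create_subgraph(/*external_value_ids=*/3, /*flags=*/0, &subgraph_ptr) !=
      xnn_status_success) {
    return false;
  }
  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
      subgraph_ptr, &xnn_delete_subgraph);

  // 2. Define the tensor values; external IDs 0/1/2 are bound to buffers below.
  const std::vector<size_t> shape{n};
  uint32_t a_id = XNN_INVALID_VALUE_ID;
  uint32_t b_id = XNN_INVALID_VALUE_ID;
  uint32_t out_id = XNN_INVALID_VALUE_ID;
  xnn_define_tensor_value(subgraph_ptr, xnn_datatype_fp32, shape.size(), shape.data(),
      nullptr, /*external_id=*/0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &a_id);
  xnn_define_tensor_value(subgraph_ptr, xnn_datatype_fp32, shape.size(), shape.data(),
      nullptr, /*external_id=*/1, XNN_VALUE_FLAG_EXTERNAL_INPUT, &b_id);
  xnn_define_tensor_value(subgraph_ptr, xnn_datatype_fp32, shape.size(), shape.data(),
      nullptr, /*external_id=*/2, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &out_id);

  // 3. Describe the computation as a single binary-add node with no clamping.
  const struct xnn_binary_params params = {
      -std::numeric_limits<float>::infinity(),
      std::numeric_limits<float>::infinity()};
  xnn_define_binary(subgraph_ptr, xnn_binary_add, &params, a_id, b_id, out_id,
      /*flags=*/0);

  // 4. Compile the subgraph into a runtime, bind external buffers, and invoke.
  xnn_runtime_t runtime_ptr = nullptr;
  if (xnn_create_runtime_v2(subgraph_ptr, /*threadpool=*/nullptr, /*flags=*/0,
          &runtime_ptr) != xnn_status_success) {
    return false;
  }
  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> runtime(
      runtime_ptr, &xnn_delete_runtime);
  std::array<xnn_external_value, 3> external = {
      xnn_external_value{a_id, const_cast<float*>(a)},
      xnn_external_value{b_id, const_cast<float*>(b)},
      xnn_external_value{out_id, out}};
  return xnn_setup_runtime(runtime_ptr, external.size(), external.data()) ==
             xnn_status_success &&
         xnn_invoke_runtime(runtime_ptr) == xnn_status_success;
}

The quantized path in the hunk is structurally identical; the tensors are defined with xnn_define_quantized_tensor_value instead, and the clamp bounds in xnn_binary_params come from the fused ReLU.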


@@ -13,7 +13,6 @@
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/quantized/Quantizer.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <torch/library.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@@ -56,14 +55,32 @@ Tensor _mul_out(Tensor& out, const Tensor& self, const Tensor& other) {
}
#ifdef USE_XNNPACK
C10_ALWAYS_INLINE
enum xnn_status xnnp_define_q_tensor(const Tensor& tensor, MemoryFormat format, uint32_t& id, xnn_subgraph_t subgraph_ptr, uint32_t external_id, uint32_t flags){
Tensor contig_tensor = tensor.contiguous(format);
const auto tensor_shape = xnnp_utils::get_mem_format_aware_shape(contig_tensor);
const int32_t zero_point = static_cast<int32_t>(contig_tensor.q_zero_point());
const float scale = static_cast<float>(contig_tensor.q_scale());
return xnn_define_quantized_tensor_value(
subgraph_ptr,
xnn_datatype_qint8,
zero_point,
scale,
tensor.ndimension(),
tensor_shape.data(),
nullptr,
external_id,
flags,
&id);
}
template <typename scalar_t, bool ReLUFused = false>
Tensor _mul_out_xnnpack(
const Tensor& self,
const Tensor& other,
double output_scale,
int64_t output_zero_point) {
using underlying_t = typename scalar_t::underlying;
const string func_name = "xnnp_mul()";
TORCH_CHECK(self.ndimension() > 0, func_name, ": Got empty input tensor.");
TORCH_CHECK(
@@ -89,96 +106,108 @@ Tensor _mul_out_xnnpack(
return out;
}
int64_t self_zero_point = self_contig.q_zero_point();
double self_scale = self_contig.q_scale();
int64_t other_zero_point = other_contig.q_zero_point();
double other_scale = other_contig.q_scale();
int64_t output_min = std::numeric_limits<underlying_t>::min();
int64_t output_max = std::numeric_limits<underlying_t>::max();
if(ReLUFused) {
/*
* FIXME: use activationLimits<T>()
* With <T>, MSVC runs into "error C3862: identifier activationLimits not
* found".
*/
constexpr int64_t qmin = std::numeric_limits<underlying_t>::min();
constexpr int64_t qmax = std::numeric_limits<underlying_t>::max();
int64_t qvalue = static_cast<int64_t>(output_zero_point);
qvalue = std::max<int64_t>(qvalue, qmin);
output_min = static_cast<underlying_t>(std::min<int64_t>(qvalue, qmax));
auto output_max = std::numeric_limits<float>::infinity();
auto output_min = -std::numeric_limits<float>::infinity();
if (ReLUFused) {
output_min = 0;
}
xnn_operator_t xnnp_op = nullptr;
xnnpack_operator xnnp_qmul_operator;
// create xnnpack multiply operator ...
auto status = xnn_create_multiply_nd_qs8(
self_zero_point,
self_scale,
other_zero_point,
other_scale,
static_cast<underlying_t>(output_zero_point),
static_cast<float>(output_scale),
output_min,
output_max,
0,
&xnnp_op);
// Create XNNPACK Subgraph
xnn_subgraph_t subgraph_ptr = nullptr;
auto status = xnn_create_subgraph(
/*external_value_ids=*/3,
/*flags=*/0,
&subgraph_ptr);
TORCH_CHECK(
status == xnn_status_success,
func_name,
": xnn create operator failed(",
status,
")!");
xnnp_qmul_operator = xnnpack_operator(xnnp_op);
func_name, ": xnn create subgraph failed(", status,")!");
std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
subgraph_ptr, &xnn_delete_subgraph);
uint32_t input0_id = XNN_INVALID_VALUE_ID;
uint32_t input1_id = XNN_INVALID_VALUE_ID;
uint32_t output_id = XNN_INVALID_VALUE_ID;
const auto self_shape = xnnp_utils::get_mem_format_aware_shape(self_contig);
const auto other_shape = xnnp_utils::get_mem_format_aware_shape(other_contig);
// reshape operator
status = xnn_reshape_multiply_nd_qs8(
xnnp_qmul_operator.get(),
self_shape.size(),
self_shape.data(),
other_shape.size(),
other_shape.data(),
caffe2::pthreadpool_());
TORCH_CHECK(
status == xnn_status_success,
func_name,
": xnn reshape operator failed(",
status,
")!");
// set up operator
status = xnn_setup_multiply_nd_qs8(
xnnp_qmul_operator.get(),
reinterpret_cast<const underlying_t*>(self_contig.data_ptr<scalar_t>()),
reinterpret_cast<const underlying_t*>(other_contig.data_ptr<scalar_t>()),
reinterpret_cast<underlying_t*>(out.data_ptr<scalar_t>())
// Defining the quantized input 0
status = xnnp_define_q_tensor(
self,
qa_mem_format,
input0_id,
subgraph_ptr,
0,
XNN_VALUE_FLAG_EXTERNAL_INPUT
);
TORCH_CHECK(
status == xnn_status_success && input0_id != XNN_INVALID_VALUE_ID,
func_name, ": xnn define input 0 failed(", status,")!");
// Defining the quantized input 1
status = xnnp_define_q_tensor(
other,
qa_mem_format,
input1_id,
subgraph_ptr,
1,
XNN_VALUE_FLAG_EXTERNAL_INPUT
);
TORCH_CHECK(
status == xnn_status_success && input1_id != XNN_INVALID_VALUE_ID,
func_name, ": xnn define input 1 failed(", status,")!");
// Defining the quantized output
status = xnnp_define_q_tensor(
out,
qa_mem_format,
output_id,
subgraph_ptr,
2,
XNN_VALUE_FLAG_EXTERNAL_OUTPUT
);
TORCH_CHECK(
status == xnn_status_success && output_id != XNN_INVALID_VALUE_ID,
func_name, ": xnn define output failed(", status,")!");
const struct xnn_binary_params binary_params = {output_min, output_max};
status = xnn_define_binary(
subgraph_ptr,
xnn_binary_multiply,
&binary_params,
input0_id,
input1_id,
output_id,
0);
TORCH_CHECK(
status == xnn_status_success,
func_name,
": xnn setup operator failed(",
status,
")!");
func_name, ": xnn define binary add failed(", status,")!");
// Run the operator
status = xnn_run_operator(
xnnp_qmul_operator.get(), /* xnn_operator_t op */
caffe2::pthreadpool_()); /* pthreadpool_t threadpool */
// create runtime
xnn_runtime_t runtime_ptr = nullptr;
status = xnn_create_runtime_v2(subgraph_ptr, caffe2::pthreadpool_(), 0, &runtime_ptr);
TORCH_CHECK(
status == xnn_status_success,
func_name,
": xnn run operator failed(",
status,
")");
func_name, ": xnn create runtime failed(", status,")!");
TORCH_CHECK(
runtime_ptr != nullptr,
func_name, ": xnn create runtime failed because runtime_ptr is null");
std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
runtime_ptr, &xnn_delete_runtime);
std::array<xnn_external_value, 3> external = {
xnn_external_value{input0_id, reinterpret_cast<void*>(self.data_ptr<scalar_t>())},
xnn_external_value{input1_id, reinterpret_cast<void*>(other.data_ptr<scalar_t>())},
xnn_external_value{output_id, reinterpret_cast<void*>(out.data_ptr<scalar_t>())}};
status = xnn_setup_runtime(
runtime_ptr,
external.size(),
external.data());
TORCH_CHECK(
status == xnn_status_success,
func_name, ": xnn setup runtime failed(", status,")!");
status = xnn_invoke_runtime(runtime_ptr);
TORCH_CHECK(
status == xnn_status_success,
func_name, ": xnn invoke runtime failed(", status,")!");
return out;
}
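Both the add and mul hunks introduce the same local xnnp_define_q_tensor helper. The fragment below is a hedged illustration of the call it wraps, xnn_define_quantized_tensor_value; the function name, zero point, scale, and shape are made up for the example and are not part of this commit.

#include <vector>
#include <xnnpack.h>

// Illustration only (values are examples): define a per-tensor-quantized
// qint8 external input in a subgraph, as the new xnnp_define_q_tensor
// helper does via xnn_define_quantized_tensor_value.
uint32_t define_qint8_input(xnn_subgraph_t subgraph_ptr) {
  const int32_t zero_point = 2;                // example quantization zero point
  const float scale = 0.05f;                   // example quantization scale
  const std::vector<size_t> dims{1, 8, 8, 4};  // NHWC shape XNNPACK expects
  uint32_t value_id = XNN_INVALID_VALUE_ID;
  const xnn_status st = xnn_define_quantized_tensor_value(
      subgraph_ptr,
      xnn_datatype_qint8,      // signed 8-bit quantized data
      zero_point,
      scale,
      dims.size(),
      dims.data(),
      /*data=*/nullptr,        // no static data: buffer is bound at runtime setup
      /*external_id=*/0,
      XNN_VALUE_FLAG_EXTERNAL_INPUT,
      &value_id);
  return st == xnn_status_success ? value_id : XNN_INVALID_VALUE_ID;
}

Beyond that, the mul path differs from the add path only in passing xnn_binary_multiply instead of xnn_binary_add to xnn_define_binary.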


@@ -19,46 +19,84 @@ bool use_hardswish(
static Tensor& hardswish_impl(Tensor& input, Tensor& output) {
using namespace internal;
xnn_operator_t hardswish_op{};
const xnn_status create_status = xnn_create_hardswish_nc_f32(
0, // flags
&hardswish_op);
// Create XNNPACK Subgraph
xnn_subgraph_t subgraph_ptr = nullptr;
xnn_status status = xnn_create_subgraph(
/*external_value_ids=*/2,
/*flags=*/0,
&subgraph_ptr);
TORCH_CHECK(
xnn_status_success == create_status,
"xnn_create_hardswish_nc_f32 failed!");
Operator hardswish_scoped_op(hardswish_op);
const xnn_status reshape_status = xnn_reshape_hardswish_nc_f32(
hardswish_op,
input.numel(), // Batch
1, // channels
1, // input stride
1, // output stride
caffe2::pthreadpool_()); // threadpool
status == xnn_status_success,
"xnn create subgraph failed(", status,")!");
std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
subgraph_ptr, &xnn_delete_subgraph);
uint32_t input_id = XNN_INVALID_VALUE_ID, output_id = XNN_INVALID_VALUE_ID;
std::vector<size_t> input_output_shape(input.sizes().begin(), input.sizes().end());
status = xnn_define_tensor_value(
subgraph_ptr,
xnn_datatype_fp32,
input_output_shape.size(),
input_output_shape.data(),
nullptr,
0,
XNN_VALUE_FLAG_EXTERNAL_INPUT,
&input_id
);
TORCH_CHECK(
xnn_status_success == reshape_status,
"xnn_reshape_hardswish_nc_f32 failed!");
const xnn_status setup_status = xnn_setup_hardswish_nc_f32(
hardswish_op,
input.data_ptr<float>(),
output.data_ptr<float>());
status == xnn_status_success,
"defining xnn input failed(", status,")!");
status = xnn_define_tensor_value(
subgraph_ptr,
xnn_datatype_fp32,
input_output_shape.size(),
input_output_shape.data(),
nullptr,
1,
XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
&output_id
);
TORCH_CHECK(
xnn_status_success == setup_status,
"xnn_setup_hardswish_nc_f32 failed!");
status == xnn_status_success,
"defining xnn output failed(", status,")!");
const xnn_status run_status = xnn_run_operator(
hardswish_op,
caffe2::pthreadpool_()); // threadpool
status = xnn_define_unary(
subgraph_ptr,
xnn_unary_hardswish,
nullptr,
input_id,
output_id,
0
);
TORCH_INTERNAL_ASSERT(
xnn_status_success == run_status,
"xnn_run_operator failed!");
// create runtime
xnn_runtime_t runtime_ptr = nullptr;
status = xnn_create_runtime_v2(subgraph_ptr, caffe2::pthreadpool_(), 0, &runtime_ptr);
TORCH_CHECK(
status == xnn_status_success,
"xnn create runtime failed(", status,")!");
TORCH_CHECK(
runtime_ptr != nullptr,
"xnn create runtime failed because runtime_ptr is null");
std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
runtime_ptr, &xnn_delete_runtime);
std::array<xnn_external_value, 2> external = {
xnn_external_value{input_id, input.data_ptr<float>()},
xnn_external_value{output_id, output.data_ptr<float>()}};
status = xnn_setup_runtime(
runtime_ptr,
external.size(),
external.data());
TORCH_CHECK(
status == xnn_status_success,
"xnn setup runtime failed(", status,")!");
status = xnn_invoke_runtime(runtime_ptr);
TORCH_CHECK(
status == xnn_status_success,
"xnn invoke runtime failed(", status,")!");
return output;
}
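The hardswish rewrite follows the same create-subgraph/create-runtime steps as the binary ops above; the only structural difference is the node definition. A hedged fragment of that step, with an illustrative function name:

#include <xnnpack.h>

// Illustration only: attach a hardswish node to an already-created subgraph.
// input_id/output_id are fp32 values defined with xnn_define_tensor_value,
// exactly as in the hunk above.
xnn_status define_hardswish_node(
    xnn_subgraph_t subgraph_ptr, uint32_t input_id, uint32_t output_id) {
  return xnn_define_unary(
      subgraph_ptr,
      xnn_unary_hardswish,   // unary operator type
      /*params=*/nullptr,    // hardswish takes no extra parameters
      input_id,
      output_id,
      /*flags=*/0);
}

Runtime creation, setup, and invocation then proceed exactly as in the quantized add and mul paths.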


@@ -7,6 +7,27 @@
namespace at::native::xnnpack {
inline std::vector<size_t> get_mem_format_aware_shape(const at::Tensor& in) {
const auto mem_format = in.suggest_memory_format();
const auto& sizes = in.sizes();
std::vector<size_t> ret(sizes.begin(), sizes.end());
if (mem_format == c10::MemoryFormat::ChannelsLast) {
// NCHW -> NHWC
// 0123 -> 0231
ret[1] = sizes[2]; /* H */
ret[2] = sizes[3]; /* W */
ret[3] = sizes[1]; /* C */
} else if (mem_format == c10::MemoryFormat::ChannelsLast3d) {
// NCDHW -> NDHWC
// 01234 -> 02341
ret[1] = sizes[2]; /* D */
ret[2] = sizes[3]; /* H */
ret[3] = sizes[4]; /* W */
ret[4] = sizes[1]; /* C */
}
return ret;
}
bool use_global_average_pool(const Tensor& input) {
return xnnpack::available() && (1 <= input.ndimension()) &&
(input.device().is_cpu()) && (kFloat == input.scalar_type()) &&
@@ -31,63 +52,91 @@ Tensor global_average_pool(const Tensor& input) {
MemoryFormat::ChannelsLast,
input_padded_contig_nhwc.opt_names());
xnn_operator_t global_average_pooling_op{};
const xnn_status create_status = xnn_create_global_average_pooling_nwc_f32(
-std::numeric_limits<float>::infinity(),
std::numeric_limits<float>::infinity(),
0 /* flags */,
&global_average_pooling_op);
// Create XNNPACK Subgraph
xnn_subgraph_t subgraph_ptr = nullptr;
xnn_status status = xnn_create_subgraph(
/*external_value_ids=*/2,
/*flags=*/0,
&subgraph_ptr);
TORCH_CHECK(
xnn_status_success == create_status,
"xnn_create_global_average_pooling_nwc_f32 failed!");
status == xnn_status_success,
"xnn create subgraph failed(", status,")!");
std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
subgraph_ptr, &xnn_delete_subgraph);
uint32_t input_id = XNN_INVALID_VALUE_ID, output_id = XNN_INVALID_VALUE_ID;
Operator global_avg_pool_scoped_op(global_average_pooling_op);
size_t workspace_size = 0;
size_t workspace_alignment = 0;
const xnn_status reshape_status = xnn_reshape_global_average_pooling_nwc_f32(
global_average_pooling_op,
input_padded_contig_nhwc.size(Layout::Activation4D::batch), // batch_size
input_padded_contig_nhwc.size(Layout::Activation4D::width) *
input_padded_contig_nhwc.size(Layout::Activation4D::height), // width
input_padded_contig_nhwc.size(Layout::Activation4D::channels), // channels
input_padded_contig_nhwc.size(
Layout::Activation4D::channels), // input stride
input_padded_contig_nhwc.size(
Layout::Activation4D::channels), // output stride
&workspace_size, // workspace_size
&workspace_alignment, // workspace_alignment
caffe2::pthreadpool_());
const auto& input_shape = get_mem_format_aware_shape(input_padded_contig_nhwc);
status = xnn_define_tensor_value(
subgraph_ptr,
xnn_datatype_fp32,
input_shape.size(),
input_shape.data(),
nullptr,
0,
XNN_VALUE_FLAG_EXTERNAL_INPUT,
&input_id
);
TORCH_CHECK(
xnn_status_success == reshape_status,
"xnn_reshape_global_average_pooling_nwc_f32 failed!");
// Create Workspace pointer, which we will align and pad with 16 bytes
size_t xnnpack_buffer_padding = 16;
std::vector<char> workspace_vector(workspace_size + workspace_alignment + xnnpack_buffer_padding);
void* maybe_aligned_workspace = workspace_vector.data();
void* aligned_workspace =
(void*)((intptr_t)maybe_aligned_workspace + workspace_alignment - (intptr_t)maybe_aligned_workspace % workspace_alignment);
const xnn_status setup_status = xnn_setup_global_average_pooling_nwc_f32(
global_average_pooling_op,
aligned_workspace,
input_padded_contig_nhwc.data_ptr<float>(),
output.data_ptr<float>());
status == xnn_status_success,
"defining xnn input failed(", status,")!");
const auto& output_shape = get_mem_format_aware_shape(output);
status = xnn_define_tensor_value(
subgraph_ptr,
xnn_datatype_fp32,
output_shape.size(),
output_shape.data(),
nullptr,
1,
XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
&output_id
);
TORCH_CHECK(
xnn_status_success == setup_status,
"xnn_setup_global_average_pooling_nwc_f32 failed!");
const xnn_status run_status =
xnn_run_operator(global_average_pooling_op, caffe2::pthreadpool_());
status == xnn_status_success,
"defining xnn output failed(", status,")!");
std::vector<size_t> reduce_dims{1, 2};
status = xnn_define_static_reduce(
subgraph_ptr,
xnn_reduce_mean,
reduce_dims.size(),
reduce_dims.data(),
input_id,
output_id,
0
);
TORCH_CHECK(
xnn_status_success == run_status,
"xnn_setup_global_average_pooling_nwc_f32 failed!");
status == xnn_status_success,
"defining xnn static reduce failed(", status,")!");
// create runtime
xnn_runtime_t runtime_ptr = nullptr;
status = xnn_create_runtime_v2(subgraph_ptr, caffe2::pthreadpool_(), 0, &runtime_ptr);
TORCH_CHECK(
status == xnn_status_success,
"xnn create runtime failed(", status,")!");
TORCH_CHECK(
runtime_ptr != nullptr,
"xnn create runtime failed because runtime_ptr is null");
std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
runtime_ptr, &xnn_delete_runtime);
std::array<xnn_external_value, 2> external = {
xnn_external_value{input_id, input_padded_contig_nhwc.data_ptr<float>()},
xnn_external_value{output_id, output.data_ptr<float>()}};
status = xnn_setup_runtime(
runtime_ptr,
external.size(),
external.data());
TORCH_CHECK(
status == xnn_status_success,
"xnn setup runtime failed(", status,")!");
status = xnn_invoke_runtime(runtime_ptr);
TORCH_CHECK(
status == xnn_status_success,
"xnn invoke runtime failed(", status,")!");
return output.to(input.suggest_memory_format());
}
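Global average pooling is now expressed as a mean reduction over the spatial axes of an NHWC tensor: get_mem_format_aware_shape supplies the NHWC dims (for example, logical NCHW sizes 1x8x16x16 of a channels-last tensor become 1x16x16x8), and the node reduces over axes {1, 2}. A hedged fragment of that step, with an illustrative function name:

#include <vector>
#include <xnnpack.h>

// Illustration only: add a mean-reduce node over the H and W axes of an
// NHWC value, which is how the new code implements global average pooling.
// input_id/output_id are fp32 values already defined in the subgraph.
xnn_status define_global_avg_pool_node(
    xnn_subgraph_t subgraph_ptr, uint32_t input_id, uint32_t output_id) {
  const std::vector<size_t> reduce_dims{1, 2};  // H and W in NHWC order
  return xnn_define_static_reduce(
      subgraph_ptr,
      xnn_reduce_mean,       // reduction operator
      reduce_dims.size(),
      reduce_dims.data(),
      input_id,
      output_id,
      /*flags=*/0);
}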


@@ -516,6 +516,9 @@ if(USE_XNNPACK AND NOT USE_SYSTEM_XNNPACK)
# Disable I8MM For CI since clang 9 does not support neon i8mm.
set(XNNPACK_ENABLE_ARM_I8MM OFF CACHE BOOL "")
# Disable avxvnni int8
set(XNNPACK_ENABLE_AVXVNNIINT8 OFF CACHE BOOL "")
# Older MSVC versions don't support AVX512FP. TODO Minimum version support?
IF(CMAKE_C_COMPILER_ID STREQUAL "MSVC")
set(XNNPACK_ENABLE_AVX512FP16 OFF CACHE BOOL "")


@@ -94,6 +94,7 @@ else()
if(@USE_XNNPACK@)
append_torchlib_if_found(XNNPACK)
append_torchlib_if_found(microkernels-prod)
endif()
append_torchlib_if_found(caffe2_protos protobuf-lite protobuf protoc)


@@ -111,7 +111,7 @@ else
end
puts "Linking static libraries..."
libs = ['libc10.a', 'libclog.a', 'libpthreadpool.a', 'libXNNPACK.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a']
libs = ['libc10.a', 'libclog.a', 'libpthreadpool.a', 'libXNNPACK.a', 'libmicrokernels-prod.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a']
frameworks = ['CoreML', 'Metal', 'MetalPerformanceShaders', 'Accelerate', 'UIKit']
targets.each do |target|
# NB: All these libraries and frameworks have already been linked by TestApp, adding them


@@ -40,7 +40,7 @@ end
# link static libraries
target.frameworks_build_phases.clear
libs = ['libc10.a', 'libclog.a', 'libpthreadpool.a', 'libXNNPACK.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a', 'libkineto.a']
libs = ['libc10.a', 'libclog.a', 'libpthreadpool.a', 'libXNNPACK.a', 'libmicrokernels-prod.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a', 'libkineto.a']
for lib in libs do
path = "#{install_path}/lib/#{lib}"
if File.exist?(path)

File diff suppressed because it is too large

third_party/xnnpack_buck_shim.bzl (new vendored file, 33 lines added)

@@ -0,0 +1,33 @@
load(
"//xplat/third-party/XNNPACK/XNNPACK:build_srcs.bzl",
_LOGGING_SRCS = "LOGGING_SRCS",
_OPERATOR_SRCS = "OPERATOR_SRCS",
_SUBGRAPH_SRCS = "SUBGRAPH_SRCS",
_TABLE_SRCS = "TABLE_SRCS",
_XNNPACK_SRCS = "XNNPACK_SRCS",
)
load("//xplat/third-party/XNNPACK/XNNPACK/gen:microkernels.bzl", "prod_srcs_for_arch")
load("//tools/build_defs:glob_defs.bzl", "subdir_glob")
def define_xnnpack_build_src(xnnpack_build_src):
return ["XNNPACK/{}".format(src) for src in xnnpack_build_src]
def prod_srcs_for_arch_wrapper(arch):
prod_srcs = prod_srcs_for_arch(arch)
return define_xnnpack_build_src(prod_srcs)
def get_xnnpack_headers():
src_headers = subdir_glob([
("XNNPACK/src", "**/*.h"),
])
include_headers = subdir_glob([
("XNNPACK/include", "*.h"),
])
return src_headers | include_headers
OPERATOR_SRCS = define_xnnpack_build_src(_OPERATOR_SRCS)
SUBGRAPH_SRCS = define_xnnpack_build_src(_SUBGRAPH_SRCS)
TABLE_SRCS = define_xnnpack_build_src(_TABLE_SRCS)
XNNPACK_SRCS = define_xnnpack_build_src(_XNNPACK_SRCS)
LOGGING_SRCS = define_xnnpack_build_src(_LOGGING_SRCS)

File diff suppressed because it is too large

File diff suppressed because it is too large