[8/n] Update XNNPACK Version, Part 8: Everything Remaining to Get It to Work (#115587)

> **__Note:__** The XNNPACK upgrade is very large, on the order of **40k** files and **10m** lines of code, so we are breaking the library update into multiple parts. All parts [1 - 6/n] must be landed together for it to work. ***This also means that if there is a revert, please revert the entire stack.***

This change contains everything else required for the new XNNPACK version to work.

Differential Revision: [D52044420](https://our.internmc.facebook.com/intern/diff/D52044420/)

Pull Request resolved: https://github.com/pytorch/pytorch/pull/115587
Approved by: https://github.com/digantdesai
Author: Max Ren
Date: 2023-12-11 21:35:02 -08:00
Committed by: PyTorch MergeBot
Parent: e918461377
Commit: a8dc9d8e35
20 changed files with 1921 additions and 13596 deletions

View File

@ -259,24 +259,32 @@ enum xnn_status xnnp_create_add_nd(
}
C10_ALWAYS_INLINE
enum xnn_status xnnp_setup_add_nd(
enum xnn_status xnnp_reshape_add_nd(
xnn_operator_t op,
const std::vector<size_t>& a_shape,
const std::vector<size_t>& b_shape,
pthreadpool_t pt_pool) {
return xnn_reshape_add_nd_qs8(
op, /* xnn_operator_t add_op */
a_shape.size(), /* size_t num_input1_dims */
a_shape.data(), /* const size_t* input1_shape */
b_shape.size(), /* size_t num_input2_dims */
b_shape.data(), /* const size_t* input2_shape */
pt_pool); /* pthreadpool_t threadpool */
}
C10_ALWAYS_INLINE
enum xnn_status xnnp_setup_add_nd(
xnn_operator_t op,
const int8_t* da,
const int8_t* db,
int8_t* dc,
pthreadpool_t pt_pool) {
return xnn_setup_add_nd_qs8(
op, /* xnn_operator_t add_op */
a_shape.size(), /* size_t num_input1_dims */
a_shape.data(), /* const size_t* input1_shape */
b_shape.size(), /* size_t num_input2_dims */
b_shape.data(), /* const size_t* input2_shape */
da, /* const int8_t* input1 */
db, /* const int8_t* input2 */
dc, /* int8_t* output */
pt_pool); /* pthreadpool_t threadpool */
dc); /* int8_t* output */
}
template <typename scalar_t, bool ReLUFused = false>
@ -348,11 +356,20 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
const auto qa_shape = xnnp_utils::get_mem_format_aware_shape(qa_contig);
const auto qb_shape = xnnp_utils::get_mem_format_aware_shape(qb_contig);
// Setup the operator
status = xnnp_setup_add_nd(
// Reshape the operator
status = xnnp_reshape_add_nd(
xnnp_add_operator.get(),
qa_shape,
qb_shape,
caffe2::pthreadpool_());
TORCH_CHECK(
status == xnn_status_success,
func_name, ": xnn reshape operator failed(", status,")!");
// Setup the operator
status = xnnp_setup_add_nd(
xnnp_add_operator.get(),
reinterpret_cast<const underlying_t*>(qa_contig.data_ptr<scalar_t>()),
reinterpret_cast<const underlying_t*>(qb_contig.data_ptr<scalar_t>()),
reinterpret_cast<underlying_t*>(qy.data_ptr<scalar_t>()),
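The hunk above is truncated before the status check and the dispatch. For orientation, here is a minimal sketch of the full two-phase calling sequence the new API uses, built only from the xnn_reshape_add_nd_qs8, xnn_setup_add_nd_qs8, and xnn_run_operator signatures shown in this commit; the function and variable names are illustrative placeholders, not code from the commit:

```cpp
#include <cstdint>
#include <vector>

#include <xnnpack.h>
#include <pthreadpool.h>

// Illustrative helper: run an already-created QS8 add operator with the
// split reshape/setup API introduced by this XNNPACK version.
xnn_status run_qs8_add(
    xnn_operator_t add_op,
    const std::vector<size_t>& a_shape,
    const std::vector<size_t>& b_shape,
    const int8_t* a_data,
    const int8_t* b_data,
    int8_t* out_data,
    pthreadpool_t threadpool) {
  // Phase 1: reshape consumes the input shapes and the threadpool.
  xnn_status status = xnn_reshape_add_nd_qs8(
      add_op,
      a_shape.size(), a_shape.data(),
      b_shape.size(), b_shape.data(),
      threadpool);
  if (status != xnn_status_success) {
    return status;
  }
  // Phase 2: setup only binds the data pointers.
  status = xnn_setup_add_nd_qs8(add_op, a_data, b_data, out_data);
  if (status != xnn_status_success) {
    return status;
  }
  // Phase 3: run the operator as before.
  return xnn_run_operator(add_op, threadpool);
}
```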

View File

@ -100,6 +100,7 @@ enum xnn_status xnnp_create_convolution2d_nhwc(
op_max, /* int8_t output_max */
flags, /* uint32_t flags */
nullptr, /* xnn_caches_t caches */
nullptr, /* xnn_weights_cache_t weights_cache */
op); /* xnn_operator_t* deconvolution_op_out */
}
@ -132,9 +133,10 @@ enum xnn_status xnnp_create_convolution2d_nhwc(
op_max, /* int8_t output_max */
flags, /* uint32_t flags */
nullptr, /* xnn_caches_t caches */
nullptr, /* xnn_weights_cache_t weights_cache */
op); /* xnn_operator_t* convolution_op_out */
} else { /* per_channel */
return xnn_create_convolution2d_nhwc_qc8(
return xnn_create_convolution2d_nhwc_qs8_qc8w(
pad_top, /* uint32_t input_padding_top */
pad_right, /* uint32_t input_padding_right */
pad_bottom, /* uint32_t input_padding_bottom */
@ -161,21 +163,20 @@ enum xnn_status xnnp_create_convolution2d_nhwc(
op_max, /* int8_t output_max */
flags, /* uint32_t flags */
nullptr, /* xnn_caches_t caches */
nullptr, /* xnn_weights_cache_t weights_cache */
op); /* xnn_operator_t* convolution_op_out */
}
}
/*
* Series of setup wrapper functions to call xnn_setup_[de]conv* functions.
* Series of reshape wrapper functions to call xnn_reshape_[de]conv* functions.
*/
C10_ALWAYS_INLINE
enum xnn_status xnnp_setup_convolution2d_nhwc(
enum xnn_status xnnp_reshape_convolution2d_nhwc(
xnn_operator_t op,
size_t batch,
size_t in_h,
size_t in_w,
const int8_t* inp,
int8_t* outp,
pthreadpool_t pt_pool,
bool per_channel = false,
bool transpose = false,
@ -183,36 +184,78 @@ enum xnn_status xnnp_setup_convolution2d_nhwc(
uint32_t adj_w = 0) {
if(transpose) {
TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!");
return xnn_setup_deconvolution2d_nhwc_qs8(
return xnn_reshape_deconvolution2d_nhwc_qs8(
op, /* xnn_operator_t deconvolution_op */
batch, /* size_t batch_size */
in_h, /* size_t input_height */
in_w, /* size_t input_width */
adj_h, /* uint32_t adjustment_height */
adj_w, /* uint32_t adjustment_width */
inp, /* const int8_t* input */
outp, /* int8_t* output */
nullptr, /* size_t* output_height_out */
nullptr, /* size_t* output_width_out */
pt_pool); /* pthreadpool_t threadpool */
}
size_t workspace_size = SIZE_MAX;
size_t workspace_alignment = SIZE_MAX;
if (!per_channel) {
return xnn_reshape_convolution2d_nhwc_qs8(
op, /* xnn_operator_t convolution_op */
batch, /* size_t batch_size */
in_h, /* size_t input_height */
in_w, /* size_t input_width */
&workspace_size, /* size_t* workspace_size */
&workspace_alignment, /* size_t* workspace_alignment */
nullptr, /* size_t* output_height_out */
nullptr, /* size_t* output_width_out */
pt_pool); /* pthreadpool_t threadpool */
} else { /* per_channel */
return xnn_reshape_convolution2d_nhwc_qs8_qc8w(
op, /* xnn_operator_t convolution_op */
batch, /* size_t batch_size */
in_h, /* size_t input_height */
in_w, /* size_t input_width */
&workspace_size, /* size_t* workspace_size */
&workspace_alignment, /* size_t* workspace_alignment */
nullptr, /* size_t* output_height_out */
nullptr, /* size_t* output_width_out */
pt_pool); /* pthreadpool_t threadpool */
}
}
/*
* Series of setup wrapper functions to call xnn_setup_[de]conv* functions.
*/
C10_ALWAYS_INLINE
enum xnn_status xnnp_setup_convolution2d_nhwc(
xnn_operator_t op,
const int8_t* inp,
int8_t* outp,
bool per_channel = false,
bool transpose = false) {
if(transpose) {
TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!");
return xnn_setup_deconvolution2d_nhwc_qs8(
op, /* xnn_operator_t deconvolution_op */
inp, /* const int8_t* input */
outp); /* int8_t* output */
}
if (!per_channel) {
return xnn_setup_convolution2d_nhwc_qs8(
op, /* xnn_operator_t convolution_op */
batch, /* size_t batch_size */
in_h, /* size_t input_height */
in_w, /* size_t input_width */
inp, /* const int8_t* input */
outp, /* int8_t* output */
pt_pool); /* pthreadpool_t threadpool */
op, /* xnn_operator_t deconvolution_op */
nullptr, /* void workspace */
inp, /* const int8_t* input */
outp); /* int8_t* output */
} else { /* per_channel */
return xnn_setup_convolution2d_nhwc_qc8(
op, /* xnn_operator_t convolution_op */
batch, /* size_t batch_size */
in_h, /* size_t input_height */
in_w, /* size_t input_width */
inp, /* const int8_t* input */
outp, /* int8_t* output */
pt_pool); /* pthreadpool_t threadpool */
return xnn_setup_convolution2d_nhwc_qs8_qc8w(
op, /* xnn_operator_t deconvolution_op */
nullptr, /* void workspace */
inp, /* const int8_t* input */
outp); /* int8_t* output */
}
}
@ -258,22 +301,31 @@ enum xnn_status xnnp_create_fully_connected_nc(
output_max, /* int8_t output_max */
flags, /* uint32_t flags */
nullptr, /* xnn_caches_t caches */
nullptr, /* xnn_weights_cache_t */
fully_connected_op_out); /* xnn_operator_t* fully_connected_op_out */
}
C10_ALWAYS_INLINE
enum xnn_status xnnp_reshape_fully_connected_nc(
xnn_operator_t fully_connected_op,
size_t batch_size,
pthreadpool_t threadpool) {
return xnn_reshape_fully_connected_nc_qs8(
fully_connected_op, /* xnn_operator_t fully_connected_op */
batch_size, /* size_t batch_size */
threadpool); /* pthreadpool_t threadpool */
}
C10_ALWAYS_INLINE
enum xnn_status xnnp_setup_fully_connected_nc(
xnn_operator_t fully_connected_op,
size_t batch_size,
const int8_t* input,
int8_t* output,
pthreadpool_t threadpool) {
int8_t* output) {
return xnn_setup_fully_connected_nc_qs8(
fully_connected_op, /* xnn_operator_t fully_connected_op */
batch_size, /* size_t batch_size */
input, /* const int8_t* input */
output, /* int8_t* output */
threadpool); /* pthreadpool_t threadpool */
output /* int8_t* output */
);
}
} // namespace xnnp_utils
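The qlinear hunk further below is cut off before the operator is actually run, so here is a hedged sketch of how the fully connected wrappers above compose end to end, assuming the operator was already created and packed; fc_op, batch_size, input_ptr, and output_ptr are illustrative placeholders:

```cpp
// Illustrative only: reshape with the batch size, bind pointers in setup,
// then dispatch with xnn_run_operator, as the new API requires.
xnn_status run_qs8_linear(
    xnn_operator_t fc_op,
    size_t batch_size,
    const int8_t* input_ptr,
    int8_t* output_ptr,
    pthreadpool_t threadpool) {
  xnn_status status = at::native::xnnp_utils::xnnp_reshape_fully_connected_nc(
      fc_op, batch_size, threadpool);
  if (status != xnn_status_success) {
    return status;
  }
  status = at::native::xnnp_utils::xnnp_setup_fully_connected_nc(
      fc_op, input_ptr, output_ptr);
  if (status != xnn_status_success) {
    return status;
  }
  return xnn_run_operator(fc_op, threadpool);
}
```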

View File

@ -770,14 +770,12 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl_xnnp(
output_zero_point,
c10::MemoryFormat::ChannelsLast);
// Setup the operator
status = at::native::xnnp_utils::xnnp_setup_convolution2d_nhwc(
// Reshape the operator
status = at::native::xnnp_utils::xnnp_reshape_convolution2d_nhwc(
xnnp_convolution_op.get(),
N,
H,
W,
reinterpret_cast<const underlying_t*>(act_nhwc.template data_ptr<scalar_t>()),
reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>()),
caffe2::pthreadpool_(),
per_channel(),
transpose(),
@ -791,6 +789,21 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl_xnnp(
status,
")");
// Setup the operator
status = at::native::xnnp_utils::xnnp_setup_convolution2d_nhwc(
xnnp_convolution_op.get(),
reinterpret_cast<const underlying_t*>(act_nhwc.template data_ptr<scalar_t>()),
reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>()),
per_channel(),
transpose());
TORCH_CHECK(
status == xnn_status_success,
func_name,
": xnn setup operator failed(",
status,
")");
// Run the operator
status = xnn_run_operator(
xnnp_convolution_op.get(), /* xnn_operator_t op */

View File

@ -565,14 +565,19 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl_xnnp(
rows_input *= input_contig.size(i);
}
// Reshape the operator
status = at::native::xnnp_utils::xnnp_reshape_fully_connected_nc(
xnnp_linear_op.get(),
rows_input, /* batch_size */
caffe2::pthreadpool_());
// Setup the operator
status = at::native::xnnp_utils::xnnp_setup_fully_connected_nc(
xnnp_linear_op.get(),
rows_input, /* batch_size */
reinterpret_cast<const underlying_t*>(
input_contig.template data_ptr<scalar_t>()),
reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>()),
caffe2::pthreadpool_());
reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>())
);
TORCH_CHECK(
status == xnn_status_success,

View File

@ -139,17 +139,29 @@ Tensor _mul_out_xnnpack(
const auto self_shape = xnnp_utils::get_mem_format_aware_shape(self_contig);
const auto other_shape = xnnp_utils::get_mem_format_aware_shape(other_contig);
// set up operator
status = xnn_setup_multiply_nd_qs8(
// reshape operator
status = xnn_reshape_multiply_nd_qs8(
xnnp_qmul_operator.get(),
self_shape.size(),
self_shape.data(),
other_shape.size(),
other_shape.data(),
caffe2::pthreadpool_());
TORCH_CHECK(
status == xnn_status_success,
func_name,
": xnn reshape operator failed(",
status,
")!");
// set up operator
status = xnn_setup_multiply_nd_qs8(
xnnp_qmul_operator.get(),
reinterpret_cast<const underlying_t*>(self_contig.data_ptr<scalar_t>()),
reinterpret_cast<const underlying_t*>(other_contig.data_ptr<scalar_t>()),
reinterpret_cast<underlying_t*>(out.data_ptr<scalar_t>()),
caffe2::pthreadpool_());
reinterpret_cast<underlying_t*>(out.data_ptr<scalar_t>())
);
TORCH_CHECK(
status == xnn_status_success,

View File

@ -34,13 +34,20 @@ static Tensor& hardswish_impl(Tensor& input, Tensor& output) {
Operator hardswish_scoped_op(hardswish_op);
const xnn_status setup_status = xnn_setup_hardswish_nc_f32(
const xnn_status reshape_status = xnn_reshape_hardswish_nc_f32(
hardswish_op,
input.numel(), // Batch
input.data_ptr<float>(),
output.data_ptr<float>(),
caffe2::pthreadpool_()); // threadpool
TORCH_CHECK(
xnn_status_success == reshape_status,
"xnn_reshape_hardswish_nc_f32 failed!");
const xnn_status setup_status = xnn_setup_hardswish_nc_f32(
hardswish_op,
input.data_ptr<float>(),
output.data_ptr<float>());
TORCH_CHECK(
xnn_status_success == setup_status,
"xnn_setup_hardswish_nc_f32 failed!");

View File

@ -7,18 +7,13 @@
namespace at::native::xnnpack {
bool use_global_average_pool(
const Tensor& input) {
return xnnpack::available() &&
(1 <= input.ndimension()) &&
(input.device().is_cpu()) &&
(kFloat == input.scalar_type()) &&
!input.requires_grad() &&
true;
bool use_global_average_pool(const Tensor& input) {
return xnnpack::available() && (1 <= input.ndimension()) &&
(input.device().is_cpu()) && (kFloat == input.scalar_type()) &&
!input.requires_grad() && true;
}
Tensor global_average_pool(
const Tensor& input) {
Tensor global_average_pool(const Tensor& input) {
using namespace internal;
const Tensor input_padded_contig_nhwc =
@ -27,10 +22,10 @@ Tensor global_average_pool(
Tensor output = mobile::empty_with_tail_padding(
{
input_padded_contig_nhwc.size(Layout::Activation4D::batch),
input_padded_contig_nhwc.size(Layout::Activation4D::channels),
1,
1,
input_padded_contig_nhwc.size(Layout::Activation4D::batch),
input_padded_contig_nhwc.size(Layout::Activation4D::channels),
1,
1,
},
input_padded_contig_nhwc.options().dtype(),
MemoryFormat::ChannelsLast,
@ -38,42 +33,61 @@ Tensor global_average_pool(
xnn_operator_t global_average_pooling_op{};
const xnn_status create_status = xnn_create_global_average_pooling_nwc_f32(
input_padded_contig_nhwc.size(Layout::Activation4D::channels), // channels
input_padded_contig_nhwc.size(
Layout::Activation4D::channels), // input stride
input_padded_contig_nhwc.size(
Layout::Activation4D::channels), // output stride
-std::numeric_limits<float>::infinity(),
std::numeric_limits<float>::infinity(),
0 /* flags */,
&global_average_pooling_op);
input_padded_contig_nhwc.size(Layout::Activation4D::channels), // channels
input_padded_contig_nhwc.size(
Layout::Activation4D::channels), // input stride
input_padded_contig_nhwc.size(
Layout::Activation4D::channels), // output stride
-std::numeric_limits<float>::infinity(),
std::numeric_limits<float>::infinity(),
0 /* flags */,
&global_average_pooling_op);
TORCH_CHECK(
xnn_status_success == create_status,
"xnn_create_global_average_pooling_nwc_f32 failed!");
xnn_status_success == create_status,
"xnn_create_global_average_pooling_nwc_f32 failed!");
Operator global_avg_pool_scoped_op(global_average_pooling_op);
const xnn_status setup_status = xnn_setup_global_average_pooling_nwc_f32(
size_t workspace_size = 0;
size_t workspace_alignment = 0;
const xnn_status reshape_status = xnn_reshape_global_average_pooling_nwc_f32(
global_average_pooling_op,
input_padded_contig_nhwc.size(Layout::Activation4D::batch), // batch_size
input_padded_contig_nhwc.size(Layout::Activation4D::width) *
input_padded_contig_nhwc.size(Layout::Activation4D::height), // width
input_padded_contig_nhwc.data_ptr<float>(),
output.data_ptr<float>(),
&workspace_size, // workspace_size
&workspace_alignment, // workspace_alignment
caffe2::pthreadpool_());
TORCH_CHECK(
xnn_status_success == setup_status,
"xnn_setup_global_average_pooling_nwc_f32 failed!");
xnn_status_success == reshape_status,
"xnn_reshape_global_average_pooling_nwc_f32 failed!");
const xnn_status run_status = xnn_run_operator(
global_average_pooling_op,
caffe2::pthreadpool_());
// Create Workspace pointer, which we will align and pad with 16 bytes
size_t xnnpack_buffer_padding = 16;
std::vector<char> workspace_vector(workspace_size + workspace_alignment + xnnpack_buffer_padding);
void* maybe_aligned_workspace = workspace_vector.data();
void* aligned_workspace =
(void*)((intptr_t)maybe_aligned_workspace + workspace_alignment - (intptr_t)maybe_aligned_workspace % workspace_alignment);
const xnn_status setup_status = xnn_setup_global_average_pooling_nwc_f32(
global_average_pooling_op,
aligned_workspace,
input_padded_contig_nhwc.data_ptr<float>(),
output.data_ptr<float>());
TORCH_CHECK(
xnn_status_success == run_status,
"xnn_setup_global_average_pooling_nwc_f32 failed!");
xnn_status_success == setup_status,
"xnn_setup_global_average_pooling_nwc_f32 failed!");
const xnn_status run_status =
xnn_run_operator(global_average_pooling_op, caffe2::pthreadpool_());
TORCH_CHECK(
xnn_status_success == run_status,
"xnn_setup_global_average_pooling_nwc_f32 failed!");
return output.to(input.suggest_memory_format());
}
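The manual pointer rounding above (allocate workspace_size + workspace_alignment + 16 bytes, then bump the pointer) can also be written with std::align from <memory>. A sketch under the assumption that workspace_size and workspace_alignment are the values reported by the reshape call and that the alignment is a nonzero power of two, which is what XNNPACK reports; this is an illustration, not the code this commit uses:

```cpp
#include <cstddef>
#include <memory>
#include <vector>

// Illustrative helper: allocate a buffer large enough for the reported
// workspace and return a pointer inside it that satisfies the alignment.
void* allocate_aligned_workspace(
    std::vector<char>& buffer,           // owns the storage
    std::size_t workspace_size,          // from the xnn_reshape_* call
    std::size_t workspace_alignment) {   // from the xnn_reshape_* call
  buffer.resize(workspace_size + workspace_alignment);
  void* ptr = buffer.data();
  std::size_t space = buffer.size();
  // std::align advances ptr to the next aligned address, or returns nullptr
  // if workspace_size no longer fits in the remaining space.
  return std::align(workspace_alignment, workspace_size, ptr, space);
}
```

Either way, the aligned pointer is what gets handed to xnn_setup_global_average_pooling_nwc_f32 before xnn_run_operator dispatches the work.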

View File

@ -79,13 +79,20 @@ Tensor channel_shuffle(
input_padded_contig_nhwc.size(Layout::Activation4D::height) *
input_padded_contig_nhwc.size(Layout::Activation4D::width);
const xnn_status setup_status = xnn_setup_channel_shuffle_nc_x32(
const xnn_status reshape_status = xnn_reshape_channel_shuffle_nc_x32(
channel_shuffle_op, // operator
batch_size, // batch_size
input_padded_contig_nhwc.data_ptr<float>(), // input
output_padded_contig_nhwc.data_ptr<float>(), // output
caffe2::pthreadpool_()); // threadpool
TORCH_CHECK(
xnn_status_success == reshape_status,
"xnn_reshape_channel_shuffle_nc_x32 failed!");
const xnn_status setup_status = xnn_setup_channel_shuffle_nc_x32(
channel_shuffle_op, // operator
input_padded_contig_nhwc.data_ptr<float>(), // input
output_padded_contig_nhwc.data_ptr<float>()); // output
TORCH_CHECK(
xnn_status_success == setup_status,
"xnn_setup_channel_shuffle_nc_x32 failed!");

View File

@ -236,6 +236,7 @@ ContextConv2D create(
output_max, // output_max
0u, // flags
nullptr, // xnn_caches_t
nullptr, // xnn_weights_cache_t
&convolution_op); // operator
} else {
for (const auto i : c10::irange(4)) {
@ -265,6 +266,7 @@ ContextConv2D create(
output_max, // output_max
0u, // flags
nullptr, // xnn_caches_t
nullptr, // xnn_weights_cache_t
&convolution_op); // operator
}
@ -338,26 +340,41 @@ Tensor run(
*/
if (context.transposed_) {
setup_status = xnn_setup_deconvolution2d_nhwc_f32(
context.op.get(), // operator
setup_status = xnn_reshape_deconvolution2d_nhwc_f32(
context.op.get(),
padded_input_nhwc.size(Layout::Activation4D::batch), // batch_size
padded_input_nhwc.size(Layout::Activation4D::height), // input_height
padded_input_nhwc.size(Layout::Activation4D::width), // input_width
context.output_padding_[0], // adjustment_height
context.output_padding_[1], // adjustment_width
padded_input_nhwc.data_ptr<float>(), // input
output.data_ptr<float>(), // output
nullptr, // output_height_out
nullptr, // output_width_out
caffe2::pthreadpool_()); // threadpool
} else {
setup_status = xnn_setup_convolution2d_nhwc_f32(
setup_status = xnn_setup_deconvolution2d_nhwc_f32(
context.op.get(), // operator
padded_input_nhwc.data_ptr<float>(), // input
output.data_ptr<float>()); // output
} else {
size_t workspace_size = SIZE_MAX;
size_t workspace_alignment = SIZE_MAX;
setup_status = xnn_reshape_convolution2d_nhwc_f32(
context.op.get(),
padded_input_nhwc.size(Layout::Activation4D::batch), // batch_size
padded_input_nhwc.size(Layout::Activation4D::height), // input_height
padded_input_nhwc.size(Layout::Activation4D::width), // input_width
padded_input_nhwc.data_ptr<float>(), // input
output.data_ptr<float>(), // output
&workspace_size, // workspace_size
&workspace_alignment, // workspace_alignment
nullptr, // output_height_out
nullptr, // output_width_out
caffe2::pthreadpool_());
setup_status = xnn_setup_convolution2d_nhwc_f32(
context.op.get(), // operator
nullptr, // workspace
padded_input_nhwc.data_ptr<float>(), // input
output.data_ptr<float>()); // output
}
TORCH_CHECK(

View File

@ -95,6 +95,7 @@ ContextLinear create(
output_max, // output_max
0u, // flags
nullptr, // xnn_caches_t
nullptr, // xnn_weights_cache_t
&linear_op); // operator
TORCH_CHECK(
@ -136,13 +137,20 @@ Tensor run(
padded_input.suggest_memory_format(),
padded_input.opt_names());
const xnn_status setup_status = xnn_setup_fully_connected_nc_f32(
const xnn_status reshape_status = xnn_reshape_fully_connected_nc_f32(
context.op.get(), // operator
Layout::ActivationND::batch(padded_input.sizes()), // Batch,
padded_input.data_ptr<float>(), // input
output.data_ptr<float>(), // output
caffe2::pthreadpool_()); // threadpool
TORCH_CHECK(
xnn_status_success == reshape_status,
"xnn_reshape_fully_connected_nc_f32 failed!");
const xnn_status setup_status = xnn_setup_fully_connected_nc_f32(
context.op.get(), // operator
padded_input.data_ptr<float>(), // input
output.data_ptr<float>()); // output
TORCH_CHECK(
xnn_status_success == setup_status,
"xnn_setup_fully_connected_nc_f32 failed!");

View File

@ -214,15 +214,24 @@ Tensor max_pool2d(
xnn_status_success == create_status,
"xnn_create_max_pooling2d_nhwc_f32 failed!");
const xnn_status setup_status = xnn_setup_max_pooling2d_nhwc_f32(
const xnn_status reshape_status = xnn_reshape_max_pooling2d_nhwc_f32(
max_pool_op, // operator
input_padded_contig_nhwc.size(Layout::Activation4D::batch), // batch_size
input_padded_contig_nhwc.size(Layout::Activation4D::height), // input_height
input_padded_contig_nhwc.size(Layout::Activation4D::width), // input_width
input_padded_contig_nhwc.data_ptr<float>(), // input
output_padded_contig_nhwc.data_ptr<float>(), // output
nullptr, // output_height_out
nullptr, // output_width_out
caffe2::pthreadpool_()); // threadpool
TORCH_CHECK(
xnn_status_success == reshape_status,
"xnn_reshape_max_pooling2d_nhwc_f32 failed!");
const xnn_status setup_status = xnn_setup_max_pooling2d_nhwc_f32(
max_pool_op, // operator
input_padded_contig_nhwc.data_ptr<float>(), // input
output_padded_contig_nhwc.data_ptr<float>()); // output
TORCH_CHECK(
xnn_status_success == setup_status,
"xnn_setup_max_pooling2d_nhwc_f32 failed!");

View File

@ -619,7 +619,13 @@ if(USE_XNNPACK AND NOT USE_SYSTEM_XNNPACK)
# Disable ARM BF16 and FP16 vector for now; unused and causes build failures because
# these new ISA features may not be supported on older compilers
set(XNNPACK_ENABLE_ARM_BF16 OFF CACHE BOOL "")
set(XNNPACK_ENABLE_ARM_FP16_VECTOR OFF CACHE BOOL "")
# Disable AVXVNNI for now, older clang versions seem not to support it
# (clang 12 is where avx-vnni support is added)
set(XNNPACK_ENABLE_AVXVNNI OFF CACHE BOOL "")
# Disable I8MM For CI since clang 9 does not support neon i8mm.
set(XNNPACK_ENABLE_ARM_I8MM OFF CACHE BOOL "")
# Setting this global PIC flag for all XNNPACK targets.
# This is needed for Object libraries within XNNPACK which must

View File

@ -127,6 +127,7 @@ cxx_library(
"cpuinfo/wrappers/linux/multiline.c",
"cpuinfo/wrappers/linux/processors.c",
"cpuinfo/wrappers/linux/smallfile.c",
"cpuinfo/wrappers/log.c",
"cpuinfo/wrappers/mach/topology.c",
"cpuinfo/wrappers/x86/cache/descriptor.c",
"cpuinfo/wrappers/x86/cache/deterministic.c",

View File

@ -9,6 +9,7 @@ CPUINFO_SOURCES = {
"init.c",
"api.c",
"cache.c",
"log.c",
],
"defined(__linux__)": [
"linux/multiline.c",

View File

@ -8,16 +8,22 @@ import logging
BANNER = "Auto-generated by generate-wrappers.py script. Do not modify"
WRAPPER_SRC_NAMES = {
"PROD_SCALAR_PORTABLE_MICROKERNEL_SRCS": None,
"PROD_SCALAR_AARCH32_MICROKERNEL_SRCS" : "defined(__arm__)",
"PROD_SCALAR_MICROKERNEL_SRCS": None,
"PROD_FMA_MICROKERNEL_SRCS": "defined(__riscv) || defined(__riscv__)",
"PROD_ARMSIMD32_MICROKERNEL_SRCS": "defined(__arm__)",
"PROD_FP16ARITH_MICROKERNEL_SRCS": "defined(__arm__)",
"PROD_NEON_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"PROD_NEONFP16_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"PROD_NEON_AARCH64_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"PROD_NEONFMA_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"PROD_AARCH64_NEON_MICROKERNEL_SRCS": "defined(__aarch64__)",
"PROD_NEON_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)",
"PROD_NEONV8_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"PROD_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS": "defined(__aarch64__)",
"PROD_NEONFP16ARITH_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)",
"PROD_NEONDOT_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)",
"PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"PROD_NEONDOTFP16ARITH_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)",
"PROD_NEONI8MM_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"PROD_SSE_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"PROD_SSE2_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"PROD_SSSE3_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
@ -30,42 +36,13 @@ WRAPPER_SRC_NAMES = {
"PROD_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"PROD_AVX512SKX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"PROD_AVX512VBMI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"PROD_AVX512VNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"PROD_RVV_MICROKERNEL_SRCS": "defined(__riscv) || defined(__riscv__)",
"PROD_AVXVNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"AARCH32_ASM_MICROKERNEL_SRCS": "defined(__arm__)",
"AARCH64_ASM_MICROKERNEL_SRCS": "defined(__aarch64__)",
# add additoonal:
"PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"ALL_ARMSIMD32_MICROKERNEL_SRCS": "defined(__arm__)",
"ALL_AVX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"ALL_AVX2_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"ALL_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
'ALL_AVX512SKX_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
'ALL_AVX512VBMI_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
'ALL_F16C_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
'ALL_FMA3_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
'ALL_FP16ARITH_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
'ALL_NEON_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
'ALL_NEON_AARCH64_MICROKERNEL_SRCS': "defined(__aarch64__)",
'ALL_NEONBF16_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
'ALL_NEONDOT_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
'ALL_NEONFMA_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
'ALL_NEONFMA_AARCH64_MICROKERNEL_SRCS': "defined(__aarch64__)",
'ALL_NEONFP16_MICROKERNEL_SRCS':"defined(__arm__) || defined(__aarch64__)",
'ALL_NEONFP16ARITH_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
'ALL_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS': "defined(__aarch64__)",
'ALL_NEONV8_MICROKERNEL_SRCS': "defined(__aarch64__)",
'ALL_SCALAR_MICROKERNEL_SRCS': "defined(__arm__)",
'ALL_SSE_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
'ALL_SSE2_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
'ALL_SSE41_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
'ALL_SSSE3_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
'ALL_XOP_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
'AARCH32_ASM_MICROKERNEL_SRCS': "defined(__arm__)",
"PROD_FP16ARITH_MICROKERNEL_SRCS": "defined(__aarch64__)",
"PROD_NEONFP16ARITH_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"PROD_SCALAR_MICROKERNEL_SRCS": "defined(__arm__)",
# add non-prod microkernel sources here:
}
SRC_NAMES = set([
@ -73,12 +50,24 @@ SRC_NAMES = set([
"SUBGRAPH_SRCS",
"LOGGING_SRCS",
"XNNPACK_SRCS",
"HOT_SRCS",
"TABLE_SRCS",
"JIT_SRCS",
"JIT_AARCH32_SRCS",
"JIT_AARCH64_SRCS",
"PROD_SCALAR_PORTABLE_MICROKERNEL_SRCS",
"PROD_SCALAR_MICROKERNEL_SRCS",
"PROD_FMA_MICROKERNEL_SRCS",
"PROD_ARMSIMD32_MICROKERNEL_SRCS",
"PROD_FP16ARITH_MICROKERNEL_SRCS",
"PROD_NEON_MICROKERNEL_SRCS",
"PROD_NEONFP16_MICROKERNEL_SRCS",
"PROD_NEONFMA_MICROKERNEL_SRCS",
"PROD_NEON_AARCH64_MICROKERNEL_SRCS",
"PROD_NEONV8_MICROKERNEL_SRCS",
"PROD_NEONFP16ARITH_MICROKERNEL_SRCS",
"PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS",
"PROD_NEONDOT_MICROKERNEL_SRCS",
"PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS",
"PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS",
"PROD_NEONDOTFP16ARITH_AARCH64_MICROKERNEL_SRCS",
"PROD_NEONI8MM_MICROKERNEL_SRCS",
"PROD_SSE_MICROKERNEL_SRCS",
"PROD_SSE2_MICROKERNEL_SRCS",
"PROD_SSSE3_MICROKERNEL_SRCS",
@ -90,59 +79,14 @@ SRC_NAMES = set([
"PROD_AVX2_MICROKERNEL_SRCS",
"PROD_AVX512F_MICROKERNEL_SRCS",
"PROD_AVX512SKX_MICROKERNEL_SRCS",
"PROD_SCALAR_MICROKERNEL_SRCS",
"PROD_SCALAR_AARCH32_MICROKERNEL_SRCS",
"PROD_SCALAR_RISCV_MICROKERNEL_SRCS",
"PROD_ARMSIMD32_MICROKERNEL_SRCS",
"PROD_FP16ARITH_MICROKERNEL_SRCS",
"PROD_NEON_MICROKERNEL_SRCS",
"PROD_NEONFP16_MICROKERNEL_SRCS",
"PROD_NEONFMA_MICROKERNEL_SRCS",
"PROD_NEON_AARCH64_MICROKERNEL_SRCS",
"PROD_NEONV8_MICROKERNEL_SRCS",
"PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS",
"PROD_NEONDOT_MICROKERNEL_SRCS",
"PROD_SSE2_MICROKERNEL_SRCS",
"PROD_SSSE3_MICROKERNEL_SRCS",
"PROD_SSE41_MICROKERNEL_SRCS",
"PROD_AVX_MICROKERNEL_SRCS",
"PROD_F16C_MICROKERNEL_SRCS",
"PROD_AVX512VBMI_MICROKERNEL_SRCS",
"PROD_NEONFP16ARITH_MICROKERNEL_SRCS",
"PROD_AVX512VNNI_MICROKERNEL_SRCS",
"PROD_RVV_MICROKERNEL_SRCS",
"PROD_AVXVNNI_MICROKERNEL_SRCS",
"AARCH32_ASM_MICROKERNEL_SRCS",
"AARCH64_ASM_MICROKERNEL_SRCS",
# new adding libs:
'ALL_ARMSIMD32_MICROKERNEL_SRCS',
'ALL_AVX_MICROKERNEL_SRCS',
'ALL_AVX2_MICROKERNEL_SRCS',
'ALL_AVX512F_MICROKERNEL_SRCS',
'ALL_AVX512SKX_MICROKERNEL_SRCS',
'ALL_AVX512VBMI_MICROKERNEL_SRCS',
'ALL_F16C_MICROKERNEL_SRCS',
'ALL_FMA3_MICROKERNEL_SRCS',
'ALL_FP16ARITH_MICROKERNEL_SRCS',
'ALL_HEXAGON_MICROKERNEL_SRCS',
'ALL_NEON_MICROKERNEL_SRCS',
'ALL_NEON_AARCH64_MICROKERNEL_SRCS',
'ALL_NEONBF16_MICROKERNEL_SRCS',
'ALL_NEONBF16_AARCH64_MICROKERNEL_SRCS',
'ALL_NEONDOT_MICROKERNEL_SRCS',
'ALL_NEONFMA_MICROKERNEL_SRCS',
'ALL_NEONFMA_AARCH64_MICROKERNEL_SRCS',
'ALL_NEONFP16_MICROKERNEL_SRCS',
'ALL_NEONFP16ARITH_MICROKERNEL_SRCS',
'ALL_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS',
'ALL_NEONV8_MICROKERNEL_SRCS',
'ALL_SCALAR_MICROKERNEL_SRCS',
'ALL_SSE_MICROKERNEL_SRCS',
'ALL_SSE2_MICROKERNEL_SRCS',
'ALL_SSE41_MICROKERNEL_SRCS',
'ALL_SSSE3_MICROKERNEL_SRCS',
'ALL_WASM_MICROKERNEL_SRCS',
'ALL_WASMRELAXEDSIMD_MICROKERNEL_SRCS',
'ALL_WASMSIMD_MICROKERNEL_SRCS',
'ALL_XOP_MICROKERNEL_SRCS',
'AARCH32_ASM_MICROKERNEL_SRCS',
'AARCH64_ASM_MICROKERNEL_SRCS',
# add non-prod microkernel sources here:
])
def handle_singleline_parse(line):
@ -150,11 +94,10 @@ def handle_singleline_parse(line):
end_index = line.find(")")
line = line[start_index+1:end_index]
key_val = line.split(" ")
return key_val[0], key_val[1][4:]
return key_val[0], list(map(lambda x: x[4:], key_val[1:]))
def update_sources(xnnpack_path, cmakefile = "XNNPACK/CMakeLists.txt"):
sources = collections.defaultdict(list)
count = 0
with open(os.path.join(xnnpack_path, cmakefile)) as cmake:
lines = cmake.readlines()
i = 0
@ -163,7 +106,7 @@ def update_sources(xnnpack_path, cmakefile = "XNNPACK/CMakeLists.txt"):
if lines[i].startswith("SET") and "src/" in lines[i]:
name, val = handle_singleline_parse(line)
sources[name].append(val)
sources[name].extend(val)
i+=1
continue

File diff suppressed because it is too large.

File diff suppressed because it is too large.

File diff suppressed because it is too large.