Revert "[8/n] Update XNNPACK Version Part 8 Everything Remaining to get it to work (#115587)"

This reverts commit a8dc9d8e353ddcf7db0247349a3acd0dd37fcc6f.

Reverted https://github.com/pytorch/pytorch/pull/115587 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/115587#issuecomment-1852835898))
PyTorch MergeBot
2023-12-12 21:28:09 +00:00
parent ac4f6beb00
commit c3ed9f65a0
20 changed files with 13589 additions and 1914 deletions
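For context on the hunks that follow: #115587 moved these quantized operators from XNNPACK's single setup call, which takes shapes, data pointers, and the threadpool together, to the newer reshape-then-setup pair, and this revert restores the single-call form. Below is a minimal illustrative sketch (not part of the diff) of the two calling sequences, using the qs8 add operator whose wrappers appear in the first hunk; the helper name run_qs8_add, the pre-created add_op, and the buffers are assumptions made only for the example.

// Sketch only: assumes `add_op` was already created (e.g. via
// xnn_create_add_nd_qs8) and that a, b, out point at int8 buffers
// matching the given shapes.
#include <cstdint>
#include <vector>

#include <pthreadpool.h>
#include <xnnpack.h>

enum xnn_status run_qs8_add(
    xnn_operator_t add_op,
    const std::vector<size_t>& a_shape,
    const std::vector<size_t>& b_shape,
    const int8_t* a,
    const int8_t* b,
    int8_t* out,
    pthreadpool_t pool) {
  // Pre-#115587 API (restored by this revert): one setup call carries the
  // shapes, the data pointers, and the threadpool.
  enum xnn_status status = xnn_setup_add_nd_qs8(
      add_op,
      a_shape.size(), a_shape.data(),
      b_shape.size(), b_shape.data(),
      a, b, out,
      pool);

  // Post-#115587 API (removed by this revert): reshape takes the shapes and
  // the threadpool, and setup only binds the data pointers.
  //   status = xnn_reshape_add_nd_qs8(
  //       add_op,
  //       a_shape.size(), a_shape.data(),
  //       b_shape.size(), b_shape.data(),
  //       pool);
  //   status = xnn_setup_add_nd_qs8(add_op, a, b, out);

  if (status != xnn_status_success) {
    return status;
  }
  return xnn_run_operator(add_op, pool);
}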

View File

@ -259,32 +259,24 @@ enum xnn_status xnnp_create_add_nd(
}
C10_ALWAYS_INLINE
enum xnn_status xnnp_reshape_add_nd(
enum xnn_status xnnp_setup_add_nd(
xnn_operator_t op,
const std::vector<size_t>& a_shape,
const std::vector<size_t>& b_shape,
pthreadpool_t pt_pool) {
return xnn_reshape_add_nd_qs8(
op, /* xnn_operator_t add_op */
a_shape.size(), /* size_t num_input1_dims */
a_shape.data(), /* const size_t* input1_shape */
b_shape.size(), /* size_t num_input2_dims */
b_shape.data(), /* const size_t* input2_shape */
pt_pool); /* pthreadpool_t threadpool */
}
C10_ALWAYS_INLINE
enum xnn_status xnnp_setup_add_nd(
xnn_operator_t op,
const int8_t* da,
const int8_t* db,
int8_t* dc,
pthreadpool_t pt_pool) {
return xnn_setup_add_nd_qs8(
op, /* xnn_operator_t add_op */
a_shape.size(), /* size_t num_input1_dims */
a_shape.data(), /* const size_t* input1_shape */
b_shape.size(), /* size_t num_input2_dims */
b_shape.data(), /* const size_t* input2_shape */
da, /* const int8_t* input1 */
db, /* const int8_t* input2 */
dc); /* int8_t* output */
dc, /* int8_t* output */
pt_pool); /* pthreadpool_t threadpool */
}
template <typename scalar_t, bool ReLUFused = false>
@ -356,20 +348,11 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
const auto qa_shape = xnnp_utils::get_mem_format_aware_shape(qa_contig);
const auto qb_shape = xnnp_utils::get_mem_format_aware_shape(qb_contig);
// Reshape the operator
status = xnnp_reshape_add_nd(
xnnp_add_operator.get(),
qa_shape,
qb_shape,
caffe2::pthreadpool_());
TORCH_CHECK(
status == xnn_status_success,
func_name, ": xnn reshape operator failed(", status,")!");
// Setup the operator
status = xnnp_setup_add_nd(
xnnp_add_operator.get(),
qa_shape,
qb_shape,
reinterpret_cast<const underlying_t*>(qa_contig.data_ptr<scalar_t>()),
reinterpret_cast<const underlying_t*>(qb_contig.data_ptr<scalar_t>()),
reinterpret_cast<underlying_t*>(qy.data_ptr<scalar_t>()),

View File

@ -100,7 +100,6 @@ enum xnn_status xnnp_create_convolution2d_nhwc(
op_max, /* int8_t output_max */
flags, /* uint32_t flags */
nullptr, /* xnn_caches_t caches */
nullptr, /* xnn_weights_cache_t weights_cache */
op); /* xnn_operator_t* deconvolution_op_out */
}
@ -133,10 +132,9 @@ enum xnn_status xnnp_create_convolution2d_nhwc(
op_max, /* int8_t output_max */
flags, /* uint32_t flags */
nullptr, /* xnn_caches_t caches */
nullptr, /* xnn_weights_cache_t weights_cache */
op); /* xnn_operator_t* convolution_op_out */
} else { /* per_channel */
return xnn_create_convolution2d_nhwc_qs8_qc8w(
return xnn_create_convolution2d_nhwc_qc8(
pad_top, /* uint32_t input_padding_top */
pad_right, /* uint32_t input_padding_right */
pad_bottom, /* uint32_t input_padding_bottom */
@ -163,99 +161,58 @@ enum xnn_status xnnp_create_convolution2d_nhwc(
op_max, /* int8_t output_max */
flags, /* uint32_t flags */
nullptr, /* xnn_caches_t caches */
nullptr, /* xnn_weights_cache_t weights_cache */
op); /* xnn_operator_t* convolution_op_out */
}
}
/*
* Series of reshape wrapper functions to call xnn_reshape_[de]conv* functions.
*/
C10_ALWAYS_INLINE
enum xnn_status xnnp_reshape_convolution2d_nhwc(
xnn_operator_t op,
size_t batch,
size_t in_h,
size_t in_w,
pthreadpool_t pt_pool,
bool per_channel = false,
bool transpose = false,
uint32_t adj_h = 0,
uint32_t adj_w = 0) {
if(transpose) {
TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!");
return xnn_reshape_deconvolution2d_nhwc_qs8(
op, /* xnn_operator_t deconvolution_op */
batch, /* size_t batch_size */
in_h, /* size_t input_height */
in_w, /* size_t input_width */
adj_h, /* uint32_t adjustment_height */
adj_w, /* uint32_t adjustment_width */
nullptr, /* size_t* output_height_out */
nullptr, /* size_t* output_width_out */
pt_pool); /* pthreadpool_t threadpool */
}
size_t workspace_size = SIZE_MAX;
size_t workspace_alignment = SIZE_MAX;
if (!per_channel) {
return xnn_reshape_convolution2d_nhwc_qs8(
op, /* xnn_operator_t convolution_op */
batch, /* size_t batch_size */
in_h, /* size_t input_height */
in_w, /* size_t input_width */
&workspace_size, /* size_t* workspace_size */
&workspace_alignment, /* size_t* workspace_alignment */
nullptr, /* size_t* output_height_out */
nullptr, /* size_t* output_width_out */
pt_pool); /* pthreadpool_t threadpool */
} else { /* per_channel */
return xnn_reshape_convolution2d_nhwc_qs8_qc8w(
op, /* xnn_operator_t convolution_op */
batch, /* size_t batch_size */
in_h, /* size_t input_height */
in_w, /* size_t input_width */
&workspace_size, /* size_t* workspace_size */
&workspace_alignment, /* size_t* workspace_alignment */
nullptr, /* size_t* output_height_out */
nullptr, /* size_t* output_width_out */
pt_pool); /* pthreadpool_t threadpool */
}
}
/*
* Series of setup wrapper functions to call xnn_setup_[de]conv* functions.
*/
C10_ALWAYS_INLINE
enum xnn_status xnnp_setup_convolution2d_nhwc(
xnn_operator_t op,
size_t batch,
size_t in_h,
size_t in_w,
const int8_t* inp,
int8_t* outp,
pthreadpool_t pt_pool,
bool per_channel = false,
bool transpose = false) {
bool transpose = false,
uint32_t adj_h = 0,
uint32_t adj_w = 0) {
if(transpose) {
TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!");
return xnn_setup_deconvolution2d_nhwc_qs8(
op, /* xnn_operator_t deconvolution_op */
batch, /* size_t batch_size */
in_h, /* size_t input_height */
in_w, /* size_t input_width */
adj_h, /* uint32_t adjustment_height */
adj_w, /* uint32_t adjustment_width */
inp, /* const int8_t* input */
outp); /* int8_t* output */
outp, /* int8_t* output */
pt_pool); /* pthreadpool_t threadpool */
}
if (!per_channel) {
return xnn_setup_convolution2d_nhwc_qs8(
op, /* xnn_operator_t deconvolution_op */
nullptr, /* void workspace */
inp, /* const int8_t* input */
outp); /* int8_t* output */
op, /* xnn_operator_t convolution_op */
batch, /* size_t batch_size */
in_h, /* size_t input_height */
in_w, /* size_t input_width */
inp, /* const int8_t* input */
outp, /* int8_t* output */
pt_pool); /* pthreadpool_t threadpool */
} else { /* per_channel */
return xnn_setup_convolution2d_nhwc_qs8_qc8w(
op, /* xnn_operator_t deconvolution_op */
nullptr, /* void workspace */
inp, /* const int8_t* input */
outp); /* int8_t* output */
return xnn_setup_convolution2d_nhwc_qc8(
op, /* xnn_operator_t convolution_op */
batch, /* size_t batch_size */
in_h, /* size_t input_height */
in_w, /* size_t input_width */
inp, /* const int8_t* input */
outp, /* int8_t* output */
pt_pool); /* pthreadpool_t threadpool */
}
}
@ -301,31 +258,22 @@ enum xnn_status xnnp_create_fully_connected_nc(
output_max, /* int8_t output_max */
flags, /* uint32_t flags */
nullptr, /* xnn_caches_t caches */
nullptr, /* xnn_weights_cache_t */
fully_connected_op_out); /* xnn_operator_t* fully_connected_op_out */
}
C10_ALWAYS_INLINE
enum xnn_status xnnp_reshape_fully_connected_nc(
xnn_operator_t fully_connected_op,
size_t batch_size,
pthreadpool_t threadpool) {
return xnn_reshape_fully_connected_nc_qs8(
fully_connected_op, /* xnn_operator_t fully_connected_op */
batch_size, /* size_t batch_size */
threadpool); /* pthreadpool_t threadpool */
}
C10_ALWAYS_INLINE
enum xnn_status xnnp_setup_fully_connected_nc(
xnn_operator_t fully_connected_op,
size_t batch_size,
const int8_t* input,
int8_t* output) {
int8_t* output,
pthreadpool_t threadpool) {
return xnn_setup_fully_connected_nc_qs8(
fully_connected_op, /* xnn_operator_t fully_connected_op */
batch_size, /* size_t batch_size */
input, /* const int8_t* input */
output /* int8_t* output */
);
output, /* int8_t* output */
threadpool); /* pthreadpool_t threadpool */
}
} // namespace xnnp_utils

View File

@ -770,12 +770,14 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl_xnnp(
output_zero_point,
c10::MemoryFormat::ChannelsLast);
// Reshape the operator
status = at::native::xnnp_utils::xnnp_reshape_convolution2d_nhwc(
// Setup the operator
status = at::native::xnnp_utils::xnnp_setup_convolution2d_nhwc(
xnnp_convolution_op.get(),
N,
H,
W,
reinterpret_cast<const underlying_t*>(act_nhwc.template data_ptr<scalar_t>()),
reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>()),
caffe2::pthreadpool_(),
per_channel(),
transpose(),
@ -789,21 +791,6 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl_xnnp(
status,
")");
// Setup the operator
status = at::native::xnnp_utils::xnnp_setup_convolution2d_nhwc(
xnnp_convolution_op.get(),
reinterpret_cast<const underlying_t*>(act_nhwc.template data_ptr<scalar_t>()),
reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>()),
per_channel(),
transpose());
TORCH_CHECK(
status == xnn_status_success,
func_name,
": xnn setup operator failed(",
status,
")");
// Run the operator
status = xnn_run_operator(
xnnp_convolution_op.get(), /* xnn_operator_t op */

View File

@ -565,19 +565,14 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl_xnnp(
rows_input *= input_contig.size(i);
}
// Reshape the operator
status = at::native::xnnp_utils::xnnp_reshape_fully_connected_nc(
xnnp_linear_op.get(),
rows_input, /* batch_size */
caffe2::pthreadpool_());
// Setup the operator
status = at::native::xnnp_utils::xnnp_setup_fully_connected_nc(
xnnp_linear_op.get(),
rows_input, /* batch_size */
reinterpret_cast<const underlying_t*>(
input_contig.template data_ptr<scalar_t>()),
reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>())
);
reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>()),
caffe2::pthreadpool_());
TORCH_CHECK(
status == xnn_status_success,

View File

@ -139,29 +139,17 @@ Tensor _mul_out_xnnpack(
const auto self_shape = xnnp_utils::get_mem_format_aware_shape(self_contig);
const auto other_shape = xnnp_utils::get_mem_format_aware_shape(other_contig);
// reshape operator
status = xnn_reshape_multiply_nd_qs8(
// set up operator
status = xnn_setup_multiply_nd_qs8(
xnnp_qmul_operator.get(),
self_shape.size(),
self_shape.data(),
other_shape.size(),
other_shape.data(),
caffe2::pthreadpool_());
TORCH_CHECK(
status == xnn_status_success,
func_name,
": xnn reshape operator failed(",
status,
")!");
// set up operator
status = xnn_setup_multiply_nd_qs8(
xnnp_qmul_operator.get(),
reinterpret_cast<const underlying_t*>(self_contig.data_ptr<scalar_t>()),
reinterpret_cast<const underlying_t*>(other_contig.data_ptr<scalar_t>()),
reinterpret_cast<underlying_t*>(out.data_ptr<scalar_t>())
);
reinterpret_cast<underlying_t*>(out.data_ptr<scalar_t>()),
caffe2::pthreadpool_());
TORCH_CHECK(
status == xnn_status_success,

View File

@ -34,19 +34,12 @@ static Tensor& hardswish_impl(Tensor& input, Tensor& output) {
Operator hardswish_scoped_op(hardswish_op);
const xnn_status reshape_status = xnn_reshape_hardswish_nc_f32(
hardswish_op,
input.numel(), // Batch
caffe2::pthreadpool_()); // threadpool
TORCH_CHECK(
xnn_status_success == reshape_status,
"xnn_reshape_hardswish_nc_f32 failed!");
const xnn_status setup_status = xnn_setup_hardswish_nc_f32(
hardswish_op,
input.numel(), // Batch
input.data_ptr<float>(),
output.data_ptr<float>());
output.data_ptr<float>(),
caffe2::pthreadpool_()); // threadpool
TORCH_CHECK(
xnn_status_success == setup_status,

View File

@ -7,13 +7,18 @@
namespace at::native::xnnpack {
bool use_global_average_pool(const Tensor& input) {
return xnnpack::available() && (1 <= input.ndimension()) &&
(input.device().is_cpu()) && (kFloat == input.scalar_type()) &&
!input.requires_grad() && true;
bool use_global_average_pool(
const Tensor& input) {
return xnnpack::available() &&
(1 <= input.ndimension()) &&
(input.device().is_cpu()) &&
(kFloat == input.scalar_type()) &&
!input.requires_grad() &&
true;
}
Tensor global_average_pool(const Tensor& input) {
Tensor global_average_pool(
const Tensor& input) {
using namespace internal;
const Tensor input_padded_contig_nhwc =
@ -22,10 +27,10 @@ Tensor global_average_pool(const Tensor& input) {
Tensor output = mobile::empty_with_tail_padding(
{
input_padded_contig_nhwc.size(Layout::Activation4D::batch),
input_padded_contig_nhwc.size(Layout::Activation4D::channels),
1,
1,
input_padded_contig_nhwc.size(Layout::Activation4D::batch),
input_padded_contig_nhwc.size(Layout::Activation4D::channels),
1,
1,
},
input_padded_contig_nhwc.options().dtype(),
MemoryFormat::ChannelsLast,
@ -33,61 +38,42 @@ Tensor global_average_pool(const Tensor& input) {
xnn_operator_t global_average_pooling_op{};
const xnn_status create_status = xnn_create_global_average_pooling_nwc_f32(
input_padded_contig_nhwc.size(Layout::Activation4D::channels), // channels
input_padded_contig_nhwc.size(
Layout::Activation4D::channels), // input stride
input_padded_contig_nhwc.size(
Layout::Activation4D::channels), // output stride
-std::numeric_limits<float>::infinity(),
std::numeric_limits<float>::infinity(),
0 /* flags */,
&global_average_pooling_op);
input_padded_contig_nhwc.size(Layout::Activation4D::channels), // channels
input_padded_contig_nhwc.size(
Layout::Activation4D::channels), // input stride
input_padded_contig_nhwc.size(
Layout::Activation4D::channels), // output stride
-std::numeric_limits<float>::infinity(),
std::numeric_limits<float>::infinity(),
0 /* flags */,
&global_average_pooling_op);
TORCH_CHECK(
xnn_status_success == create_status,
"xnn_create_global_average_pooling_nwc_f32 failed!");
xnn_status_success == create_status,
"xnn_create_global_average_pooling_nwc_f32 failed!");
Operator global_avg_pool_scoped_op(global_average_pooling_op);
size_t workspace_size = 0;
size_t workspace_alignment = 0;
const xnn_status reshape_status = xnn_reshape_global_average_pooling_nwc_f32(
const xnn_status setup_status = xnn_setup_global_average_pooling_nwc_f32(
global_average_pooling_op,
input_padded_contig_nhwc.size(Layout::Activation4D::batch), // batch_size
input_padded_contig_nhwc.size(Layout::Activation4D::width) *
input_padded_contig_nhwc.size(Layout::Activation4D::height), // width
&workspace_size, // workspace_size
&workspace_alignment, // workspace_alignment
input_padded_contig_nhwc.data_ptr<float>(),
output.data_ptr<float>(),
caffe2::pthreadpool_());
TORCH_CHECK(
xnn_status_success == reshape_status,
"xnn_reshape_global_average_pooling_nwc_f32 failed!");
xnn_status_success == setup_status,
"xnn_setup_global_average_pooling_nwc_f32 failed!");
// Create Workspace pointer, which we will align and pad with 16 bytes
size_t xnnpack_buffer_padding = 16;
std::vector<char> workspace_vector(workspace_size + workspace_alignment + xnnpack_buffer_padding);
void* maybe_aligned_workspace = workspace_vector.data();
void* aligned_workspace =
(void*)((intptr_t)maybe_aligned_workspace + workspace_alignment - (intptr_t)maybe_aligned_workspace % workspace_alignment);
const xnn_status setup_status = xnn_setup_global_average_pooling_nwc_f32(
global_average_pooling_op,
aligned_workspace,
input_padded_contig_nhwc.data_ptr<float>(),
output.data_ptr<float>());
const xnn_status run_status = xnn_run_operator(
global_average_pooling_op,
caffe2::pthreadpool_());
TORCH_CHECK(
xnn_status_success == setup_status,
"xnn_setup_global_average_pooling_nwc_f32 failed!");
const xnn_status run_status =
xnn_run_operator(global_average_pooling_op, caffe2::pthreadpool_());
TORCH_CHECK(
xnn_status_success == run_status,
"xnn_setup_global_average_pooling_nwc_f32 failed!");
xnn_status_success == run_status,
"xnn_setup_global_average_pooling_nwc_f32 failed!");
return output.to(input.suggest_memory_format());
}

View File

@ -79,19 +79,12 @@ Tensor channel_shuffle(
input_padded_contig_nhwc.size(Layout::Activation4D::height) *
input_padded_contig_nhwc.size(Layout::Activation4D::width);
const xnn_status reshape_status = xnn_reshape_channel_shuffle_nc_x32(
channel_shuffle_op, // operator
batch_size, // batch_size
caffe2::pthreadpool_()); // threadpool
TORCH_CHECK(
xnn_status_success == reshape_status,
"xnn_reshape_channel_shuffle_nc_x32 failed!");
const xnn_status setup_status = xnn_setup_channel_shuffle_nc_x32(
channel_shuffle_op, // operator
batch_size, // batch_size
input_padded_contig_nhwc.data_ptr<float>(), // input
output_padded_contig_nhwc.data_ptr<float>()); // output
output_padded_contig_nhwc.data_ptr<float>(), // output
caffe2::pthreadpool_()); // threadpool
TORCH_CHECK(
xnn_status_success == setup_status,

View File

@ -236,7 +236,6 @@ ContextConv2D create(
output_max, // output_max
0u, // flags
nullptr, // xnn_caches_t
nullptr, // xnn_weights_cache_t
&convolution_op); // operator
} else {
for (const auto i : c10::irange(4)) {
@ -266,7 +265,6 @@ ContextConv2D create(
output_max, // output_max
0u, // flags
nullptr, // xnn_caches_t
nullptr, // xnn_weights_cache_t
&convolution_op); // operator
}
@ -340,41 +338,26 @@ Tensor run(
*/
if (context.transposed_) {
setup_status = xnn_reshape_deconvolution2d_nhwc_f32(
context.op.get(),
setup_status = xnn_setup_deconvolution2d_nhwc_f32(
context.op.get(), // operator
padded_input_nhwc.size(Layout::Activation4D::batch), // batch_size
padded_input_nhwc.size(Layout::Activation4D::height), // input_height
padded_input_nhwc.size(Layout::Activation4D::width), // input_width
context.output_padding_[0], // adjustment_height
context.output_padding_[1], // adjustment_width
nullptr, // output_height_out
nullptr, // output_width_out
padded_input_nhwc.data_ptr<float>(), // input
output.data_ptr<float>(), // output
caffe2::pthreadpool_()); // threadpool
setup_status = xnn_setup_deconvolution2d_nhwc_f32(
context.op.get(), // operator
padded_input_nhwc.data_ptr<float>(), // input
output.data_ptr<float>()); // output
} else {
size_t workspace_size = SIZE_MAX;
size_t workspace_alignment = SIZE_MAX;
setup_status = xnn_reshape_convolution2d_nhwc_f32(
context.op.get(),
setup_status = xnn_setup_convolution2d_nhwc_f32(
context.op.get(), // operator
padded_input_nhwc.size(Layout::Activation4D::batch), // batch_size
padded_input_nhwc.size(Layout::Activation4D::height), // input_height
padded_input_nhwc.size(Layout::Activation4D::width), // input_width
&workspace_size, // workspace_size
&workspace_alignment, // workspace_alignment
nullptr, // output_height_out
nullptr, // output_width_out
caffe2::pthreadpool_());
setup_status = xnn_setup_convolution2d_nhwc_f32(
context.op.get(), // operator
nullptr, // workspace
padded_input_nhwc.data_ptr<float>(), // input
output.data_ptr<float>()); // output
output.data_ptr<float>(), // output
caffe2::pthreadpool_());
}
TORCH_CHECK(

View File

@ -95,7 +95,6 @@ ContextLinear create(
output_max, // output_max
0u, // flags
nullptr, // xnn_caches_t
nullptr, // xnn_weights_cache_t
&linear_op); // operator
TORCH_CHECK(
@ -137,19 +136,12 @@ Tensor run(
padded_input.suggest_memory_format(),
padded_input.opt_names());
const xnn_status reshape_status = xnn_reshape_fully_connected_nc_f32(
context.op.get(), // operator
Layout::ActivationND::batch(padded_input.sizes()), // Batch,
caffe2::pthreadpool_()); // threadpool
TORCH_CHECK(
xnn_status_success == reshape_status,
"xnn_reshape_fully_connected_nc_f32 failed!");
const xnn_status setup_status = xnn_setup_fully_connected_nc_f32(
context.op.get(), // operator
Layout::ActivationND::batch(padded_input.sizes()), // Batch,
padded_input.data_ptr<float>(), // input
output.data_ptr<float>()); // output
output.data_ptr<float>(), // output
caffe2::pthreadpool_()); // threadpool
TORCH_CHECK(
xnn_status_success == setup_status,

View File

@ -214,23 +214,14 @@ Tensor max_pool2d(
xnn_status_success == create_status,
"xnn_create_max_pooling2d_nhwc_f32 failed!");
const xnn_status reshape_status = xnn_reshape_max_pooling2d_nhwc_f32(
const xnn_status setup_status = xnn_setup_max_pooling2d_nhwc_f32(
max_pool_op, // operator
input_padded_contig_nhwc.size(Layout::Activation4D::batch), // batch_size
input_padded_contig_nhwc.size(Layout::Activation4D::height), // input_height
input_padded_contig_nhwc.size(Layout::Activation4D::width), // input_width
nullptr, // output_height_out
nullptr, // output_width_out
caffe2::pthreadpool_()); // threadpool
TORCH_CHECK(
xnn_status_success == reshape_status,
"xnn_reshape_max_pooling2d_nhwc_f32 failed!");
const xnn_status setup_status = xnn_setup_max_pooling2d_nhwc_f32(
max_pool_op, // operator
input_padded_contig_nhwc.data_ptr<float>(), // input
output_padded_contig_nhwc.data_ptr<float>()); // output
output_padded_contig_nhwc.data_ptr<float>(), // output
caffe2::pthreadpool_()); // threadpool
TORCH_CHECK(
xnn_status_success == setup_status,

View File

@ -619,13 +619,7 @@ if(USE_XNNPACK AND NOT USE_SYSTEM_XNNPACK)
# Disable ARM BF16 and FP16 vector for now; unused and causes build failures because
# these new ISA features may not be supported on older compilers
set(XNNPACK_ENABLE_ARM_BF16 OFF CACHE BOOL "")
# Disable AVXVNNI for now, older clang versions seem not to support it
# (clang 12 is where avx-vnni support is added)
set(XNNPACK_ENABLE_AVXVNNI OFF CACHE BOOL "")
# Disable I8MM For CI since clang 9 does not support neon i8mm.
set(XNNPACK_ENABLE_ARM_I8MM OFF CACHE BOOL "")
set(XNNPACK_ENABLE_ARM_FP16_VECTOR OFF CACHE BOOL "")
# Setting this global PIC flag for all XNNPACK targets.
# This is needed for Object libraries within XNNPACK which must

View File

@ -127,7 +127,6 @@ cxx_library(
"cpuinfo/wrappers/linux/multiline.c",
"cpuinfo/wrappers/linux/processors.c",
"cpuinfo/wrappers/linux/smallfile.c",
"cpuinfo/wrappers/log.c",
"cpuinfo/wrappers/mach/topology.c",
"cpuinfo/wrappers/x86/cache/descriptor.c",
"cpuinfo/wrappers/x86/cache/deterministic.c",

View File

@ -9,7 +9,6 @@ CPUINFO_SOURCES = {
"init.c",
"api.c",
"cache.c",
"log.c",
],
"defined(__linux__)": [
"linux/multiline.c",

View File

@ -8,22 +8,16 @@ import logging
BANNER = "Auto-generated by generate-wrappers.py script. Do not modify"
WRAPPER_SRC_NAMES = {
"PROD_SCALAR_MICROKERNEL_SRCS": None,
"PROD_FMA_MICROKERNEL_SRCS": "defined(__riscv) || defined(__riscv__)",
"PROD_ARMSIMD32_MICROKERNEL_SRCS": "defined(__arm__)",
"PROD_FP16ARITH_MICROKERNEL_SRCS": "defined(__arm__)",
"PROD_SCALAR_PORTABLE_MICROKERNEL_SRCS": None,
"PROD_SCALAR_AARCH32_MICROKERNEL_SRCS" : "defined(__arm__)",
"PROD_NEON_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"PROD_NEONFP16_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"PROD_NEON_AARCH64_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"PROD_NEONFMA_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"PROD_NEON_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)",
"PROD_AARCH64_NEON_MICROKERNEL_SRCS": "defined(__aarch64__)",
"PROD_NEONV8_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"PROD_NEONFP16ARITH_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)",
"PROD_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS": "defined(__aarch64__)",
"PROD_NEONDOT_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)",
"PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"PROD_NEONDOTFP16ARITH_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)",
"PROD_NEONI8MM_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"PROD_SSE_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"PROD_SSE2_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"PROD_SSSE3_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
@ -36,13 +30,42 @@ WRAPPER_SRC_NAMES = {
"PROD_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"PROD_AVX512SKX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"PROD_AVX512VBMI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"PROD_AVX512VNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"PROD_RVV_MICROKERNEL_SRCS": "defined(__riscv) || defined(__riscv__)",
"PROD_AVXVNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"AARCH32_ASM_MICROKERNEL_SRCS": "defined(__arm__)",
"AARCH64_ASM_MICROKERNEL_SRCS": "defined(__aarch64__)",
# add non-prod microkernel sources here:
# add additoonal:
"PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"ALL_ARMSIMD32_MICROKERNEL_SRCS": "defined(__arm__)",
"ALL_AVX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"ALL_AVX2_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
"ALL_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
'ALL_AVX512SKX_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
'ALL_AVX512VBMI_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
'ALL_F16C_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
'ALL_FMA3_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
'ALL_FP16ARITH_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
'ALL_NEON_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
'ALL_NEON_AARCH64_MICROKERNEL_SRCS': "defined(__aarch64__)",
'ALL_NEONBF16_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
'ALL_NEONDOT_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
'ALL_NEONFMA_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
'ALL_NEONFMA_AARCH64_MICROKERNEL_SRCS': "defined(__aarch64__)",
'ALL_NEONFP16_MICROKERNEL_SRCS':"defined(__arm__) || defined(__aarch64__)",
'ALL_NEONFP16ARITH_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
'ALL_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS': "defined(__aarch64__)",
'ALL_NEONV8_MICROKERNEL_SRCS': "defined(__aarch64__)",
'ALL_SCALAR_MICROKERNEL_SRCS': "defined(__arm__)",
'ALL_SSE_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
'ALL_SSE2_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
'ALL_SSE41_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
'ALL_SSSE3_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
'ALL_XOP_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
'AARCH32_ASM_MICROKERNEL_SRCS': "defined(__arm__)",
"PROD_FP16ARITH_MICROKERNEL_SRCS": "defined(__aarch64__)",
"PROD_NEONFP16ARITH_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
"PROD_SCALAR_MICROKERNEL_SRCS": "defined(__arm__)",
}
SRC_NAMES = set([
@ -50,24 +73,12 @@ SRC_NAMES = set([
"SUBGRAPH_SRCS",
"LOGGING_SRCS",
"XNNPACK_SRCS",
"HOT_SRCS",
"TABLE_SRCS",
"JIT_SRCS",
"PROD_SCALAR_MICROKERNEL_SRCS",
"PROD_FMA_MICROKERNEL_SRCS",
"PROD_ARMSIMD32_MICROKERNEL_SRCS",
"PROD_FP16ARITH_MICROKERNEL_SRCS",
"PROD_NEON_MICROKERNEL_SRCS",
"PROD_NEONFP16_MICROKERNEL_SRCS",
"PROD_NEONFMA_MICROKERNEL_SRCS",
"PROD_NEON_AARCH64_MICROKERNEL_SRCS",
"PROD_NEONV8_MICROKERNEL_SRCS",
"PROD_NEONFP16ARITH_MICROKERNEL_SRCS",
"PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS",
"PROD_NEONDOT_MICROKERNEL_SRCS",
"PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS",
"PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS",
"PROD_NEONDOTFP16ARITH_AARCH64_MICROKERNEL_SRCS",
"PROD_NEONI8MM_MICROKERNEL_SRCS",
"JIT_AARCH32_SRCS",
"JIT_AARCH64_SRCS",
"PROD_SCALAR_PORTABLE_MICROKERNEL_SRCS",
"PROD_SSE_MICROKERNEL_SRCS",
"PROD_SSE2_MICROKERNEL_SRCS",
"PROD_SSSE3_MICROKERNEL_SRCS",
@ -79,14 +90,59 @@ SRC_NAMES = set([
"PROD_AVX2_MICROKERNEL_SRCS",
"PROD_AVX512F_MICROKERNEL_SRCS",
"PROD_AVX512SKX_MICROKERNEL_SRCS",
"PROD_SCALAR_MICROKERNEL_SRCS",
"PROD_SCALAR_AARCH32_MICROKERNEL_SRCS",
"PROD_SCALAR_RISCV_MICROKERNEL_SRCS",
"PROD_ARMSIMD32_MICROKERNEL_SRCS",
"PROD_FP16ARITH_MICROKERNEL_SRCS",
"PROD_NEON_MICROKERNEL_SRCS",
"PROD_NEONFP16_MICROKERNEL_SRCS",
"PROD_NEONFMA_MICROKERNEL_SRCS",
"PROD_NEON_AARCH64_MICROKERNEL_SRCS",
"PROD_NEONV8_MICROKERNEL_SRCS",
"PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS",
"PROD_NEONDOT_MICROKERNEL_SRCS",
"PROD_SSE2_MICROKERNEL_SRCS",
"PROD_SSSE3_MICROKERNEL_SRCS",
"PROD_SSE41_MICROKERNEL_SRCS",
"PROD_AVX_MICROKERNEL_SRCS",
"PROD_F16C_MICROKERNEL_SRCS",
"PROD_AVX512VBMI_MICROKERNEL_SRCS",
"PROD_AVX512VNNI_MICROKERNEL_SRCS",
"PROD_RVV_MICROKERNEL_SRCS",
"PROD_AVXVNNI_MICROKERNEL_SRCS",
"AARCH32_ASM_MICROKERNEL_SRCS",
"AARCH64_ASM_MICROKERNEL_SRCS",
"PROD_NEONFP16ARITH_MICROKERNEL_SRCS",
# add non-prod microkernel sources here:
# new adding libs:
'ALL_ARMSIMD32_MICROKERNEL_SRCS',
'ALL_AVX_MICROKERNEL_SRCS',
'ALL_AVX2_MICROKERNEL_SRCS',
'ALL_AVX512F_MICROKERNEL_SRCS',
'ALL_AVX512SKX_MICROKERNEL_SRCS',
'ALL_AVX512VBMI_MICROKERNEL_SRCS',
'ALL_F16C_MICROKERNEL_SRCS',
'ALL_FMA3_MICROKERNEL_SRCS',
'ALL_FP16ARITH_MICROKERNEL_SRCS',
'ALL_HEXAGON_MICROKERNEL_SRCS',
'ALL_NEON_MICROKERNEL_SRCS',
'ALL_NEON_AARCH64_MICROKERNEL_SRCS',
'ALL_NEONBF16_MICROKERNEL_SRCS',
'ALL_NEONBF16_AARCH64_MICROKERNEL_SRCS',
'ALL_NEONDOT_MICROKERNEL_SRCS',
'ALL_NEONFMA_MICROKERNEL_SRCS',
'ALL_NEONFMA_AARCH64_MICROKERNEL_SRCS',
'ALL_NEONFP16_MICROKERNEL_SRCS',
'ALL_NEONFP16ARITH_MICROKERNEL_SRCS',
'ALL_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS',
'ALL_NEONV8_MICROKERNEL_SRCS',
'ALL_SCALAR_MICROKERNEL_SRCS',
'ALL_SSE_MICROKERNEL_SRCS',
'ALL_SSE2_MICROKERNEL_SRCS',
'ALL_SSE41_MICROKERNEL_SRCS',
'ALL_SSSE3_MICROKERNEL_SRCS',
'ALL_WASM_MICROKERNEL_SRCS',
'ALL_WASMRELAXEDSIMD_MICROKERNEL_SRCS',
'ALL_WASMSIMD_MICROKERNEL_SRCS',
'ALL_XOP_MICROKERNEL_SRCS',
'AARCH32_ASM_MICROKERNEL_SRCS',
'AARCH64_ASM_MICROKERNEL_SRCS',
])
def handle_singleline_parse(line):
@ -94,10 +150,11 @@ def handle_singleline_parse(line):
end_index = line.find(")")
line = line[start_index+1:end_index]
key_val = line.split(" ")
return key_val[0], list(map(lambda x: x[4:], key_val[1:]))
return key_val[0], key_val[1][4:]
def update_sources(xnnpack_path, cmakefile = "XNNPACK/CMakeLists.txt"):
sources = collections.defaultdict(list)
count = 0
with open(os.path.join(xnnpack_path, cmakefile)) as cmake:
lines = cmake.readlines()
i = 0
@ -106,7 +163,7 @@ def update_sources(xnnpack_path, cmakefile = "XNNPACK/CMakeLists.txt"):
if lines[i].startswith("SET") and "src/" in lines[i]:
name, val = handle_singleline_parse(line)
sources[name].extend(val)
sources[name].append(val)
i+=1
continue

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large