Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 21:14:14 +08:00
Revert "[8/n] Update XNNPACK Version Part 8 Everything Remaining to get it to work (#115587)"
This reverts commit a8dc9d8e353ddcf7db0247349a3acd0dd37fcc6f. Reverted https://github.com/pytorch/pytorch/pull/115587 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/115587#issuecomment-1852835898))
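What the revert changes, in substance: PR #115587 had migrated these call sites from XNNPACK's single-stage `xnn_setup_*` API, which binds shapes, data pointers, and the threadpool in one call, to the newer two-stage API in which an `xnn_reshape_*` call fixes the shapes (and, for convolutions, reports workspace requirements) and a slimmer `xnn_setup_*` call then binds only the data pointers before `xnn_run_operator`. The revert restores the single-stage convention throughout and rolls the XNNPACK submodule and wrapper-generation scripts back to match. Below is a minimal sketch of the two conventions for the quantized add operator, assembled from the signatures visible in the hunks that follow; `OLD_XNNPACK_API` is a hypothetical switch for illustration only, since each branch compiles only against its matching XNNPACK revision.

```cpp
#include <xnnpack.h>
#include <pthreadpool.h>
#include <cstdint>
#include <vector>

// Sketch of the two XNNPACK calling conventions touched by this revert,
// shown for the qs8 add operator. Assumes an already created operator.
enum xnn_status add_qs8(xnn_operator_t op,
                        const std::vector<size_t>& a_shape,
                        const std::vector<size_t>& b_shape,
                        const int8_t* da, const int8_t* db, int8_t* dc,
                        pthreadpool_t pool) {
  enum xnn_status status = xnn_status_success;
#if defined(OLD_XNNPACK_API)  // hypothetical switch, for illustration only
  // Older API (what this revert restores): one setup call receives shapes,
  // data pointers, and the threadpool together.
  status = xnn_setup_add_nd_qs8(
      op,
      a_shape.size(), a_shape.data(),
      b_shape.size(), b_shape.data(),
      da, db, dc,
      pool);
#else
  // Newer API (what the reverted PR had adopted): shapes go to a reshape
  // call; setup then binds only the data pointers.
  status = xnn_reshape_add_nd_qs8(
      op,
      a_shape.size(), a_shape.data(),
      b_shape.size(), b_shape.data(),
      pool);
  if (status != xnn_status_success) return status;
  status = xnn_setup_add_nd_qs8(op, da, db, dc);
#endif
  if (status != xnn_status_success) return status;
  return xnn_run_operator(op, pool);
}
```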
@@ -259,32 +259,24 @@ enum xnn_status xnnp_create_add_nd(
 }
 
 C10_ALWAYS_INLINE
-enum xnn_status xnnp_reshape_add_nd(
+enum xnn_status xnnp_setup_add_nd(
     xnn_operator_t op,
     const std::vector<size_t>& a_shape,
     const std::vector<size_t>& b_shape,
-    pthreadpool_t pt_pool) {
-  return xnn_reshape_add_nd_qs8(
-      op, /* xnn_operator_t add_op */
-      a_shape.size(), /* size_t num_input1_dims */
-      a_shape.data(), /* const size_t* input1_shape */
-      b_shape.size(), /* size_t num_input2_dims */
-      b_shape.data(), /* const size_t* input2_shape */
-      pt_pool); /* pthreadpool_t threadpool */
-}
-
-C10_ALWAYS_INLINE
-enum xnn_status xnnp_setup_add_nd(
-    xnn_operator_t op,
     const int8_t* da,
     const int8_t* db,
     int8_t* dc,
     pthreadpool_t pt_pool) {
   return xnn_setup_add_nd_qs8(
       op, /* xnn_operator_t add_op */
+      a_shape.size(), /* size_t num_input1_dims */
+      a_shape.data(), /* const size_t* input1_shape */
+      b_shape.size(), /* size_t num_input2_dims */
+      b_shape.data(), /* const size_t* input2_shape */
       da, /* const int8_t* input1 */
       db, /* const int8_t* input2 */
-      dc); /* int8_t* output */
+      dc, /* int8_t* output */
+      pt_pool); /* pthreadpool_t threadpool */
 }
 
 template <typename scalar_t, bool ReLUFused = false>
@@ -356,20 +348,11 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
   const auto qa_shape = xnnp_utils::get_mem_format_aware_shape(qa_contig);
   const auto qb_shape = xnnp_utils::get_mem_format_aware_shape(qb_contig);
 
-  // Reshape the operator
-  status = xnnp_reshape_add_nd(
-      xnnp_add_operator.get(),
-      qa_shape,
-      qb_shape,
-      caffe2::pthreadpool_());
-
-  TORCH_CHECK(
-      status == xnn_status_success,
-      func_name, ": xnn reshape operator failed(", status,")!");
-
   // Setup the operator
   status = xnnp_setup_add_nd(
       xnnp_add_operator.get(),
+      qa_shape,
+      qb_shape,
       reinterpret_cast<const underlying_t*>(qa_contig.data_ptr<scalar_t>()),
       reinterpret_cast<const underlying_t*>(qb_contig.data_ptr<scalar_t>()),
       reinterpret_cast<underlying_t*>(qy.data_ptr<scalar_t>()),
@@ -100,7 +100,6 @@ enum xnn_status xnnp_create_convolution2d_nhwc(
         op_max, /* int8_t output_max */
         flags, /* uint32_t flags */
         nullptr, /* xnn_caches_t caches */
-        nullptr, /* xnn_weights_cache_t weights_cache */
         op); /* xnn_operator_t* deconvolution_op_out */
 
 }
@@ -133,10 +132,9 @@ enum xnn_status xnnp_create_convolution2d_nhwc(
         op_max, /* int8_t output_max */
         flags, /* uint32_t flags */
         nullptr, /* xnn_caches_t caches */
-        nullptr, /* xnn_weights_cache_t weights_cache */
         op); /* xnn_operator_t* convolution_op_out */
   } else { /* per_channel */
-    return xnn_create_convolution2d_nhwc_qs8_qc8w(
+    return xnn_create_convolution2d_nhwc_qc8(
         pad_top, /* uint32_t input_padding_top */
         pad_right, /* uint32_t input_padding_right */
         pad_bottom, /* uint32_t input_padding_bottom */
@@ -163,99 +161,58 @@ enum xnn_status xnnp_create_convolution2d_nhwc(
         op_max, /* int8_t output_max */
         flags, /* uint32_t flags */
         nullptr, /* xnn_caches_t caches */
-        nullptr, /* xnn_weights_cache_t weights_cache */
         op); /* xnn_operator_t* convolution_op_out */
   }
 }
 
-/*
- * Series of reshape wrapper functions to call xnn_reshape_[de]conv* functions.
- */
-C10_ALWAYS_INLINE
-enum xnn_status xnnp_reshape_convolution2d_nhwc(
-    xnn_operator_t op,
-    size_t batch,
-    size_t in_h,
-    size_t in_w,
-    pthreadpool_t pt_pool,
-    bool per_channel = false,
-    bool transpose = false,
-    uint32_t adj_h = 0,
-    uint32_t adj_w = 0) {
-  if(transpose) {
-    TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!");
-    return xnn_reshape_deconvolution2d_nhwc_qs8(
-        op, /* xnn_operator_t deconvolution_op */
-        batch, /* size_t batch_size */
-        in_h, /* size_t input_height */
-        in_w, /* size_t input_width */
-        adj_h, /* uint32_t adjustment_height */
-        adj_w, /* uint32_t adjustment_width */
-        nullptr, /* size_t* output_height_out */
-        nullptr, /* size_t* output_width_out */
-        pt_pool); /* pthreadpool_t threadpool */
-  }
-
-  size_t workspace_size = SIZE_MAX;
-  size_t workspace_alignment = SIZE_MAX;
-
-  if (!per_channel) {
-    return xnn_reshape_convolution2d_nhwc_qs8(
-        op, /* xnn_operator_t convolution_op */
-        batch, /* size_t batch_size */
-        in_h, /* size_t input_height */
-        in_w, /* size_t input_width */
-        &workspace_size, /* size_t* workspace_size */
-        &workspace_alignment, /* size_t* workspace_alignment */
-        nullptr, /* size_t* output_height_out */
-        nullptr, /* size_t* output_width_out */
-        pt_pool); /* pthreadpool_t threadpool */
-  } else { /* per_channel */
-    return xnn_reshape_convolution2d_nhwc_qs8_qc8w(
-        op, /* xnn_operator_t convolution_op */
-        batch, /* size_t batch_size */
-        in_h, /* size_t input_height */
-        in_w, /* size_t input_width */
-        &workspace_size, /* size_t* workspace_size */
-        &workspace_alignment, /* size_t* workspace_alignment */
-        nullptr, /* size_t* output_height_out */
-        nullptr, /* size_t* output_width_out */
-        pt_pool); /* pthreadpool_t threadpool */
-  }
-}
-
 
 /*
  * Series of setup wrapper functions to call xnn_setup_[de]conv* functions.
 */
 C10_ALWAYS_INLINE
 enum xnn_status xnnp_setup_convolution2d_nhwc(
     xnn_operator_t op,
+    size_t batch,
+    size_t in_h,
+    size_t in_w,
     const int8_t* inp,
     int8_t* outp,
+    pthreadpool_t pt_pool,
     bool per_channel = false,
-    bool transpose = false) {
+    bool transpose = false,
+    uint32_t adj_h = 0,
+    uint32_t adj_w = 0) {
   if(transpose) {
     TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!");
 
     return xnn_setup_deconvolution2d_nhwc_qs8(
         op, /* xnn_operator_t deconvolution_op */
+        batch, /* size_t batch_size */
+        in_h, /* size_t input_height */
+        in_w, /* size_t input_width */
+        adj_h, /* uint32_t adjustment_height */
+        adj_w, /* uint32_t adjustment_width */
         inp, /* const int8_t* input */
-        outp); /* int8_t* output */
+        outp, /* int8_t* output */
+        pt_pool); /* pthreadpool_t threadpool */
   }
 
   if (!per_channel) {
     return xnn_setup_convolution2d_nhwc_qs8(
-        op, /* xnn_operator_t deconvolution_op */
-        nullptr, /* void workspace */
-        inp, /* const int8_t* input */
-        outp); /* int8_t* output */
+        op, /* xnn_operator_t convolution_op */
+        batch, /* size_t batch_size */
+        in_h, /* size_t input_height */
+        in_w, /* size_t input_width */
+        inp, /* const int8_t* input */
+        outp, /* int8_t* output */
+        pt_pool); /* pthreadpool_t threadpool */
   } else { /* per_channel */
-    return xnn_setup_convolution2d_nhwc_qs8_qc8w(
-        op, /* xnn_operator_t deconvolution_op */
-        nullptr, /* void workspace */
-        inp, /* const int8_t* input */
-        outp); /* int8_t* output */
+    return xnn_setup_convolution2d_nhwc_qc8(
+        op, /* xnn_operator_t convolution_op */
+        batch, /* size_t batch_size */
+        in_h, /* size_t input_height */
+        in_w, /* size_t input_width */
+        inp, /* const int8_t* input */
+        outp, /* int8_t* output */
+        pt_pool); /* pthreadpool_t threadpool */
   }
 }
 
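For convolutions the reverted two-stage API also introduced an explicit workspace contract: the reshape call reports `workspace_size` and `workspace_alignment`, and setup then takes a workspace pointer, which the wrapper above passed as `nullptr`. A sketch of that flow under the reverted API, using only the calls visible in the hunk above; error handling shortened, and the null workspace mirrors the wrapper's own choice:

```cpp
#include <xnnpack.h>
#include <pthreadpool.h>
#include <cstdint>

// Sketch of the newer (reverted-away) qs8 convolution flow. Assumes an
// already created convolution operator.
enum xnn_status conv_qs8_new_api(xnn_operator_t op,
                                 size_t batch, size_t in_h, size_t in_w,
                                 const int8_t* inp, int8_t* outp,
                                 pthreadpool_t pool) {
  size_t workspace_size = SIZE_MAX;
  size_t workspace_alignment = SIZE_MAX;
  // Reshape fixes the input geometry and reports workspace requirements.
  enum xnn_status status = xnn_reshape_convolution2d_nhwc_qs8(
      op, batch, in_h, in_w,
      &workspace_size, &workspace_alignment,
      /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
      pool);
  if (status != xnn_status_success) return status;
  // Setup binds the workspace (null here, as in the wrapper above) plus the
  // data pointers; a caller could instead allocate workspace_size bytes at
  // workspace_alignment and pass that in.
  status = xnn_setup_convolution2d_nhwc_qs8(op, /*workspace=*/nullptr, inp, outp);
  if (status != xnn_status_success) return status;
  return xnn_run_operator(op, pool);
}
```

The global-average-pool hunk further down in this diff shows the other option: allocating and aligning the workspace by hand before setup.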
@@ -301,31 +258,22 @@ enum xnn_status xnnp_create_fully_connected_nc(
       output_max, /* int8_t output_max */
       flags, /* uint32_t flags */
       nullptr, /* xnn_caches_t caches */
-      nullptr, /* xnn_weights_cache_t */
       fully_connected_op_out); /* xnn_operator_t* fully_connected_op_out */
 }
 
-C10_ALWAYS_INLINE
-enum xnn_status xnnp_reshape_fully_connected_nc(
-    xnn_operator_t fully_connected_op,
-    size_t batch_size,
-    pthreadpool_t threadpool) {
-  return xnn_reshape_fully_connected_nc_qs8(
-      fully_connected_op, /* xnn_operator_t fully_connected_op */
-      batch_size, /* size_t batch_size */
-      threadpool); /* pthreadpool_t threadpool */
-}
-
 C10_ALWAYS_INLINE
 enum xnn_status xnnp_setup_fully_connected_nc(
     xnn_operator_t fully_connected_op,
+    size_t batch_size,
     const int8_t* input,
-    int8_t* output) {
+    int8_t* output,
+    pthreadpool_t threadpool) {
   return xnn_setup_fully_connected_nc_qs8(
       fully_connected_op, /* xnn_operator_t fully_connected_op */
+      batch_size, /* size_t batch_size */
       input, /* const int8_t* input */
-      output /* int8_t* output */
-      );
+      output, /* int8_t* output */
+      threadpool); /* pthreadpool_t threadpool */
 }
 
 } // namespace xnnp_utils
@@ -770,12 +770,14 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl_xnnp(
       output_zero_point,
       c10::MemoryFormat::ChannelsLast);
 
-  // Reshape the operator
-  status = at::native::xnnp_utils::xnnp_reshape_convolution2d_nhwc(
+  // Setup the operator
+  status = at::native::xnnp_utils::xnnp_setup_convolution2d_nhwc(
       xnnp_convolution_op.get(),
       N,
       H,
       W,
+      reinterpret_cast<const underlying_t*>(act_nhwc.template data_ptr<scalar_t>()),
+      reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>()),
      caffe2::pthreadpool_(),
       per_channel(),
       transpose(),
@@ -789,21 +791,6 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl_xnnp(
       status,
       ")");
 
-  // Setup the operator
-  status = at::native::xnnp_utils::xnnp_setup_convolution2d_nhwc(
-      xnnp_convolution_op.get(),
-      reinterpret_cast<const underlying_t*>(act_nhwc.template data_ptr<scalar_t>()),
-      reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>()),
-      per_channel(),
-      transpose());
-
-  TORCH_CHECK(
-      status == xnn_status_success,
-      func_name,
-      ": xnn setup operator failed(",
-      status,
-      ")");
-
   // Run the operator
   status = xnn_run_operator(
       xnnp_convolution_op.get(), /* xnn_operator_t op */
@@ -565,19 +565,14 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl_xnnp(
     rows_input *= input_contig.size(i);
   }
 
-  // Reshape the operator
-  status = at::native::xnnp_utils::xnnp_reshape_fully_connected_nc(
-      xnnp_linear_op.get(),
-      rows_input, /* batch_size */
-      caffe2::pthreadpool_());
-
   // Setup the operator
   status = at::native::xnnp_utils::xnnp_setup_fully_connected_nc(
       xnnp_linear_op.get(),
+      rows_input, /* batch_size */
       reinterpret_cast<const underlying_t*>(
           input_contig.template data_ptr<scalar_t>()),
-      reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>())
-      );
+      reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>()),
+      caffe2::pthreadpool_());
 
   TORCH_CHECK(
       status == xnn_status_success,
@@ -139,29 +139,17 @@ Tensor _mul_out_xnnpack(
   const auto self_shape = xnnp_utils::get_mem_format_aware_shape(self_contig);
   const auto other_shape = xnnp_utils::get_mem_format_aware_shape(other_contig);
 
-  // reshape operator
-  status = xnn_reshape_multiply_nd_qs8(
+  // set up operator
+  status = xnn_setup_multiply_nd_qs8(
       xnnp_qmul_operator.get(),
       self_shape.size(),
       self_shape.data(),
       other_shape.size(),
       other_shape.data(),
-      caffe2::pthreadpool_());
-
-  TORCH_CHECK(
-      status == xnn_status_success,
-      func_name,
-      ": xnn reshape operator failed(",
-      status,
-      ")!");
-
-  // set up operator
-  status = xnn_setup_multiply_nd_qs8(
-      xnnp_qmul_operator.get(),
       reinterpret_cast<const underlying_t*>(self_contig.data_ptr<scalar_t>()),
       reinterpret_cast<const underlying_t*>(other_contig.data_ptr<scalar_t>()),
-      reinterpret_cast<underlying_t*>(out.data_ptr<scalar_t>())
-      );
+      reinterpret_cast<underlying_t*>(out.data_ptr<scalar_t>()),
+      caffe2::pthreadpool_());
 
   TORCH_CHECK(
       status == xnn_status_success,
@@ -34,19 +34,12 @@ static Tensor& hardswish_impl(Tensor& input, Tensor& output) {
 
   Operator hardswish_scoped_op(hardswish_op);
 
-  const xnn_status reshape_status = xnn_reshape_hardswish_nc_f32(
-      hardswish_op,
-      input.numel(), // Batch
-      caffe2::pthreadpool_()); // threadpool
-
-  TORCH_CHECK(
-      xnn_status_success == reshape_status,
-      "xnn_reshape_hardswish_nc_f32 failed!");
-
   const xnn_status setup_status = xnn_setup_hardswish_nc_f32(
       hardswish_op,
+      input.numel(), // Batch
       input.data_ptr<float>(),
-      output.data_ptr<float>());
+      output.data_ptr<float>(),
+      caffe2::pthreadpool_()); // threadpool
 
   TORCH_CHECK(
       xnn_status_success == setup_status,
@@ -7,13 +7,18 @@
 
 namespace at::native::xnnpack {
 
-bool use_global_average_pool(const Tensor& input) {
-  return xnnpack::available() && (1 <= input.ndimension()) &&
-      (input.device().is_cpu()) && (kFloat == input.scalar_type()) &&
-      !input.requires_grad() && true;
+bool use_global_average_pool(
+    const Tensor& input) {
+  return xnnpack::available() &&
+      (1 <= input.ndimension()) &&
+      (input.device().is_cpu()) &&
+      (kFloat == input.scalar_type()) &&
+      !input.requires_grad() &&
+      true;
 }
 
-Tensor global_average_pool(const Tensor& input) {
+Tensor global_average_pool(
+    const Tensor& input) {
   using namespace internal;
 
   const Tensor input_padded_contig_nhwc =
|
|||||||
|
|
||||||
Tensor output = mobile::empty_with_tail_padding(
|
Tensor output = mobile::empty_with_tail_padding(
|
||||||
{
|
{
|
||||||
input_padded_contig_nhwc.size(Layout::Activation4D::batch),
|
input_padded_contig_nhwc.size(Layout::Activation4D::batch),
|
||||||
input_padded_contig_nhwc.size(Layout::Activation4D::channels),
|
input_padded_contig_nhwc.size(Layout::Activation4D::channels),
|
||||||
1,
|
1,
|
||||||
1,
|
1,
|
||||||
},
|
},
|
||||||
input_padded_contig_nhwc.options().dtype(),
|
input_padded_contig_nhwc.options().dtype(),
|
||||||
MemoryFormat::ChannelsLast,
|
MemoryFormat::ChannelsLast,
|
||||||
@ -33,61 +38,42 @@ Tensor global_average_pool(const Tensor& input) {
|
|||||||
|
|
||||||
xnn_operator_t global_average_pooling_op{};
|
xnn_operator_t global_average_pooling_op{};
|
||||||
const xnn_status create_status = xnn_create_global_average_pooling_nwc_f32(
|
const xnn_status create_status = xnn_create_global_average_pooling_nwc_f32(
|
||||||
input_padded_contig_nhwc.size(Layout::Activation4D::channels), // channels
|
input_padded_contig_nhwc.size(Layout::Activation4D::channels), // channels
|
||||||
input_padded_contig_nhwc.size(
|
input_padded_contig_nhwc.size(
|
||||||
Layout::Activation4D::channels), // input stride
|
Layout::Activation4D::channels), // input stride
|
||||||
input_padded_contig_nhwc.size(
|
input_padded_contig_nhwc.size(
|
||||||
Layout::Activation4D::channels), // output stride
|
Layout::Activation4D::channels), // output stride
|
||||||
-std::numeric_limits<float>::infinity(),
|
-std::numeric_limits<float>::infinity(),
|
||||||
std::numeric_limits<float>::infinity(),
|
std::numeric_limits<float>::infinity(),
|
||||||
0 /* flags */,
|
0 /* flags */,
|
||||||
&global_average_pooling_op);
|
&global_average_pooling_op);
|
||||||
|
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
xnn_status_success == create_status,
|
xnn_status_success == create_status,
|
||||||
"xnn_create_global_average_pooling_nwc_f32 failed!");
|
"xnn_create_global_average_pooling_nwc_f32 failed!");
|
||||||
|
|
||||||
Operator global_avg_pool_scoped_op(global_average_pooling_op);
|
Operator global_avg_pool_scoped_op(global_average_pooling_op);
|
||||||
|
|
||||||
size_t workspace_size = 0;
|
const xnn_status setup_status = xnn_setup_global_average_pooling_nwc_f32(
|
||||||
size_t workspace_alignment = 0;
|
|
||||||
|
|
||||||
const xnn_status reshape_status = xnn_reshape_global_average_pooling_nwc_f32(
|
|
||||||
global_average_pooling_op,
|
global_average_pooling_op,
|
||||||
input_padded_contig_nhwc.size(Layout::Activation4D::batch), // batch_size
|
input_padded_contig_nhwc.size(Layout::Activation4D::batch), // batch_size
|
||||||
input_padded_contig_nhwc.size(Layout::Activation4D::width) *
|
input_padded_contig_nhwc.size(Layout::Activation4D::width) *
|
||||||
input_padded_contig_nhwc.size(Layout::Activation4D::height), // width
|
input_padded_contig_nhwc.size(Layout::Activation4D::height), // width
|
||||||
&workspace_size, // workspace_size
|
input_padded_contig_nhwc.data_ptr<float>(),
|
||||||
&workspace_alignment, // workspace_alignment
|
output.data_ptr<float>(),
|
||||||
caffe2::pthreadpool_());
|
caffe2::pthreadpool_());
|
||||||
|
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
xnn_status_success == reshape_status,
|
xnn_status_success == setup_status,
|
||||||
"xnn_reshape_global_average_pooling_nwc_f32 failed!");
|
"xnn_setup_global_average_pooling_nwc_f32 failed!");
|
||||||
|
|
||||||
// Create Workspace pointer, which we will align and pad with 16 bytes
|
const xnn_status run_status = xnn_run_operator(
|
||||||
size_t xnnpack_buffer_padding = 16;
|
global_average_pooling_op,
|
||||||
std::vector<char> workspace_vector(workspace_size + workspace_alignment + xnnpack_buffer_padding);
|
caffe2::pthreadpool_());
|
||||||
void* maybe_aligned_workspace = workspace_vector.data();
|
|
||||||
void* aligned_workspace =
|
|
||||||
(void*)((intptr_t)maybe_aligned_workspace + workspace_alignment - (intptr_t)maybe_aligned_workspace % workspace_alignment);
|
|
||||||
|
|
||||||
const xnn_status setup_status = xnn_setup_global_average_pooling_nwc_f32(
|
|
||||||
global_average_pooling_op,
|
|
||||||
aligned_workspace,
|
|
||||||
input_padded_contig_nhwc.data_ptr<float>(),
|
|
||||||
output.data_ptr<float>());
|
|
||||||
|
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
xnn_status_success == setup_status,
|
xnn_status_success == run_status,
|
||||||
"xnn_setup_global_average_pooling_nwc_f32 failed!");
|
"xnn_setup_global_average_pooling_nwc_f32 failed!");
|
||||||
|
|
||||||
const xnn_status run_status =
|
|
||||||
xnn_run_operator(global_average_pooling_op, caffe2::pthreadpool_());
|
|
||||||
|
|
||||||
TORCH_CHECK(
|
|
||||||
xnn_status_success == run_status,
|
|
||||||
"xnn_setup_global_average_pooling_nwc_f32 failed!");
|
|
||||||
|
|
||||||
return output.to(input.suggest_memory_format());
|
return output.to(input.suggest_memory_format());
|
||||||
}
|
}
|
||||||
|
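The removed block above sized and aligned the setup workspace by hand. The expression `(intptr_t)p + workspace_alignment - (intptr_t)p % workspace_alignment` rounds the pointer up to the next multiple of the alignment; note it advances by a full alignment even when `p` is already aligned, which the extra 16 bytes of `xnnpack_buffer_padding` absorb. As a standalone sketch of the same arithmetic:

```cpp
#include <cstdint>
#include <vector>

// Round a raw pointer up to the next multiple of `alignment`, as the removed
// code above did. Always advances by at least one byte and by up to a full
// `alignment`, so the backing buffer must be over-allocated accordingly.
void* align_up(void* p, size_t alignment) {
  return (void*)((intptr_t)p + alignment - (intptr_t)p % alignment);
}

std::vector<char> make_workspace(size_t workspace_size, size_t workspace_alignment) {
  const size_t xnnpack_buffer_padding = 16;  // slack for the round-up, as above
  return std::vector<char>(workspace_size + workspace_alignment + xnnpack_buffer_padding);
}
```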
@@ -79,19 +79,12 @@ Tensor channel_shuffle(
       input_padded_contig_nhwc.size(Layout::Activation4D::height) *
       input_padded_contig_nhwc.size(Layout::Activation4D::width);
 
-  const xnn_status reshape_status = xnn_reshape_channel_shuffle_nc_x32(
-      channel_shuffle_op, // operator
-      batch_size, // batch_size
-      caffe2::pthreadpool_()); // threadpool
-
-  TORCH_CHECK(
-      xnn_status_success == reshape_status,
-      "xnn_reshape_channel_shuffle_nc_x32 failed!");
-
   const xnn_status setup_status = xnn_setup_channel_shuffle_nc_x32(
       channel_shuffle_op, // operator
+      batch_size, // batch_size
       input_padded_contig_nhwc.data_ptr<float>(), // input
-      output_padded_contig_nhwc.data_ptr<float>()); // output
+      output_padded_contig_nhwc.data_ptr<float>(), // output
+      caffe2::pthreadpool_()); // threadpool
 
   TORCH_CHECK(
       xnn_status_success == setup_status,
@@ -236,7 +236,6 @@ ContextConv2D create(
         output_max, // output_max
         0u, // flags
         nullptr, // xnn_caches_t
-        nullptr, // xnn_weights_cache_t
         &convolution_op); // operator
   } else {
     for (const auto i : c10::irange(4)) {
|
|||||||
output_max, // output_max
|
output_max, // output_max
|
||||||
0u, // flags
|
0u, // flags
|
||||||
nullptr, // xnn_caches_t
|
nullptr, // xnn_caches_t
|
||||||
nullptr, // xnn_weights_cache_t
|
|
||||||
&convolution_op); // operator
|
&convolution_op); // operator
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -340,41 +338,26 @@ Tensor run(
   */
 
   if (context.transposed_) {
-    setup_status = xnn_reshape_deconvolution2d_nhwc_f32(
-      context.op.get(),
+    setup_status = xnn_setup_deconvolution2d_nhwc_f32(
+      context.op.get(), // operator
       padded_input_nhwc.size(Layout::Activation4D::batch), // batch_size
       padded_input_nhwc.size(Layout::Activation4D::height), // input_height
       padded_input_nhwc.size(Layout::Activation4D::width), // input_width
       context.output_padding_[0], // adjustment_height
       context.output_padding_[1], // adjustment_width
-      nullptr, // output_height_out
-      nullptr, // output_width_out
+      padded_input_nhwc.data_ptr<float>(), // input
+      output.data_ptr<float>(), // output
       caffe2::pthreadpool_()); // threadpool
-
-    setup_status = xnn_setup_deconvolution2d_nhwc_f32(
-      context.op.get(), // operator
-      padded_input_nhwc.data_ptr<float>(), // input
-      output.data_ptr<float>()); // output
   } else {
-    size_t workspace_size = SIZE_MAX;
-    size_t workspace_alignment = SIZE_MAX;
-
-    setup_status = xnn_reshape_convolution2d_nhwc_f32(
-      context.op.get(),
+    setup_status = xnn_setup_convolution2d_nhwc_f32(
+      context.op.get(), // operator
       padded_input_nhwc.size(Layout::Activation4D::batch), // batch_size
       padded_input_nhwc.size(Layout::Activation4D::height), // input_height
       padded_input_nhwc.size(Layout::Activation4D::width), // input_width
-      &workspace_size, // workspace_size
-      &workspace_alignment, // workspace_alignment
-      nullptr, // output_height_out
-      nullptr, // output_width_out
-      caffe2::pthreadpool_());
-
-    setup_status = xnn_setup_convolution2d_nhwc_f32(
-      context.op.get(), // operator
-      nullptr, // workspace
       padded_input_nhwc.data_ptr<float>(), // input
-      output.data_ptr<float>()); // output
+      output.data_ptr<float>(), // output
+      caffe2::pthreadpool_());
   }
 
   TORCH_CHECK(
@@ -95,7 +95,6 @@ ContextLinear create(
       output_max, // output_max
       0u, // flags
       nullptr, // xnn_caches_t
-      nullptr, // xnn_weights_cache_t
       &linear_op); // operator
 
   TORCH_CHECK(
@@ -137,19 +136,12 @@ Tensor run(
       padded_input.suggest_memory_format(),
       padded_input.opt_names());
 
-  const xnn_status reshape_status = xnn_reshape_fully_connected_nc_f32(
-      context.op.get(), // operator
-      Layout::ActivationND::batch(padded_input.sizes()), // Batch,
-      caffe2::pthreadpool_()); // threadpool
-
-  TORCH_CHECK(
-      xnn_status_success == reshape_status,
-      "xnn_reshape_fully_connected_nc_f32 failed!");
-
   const xnn_status setup_status = xnn_setup_fully_connected_nc_f32(
       context.op.get(), // operator
+      Layout::ActivationND::batch(padded_input.sizes()), // Batch,
       padded_input.data_ptr<float>(), // input
-      output.data_ptr<float>()); // output
+      output.data_ptr<float>(), // output
+      caffe2::pthreadpool_()); // threadpool
 
   TORCH_CHECK(
       xnn_status_success == setup_status,
@@ -214,23 +214,14 @@ Tensor max_pool2d(
       xnn_status_success == create_status,
       "xnn_create_max_pooling2d_nhwc_f32 failed!");
 
-  const xnn_status reshape_status = xnn_reshape_max_pooling2d_nhwc_f32(
+  const xnn_status setup_status = xnn_setup_max_pooling2d_nhwc_f32(
       max_pool_op, // operator
       input_padded_contig_nhwc.size(Layout::Activation4D::batch), // batch_size
       input_padded_contig_nhwc.size(Layout::Activation4D::height), // input_height
       input_padded_contig_nhwc.size(Layout::Activation4D::width), // input_width
-      nullptr, // output_height_out
-      nullptr, // output_width_out
-      caffe2::pthreadpool_()); // threadpool
-
-  TORCH_CHECK(
-      xnn_status_success == reshape_status,
-      "xnn_reshape_max_pooling2d_nhwc_f32 failed!");
-
-  const xnn_status setup_status = xnn_setup_max_pooling2d_nhwc_f32(
-      max_pool_op, // operator
       input_padded_contig_nhwc.data_ptr<float>(), // input
-      output_padded_contig_nhwc.data_ptr<float>()); // output
+      output_padded_contig_nhwc.data_ptr<float>(), // output
+      caffe2::pthreadpool_()); // threadpool
 
   TORCH_CHECK(
       xnn_status_success == setup_status,
@@ -619,13 +619,7 @@ if(USE_XNNPACK AND NOT USE_SYSTEM_XNNPACK)
   # Disable ARM BF16 and FP16 vector for now; unused and causes build failures because
   # these new ISA features may not be supported on older compilers
   set(XNNPACK_ENABLE_ARM_BF16 OFF CACHE BOOL "")
-  # Disable AVXVNNI for now, older clang versions seem not to support it
-  # (clang 12 is where avx-vnni support is added)
-  set(XNNPACK_ENABLE_AVXVNNI OFF CACHE BOOL "")
-
-  # Disable I8MM For CI since clang 9 does not support neon i8mm.
-  set(XNNPACK_ENABLE_ARM_I8MM OFF CACHE BOOL "")
+  set(XNNPACK_ENABLE_ARM_FP16_VECTOR OFF CACHE BOOL "")
 
   # Setting this global PIC flag for all XNNPACK targets.
   # This is needed for Object libraries within XNNPACK which must
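Build-flag note: the revert drops the guards that the reverted PR had added for the AVXVNNI and ARM I8MM microkernel families (added, per the removed comments, because avx-vnni support arrives in clang 12 and CI's clang 9 lacks neon i8mm) and restores the older `XNNPACK_ENABLE_ARM_FP16_VECTOR` toggle, matching the rolled-back XNNPACK submodule pin shown below.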
third_party/BUCK.oss (vendored, 1 line changed):
@@ -127,7 +127,6 @@ cxx_library(
         "cpuinfo/wrappers/linux/multiline.c",
         "cpuinfo/wrappers/linux/processors.c",
         "cpuinfo/wrappers/linux/smallfile.c",
-        "cpuinfo/wrappers/log.c",
         "cpuinfo/wrappers/mach/topology.c",
         "cpuinfo/wrappers/x86/cache/descriptor.c",
         "cpuinfo/wrappers/x86/cache/deterministic.c",
Submodule third_party/XNNPACK (vendored) updated: d9cce341f8...51a987591a
Submodule third_party/cpuinfo (vendored) updated: d6860c477c...6481e8bef0
third_party/generate-cpuinfo-wrappers.py (vendored, 1 line changed):
@@ -9,7 +9,6 @@ CPUINFO_SOURCES = {
         "init.c",
         "api.c",
         "cache.c",
-        "log.c",
     ],
     "defined(__linux__)": [
         "linux/multiline.c",
third_party/generate-xnnpack-wrappers.py (vendored, 135 lines changed):
@@ -8,22 +8,16 @@ import logging
 
 BANNER = "Auto-generated by generate-wrappers.py script. Do not modify"
 WRAPPER_SRC_NAMES = {
-    "PROD_SCALAR_MICROKERNEL_SRCS": None,
-    "PROD_FMA_MICROKERNEL_SRCS": "defined(__riscv) || defined(__riscv__)",
-    "PROD_ARMSIMD32_MICROKERNEL_SRCS": "defined(__arm__)",
-    "PROD_FP16ARITH_MICROKERNEL_SRCS": "defined(__arm__)",
+    "PROD_SCALAR_PORTABLE_MICROKERNEL_SRCS": None,
+    "PROD_SCALAR_AARCH32_MICROKERNEL_SRCS" : "defined(__arm__)",
     "PROD_NEON_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
     "PROD_NEONFP16_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
+    "PROD_NEON_AARCH64_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
     "PROD_NEONFMA_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
-    "PROD_NEON_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)",
+    "PROD_AARCH64_NEON_MICROKERNEL_SRCS": "defined(__aarch64__)",
     "PROD_NEONV8_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
-    "PROD_NEONFP16ARITH_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
-    "PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)",
+    "PROD_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS": "defined(__aarch64__)",
     "PROD_NEONDOT_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
-    "PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)",
-    "PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
-    "PROD_NEONDOTFP16ARITH_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)",
-    "PROD_NEONI8MM_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
     "PROD_SSE_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "PROD_SSE2_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "PROD_SSSE3_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
@@ -36,13 +30,42 @@ WRAPPER_SRC_NAMES = {
     "PROD_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "PROD_AVX512SKX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "PROD_AVX512VBMI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
-    "PROD_AVX512VNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
-    "PROD_RVV_MICROKERNEL_SRCS": "defined(__riscv) || defined(__riscv__)",
-    "PROD_AVXVNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "AARCH32_ASM_MICROKERNEL_SRCS": "defined(__arm__)",
     "AARCH64_ASM_MICROKERNEL_SRCS": "defined(__aarch64__)",
 
-    # add non-prod microkernel sources here:
+    # add additoonal:
+    "PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
+    "ALL_ARMSIMD32_MICROKERNEL_SRCS": "defined(__arm__)",
+    "ALL_AVX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    "ALL_AVX2_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    "ALL_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+
+    'ALL_AVX512SKX_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_AVX512VBMI_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_F16C_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_FMA3_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_FP16ARITH_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
+    'ALL_NEON_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
+    'ALL_NEON_AARCH64_MICROKERNEL_SRCS': "defined(__aarch64__)",
+    'ALL_NEONBF16_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
+    'ALL_NEONDOT_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
+    'ALL_NEONFMA_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
+    'ALL_NEONFMA_AARCH64_MICROKERNEL_SRCS': "defined(__aarch64__)",
+    'ALL_NEONFP16_MICROKERNEL_SRCS':"defined(__arm__) || defined(__aarch64__)",
+    'ALL_NEONFP16ARITH_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
+    'ALL_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS': "defined(__aarch64__)",
+    'ALL_NEONV8_MICROKERNEL_SRCS': "defined(__aarch64__)",
+    'ALL_SCALAR_MICROKERNEL_SRCS': "defined(__arm__)",
+    'ALL_SSE_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_SSE2_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_SSE41_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_SSSE3_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_XOP_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'AARCH32_ASM_MICROKERNEL_SRCS': "defined(__arm__)",
+    "PROD_FP16ARITH_MICROKERNEL_SRCS": "defined(__aarch64__)",
+    "PROD_NEONFP16ARITH_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
+    "PROD_SCALAR_MICROKERNEL_SRCS": "defined(__arm__)",
+
 }
 
 SRC_NAMES = set([
@@ -50,24 +73,12 @@ SRC_NAMES = set([
     "SUBGRAPH_SRCS",
     "LOGGING_SRCS",
     "XNNPACK_SRCS",
+    "HOT_SRCS",
     "TABLE_SRCS",
     "JIT_SRCS",
-    "PROD_SCALAR_MICROKERNEL_SRCS",
-    "PROD_FMA_MICROKERNEL_SRCS",
-    "PROD_ARMSIMD32_MICROKERNEL_SRCS",
-    "PROD_FP16ARITH_MICROKERNEL_SRCS",
-    "PROD_NEON_MICROKERNEL_SRCS",
-    "PROD_NEONFP16_MICROKERNEL_SRCS",
-    "PROD_NEONFMA_MICROKERNEL_SRCS",
-    "PROD_NEON_AARCH64_MICROKERNEL_SRCS",
-    "PROD_NEONV8_MICROKERNEL_SRCS",
-    "PROD_NEONFP16ARITH_MICROKERNEL_SRCS",
-    "PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS",
-    "PROD_NEONDOT_MICROKERNEL_SRCS",
-    "PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS",
-    "PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS",
-    "PROD_NEONDOTFP16ARITH_AARCH64_MICROKERNEL_SRCS",
-    "PROD_NEONI8MM_MICROKERNEL_SRCS",
+    "JIT_AARCH32_SRCS",
+    "JIT_AARCH64_SRCS",
+    "PROD_SCALAR_PORTABLE_MICROKERNEL_SRCS",
     "PROD_SSE_MICROKERNEL_SRCS",
     "PROD_SSE2_MICROKERNEL_SRCS",
     "PROD_SSSE3_MICROKERNEL_SRCS",
@@ -79,14 +90,59 @@ SRC_NAMES = set([
     "PROD_AVX2_MICROKERNEL_SRCS",
     "PROD_AVX512F_MICROKERNEL_SRCS",
     "PROD_AVX512SKX_MICROKERNEL_SRCS",
+    "PROD_SCALAR_MICROKERNEL_SRCS",
+    "PROD_SCALAR_AARCH32_MICROKERNEL_SRCS",
+    "PROD_SCALAR_RISCV_MICROKERNEL_SRCS",
+    "PROD_ARMSIMD32_MICROKERNEL_SRCS",
+    "PROD_FP16ARITH_MICROKERNEL_SRCS",
+    "PROD_NEON_MICROKERNEL_SRCS",
+    "PROD_NEONFP16_MICROKERNEL_SRCS",
+    "PROD_NEONFMA_MICROKERNEL_SRCS",
+    "PROD_NEON_AARCH64_MICROKERNEL_SRCS",
+    "PROD_NEONV8_MICROKERNEL_SRCS",
+    "PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS",
+    "PROD_NEONDOT_MICROKERNEL_SRCS",
+    "PROD_SSE2_MICROKERNEL_SRCS",
+    "PROD_SSSE3_MICROKERNEL_SRCS",
+    "PROD_SSE41_MICROKERNEL_SRCS",
+    "PROD_AVX_MICROKERNEL_SRCS",
+    "PROD_F16C_MICROKERNEL_SRCS",
     "PROD_AVX512VBMI_MICROKERNEL_SRCS",
-    "PROD_AVX512VNNI_MICROKERNEL_SRCS",
-    "PROD_RVV_MICROKERNEL_SRCS",
-    "PROD_AVXVNNI_MICROKERNEL_SRCS",
-    "AARCH32_ASM_MICROKERNEL_SRCS",
-    "AARCH64_ASM_MICROKERNEL_SRCS",
+    "PROD_NEONFP16ARITH_MICROKERNEL_SRCS",
 
-    # add non-prod microkernel sources here:
+    # new adding libs:
+    'ALL_ARMSIMD32_MICROKERNEL_SRCS',
+    'ALL_AVX_MICROKERNEL_SRCS',
+    'ALL_AVX2_MICROKERNEL_SRCS',
+    'ALL_AVX512F_MICROKERNEL_SRCS',
+    'ALL_AVX512SKX_MICROKERNEL_SRCS',
+    'ALL_AVX512VBMI_MICROKERNEL_SRCS',
+    'ALL_F16C_MICROKERNEL_SRCS',
+    'ALL_FMA3_MICROKERNEL_SRCS',
+    'ALL_FP16ARITH_MICROKERNEL_SRCS',
+    'ALL_HEXAGON_MICROKERNEL_SRCS',
+    'ALL_NEON_MICROKERNEL_SRCS',
+    'ALL_NEON_AARCH64_MICROKERNEL_SRCS',
+    'ALL_NEONBF16_MICROKERNEL_SRCS',
+    'ALL_NEONBF16_AARCH64_MICROKERNEL_SRCS',
+    'ALL_NEONDOT_MICROKERNEL_SRCS',
+    'ALL_NEONFMA_MICROKERNEL_SRCS',
+    'ALL_NEONFMA_AARCH64_MICROKERNEL_SRCS',
+    'ALL_NEONFP16_MICROKERNEL_SRCS',
+    'ALL_NEONFP16ARITH_MICROKERNEL_SRCS',
+    'ALL_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS',
+    'ALL_NEONV8_MICROKERNEL_SRCS',
+    'ALL_SCALAR_MICROKERNEL_SRCS',
+    'ALL_SSE_MICROKERNEL_SRCS',
+    'ALL_SSE2_MICROKERNEL_SRCS',
+    'ALL_SSE41_MICROKERNEL_SRCS',
+    'ALL_SSSE3_MICROKERNEL_SRCS',
+    'ALL_WASM_MICROKERNEL_SRCS',
+    'ALL_WASMRELAXEDSIMD_MICROKERNEL_SRCS',
+    'ALL_WASMSIMD_MICROKERNEL_SRCS',
+    'ALL_XOP_MICROKERNEL_SRCS',
+    'AARCH32_ASM_MICROKERNEL_SRCS',
+    'AARCH64_ASM_MICROKERNEL_SRCS',
 ])
 
 def handle_singleline_parse(line):
@@ -94,10 +150,11 @@ def handle_singleline_parse(line):
     end_index = line.find(")")
     line = line[start_index+1:end_index]
     key_val = line.split(" ")
-    return key_val[0], list(map(lambda x: x[4:], key_val[1:]))
+    return key_val[0], key_val[1][4:]
 
 def update_sources(xnnpack_path, cmakefile = "XNNPACK/CMakeLists.txt"):
     sources = collections.defaultdict(list)
+    count = 0
     with open(os.path.join(xnnpack_path, cmakefile)) as cmake:
         lines = cmake.readlines()
         i = 0
@@ -106,7 +163,7 @@ def update_sources(xnnpack_path, cmakefile = "XNNPACK/CMakeLists.txt"):
 
             if lines[i].startswith("SET") and "src/" in lines[i]:
                 name, val = handle_singleline_parse(line)
-                sources[name].extend(val)
+                sources[name].append(val)
                 i+=1
                 continue
 
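Behavioral note on the two hunks above: the reverted version of `handle_singleline_parse` returned every path in a `SET(...)` block with the leading `src/` prefix stripped (the `x[4:]` slice drops those four characters) and `extend`ed `sources[name]` with the whole list; the restored version returns only the first entry, `key_val[1][4:]`, and `append`s it, and `update_sources` regains its `count` variable.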
third_party/xnnpack.buck.bzl (vendored, 860 lines changed): file diff suppressed because it is too large
third_party/xnnpack_src_defs.bzl (vendored, 8007 lines changed): file diff suppressed because it is too large
third_party/xnnpack_wrapper_defs.bzl (vendored, 6104 lines changed): file diff suppressed because it is too large