Mirror of https://github.com/pytorch/pytorch.git, synced 2025-10-20 21:14:14 +08:00
Revert "[8/n] Update XNNPACK Version Part 8 Everything Remaining to get it to work (#115587)"
This reverts commit a8dc9d8e353ddcf7db0247349a3acd0dd37fcc6f. Reverted https://github.com/pytorch/pytorch/pull/115587 on behalf of https://github.com/facebook-github-bot due to Diff reverted internally ([comment](https://github.com/pytorch/pytorch/pull/115587#issuecomment-1852835898))
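What the revert changes, in substance: PR #115587 had migrated these call sites from XNNPACK's single-stage `xnn_setup_*` API, which binds shapes, data pointers, and the threadpool in one call, to the newer two-stage API in which an `xnn_reshape_*` call fixes the shapes (and, for convolutions, reports workspace requirements) and a slimmer `xnn_setup_*` call then binds only the data pointers before `xnn_run_operator`. The revert restores the single-stage convention throughout and rolls the XNNPACK submodule and wrapper-generation scripts back to match. Below is a minimal sketch of the two conventions for the quantized add operator, assembled from the signatures visible in the hunks that follow; `OLD_XNNPACK_API` is a hypothetical switch for illustration only, since each branch compiles only against its matching XNNPACK revision.

```cpp
#include <xnnpack.h>
#include <pthreadpool.h>
#include <cstdint>
#include <vector>

// Sketch of the two XNNPACK calling conventions touched by this revert,
// shown for the qs8 add operator. Assumes an already created operator.
enum xnn_status add_qs8(xnn_operator_t op,
                        const std::vector<size_t>& a_shape,
                        const std::vector<size_t>& b_shape,
                        const int8_t* da, const int8_t* db, int8_t* dc,
                        pthreadpool_t pool) {
  enum xnn_status status = xnn_status_success;
#if defined(OLD_XNNPACK_API)  // hypothetical switch, for illustration only
  // Older API (what this revert restores): one setup call receives shapes,
  // data pointers, and the threadpool together.
  status = xnn_setup_add_nd_qs8(
      op,
      a_shape.size(), a_shape.data(),
      b_shape.size(), b_shape.data(),
      da, db, dc,
      pool);
#else
  // Newer API (what the reverted PR had adopted): shapes go to a reshape
  // call; setup then binds only the data pointers.
  status = xnn_reshape_add_nd_qs8(
      op,
      a_shape.size(), a_shape.data(),
      b_shape.size(), b_shape.data(),
      pool);
  if (status != xnn_status_success) return status;
  status = xnn_setup_add_nd_qs8(op, da, db, dc);
#endif
  if (status != xnn_status_success) return status;
  return xnn_run_operator(op, pool);
}
```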
@@ -259,32 +259,24 @@ enum xnn_status xnnp_create_add_nd(
 }
 
 C10_ALWAYS_INLINE
-enum xnn_status xnnp_reshape_add_nd(
+enum xnn_status xnnp_setup_add_nd(
     xnn_operator_t op,
     const std::vector<size_t>& a_shape,
     const std::vector<size_t>& b_shape,
-    pthreadpool_t pt_pool) {
-  return xnn_reshape_add_nd_qs8(
-      op, /* xnn_operator_t add_op */
-      a_shape.size(), /* size_t num_input1_dims */
-      a_shape.data(), /* const size_t* input1_shape */
-      b_shape.size(), /* size_t num_input2_dims */
-      b_shape.data(), /* const size_t* input2_shape */
-      pt_pool); /* pthreadpool_t threadpool */
-}
-
-C10_ALWAYS_INLINE
-enum xnn_status xnnp_setup_add_nd(
-    xnn_operator_t op,
     const int8_t* da,
     const int8_t* db,
     int8_t* dc,
     pthreadpool_t pt_pool) {
   return xnn_setup_add_nd_qs8(
       op, /* xnn_operator_t add_op */
+      a_shape.size(), /* size_t num_input1_dims */
+      a_shape.data(), /* const size_t* input1_shape */
+      b_shape.size(), /* size_t num_input2_dims */
+      b_shape.data(), /* const size_t* input2_shape */
       da, /* const int8_t* input1 */
       db, /* const int8_t* input2 */
-      dc); /* int8_t* output */
+      dc, /* int8_t* output */
+      pt_pool); /* pthreadpool_t threadpool */
 }
 
 template <typename scalar_t, bool ReLUFused = false>
@@ -356,20 +348,11 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
   const auto qa_shape = xnnp_utils::get_mem_format_aware_shape(qa_contig);
   const auto qb_shape = xnnp_utils::get_mem_format_aware_shape(qb_contig);
 
-  // Reshape the operator
-  status = xnnp_reshape_add_nd(
-      xnnp_add_operator.get(),
-      qa_shape,
-      qb_shape,
-      caffe2::pthreadpool_());
-
-  TORCH_CHECK(
-      status == xnn_status_success,
-      func_name, ": xnn reshape operator failed(", status,")!");
-
   // Setup the operator
   status = xnnp_setup_add_nd(
       xnnp_add_operator.get(),
+      qa_shape,
+      qb_shape,
       reinterpret_cast<const underlying_t*>(qa_contig.data_ptr<scalar_t>()),
       reinterpret_cast<const underlying_t*>(qb_contig.data_ptr<scalar_t>()),
       reinterpret_cast<underlying_t*>(qy.data_ptr<scalar_t>()),
@@ -100,7 +100,6 @@ enum xnn_status xnnp_create_convolution2d_nhwc(
         op_max, /* int8_t output_max */
         flags, /* uint32_t flags */
         nullptr, /* xnn_caches_t caches */
-        nullptr, /* xnn_weights_cache_t weights_cache */
         op); /* xnn_operator_t* deconvolution_op_out */
 
 }
@@ -133,10 +132,9 @@ enum xnn_status xnnp_create_convolution2d_nhwc(
         op_max, /* int8_t output_max */
         flags, /* uint32_t flags */
         nullptr, /* xnn_caches_t caches */
-        nullptr, /* xnn_weights_cache_t weights_cache */
         op); /* xnn_operator_t* convolution_op_out */
   } else { /* per_channel */
-    return xnn_create_convolution2d_nhwc_qs8_qc8w(
+    return xnn_create_convolution2d_nhwc_qc8(
         pad_top, /* uint32_t input_padding_top */
         pad_right, /* uint32_t input_padding_right */
         pad_bottom, /* uint32_t input_padding_bottom */
@@ -163,99 +161,58 @@ enum xnn_status xnnp_create_convolution2d_nhwc(
         op_max, /* int8_t output_max */
         flags, /* uint32_t flags */
         nullptr, /* xnn_caches_t caches */
-        nullptr, /* xnn_weights_cache_t weights_cache */
         op); /* xnn_operator_t* convolution_op_out */
   }
 }
 
-/*
- * Series of reshape wrapper functions to call xnn_reshape_[de]conv* functions.
- */
-C10_ALWAYS_INLINE
-enum xnn_status xnnp_reshape_convolution2d_nhwc(
-    xnn_operator_t op,
-    size_t batch,
-    size_t in_h,
-    size_t in_w,
-    pthreadpool_t pt_pool,
-    bool per_channel = false,
-    bool transpose = false,
-    uint32_t adj_h = 0,
-    uint32_t adj_w = 0) {
-  if(transpose) {
-    TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!");
-    return xnn_reshape_deconvolution2d_nhwc_qs8(
-        op, /* xnn_operator_t deconvolution_op */
-        batch, /* size_t batch_size */
-        in_h, /* size_t input_height */
-        in_w, /* size_t input_width */
-        adj_h, /* uint32_t adjustment_height */
-        adj_w, /* uint32_t adjustment_width */
-        nullptr, /* size_t* output_height_out */
-        nullptr, /* size_t* output_width_out */
-        pt_pool); /* pthreadpool_t threadpool */
-  }
-
-  size_t workspace_size = SIZE_MAX;
-  size_t workspace_alignment = SIZE_MAX;
-
-  if (!per_channel) {
-    return xnn_reshape_convolution2d_nhwc_qs8(
-        op, /* xnn_operator_t convolution_op */
-        batch, /* size_t batch_size */
-        in_h, /* size_t input_height */
-        in_w, /* size_t input_width */
-        &workspace_size, /* size_t* workspace_size */
-        &workspace_alignment, /* size_t* workspace_alignment */
-        nullptr, /* size_t* output_height_out */
-        nullptr, /* size_t* output_width_out */
-        pt_pool); /* pthreadpool_t threadpool */
-  } else { /* per_channel */
-    return xnn_reshape_convolution2d_nhwc_qs8_qc8w(
-        op, /* xnn_operator_t convolution_op */
-        batch, /* size_t batch_size */
-        in_h, /* size_t input_height */
-        in_w, /* size_t input_width */
-        &workspace_size, /* size_t* workspace_size */
-        &workspace_alignment, /* size_t* workspace_alignment */
-        nullptr, /* size_t* output_height_out */
-        nullptr, /* size_t* output_width_out */
-        pt_pool); /* pthreadpool_t threadpool */
-  }
-}
-
 
 /*
  * Series of setup wrapper functions to call xnn_setup_[de]conv* functions.
 */
 C10_ALWAYS_INLINE
 enum xnn_status xnnp_setup_convolution2d_nhwc(
     xnn_operator_t op,
+    size_t batch,
+    size_t in_h,
+    size_t in_w,
     const int8_t* inp,
     int8_t* outp,
+    pthreadpool_t pt_pool,
     bool per_channel = false,
-    bool transpose = false) {
+    bool transpose = false,
+    uint32_t adj_h = 0,
+    uint32_t adj_w = 0) {
   if(transpose) {
     TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!");
 
     return xnn_setup_deconvolution2d_nhwc_qs8(
         op, /* xnn_operator_t deconvolution_op */
+        batch, /* size_t batch_size */
+        in_h, /* size_t input_height */
+        in_w, /* size_t input_width */
+        adj_h, /* uint32_t adjustment_height */
+        adj_w, /* uint32_t adjustment_width */
         inp, /* const int8_t* input */
-        outp); /* int8_t* output */
+        outp, /* int8_t* output */
+        pt_pool); /* pthreadpool_t threadpool */
   }
 
   if (!per_channel) {
     return xnn_setup_convolution2d_nhwc_qs8(
-        op, /* xnn_operator_t deconvolution_op */
-        nullptr, /* void workspace */
-        inp, /* const int8_t* input */
-        outp); /* int8_t* output */
+        op, /* xnn_operator_t convolution_op */
+        batch, /* size_t batch_size */
+        in_h, /* size_t input_height */
+        in_w, /* size_t input_width */
+        inp, /* const int8_t* input */
+        outp, /* int8_t* output */
+        pt_pool); /* pthreadpool_t threadpool */
   } else { /* per_channel */
-    return xnn_setup_convolution2d_nhwc_qs8_qc8w(
-        op, /* xnn_operator_t deconvolution_op */
-        nullptr, /* void workspace */
-        inp, /* const int8_t* input */
-        outp); /* int8_t* output */
+    return xnn_setup_convolution2d_nhwc_qc8(
+        op, /* xnn_operator_t convolution_op */
+        batch, /* size_t batch_size */
+        in_h, /* size_t input_height */
+        in_w, /* size_t input_width */
+        inp, /* const int8_t* input */
+        outp, /* int8_t* output */
+        pt_pool); /* pthreadpool_t threadpool */
   }
 }
 
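For convolutions the reverted two-stage API also introduced an explicit workspace contract: the reshape call reports `workspace_size` and `workspace_alignment`, and setup then takes a workspace pointer, which the wrapper above passed as `nullptr`. A sketch of that flow under the reverted API, using only the calls visible in the hunk above; error handling shortened, and the null workspace mirrors the wrapper's own choice:

```cpp
#include <xnnpack.h>
#include <pthreadpool.h>
#include <cstdint>

// Sketch of the newer (reverted-away) qs8 convolution flow. Assumes an
// already created convolution operator.
enum xnn_status conv_qs8_new_api(xnn_operator_t op,
                                 size_t batch, size_t in_h, size_t in_w,
                                 const int8_t* inp, int8_t* outp,
                                 pthreadpool_t pool) {
  size_t workspace_size = SIZE_MAX;
  size_t workspace_alignment = SIZE_MAX;
  // Reshape fixes the input geometry and reports workspace requirements.
  enum xnn_status status = xnn_reshape_convolution2d_nhwc_qs8(
      op, batch, in_h, in_w,
      &workspace_size, &workspace_alignment,
      /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
      pool);
  if (status != xnn_status_success) return status;
  // Setup binds the workspace (null here, as in the wrapper above) plus the
  // data pointers; a caller could instead allocate workspace_size bytes at
  // workspace_alignment and pass that in.
  status = xnn_setup_convolution2d_nhwc_qs8(op, /*workspace=*/nullptr, inp, outp);
  if (status != xnn_status_success) return status;
  return xnn_run_operator(op, pool);
}
```

The global-average-pool hunk further down in this diff shows the other option: allocating and aligning the workspace by hand before setup.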
@@ -301,31 +258,22 @@ enum xnn_status xnnp_create_fully_connected_nc(
       output_max, /* int8_t output_max */
       flags, /* uint32_t flags */
       nullptr, /* xnn_caches_t caches */
-      nullptr, /* xnn_weights_cache_t */
       fully_connected_op_out); /* xnn_operator_t* fully_connected_op_out */
 }
 
-C10_ALWAYS_INLINE
-enum xnn_status xnnp_reshape_fully_connected_nc(
-    xnn_operator_t fully_connected_op,
-    size_t batch_size,
-    pthreadpool_t threadpool) {
-  return xnn_reshape_fully_connected_nc_qs8(
-      fully_connected_op, /* xnn_operator_t fully_connected_op */
-      batch_size, /* size_t batch_size */
-      threadpool); /* pthreadpool_t threadpool */
-}
-
 C10_ALWAYS_INLINE
 enum xnn_status xnnp_setup_fully_connected_nc(
     xnn_operator_t fully_connected_op,
+    size_t batch_size,
     const int8_t* input,
-    int8_t* output) {
+    int8_t* output,
+    pthreadpool_t threadpool) {
   return xnn_setup_fully_connected_nc_qs8(
       fully_connected_op, /* xnn_operator_t fully_connected_op */
+      batch_size, /* size_t batch_size */
       input, /* const int8_t* input */
-      output /* int8_t* output */
-      );
+      output, /* int8_t* output */
+      threadpool); /* pthreadpool_t threadpool */
 }
 
 } // namespace xnnp_utils
@@ -770,12 +770,14 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl_xnnp(
       output_zero_point,
       c10::MemoryFormat::ChannelsLast);
 
-  // Reshape the operator
-  status = at::native::xnnp_utils::xnnp_reshape_convolution2d_nhwc(
+  // Setup the operator
+  status = at::native::xnnp_utils::xnnp_setup_convolution2d_nhwc(
       xnnp_convolution_op.get(),
       N,
       H,
       W,
+      reinterpret_cast<const underlying_t*>(act_nhwc.template data_ptr<scalar_t>()),
+      reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>()),
      caffe2::pthreadpool_(),
       per_channel(),
       transpose(),
@@ -789,21 +791,6 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl_xnnp(
       status,
       ")");
 
-  // Setup the operator
-  status = at::native::xnnp_utils::xnnp_setup_convolution2d_nhwc(
-      xnnp_convolution_op.get(),
-      reinterpret_cast<const underlying_t*>(act_nhwc.template data_ptr<scalar_t>()),
-      reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>()),
-      per_channel(),
-      transpose());
-
-  TORCH_CHECK(
-      status == xnn_status_success,
-      func_name,
-      ": xnn setup operator failed(",
-      status,
-      ")");
-
   // Run the operator
   status = xnn_run_operator(
       xnnp_convolution_op.get(), /* xnn_operator_t op */
@@ -565,19 +565,14 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl_xnnp(
     rows_input *= input_contig.size(i);
   }
 
-  // Reshape the operator
-  status = at::native::xnnp_utils::xnnp_reshape_fully_connected_nc(
-      xnnp_linear_op.get(),
-      rows_input, /* batch_size */
-      caffe2::pthreadpool_());
-
   // Setup the operator
   status = at::native::xnnp_utils::xnnp_setup_fully_connected_nc(
       xnnp_linear_op.get(),
+      rows_input, /* batch_size */
       reinterpret_cast<const underlying_t*>(
           input_contig.template data_ptr<scalar_t>()),
-      reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>())
-      );
+      reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>()),
+      caffe2::pthreadpool_());
 
   TORCH_CHECK(
       status == xnn_status_success,
@@ -139,29 +139,17 @@ Tensor _mul_out_xnnpack(
   const auto self_shape = xnnp_utils::get_mem_format_aware_shape(self_contig);
   const auto other_shape = xnnp_utils::get_mem_format_aware_shape(other_contig);
 
-  // reshape operator
-  status = xnn_reshape_multiply_nd_qs8(
+  // set up operator
+  status = xnn_setup_multiply_nd_qs8(
       xnnp_qmul_operator.get(),
       self_shape.size(),
       self_shape.data(),
       other_shape.size(),
       other_shape.data(),
-      caffe2::pthreadpool_());
-
-  TORCH_CHECK(
-      status == xnn_status_success,
-      func_name,
-      ": xnn reshape operator failed(",
-      status,
-      ")!");
-
-  // set up operator
-  status = xnn_setup_multiply_nd_qs8(
-      xnnp_qmul_operator.get(),
       reinterpret_cast<const underlying_t*>(self_contig.data_ptr<scalar_t>()),
       reinterpret_cast<const underlying_t*>(other_contig.data_ptr<scalar_t>()),
-      reinterpret_cast<underlying_t*>(out.data_ptr<scalar_t>())
-      );
+      reinterpret_cast<underlying_t*>(out.data_ptr<scalar_t>()),
+      caffe2::pthreadpool_());
 
   TORCH_CHECK(
       status == xnn_status_success,
@@ -34,19 +34,12 @@ static Tensor& hardswish_impl(Tensor& input, Tensor& output) {
 
   Operator hardswish_scoped_op(hardswish_op);
 
-  const xnn_status reshape_status = xnn_reshape_hardswish_nc_f32(
-      hardswish_op,
-      input.numel(), // Batch
-      caffe2::pthreadpool_()); // threadpool
-
-  TORCH_CHECK(
-      xnn_status_success == reshape_status,
-      "xnn_reshape_hardswish_nc_f32 failed!");
-
   const xnn_status setup_status = xnn_setup_hardswish_nc_f32(
       hardswish_op,
+      input.numel(), // Batch
       input.data_ptr<float>(),
-      output.data_ptr<float>());
+      output.data_ptr<float>(),
+      caffe2::pthreadpool_()); // threadpool
 
   TORCH_CHECK(
       xnn_status_success == setup_status,
@@ -7,13 +7,18 @@
 
 namespace at::native::xnnpack {
 
-bool use_global_average_pool(const Tensor& input) {
-  return xnnpack::available() && (1 <= input.ndimension()) &&
-      (input.device().is_cpu()) && (kFloat == input.scalar_type()) &&
-      !input.requires_grad() && true;
+bool use_global_average_pool(
+    const Tensor& input) {
+  return xnnpack::available() &&
+      (1 <= input.ndimension()) &&
+      (input.device().is_cpu()) &&
+      (kFloat == input.scalar_type()) &&
+      !input.requires_grad() &&
+      true;
 }
 
-Tensor global_average_pool(const Tensor& input) {
+Tensor global_average_pool(
+    const Tensor& input) {
   using namespace internal;
 
   const Tensor input_padded_contig_nhwc =
|
|||||||
|
|
||||||
Tensor output = mobile::empty_with_tail_padding(
|
Tensor output = mobile::empty_with_tail_padding(
|
||||||
{
|
{
|
||||||
input_padded_contig_nhwc.size(Layout::Activation4D::batch),
|
input_padded_contig_nhwc.size(Layout::Activation4D::batch),
|
||||||
input_padded_contig_nhwc.size(Layout::Activation4D::channels),
|
input_padded_contig_nhwc.size(Layout::Activation4D::channels),
|
||||||
1,
|
1,
|
||||||
1,
|
1,
|
||||||
},
|
},
|
||||||
input_padded_contig_nhwc.options().dtype(),
|
input_padded_contig_nhwc.options().dtype(),
|
||||||
MemoryFormat::ChannelsLast,
|
MemoryFormat::ChannelsLast,
|
||||||
@ -33,61 +38,42 @@ Tensor global_average_pool(const Tensor& input) {
|
|||||||
|
|
||||||
xnn_operator_t global_average_pooling_op{};
|
xnn_operator_t global_average_pooling_op{};
|
||||||
const xnn_status create_status = xnn_create_global_average_pooling_nwc_f32(
|
const xnn_status create_status = xnn_create_global_average_pooling_nwc_f32(
|
||||||
input_padded_contig_nhwc.size(Layout::Activation4D::channels), // channels
|
input_padded_contig_nhwc.size(Layout::Activation4D::channels), // channels
|
||||||
input_padded_contig_nhwc.size(
|
input_padded_contig_nhwc.size(
|
||||||
Layout::Activation4D::channels), // input stride
|
Layout::Activation4D::channels), // input stride
|
||||||
input_padded_contig_nhwc.size(
|
input_padded_contig_nhwc.size(
|
||||||
Layout::Activation4D::channels), // output stride
|
Layout::Activation4D::channels), // output stride
|
||||||
-std::numeric_limits<float>::infinity(),
|
-std::numeric_limits<float>::infinity(),
|
||||||
std::numeric_limits<float>::infinity(),
|
std::numeric_limits<float>::infinity(),
|
||||||
0 /* flags */,
|
0 /* flags */,
|
||||||
&global_average_pooling_op);
|
&global_average_pooling_op);
|
||||||
|
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
xnn_status_success == create_status,
|
xnn_status_success == create_status,
|
||||||
"xnn_create_global_average_pooling_nwc_f32 failed!");
|
"xnn_create_global_average_pooling_nwc_f32 failed!");
|
||||||
|
|
||||||
Operator global_avg_pool_scoped_op(global_average_pooling_op);
|
Operator global_avg_pool_scoped_op(global_average_pooling_op);
|
||||||
|
|
||||||
size_t workspace_size = 0;
|
const xnn_status setup_status = xnn_setup_global_average_pooling_nwc_f32(
|
||||||
size_t workspace_alignment = 0;
|
|
||||||
|
|
||||||
const xnn_status reshape_status = xnn_reshape_global_average_pooling_nwc_f32(
|
|
||||||
global_average_pooling_op,
|
global_average_pooling_op,
|
||||||
input_padded_contig_nhwc.size(Layout::Activation4D::batch), // batch_size
|
input_padded_contig_nhwc.size(Layout::Activation4D::batch), // batch_size
|
||||||
input_padded_contig_nhwc.size(Layout::Activation4D::width) *
|
input_padded_contig_nhwc.size(Layout::Activation4D::width) *
|
||||||
input_padded_contig_nhwc.size(Layout::Activation4D::height), // width
|
input_padded_contig_nhwc.size(Layout::Activation4D::height), // width
|
||||||
&workspace_size, // workspace_size
|
input_padded_contig_nhwc.data_ptr<float>(),
|
||||||
&workspace_alignment, // workspace_alignment
|
output.data_ptr<float>(),
|
||||||
caffe2::pthreadpool_());
|
caffe2::pthreadpool_());
|
||||||
|
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
xnn_status_success == reshape_status,
|
xnn_status_success == setup_status,
|
||||||
"xnn_reshape_global_average_pooling_nwc_f32 failed!");
|
"xnn_setup_global_average_pooling_nwc_f32 failed!");
|
||||||
|
|
||||||
// Create Workspace pointer, which we will align and pad with 16 bytes
|
const xnn_status run_status = xnn_run_operator(
|
||||||
size_t xnnpack_buffer_padding = 16;
|
global_average_pooling_op,
|
||||||
std::vector<char> workspace_vector(workspace_size + workspace_alignment + xnnpack_buffer_padding);
|
caffe2::pthreadpool_());
|
||||||
void* maybe_aligned_workspace = workspace_vector.data();
|
|
||||||
void* aligned_workspace =
|
|
||||||
(void*)((intptr_t)maybe_aligned_workspace + workspace_alignment - (intptr_t)maybe_aligned_workspace % workspace_alignment);
|
|
||||||
|
|
||||||
const xnn_status setup_status = xnn_setup_global_average_pooling_nwc_f32(
|
|
||||||
global_average_pooling_op,
|
|
||||||
aligned_workspace,
|
|
||||||
input_padded_contig_nhwc.data_ptr<float>(),
|
|
||||||
output.data_ptr<float>());
|
|
||||||
|
|
||||||
TORCH_CHECK(
|
TORCH_CHECK(
|
||||||
xnn_status_success == setup_status,
|
xnn_status_success == run_status,
|
||||||
"xnn_setup_global_average_pooling_nwc_f32 failed!");
|
"xnn_setup_global_average_pooling_nwc_f32 failed!");
|
||||||
|
|
||||||
const xnn_status run_status =
|
|
||||||
xnn_run_operator(global_average_pooling_op, caffe2::pthreadpool_());
|
|
||||||
|
|
||||||
TORCH_CHECK(
|
|
||||||
xnn_status_success == run_status,
|
|
||||||
"xnn_setup_global_average_pooling_nwc_f32 failed!");
|
|
||||||
|
|
||||||
return output.to(input.suggest_memory_format());
|
return output.to(input.suggest_memory_format());
|
||||||
}
|
}
|
||||||
|
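The removed block above sized and aligned the setup workspace by hand. The expression `(intptr_t)p + workspace_alignment - (intptr_t)p % workspace_alignment` rounds the pointer up to the next multiple of the alignment; note it advances by a full alignment even when `p` is already aligned, which the extra 16 bytes of `xnnpack_buffer_padding` absorb. As a standalone sketch of the same arithmetic:

```cpp
#include <cstdint>
#include <vector>

// Round a raw pointer up to the next multiple of `alignment`, as the removed
// code above did. Always advances by at least one byte and by up to a full
// `alignment`, so the backing buffer must be over-allocated accordingly.
void* align_up(void* p, size_t alignment) {
  return (void*)((intptr_t)p + alignment - (intptr_t)p % alignment);
}

std::vector<char> make_workspace(size_t workspace_size, size_t workspace_alignment) {
  const size_t xnnpack_buffer_padding = 16;  // slack for the round-up, as above
  return std::vector<char>(workspace_size + workspace_alignment + xnnpack_buffer_padding);
}
```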
@@ -79,19 +79,12 @@ Tensor channel_shuffle(
       input_padded_contig_nhwc.size(Layout::Activation4D::height) *
       input_padded_contig_nhwc.size(Layout::Activation4D::width);
 
-  const xnn_status reshape_status = xnn_reshape_channel_shuffle_nc_x32(
-      channel_shuffle_op, // operator
-      batch_size, // batch_size
-      caffe2::pthreadpool_()); // threadpool
-
-  TORCH_CHECK(
-      xnn_status_success == reshape_status,
-      "xnn_reshape_channel_shuffle_nc_x32 failed!");
-
   const xnn_status setup_status = xnn_setup_channel_shuffle_nc_x32(
       channel_shuffle_op, // operator
+      batch_size, // batch_size
       input_padded_contig_nhwc.data_ptr<float>(), // input
-      output_padded_contig_nhwc.data_ptr<float>()); // output
+      output_padded_contig_nhwc.data_ptr<float>(), // output
+      caffe2::pthreadpool_()); // threadpool
 
   TORCH_CHECK(
       xnn_status_success == setup_status,
@@ -236,7 +236,6 @@ ContextConv2D create(
         output_max, // output_max
         0u, // flags
         nullptr, // xnn_caches_t
-        nullptr, // xnn_weights_cache_t
         &convolution_op); // operator
   } else {
     for (const auto i : c10::irange(4)) {
|
|||||||
output_max, // output_max
|
output_max, // output_max
|
||||||
0u, // flags
|
0u, // flags
|
||||||
nullptr, // xnn_caches_t
|
nullptr, // xnn_caches_t
|
||||||
nullptr, // xnn_weights_cache_t
|
|
||||||
&convolution_op); // operator
|
&convolution_op); // operator
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -340,41 +338,26 @@ Tensor run(
   */
 
   if (context.transposed_) {
-    setup_status = xnn_reshape_deconvolution2d_nhwc_f32(
-      context.op.get(),
+    setup_status = xnn_setup_deconvolution2d_nhwc_f32(
+      context.op.get(), // operator
       padded_input_nhwc.size(Layout::Activation4D::batch), // batch_size
       padded_input_nhwc.size(Layout::Activation4D::height), // input_height
       padded_input_nhwc.size(Layout::Activation4D::width), // input_width
       context.output_padding_[0], // adjustment_height
       context.output_padding_[1], // adjustment_width
-      nullptr, // output_height_out
-      nullptr, // output_width_out
+      padded_input_nhwc.data_ptr<float>(), // input
+      output.data_ptr<float>(), // output
       caffe2::pthreadpool_()); // threadpool
-
-    setup_status = xnn_setup_deconvolution2d_nhwc_f32(
-      context.op.get(), // operator
-      padded_input_nhwc.data_ptr<float>(), // input
-      output.data_ptr<float>()); // output
   } else {
-    size_t workspace_size = SIZE_MAX;
-    size_t workspace_alignment = SIZE_MAX;
-
-    setup_status = xnn_reshape_convolution2d_nhwc_f32(
-      context.op.get(),
+    setup_status = xnn_setup_convolution2d_nhwc_f32(
+      context.op.get(), // operator
       padded_input_nhwc.size(Layout::Activation4D::batch), // batch_size
       padded_input_nhwc.size(Layout::Activation4D::height), // input_height
       padded_input_nhwc.size(Layout::Activation4D::width), // input_width
-      &workspace_size, // workspace_size
-      &workspace_alignment, // workspace_alignment
-      nullptr, // output_height_out
-      nullptr, // output_width_out
-      caffe2::pthreadpool_());
-
-    setup_status = xnn_setup_convolution2d_nhwc_f32(
-      context.op.get(), // operator
-      nullptr, // workspace
       padded_input_nhwc.data_ptr<float>(), // input
-      output.data_ptr<float>()); // output
+      output.data_ptr<float>(), // output
+      caffe2::pthreadpool_());
   }
 
   TORCH_CHECK(
@@ -95,7 +95,6 @@ ContextLinear create(
       output_max, // output_max
       0u, // flags
       nullptr, // xnn_caches_t
-      nullptr, // xnn_weights_cache_t
       &linear_op); // operator
 
   TORCH_CHECK(
@@ -137,19 +136,12 @@ Tensor run(
       padded_input.suggest_memory_format(),
       padded_input.opt_names());
 
-  const xnn_status reshape_status = xnn_reshape_fully_connected_nc_f32(
-      context.op.get(), // operator
-      Layout::ActivationND::batch(padded_input.sizes()), // Batch,
-      caffe2::pthreadpool_()); // threadpool
-
-  TORCH_CHECK(
-      xnn_status_success == reshape_status,
-      "xnn_reshape_fully_connected_nc_f32 failed!");
-
   const xnn_status setup_status = xnn_setup_fully_connected_nc_f32(
       context.op.get(), // operator
+      Layout::ActivationND::batch(padded_input.sizes()), // Batch,
       padded_input.data_ptr<float>(), // input
-      output.data_ptr<float>()); // output
+      output.data_ptr<float>(), // output
+      caffe2::pthreadpool_()); // threadpool
 
   TORCH_CHECK(
       xnn_status_success == setup_status,
@@ -214,23 +214,14 @@ Tensor max_pool2d(
       xnn_status_success == create_status,
       "xnn_create_max_pooling2d_nhwc_f32 failed!");
 
-  const xnn_status reshape_status = xnn_reshape_max_pooling2d_nhwc_f32(
+  const xnn_status setup_status = xnn_setup_max_pooling2d_nhwc_f32(
       max_pool_op, // operator
       input_padded_contig_nhwc.size(Layout::Activation4D::batch), // batch_size
       input_padded_contig_nhwc.size(Layout::Activation4D::height), // input_height
       input_padded_contig_nhwc.size(Layout::Activation4D::width), // input_width
-      nullptr, // output_height_out
-      nullptr, // output_width_out
-      caffe2::pthreadpool_()); // threadpool
-
-  TORCH_CHECK(
-      xnn_status_success == reshape_status,
-      "xnn_reshape_max_pooling2d_nhwc_f32 failed!");
-
-  const xnn_status setup_status = xnn_setup_max_pooling2d_nhwc_f32(
-      max_pool_op, // operator
       input_padded_contig_nhwc.data_ptr<float>(), // input
-      output_padded_contig_nhwc.data_ptr<float>()); // output
+      output_padded_contig_nhwc.data_ptr<float>(), // output
+      caffe2::pthreadpool_()); // threadpool
 
   TORCH_CHECK(
       xnn_status_success == setup_status,
@@ -619,13 +619,7 @@ if(USE_XNNPACK AND NOT USE_SYSTEM_XNNPACK)
   # Disable ARM BF16 and FP16 vector for now; unused and causes build failures because
   # these new ISA features may not be supported on older compilers
   set(XNNPACK_ENABLE_ARM_BF16 OFF CACHE BOOL "")
-  # Disable AVXVNNI for now, older clang versions seem not to support it
-  # (clang 12 is where avx-vnni support is added)
-  set(XNNPACK_ENABLE_AVXVNNI OFF CACHE BOOL "")
-
-  # Disable I8MM For CI since clang 9 does not support neon i8mm.
-  set(XNNPACK_ENABLE_ARM_I8MM OFF CACHE BOOL "")
+  set(XNNPACK_ENABLE_ARM_FP16_VECTOR OFF CACHE BOOL "")
 
   # Setting this global PIC flag for all XNNPACK targets.
   # This is needed for Object libraries within XNNPACK which must
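Build-flag note: the revert drops the guards that the reverted PR had added for the AVXVNNI and ARM I8MM microkernel families (added, per the removed comments, because avx-vnni support arrives in clang 12 and CI's clang 9 lacks neon i8mm) and restores the older `XNNPACK_ENABLE_ARM_FP16_VECTOR` toggle, matching the rolled-back XNNPACK submodule pin shown below.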
third_party/BUCK.oss (vendored, 1 line changed):
@@ -127,7 +127,6 @@ cxx_library(
         "cpuinfo/wrappers/linux/multiline.c",
         "cpuinfo/wrappers/linux/processors.c",
         "cpuinfo/wrappers/linux/smallfile.c",
-        "cpuinfo/wrappers/log.c",
         "cpuinfo/wrappers/mach/topology.c",
         "cpuinfo/wrappers/x86/cache/descriptor.c",
         "cpuinfo/wrappers/x86/cache/deterministic.c",
Submodule third_party/XNNPACK (vendored) updated: d9cce341f8...51a987591a
Submodule third_party/cpuinfo (vendored) updated: d6860c477c...6481e8bef0
third_party/generate-cpuinfo-wrappers.py (vendored, 1 line changed):
@@ -9,7 +9,6 @@ CPUINFO_SOURCES = {
         "init.c",
         "api.c",
         "cache.c",
-        "log.c",
     ],
     "defined(__linux__)": [
         "linux/multiline.c",
third_party/generate-xnnpack-wrappers.py (vendored, 135 lines changed):
@@ -8,22 +8,16 @@ import logging
 
 BANNER = "Auto-generated by generate-wrappers.py script. Do not modify"
 WRAPPER_SRC_NAMES = {
-    "PROD_SCALAR_MICROKERNEL_SRCS": None,
-    "PROD_FMA_MICROKERNEL_SRCS": "defined(__riscv) || defined(__riscv__)",
-    "PROD_ARMSIMD32_MICROKERNEL_SRCS": "defined(__arm__)",
-    "PROD_FP16ARITH_MICROKERNEL_SRCS": "defined(__arm__)",
+    "PROD_SCALAR_PORTABLE_MICROKERNEL_SRCS": None,
+    "PROD_SCALAR_AARCH32_MICROKERNEL_SRCS" : "defined(__arm__)",
     "PROD_NEON_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
     "PROD_NEONFP16_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
+    "PROD_NEON_AARCH64_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
     "PROD_NEONFMA_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
-    "PROD_NEON_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)",
+    "PROD_AARCH64_NEON_MICROKERNEL_SRCS": "defined(__aarch64__)",
     "PROD_NEONV8_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
-    "PROD_NEONFP16ARITH_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
-    "PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)",
+    "PROD_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS": "defined(__aarch64__)",
     "PROD_NEONDOT_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
-    "PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)",
-    "PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
-    "PROD_NEONDOTFP16ARITH_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)",
-    "PROD_NEONI8MM_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
     "PROD_SSE_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "PROD_SSE2_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "PROD_SSSE3_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
@@ -36,13 +30,42 @@ WRAPPER_SRC_NAMES = {
     "PROD_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "PROD_AVX512SKX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "PROD_AVX512VBMI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
-    "PROD_AVX512VNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
-    "PROD_RVV_MICROKERNEL_SRCS": "defined(__riscv) || defined(__riscv__)",
-    "PROD_AVXVNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "AARCH32_ASM_MICROKERNEL_SRCS": "defined(__arm__)",
     "AARCH64_ASM_MICROKERNEL_SRCS": "defined(__aarch64__)",
 
-    # add non-prod microkernel sources here:
+    # add additoonal:
+    "PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
+    "ALL_ARMSIMD32_MICROKERNEL_SRCS": "defined(__arm__)",
+    "ALL_AVX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    "ALL_AVX2_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    "ALL_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+
+    'ALL_AVX512SKX_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_AVX512VBMI_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_F16C_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_FMA3_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_FP16ARITH_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
+    'ALL_NEON_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
+    'ALL_NEON_AARCH64_MICROKERNEL_SRCS': "defined(__aarch64__)",
+    'ALL_NEONBF16_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
+    'ALL_NEONDOT_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
+    'ALL_NEONFMA_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
+    'ALL_NEONFMA_AARCH64_MICROKERNEL_SRCS': "defined(__aarch64__)",
+    'ALL_NEONFP16_MICROKERNEL_SRCS':"defined(__arm__) || defined(__aarch64__)",
+    'ALL_NEONFP16ARITH_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
+    'ALL_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS': "defined(__aarch64__)",
+    'ALL_NEONV8_MICROKERNEL_SRCS': "defined(__aarch64__)",
+    'ALL_SCALAR_MICROKERNEL_SRCS': "defined(__arm__)",
+    'ALL_SSE_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_SSE2_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_SSE41_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_SSSE3_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'ALL_XOP_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    'AARCH32_ASM_MICROKERNEL_SRCS': "defined(__arm__)",
+    "PROD_FP16ARITH_MICROKERNEL_SRCS": "defined(__aarch64__)",
+    "PROD_NEONFP16ARITH_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
+    "PROD_SCALAR_MICROKERNEL_SRCS": "defined(__arm__)",
+
 }
 
 SRC_NAMES = set([
@@ -50,24 +73,12 @@ SRC_NAMES = set([
     "SUBGRAPH_SRCS",
     "LOGGING_SRCS",
     "XNNPACK_SRCS",
+    "HOT_SRCS",
     "TABLE_SRCS",
     "JIT_SRCS",
-    "PROD_SCALAR_MICROKERNEL_SRCS",
-    "PROD_FMA_MICROKERNEL_SRCS",
-    "PROD_ARMSIMD32_MICROKERNEL_SRCS",
-    "PROD_FP16ARITH_MICROKERNEL_SRCS",
-    "PROD_NEON_MICROKERNEL_SRCS",
-    "PROD_NEONFP16_MICROKERNEL_SRCS",
-    "PROD_NEONFMA_MICROKERNEL_SRCS",
-    "PROD_NEON_AARCH64_MICROKERNEL_SRCS",
-    "PROD_NEONV8_MICROKERNEL_SRCS",
-    "PROD_NEONFP16ARITH_MICROKERNEL_SRCS",
-    "PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS",
-    "PROD_NEONDOT_MICROKERNEL_SRCS",
-    "PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS",
-    "PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS",
-    "PROD_NEONDOTFP16ARITH_AARCH64_MICROKERNEL_SRCS",
-    "PROD_NEONI8MM_MICROKERNEL_SRCS",
+    "JIT_AARCH32_SRCS",
+    "JIT_AARCH64_SRCS",
+    "PROD_SCALAR_PORTABLE_MICROKERNEL_SRCS",
     "PROD_SSE_MICROKERNEL_SRCS",
     "PROD_SSE2_MICROKERNEL_SRCS",
     "PROD_SSSE3_MICROKERNEL_SRCS",
@@ -79,14 +90,59 @@ SRC_NAMES = set([
     "PROD_AVX2_MICROKERNEL_SRCS",
     "PROD_AVX512F_MICROKERNEL_SRCS",
     "PROD_AVX512SKX_MICROKERNEL_SRCS",
+    "PROD_SCALAR_MICROKERNEL_SRCS",
+    "PROD_SCALAR_AARCH32_MICROKERNEL_SRCS",
+    "PROD_SCALAR_RISCV_MICROKERNEL_SRCS",
+    "PROD_ARMSIMD32_MICROKERNEL_SRCS",
+    "PROD_FP16ARITH_MICROKERNEL_SRCS",
+    "PROD_NEON_MICROKERNEL_SRCS",
+    "PROD_NEONFP16_MICROKERNEL_SRCS",
+    "PROD_NEONFMA_MICROKERNEL_SRCS",
+    "PROD_NEON_AARCH64_MICROKERNEL_SRCS",
+    "PROD_NEONV8_MICROKERNEL_SRCS",
+    "PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS",
+    "PROD_NEONDOT_MICROKERNEL_SRCS",
+    "PROD_SSE2_MICROKERNEL_SRCS",
+    "PROD_SSSE3_MICROKERNEL_SRCS",
+    "PROD_SSE41_MICROKERNEL_SRCS",
+    "PROD_AVX_MICROKERNEL_SRCS",
+    "PROD_F16C_MICROKERNEL_SRCS",
     "PROD_AVX512VBMI_MICROKERNEL_SRCS",
-    "PROD_AVX512VNNI_MICROKERNEL_SRCS",
-    "PROD_RVV_MICROKERNEL_SRCS",
-    "PROD_AVXVNNI_MICROKERNEL_SRCS",
-    "AARCH32_ASM_MICROKERNEL_SRCS",
-    "AARCH64_ASM_MICROKERNEL_SRCS",
+    "PROD_NEONFP16ARITH_MICROKERNEL_SRCS",
 
-    # add non-prod microkernel sources here:
+    # new adding libs:
+    'ALL_ARMSIMD32_MICROKERNEL_SRCS',
+    'ALL_AVX_MICROKERNEL_SRCS',
+    'ALL_AVX2_MICROKERNEL_SRCS',
+    'ALL_AVX512F_MICROKERNEL_SRCS',
+    'ALL_AVX512SKX_MICROKERNEL_SRCS',
+    'ALL_AVX512VBMI_MICROKERNEL_SRCS',
+    'ALL_F16C_MICROKERNEL_SRCS',
+    'ALL_FMA3_MICROKERNEL_SRCS',
+    'ALL_FP16ARITH_MICROKERNEL_SRCS',
+    'ALL_HEXAGON_MICROKERNEL_SRCS',
+    'ALL_NEON_MICROKERNEL_SRCS',
+    'ALL_NEON_AARCH64_MICROKERNEL_SRCS',
+    'ALL_NEONBF16_MICROKERNEL_SRCS',
+    'ALL_NEONBF16_AARCH64_MICROKERNEL_SRCS',
+    'ALL_NEONDOT_MICROKERNEL_SRCS',
+    'ALL_NEONFMA_MICROKERNEL_SRCS',
+    'ALL_NEONFMA_AARCH64_MICROKERNEL_SRCS',
+    'ALL_NEONFP16_MICROKERNEL_SRCS',
+    'ALL_NEONFP16ARITH_MICROKERNEL_SRCS',
+    'ALL_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS',
+    'ALL_NEONV8_MICROKERNEL_SRCS',
+    'ALL_SCALAR_MICROKERNEL_SRCS',
+    'ALL_SSE_MICROKERNEL_SRCS',
+    'ALL_SSE2_MICROKERNEL_SRCS',
+    'ALL_SSE41_MICROKERNEL_SRCS',
+    'ALL_SSSE3_MICROKERNEL_SRCS',
+    'ALL_WASM_MICROKERNEL_SRCS',
+    'ALL_WASMRELAXEDSIMD_MICROKERNEL_SRCS',
+    'ALL_WASMSIMD_MICROKERNEL_SRCS',
+    'ALL_XOP_MICROKERNEL_SRCS',
+    'AARCH32_ASM_MICROKERNEL_SRCS',
+    'AARCH64_ASM_MICROKERNEL_SRCS',
 ])
 
 def handle_singleline_parse(line):
@@ -94,10 +150,11 @@ def handle_singleline_parse(line):
     end_index = line.find(")")
     line = line[start_index+1:end_index]
     key_val = line.split(" ")
-    return key_val[0], list(map(lambda x: x[4:], key_val[1:]))
+    return key_val[0], key_val[1][4:]
 
 def update_sources(xnnpack_path, cmakefile = "XNNPACK/CMakeLists.txt"):
     sources = collections.defaultdict(list)
+    count = 0
     with open(os.path.join(xnnpack_path, cmakefile)) as cmake:
         lines = cmake.readlines()
         i = 0
@@ -106,7 +163,7 @@ def update_sources(xnnpack_path, cmakefile = "XNNPACK/CMakeLists.txt"):
 
             if lines[i].startswith("SET") and "src/" in lines[i]:
                 name, val = handle_singleline_parse(line)
-                sources[name].extend(val)
+                sources[name].append(val)
                 i+=1
                 continue
 
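Behavioral note on the two hunks above: the reverted version of `handle_singleline_parse` returned every path in a `SET(...)` block with the leading `src/` prefix stripped (the `x[4:]` slice drops those four characters) and `extend`ed `sources[name]` with the whole list; the restored version returns only the first entry, `key_val[1][4:]`, and `append`s it, and `update_sources` regains its `count` variable.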
third_party/xnnpack.buck.bzl (vendored, 860 lines changed): file diff suppressed because it is too large
third_party/xnnpack_src_defs.bzl (vendored, 8007 lines changed): file diff suppressed because it is too large
third_party/xnnpack_wrapper_defs.bzl (vendored, 6104 lines changed): file diff suppressed because it is too large