mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
[8/n] Update XNNPACK Version Part 8 Everything Remaining to get it to work (#115587)
> **__Note:__** The XNNPACK upgrade is too large to land as one change — on the order of **40k** files and **10M** lines of code — so we break the library update into multiple parts. All parts [1 - 6/n] must be landed together for the upgrade to work. ***This also means that if there is a revert, please revert the entire stack.*** This change contains everything remaining that the new XNNPACK version needs in order to work. Differential Revision: [D52044420](https://our.internmc.facebook.com/intern/diff/D52044420/) Pull Request resolved: https://github.com/pytorch/pytorch/pull/115587 Approved by: https://github.com/digantdesai
Committed by: PyTorch MergeBot · parent e918461377 · commit a8dc9d8e35
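The functional change repeated across every hunk below is XNNPACK's move from a one-shot `xnn_setup_*` call (shapes, data pointers, and threadpool all at once) to a two-step `xnn_reshape_*` + `xnn_setup_*` sequence: reshape binds shapes and the threadpool (and, for some operators, reports workspace requirements), while setup binds only data pointers. As a reading aid, here is a minimal sketch of the new call order for the quantized add operator, built from the signatures visible in the hunks below; the helper name, buffers, and error handling are illustrative, not part of the commit.

```cpp
#include <xnnpack.h>
#include <pthreadpool.h>
#include <vector>

// Sketch: the new reshape -> setup -> run lifecycle used throughout this
// commit. Assumes `add_op` was created earlier (e.g. via
// xnn_create_add_nd_qs8); `a`, `b`, `out` are caller-owned int8 buffers.
xnn_status run_qs8_add(
    xnn_operator_t add_op,
    const std::vector<size_t>& a_shape,
    const std::vector<size_t>& b_shape,
    const int8_t* a, const int8_t* b, int8_t* out,
    pthreadpool_t pool) {
  // Phase 1 (new in this version): reshape binds shapes and the threadpool.
  xnn_status status = xnn_reshape_add_nd_qs8(
      add_op,
      a_shape.size(), a_shape.data(),
      b_shape.size(), b_shape.data(),
      pool);
  if (status != xnn_status_success) return status;

  // Phase 2: setup now binds only the data pointers.
  status = xnn_setup_add_nd_qs8(add_op, a, b, out);
  if (status != xnn_status_success) return status;

  // Phase 3: run executes with the previously bound shapes and pointers.
  return xnn_run_operator(add_op, pool);
}
```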
@@ -259,24 +259,32 @@ enum xnn_status xnnp_create_add_nd(
 }
 
 C10_ALWAYS_INLINE
+enum xnn_status xnnp_reshape_add_nd(
+    xnn_operator_t op,
+    const std::vector<size_t>& a_shape,
+    const std::vector<size_t>& b_shape,
+    pthreadpool_t pt_pool) {
+  return xnn_reshape_add_nd_qs8(
+      op,             /* xnn_operator_t add_op */
+      a_shape.size(), /* size_t num_input1_dims */
+      a_shape.data(), /* const size_t* input1_shape */
+      b_shape.size(), /* size_t num_input2_dims */
+      b_shape.data(), /* const size_t* input2_shape */
+      pt_pool);       /* pthreadpool_t threadpool */
+}
+
+C10_ALWAYS_INLINE
 enum xnn_status xnnp_setup_add_nd(
     xnn_operator_t op,
-    const std::vector<size_t>& a_shape,
-    const std::vector<size_t>& b_shape,
     const int8_t* da,
     const int8_t* db,
     int8_t* dc,
     pthreadpool_t pt_pool) {
   return xnn_setup_add_nd_qs8(
       op,             /* xnn_operator_t add_op */
-      a_shape.size(), /* size_t num_input1_dims */
-      a_shape.data(), /* const size_t* input1_shape */
-      b_shape.size(), /* size_t num_input2_dims */
-      b_shape.data(), /* const size_t* input2_shape */
       da,             /* const int8_t* input1 */
       db,             /* const int8_t* input2 */
-      dc,             /* int8_t* output */
-      pt_pool);       /* pthreadpool_t threadpool */
+      dc);            /* int8_t* output */
 }
 
 template <typename scalar_t, bool ReLUFused = false>
@@ -348,11 +356,20 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
   const auto qa_shape = xnnp_utils::get_mem_format_aware_shape(qa_contig);
   const auto qb_shape = xnnp_utils::get_mem_format_aware_shape(qb_contig);
 
-  // Setup the operator
-  status = xnnp_setup_add_nd(
+  // Reshape the operator
+  status = xnnp_reshape_add_nd(
       xnnp_add_operator.get(),
       qa_shape,
       qb_shape,
       caffe2::pthreadpool_());
+
+  TORCH_CHECK(
+      status == xnn_status_success,
+      func_name, ": xnn reshape operator failed(", status,")!");
+
+  // Setup the operator
+  status = xnnp_setup_add_nd(
+      xnnp_add_operator.get(),
+      reinterpret_cast<const underlying_t*>(qa_contig.data_ptr<scalar_t>()),
+      reinterpret_cast<const underlying_t*>(qb_contig.data_ptr<scalar_t>()),
+      reinterpret_cast<underlying_t*>(qy.data_ptr<scalar_t>()),
@@ -100,6 +100,7 @@ enum xnn_status xnnp_create_convolution2d_nhwc(
         op_max,  /* int8_t output_max */
         flags,   /* uint32_t flags */
         nullptr, /* xnn_caches_t caches */
+        nullptr, /* xnn_weights_cache_t weights_cache */
         op);     /* xnn_operator_t* deconvolution_op_out */
 
 }
@@ -132,9 +133,10 @@ enum xnn_status xnnp_create_convolution2d_nhwc(
         op_max,  /* int8_t output_max */
         flags,   /* uint32_t flags */
         nullptr, /* xnn_caches_t caches */
+        nullptr, /* xnn_weights_cache_t weights_cache */
         op);     /* xnn_operator_t* convolution_op_out */
   } else { /* per_channel */
-    return xnn_create_convolution2d_nhwc_qc8(
+    return xnn_create_convolution2d_nhwc_qs8_qc8w(
         pad_top,    /* uint32_t input_padding_top */
         pad_right,  /* uint32_t input_padding_right */
         pad_bottom, /* uint32_t input_padding_bottom */
@@ -161,21 +163,20 @@ enum xnn_status xnnp_create_convolution2d_nhwc(
         op_max,  /* int8_t output_max */
         flags,   /* uint32_t flags */
         nullptr, /* xnn_caches_t caches */
+        nullptr, /* xnn_weights_cache_t weights_cache */
         op);     /* xnn_operator_t* convolution_op_out */
   }
 }
 
 /*
- * Series of setup wrapper functions to call xnn_setup_[de]conv* functions.
+ * Series of reshape wrapper functions to call xnn_reshape_[de]conv* functions.
  */
 C10_ALWAYS_INLINE
-enum xnn_status xnnp_setup_convolution2d_nhwc(
+enum xnn_status xnnp_reshape_convolution2d_nhwc(
     xnn_operator_t op,
     size_t batch,
     size_t in_h,
     size_t in_w,
-    const int8_t* inp,
-    int8_t* outp,
     pthreadpool_t pt_pool,
     bool per_channel = false,
     bool transpose = false,
@@ -183,36 +184,78 @@ enum xnn_status xnnp_setup_convolution2d_nhwc(
     uint32_t adj_w = 0) {
   if(transpose) {
     TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!");
-    return xnn_setup_deconvolution2d_nhwc_qs8(
+    return xnn_reshape_deconvolution2d_nhwc_qs8(
         op,       /* xnn_operator_t deconvolution_op */
         batch,    /* size_t batch_size */
         in_h,     /* size_t input_height */
         in_w,     /* size_t input_width */
         adj_h,    /* uint32_t adjustment_height */
         adj_w,    /* uint32_t adjustment_width */
-        inp,      /* const int8_t* input */
-        outp,     /* int8_t* output */
+        nullptr,  /* size_t* output_height_out */
+        nullptr,  /* size_t* output_width_out */
        pt_pool); /* pthreadpool_t threadpool */
   }
 
+  size_t workspace_size = SIZE_MAX;
+  size_t workspace_alignment = SIZE_MAX;
+
   if (!per_channel) {
-    return xnn_setup_convolution2d_nhwc_qs8(
+    return xnn_reshape_convolution2d_nhwc_qs8(
         op,       /* xnn_operator_t convolution_op */
         batch,    /* size_t batch_size */
         in_h,     /* size_t input_height */
         in_w,     /* size_t input_width */
-        inp,      /* const int8_t* input */
-        outp,     /* int8_t* output */
+        &workspace_size,      /* size_t* workspace_size */
+        &workspace_alignment, /* size_t* workspace_alignment */
+        nullptr,  /* size_t* output_height_out */
+        nullptr,  /* size_t* output_width_out */
         pt_pool); /* pthreadpool_t threadpool */
   } else { /* per_channel */
-    return xnn_setup_convolution2d_nhwc_qc8(
+    return xnn_reshape_convolution2d_nhwc_qs8_qc8w(
         op,       /* xnn_operator_t convolution_op */
         batch,    /* size_t batch_size */
         in_h,     /* size_t input_height */
         in_w,     /* size_t input_width */
-        inp,      /* const int8_t* input */
-        outp,     /* int8_t* output */
+        &workspace_size,      /* size_t* workspace_size */
+        &workspace_alignment, /* size_t* workspace_alignment */
+        nullptr,  /* size_t* output_height_out */
+        nullptr,  /* size_t* output_width_out */
         pt_pool); /* pthreadpool_t threadpool */
   }
 }
 
+/*
+ * Series of setup wrapper functions to call xnn_setup_[de]conv* functions.
+ */
+C10_ALWAYS_INLINE
+enum xnn_status xnnp_setup_convolution2d_nhwc(
+    xnn_operator_t op,
+    const int8_t* inp,
+    int8_t* outp,
+    bool per_channel = false,
+    bool transpose = false) {
+  if(transpose) {
+    TORCH_CHECK(!per_channel, "XNNPACK Q[SC]8 does not have a per channel deconvolution!");
+
+    return xnn_setup_deconvolution2d_nhwc_qs8(
+        op,    /* xnn_operator_t deconvolution_op */
+        inp,   /* const int8_t* input */
+        outp); /* int8_t* output */
+  }
+
+  if (!per_channel) {
+    return xnn_setup_convolution2d_nhwc_qs8(
+        op,      /* xnn_operator_t deconvolution_op */
+        nullptr, /* void workspace */
+        inp,     /* const int8_t* input */
+        outp);   /* int8_t* output */
+  } else { /* per_channel */
+    return xnn_setup_convolution2d_nhwc_qs8_qc8w(
+        op,      /* xnn_operator_t deconvolution_op */
+        nullptr, /* void workspace */
+        inp,     /* const int8_t* input */
+        outp);   /* int8_t* output */
+  }
+}
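Note how the reshape wrappers above query `workspace_size`/`workspace_alignment` while the setup wrappers pass `nullptr` for the workspace. Below is a hedged sketch, not part of the commit, of how a caller could honor that handshake instead of passing `nullptr`; the aligned-buffer trick mirrors the `global_average_pool` change later in this commit, and all names are illustrative.

```cpp
#include <xnnpack.h>
#include <cstdint>
#include <vector>

// Sketch: allocate and align the scratch space that reshape reports,
// then hand it to setup (PyTorch passes nullptr at this point instead).
xnn_status reshape_and_setup_qs8_conv(
    xnn_operator_t conv_op,
    size_t batch, size_t in_h, size_t in_w,
    const int8_t* input, int8_t* output,
    pthreadpool_t pool) {
  size_t workspace_size = SIZE_MAX;
  size_t workspace_alignment = SIZE_MAX;

  // Reshape reports how much scratch memory the chosen kernel wants.
  xnn_status status = xnn_reshape_convolution2d_nhwc_qs8(
      conv_op, batch, in_h, in_w,
      &workspace_size, &workspace_alignment,
      /*output_height_out=*/nullptr, /*output_width_out=*/nullptr,
      pool);
  if (status != xnn_status_success) return status;

  // Over-allocate, then align the pointer manually
  // (assumes reshape reported a nonzero alignment).
  std::vector<char> workspace(workspace_size + workspace_alignment);
  uintptr_t raw = reinterpret_cast<uintptr_t>(workspace.data());
  void* aligned = reinterpret_cast<void*>(
      raw + (workspace_alignment - raw % workspace_alignment) % workspace_alignment);

  return xnn_setup_convolution2d_nhwc_qs8(conv_op, aligned, input, output);
}
```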
@@ -258,22 +301,31 @@ enum xnn_status xnnp_create_fully_connected_nc(
       output_max, /* int8_t output_max */
       flags,      /* uint32_t flags */
       nullptr,    /* xnn_caches_t caches */
+      nullptr,    /* xnn_weights_cache_t */
       fully_connected_op_out); /* xnn_operator_t* fully_connected_op_out */
 }
 
+C10_ALWAYS_INLINE
+enum xnn_status xnnp_reshape_fully_connected_nc(
+    xnn_operator_t fully_connected_op,
+    size_t batch_size,
+    pthreadpool_t threadpool) {
+  return xnn_reshape_fully_connected_nc_qs8(
+      fully_connected_op, /* xnn_operator_t fully_connected_op */
+      batch_size,         /* size_t batch_size */
+      threadpool);        /* pthreadpool_t threadpool */
+}
+
 C10_ALWAYS_INLINE
 enum xnn_status xnnp_setup_fully_connected_nc(
     xnn_operator_t fully_connected_op,
-    size_t batch_size,
     const int8_t* input,
-    int8_t* output,
-    pthreadpool_t threadpool) {
+    int8_t* output) {
   return xnn_setup_fully_connected_nc_qs8(
       fully_connected_op, /* xnn_operator_t fully_connected_op */
-      batch_size,         /* size_t batch_size */
       input,              /* const int8_t* input */
-      output,             /* int8_t* output */
-      threadpool);        /* pthreadpool_t threadpool */
+      output              /* int8_t* output */
+  );
 }
 
 } // namespace xnnp_utils
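The fully connected path follows the same split: reshape binds the flattened batch size and the threadpool, setup binds only the pointers. A minimal usage sketch of the two qs8 calls wrapped above (helper name, buffers, and error handling illustrative):

```cpp
#include <xnnpack.h>

// Sketch: driving the qs8 fully-connected operator with the new API.
xnn_status run_qs8_linear(
    xnn_operator_t linear_op,
    size_t rows,  // flattened batch size
    const int8_t* input,
    int8_t* output,
    pthreadpool_t pool) {
  // Bind the batch size (and threadpool) first...
  xnn_status status = xnn_reshape_fully_connected_nc_qs8(linear_op, rows, pool);
  if (status != xnn_status_success) return status;
  // ...then bind the data pointers...
  status = xnn_setup_fully_connected_nc_qs8(linear_op, input, output);
  if (status != xnn_status_success) return status;
  // ...and execute.
  return xnn_run_operator(linear_op, pool);
}
```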
@@ -770,14 +770,12 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl_xnnp(
       output_zero_point,
       c10::MemoryFormat::ChannelsLast);
 
-  // Setup the operator
-  status = at::native::xnnp_utils::xnnp_setup_convolution2d_nhwc(
+  // Reshape the operator
+  status = at::native::xnnp_utils::xnnp_reshape_convolution2d_nhwc(
       xnnp_convolution_op.get(),
       N,
       H,
       W,
-      reinterpret_cast<const underlying_t*>(act_nhwc.template data_ptr<scalar_t>()),
-      reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>()),
       caffe2::pthreadpool_(),
       per_channel(),
       transpose(),
@@ -791,6 +789,21 @@ at::Tensor PackedConvWeightsQnnp<kSpatialDim>::apply_impl_xnnp(
       status,
       ")");
 
+  // Setup the operator
+  status = at::native::xnnp_utils::xnnp_setup_convolution2d_nhwc(
+      xnnp_convolution_op.get(),
+      reinterpret_cast<const underlying_t*>(act_nhwc.template data_ptr<scalar_t>()),
+      reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>()),
+      per_channel(),
+      transpose());
+
+  TORCH_CHECK(
+      status == xnn_status_success,
+      func_name,
+      ": xnn setup operator failed(",
+      status,
+      ")");
+
   // Run the operator
   status = xnn_run_operator(
       xnnp_convolution_op.get(), /* xnn_operator_t op */
@@ -565,14 +565,19 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl_xnnp(
     rows_input *= input_contig.size(i);
   }
 
+  // Reshape the operator
+  status = at::native::xnnp_utils::xnnp_reshape_fully_connected_nc(
+      xnnp_linear_op.get(),
+      rows_input, /* batch_size */
+      caffe2::pthreadpool_());
+
   // Setup the operator
   status = at::native::xnnp_utils::xnnp_setup_fully_connected_nc(
       xnnp_linear_op.get(),
-      rows_input, /* batch_size */
       reinterpret_cast<const underlying_t*>(
           input_contig.template data_ptr<scalar_t>()),
-      reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>()),
-      caffe2::pthreadpool_());
+      reinterpret_cast<underlying_t*>(output.template data_ptr<scalar_t>())
+  );
 
   TORCH_CHECK(
       status == xnn_status_success,
@@ -139,17 +139,29 @@ Tensor _mul_out_xnnpack(
   const auto self_shape = xnnp_utils::get_mem_format_aware_shape(self_contig);
   const auto other_shape = xnnp_utils::get_mem_format_aware_shape(other_contig);
 
-  // set up operator
-  status = xnn_setup_multiply_nd_qs8(
+  // reshape operator
+  status = xnn_reshape_multiply_nd_qs8(
       xnnp_qmul_operator.get(),
       self_shape.size(),
       self_shape.data(),
       other_shape.size(),
       other_shape.data(),
       caffe2::pthreadpool_());
+
+  TORCH_CHECK(
+      status == xnn_status_success,
+      func_name,
+      ": xnn reshape operator failed(",
+      status,
+      ")!");
+
+  // set up operator
+  status = xnn_setup_multiply_nd_qs8(
+      xnnp_qmul_operator.get(),
       reinterpret_cast<const underlying_t*>(self_contig.data_ptr<scalar_t>()),
       reinterpret_cast<const underlying_t*>(other_contig.data_ptr<scalar_t>()),
-      reinterpret_cast<underlying_t*>(out.data_ptr<scalar_t>()),
-      caffe2::pthreadpool_());
+      reinterpret_cast<underlying_t*>(out.data_ptr<scalar_t>())
+  );
 
   TORCH_CHECK(
       status == xnn_status_success,
@@ -34,13 +34,20 @@ static Tensor& hardswish_impl(Tensor& input, Tensor& output) {
 
   Operator hardswish_scoped_op(hardswish_op);
 
-  const xnn_status setup_status = xnn_setup_hardswish_nc_f32(
+  const xnn_status reshape_status = xnn_reshape_hardswish_nc_f32(
       hardswish_op,
       input.numel(), // Batch
-      input.data_ptr<float>(),
-      output.data_ptr<float>(),
       caffe2::pthreadpool_()); // threadpool
 
   TORCH_CHECK(
+      xnn_status_success == reshape_status,
+      "xnn_reshape_hardswish_nc_f32 failed!");
+
+  const xnn_status setup_status = xnn_setup_hardswish_nc_f32(
+      hardswish_op,
+      input.data_ptr<float>(),
+      output.data_ptr<float>());
+
+  TORCH_CHECK(
       xnn_status_success == setup_status,
       "xnn_setup_hardswish_nc_f32 failed!");
@@ -7,18 +7,13 @@
 
 namespace at::native::xnnpack {
 
-bool use_global_average_pool(
-    const Tensor& input) {
-  return xnnpack::available() &&
-      (1 <= input.ndimension()) &&
-      (input.device().is_cpu()) &&
-      (kFloat == input.scalar_type()) &&
-      !input.requires_grad() &&
-       true;
+bool use_global_average_pool(const Tensor& input) {
+  return xnnpack::available() && (1 <= input.ndimension()) &&
+      (input.device().is_cpu()) && (kFloat == input.scalar_type()) &&
+      !input.requires_grad() && true;
 }
 
-Tensor global_average_pool(
-    const Tensor& input) {
+Tensor global_average_pool(const Tensor& input) {
   using namespace internal;
 
   const Tensor input_padded_contig_nhwc =
@@ -27,10 +22,10 @@ Tensor global_average_pool(
 
   Tensor output = mobile::empty_with_tail_padding(
       {
-        input_padded_contig_nhwc.size(Layout::Activation4D::batch),
-        input_padded_contig_nhwc.size(Layout::Activation4D::channels),
-        1,
-        1,
+          input_padded_contig_nhwc.size(Layout::Activation4D::batch),
+          input_padded_contig_nhwc.size(Layout::Activation4D::channels),
+          1,
+          1,
       },
       input_padded_contig_nhwc.options().dtype(),
       MemoryFormat::ChannelsLast,
@@ -38,42 +33,61 @@ Tensor global_average_pool(
 
   xnn_operator_t global_average_pooling_op{};
   const xnn_status create_status = xnn_create_global_average_pooling_nwc_f32(
-    input_padded_contig_nhwc.size(Layout::Activation4D::channels), // channels
-    input_padded_contig_nhwc.size(
-        Layout::Activation4D::channels), // input stride
-    input_padded_contig_nhwc.size(
-        Layout::Activation4D::channels), // output stride
-    -std::numeric_limits<float>::infinity(),
-    std::numeric_limits<float>::infinity(),
-    0 /* flags */,
-    &global_average_pooling_op);
+      input_padded_contig_nhwc.size(Layout::Activation4D::channels), // channels
+      input_padded_contig_nhwc.size(
+          Layout::Activation4D::channels), // input stride
+      input_padded_contig_nhwc.size(
+          Layout::Activation4D::channels), // output stride
+      -std::numeric_limits<float>::infinity(),
+      std::numeric_limits<float>::infinity(),
+      0 /* flags */,
+      &global_average_pooling_op);
 
   TORCH_CHECK(
-    xnn_status_success == create_status,
-    "xnn_create_global_average_pooling_nwc_f32 failed!");
+      xnn_status_success == create_status,
+      "xnn_create_global_average_pooling_nwc_f32 failed!");
 
   Operator global_avg_pool_scoped_op(global_average_pooling_op);
 
-  const xnn_status setup_status = xnn_setup_global_average_pooling_nwc_f32(
+  size_t workspace_size = 0;
+  size_t workspace_alignment = 0;
+
+  const xnn_status reshape_status = xnn_reshape_global_average_pooling_nwc_f32(
       global_average_pooling_op,
       input_padded_contig_nhwc.size(Layout::Activation4D::batch), // batch_size
       input_padded_contig_nhwc.size(Layout::Activation4D::width) *
           input_padded_contig_nhwc.size(Layout::Activation4D::height), // width
-      input_padded_contig_nhwc.data_ptr<float>(),
-      output.data_ptr<float>(),
+      &workspace_size, // workspace_size
+      &workspace_alignment, // workspace_alignment
       caffe2::pthreadpool_());
 
   TORCH_CHECK(
-      xnn_status_success == setup_status,
-      "xnn_setup_global_average_pooling_nwc_f32 failed!");
+      xnn_status_success == reshape_status,
+      "xnn_reshape_global_average_pooling_nwc_f32 failed!");
 
-  const xnn_status run_status = xnn_run_operator(
-      global_average_pooling_op,
-      caffe2::pthreadpool_());
+  // Create Workspace pointer, which we will align and pad with 16 bytes
+  size_t xnnpack_buffer_padding = 16;
+  std::vector<char> workspace_vector(workspace_size + workspace_alignment + xnnpack_buffer_padding);
+  void* maybe_aligned_workspace = workspace_vector.data();
+  void* aligned_workspace =
+      (void*)((intptr_t)maybe_aligned_workspace + workspace_alignment - (intptr_t)maybe_aligned_workspace % workspace_alignment);
+
+  const xnn_status setup_status = xnn_setup_global_average_pooling_nwc_f32(
+      global_average_pooling_op,
+      aligned_workspace,
+      input_padded_contig_nhwc.data_ptr<float>(),
+      output.data_ptr<float>());
 
   TORCH_CHECK(
-      xnn_status_success == run_status,
-      "xnn_setup_global_average_pooling_nwc_f32 failed!");
+      xnn_status_success == setup_status,
+      "xnn_setup_global_average_pooling_nwc_f32 failed!");
+
+  const xnn_status run_status =
+      xnn_run_operator(global_average_pooling_op, caffe2::pthreadpool_());
+
+  TORCH_CHECK(
+      xnn_status_success == run_status,
+      "xnn_setup_global_average_pooling_nwc_f32 failed!");
 
   return output.to(input.suggest_memory_format());
 }
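The aligned-workspace computation above advances the raw pointer by `workspace_alignment - ptr % workspace_alignment`, i.e. by a full `workspace_alignment` when the pointer is already aligned, which the extra `workspace_alignment + 16` bytes of over-allocation absorb. A tiny self-check of that arithmetic with assumed values (not part of the commit):

```cpp
#include <cassert>
#include <cstdint>

int main() {
  // Assumed values for illustration.
  const uintptr_t base = 0x1010;       // start of workspace_vector
  const uintptr_t alignment = 64;      // reported workspace_alignment
  const uintptr_t aligned = base + alignment - base % alignment;
  assert(aligned % alignment == 0);    // 0x1040: 64-byte aligned
  assert(aligned - base <= alignment); // consumes at most `alignment` bytes,
                                       // covered by the over-allocation
  return 0;
}
```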
@@ -79,13 +79,20 @@ Tensor channel_shuffle(
       input_padded_contig_nhwc.size(Layout::Activation4D::height) *
       input_padded_contig_nhwc.size(Layout::Activation4D::width);
 
-  const xnn_status setup_status = xnn_setup_channel_shuffle_nc_x32(
+  const xnn_status reshape_status = xnn_reshape_channel_shuffle_nc_x32(
       channel_shuffle_op, // operator
       batch_size, // batch_size
-      input_padded_contig_nhwc.data_ptr<float>(), // input
-      output_padded_contig_nhwc.data_ptr<float>(), // output
       caffe2::pthreadpool_()); // threadpool
 
   TORCH_CHECK(
+      xnn_status_success == reshape_status,
+      "xnn_reshape_channel_shuffle_nc_x32 failed!");
+
+  const xnn_status setup_status = xnn_setup_channel_shuffle_nc_x32(
+      channel_shuffle_op, // operator
+      input_padded_contig_nhwc.data_ptr<float>(), // input
+      output_padded_contig_nhwc.data_ptr<float>()); // output
+
+  TORCH_CHECK(
       xnn_status_success == setup_status,
       "xnn_setup_channel_shuffle_nc_x32 failed!");
@@ -236,6 +236,7 @@ ContextConv2D create(
         output_max,       // output_max
         0u,               // flags
         nullptr,          // xnn_caches_t
+        nullptr,          // xnn_weights_cache_t
         &convolution_op); // operator
   } else {
     for (const auto i : c10::irange(4)) {
@@ -265,6 +266,7 @@ ContextConv2D create(
         output_max,       // output_max
         0u,               // flags
         nullptr,          // xnn_caches_t
+        nullptr,          // xnn_weights_cache_t
        &convolution_op); // operator
   }
 
@@ -338,26 +340,41 @@ Tensor run(
   */
 
   if (context.transposed_) {
-    setup_status = xnn_setup_deconvolution2d_nhwc_f32(
-        context.op.get(), // operator
+    setup_status = xnn_reshape_deconvolution2d_nhwc_f32(
+        context.op.get(),
         padded_input_nhwc.size(Layout::Activation4D::batch), // batch_size
         padded_input_nhwc.size(Layout::Activation4D::height), // input_height
         padded_input_nhwc.size(Layout::Activation4D::width), // input_width
         context.output_padding_[0], // adjustment_height
         context.output_padding_[1], // adjustment_width
-        padded_input_nhwc.data_ptr<float>(), // input
-        output.data_ptr<float>(), // output
+        nullptr, // output_height_out
+        nullptr, // output_width_out
         caffe2::pthreadpool_()); // threadpool
 
+    setup_status = xnn_setup_deconvolution2d_nhwc_f32(
+        context.op.get(), // operator
+        padded_input_nhwc.data_ptr<float>(), // input
+        output.data_ptr<float>()); // output
   } else {
-    setup_status = xnn_setup_convolution2d_nhwc_f32(
-        context.op.get(), // operator
-        padded_input_nhwc.size(Layout::Activation4D::batch), // batch_size
-        padded_input_nhwc.size(Layout::Activation4D::height), // input_height
-        padded_input_nhwc.size(Layout::Activation4D::width), // input_width
-        padded_input_nhwc.data_ptr<float>(), // input
-        output.data_ptr<float>(), // output
-        caffe2::pthreadpool_()); // threadpool
+    size_t workspace_size = SIZE_MAX;
+    size_t workspace_alignment = SIZE_MAX;
+
+    setup_status = xnn_reshape_convolution2d_nhwc_f32(
+        context.op.get(),
+        padded_input_nhwc.size(Layout::Activation4D::batch), // batch_size
+        padded_input_nhwc.size(Layout::Activation4D::height), // input_height
+        padded_input_nhwc.size(Layout::Activation4D::width), // input_width
+        &workspace_size, // workspace_size
+        &workspace_alignment, // workspace_alignment
+        nullptr, // output_height_out
+        nullptr, // output_width_out
+        caffe2::pthreadpool_());
+
+    setup_status = xnn_setup_convolution2d_nhwc_f32(
+        context.op.get(), // operator
+        nullptr, // workspace
+        padded_input_nhwc.data_ptr<float>(), // input
+        output.data_ptr<float>()); // output
   }
 
   TORCH_CHECK(
@@ -95,6 +95,7 @@ ContextLinear create(
       output_max,  // output_max
       0u,          // flags
       nullptr,     // xnn_caches_t
+      nullptr,     // xnn_weights_cache_t
       &linear_op); // operator
 
   TORCH_CHECK(
@@ -136,13 +137,20 @@ Tensor run(
       padded_input.suggest_memory_format(),
       padded_input.opt_names());
 
-  const xnn_status setup_status = xnn_setup_fully_connected_nc_f32(
+  const xnn_status reshape_status = xnn_reshape_fully_connected_nc_f32(
       context.op.get(),                                  // operator
       Layout::ActivationND::batch(padded_input.sizes()), // Batch,
-      padded_input.data_ptr<float>(),                    // input
-      output.data_ptr<float>(),                          // output
       caffe2::pthreadpool_());                           // threadpool
 
   TORCH_CHECK(
+      xnn_status_success == reshape_status,
+      "xnn_reshape_fully_connected_nc_f32 failed!");
+
+  const xnn_status setup_status = xnn_setup_fully_connected_nc_f32(
+      context.op.get(),               // operator
+      padded_input.data_ptr<float>(), // input
+      output.data_ptr<float>());      // output
+
+  TORCH_CHECK(
       xnn_status_success == setup_status,
       "xnn_setup_fully_connected_nc_f32 failed!");
@@ -214,15 +214,24 @@ Tensor max_pool2d(
       xnn_status_success == create_status,
       "xnn_create_max_pooling2d_nhwc_f32 failed!");
 
-  const xnn_status setup_status = xnn_setup_max_pooling2d_nhwc_f32(
+  const xnn_status reshape_status = xnn_reshape_max_pooling2d_nhwc_f32(
       max_pool_op,                                                  // operator
       input_padded_contig_nhwc.size(Layout::Activation4D::batch),  // batch_size
       input_padded_contig_nhwc.size(Layout::Activation4D::height), // input_height
       input_padded_contig_nhwc.size(Layout::Activation4D::width),  // input_width
-      input_padded_contig_nhwc.data_ptr<float>(),                  // input
-      output_padded_contig_nhwc.data_ptr<float>(),                 // output
+      nullptr,                 // output_height_out
+      nullptr,                 // output_width_out
      caffe2::pthreadpool_()); // threadpool
 
   TORCH_CHECK(
+      xnn_status_success == reshape_status,
+      "xnn_reshape_max_pooling2d_nhwc_f32 failed!");
+
+  const xnn_status setup_status = xnn_setup_max_pooling2d_nhwc_f32(
+      max_pool_op,                                  // operator
+      input_padded_contig_nhwc.data_ptr<float>(),   // input
+      output_padded_contig_nhwc.data_ptr<float>()); // output
+
+  TORCH_CHECK(
       xnn_status_success == setup_status,
       "xnn_setup_max_pooling2d_nhwc_f32 failed!");
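Unlike the quantized paths, a reshape call like the one above can also report the computed output spatial size through its out-parameters; this code passes `nullptr` because PyTorch sizes the output tensor beforehand. A small sketch of querying those dimensions instead (assumed values, not part of the commit):

```cpp
#include <xnnpack.h>

// Sketch: let XNNPACK report the pooled output height/width instead of
// discarding them with nullptr out-parameters.
xnn_status reshape_max_pool_query_dims(
    xnn_operator_t max_pool_op,
    size_t batch, size_t in_h, size_t in_w,
    pthreadpool_t pool) {
  size_t out_h = 0;
  size_t out_w = 0;
  return xnn_reshape_max_pooling2d_nhwc_f32(
      max_pool_op, batch, in_h, in_w,
      &out_h,  // output_height_out
      &out_w,  // output_width_out
      pool);
}
```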
@@ -619,7 +619,13 @@ if(USE_XNNPACK AND NOT USE_SYSTEM_XNNPACK)
   # Disable ARM BF16 and FP16 vector for now; unused and causes build failures because
   # these new ISA features may not be supported on older compilers
   set(XNNPACK_ENABLE_ARM_BF16 OFF CACHE BOOL "")
   set(XNNPACK_ENABLE_ARM_FP16_VECTOR OFF CACHE BOOL "")
 
+  # Disable AVXVNNI for now, older clang versions seem not to support it
+  # (clang 12 is where avx-vnni support is added)
+  set(XNNPACK_ENABLE_AVXVNNI OFF CACHE BOOL "")
+
+  # Disable I8MM For CI since clang 9 does not support neon i8mm.
+  set(XNNPACK_ENABLE_ARM_I8MM OFF CACHE BOOL "")
 
   # Setting this global PIC flag for all XNNPACK targets.
   # This is needed for Object libraries within XNNPACK which must
third_party/BUCK.oss (vendored, 1 line changed)

@@ -127,6 +127,7 @@ cxx_library(
         "cpuinfo/wrappers/linux/multiline.c",
         "cpuinfo/wrappers/linux/processors.c",
         "cpuinfo/wrappers/linux/smallfile.c",
+        "cpuinfo/wrappers/log.c",
         "cpuinfo/wrappers/mach/topology.c",
         "cpuinfo/wrappers/x86/cache/descriptor.c",
         "cpuinfo/wrappers/x86/cache/deterministic.c",
third_party/XNNPACK (vendored): submodule updated 51a987591a...d9cce341f8
third_party/cpuinfo (vendored): submodule updated 6481e8bef0...d6860c477c

third_party/generate-cpuinfo-wrappers.py (vendored, 1 line changed)

@@ -9,6 +9,7 @@ CPUINFO_SOURCES = {
         "init.c",
         "api.c",
         "cache.c",
+        "log.c",
     ],
     "defined(__linux__)": [
         "linux/multiline.c",
third_party/generate-xnnpack-wrappers.py (vendored, 135 lines changed)

@@ -8,16 +8,22 @@ import logging
 
 BANNER = "Auto-generated by generate-wrappers.py script. Do not modify"
 WRAPPER_SRC_NAMES = {
-    "PROD_SCALAR_PORTABLE_MICROKERNEL_SRCS": None,
-    "PROD_SCALAR_AARCH32_MICROKERNEL_SRCS" : "defined(__arm__)",
+    "PROD_SCALAR_MICROKERNEL_SRCS": None,
+    "PROD_FMA_MICROKERNEL_SRCS": "defined(__riscv) || defined(__riscv__)",
     "PROD_ARMSIMD32_MICROKERNEL_SRCS": "defined(__arm__)",
     "PROD_FP16ARITH_MICROKERNEL_SRCS": "defined(__arm__)",
     "PROD_NEON_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
     "PROD_NEONFP16_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
-    "PROD_NEON_AARCH64_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
     "PROD_NEONFMA_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
-    "PROD_AARCH64_NEON_MICROKERNEL_SRCS": "defined(__aarch64__)",
+    "PROD_NEON_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)",
     "PROD_NEONV8_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
-    "PROD_AARCH64_NEONFP16ARITH_MICROKERNEL_SRCS": "defined(__aarch64__)",
     "PROD_NEONFP16ARITH_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
+    "PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)",
     "PROD_NEONDOT_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
+    "PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)",
+    "PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
+    "PROD_NEONDOTFP16ARITH_AARCH64_MICROKERNEL_SRCS": "defined(__aarch64__)",
+    "PROD_NEONI8MM_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
     "PROD_SSE_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "PROD_SSE2_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "PROD_SSSE3_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
@@ -30,42 +36,13 @@ WRAPPER_SRC_NAMES = {
     "PROD_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "PROD_AVX512SKX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "PROD_AVX512VBMI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    "PROD_AVX512VNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
+    "PROD_RVV_MICROKERNEL_SRCS": "defined(__riscv) || defined(__riscv__)",
+    "PROD_AVXVNNI_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
     "AARCH32_ASM_MICROKERNEL_SRCS": "defined(__arm__)",
     "AARCH64_ASM_MICROKERNEL_SRCS": "defined(__aarch64__)",
 
-    # add additoonal:
-    "PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
-    "ALL_ARMSIMD32_MICROKERNEL_SRCS": "defined(__arm__)",
-    "ALL_AVX_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
-    "ALL_AVX2_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
-    "ALL_AVX512F_MICROKERNEL_SRCS": "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
-
-    'ALL_AVX512SKX_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
-    'ALL_AVX512VBMI_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
-    'ALL_F16C_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
-    'ALL_FMA3_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
-    'ALL_FP16ARITH_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
-    'ALL_NEON_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
-    'ALL_NEON_AARCH64_MICROKERNEL_SRCS': "defined(__aarch64__)",
-    'ALL_NEONBF16_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
-    'ALL_NEONDOT_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
-    'ALL_NEONFMA_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
-    'ALL_NEONFMA_AARCH64_MICROKERNEL_SRCS': "defined(__aarch64__)",
-    'ALL_NEONFP16_MICROKERNEL_SRCS':"defined(__arm__) || defined(__aarch64__)",
-    'ALL_NEONFP16ARITH_MICROKERNEL_SRCS': "defined(__arm__) || defined(__aarch64__)",
-    'ALL_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS': "defined(__aarch64__)",
-    'ALL_NEONV8_MICROKERNEL_SRCS': "defined(__aarch64__)",
-    'ALL_SCALAR_MICROKERNEL_SRCS': "defined(__arm__)",
-    'ALL_SSE_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
-    'ALL_SSE2_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
-    'ALL_SSE41_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
-    'ALL_SSSE3_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
-    'ALL_XOP_MICROKERNEL_SRCS': "defined(__i386__) || defined(__i686__) || defined(__x86_64__)",
-    'AARCH32_ASM_MICROKERNEL_SRCS': "defined(__arm__)",
-    "PROD_FP16ARITH_MICROKERNEL_SRCS": "defined(__aarch64__)",
-    "PROD_NEONFP16ARITH_MICROKERNEL_SRCS": "defined(__arm__) || defined(__aarch64__)",
-    "PROD_SCALAR_MICROKERNEL_SRCS": "defined(__arm__)",
 
     # add non-prod microkernel sources here:
 }
 
 SRC_NAMES = set([
@@ -73,12 +50,24 @@ SRC_NAMES = set([
     "SUBGRAPH_SRCS",
     "LOGGING_SRCS",
     "XNNPACK_SRCS",
-    "HOT_SRCS",
     "TABLE_SRCS",
     "JIT_SRCS",
-    "JIT_AARCH32_SRCS",
-    "JIT_AARCH64_SRCS",
-    "PROD_SCALAR_PORTABLE_MICROKERNEL_SRCS",
+    "PROD_SCALAR_MICROKERNEL_SRCS",
+    "PROD_FMA_MICROKERNEL_SRCS",
+    "PROD_ARMSIMD32_MICROKERNEL_SRCS",
+    "PROD_FP16ARITH_MICROKERNEL_SRCS",
+    "PROD_NEON_MICROKERNEL_SRCS",
+    "PROD_NEONFP16_MICROKERNEL_SRCS",
+    "PROD_NEONFMA_MICROKERNEL_SRCS",
+    "PROD_NEON_AARCH64_MICROKERNEL_SRCS",
+    "PROD_NEONV8_MICROKERNEL_SRCS",
+    "PROD_NEONFP16ARITH_MICROKERNEL_SRCS",
+    "PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS",
+    "PROD_NEONDOT_MICROKERNEL_SRCS",
+    "PROD_NEONDOT_AARCH64_MICROKERNEL_SRCS",
+    "PROD_NEONDOTFP16ARITH_MICROKERNEL_SRCS",
+    "PROD_NEONDOTFP16ARITH_AARCH64_MICROKERNEL_SRCS",
+    "PROD_NEONI8MM_MICROKERNEL_SRCS",
     "PROD_SSE_MICROKERNEL_SRCS",
     "PROD_SSE2_MICROKERNEL_SRCS",
     "PROD_SSSE3_MICROKERNEL_SRCS",
@@ -90,59 +79,14 @@ SRC_NAMES = set([
     "PROD_AVX2_MICROKERNEL_SRCS",
     "PROD_AVX512F_MICROKERNEL_SRCS",
     "PROD_AVX512SKX_MICROKERNEL_SRCS",
-    "PROD_SCALAR_MICROKERNEL_SRCS",
-    "PROD_SCALAR_AARCH32_MICROKERNEL_SRCS",
-    "PROD_SCALAR_RISCV_MICROKERNEL_SRCS",
-    "PROD_ARMSIMD32_MICROKERNEL_SRCS",
-    "PROD_FP16ARITH_MICROKERNEL_SRCS",
-    "PROD_NEON_MICROKERNEL_SRCS",
-    "PROD_NEONFP16_MICROKERNEL_SRCS",
-    "PROD_NEONFMA_MICROKERNEL_SRCS",
-    "PROD_NEON_AARCH64_MICROKERNEL_SRCS",
-    "PROD_NEONV8_MICROKERNEL_SRCS",
-    "PROD_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS",
-    "PROD_NEONDOT_MICROKERNEL_SRCS",
-    "PROD_SSE2_MICROKERNEL_SRCS",
-    "PROD_SSSE3_MICROKERNEL_SRCS",
-    "PROD_SSE41_MICROKERNEL_SRCS",
-    "PROD_AVX_MICROKERNEL_SRCS",
-    "PROD_F16C_MICROKERNEL_SRCS",
     "PROD_AVX512VBMI_MICROKERNEL_SRCS",
-    "PROD_NEONFP16ARITH_MICROKERNEL_SRCS",
+    "PROD_AVX512VNNI_MICROKERNEL_SRCS",
+    "PROD_RVV_MICROKERNEL_SRCS",
+    "PROD_AVXVNNI_MICROKERNEL_SRCS",
     "AARCH32_ASM_MICROKERNEL_SRCS",
     "AARCH64_ASM_MICROKERNEL_SRCS",
 
-    # new adding libs:
-    'ALL_ARMSIMD32_MICROKERNEL_SRCS',
-    'ALL_AVX_MICROKERNEL_SRCS',
-    'ALL_AVX2_MICROKERNEL_SRCS',
-    'ALL_AVX512F_MICROKERNEL_SRCS',
-    'ALL_AVX512SKX_MICROKERNEL_SRCS',
-    'ALL_AVX512VBMI_MICROKERNEL_SRCS',
-    'ALL_F16C_MICROKERNEL_SRCS',
-    'ALL_FMA3_MICROKERNEL_SRCS',
-    'ALL_FP16ARITH_MICROKERNEL_SRCS',
-    'ALL_HEXAGON_MICROKERNEL_SRCS',
-    'ALL_NEON_MICROKERNEL_SRCS',
-    'ALL_NEON_AARCH64_MICROKERNEL_SRCS',
-    'ALL_NEONBF16_MICROKERNEL_SRCS',
-    'ALL_NEONBF16_AARCH64_MICROKERNEL_SRCS',
-    'ALL_NEONDOT_MICROKERNEL_SRCS',
-    'ALL_NEONFMA_MICROKERNEL_SRCS',
-    'ALL_NEONFMA_AARCH64_MICROKERNEL_SRCS',
-    'ALL_NEONFP16_MICROKERNEL_SRCS',
-    'ALL_NEONFP16ARITH_MICROKERNEL_SRCS',
-    'ALL_NEONFP16ARITH_AARCH64_MICROKERNEL_SRCS',
-    'ALL_NEONV8_MICROKERNEL_SRCS',
-    'ALL_SCALAR_MICROKERNEL_SRCS',
-    'ALL_SSE_MICROKERNEL_SRCS',
-    'ALL_SSE2_MICROKERNEL_SRCS',
-    'ALL_SSE41_MICROKERNEL_SRCS',
-    'ALL_SSSE3_MICROKERNEL_SRCS',
-    'ALL_WASM_MICROKERNEL_SRCS',
-    'ALL_WASMRELAXEDSIMD_MICROKERNEL_SRCS',
-    'ALL_WASMSIMD_MICROKERNEL_SRCS',
-    'ALL_XOP_MICROKERNEL_SRCS',
-    'AARCH32_ASM_MICROKERNEL_SRCS',
-    'AARCH64_ASM_MICROKERNEL_SRCS',
     # add non-prod microkernel sources here:
 ])
 
 def handle_singleline_parse(line):
@@ -150,11 +94,10 @@ def handle_singleline_parse(line):
     end_index = line.find(")")
     line = line[start_index+1:end_index]
     key_val = line.split(" ")
-    return key_val[0], key_val[1][4:]
+    return key_val[0], list(map(lambda x: x[4:], key_val[1:]))
 
 def update_sources(xnnpack_path, cmakefile = "XNNPACK/CMakeLists.txt"):
     sources = collections.defaultdict(list)
-    count = 0
     with open(os.path.join(xnnpack_path, cmakefile)) as cmake:
         lines = cmake.readlines()
         i = 0
@@ -163,7 +106,7 @@ def update_sources(xnnpack_path, cmakefile = "XNNPACK/CMakeLists.txt"):
 
         if lines[i].startswith("SET") and "src/" in lines[i]:
             name, val = handle_singleline_parse(line)
-            sources[name].append(val)
+            sources[name].extend(val)
             i+=1
             continue
 
third_party/xnnpack.buck.bzl (vendored, 892 lines changed): file diff suppressed because it is too large
third_party/xnnpack_src_defs.bzl (vendored, 8003 lines changed): file diff suppressed because it is too large
third_party/xnnpack_wrapper_defs.bzl (vendored, 6104 lines changed): file diff suppressed because it is too large