Update XNNPACK Version (#139913)

Updating XNNPACK Version to 4ea82e595b36106653175dcb04b2aa532660d0d8

submodule update
Pull Request resolved: https://github.com/pytorch/pytorch/pull/139913
Approved by: https://github.com/digantdesai, https://github.com/huydhn
Author: Max Ren
Date: 2024-11-18 18:16:31 +00:00
Committed by: PyTorch MergeBot
Parent: e429a3b72e
Commit: cca34be584
15 changed files with 3429 additions and 1611 deletions


@@ -14,7 +14,7 @@ mkdir -p ${ZIP_DIR}/src
cp -R ${ARTIFACTS_DIR}/arm64/include ${ZIP_DIR}/install/
# build a FAT binary
cd ${ZIP_DIR}/install/lib
target_libs=(libc10.a libclog.a libcpuinfo.a libeigen_blas.a libpthreadpool.a libpytorch_qnnpack.a libtorch_cpu.a libtorch.a libXNNPACK.a)
target_libs=(libc10.a libclog.a libcpuinfo.a libeigen_blas.a libpthreadpool.a libpytorch_qnnpack.a libtorch_cpu.a libtorch.a libXNNPACK.a libmicrokernels-prod.a)
for lib in ${target_libs[*]}
do
if [ -f "${ARTIFACTS_DIR}/x86_64/lib/${lib}" ] && [ -f "${ARTIFACTS_DIR}/arm64/lib/${lib}" ]; then


@@ -112,6 +112,7 @@ if(ANDROID_ABI)
import_static_lib(libc10)
import_static_lib(libnnpack)
import_static_lib(libXNNPACK)
import_static_lib(libmicrokernels-prod)
import_static_lib(libpytorch_qnnpack)
import_static_lib(libpthreadpool)
import_static_lib(libeigen_blas)
@@ -129,6 +130,7 @@ if(ANDROID_ABI)
libc10
libnnpack
libXNNPACK
libmicrokernels-prod
libpytorch_qnnpack
libpthreadpool
libeigen_blas
@@ -151,6 +153,7 @@ else()
if(USE_XNNPACK)
list(APPEND pytorch_jni_LIBS XNNPACK)
list(APPEND pytorch_jni_LIBS microkernels-prod)
endif()
if(USE_SYSTEM_PTHREADPOOL)


@@ -234,62 +234,27 @@ Tensor qnnpack_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
#ifdef USE_XNNPACK
C10_ALWAYS_INLINE
enum xnn_status xnnp_create_add_nd(
int8_t azp,
float ascale,
int8_t bzp,
float bscale,
int8_t czp,
float cscale,
int8_t output_min,
int8_t output_max,
uint32_t flags,
xnn_operator_t* op) {
return xnn_create_add_nd_qs8(
azp, /* int8_t input1_zero_point */
ascale, /* float input1_scale */
bzp, /* int8_t input2_zero_point */
bscale, /* float input2_scale */
czp, /* int8_t output_zero_point */
cscale, /* float output_scale */
output_min, /* int8_t output_min */
output_max, /* int8_t output_max */
flags, /* uint32_t flags */
op); /* xnn_operator_t* add_op_out */
}
enum xnn_status xnnp_define_q_tensor(const Tensor& tensor, MemoryFormat format, uint32_t& id, xnn_subgraph_t subgraph_ptr, uint32_t external_id, uint32_t flags){
Tensor contig_tensor = tensor.contiguous(format);
const auto tensor_shape = xnnp_utils::get_mem_format_aware_shape(contig_tensor);
const int32_t zero_point = static_cast<int32_t>(contig_tensor.q_zero_point());
const float scale = static_cast<float>(contig_tensor.q_scale());
C10_ALWAYS_INLINE
enum xnn_status xnnp_reshape_add_nd(
xnn_operator_t op,
const std::vector<size_t>& a_shape,
const std::vector<size_t>& b_shape,
pthreadpool_t pt_pool) {
return xnn_reshape_add_nd_qs8(
op, /* xnn_operator_t add_op */
a_shape.size(), /* size_t num_input1_dims */
a_shape.data(), /* const size_t* input1_shape */
b_shape.size(), /* size_t num_input2_dims */
b_shape.data(), /* const size_t* input2_shape */
pt_pool); /* pthreadpool_t threadpool */
}
C10_ALWAYS_INLINE
enum xnn_status xnnp_setup_add_nd(
xnn_operator_t op,
const int8_t* da,
const int8_t* db,
int8_t* dc,
pthreadpool_t pt_pool) {
return xnn_setup_add_nd_qs8(
op, /* xnn_operator_t add_op */
da, /* const int8_t* input1 */
db, /* const int8_t* input2 */
dc); /* int8_t* output */
return xnn_define_quantized_tensor_value(
subgraph_ptr,
xnn_datatype_qint8,
zero_point,
scale,
tensor.ndimension(),
tensor_shape.data(),
nullptr,
external_id,
flags,
&id);
}
template <typename scalar_t, bool ReLUFused = false>
Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
using underlying_t = typename scalar_t::underlying;
const string func_name = "xnnp_add()";
TORCH_CHECK(qa.ndimension() > 0, func_name, ": Got empty input tensor.");
TORCH_CHECK(at::native::xnnpack::available(), func_name, ": XNNPACK is not available")
@@ -299,12 +264,6 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
auto qa_mem_format = qa.suggest_memory_format();
Tensor qa_contig = qa.contiguous(qa_mem_format);
Tensor qb_contig = qb.contiguous(qa_mem_format);
const auto a_zero_point = qa_contig.q_zero_point();
const auto b_zero_point = qb_contig.q_zero_point();
const auto a_scale = qa_contig.q_scale();
const auto b_scale = qb_contig.q_scale();
Tensor qy = at::native::empty_affine_quantized(
at::infer_size_dimvector(qa_contig.sizes(), qb_contig.sizes()),
qa.scalar_type(),
@@ -319,72 +278,108 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) {
return qy;
}
xnn_operator_t xnnp_op = nullptr;
xnnpack_operator xnnp_add_operator;
auto output_max = std::numeric_limits<underlying_t>::max();
auto output_min = std::numeric_limits<underlying_t>::min();
auto output_max = std::numeric_limits<float>::infinity();
auto output_min = -std::numeric_limits<float>::infinity();
if (ReLUFused) {
/*
* FIXME: use activationLimits<T>()
* With <T>, MSVC runs into "error C3862: identifier activationLimits not found".
*/
constexpr int64_t qmin = std::numeric_limits<underlying_t>::min();
constexpr int64_t qmax = std::numeric_limits<underlying_t>::max();
int64_t qvalue = static_cast<int64_t>(zero_point);
qvalue = std::max<int64_t>(qvalue, qmin);
output_min = static_cast<underlying_t>(std::min<int64_t>(qvalue, qmax));
output_min = 0;
}
// Create an operator
auto status = xnnp_create_add_nd(
a_zero_point,
a_scale,
b_zero_point,
b_scale,
static_cast<underlying_t>(zero_point),
static_cast<float>(scale),
output_min,
output_max,
0,
&xnnp_op);
xnnp_add_operator = xnnpack_operator(xnnp_op);
// Create XNNPACK Subgraph
xnn_subgraph_t subgraph_ptr = nullptr;
auto status = xnn_create_subgraph(
/*external_value_ids=*/3,
/*flags=*/0,
&subgraph_ptr);
TORCH_CHECK(
status == xnn_status_success,
func_name, ": xnn create operator failed(", status,")!");
func_name, ": xnn create subgraph failed(", status,")!");
std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
subgraph_ptr, &xnn_delete_subgraph);
const auto qa_shape = xnnp_utils::get_mem_format_aware_shape(qa_contig);
const auto qb_shape = xnnp_utils::get_mem_format_aware_shape(qb_contig);
uint32_t input0_id = XNN_INVALID_VALUE_ID, input1_id = XNN_INVALID_VALUE_ID, output_id = XNN_INVALID_VALUE_ID;
// Reshape the operator
status = xnnp_reshape_add_nd(
xnnp_add_operator.get(),
qa_shape,
qb_shape,
caffe2::pthreadpool_());
// Defining the quantized input 0
status = xnnp_define_q_tensor(
qa,
qa_mem_format,
input0_id,
subgraph_ptr,
0,
XNN_VALUE_FLAG_EXTERNAL_INPUT
);
TORCH_CHECK(
status == xnn_status_success && input0_id != XNN_INVALID_VALUE_ID,
func_name, ": xnn define input 0 failed(", status,")!");
// Defining the quantized input 1
status = xnnp_define_q_tensor(
qb,
qa_mem_format,
input1_id,
subgraph_ptr,
1,
XNN_VALUE_FLAG_EXTERNAL_INPUT
);
TORCH_CHECK(
status == xnn_status_success && input1_id != XNN_INVALID_VALUE_ID,
func_name, ": xnn define input 1 failed(", status,")!");
// Defining the quantized output
status = xnnp_define_q_tensor(
qy,
qa_mem_format,
output_id,
subgraph_ptr,
2,
XNN_VALUE_FLAG_EXTERNAL_OUTPUT
);
TORCH_CHECK(
status == xnn_status_success && output_id != XNN_INVALID_VALUE_ID,
func_name, ": xnn define output failed(", status,")!");
const struct xnn_binary_params binary_params = {output_min, output_max};
status = xnn_define_binary(
subgraph_ptr,
xnn_binary_add,
&binary_params,
input0_id,
input1_id,
output_id,
0);
TORCH_CHECK(
status == xnn_status_success,
func_name, ": xnn reshape operator failed(", status,")!");
func_name, ": xnn define binary add failed(", status,")!");
// Setup the operator
status = xnnp_setup_add_nd(
xnnp_add_operator.get(),
reinterpret_cast<const underlying_t*>(qa_contig.data_ptr<scalar_t>()),
reinterpret_cast<const underlying_t*>(qb_contig.data_ptr<scalar_t>()),
reinterpret_cast<underlying_t*>(qy.data_ptr<scalar_t>()),
caffe2::pthreadpool_());
// create runtime
xnn_runtime_t runtime_ptr = nullptr;
status = xnn_create_runtime_v2(subgraph_ptr, caffe2::pthreadpool_(), 0, &runtime_ptr);
TORCH_CHECK(
status == xnn_status_success,
func_name, ": xnn setup operator failed(", status,")!");
func_name, ": xnn create runtime failed(", status,")!");
TORCH_CHECK(
runtime_ptr != nullptr,
func_name, ": xnn create runtime failed because runtime_ptr is null");
std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
runtime_ptr, &xnn_delete_runtime);
// Run the operator
status = xnn_run_operator(
xnnp_add_operator.get(), /* xnn_operator_t op */
caffe2::pthreadpool_()); /* pthreadpool_t threadpool */
std::array<xnn_external_value, 3> external = {
xnn_external_value{input0_id, reinterpret_cast<void*>(qa_contig.data_ptr<scalar_t>())},
xnn_external_value{input1_id, reinterpret_cast<void*>(qb_contig.data_ptr<scalar_t>())},
xnn_external_value{output_id, reinterpret_cast<void*>(qy.data_ptr<scalar_t>())}};
status = xnn_setup_runtime(
runtime_ptr,
external.size(),
external.data());
TORCH_CHECK(
status == xnn_status_success,
func_name, ": xnn run operator failed(", status,")");
func_name, ": xnn setup runtime failed(", status,")!");
status = xnn_invoke_runtime(runtime_ptr);
TORCH_CHECK(
status == xnn_status_success,
func_name, ": xnn invoke runtime failed(", status,")!");
return qy;
}
#endif // USE_XNNPACK
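The hunk above replaces the old create/reshape/setup/run operator calls with XNNPACK's subgraph and runtime API. As a reference point for reviewers, the sketch below condenses that same flow into a self-contained example on plain fp32 buffers; the function name, shape, and fp32 datatype are illustrative only (they are not part of this commit), and it assumes xnn_initialize(nullptr) has already succeeded.

// Minimal sketch of the subgraph/runtime flow the new code follows, shown on
// fp32 buffers for brevity. Illustrative only, not the committed code.
#include <array>
#include <limits>
#include <memory>
#include <vector>
#include <xnnpack.h>

bool add_with_subgraph(const float* a, const float* b, float* out, size_t n) {
  // 1. Create a subgraph with three external values (two inputs, one output).
  xnn_subgraph_t subgraph_ptr = nullptr;
  if (xnn_create_subgraph(/*external_value_ids=*/3, /*flags=*/0, &subgraph_ptr) !=
      xnn_status_success) {
    return false;
  }
  std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
      subgraph_ptr, &xnn_delete_subgraph);

  // 2. Define the tensor values; external IDs 0/1/2 are bound to buffers below.
  const std::vector<size_t> shape{n};
  uint32_t a_id = XNN_INVALID_VALUE_ID;
  uint32_t b_id = XNN_INVALID_VALUE_ID;
  uint32_t out_id = XNN_INVALID_VALUE_ID;
  xnn_define_tensor_value(subgraph_ptr, xnn_datatype_fp32, shape.size(), shape.data(),
      nullptr, /*external_id=*/0, XNN_VALUE_FLAG_EXTERNAL_INPUT, &a_id);
  xnn_define_tensor_value(subgraph_ptr, xnn_datatype_fp32, shape.size(), shape.data(),
      nullptr, /*external_id=*/1, XNN_VALUE_FLAG_EXTERNAL_INPUT, &b_id);
  xnn_define_tensor_value(subgraph_ptr, xnn_datatype_fp32, shape.size(), shape.data(),
      nullptr, /*external_id=*/2, XNN_VALUE_FLAG_EXTERNAL_OUTPUT, &out_id);

  // 3. Describe the computation as a single binary-add node with no clamping.
  const struct xnn_binary_params params = {
      -std::numeric_limits<float>::infinity(),
      std::numeric_limits<float>::infinity()};
  xnn_define_binary(subgraph_ptr, xnn_binary_add, &params, a_id, b_id, out_id,
      /*flags=*/0);

  // 4. Compile the subgraph into a runtime, bind external buffers, and invoke.
  xnn_runtime_t runtime_ptr = nullptr;
  if (xnn_create_runtime_v2(subgraph_ptr, /*threadpool=*/nullptr, /*flags=*/0,
          &runtime_ptr) != xnn_status_success) {
    return false;
  }
  std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> runtime(
      runtime_ptr, &xnn_delete_runtime);
  std::array<xnn_external_value, 3> external = {
      xnn_external_value{a_id, const_cast<float*>(a)},
      xnn_external_value{b_id, const_cast<float*>(b)},
      xnn_external_value{out_id, out}};
  return xnn_setup_runtime(runtime_ptr, external.size(), external.data()) ==
             xnn_status_success &&
         xnn_invoke_runtime(runtime_ptr) == xnn_status_success;
}

The quantized path in the hunk is structurally identical; the tensors are defined with xnn_define_quantized_tensor_value instead, and the clamp bounds in xnn_binary_params come from the fused ReLU.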


@@ -13,7 +13,6 @@
#include <ATen/native/quantized/cpu/init_qnnpack.h>
#include <ATen/quantized/Quantizer.h>
#include <caffe2/utils/threadpool/pthreadpool-cpp.h>
#include <torch/library.h>
#ifndef AT_PER_OPERATOR_HEADERS
#include <ATen/Functions.h>
@@ -56,14 +55,32 @@ Tensor _mul_out(Tensor& out, const Tensor& self, const Tensor& other) {
}
#ifdef USE_XNNPACK
C10_ALWAYS_INLINE
enum xnn_status xnnp_define_q_tensor(const Tensor& tensor, MemoryFormat format, uint32_t& id, xnn_subgraph_t subgraph_ptr, uint32_t external_id, uint32_t flags){
Tensor contig_tensor = tensor.contiguous(format);
const auto tensor_shape = xnnp_utils::get_mem_format_aware_shape(contig_tensor);
const int32_t zero_point = static_cast<int32_t>(contig_tensor.q_zero_point());
const float scale = static_cast<float>(contig_tensor.q_scale());
return xnn_define_quantized_tensor_value(
subgraph_ptr,
xnn_datatype_qint8,
zero_point,
scale,
tensor.ndimension(),
tensor_shape.data(),
nullptr,
external_id,
flags,
&id);
}
template <typename scalar_t, bool ReLUFused = false>
Tensor _mul_out_xnnpack(
const Tensor& self,
const Tensor& other,
double output_scale,
int64_t output_zero_point) {
using underlying_t = typename scalar_t::underlying;
const string func_name = "xnnp_mul()";
TORCH_CHECK(self.ndimension() > 0, func_name, ": Got empty input tensor.");
TORCH_CHECK(
@@ -89,96 +106,108 @@ Tensor _mul_out_xnnpack(
return out;
}
int64_t self_zero_point = self_contig.q_zero_point();
double self_scale = self_contig.q_scale();
int64_t other_zero_point = other_contig.q_zero_point();
double other_scale = other_contig.q_scale();
int64_t output_min = std::numeric_limits<underlying_t>::min();
int64_t output_max = std::numeric_limits<underlying_t>::max();
if(ReLUFused) {
/*
* FIXME: use activationLimits<T>()
* With <T>, MSVC runs into "error C3862: identifier activationLimits not
* found".
*/
constexpr int64_t qmin = std::numeric_limits<underlying_t>::min();
constexpr int64_t qmax = std::numeric_limits<underlying_t>::max();
int64_t qvalue = static_cast<int64_t>(output_zero_point);
qvalue = std::max<int64_t>(qvalue, qmin);
output_min = static_cast<underlying_t>(std::min<int64_t>(qvalue, qmax));
auto output_max = std::numeric_limits<float>::infinity();
auto output_min = -std::numeric_limits<float>::infinity();
if (ReLUFused) {
output_min = 0;
}
xnn_operator_t xnnp_op = nullptr;
xnnpack_operator xnnp_qmul_operator;
// create xnnpack multiply operator ...
auto status = xnn_create_multiply_nd_qs8(
self_zero_point,
self_scale,
other_zero_point,
other_scale,
static_cast<underlying_t>(output_zero_point),
static_cast<float>(output_scale),
output_min,
output_max,
0,
&xnnp_op);
// Create XNNPACK Subgraph
xnn_subgraph_t subgraph_ptr = nullptr;
auto status = xnn_create_subgraph(
/*external_value_ids=*/3,
/*flags=*/0,
&subgraph_ptr);
TORCH_CHECK(
status == xnn_status_success,
func_name,
": xnn create operator failed(",
status,
")!");
xnnp_qmul_operator = xnnpack_operator(xnnp_op);
func_name, ": xnn create subgraph failed(", status,")!");
std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
subgraph_ptr, &xnn_delete_subgraph);
uint32_t input0_id = XNN_INVALID_VALUE_ID;
uint32_t input1_id = XNN_INVALID_VALUE_ID;
uint32_t output_id = XNN_INVALID_VALUE_ID;
const auto self_shape = xnnp_utils::get_mem_format_aware_shape(self_contig);
const auto other_shape = xnnp_utils::get_mem_format_aware_shape(other_contig);
// reshape operator
status = xnn_reshape_multiply_nd_qs8(
xnnp_qmul_operator.get(),
self_shape.size(),
self_shape.data(),
other_shape.size(),
other_shape.data(),
caffe2::pthreadpool_());
TORCH_CHECK(
status == xnn_status_success,
func_name,
": xnn reshape operator failed(",
status,
")!");
// set up operator
status = xnn_setup_multiply_nd_qs8(
xnnp_qmul_operator.get(),
reinterpret_cast<const underlying_t*>(self_contig.data_ptr<scalar_t>()),
reinterpret_cast<const underlying_t*>(other_contig.data_ptr<scalar_t>()),
reinterpret_cast<underlying_t*>(out.data_ptr<scalar_t>())
// Defining the quantized input 0
status = xnnp_define_q_tensor(
self,
qa_mem_format,
input0_id,
subgraph_ptr,
0,
XNN_VALUE_FLAG_EXTERNAL_INPUT
);
TORCH_CHECK(
status == xnn_status_success && input0_id != XNN_INVALID_VALUE_ID,
func_name, ": xnn define input 0 failed(", status,")!");
// Defining the quantized input 1
status = xnnp_define_q_tensor(
other,
qa_mem_format,
input1_id,
subgraph_ptr,
1,
XNN_VALUE_FLAG_EXTERNAL_INPUT
);
TORCH_CHECK(
status == xnn_status_success && input1_id != XNN_INVALID_VALUE_ID,
func_name, ": xnn define input 1 failed(", status,")!");
// Defining the quantized output
status = xnnp_define_q_tensor(
out,
qa_mem_format,
output_id,
subgraph_ptr,
2,
XNN_VALUE_FLAG_EXTERNAL_OUTPUT
);
TORCH_CHECK(
status == xnn_status_success && output_id != XNN_INVALID_VALUE_ID,
func_name, ": xnn define output failed(", status,")!");
const struct xnn_binary_params binary_params = {output_min, output_max};
status = xnn_define_binary(
subgraph_ptr,
xnn_binary_multiply,
&binary_params,
input0_id,
input1_id,
output_id,
0);
TORCH_CHECK(
status == xnn_status_success,
func_name,
": xnn setup operator failed(",
status,
")!");
func_name, ": xnn define binary add failed(", status,")!");
// Run the operator
status = xnn_run_operator(
xnnp_qmul_operator.get(), /* xnn_operator_t op */
caffe2::pthreadpool_()); /* pthreadpool_t threadpool */
// create runtime
xnn_runtime_t runtime_ptr = nullptr;
status = xnn_create_runtime_v2(subgraph_ptr, caffe2::pthreadpool_(), 0, &runtime_ptr);
TORCH_CHECK(
status == xnn_status_success,
func_name,
": xnn run operator failed(",
status,
")");
func_name, ": xnn create runtime failed(", status,")!");
TORCH_CHECK(
runtime_ptr != nullptr,
func_name, ": xnn create runtime failed because runtime_ptr is null");
std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
runtime_ptr, &xnn_delete_runtime);
std::array<xnn_external_value, 3> external = {
xnn_external_value{input0_id, reinterpret_cast<void*>(self.data_ptr<scalar_t>())},
xnn_external_value{input1_id, reinterpret_cast<void*>(other.data_ptr<scalar_t>())},
xnn_external_value{output_id, reinterpret_cast<void*>(out.data_ptr<scalar_t>())}};
status = xnn_setup_runtime(
runtime_ptr,
external.size(),
external.data());
TORCH_CHECK(
status == xnn_status_success,
func_name, ": xnn setup runtime failed(", status,")!");
status = xnn_invoke_runtime(runtime_ptr);
TORCH_CHECK(
status == xnn_status_success,
func_name, ": xnn invoke runtime failed(", status,")!");
return out;
}
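Both the add and mul hunks introduce the same local xnnp_define_q_tensor helper. The fragment below is a hedged illustration of the call it wraps, xnn_define_quantized_tensor_value; the function name, zero point, scale, and shape are made up for the example and are not part of this commit.

#include <vector>
#include <xnnpack.h>

// Illustration only (values are examples): define a per-tensor-quantized
// qint8 external input in a subgraph, as the new xnnp_define_q_tensor
// helper does via xnn_define_quantized_tensor_value.
uint32_t define_qint8_input(xnn_subgraph_t subgraph_ptr) {
  const int32_t zero_point = 2;                // example quantization zero point
  const float scale = 0.05f;                   // example quantization scale
  const std::vector<size_t> dims{1, 8, 8, 4};  // NHWC shape XNNPACK expects
  uint32_t value_id = XNN_INVALID_VALUE_ID;
  const xnn_status st = xnn_define_quantized_tensor_value(
      subgraph_ptr,
      xnn_datatype_qint8,      // signed 8-bit quantized data
      zero_point,
      scale,
      dims.size(),
      dims.data(),
      /*data=*/nullptr,        // no static data: buffer is bound at runtime setup
      /*external_id=*/0,
      XNN_VALUE_FLAG_EXTERNAL_INPUT,
      &value_id);
  return st == xnn_status_success ? value_id : XNN_INVALID_VALUE_ID;
}

Beyond that, the mul path differs from the add path only in passing xnn_binary_multiply instead of xnn_binary_add to xnn_define_binary.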


@@ -19,46 +19,84 @@ bool use_hardswish(
static Tensor& hardswish_impl(Tensor& input, Tensor& output) {
using namespace internal;
xnn_operator_t hardswish_op{};
const xnn_status create_status = xnn_create_hardswish_nc_f32(
0, // flags
&hardswish_op);
// Create XNNPACK Subgraph
xnn_subgraph_t subgraph_ptr = nullptr;
xnn_status status = xnn_create_subgraph(
/*external_value_ids=*/2,
/*flags=*/0,
&subgraph_ptr);
TORCH_CHECK(
xnn_status_success == create_status,
"xnn_create_hardswish_nc_f32 failed!");
Operator hardswish_scoped_op(hardswish_op);
const xnn_status reshape_status = xnn_reshape_hardswish_nc_f32(
hardswish_op,
input.numel(), // Batch
1, // channels
1, // input stride
1, // output stride
caffe2::pthreadpool_()); // threadpool
status == xnn_status_success,
"xnn create subgraph failed(", status,")!");
std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
subgraph_ptr, &xnn_delete_subgraph);
uint32_t input_id = XNN_INVALID_VALUE_ID, output_id = XNN_INVALID_VALUE_ID;
std::vector<size_t> input_output_shape(input.sizes().begin(), input.sizes().end());
status = xnn_define_tensor_value(
subgraph_ptr,
xnn_datatype_fp32,
input_output_shape.size(),
input_output_shape.data(),
nullptr,
0,
XNN_VALUE_FLAG_EXTERNAL_INPUT,
&input_id
);
TORCH_CHECK(
xnn_status_success == reshape_status,
"xnn_reshape_hardswish_nc_f32 failed!");
const xnn_status setup_status = xnn_setup_hardswish_nc_f32(
hardswish_op,
input.data_ptr<float>(),
output.data_ptr<float>());
status == xnn_status_success,
"defining xnn input failed(", status,")!");
status = xnn_define_tensor_value(
subgraph_ptr,
xnn_datatype_fp32,
input_output_shape.size(),
input_output_shape.data(),
nullptr,
1,
XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
&output_id
);
TORCH_CHECK(
xnn_status_success == setup_status,
"xnn_setup_hardswish_nc_f32 failed!");
status == xnn_status_success,
"defining xnn output failed(", status,")!");
const xnn_status run_status = xnn_run_operator(
hardswish_op,
caffe2::pthreadpool_()); // threadpool
status = xnn_define_unary(
subgraph_ptr,
xnn_unary_hardswish,
nullptr,
input_id,
output_id,
0
);
TORCH_INTERNAL_ASSERT(
xnn_status_success == run_status,
"xnn_run_operator failed!");
// create runtime
xnn_runtime_t runtime_ptr = nullptr;
status = xnn_create_runtime_v2(subgraph_ptr, caffe2::pthreadpool_(), 0, &runtime_ptr);
TORCH_CHECK(
status == xnn_status_success,
"xnn create runtime failed(", status,")!");
TORCH_CHECK(
runtime_ptr != nullptr,
"xnn create runtime failed because runtime_ptr is null");
std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
runtime_ptr, &xnn_delete_runtime);
std::array<xnn_external_value, 2> external = {
xnn_external_value{input_id, input.data_ptr<float>()},
xnn_external_value{output_id, output.data_ptr<float>()}};
status = xnn_setup_runtime(
runtime_ptr,
external.size(),
external.data());
TORCH_CHECK(
status == xnn_status_success,
"xnn setup runtime failed(", status,")!");
status = xnn_invoke_runtime(runtime_ptr);
TORCH_CHECK(
status == xnn_status_success,
"xnn invoke runtime failed(", status,")!");
return output;
}
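The hardswish rewrite follows the same create-subgraph/create-runtime steps as the binary ops above; the only structural difference is the node definition. A hedged fragment of that step, with an illustrative function name:

#include <xnnpack.h>

// Illustration only: attach a hardswish node to an already-created subgraph.
// input_id/output_id are fp32 values defined with xnn_define_tensor_value,
// exactly as in the hunk above.
xnn_status define_hardswish_node(
    xnn_subgraph_t subgraph_ptr, uint32_t input_id, uint32_t output_id) {
  return xnn_define_unary(
      subgraph_ptr,
      xnn_unary_hardswish,   // unary operator type
      /*params=*/nullptr,    // hardswish takes no extra parameters
      input_id,
      output_id,
      /*flags=*/0);
}

Runtime creation, setup, and invocation then proceed exactly as in the quantized add and mul paths.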


@@ -7,6 +7,27 @@
namespace at::native::xnnpack {
inline std::vector<size_t> get_mem_format_aware_shape(const at::Tensor& in) {
const auto mem_format = in.suggest_memory_format();
const auto& sizes = in.sizes();
std::vector<size_t> ret(sizes.begin(), sizes.end());
if (mem_format == c10::MemoryFormat::ChannelsLast) {
// NCHW -> NHWC
// 0123 -> 0231
ret[1] = sizes[2]; /* H */
ret[2] = sizes[3]; /* W */
ret[3] = sizes[1]; /* C */
} else if (mem_format == c10::MemoryFormat::ChannelsLast3d) {
// NCDHW -> NDHWC
// 01234 -> 02341
ret[1] = sizes[2]; /* D */
ret[2] = sizes[3]; /* H */
ret[3] = sizes[4]; /* W */
ret[4] = sizes[1]; /* C */
}
return ret;
}
bool use_global_average_pool(const Tensor& input) {
return xnnpack::available() && (1 <= input.ndimension()) &&
(input.device().is_cpu()) && (kFloat == input.scalar_type()) &&
@@ -31,63 +52,91 @@ Tensor global_average_pool(const Tensor& input) {
MemoryFormat::ChannelsLast,
input_padded_contig_nhwc.opt_names());
xnn_operator_t global_average_pooling_op{};
const xnn_status create_status = xnn_create_global_average_pooling_nwc_f32(
-std::numeric_limits<float>::infinity(),
std::numeric_limits<float>::infinity(),
0 /* flags */,
&global_average_pooling_op);
// Create XNNPACK Subgraph
xnn_subgraph_t subgraph_ptr = nullptr;
xnn_status status = xnn_create_subgraph(
/*external_value_ids=*/2,
/*flags=*/0,
&subgraph_ptr);
TORCH_CHECK(
xnn_status_success == create_status,
"xnn_create_global_average_pooling_nwc_f32 failed!");
status == xnn_status_success,
"xnn create subgraph failed(", status,")!");
std::unique_ptr<xnn_subgraph, decltype(&xnn_delete_subgraph)> subgraph(
subgraph_ptr, &xnn_delete_subgraph);
uint32_t input_id = XNN_INVALID_VALUE_ID, output_id = XNN_INVALID_VALUE_ID;
Operator global_avg_pool_scoped_op(global_average_pooling_op);
size_t workspace_size = 0;
size_t workspace_alignment = 0;
const xnn_status reshape_status = xnn_reshape_global_average_pooling_nwc_f32(
global_average_pooling_op,
input_padded_contig_nhwc.size(Layout::Activation4D::batch), // batch_size
input_padded_contig_nhwc.size(Layout::Activation4D::width) *
input_padded_contig_nhwc.size(Layout::Activation4D::height), // width
input_padded_contig_nhwc.size(Layout::Activation4D::channels), // channels
input_padded_contig_nhwc.size(
Layout::Activation4D::channels), // input stride
input_padded_contig_nhwc.size(
Layout::Activation4D::channels), // output stride
&workspace_size, // workspace_size
&workspace_alignment, // workspace_alignment
caffe2::pthreadpool_());
const auto& input_shape = get_mem_format_aware_shape(input_padded_contig_nhwc);
status = xnn_define_tensor_value(
subgraph_ptr,
xnn_datatype_fp32,
input_shape.size(),
input_shape.data(),
nullptr,
0,
XNN_VALUE_FLAG_EXTERNAL_INPUT,
&input_id
);
TORCH_CHECK(
xnn_status_success == reshape_status,
"xnn_reshape_global_average_pooling_nwc_f32 failed!");
// Create Workspace pointer, which we will align and pad with 16 bytes
size_t xnnpack_buffer_padding = 16;
std::vector<char> workspace_vector(workspace_size + workspace_alignment + xnnpack_buffer_padding);
void* maybe_aligned_workspace = workspace_vector.data();
void* aligned_workspace =
(void*)((intptr_t)maybe_aligned_workspace + workspace_alignment - (intptr_t)maybe_aligned_workspace % workspace_alignment);
const xnn_status setup_status = xnn_setup_global_average_pooling_nwc_f32(
global_average_pooling_op,
aligned_workspace,
input_padded_contig_nhwc.data_ptr<float>(),
output.data_ptr<float>());
status == xnn_status_success,
"defining xnn input failed(", status,")!");
const auto& output_shape = get_mem_format_aware_shape(output);
status = xnn_define_tensor_value(
subgraph_ptr,
xnn_datatype_fp32,
output_shape.size(),
output_shape.data(),
nullptr,
1,
XNN_VALUE_FLAG_EXTERNAL_OUTPUT,
&output_id
);
TORCH_CHECK(
xnn_status_success == setup_status,
"xnn_setup_global_average_pooling_nwc_f32 failed!");
const xnn_status run_status =
xnn_run_operator(global_average_pooling_op, caffe2::pthreadpool_());
status == xnn_status_success,
"defining xnn output failed(", status,")!");
std::vector<size_t> reduce_dims{1, 2};
status = xnn_define_static_reduce(
subgraph_ptr,
xnn_reduce_mean,
reduce_dims.size(),
reduce_dims.data(),
input_id,
output_id,
0
);
TORCH_CHECK(
xnn_status_success == run_status,
"xnn_setup_global_average_pooling_nwc_f32 failed!");
status == xnn_status_success,
"defining xnn static reduce failed(", status,")!");
// create runtime
xnn_runtime_t runtime_ptr = nullptr;
status = xnn_create_runtime_v2(subgraph_ptr, caffe2::pthreadpool_(), 0, &runtime_ptr);
TORCH_CHECK(
status == xnn_status_success,
"xnn create runtime failed(", status,")!");
TORCH_CHECK(
runtime_ptr != nullptr,
"xnn create runtime failed because runtime_ptr is null");
std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> auto_runtime(
runtime_ptr, &xnn_delete_runtime);
std::array<xnn_external_value, 2> external = {
xnn_external_value{input_id, input_padded_contig_nhwc.data_ptr<float>()},
xnn_external_value{output_id, output.data_ptr<float>()}};
status = xnn_setup_runtime(
runtime_ptr,
external.size(),
external.data());
TORCH_CHECK(
status == xnn_status_success,
"xnn setup runtime failed(", status,")!");
status = xnn_invoke_runtime(runtime_ptr);
TORCH_CHECK(
status == xnn_status_success,
"xnn invoke runtime failed(", status,")!");
return output.to(input.suggest_memory_format());
}
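Global average pooling is now expressed as a mean reduction over the spatial axes of an NHWC tensor: get_mem_format_aware_shape supplies the NHWC dims (for example, logical NCHW sizes 1x8x16x16 of a channels-last tensor become 1x16x16x8), and the node reduces over axes {1, 2}. A hedged fragment of that step, with an illustrative function name:

#include <vector>
#include <xnnpack.h>

// Illustration only: add a mean-reduce node over the H and W axes of an
// NHWC value, which is how the new code implements global average pooling.
// input_id/output_id are fp32 values already defined in the subgraph.
xnn_status define_global_avg_pool_node(
    xnn_subgraph_t subgraph_ptr, uint32_t input_id, uint32_t output_id) {
  const std::vector<size_t> reduce_dims{1, 2};  // H and W in NHWC order
  return xnn_define_static_reduce(
      subgraph_ptr,
      xnn_reduce_mean,       // reduction operator
      reduce_dims.size(),
      reduce_dims.data(),
      input_id,
      output_id,
      /*flags=*/0);
}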


@@ -516,6 +516,9 @@ if(USE_XNNPACK AND NOT USE_SYSTEM_XNNPACK)
# Disable I8MM For CI since clang 9 does not support neon i8mm.
set(XNNPACK_ENABLE_ARM_I8MM OFF CACHE BOOL "")
# Disable avxvnni int8
set(XNNPACK_ENABLE_AVXVNNIINT8 OFF CACHE BOOL "")
# Older MSVC versions don't support AVX512FP. TODO Minimum version support?
IF(CMAKE_C_COMPILER_ID STREQUAL "MSVC")
set(XNNPACK_ENABLE_AVX512FP16 OFF CACHE BOOL "")


@@ -94,6 +94,7 @@ else()
if(@USE_XNNPACK@)
append_torchlib_if_found(XNNPACK)
append_torchlib_if_found(microkernels-prod)
endif()
append_torchlib_if_found(caffe2_protos protobuf-lite protobuf protoc)


@@ -111,7 +111,7 @@ else
end
puts "Linking static libraries..."
libs = ['libc10.a', 'libclog.a', 'libpthreadpool.a', 'libXNNPACK.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a']
libs = ['libc10.a', 'libclog.a', 'libpthreadpool.a', 'libXNNPACK.a', 'libmicrokernels-prod.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a']
frameworks = ['CoreML', 'Metal', 'MetalPerformanceShaders', 'Accelerate', 'UIKit']
targets.each do |target|
# NB: All these libraries and frameworks have already been linked by TestApp, adding them


@@ -40,7 +40,7 @@ end
# link static libraries
target.frameworks_build_phases.clear
libs = ['libc10.a', 'libclog.a', 'libpthreadpool.a', 'libXNNPACK.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a', 'libkineto.a']
libs = ['libc10.a', 'libclog.a', 'libpthreadpool.a', 'libXNNPACK.a', 'libmicrokernels-prod.a', 'libeigen_blas.a', 'libcpuinfo.a', 'libpytorch_qnnpack.a', 'libtorch_cpu.a', 'libtorch.a', 'libkineto.a']
for lib in libs do
path = "#{install_path}/lib/#{lib}"
if File.exist?(path)

File diff suppressed because it is too large

third_party/xnnpack_buck_shim.bzl (new vendored file, 33 lines added)

@@ -0,0 +1,33 @@
load(
"//xplat/third-party/XNNPACK/XNNPACK:build_srcs.bzl",
_LOGGING_SRCS = "LOGGING_SRCS",
_OPERATOR_SRCS = "OPERATOR_SRCS",
_SUBGRAPH_SRCS = "SUBGRAPH_SRCS",
_TABLE_SRCS = "TABLE_SRCS",
_XNNPACK_SRCS = "XNNPACK_SRCS",
)
load("//xplat/third-party/XNNPACK/XNNPACK/gen:microkernels.bzl", "prod_srcs_for_arch")
load("//tools/build_defs:glob_defs.bzl", "subdir_glob")
def define_xnnpack_build_src(xnnpack_build_src):
return ["XNNPACK/{}".format(src) for src in xnnpack_build_src]
def prod_srcs_for_arch_wrapper(arch):
prod_srcs = prod_srcs_for_arch(arch)
return define_xnnpack_build_src(prod_srcs)
def get_xnnpack_headers():
src_headers = subdir_glob([
("XNNPACK/src", "**/*.h"),
])
include_headers = subdir_glob([
("XNNPACK/include", "*.h"),
])
return src_headers | include_headers
OPERATOR_SRCS = define_xnnpack_build_src(_OPERATOR_SRCS)
SUBGRAPH_SRCS = define_xnnpack_build_src(_SUBGRAPH_SRCS)
TABLE_SRCS = define_xnnpack_build_src(_TABLE_SRCS)
XNNPACK_SRCS = define_xnnpack_build_src(_XNNPACK_SRCS)
LOGGING_SRCS = define_xnnpack_build_src(_LOGGING_SRCS)

File diff suppressed because it is too large

File diff suppressed because it is too large