Integrate XNNPACK with custom class for packing weights. (#34047)
Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/34047

This PR integrates the added XNNPACK conv2d and linear ops via custom class registration for packed weights. The packed struct is serializable.

Test Plan: python test/test_xnnpack_integration.py

Imported from OSS

Differential Revision: D20185657

fbshipit-source-id: fc7e692d8f913e493b293b02d92f4e78536d7698
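For orientation, a minimal sketch of the op surface this change adds, mirroring what the test plan exercises; a build with USE_XNNPACK=1 is assumed and the shapes are illustrative:

    import torch
    import torch.backends.xnnpack

    # Only available in builds compiled with USE_XNNPACK=1; see the
    # torch.backends.xnnpack module added below.
    assert torch.backends.xnnpack.enabled

    x = torch.rand(2, 8)
    w = torch.rand(4, 8)
    b = torch.rand(4)

    # Pack the weights once; the result is an XNNPackLinearOpContext custom
    # class, which the def_pickle registration below makes serializable.
    packed = torch.ops._xnnpack.linear_prepack(w, b)
    y = torch.ops._xnnpack.linear_packed(x, packed)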
Committed by: Facebook GitHub Bot
Parent: e23a9dc140
Commit: 4c30fc7238
@@ -141,6 +141,14 @@ const std::vector<at::QEngine>& Context::supportedQEngines() const {
  return supported_qengines;
}

bool Context::isXNNPACKAvailable() const {
#ifdef USE_XNNPACK
  return true;
#else
  return false;
#endif
}

bool Context::setFlushDenormal(bool on) {
  return at::cpu::set_flush_denormal(on);
}

@@ -109,6 +109,7 @@ class CAFFE2_API Context {
  at::QEngine qEngine() const;
  void setQEngine(at::QEngine e);
  const std::vector<at::QEngine>& supportedQEngines() const;
  bool isXNNPACKAvailable() const;

 private:
  void initCUDAIfNeeded(DeviceType p) {

@@ -778,10 +778,6 @@

- func: conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor

- func: _conv2d_prepack(Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1, float? output_min=None, float? output_max=None) -> Tensor

- func: _conv2d_packed(Tensor packed_weight, Tensor input) -> Tensor

- func: conv3d(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1, int groups=1) -> Tensor

- func: conv_tbc(Tensor self, Tensor weight, Tensor bias, int pad=0) -> Tensor
@@ -1577,10 +1573,6 @@
- func: linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
  python_module: nn

- func: _linear_prepack(Tensor weight, Tensor? bias=None, float? output_min=None, float? output_max=None) -> Tensor

- func: _linear_packed(Tensor packed_weight, Tensor input) -> Tensor

- func: mkldnn_linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor
  python_module: nn
  dispatch:

@@ -9,6 +9,53 @@
namespace at {
namespace native {
namespace xnnpack {

struct Deleter final {
  void operator()(const xnn_operator_t op) const {
    xnn_delete_operator(op);
  }
};

using Operator = std::unique_ptr<xnn_operator, Deleter>;

struct ContextLinear final {
  Operator op;
  int64_t output_channels;

  ContextLinear() = delete;

  ContextLinear(Operator&& o, int64_t o_channels) {
    op = std::move(o);
    output_channels = o_channels;
  }
  static constexpr float kMin = -std::numeric_limits<float>::infinity();
  static constexpr float kMax = std::numeric_limits<float>::infinity();
};

struct ContextConv2D final {
  Operator op;
  std::array<int64_t, 4> weight_size_;
  std::array<int64_t, 2> padding_;
  std::array<int64_t, 2> stride_;
  std::array<int64_t, 2> dilation_;

  ContextConv2D() = delete;

  ContextConv2D(
      Operator&& o,
      std::array<int64_t, 4> weight_size,
      std::array<int64_t, 2> padding,
      std::array<int64_t, 2> stride,
      std::array<int64_t, 2> dilation)
      : op(std::move(o)),
        weight_size_(weight_size),
        padding_(padding),
        stride_(stride),
        dilation_(dilation) {}
  static constexpr float kMin = -std::numeric_limits<float>::infinity();
  static constexpr float kMax = std::numeric_limits<float>::infinity();
};

namespace internal {

struct Layout final {
@@ -64,14 +111,6 @@ struct Layout final {
  };
};

struct Deleter final {
  void operator()(const xnn_operator_t op) const {
    xnn_delete_operator(op);
  }
};

using Operator = std::unique_ptr<xnn_operator, Deleter>;

bool available();

} // namespace internal

@@ -1,10 +1,10 @@
#ifdef USE_XNNPACK

#include <ATen/cpp_custom_type_hack.h>
#include <ATen/native/xnnpack/Common.h>
#include <ATen/native/ConvUtils.h>
#include <ATen/native/utils/ParamUtils.h>
#include <ATen/native/xnnpack/Common.h>
#include <ATen/native/xnnpack/Factory.h>
#include <ATen/native/xnnpack/Convolution.h>

namespace at {
namespace native {
@@ -12,18 +12,6 @@ namespace xnnpack {
namespace internal {
namespace convolution2d {

struct Context final {
  Operator convolution_op;

  std::vector<int64_t> weight_size;
  std::vector<int64_t> padding;
  std::vector<int64_t> stride;
  std::vector<int64_t> dilation;

  static constexpr float kMin = -std::numeric_limits<float>::infinity();
  static constexpr float kMax = std::numeric_limits<float>::infinity();
};

namespace {

// Supports NHWC and NCHW FP32 convolutions with any valid
@@ -79,72 +67,6 @@ bool available(
      true;
}

Context create(
    const Tensor& weight,
    const c10::optional<Tensor>& bias,
    const IntArrayRef padding_,
    const IntArrayRef stride_,
    const IntArrayRef dilation_,
    const int64_t groups,
    const float output_min,
    const float output_max) {
  const auto padding = expand_param_if_needed(padding_, "padding", 2);
  const auto stride = expand_param_if_needed(stride_, "stride", 2);
  const auto dilation = expand_param_if_needed(dilation_, "dilation", 2);
  const Tensor weight_nhwc = weight.contiguous(MemoryFormat::ChannelsLast);

  TORCH_CHECK(
      available(
          weight_nhwc,
          bias,
          padding,
          stride,
          dilation,
          groups,
          output_min,
          output_max),
      "XNNPACK Convolution not available! "
      "Reason: The provided (weight, bias, padding, stride, dilation, groups, output_min, output_max) "
      "parameters are either invalid individually or their combination is not supported by XNNPACK.");

  xnn_operator_t convolution_op{};

  const xnn_status create_status = xnn_create_convolution2d_nhwc_f32(
      padding[Layout::Parameter::height], // input_padding_top
      padding[Layout::Parameter::width], // input_padding_right
      padding[Layout::Parameter::height], // input_padding_bottom
      padding[Layout::Parameter::width], // input_padding_left
      weight_nhwc.size(Layout::Filter::height), // kernel_height
      weight_nhwc.size(Layout::Filter::width), // kernel_width
      stride[Layout::Parameter::height], // subsampling_height
      stride[Layout::Parameter::width], // subsampling_width
      dilation[Layout::Parameter::height], // dilation_height
      dilation[Layout::Parameter::width], // dilation_width
      groups, // groups
      weight_nhwc.size(Layout::Filter::input), // group_input_channels
      weight_nhwc.size(Layout::Filter::output) / groups, // group_output_channels
      weight_nhwc.size(Layout::Filter::input) * groups, // input_pixel_stride
      weight_nhwc.size(Layout::Filter::output), // output_pixel_stride
      weight_nhwc.data_ptr<float>(), // kernel
      (bias && bias->defined()) ? bias->data_ptr<float>() : nullptr, // bias
      output_min, // output_min
      output_max, // output_max
      0u, // flags
      &convolution_op); // operator

  TORCH_CHECK(
      xnn_status_success == create_status,
      "xnn_create_convolution2d_nhwc_f32 failed!");

  return Context{
      Operator(convolution_op),
      weight_nhwc.sizes().vec(),
      padding,
      stride,
      dilation,
  };
}

// TODO: Decouple and improve error handling and messages.
bool usable(const Tensor& input) {
  // Input
@@ -159,42 +81,43 @@ bool usable(const Tensor& input) {
}

Tensor run(
    const Context& context,
    const ContextConv2D& context,
    const Tensor& input) {
  using namespace internal;

  const Tensor input_nhwc = input.contiguous(MemoryFormat::ChannelsLast);
  const Tensor padded_input_nhwc = allocate_padded_if_needed(input_nhwc);

  TORCH_CHECK(
      usable(input_nhwc),
      usable(padded_input_nhwc),
      "XNNPACK Convolution not usable! "
      "Reason: The provided input tensor is either invalid or unsupported by XNNPACK.");

  Tensor output = empty_with_tail_padding(
      conv_output_size(
          input_nhwc.sizes(),
          context.weight_size,
          context.padding,
          context.stride,
          context.dilation),
      input_nhwc.options().dtype(),
          padded_input_nhwc.sizes(),
          context.weight_size_,
          context.padding_,
          context.stride_,
          context.dilation_),
      padded_input_nhwc.options().dtype(),
      MemoryFormat::ChannelsLast);

  const xnn_status setup_status = xnn_setup_convolution2d_nhwc_f32(
      context.convolution_op.get(), // operator
      input_nhwc.size(Layout::Activation4D::batch), // batch_size
      input_nhwc.size(Layout::Activation4D::height), // input_height
      input_nhwc.size(Layout::Activation4D::width), // input_width
      input_nhwc.data_ptr<float>(), // input
      output.data_ptr<float>(), // output
      nullptr); // threadpool
      context.op.get(), // operator
      padded_input_nhwc.size(Layout::Activation4D::batch), // batch_size
      padded_input_nhwc.size(Layout::Activation4D::height), // input_height
      padded_input_nhwc.size(Layout::Activation4D::width), // input_width
      padded_input_nhwc.data_ptr<float>(), // input
      output.data_ptr<float>(), // output
      nullptr); // threadpool

  TORCH_CHECK(
      xnn_status_success == setup_status,
      "xnn_setup_convolution2d_nhwc_f32 failed!");

  const xnn_status run_status = xnn_run_operator(
      context.convolution_op.get(), // operator
      context.op.get(), // operator
      nullptr); // threadpool

  TORCH_INTERNAL_ASSERT(
@@ -228,6 +151,101 @@ Tensor create_and_run(
}

} // namespace

ContextConv2D create(
    const Tensor& weight,
    const c10::optional<Tensor>& bias,
    const IntArrayRef padding,
    const IntArrayRef stride,
    const IntArrayRef dilation,
    const int64_t groups,
    const float output_min,
    const float output_max) {
  const auto padding_expanded = expand_param_if_needed(padding, "padding", 2);
  const auto stride_expanded = expand_param_if_needed(stride, "stride", 2);
  const auto dilation_expanded = expand_param_if_needed(dilation, "dilation", 2);
  const Tensor weight_nhwc = weight.contiguous(MemoryFormat::ChannelsLast);

  TORCH_CHECK(
      available(
          weight_nhwc,
          bias,
          padding_expanded,
          stride_expanded,
          dilation_expanded,
          groups,
          output_min,
          output_max),
      "xnnpack::convolution not available! "
      "Reason: The provided (weight, bias, padding, stride, dilation, groups, output_min, output_max) "
      "parameters are either invalid individually or their combination is not supported by XNNPACK.");

  xnn_operator_t convolution_op{};

  const xnn_status create_status = xnn_create_convolution2d_nhwc_f32(
      padding_expanded[Layout::Parameter::height], // input_padding_top
      padding_expanded[Layout::Parameter::width], // input_padding_right
      padding_expanded[Layout::Parameter::height], // input_padding_bottom
      padding_expanded[Layout::Parameter::width], // input_padding_left
      weight_nhwc.size(Layout::Filter::height), // kernel_height
      weight_nhwc.size(Layout::Filter::width), // kernel_width
      stride_expanded[Layout::Parameter::height], // subsampling_height
      stride_expanded[Layout::Parameter::width], // subsampling_width
      dilation_expanded[Layout::Parameter::height], // dilation_height
      dilation_expanded[Layout::Parameter::width], // dilation_width
      groups, // groups
      weight_nhwc.size(Layout::Filter::input), // group_input_channels
      weight_nhwc.size(Layout::Filter::output) / groups, // group_output_channels
      weight_nhwc.size(Layout::Filter::input) * groups, // input_pixel_stride
      weight_nhwc.size(Layout::Filter::output), // output_pixel_stride
      weight_nhwc.data_ptr<float>(), // kernel
      (bias && bias->defined()) ? bias->data_ptr<float>() : nullptr, // bias
      output_min, // output_min
      output_max, // output_max
      0u, // flags
      &convolution_op); // operator

  TORCH_CHECK(
      xnn_status_success == create_status,
      "xnn_create_convolution2d_nhwc_f32 failed!");

  return ContextConv2D{
      Operator(convolution_op),
      {weight_nhwc.sizes()[0], weight_nhwc.sizes()[1],
       weight_nhwc.sizes()[2], weight_nhwc.sizes()[3]},
      {padding_expanded[0], padding_expanded[1]},
      {stride_expanded[0], stride_expanded[1]},
      {dilation_expanded[0], dilation_expanded[1]}
  };
}

c10::intrusive_ptr<xnnpack::XNNPackConv2dOpContext> Conv2dPrePack::operator()(
    Tensor weight,
    c10::optional<Tensor> bias,
    std::vector<int64_t> stride,
    std::vector<int64_t> padding,
    std::vector<int64_t> dilation,
    int64_t groups
) {
  return xnnpack::XNNPackConv2dOpContext::create_context(
      std::move(weight),
      std::move(bias),
      std::move(padding),
      std::move(stride),
      std::move(dilation),
      groups,
      {},
      {});
}

Tensor Conv2dPacked::operator()(
    const Tensor& input,
    const c10::intrusive_ptr<xnnpack::XNNPackConv2dOpContext>& op_context) {
  return
      xnnpack::internal::convolution2d::run(
          (op_context.get())->get_context(), input);
}

} // namespace convolution2d
} // namespace internal

@@ -246,8 +264,8 @@ bool use_convolution2d(
      stride,
      dilation,
      groups,
      internal::convolution2d::Context::kMin,
      internal::convolution2d::Context::kMax) &&
      ContextConv2D::kMin,
      ContextConv2D::kMax) &&
    internal::convolution2d::usable(input);
}

@@ -267,50 +285,13 @@ Tensor convolution2d(
      stride,
      dilation,
      groups,
      internal::convolution2d::Context::kMin,
      internal::convolution2d::Context::kMax);
      ContextConv2D::kMin,
      ContextConv2D::kMax);
}

} // namespace xnnpack

at::Tensor _conv2d_prepack(
    const Tensor& weight,
    const Tensor& bias,
    const IntArrayRef stride,
    const IntArrayRef padding,
    const IntArrayRef dilation,
    const int64_t groups,
    const c10::optional<double> output_min,
    const c10::optional<double> output_max) {
  return cpp_custom_type_hack::create(
      std::make_unique<xnnpack::internal::convolution2d::Context>(
          xnnpack::internal::convolution2d::create(
              weight,
              bias,
              padding.vec(),
              stride.vec(),
              dilation.vec(),
              groups,
              output_min ? *output_min : xnnpack::internal::convolution2d::Context::kMin,
              output_max ? *output_max : xnnpack::internal::convolution2d::Context::kMax)),
      weight.options());
}

at::Tensor _conv2d_packed(
    const Tensor& packed_weight,
    const Tensor& input) {
  return xnnpack::internal::convolution2d::run(
      cpp_custom_type_hack::cast<xnnpack::internal::convolution2d::Context>(packed_weight),
      input);
}

} // namespace native
} // namespace at

namespace caffe2 {

CAFFE_KNOWN_TYPE(at::native::xnnpack::internal::convolution2d::Context);

} // namespace caffe2

#endif /* USE_XNNPACK */

aten/src/ATen/native/xnnpack/Convolution.h (new file, 49 lines)
@@ -0,0 +1,49 @@
#pragma once

#ifdef USE_XNNPACK

#include <ATen/Tensor.h>
#include <ATen/native/xnnpack/Common.h>
#include <ATen/native/xnnpack/OpContext.h>

namespace at {
namespace native {
namespace xnnpack {
namespace internal {
namespace convolution2d {

class Conv2dPrePack final : public torch::OperatorKernel {
 public:
  c10::intrusive_ptr<xnnpack::XNNPackConv2dOpContext> operator()(
      Tensor weight,
      c10::optional<Tensor> bias,
      std::vector<int64_t> padding,
      std::vector<int64_t> stride,
      std::vector<int64_t> dilation,
      int64_t groups);
};

class Conv2dPacked final : public torch::OperatorKernel {
 public:
  Tensor operator()(
      const Tensor& input,
      const c10::intrusive_ptr<xnnpack::XNNPackConv2dOpContext>& op_context);
};

ContextConv2D create(
    const Tensor& weight,
    const c10::optional<Tensor>& bias,
    const IntArrayRef padding,
    const IntArrayRef stride,
    const IntArrayRef dilation,
    const int64_t groups,
    const float output_min,
    const float output_max);

} // namespace convolution2d
} // namespace internal
} // namespace xnnpack
} // namespace native
} // namespace at

#endif /* USE_XNNPACK */
@@ -8,11 +8,16 @@ namespace native {
namespace xnnpack {
namespace internal {

GuardingAllocator<0u, XNN_EXTRA_BYTES>* get_guarding_allocator() {
  static GuardingAllocator<0u, XNN_EXTRA_BYTES> allocator;
  return &allocator;
}

Tensor empty_with_tail_padding(
    const IntArrayRef size,
    const caffe2::TypeMeta dtype,
    const c10::MemoryFormat memory_format) {
  static GuardingAllocator<0u, XNN_EXTRA_BYTES> allocator;
  auto* allocator_ptr = get_guarding_allocator();

  const int64_t nelements = prod_intlist(size);

@@ -21,8 +26,8 @@ Tensor empty_with_tail_padding(
      c10::Storage{
          dtype,
          nelements,
          allocator.allocate(nelements * dtype.itemsize()),
          &allocator,
          allocator_ptr->allocate(nelements * dtype.itemsize()),
          allocator_ptr,
          /*resizable=*/true,
      },
      DispatchKeySet{DispatchKey::CPUTensorId}));
@@ -30,6 +35,19 @@
  return tensor.resize_(size, memory_format);
}

Tensor allocate_padded_if_needed(const Tensor& input_contig) {
  const auto* allocator = input_contig.storage().allocator();
  const auto* guarding_allocator = get_guarding_allocator();
  if (allocator == guarding_allocator) {
    return input_contig;
  }
  Tensor padded_input =
      empty_with_tail_padding(input_contig.sizes(), input_contig.options().dtype(),
                              input_contig.suggest_memory_format());
  padded_input.copy_(input_contig);
  return padded_input;
}

} // namespace internal
} // namespace xnnpack
} // namespace native

@@ -9,6 +9,8 @@ namespace native {
namespace xnnpack {
namespace internal {

Tensor allocate_padded_if_needed(const Tensor& input_contig);

// TODO: Remove this function when at::native::empty() is modified to accept a
// custom memory allocator.

@@ -1,8 +1,8 @@
#ifdef USE_XNNPACK

#include <ATen/cpp_custom_type_hack.h>
#include <ATen/native/xnnpack/Common.h>
#include <ATen/native/xnnpack/Factory.h>
#include <ATen/native/xnnpack/Linear.h>

namespace at {
namespace native {
@@ -10,17 +10,6 @@ namespace xnnpack {
namespace internal {
namespace linear {

struct Context final {
  Operator linear_op;

  struct Output final {
    int64_t channels;
  } output;

  static constexpr float kMin = -std::numeric_limits<float>::infinity();
  static constexpr float kMax = std::numeric_limits<float>::infinity();
};

namespace {

// Supports NHWC and NCHW FP32 linear operators.
@@ -33,62 +22,19 @@ bool available(
    const float output_max) {
  // XNNPACK
  return xnnpack::internal::available() &&
      // Weight
      (2 == weight.ndimension()) &&
      (c10::DeviceType::CPU == weight.device().type()) &&
      (kFloat == weight.scalar_type()) &&
      // Bias
      ((bias && bias->defined()) ? ((1 == bias->ndimension()) &&
          (c10::DeviceType::CPU == bias->device().type()) &&
          (kFloat == bias->scalar_type()) &&
          (weight.size(Layout::Filter::output)) == bias->size(0))
        : true) &&
      // Output Min / Max
      (output_max > output_min) &&
      true;
}

Context create(
    const Tensor& weight,
    const c10::optional<Tensor>& bias,
    const float output_min,
    const float output_max) {
  const Tensor weight_contig = weight.contiguous();

  TORCH_CHECK(
      available(
          weight_contig,
          bias,
          output_min,
          output_max),
      "XNNPACK Linear not available! "
      "Reason: The provided (weight, bias, output_min, output_max) parameters are "
      "either invalid individually or their combination is not supported by XNNPACK.");

  xnn_operator_t linear_op{};

  const xnn_status create_status = xnn_create_fully_connected_nc_f32(
      weight_contig.size(Layout::Filter::input), // input_channels
      weight_contig.size(Layout::Filter::output), // output_channels
      weight_contig.size(Layout::Filter::input), // input_pixel_stride
      weight_contig.size(Layout::Filter::output), // output_pixel_stride
      weight_contig.data_ptr<float>(), // kernel
      (bias && bias->defined()) ? bias->data_ptr<float>() : nullptr, // bias
      output_min, // output_min
      output_max, // output_max
      0u, // flags
      &linear_op); // operator

  TORCH_CHECK(
      xnn_status_success == create_status,
      "xnn_create_fully_connected_nc_f32 failed!");

  return Context{
      Operator(linear_op),
      {
          weight_contig.size(Layout::Filter::output),
      }
  };
      // Weight
      (2 == weight.ndimension()) &&
      (c10::DeviceType::CPU == weight.device().type()) &&
      (kFloat == weight.scalar_type()) &&
      // Bias
      ((bias && bias->defined()) ? ((1 == bias->ndimension()) &&
          (c10::DeviceType::CPU == bias->device().type()) &&
          (kFloat == bias->scalar_type()) &&
          (weight.size(Layout::Filter::output)) == bias->size(0))
        : true) &&
      // Output Min / Max
      (output_max > output_min) &&
      true;
}

// TODO: Decouple and improve error handling and messages.
@@ -101,30 +47,30 @@ bool usable(const Tensor& input) {
}

Tensor run(
    const Context& context,
    const ContextLinear& context,
    const Tensor& input) {
  using namespace internal;

  const Tensor& input_contig = input.contiguous();
  const Tensor padded_input = allocate_padded_if_needed(input.contiguous());

  TORCH_CHECK(
      usable(input_contig),
      usable(padded_input),
      "XNNPACK Linear not usable! "
      "Reason: The provided input tensor is either invalid or unsupported by XNNPACK.");

  const IntArrayRef input_size = input_contig.sizes();
  const IntArrayRef input_size = padded_input.sizes();
  std::vector<int64_t> output_size(input_size.cbegin(), input_size.cend());
  output_size.back() = context.output.channels;
  output_size.back() = context.output_channels;

  Tensor output = empty_with_tail_padding(
      output_size,
      input_contig.options().dtype(),
      input_contig.suggest_memory_format());
      padded_input.options().dtype(),
      padded_input.suggest_memory_format());

  const xnn_status setup_status = xnn_setup_fully_connected_nc_f32(
      context.linear_op.get(), // operator
      Layout::ActivationND::batch(input_contig.sizes()), // Batch,
      input_contig.data_ptr<float>(), // input
      context.op.get(), // operator
      Layout::ActivationND::batch(padded_input.sizes()), // Batch,
      padded_input.data_ptr<float>(), // input
      output.data_ptr<float>(), // output
      nullptr); // threadpool

@@ -133,7 +79,7 @@ Tensor run(
      "xnn_setup_fully_connected_nc_f32 failed!");

  const xnn_status run_status = xnn_run_operator(
      context.linear_op.get(), // operator
      context.op.get(), // operator
      nullptr); // threadpool

  TORCH_INTERNAL_ASSERT(
@@ -159,6 +105,63 @@ Tensor create_and_run(
}

} // namespace

ContextLinear create(
    const Tensor& weight,
    const c10::optional<Tensor>& bias,
    const float output_min,
    const float output_max) {
  const Tensor weight_contig = weight.contiguous();

  TORCH_CHECK(
      available(
          weight_contig,
          bias,
          output_min,
          output_max),
      "XNNPACK Linear not available! "
      "Reason: The provided (weight, bias, output_min, output_max) parameters are "
      "either invalid individually or their combination is not supported by XNNPACK.");

  xnn_operator_t linear_op{};

  const xnn_status create_status = xnn_create_fully_connected_nc_f32(
      weight_contig.size(Layout::Filter::input), // input_channels
      weight_contig.size(Layout::Filter::output), // output_channels
      weight_contig.size(Layout::Filter::input), // input_pixel_stride
      weight_contig.size(Layout::Filter::output), // output_pixel_stride
      weight_contig.data_ptr<float>(), // kernel
      (bias && bias->defined()) ? bias->data_ptr<float>() : nullptr, // bias
      output_min, // output_min
      output_max, // output_max
      0u, // flags
      &linear_op); // operator

  TORCH_CHECK(
      xnn_status_success == create_status,
      "xnn_create_fully_connected_nc_f32 failed!");

  return ContextLinear(
      Operator(linear_op),
      weight_contig.size(Layout::Filter::output)
  );
}

c10::intrusive_ptr<xnnpack::XNNPackLinearOpContext>
LinearPrePack::operator()(
    Tensor weight,
    c10::optional<Tensor> bias) {
  return xnnpack::XNNPackLinearOpContext::create_context(
      std::move(weight), std::move(bias), {}, {});
}

Tensor LinearPacked::operator()(
    const Tensor& input,
    const c10::intrusive_ptr<xnnpack::XNNPackLinearOpContext>& op_context) {
  return
      xnnpack::internal::linear::run((op_context.get())->get_context(), input);
}

} // namespace linear
} // namespace internal

@@ -169,8 +172,8 @@ bool use_linear(
  return internal::linear::available(
      weight,
      bias,
      internal::linear::Context::kMin,
      internal::linear::Context::kMax) &&
      ContextLinear::kMin,
      ContextLinear::kMax) &&
    internal::linear::usable(input);
}

@@ -182,42 +185,13 @@ Tensor linear(
      input,
      weight,
      bias,
      internal::linear::Context::kMin,
      internal::linear::Context::kMax);
      ContextLinear::kMin,
      ContextLinear::kMax);
}

} // namespace xnnpack

Tensor _linear_prepack(
    const Tensor& weight,
    const Tensor& bias,
    const c10::optional<double> output_min,
    const c10::optional<double> output_max) {
  return cpp_custom_type_hack::create(
      std::make_unique<xnnpack::internal::linear::Context>(
          xnnpack::internal::linear::create(
              weight,
              bias,
              output_min ? *output_min : xnnpack::internal::linear::Context::kMin,
              output_max ? *output_max : xnnpack::internal::linear::Context::kMax)),
      weight.options());
}

Tensor _linear_packed(
    const Tensor& packed_weight,
    const Tensor& input) {
  return xnnpack::internal::linear::run(
      cpp_custom_type_hack::cast<xnnpack::internal::linear::Context>(packed_weight),
      input);
}

} // namespace native
} // namespace at

namespace caffe2 {

CAFFE_KNOWN_TYPE(at::native::xnnpack::internal::linear::Context);

} // namespace caffe2

#endif /* USE_XNNPACK */

aten/src/ATen/native/xnnpack/Linear.h (new file, 40 lines)
@@ -0,0 +1,40 @@
#pragma once

#ifdef USE_XNNPACK

#include <ATen/Tensor.h>
#include <ATen/native/xnnpack/Common.h>
#include <ATen/native/xnnpack/OpContext.h>

namespace at {
namespace native {
namespace xnnpack {
namespace internal {
namespace linear {

class LinearPrePack final : public torch::OperatorKernel {
 public:
  c10::intrusive_ptr<xnnpack::XNNPackLinearOpContext> operator()(
      Tensor weight,
      c10::optional<Tensor> bias);
};

class LinearPacked final : public torch::OperatorKernel {
 public:
  Tensor operator()(
      const Tensor& input,
      const c10::intrusive_ptr<xnnpack::XNNPackLinearOpContext>& op_context);
};

ContextLinear create(
    const Tensor& weight,
    const c10::optional<Tensor>& bias,
    const float output_min,
    const float output_max);
} // namespace linear
} // namespace internal
} // namespace xnnpack
} // namespace native
} // namespace at

#endif /* USE_XNNPACK */
aten/src/ATen/native/xnnpack/OpContext.cpp (new file, 64 lines)
@@ -0,0 +1,64 @@
#ifdef USE_XNNPACK
#include <ATen/native/xnnpack/Convolution.h>
#include <ATen/native/xnnpack/Linear.h>
#include <ATen/native/xnnpack/OpContext.h>

namespace at {
namespace native {
namespace xnnpack {

c10::intrusive_ptr<XNNPackLinearOpContext>
XNNPackLinearOpContext::create_context(
    at::Tensor&& weight,
    c10::optional<at::Tensor>&& bias,
    const c10::optional<double> output_min,
    const c10::optional<double> output_max) {
  auto linear_op_context =
      c10::make_intrusive<XNNPackLinearOpContext>(
          std::move(weight),
          std::move(bias),
          xnnpack::internal::linear::create(
              weight,
              bias,
              output_min ? *output_min : xnnpack::ContextLinear::kMin,
              output_max ? *output_max : xnnpack::ContextLinear::kMax)
      );
  return linear_op_context;
}

c10::intrusive_ptr<XNNPackConv2dOpContext>
XNNPackConv2dOpContext::create_context(at::Tensor&& weight,
    c10::optional<at::Tensor>&& bias,
    std::vector<int64_t>&& padding,
    std::vector<int64_t>&& stride,
    std::vector<int64_t>&& dilation,
    int64_t groups,
    const c10::optional<double> output_min,
    const c10::optional<double> output_max) {
  auto op_context =
      xnnpack::internal::convolution2d::create(
          weight,
          bias,
          padding,
          stride,
          dilation,
          groups,
          output_min ? *output_min : xnnpack::ContextConv2D::kMin,
          output_max ? *output_max : xnnpack::ContextConv2D::kMax);
  auto conv2d_op_context =
      c10::make_intrusive<XNNPackConv2dOpContext>(
          std::move(weight),
          std::move(bias),
          std::move(padding),
          std::move(stride),
          std::move(dilation),
          groups,
          std::move(op_context));
  return conv2d_op_context;
}

} // xnnpack
} // native
} // at

#endif /* USE_XNNPACK */
aten/src/ATen/native/xnnpack/OpContext.h (new file, 96 lines)
@@ -0,0 +1,96 @@
#pragma once

#ifdef USE_XNNPACK

#include <ATen/core/ivalue.h>
#include <ATen/native/xnnpack/Common.h>
#include <ATen/Tensor.h>

namespace at {
namespace native {
namespace xnnpack {

using SerializationTypeLinearPrePack = std::tuple<Tensor, c10::optional<Tensor>>;
using SerializationTypeConv2dPrePack =
    std::tuple<Tensor, c10::optional<Tensor>,
        std::vector<int64_t>, std::vector<int64_t>, std::vector<int64_t>, int64_t>;

class XNNPackLinearOpContext : public torch::jit::CustomClassHolder {
 private:
  Tensor orig_weight_;
  c10::optional<Tensor> orig_bias_;
  ContextLinear op_context_;

 public:
  XNNPackLinearOpContext(Tensor&& weight,
      c10::optional<Tensor>&& bias,
      ContextLinear&& op_context) :
    orig_weight_(std::move(weight)),
    orig_bias_(std::move(bias)),
    op_context_(std::move(op_context)) {}

  const ContextLinear& get_context() const {
    return op_context_;
  }

  SerializationTypeLinearPrePack unpack() {
    return std::make_tuple(orig_weight_, orig_bias_);
  }

  static c10::intrusive_ptr<XNNPackLinearOpContext> create_context(Tensor&& weight,
      c10::optional<Tensor>&& bias,
      const c10::optional<double> output_min,
      const c10::optional<double> output_max);
};

class XNNPackConv2dOpContext : public torch::jit::CustomClassHolder {
 private:
  Tensor orig_weight_;
  c10::optional<Tensor> orig_bias_;
  std::vector<int64_t> padding_;
  std::vector<int64_t> stride_;
  std::vector<int64_t> dilation_;
  int64_t groups_;
  ContextConv2D op_context_;

 public:
  XNNPackConv2dOpContext(Tensor&& weight,
      c10::optional<Tensor>&& bias,
      std::vector<int64_t>&& padding,
      std::vector<int64_t>&& stride,
      std::vector<int64_t>&& dilation,
      uint64_t groups,
      ContextConv2D&& op_context
  ) :
    orig_weight_(std::move(weight)),
    orig_bias_(std::move(bias)),
    padding_(std::move(padding)),
    stride_(std::move(stride)),
    dilation_(std::move(dilation)),
    groups_(groups),
    op_context_(std::move(op_context)) {}

  const ContextConv2D& get_context() const {
    return op_context_;
  }

  SerializationTypeConv2dPrePack unpack() {
    return std::make_tuple(orig_weight_, orig_bias_, padding_,
        stride_, dilation_, groups_);
  }

  static c10::intrusive_ptr<XNNPackConv2dOpContext> create_context(Tensor&& weight,
      c10::optional<Tensor>&& bias,
      std::vector<int64_t>&& padding,
      std::vector<int64_t>&& stride,
      std::vector<int64_t>&& dilation,
      int64_t groups,
      const c10::optional<double> output_min,
      const c10::optional<double> output_max);
};
} // xnnpack

} // native
} // at

#endif /* USE_XNNPACK */
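As a usage sketch of the conv2d path (assuming a USE_XNNPACK=1 build; shapes are illustrative), prepack captures exactly what XNNPackConv2dOpContext stores above, so the packed op only needs the input:

    import torch

    x = torch.rand(1, 3, 16, 16)
    w = torch.rand(8, 3, 3, 3)
    b = torch.rand(8)

    # stride, padding, dilation are int[2] and groups is an int, matching the
    # _xnnpack::conv2d_prepack schema registered in the next file.
    ctx = torch.ops._xnnpack.conv2d_prepack(w, b, (1, 1), (0, 0), (1, 1), 1)
    y = torch.ops._xnnpack.conv2d_packed(x, ctx)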
aten/src/ATen/native/xnnpack/RegisterOpContextClass.cpp (new file, 101 lines)
@@ -0,0 +1,101 @@
#ifdef USE_XNNPACK

#include <ATen/core/op_registration/op_registration.h>
#include <ATen/native/xnnpack/Convolution.h>
#include <ATen/native/xnnpack/Linear.h>
#include <ATen/native/xnnpack/OpContext.h>
#include <ATen/Tensor.h>
#include <torch/custom_class.h>

namespace at {
namespace native {
namespace xnnpack {

namespace {
torch::jit::class_<XNNPackLinearOpContext> register_xnnpack_linear_op_context_class() {
  static auto register_linear_op_context_class =
      torch::jit::class_<XNNPackLinearOpContext>("XNNPackLinearOpContext")
          .def_pickle(
              [](const c10::intrusive_ptr<XNNPackLinearOpContext>& op_context)
                  -> SerializationTypeLinearPrePack { // __getstate__
                Tensor weight;
                c10::optional<Tensor> bias;
                return op_context->unpack();
              },
              [](SerializationTypeLinearPrePack state)
                  -> c10::intrusive_ptr<
                      XNNPackLinearOpContext> { // __setstate__
                return XNNPackLinearOpContext::create_context(
                    std::move(std::get<0>(state)),
                    std::move(std::get<1>(state)),
                    {},
                    {}
                );
              }
          );
  return register_linear_op_context_class;
}

torch::jit::class_<XNNPackConv2dOpContext> register_xnnpack_conv2d_op_context_class() {
  static auto register_conv2d_op_context_class =
      torch::jit::class_<XNNPackConv2dOpContext>("XNNPackConv2dOpContext")
          .def_pickle(
              [](const c10::intrusive_ptr<XNNPackConv2dOpContext>& op_context)
                  -> SerializationTypeConv2dPrePack { // __getstate__
                Tensor weight;
                std::vector<int64_t> padding, stride, dilation;
                int64_t groups;
                c10::optional<Tensor> bias;
                return op_context->unpack();
              },
              [](SerializationTypeConv2dPrePack state)
                  -> c10::intrusive_ptr<
                      XNNPackConv2dOpContext> { // __setstate__
                return XNNPackConv2dOpContext::create_context(
                    std::move(std::get<0>(state)),
                    std::move(std::get<1>(state)),
                    std::move(std::get<2>(state)),
                    std::move(std::get<3>(state)),
                    std::move(std::get<4>(state)),
                    std::move(std::get<5>(state)),
                    {},
                    {}
                );
              }
          );
  return register_conv2d_op_context_class;
}

static auto xnnpack_linear_op_context_class = register_xnnpack_linear_op_context_class();
static auto xnnpack_conv2d_op_context_class = register_xnnpack_conv2d_op_context_class();

// Op registration
static auto registry =
    // Registering under the _xnnpack namespace for now. As we add more backends requiring similar functionality
    // we can refactor the code and use a better namespace.
    torch::RegisterOperators()
        .op("_xnnpack::linear_prepack(Tensor W, Tensor? B=None) -> __torch__.torch.classes.XNNPackLinearOpContext",
            torch::RegisterOperators::options().kernel<internal::linear::LinearPrePack>(
                DispatchKey::CPUTensorId))
        .op("_xnnpack::linear_packed(Tensor X, __torch__.torch.classes.XNNPackLinearOpContext W_prepack) -> Tensor Y",
            torch::RegisterOperators::options().kernel<internal::linear::LinearPacked>(
                DispatchKey::CPUTensorId))
        .op("_xnnpack::conv2d_prepack(Tensor W, Tensor? B, int[2] stride, "
            "int[2] padding, int[2] dilation, int groups) "
            "-> __torch__.torch.classes.XNNPackConv2dOpContext",
            torch::RegisterOperators::options().kernel<internal::convolution2d::Conv2dPrePack>(
                DispatchKey::CPUTensorId))
        .op("_xnnpack::conv2d_packed(Tensor X, "
            "__torch__.torch.classes.XNNPackConv2dOpContext W_prepack) -> Tensor Y",
            torch::RegisterOperators::options().kernel<internal::convolution2d::Conv2dPacked>(
                DispatchKey::CPUTensorId));
} // namespace

} // xnnpack
} // native
} // at

namespace {
}

#endif /* USE_XNNPACK */
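Because def_pickle supplies __getstate__/__setstate__ for both op context classes, a scripted module holding a packed context survives a torch.jit.save/load round trip; __setstate__ re-runs create_context, so the XNNPACK operator is rebuilt from the original tensors on load. A minimal sketch along the lines of the tests below (USE_XNNPACK=1 build assumed):

    import io
    import torch

    class LinearPrePacked(torch.nn.Module):
        def __init__(self, weight, bias):
            super(LinearPrePacked, self).__init__()
            self.packed = torch.ops._xnnpack.linear_prepack(weight, bias)

        def forward(self, x):
            return torch.ops._xnnpack.linear_packed(x, self.packed)

    m = torch.jit.script(LinearPrePacked(torch.rand(4, 8), torch.rand(4)))
    buffer = io.BytesIO()
    torch.jit.save(m, buffer)    # __getstate__ serializes the original tensors
    buffer.seek(0)
    m2 = torch.jit.load(buffer)  # __setstate__ repacks via create_context
    y = m2(torch.rand(2, 8))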
@@ -58,38 +58,6 @@ Tensor linear(

} // namespace xnnpack

at::Tensor _conv2d_prepack(
    const Tensor&,
    const Tensor&,
    const IntArrayRef,
    const IntArrayRef,
    const IntArrayRef,
    const int64_t,
    const c10::optional<double>,
    const c10::optional<double>) {
  TORCH_CHECK(false, xnnpack::internal::kError);
}

at::Tensor _conv2d_packed(
    const Tensor&,
    const Tensor&) {
  TORCH_CHECK(false, xnnpack::internal::kError);
}

Tensor _linear_prepack(
    const Tensor&,
    const Tensor&,
    const c10::optional<double>,
    const c10::optional<double>) {
  TORCH_CHECK(false, xnnpack::internal::kError);
}

Tensor _linear_packed(
    const Tensor&,
    const Tensor&) {
  TORCH_CHECK(false, xnnpack::internal::kError);
}

} // namespace native
} // namespace at

test/test_xnnpack_integration.py (new file, 364 lines)
@@ -0,0 +1,364 @@
from __future__ import division

import unittest

import torch
import torch.backends.xnnpack
from torch.nn import functional as F
import torch.testing._internal.hypothesis_utils as hu
from torch.testing._internal.common_utils import TestCase, run_tests
from hypothesis import given, assume
from hypothesis import strategies as st
import io


@unittest.skipUnless(torch.backends.xnnpack.enabled,
                     " XNNPACK must be enabled for these tests."
                     " Please build with USE_XNNPACK=1.")
class TestXNNPACKOps(TestCase):
    @given(batch_size=st.integers(0, 3),
           data_shape=hu.array_shapes(1, 3, 2, 64),
           weight_output_dim=st.integers(2, 64),
           use_bias=st.booleans())
    def test_linear(self, batch_size, data_shape, weight_output_dim, use_bias):
        data_shape = [batch_size] + list(data_shape)
        input_data = torch.rand(data_shape)
        weight = torch.rand((weight_output_dim, data_shape[-1]))
        if use_bias:
            bias = torch.rand((weight_output_dim))
        else:
            bias = None
        ref_result = F.linear(input_data, weight, bias)
        packed_weight_bias = torch.ops._xnnpack.linear_prepack(weight, bias)
        output_linear_xnnpack = torch.ops._xnnpack.linear_packed(input_data, packed_weight_bias)
        torch.testing.assert_allclose(ref_result, output_linear_xnnpack, rtol=1e-2, atol=1e-3)

    @given(batch_size=st.integers(0, 3),
           input_channels_per_group=st.integers(1, 32),
           height=st.integers(5, 64),
           width=st.integers(5, 64),
           output_channels_per_group=st.integers(1, 32),
           groups=st.integers(1, 16),
           kernel_h=st.integers(1, 7),
           kernel_w=st.integers(1, 7),
           stride_h=st.integers(1, 2),
           stride_w=st.integers(1, 2),
           pad_h=st.integers(0, 2),
           pad_w=st.integers(0, 2),
           dilation=st.integers(1, 2),
           use_bias=st.booleans())
    def test_conv2d(self,
                    batch_size,
                    input_channels_per_group,
                    height,
                    width,
                    output_channels_per_group,
                    groups,
                    kernel_h,
                    kernel_w,
                    stride_h,
                    stride_w,
                    pad_h,
                    pad_w,
                    dilation,
                    use_bias):
        input_channels = input_channels_per_group * groups
        output_channels = output_channels_per_group * groups
        kernels = (kernel_h, kernel_w)
        strides = (stride_h, stride_w)
        paddings = (pad_h, pad_w)
        dilations = (dilation, dilation)
        assume(height + 2 * paddings[0] >=
               dilations[0] * (kernels[0] - 1) + 1)
        assume(width + 2 * paddings[1] >=
               dilations[1] * (kernels[1] - 1) + 1)

        input_data = torch.rand((batch_size, input_channels, height, width))
        weight = torch.rand((output_channels, input_channels_per_group, kernel_h, kernel_w))
        bias = None
        if use_bias:
            bias = torch.rand((output_channels))

        ref_result = F.conv2d(input_data, weight, bias,
                              strides, paddings, dilations, groups)
        packed_weight_bias = torch.ops._xnnpack.conv2d_prepack(weight, bias,
                                                               strides, paddings, dilations, groups)
        xnnpack_result = torch.ops._xnnpack.conv2d_packed(input_data, packed_weight_bias)
        torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)


@unittest.skipUnless(torch.backends.xnnpack.enabled,
                     " XNNPACK must be enabled for these tests."
                     " Please build with USE_XNNPACK=1.")
class TestXNNPACKSerDes(TestCase):
    @given(batch_size=st.integers(0, 3),
           data_shape=hu.array_shapes(1, 3, 2, 64),
           weight_output_dim=st.integers(2, 64),
           use_bias=st.booleans())
    def test_linear(self, batch_size, data_shape, weight_output_dim, use_bias):
        class Linear(torch.nn.Module):
            def __init__(self, weight, bias=None):
                super(Linear, self).__init__()
                self.weight = weight
                self.bias = bias

            def forward(self, x):
                return F.linear(x, self.weight, self.bias)

        class LinearPrePacked(torch.nn.Module):
            def __init__(self, weight, bias=None):
                super(LinearPrePacked, self).__init__()
                self.packed_weight_bias = torch.ops._xnnpack.linear_prepack(weight, bias)

            def forward(self, x):
                return torch.ops._xnnpack.linear_packed(x, self.packed_weight_bias)

        data_shape = [batch_size] + list(data_shape)
        weight = torch.rand((weight_output_dim, data_shape[-1]))
        if use_bias:
            bias = torch.rand((weight_output_dim))
        else:
            bias = None
        scripted_linear = torch.jit.script(Linear(weight, bias))
        scripted_linear_prepacked = torch.jit.script(LinearPrePacked(weight, bias))
        input_data = torch.rand(data_shape)
        ref_result = scripted_linear(input_data)
        output_linear_xnnpack = scripted_linear_prepacked(input_data)
        torch.testing.assert_allclose(ref_result, output_linear_xnnpack, rtol=1e-2, atol=1e-3)

        # Serialize the modules and then deserialize
        input_data = torch.rand(data_shape)
        buffer = io.BytesIO()
        torch.jit.save(scripted_linear, buffer)
        buffer.seek(0)
        deserialized_linear = torch.jit.load(buffer)
        buffer = io.BytesIO()
        torch.jit.save(scripted_linear_prepacked, buffer)
        buffer.seek(0)
        deserialized_linear_prepacked = torch.jit.load(buffer)
        ref_result = deserialized_linear(input_data)
        output_linear_xnnpack = deserialized_linear_prepacked(input_data)
        torch.testing.assert_allclose(ref_result, output_linear_xnnpack, rtol=1e-2, atol=1e-3)

    @given(batch_size=st.integers(0, 3),
           input_channels_per_group=st.integers(1, 32),
           height=st.integers(5, 64),
           width=st.integers(5, 64),
           output_channels_per_group=st.integers(1, 32),
           groups=st.integers(1, 16),
           kernel_h=st.integers(1, 7),
           kernel_w=st.integers(1, 7),
           stride_h=st.integers(1, 2),
           stride_w=st.integers(1, 2),
           pad_h=st.integers(0, 2),
           pad_w=st.integers(0, 2),
           dilation=st.integers(1, 2),
           use_bias=st.booleans())
    def test_conv2d(self,
                    batch_size,
                    input_channels_per_group,
                    height,
                    width,
                    output_channels_per_group,
                    groups,
                    kernel_h,
                    kernel_w,
                    stride_h,
                    stride_w,
                    pad_h,
                    pad_w,
                    dilation,
                    use_bias):
        class Conv2D(torch.nn.Module):
            def __init__(self, weight, bias, strides, paddings, dilations, groups):
                super(Conv2D, self).__init__()
                self.weight = weight
                self.bias = bias
                self.strides = strides
                self.paddings = paddings
                self.dilations = dilations
                self.groups = groups

            def forward(self, x):
                return F.conv2d(x, self.weight, self.bias,
                                self.strides, self.paddings, self.dilations, self.groups)

        class Conv2DPrePacked(torch.nn.Module):
            def __init__(self, weight, bias, strides, paddings, dilations, groups):
                super(Conv2DPrePacked, self).__init__()
                self.packed_weight_bias = torch.ops._xnnpack.conv2d_prepack(weight, bias,
                                                                            strides, paddings, dilations, groups)

            def forward(self, x):
                return torch.ops._xnnpack.conv2d_packed(x, self.packed_weight_bias)

        input_channels = input_channels_per_group * groups
        output_channels = output_channels_per_group * groups
        kernels = (kernel_h, kernel_w)
        strides = (stride_h, stride_w)
        paddings = (pad_h, pad_w)
        dilations = (dilation, dilation)
        assume(height + 2 * paddings[0] >=
               dilations[0] * (kernels[0] - 1) + 1)
        assume(width + 2 * paddings[1] >=
               dilations[1] * (kernels[1] - 1) + 1)

        input_data = torch.rand((batch_size, input_channels, height, width))
        weight = torch.rand((output_channels, input_channels_per_group, kernel_h, kernel_w))
        bias = None
        if use_bias:
            bias = torch.rand((output_channels))

        scripted_conv2d = torch.jit.script(Conv2D(weight, bias,
                                                  strides, paddings, dilations, groups))
        scripted_conv2d_prepacked = torch.jit.script(Conv2DPrePacked(
            weight, bias, strides, paddings, dilations, groups))
        ref_result = scripted_conv2d(input_data)
        xnnpack_result = scripted_conv2d_prepacked(input_data)
        torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)

        # Serialize the modules and then deserialize
        input_data = torch.rand((batch_size, input_channels, height, width))
        buffer = io.BytesIO()
        torch.jit.save(scripted_conv2d, buffer)
        buffer.seek(0)
        deserialized_conv2d = torch.jit.load(buffer)
        buffer = io.BytesIO()
        torch.jit.save(scripted_conv2d_prepacked, buffer)
        buffer.seek(0)
        deserialized_conv2d_prepacked = torch.jit.load(buffer)
        ref_result = deserialized_conv2d(input_data)
        xnnpack_result = deserialized_conv2d_prepacked(input_data)
        torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)

    @given(batch_size=st.integers(0, 3),
           input_channels_per_group=st.integers(1, 32),
           height=st.integers(5, 64),
           width=st.integers(5, 64),
           output_channels_per_group=st.integers(1, 32),
           groups=st.integers(1, 16),
           kernel_h=st.integers(1, 7),
           kernel_w=st.integers(1, 7),
           stride_h=st.integers(1, 2),
           stride_w=st.integers(1, 2),
           pad_h=st.integers(0, 2),
           pad_w=st.integers(0, 2),
           dilation=st.integers(1, 2),
           linear_weight_output_dim=st.integers(2, 64),
           use_bias=st.booleans())
    def test_combined_model(self,
                            batch_size,
                            input_channels_per_group,
                            height,
                            width,
                            output_channels_per_group,
                            groups,
                            kernel_h,
                            kernel_w,
                            stride_h,
                            stride_w,
                            pad_h,
                            pad_w,
                            dilation,
                            linear_weight_output_dim,
                            use_bias):
        class M(torch.nn.Module):
            def __init__(self, conv_weight, conv_bias, linear_weight, linear_bias,
                         strides, paddings, dilations, groups):
                super(M, self).__init__()
                self.conv_weight = conv_weight
                self.conv_bias = conv_bias
                self.linear_weight = linear_weight
                self.linear_bias = linear_bias
                self.strides = strides
                self.paddings = paddings
                self.dilations = dilations
                self.groups = groups

            def forward(self, x):
                o = F.conv2d(x, self.conv_weight, self.conv_bias,
                             self.strides, self.paddings, self.dilations, self.groups)
                o = o.permute([0, 2, 3, 1])
                o = F.linear(o, self.linear_weight, self.linear_bias)
                return F.relu(o)

        class MPrePacked(torch.nn.Module):
            def __init__(self, conv_weight, conv_bias, linear_weight, linear_bias,
                         strides, paddings, dilations, groups):
                super(MPrePacked, self).__init__()
                self.conv2d_packed_weight_bias = \
                    torch.ops._xnnpack.conv2d_prepack(conv_weight, conv_bias,
                                                      strides, paddings, dilations, groups)
                self.linear_packed_weight_bias = \
                    torch.ops._xnnpack.linear_prepack(linear_weight, linear_bias)

            def forward(self, x):
                o = torch.ops._xnnpack.conv2d_packed(x, self.conv2d_packed_weight_bias)
                o = o.permute([0, 2, 3, 1])
                o = torch.ops._xnnpack.linear_packed(o, self.linear_packed_weight_bias)
                return F.relu(o)

        input_channels = input_channels_per_group * groups
        output_channels = output_channels_per_group * groups
        kernels = (kernel_h, kernel_w)
        strides = (stride_h, stride_w)
        paddings = (pad_h, pad_w)
        dilations = (dilation, dilation)
        assume(height + 2 * paddings[0] >=
               dilations[0] * (kernels[0] - 1) + 1)
        assume(width + 2 * paddings[1] >=
               dilations[1] * (kernels[1] - 1) + 1)

        input_data = torch.rand((batch_size, input_channels, height, width))
        conv_weight = torch.rand((output_channels, input_channels_per_group, kernel_h, kernel_w))
        conv_bias = None
        if use_bias:
            conv_bias = torch.rand((output_channels))

        # This is done just to find the output shape of the result
        # so that the shape of weight for the following linear layer
        # can be determined.
        result = F.conv2d(input_data, conv_weight, conv_bias,
                          strides, paddings, dilations, groups)
        linear_input_shape = result.shape[1]

        input_data = input_data.contiguous(memory_format=torch.channels_last)
        linear_weight = torch.rand((linear_weight_output_dim, linear_input_shape))
        linear_bias = None
        if use_bias:
            linear_bias = torch.rand((linear_weight_output_dim))

        scripted_m = torch.jit.script(M(conv_weight, conv_bias, linear_weight,
                                        linear_bias, strides, paddings, dilations, groups))
        scripted_m_prepacked = torch.jit.script(
            MPrePacked(
                conv_weight,
                conv_bias,
                linear_weight,
                linear_bias,
                strides,
                paddings,
                dilations,
                groups))
        ref_result = scripted_m(input_data)
        xnnpack_result = scripted_m_prepacked(input_data)
        torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)

        # Serialize the modules and then deserialize
        input_data = torch.rand((batch_size, input_channels, height, width))
        input_data = input_data.contiguous(memory_format=torch.channels_last)
        buffer = io.BytesIO()
        torch.jit.save(scripted_m, buffer)
        buffer.seek(0)
        deserialized_m = torch.jit.load(buffer)
        buffer = io.BytesIO()
        torch.jit.save(scripted_m_prepacked, buffer)
        buffer.seek(0)
        deserialized_m_prepacked = torch.jit.load(buffer)
        ref_result = deserialized_m(input_data)
        xnnpack_result = deserialized_m_prepacked(input_data)
        torch.testing.assert_allclose(ref_result, xnnpack_result, rtol=1e-2, atol=1e-3)


if __name__ == "__main__":
    run_tests()
torch/backends/xnnpack/__init__.py (new file, 25 lines)
@@ -0,0 +1,25 @@
from __future__ import absolute_import, division, print_function, unicode_literals
import sys
import torch
import types

class _XNNPACKEnabled(object):
    def __get__(self, obj, objtype):
        return torch._C._is_xnnpack_enabled()

    def __set__(self, obj, val):
        raise RuntimeError("Assignment not supported")

class XNNPACKEngine(types.ModuleType):
    def __init__(self, m, name):
        super(XNNPACKEngine, self).__init__(name)
        self.m = m

    def __getattr__(self, attr):
        return self.m.__getattribute__(attr)

enabled = _XNNPACKEnabled()

# This is the sys.modules replacement trick, see
# https://stackoverflow.com/questions/2447353/getattr-on-a-module/7668273#7668273
sys.modules[__name__] = XNNPACKEngine(sys.modules[__name__], __name__)
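This module gives callers a way to feature-test the backend before touching the _xnnpack ops; a small sketch:

    import torch
    import torch.backends.xnnpack

    if torch.backends.xnnpack.enabled:
        # Safe to use the torch.ops._xnnpack.* prepack/packed ops here.
        pass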
@@ -516,6 +516,12 @@ PyObject *THPModule_supportedQEngines(PyObject */* unused */)
  return list.release();
}

PyObject *THPModule_isEnabledXNNPACK(PyObject * /* unused */)
{
  if (at::globalContext().isXNNPACKAvailable()) Py_RETURN_TRUE;
  else Py_RETURN_FALSE;
}

//NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays, modernize-avoid-c-arrays)
static PyMethodDef TorchMethods[] = {
  {"_initExtension", (PyCFunction)THPModule_initExtension, METH_O, nullptr},
@@ -556,6 +562,7 @@ static PyMethodDef TorchMethods[] = {
  {"_get_qengine", (PyCFunction)THPModule_qEngine, METH_NOARGS, nullptr},
  {"_set_qengine", (PyCFunction)THPModule_setQEngine, METH_O, nullptr},
  {"_supported_qengines", (PyCFunction)THPModule_supportedQEngines, METH_NOARGS, nullptr},
  {"_is_xnnpack_enabled", (PyCFunction)THPModule_isEnabledXNNPACK, METH_NOARGS, nullptr},
  {nullptr, nullptr, 0, nullptr}
};