Remove caffe2 contrib and experiments (#125038)

This PR splits out a smaller piece of #122527.
To be noted, it was inspired by and co-developed with @r-barnes.

Pull Request resolved: https://github.com/pytorch/pytorch/pull/125038
Approved by: https://github.com/malfet
This commit is contained in:
cyy
2024-04-29 06:27:13 +00:00
committed by PyTorch MergeBot
parent 555f1aeb02
commit 5585138db9
189 changed files with 1 addition and 37411 deletions

View File

@ -446,7 +446,6 @@ cu_library(
# caffe2
CAFFE2_COPTS = COMMON_COPTS + [
"-Dcaffe2_EXPORTS",
"-DCAFFE2_USE_GLOO",
"-DCAFFE2_USE_CUDNN",
"-DCAFFE2_BUILD_MAIN_LIB",
"-fvisibility-inlines-hidden",
@ -454,22 +453,6 @@ CAFFE2_COPTS = COMMON_COPTS + [
"-fno-trapping-math",
]
filegroup(
name = "caffe2_contrib_srcs",
srcs = [
"caffe2/contrib/aten/aten_op.cc",
"caffe2/contrib/gloo/allgather_ops.cc",
"caffe2/contrib/gloo/allreduce_ops.cc",
"caffe2/contrib/gloo/barrier_ops.cc",
"caffe2/contrib/gloo/broadcast_ops.cc",
"caffe2/contrib/gloo/common.cc",
"caffe2/contrib/gloo/common_world_ops.cc",
"caffe2/contrib/gloo/context.cc",
"caffe2/contrib/gloo/reduce_scatter_ops.cc",
"caffe2/contrib/gloo/store_handler.cc",
],
)
filegroup(
name = "caffe2_core_srcs",
srcs = [
@ -1024,10 +1007,6 @@ filegroup(
filegroup(
name = "caffe2_cuda_cpp_srcs",
srcs = [
"caffe2/contrib/aten/aten_op_gpu.cc",
"caffe2/contrib/gloo/allreduce_ops_gpu.cc",
"caffe2/contrib/gloo/broadcast_ops_gpu.cc",
"caffe2/contrib/gloo/common_world_ops_gpu.cc",
"caffe2/core/blob_serialization_gpu.cc",
"caffe2/core/common_cudnn.cc",
"caffe2/core/common_gpu.cc",
@ -1271,35 +1250,10 @@ cc_library(
],
)
py_binary(
name = "gen_op",
srcs = ["caffe2/contrib/aten/gen_op.py"],
deps = ["//torchgen"],
)
genrule(
name = "generated_caffe2_aten_op_headers",
srcs = [
"caffe2/contrib/aten/aten_op_template.h",
"aten/src/ATen/Declarations.yaml",
],
outs = ["caffe2/caffe2/contrib/aten/gen_aten_op.h"],
cmd = """
$(location :gen_op) \
--output_prefix gen_ \
--install_dir $(@D) \
--aten_root `dirname $(location aten/src/ATen/Declarations.yaml)`/../.. \
--template_dir `dirname $(location caffe2/contrib/aten/aten_op_template.h)` \
--yaml_dir `dirname $(location aten/src/ATen/Declarations.yaml)`""",
tools = [":gen_op"],
)
cc_library(
name = "caffe2_headers",
hdrs = glob(
[
"caffe2/contrib/aten/*.h",
"caffe2/contrib/gloo/*.h",
"caffe2/core/*.h",
"caffe2/core/nomnigraph/include/nomnigraph/Converters/*.h",
"caffe2/core/nomnigraph/include/nomnigraph/Generated/*.h",
@ -1338,10 +1292,9 @@ cc_library(
) + if_cuda(glob([
"caffe2/**/*.cuh",
"caffe2/image/*.h",
])) + [":generated_caffe2_aten_op_headers"],
])),
copts = CAFFE2_COPTS,
includes = [
"caffe2/contrib/aten",
"caffe2/core/nomnigraph/include",
],
visibility = ["//visibility:public"],
@ -1385,7 +1338,6 @@ cc_library(
"caffe2/db/create_db_op.cc",
"caffe2/db/protodb.cc",
"caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc",
":caffe2_contrib_srcs",
":caffe2_core_srcs",
":caffe2_distributed_srcs",
":caffe2_ideep_srcs",
@ -1419,7 +1371,6 @@ cc_library(
"@fbgemm//:fbgemm_src_headers",
"@fmt",
"@foxi",
"@gloo",
"@onnx",
] + if_cuda(
[
@ -1467,7 +1418,6 @@ cu_library(
"@cuda//:curand",
"@cudnn",
"@eigen",
"@gloo",
"@tensorpipe//:tensorpipe_cuda",
],
alwayslink = True,

View File

@ -59,23 +59,7 @@ if(INTERN_BUILD_ATEN_OPS)
# Generate the headers wrapped by our operator
file(GLOB_RECURSE torchgen_python "${PROJECT_SOURCE_DIR}/torchgen/*.py")
add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/contrib/aten/aten_op.h
COMMAND
"${PYTHON_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten/gen_op.py
--aten_root=${CMAKE_CURRENT_SOURCE_DIR}/../aten
--template_dir=${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten
--yaml_dir=${CMAKE_BINARY_DIR}/aten/src/ATen
--install_dir=${CMAKE_CURRENT_BINARY_DIR}/contrib/aten
DEPENDS
${torchgen_python}
${CMAKE_BINARY_DIR}/aten/src/ATen/Declarations.yaml
${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten/gen_op.py
${CMAKE_CURRENT_SOURCE_DIR}/contrib/aten/aten_op_template.h)
add_custom_target(__aten_op_header_gen
DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/contrib/aten/aten_op.h)
add_library(aten_op_header_gen INTERFACE)
add_dependencies(aten_op_header_gen __aten_op_header_gen)
# Add source, includes, and libs to lists
list(APPEND Caffe2_CPU_SRCS ${ATen_CPU_SRCS})
@ -132,7 +116,6 @@ endif()
# Skip modules that are not used by libtorch mobile yet.
if(BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
add_subdirectory(contrib)
add_subdirectory(predictor)
add_subdirectory(predictor/emulator)
add_subdirectory(core/nomnigraph)
@ -141,7 +124,6 @@ if(BUILD_CAFFE2 AND NOT INTERN_BUILD_MOBILE)
endif()
add_subdirectory(db)
add_subdirectory(distributed)
# add_subdirectory(experiments) # note, we may remove this folder at some point
add_subdirectory(ideep)
add_subdirectory(image)
add_subdirectory(video)

View File

@ -1,37 +0,0 @@
add_subdirectory(aten)
add_subdirectory(nccl)
add_subdirectory(opencl)
add_subdirectory(prof)
add_subdirectory(shm_mutex)
add_subdirectory(fakelowp)
if(USE_TENSORRT)
add_subdirectory(tensorrt)
endif()
# Only build Gloo Caffe2 ops on Linux, as it hardcodes
# the Linux-specific `gloo::transport::tcp` namespace.
if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux")
add_subdirectory(gloo)
endif()
# Pass the src lists back to the parent
# CPU source, include, deps, test sources, binary sources
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_INCLUDE ${Caffe2_CPU_INCLUDE} PARENT_SCOPE)
set(Caffe2_DEPENDENCY_LIBS ${Caffe2_DEPENDENCY_LIBS} PARENT_SCOPE)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_BINARY_SRCS ${Caffe2_CPU_BINARY_SRCS} PARENT_SCOPE)
# GPU source, include, deps, test sources, binary sources
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_INCLUDE ${Caffe2_GPU_INCLUDE} PARENT_SCOPE)
set(Caffe2_CUDA_DEPENDENCY_LIBS ${Caffe2_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE)
set(Caffe2_GPU_TEST_SRCS ${Caffe2_GPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_BINARY_SRCS ${Caffe2_GPU_BINARY_SRCS} PARENT_SCOPE)
# HIP sources, include, test sources
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
set(Caffe2_HIP_INCLUDE ${Caffe2_HIP_INCLUDE} PARENT_SCOPE)
set(Caffe2_HIP_DEPENDENCY_LIBS ${Caffe2_HIP_DEPENDENCY_LIBS} PARENT_SCOPE)
set(Caffe2_HIP_TEST_SRCS ${Caffe2_HIP_TEST_SRCS} PARENT_SCOPE)

View File

@ -1,12 +0,0 @@
if(NOT INTERN_BUILD_MOBILE AND BUILD_CAFFE2_OPS)
# Add source generated by Codegen.cmake and pass to parent
list(APPEND Caffe2_CPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/aten_op.cc)
list(APPEND Caffe2_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/aten_op_gpu.cc)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
if(USE_ROCM)
list(APPEND Caffe2_HIP_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/hip/aten_op_gpu.cc)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
endif()
endif()

View File

@ -1,80 +0,0 @@
# An ATen operator for Caffe2
ATen is a simple tensor library that exposes the Tensor operations in Torch
and PyTorch directly in C++17. This library provides a generated wrapper around the ATen API
that makes these functions available in Caffe2 as an operator. It also makes them accessible through
ToffeeIR.
### Example Usage in Caffe2
First identify a function in ATen you want to call in Functions.h,
Tensor.h, or Type.h.
We will call the `pow` operator:
```
static inline Tensor pow(const Tensor & self, Scalar exponent);
```
Now create a Caffe2 operator to call this op. The name of the operator is always `"ATen"`,
and there is always a string attribute `operator` that defines which ATen function to call:
```
import numpy as np
from caffe2.python import core, workspace
# create the Caffe2 Op:
op = core.CreateOperator(
"ATen",
["MyInput"],
["MyOutput"],
operator="pow", exponent=2.0)
```
Each `Tensor` input becomes a Caffe2 input blob, and each output becomes a Caffe2 output blob.
Non-tensor inputs such as `Scalar exponent` become Caffe2 `arg` attributes.
In the case of `Scalar`, the attribute can be either an integer or a floating-point number.
The op can now be run like any other Caffe2 operator:
```
workspace.FeedBlob("MyInput",np.random.randn(2,3).astype(np.float32))
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("MyOutput"))
```
For methods, the first input is always the `this` Tensor in C++.
To call methods of ATen's `Type` objects, you provide an additional string attribute
that determines the type:
```
# create a 2x4 tensor filled with floating point ones
op = core.CreateOperator(
"ATen",
[],
["MyOutput"],
operator="ones", type="Float", size={2,4})
```
Generally ATen operators are polymorphic across input types, and work on both the CPU and CUDA.
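For example, the same `pow` op can be placed on a GPU by creating it inside a CUDA device scope. This is a minimal sketch, assuming a CUDA-enabled Caffe2 build; the device index 0 is illustrative:
```
from caffe2.python import core
from caffe2.proto import caffe2_pb2

# create the same ATen op, but bound to GPU 0 instead of the CPU
with core.DeviceScope(core.DeviceOption(caffe2_pb2.CUDA, 0)):
    gpu_op = core.CreateOperator(
        "ATen",
        ["MyInput"],
        ["MyOutput"],
        operator="pow", exponent=2.0)
```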
### Example Usage via PyTorch Symbolic
The ATen operator can also be used to define `symbolic` definitions for PyTorch when an operator is being exported
to ONNX. In this case, the definition of the operator looks the same but is defined using PyTorch's ONNX API:
```
class Add(torch.autograd.Function):
@staticmethod
def symbolic(g, a, b):
return g.at("add", a, b)
@staticmethod
def forward(ctx, a, b):
return a + b
```
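To see the resulting graph, the function can be exported through `torch.onnx.export`. This is a minimal sketch; the `AddModule` wrapper and the output file name are illustrative:
```
import torch
from torch import nn

class AddModule(nn.Module):
    def forward(self, a, b):
        # uses the Add function defined above, including its symbolic method
        return Add.apply(a, b)

# the exported graph contains an ATen[operator="add"] node
torch.onnx.export(AddModule(), (torch.ones(3, 4), torch.ones(3, 4)),
                  "add.onnx", verbose=True)
```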

View File

@ -1,56 +0,0 @@
#include "caffe2/contrib/aten/aten_op.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
namespace internal {
at::Tensor index_with_uint8_handling(
const at::Tensor& self,
const torch::List<std::optional<at::Tensor>>& indices) {
// Support BC only for the simplest case of mask indexing
if (indices.size() == 1) {
std::optional<at::Tensor> first = indices[0];
if (first.has_value()
&& first->scalar_type() == at::kByte) {
TORCH_WARN(
"Indexing with uint8 mask tensor in ATenOp is now deprecated,"
" please use a bool mask instead.");
return at::index(self, {first->to(at::kBool)});
}
}
return at::index(self, indices);
}
} // namespace internal
REGISTER_CPU_OPERATOR(ATen, ATenOp<CPUContext>);
template <>
at::Backend ATenOp<CPUContext>::backend() const {
return at::Backend::CPU;
}
OPERATOR_SCHEMA(ATen);
namespace math {
template <>
void Set<at::Half, CPUContext>(
const std::int64_t /* N */,
const at::Half h,
at::Half* v,
CPUContext* c) {
Set(0, h.x, (uint16_t*)v, c);
}
template <>
void Set<at::BFloat16, CPUContext>(
const std::int64_t /* N */,
const at::BFloat16 b,
at::BFloat16* v,
CPUContext* c) {
Set(0, b.x, (uint16_t*)v, c);
}
} // namespace math
} // namespace caffe2

View File

@ -1 +0,0 @@
#include "caffe2/caffe2/contrib/aten/gen_aten_op.h"

View File

@ -1,12 +0,0 @@
#include "caffe2/contrib/aten/aten_op.h"
#include "caffe2/core/context_gpu.h"
namespace caffe2 {
REGISTER_CUDA_OPERATOR(ATen, ATenOp<CUDAContext>);
template<>
at::Backend ATenOp<CUDAContext>::backend() const {
return at::Backend::CUDA;
}
}

View File

@ -1,237 +0,0 @@
#pragma once
#include <unordered_map>
#include <string>
#include <ATen/Functions.h>
#include <c10/macros/Macros.h>
#include <c10/util/irange.h>
#include <caffe2/core/context.h>
#include <caffe2/core/operator.h>
#include <caffe2/utils/math.h>
#include <iostream>
// a map from descriptor strings (see [DESCRIPTORS])
// to the key in the switch statement that implements them
static std::unordered_map<std::string, int> op_to_key = {
${mappings}
};
namespace caffe2 {
using at::Half; // for AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, ...)
namespace internal {
TORCH_API at::Tensor index_with_uint8_handling(
const at::Tensor& self,
const torch::List<std::optional<at::Tensor>>& indices);
}
template <class Context>
class ATenOp : public Operator<Context> {
public:
ATenOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws) {
VLOG(2) << "ATen OpDef: " << ProtoDebugString(operator_def) << "\n";
switch(findImplementation(operator_def)) {
${cases}
default:
CAFFE_THROW("Unexpected key value for aten operator");
}
}
USE_OPERATOR_CONTEXT_FUNCTIONS;
bool RunOnDevice() override {
return run_op();
}
private:
// actual operator implementation is initialized in ctor.
std::function<bool()> run_op;
at::Backend backend() const;
TypeMeta typeMetaFor(const at::Tensor & t) {
return typeMetaFor(t.scalar_type());
}
TypeMeta typeMetaFor(at::ScalarType st) {
#define DEFINE_CASE(ctype,aten_name) \
case at::k##aten_name: \
return TypeMeta::Make<ctype>();
switch(st) {
AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, DEFINE_CASE)
default:
CAFFE_THROW("Unknown ATen Type");
}
#undef DEFINE_CASE
}
at::TensorOptions optionsFor(const Tensor& ten) {
at::Device device = ten.GetDevice();
#if defined(USE_ROCM)
if (backend() == at::Backend::HIP) {
device = at::Device(kCUDA, device.index());
}
#endif
return at::TensorOptions(device).dtype(ten.dtype());
}
at::Tensor tensorWrapping(const Tensor& ten_) {
auto& ten = const_cast<Tensor&>(ten_);
return at::from_blob(
ten.raw_mutable_data(),
ten.sizes(),
optionsFor(ten));
}
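// The operator's inputs form a flat list; peek(i, N) treats the last N Caffe2
// inputs as a window and wraps the i-th tensor of that window as an ATen tensor.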
at::Tensor peek(size_t i, size_t N) {
auto real_idx = InputSize() - N + i;
return tensorWrapping(Input(real_idx));
}
std::vector<at::Tensor> peekSlice(size_t i, size_t len, size_t N) {
std::vector<at::Tensor> results;
results.reserve(len);
for (size_t ii = i; ii < i + len; ++ii) {
results.push_back(peek(ii, N));
}
return results;
}
torch::List<std::optional<at::Tensor>> peekSliceOptionals(size_t i, size_t len, size_t N) {
torch::List<std::optional<at::Tensor>> results;
results.reserve(len);
for (size_t ii = i; ii < i + len; ++ii) {
results.push_back(peek(ii, N));
}
return results;
}
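// Hand the ATen result to the Caffe2 output without copying: the output blob
// shares the ATen storage, and the deleter decrefs the TensorImpl once Caffe2
// releases the blob.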
void assignTo(Tensor* dst, const at::Tensor& src_) {
at::Tensor src = src_.contiguous();
auto at_sizes = src.sizes();
caffe2::TypeMeta type_meta = typeMetaFor(src);
at::Device device = src.device();
#if defined(USE_ROCM)
if (device.is_cuda()) {
device = at::Device(at::DeviceType::HIP, device.index());
}
#endif
at::TensorImpl* src_impl = src.unsafeReleaseTensorImpl();
std::vector<int64_t> dims(at_sizes.begin(), at_sizes.end());
dst->Resize(dims);
dst->ShareExternalPointer(
at::DataPtr(
src_impl->mutable_data(),
static_cast<void*>(src_impl),
[](void* t_ptr) -> void {
at::TensorImpl* local_impl = static_cast<at::TensorImpl*>(t_ptr);
c10::raw::intrusive_ptr::decref(local_impl);
},
device),
type_meta,
0);
}
void assignListStartingAt(
size_t offset,
const std::vector<at::Tensor>& tensors) {
for (const auto i : c10::irange(tensors.size())) {
assignTo(Output(offset + i), tensors[i]);
}
}
template<typename T,
typename std::enable_if<std::numeric_limits<T>::is_integer, bool>::type* =
nullptr>
int64_t extract(const at::Scalar &s) {
return s.toLong();
}
template<typename T,
typename std::enable_if<!std::numeric_limits<T>::is_integer, bool>::type* =
nullptr>
int64_t extract(const at::Scalar &s) {
return s.toDouble();
}
void assignTo(Tensor* dst, at::ScalarType scalar_type, const at::Scalar& scalar) {
switch(scalar_type) {
#define DEFINE_CASE(ctype,aten_name) \
case at::k##aten_name: { \
auto value = extract<ctype>(scalar); \
assignToValue<ctype>(dst, at::convert<ctype,decltype(value)>(value)); \
} break;
AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, DEFINE_CASE)
#undef DEFINE_CASE
default:
CAFFE_THROW("Unknown ATen Type");
}
}
template <typename T>
void assignToValue(Tensor* dst, T v) {
dst->Resize(std::vector<int64_t>());
math::Set(1, v, dst->template mutable_data<T>(), &context_);
}
int findImplementation(const OperatorDef& operator_def) {
CAFFE_ENFORCE(HasArgument("operator"));
std::string op = OperatorBase::GetSingleArgument<std::string>("operator", "");
// construct descriptor string ([DESCRIPTORS]) given the attributes
// and inputs of this operator_def, and look up the implementation key
// for this variant
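// (illustrative: `pow` called with one tensor input and an `exponent`
// attribute yields the descriptor "pow-exponent-1")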
std::stringstream descriptor;
descriptor << op;
std::vector<std::string> attrs;
for (const auto i : c10::irange(operator_def.arg_size())) {
auto & attr = operator_def.arg(i);
if (attr.name() == "operator" || attr.name() == "type" || attr.name() == "overload_name") {
continue;
}
attrs.push_back(attr.name());
}
std::sort(attrs.begin(), attrs.end());
for(auto & a : attrs)
descriptor << "-" << a;
std::string descriptor_sized =
descriptor.str() + "-" + c10::to_string(InputSize());
std::string descriptor_var_args = descriptor.str() + "-*";
if (op_to_key.count(descriptor_sized) > 0) {
return op_to_key[descriptor_sized];
}
if (op_to_key.count(descriptor_var_args) > 0) {
return op_to_key[descriptor_var_args];
}
std::stringstream ss;
ss << "Attempting to run unknown ATen operator configuration: "
<< descriptor_sized;
CAFFE_THROW(ss.str());
}
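// A scalar attribute arrives from Caffe2 as either an int64 or a float
// argument; the integer form is checked first.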
at::Scalar readScalarAttribute(const std::string & name) {
if(OperatorBase::HasSingleArgumentOfType<int64_t>(name)) {
return OperatorBase::GetSingleArgument<int64_t>(name, 0);
} else {
CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType<float>(name));
return OperatorBase::GetSingleArgument<float>(name, 0);
}
}
template<typename T>
T readAttribute(const std::string & name) {
CAFFE_ENFORCE(OperatorBase::HasSingleArgumentOfType<T>(name));
return OperatorBase::GetSingleArgument<T>(name, 0);
}
std::vector<int64_t> readIntArrayRef(const std::string & name) {
CAFFE_ENFORCE(OperatorBase::HasArgument(name));
return OperatorBase::GetRepeatedArgument<int64_t>(name, {});
}
template <int N>
std::array<bool, N> readBoolMask(const std::string& name) {
CAFFE_ENFORCE(OperatorBase::HasArgument(name));
std::vector<int64_t> ints =
OperatorBase::GetRepeatedArgument<int64_t>(name, {});
std::array<bool, N> result;
for (const auto i : c10::irange(N)) {
result[i] = ints.at(i);
}
return result;
}
${implementations}
};
}

View File

@ -1,131 +0,0 @@
from caffe2.python import core
from hypothesis import given
import caffe2.python.hypothesis_test_util as hu
import hypothesis.strategies as st
import numpy as np
class TestATen(hu.HypothesisTestCase):
@given(inputs=hu.tensors(n=2), **hu.gcs)
def test_add(self, inputs, gc, dc):
op = core.CreateOperator(
"ATen",
["X", "Y"],
["Z"],
operator="add")
def ref(X, Y):
return [X + Y]
self.assertReferenceChecks(gc, op, inputs, ref)
@given(inputs=hu.tensors(n=2, dtype=np.float16), **hu.gcs_gpu_only)
def test_add_half(self, inputs, gc, dc):
op = core.CreateOperator(
"ATen",
["X", "Y"],
["Z"],
operator="add")
def ref(X, Y):
return [X + Y]
self.assertReferenceChecks(gc, op, inputs, ref)
@given(inputs=hu.tensors(n=1), **hu.gcs)
def test_pow(self, inputs, gc, dc):
op = core.CreateOperator(
"ATen",
["S"],
["Z"],
operator="pow", exponent=2.0)
def ref(X):
return [np.square(X)]
self.assertReferenceChecks(gc, op, inputs, ref)
@given(x=st.integers(min_value=2, max_value=8), **hu.gcs)
def test_sort(self, x, gc, dc):
inputs = [np.random.permutation(x)]
op = core.CreateOperator(
"ATen",
["S"],
["Z", "I"],
operator="sort")
def ref(X):
return [np.sort(X), np.argsort(X)]
self.assertReferenceChecks(gc, op, inputs, ref)
@given(inputs=hu.tensors(n=1), **hu.gcs)
def test_sum(self, inputs, gc, dc):
op = core.CreateOperator(
"ATen",
["S"],
["Z"],
operator="sum")
def ref(X):
return [np.sum(X)]
self.assertReferenceChecks(gc, op, inputs, ref)
@given(**hu.gcs)
def test_index_uint8(self, gc, dc):
# Indexing with uint8 is deprecated, but we need to provide backward compatibility for some old models exported through ONNX
op = core.CreateOperator(
"ATen",
['self', 'mask'],
["Z"],
operator="index")
def ref(self, mask):
return (self[mask.astype(np.bool_)],)
tensor = np.random.randn(2, 3, 4).astype(np.float32)
mask = np.array([[1, 0, 0], [1, 1, 0]]).astype(np.uint8)
self.assertReferenceChecks(gc, op, [tensor, mask], ref)
@given(**hu.gcs)
def test_index_put(self, gc, dc):
op = core.CreateOperator(
"ATen",
['self', 'indices', 'values'],
["Z"],
operator="index_put")
def ref(self, indices, values):
self[indices] = values
return (self,)
tensor = np.random.randn(3, 3).astype(np.float32)
mask = np.array([[True, True, True], [True, False, False], [True, True, False]])
values = np.random.randn(6).astype(np.float32)
self.assertReferenceChecks(gc, op, [tensor, mask, values], ref)
@given(**hu.gcs)
def test_unique(self, gc, dc):
op = core.CreateOperator(
"ATen",
['self'],
["output"],
sorted=True,
return_inverse=True,
# return_counts=False,
operator="_unique")
def ref(self):
index, _ = np.unique(self, return_index=False, return_inverse=True, return_counts=False)
return (index,)
tensor = np.array([1, 2, 6, 4, 2, 3, 2])
print(ref(tensor))
self.assertReferenceChecks(gc, op, [tensor], ref)
if __name__ == "__main__":
import unittest
unittest.main()

View File

@ -1,157 +0,0 @@
# Using ONNX and ATen to export models from PyTorch to Caffe2
When using ONNX to export a model from PyTorch into Caffe2, you sometimes end up
hitting operators that are not yet part of the ONNX specification. These may be
operators that haven't been standardized yet, or custom `torch.autograd.Function` types that
are specific to a network.
To bridge this gap, we provide an experimental operator in ONNX that allows you to directly access PyTorch's tensor functions using the ATen library.
[ATen](https://github.com/pytorch/pytorch/tree/main/aten) is the underlying C++ library that PyTorch uses to do tensor operations. Caffe2 has an [ATen operator](https://github.com/pytorch/pytorch/tree/main/caffe2/contrib/aten)
that can run these tensor functions in a Caffe2 network after importing them through ONNX.
This guide explains how to configure Caffe2 and modify your PyTorch program to use
this functionality.
### Enable ATen in Caffe2
The ATen facility in Caffe2 is part of a contrib package and needs to be enabled
when you configure Caffe2 using cmake:
```
git clone https://github.com/caffe2/caffe2/
mkdir caffe2/build
cd caffe2/build
cmake -DUSE_ATEN=ON <other build options> ..
make install
```
### Describe How to Export a PyTorch Autograd Function using ATen
To export a model to ONNX, PyTorch first creates a trace of all the `torch.autograd.Function`s run
in the forward pass of a network. For each function in the trace, it calls that function's
`symbolic` method which describes how to construct the part of the ONNX graph
that will compute this function (see [basic_ops.py](https://github.com/pytorch/pytorch/blob/main/torch/autograd/_functions/basic_ops.py#L59) for examples).
When equivalent ONNX operators do not exist, you can instead call any ATen function.
As an example let's assume we have an autograd function which computes `x*x+y`:
```
class MyFunction(Function):
@staticmethod
def forward(ctx, x, y):
return x*x + y
```
We can add a `symbolic` method to it like so:
```
class MyFunction(Function):
@staticmethod
def forward(ctx, x, y):
return x*x + y
@staticmethod
def symbolic(graph, x, y):
x2 = graph.at("mul", x, x)
r = graph.at("add", x2, y)
# x, y, x2, and r are 'Node' objects
# print(r) or print(graph) will print out a textual representation for debugging.
# this representation will be converted to ONNX protobufs on export.
return r
```
The function `graph.at` adds a new ATen op to the computation graph.
You can call any ATen function using this facility. To do so,
first identify a function in ATen you want to call in Functions.h,
Tensor.h, or Type.h.
As an example, we might want to call the `pow` operator:
```
static inline Tensor pow(const Tensor & self, Scalar exponent);
```
We can translate this into the equivalent `graph.at` function:
```
def symbolic(graph, x):
graph.at("pow", x, exponent_f = 2.0) # compute x**2
```
Tensor arguments to ATen functions become arguments to `graph.at`, while a `Scalar`
like `exponent` becomes a keyword argument that specifies an ONNX attribute.
Attributes are suffixed with their type (`_f` for floats, `_i` for integers, and `_s` for strings).
For methods, the first input is always the `this` Tensor in C++.
To call methods of ATen's `Type` objects, you provide an additional string attribute
that determines the type. For instance, `ones` creates a new constant tensor of all ones:
```
class Type {
...
virtual Tensor ones(IntArrayRef size) const;
...
};
```
From PyTorch it can be created by adding the type as an additional attribute:
```
def symbolic(graph, x):
return graph.at("ones", type_s="float", size_i=[2,4])
```
Generally ATen operators are polymorphic across input types, and work on both the CPU and CUDA.
## Putting it together
With these building blocks we can now write and export networks that include custom operators using `torch.onnx.export`:
```
class MyModule(nn.Module):
def forward(self, x, y):
# you can combine your ATen ops with standard onnx ones
x = nn.ReLU()(x)
return MyFunction.apply(x, y)
torch.onnx.export(MyModule(),
(Variable(torch.ones(3,4)), Variable(torch.ones(3,4))),
"output.onnx",
verbose=True)
```
This exports the following graph, which contains calls to the `ATen` operator:
```
graph(%1 : Float(3, 4)
%2 : Float(3, 4)) {
%3 : Float(3, 4) = Relu(%1), uses = [%4.i0, %4.i1];
%4 : UNKNOWN_TYPE = ATen[operator=mul](%3, %3), uses = [%5.i0];
%5 : Float(3, 4) = ATen[operator=add](%4, %2), uses = [%0.i0];
return (%5);
}
```
The graph can then be imported using ONNX and run with Caffe2:
```
import onnx
import caffe2.python.onnx.backend
import numpy as np
graph = onnx.load("output.onnx")
a = np.random.randn(3, 4).astype(np.float32)
b = np.random.randn(3, 4).astype(np.float32)
prepared_backend = caffe2.python.onnx.backend.prepare(graph)
W = {graph.graph.input[0].name: a, graph.graph.input[1].name: b}
c2_out = prepared_backend.run(W)[0]
x = np.maximum(a, 0)
r = x*x + b
np.testing.assert_array_almost_equal(r, c2_out)
```
### Code
For the full source code for this tutorial, see [sample.py](sample.py).

View File

@ -1,56 +0,0 @@
import tempfile
import numpy as np
from torch import nn
from torch.autograd import Variable, Function
import torch.onnx
import onnx
import caffe2.python.onnx.backend
class MyFunction(Function):
@staticmethod
def forward(ctx, x, y):
return x * x + y
@staticmethod
def symbolic(graph, x, y):
x2 = graph.at("mul", x, x)
r = graph.at("add", x2, y)
# x, y, x2, and r are 'Node' objects
# print(r) or print(graph) will print out a textual representation for debugging.
# this representation will be converted to ONNX protobufs on export.
return r
class MyModule(nn.Module):
def forward(self, x, y):
# you can combine your ATen ops with standard onnx ones
x = nn.ReLU()(x)
return MyFunction.apply(x, y)
f = tempfile.NamedTemporaryFile()
torch.onnx.export(MyModule(),
(Variable(torch.ones(3, 4)), Variable(torch.ones(3, 4))),
f, verbose=True)
# prints the graph for debugging:
# graph(%input : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu),
# %y : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu)):
# %2 : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu) = onnx::Relu(%input)
# %3 : Tensor = aten::ATen[operator="mul"](%2, %2)
# %4 : Float(3, 4, strides=[4, 1], requires_grad=0, device=cpu) = aten::ATen[operator="add"](%3, %y)
# return (%4)
graph = onnx.load(f.name)
a = np.random.randn(3, 4).astype(np.float32)
b = np.random.randn(3, 4).astype(np.float32)
prepared_backend = caffe2.python.onnx.backend.prepare(graph)
W = {graph.graph.input[0].name: a, graph.graph.input[1].name: b}
c2_out = prepared_backend.run(W)[0]
x = np.maximum(a, 0)
r = x * x + b
np.testing.assert_array_almost_equal(r, c2_out)

View File

@ -1,330 +0,0 @@
#!/usr/bin/env python3
# Copyright (c) 2016-present, Facebook, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
##############################################################################
import sys
import yaml
import argparse
import os
from copy import deepcopy
from typing import Dict, List, Set
parser = argparse.ArgumentParser()
parser.add_argument("--template_dir", default=".", help="where template.h is")
parser.add_argument("--yaml_dir", default="aten/src/ATen/ATen",
help="where ATen yaml files are")
parser.add_argument("--output_prefix", default="", help="")
parser.add_argument(
"--install_dir", default=".", help="where to put generated file")
parser.add_argument("--aten_root", default="", help="root directory of aten")
args, _ = parser.parse_known_args()
if args.aten_root:
if not os.path.exists(args.aten_root):
raise ValueError('aten_root ({}) does not exist'.format(
args.aten_root))
sys.path.insert(0, os.path.join(args.aten_root, '..'))
from torchgen.code_template import CodeTemplate as CT
else:
from torchgen.code_template import CodeTemplate as CT
OP_TEMPLATE = CT.from_file(
os.path.join(args.template_dir, 'aten_op_template.h'))
try:
# use faster C loader if available
from yaml import CSafeLoader as Loader
except ImportError:
from yaml import SafeLoader as Loader # type: ignore[assignment, misc]
def write(filename, s):
with open(filename, "w") as f:
f.write(s)
def read(filename):
with open(filename, "r") as f:
return f.read()
def value_has_tensors(v):
# Sparse shouldn't appear in the public API; this seems to be a temporary bug
return "Tensor" in v['dynamic_type'] and "Sparse" not in v['dynamic_type']
def value_is_tensor_type(v):
return value_has_tensors(v) and v['dynamic_type'] not in TENSORLIST_TYPE
TENSORLIST_TYPE = [
'at::TensorList',
'const at::ITensorListRef &',
'const c10::List<::std::optional<at::Tensor>> &',
]
# for each aten type, how do we handle a return value of that type?
RETURN_MAP = {
'at::Tensor': 'assignTo(Output(${offset}),${output});',
'at::Scalar': 'assignTo(Output(${offset}),${output}.type(), ${output});',
'bool': 'assignToValue<int64_t>(Output(${offset}),${output});',
'int64_t': 'assignToValue<int64_t>(Output(${offset}),${output});',
'::std::vector<at::Tensor>': 'assignListStartingAt(${offset}, ${output});',
}
# for each non-Tensor aten argument, how do we read it from caffe2's
# attribute list. Most of these call runtime functions defined in the
# template class.
ARGUMENT_MAP = {
'const at::Scalar &': 'at::Scalar ${arg} = readScalarAttribute("${arg}");',
'bool': 'bool ${arg} = readAttribute<int64_t>("${arg}");',
'int': 'int ${arg} = readAttribute<int64_t>("${arg}");',
'double': 'double ${arg} = readAttribute<float>("${arg}");',
'int64_t': 'int64_t ${arg} = readAttribute<int64_t>("${arg}");',
'at::IntArrayRef': 'auto ${arg} = readIntArrayRef("${arg}");',
'::std::array<bool,2>': 'auto ${arg} = readBoolMask<2>("${arg}");',
'::std::array<bool,3>': 'auto ${arg} = readBoolMask<3>("${arg}");',
}
# for BC reasons we want to route some of the functions to different
# implementations
SPECIAL_IMPLEMENTATIONS = {
'index': 'internal::index_with_uint8_handling',
}
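# Generate one variant of the declaration per trailing defaulted argument, so
# calls that omit some optional attributes still match a descriptor.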
def expand(o):
num_defaults = sum(1 if 'default' in arg else 0 for arg in o['arguments'])
results = [o]
for i in range(0, num_defaults):
# last num_default values should be default
assert('default' in o['arguments'][-(i + 1)])
v = deepcopy(o)
v['arguments'] = v['arguments'][:-(i + 1)]
results.append(v)
return results
# filter the list of declarations removing things we cannot support
def supports(o, factory_methods):
# Ignore all families (!) of functions that have TensorOptions (i.e. tensor factory methods).
if o['name'] in factory_methods:
if factory_methods[o['name']] == 0:
print("Skipping {} because it is a factory method".format(o['name']))
factory_methods[o['name']] += 1
return False
# skip all in-place operators for now since aten cannot Resize
# caffe2 memory inside an operator
if o['inplace']:
return False
# _out variants also work in-place on arguments taken as destinations
# we also cannot handle these because aten cannot resize caffe2 Tensors
if "_out" in o['name']:
return False
# skip if there is no return value (previously 'void')
if len(o['returns']) == 0:
return False
# skip return types we cannot handle
for ret in o['returns']:
if not value_has_tensors(ret) and ret['type'] not in RETURN_MAP:
print("Skipping {} Because of Ret: {} ({})".format(
o['name'], ret['type'], ret['dynamic_type']))
return False
# skip arguments we cannot handle
for arg in o['arguments']:
if not value_has_tensors(arg) and arg['type'] not in ARGUMENT_MAP:
print("Skipping {} Because of Arg: {} ({}) ".format(
o['name'], arg['type'], arg['dynamic_type']))
return False
return True
# template for each potential operator.
# each operator has an integer 'key' associated with it, and
# a lambda that defines the operator
# non-tensor attributes are created in ${initialization}
# and then saved as arguments to the lambda
# Inputs/Outputs are read inside the lambda
#
# each implementation is defined in a separate method annotated with
# C10_NOINLINE to avoid inlining into the ATenOp constructor, which would
# trigger pathological compile times.
IMPLEMENTATION_TEMPLATE = CT("""\
C10_NOINLINE void implementation_${key}() { // ${name}
${initialization}
run_op = [=] {
at::AutoDispatchBelowAutograd guard;
${statements}
auto the_result = ${invocation};
${assignments}
return true;
};
}
""")
CASE_TEMPLATE = CT("""\
case ${key}: // ${name}
implementation_${key}();
break;
""")
ASSIGN_CHECK_SIZE_TEMPLATE = CT("""\
if(OutputSize() > ${offset}) {${assignment}}
""")
def get_output(o, i):
if len(o['returns']) == 1:
return 'the_result'
else:
return '::std::get<{}>(the_result)'.format(i)
def attribute_names(o):
return sorted([a['name'] for a in o['arguments'] if not value_has_tensors(a)])
def required_attribute_names(o):
return sorted([a['name'] for a in o['arguments'] if not value_has_tensors(a) and 'default' not in a])
def self_as_first_argument(arguments):
return ([a for a in arguments if a['name'] == 'self'] +
[a for a in arguments if a['name'] != 'self'])
def get_num_inputs(o):
args = 0
for a in o['arguments']:
if a['type'] in TENSORLIST_TYPE:
return '*'
elif value_has_tensors(a):
args += 1
return str(args)
def find_factory_methods(decls):
factory_methods = {}
for o in decls:
if any(arg['dynamic_type'] == 'at::TensorOptions' for arg in o['arguments']):
factory_methods[o['name']] = 0
return factory_methods
def emit_assignments(o, env):
for i, r in enumerate(o['returns']):
t = RETURN_MAP[r['type'] if not value_is_tensor_type(r) else 'at::Tensor']
assignment = CT(t).substitute(env, offset=i, output=get_output(o, i))
check_size_assignment = ASSIGN_CHECK_SIZE_TEMPLATE.substitute(env, offset=i, assignment=assignment)
env['assignments'].append(check_size_assignment)
if __name__ == '__main__':
decls = yaml.load(read(os.path.join(args.yaml_dir, 'Declarations.yaml')), Loader=Loader)
factory_methods = find_factory_methods(decls)
filtered = [expanded for o in decls for expanded in expand(o) if supports(expanded, factory_methods)]
top_env: Dict[str, List] = {
'mappings': [],
'implementations': [],
'cases': [],
}
seen: Set[str] = set()
key = 0
for o in filtered:
# [DESCRIPTORS]
# each option is associated with a descriptor string that is used
# to figure out which version of an op is being used:
# The format is:
# opname-attribute_1-attribute_2-num_inputs
# Example:
# lerp-weight-2
# the operator lerp has the attribute weight and takes 2 tensor inputs
attr_names = attribute_names(o)
num_inputs = get_num_inputs(o)
descriptor = '-'.join([o['name']] + attr_names + [num_inputs])
if descriptor in seen:
continue
seen.add(descriptor)
# map from descriptor string to the integer key in the switch statements
# that initializes the operators
top_env['mappings'].append('{{ "{}", {} }},'.format(descriptor, key))
env = {
'name': o['name'],
'statements': [],
'arguments': [],
'assignments': [],
'initialization': [],
'key': str(key),
}
if 'namespace' not in o['method_of'] and 'Tensor' not in o['method_of']:
# methods on type like 'ones' or 'zeros' always take a
# string attribute that is translated into the at::Type object
# e.g. "Float" is at::kFloat
assert('Type' in o['method_of'])
static_tensor_inputs = sum(arg['type'] not in TENSORLIST_TYPE and value_is_tensor_type(arg) for arg in o['arguments'])
has_tensorlist = any(arg['type'] in TENSORLIST_TYPE for arg in o['arguments'])
if has_tensorlist:
tensorlist_idx = [i for i, arg in enumerate(o['arguments']) if arg['type'] in TENSORLIST_TYPE][0]
real_inputs = 0
for i, arg in enumerate(o['arguments']):
env['arguments'].append(arg['name'])
# Pretend the flat argument list is a stack where the end is the top.
view_length = 'InputSize()' if has_tensorlist and i < tensorlist_idx else static_tensor_inputs
if arg['type'] == 'at::TensorList' or arg['type'] == 'const at::ITensorListRef &':
# NOTE: do not advance real_inputs here. After this we will
# switch to indexing the "stack" from the end
env['statements'].append(
'auto {} = peekSlice({}, InputSize() - {}, InputSize());'
.format(arg['name'], real_inputs, static_tensor_inputs))
elif arg['type'] == 'const c10::List<::std::optional<at::Tensor>> &':
# NOTE: do not advance real_inputs here. After this we will
# switch to indexing the "stack" from the end
env['statements'].append(
'auto {} = peekSliceOptionals({}, InputSize() - {}, InputSize());'
.format(arg['name'], real_inputs, static_tensor_inputs))
elif value_is_tensor_type(arg):
# load tensor inputs from Caffe2
env['statements'].append(
'auto {} = peek({}, {});'.format(arg['name'], real_inputs, view_length))
real_inputs += 1
else:
init = CT(ARGUMENT_MAP[arg['type']]).substitute(env, arg=arg['name'])
env['initialization'].append(init)
emit_assignments(o, env)
if o['name'] in SPECIAL_IMPLEMENTATIONS:
env['invocation'] = "{}({})".format(SPECIAL_IMPLEMENTATIONS[o['name']], ','.join(env['arguments']))
elif 'namespace' in o['method_of']:
env['invocation'] = CT("at::${name}(${arguments})").substitute(env)
else:
assert('Tensor' in o['method_of'])
env['invocation'] = "self.{}({})".format(
o['name'], ', '.join(env['arguments'][1:]))
top_env['implementations'].append(IMPLEMENTATION_TEMPLATE.substitute(env))
top_env['cases'].append(CASE_TEMPLATE.substitute(env))
key += 1
write(os.path.join(args.install_dir, args.output_prefix + "aten_op.h"), OP_TEMPLATE.substitute(top_env))

View File

@ -1,35 +0,0 @@
if(USE_FAKELOWP)
message(STATUS "Including FakeLowP operators")
# ---[ CPU files.
file(GLOB_RECURSE tmp *.cc)
set(FAKELOWP_CPU_SRCS ${FAKELOWP_CPU_SRCS} ${tmp})
# exclude test files and gpu files
file(GLOB_RECURSE tmp *_test.cc)
exclude(FAKELOWP_CPU_SRCS "${FAKELOWP_CPU_SRCS}" ${tmp})
# We will only build the perf kernel files if the compiler supports avx2
# extensions.
if(CXX_AVX2_FOUND)
add_library(caffe2_fakelowp_ops OBJECT ${FAKELOWP_CPU_SRCS})
add_dependencies(caffe2_fakelowp_ops fbgemm cpuinfo Caffe2_PROTO c10 aten_op_header_gen)
target_include_directories(caffe2_fakelowp_ops BEFORE
PRIVATE $<BUILD_INTERFACE:${FBGEMM_SOURCE_DIR}/include>)
target_include_directories(caffe2_fakelowp_ops BEFORE
PRIVATE $<BUILD_INTERFACE:${CPUINFO_SOURCE_DIR}/include>)
if(MSVC)
set_property(SOURCE ${FAKELOWP_CPU_SRCS}
APPEND_STRING PROPERTY COMPILE_FLAGS " /arch:AVX2 ")
else()
set_property(SOURCE ${FAKELOWP_CPU_SRCS}
APPEND_STRING PROPERTY COMPILE_FLAGS " -mavx2 -mfma -mf16c -mxsave ")
endif()
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS}
$<TARGET_OBJECTS:caffe2_fakelowp_ops>)
endif()
# ---[ Send the lists to the parent scope.
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
else()
message(STATUS "Excluding FakeLowP operators")
endif()

View File

@ -1,66 +0,0 @@
#include "batch_matmul_fp16_fake_op.h"
#include "caffe2/core/operator_schema.h"
namespace caffe2 {
vector<TensorShape> TensorInferenceForBatchMatMul(
const OperatorDef& def,
const vector<TensorShape>& in);
OpSchema::Cost CostInferenceForBatchMatMul(
const OperatorDef& def,
const vector<TensorShape>& in);
REGISTER_CPU_OPERATOR(BatchMatMulFP16Fake, BatchMatMulFP16FakeOp<CPUContext>);
OPERATOR_SCHEMA(BatchMatMulFP16Fake)
.NumInputs(2)
.NumOutputs(1)
.SetDoc(R"DOC(
Batch Matrix multiplication Yi = Ai * Bi, where A has shape (dim0, dim1, ... M, K),
B has shape (dim0, dim1, ... K, N), Y has shape (dim0, dim1, ... M, N) and i ranges
from 0 to (dim0 * dim1 ...) - 1. rank(A) == rank(B) >= 2. In case of A and B being
two dimensional, it behaves like normal matrix multiplication.
)DOC")
.Input(0, "A", "tensor of shape (dim0, dim1 ... M, K)")
.Input(1, "B", "tensor of shpae (dim0, dim2 ... K, N)")
.Output(0, "Y", "tensor of shape (dim0, dim1 ... M, N)")
.Arg(
"trans_a",
"Pass 1 to transpose the last two dimensions of A before "
"doing multiplication")
.Arg(
"trans_b",
"Pass 1 to transpose the last two dimensions of B before "
"doing multiplication")
.Arg(
"broadcast",
"Pass 1 to allow broadcasting of dimensions. Behavior is the same as numpy.matmul. Gradient is currently not supported when running in broadcast mode.")
.TensorInferenceFunction(TensorInferenceForBatchMatMul)
.CostInferenceFunction(
OpSchema::CostInferenceFunctionType(CostInferenceForBatchMatMul))
.InheritOnnxSchema();
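// Shape example for the schema above (illustrative): A of shape (B, M, K) and
// B of shape (B, K, N) give Y of shape (B, M, N); with trans_a=1, A is read as (B, K, M).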
REGISTER_CPU_OPERATOR(
BatchMatMulFP16Acc16Fake,
BatchMatMulFP16FakeOp<
CPUContext,
DefaultEngine,
true /*use custom fp16 gemm acc16*/,
false /*not using temp accumulator*/,
false /*use math fp16 gemm acc 32*/>);
OPERATOR_SCHEMA(BatchMatMulFP16Acc16Fake).NumInputs(2).NumOutputs(1);
REGISTER_CPU_OPERATOR(
BatchMatMulFP16Acc32Fake,
BatchMatMulFP16FakeOp<
CPUContext,
DefaultEngine,
false /*use custom fp16 gemm acc16*/,
false /*not using temp accumulator*/,
true /*use custom fp16 gemm acc32*/>);
OPERATOR_SCHEMA(BatchMatMulFP16Acc32Fake).NumInputs(2).NumOutputs(1);
} // namespace caffe2

View File

@ -1,440 +0,0 @@
#ifndef CAFFE2_OPERATORS_BATCH_MATMUL_OP_H_
#define CAFFE2_OPERATORS_BATCH_MATMUL_OP_H_
#include <ATen/Utils.h>
#include <c10/util/accumulate.h>
#include <fbgemm/FbgemmConvert.h>
#include "caffe2/contrib/fakelowp/fp16_gemm_utils.h"
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include <algorithm>
#include <functional>
#include <numeric>
#include <string>
#include <vector>
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
namespace caffe2 {
template <
class Context,
class Engine = DefaultEngine,
bool USE_ACC_FP16 = false,
bool USE_TMP_ACCUMULATOR = false,
bool USE_CUSTOM_ACC32 =
false> /* if USE_ACC_FP16=false, set to true to use custom gemm kernel
in fp16_gemm_utils.cc instead of math.h gemm functions */
class BatchMatMulFP16FakeOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
template <class... Args>
explicit BatchMatMulFP16FakeOp(Args&&... args)
: Operator<Context>(std::forward<Args>(args)...),
OP_SINGLE_ARG(bool, "trans_a", trans_a_, false),
OP_SINGLE_ARG(bool, "trans_b", trans_b_, false),
OP_SINGLE_ARG(bool, "broadcast", broadcast_, false) {}
bool RunOnDevice() override {
return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
}
template <typename T>
bool DoRunWithType() {
const auto& A = Input(0);
const auto& B = Input(1);
const int A_ndim = A.dim();
const int B_ndim = B.dim();
const std::vector<std::int64_t> A_dims = A.sizes().vec();
const std::vector<std::int64_t> B_dims = B.sizes().vec();
const T* A_data = A.template data<T>();
const T* B_data = B.template data<T>();
// Fake fp16 rounding of input
std::vector<float> A_rounded(A.numel());
std::vector<float> B_rounded(B.numel());
fbgemm::RoundToFloat16(
A_data,
A_rounded.data(),
A.numel(),
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
USE_ACC_FP16);
fbgemm::RoundToFloat16(
B_data,
B_rounded.data(),
B.numel(),
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
USE_ACC_FP16);
A_data = A_rounded.data();
B_data = B_rounded.data();
if (A_ndim == 1 && B_ndim == 1) {
CAFFE_ENFORCE_EQ(A.numel(), B.numel());
auto* Y = Output(0, {1}, at::dtype<T>());
T* Y_data = Y->template mutable_data<T>();
math::Dot<T, Context>(A.numel(), A_data, B_data, Y_data, &context_);
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(Y_data),
Y_data,
Y->numel(),
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
USE_ACC_FP16);
return true;
}
if (A_ndim == 1) {
const int N = A.numel();
if (trans_b_) {
CAFFE_ENFORCE_EQ(B_dims[B_ndim - 1], N);
} else {
CAFFE_ENFORCE_EQ(B_dims[B_ndim - 2], N);
}
std::vector<std::int64_t> Y_dims(B_ndim - 1);
if (trans_b_) {
std::copy_n(B_dims.cbegin(), B_ndim - 1, Y_dims.begin());
} else {
std::copy_n(B_dims.cbegin(), B_ndim - 2, Y_dims.begin());
Y_dims.back() = B_dims.back();
}
auto* Y = Output(0, Y_dims, at::dtype<T>());
T* Y_data = Y->template mutable_data<T>();
if (trans_b_) {
const int M = B.numel() / N;
caffe2::custom_fp16_gemv(
USE_ACC_FP16,
USE_CUSTOM_ACC32,
USE_TMP_ACCUMULATOR,
CblasNoTrans,
M,
N,
1.0f,
B_data,
A_data,
0.0f,
Y_data,
&context_);
} else {
const int M = B_dims[B_ndim - 1];
const int batch_size = B.numel() / (M * N);
if (batch_size == 1) {
caffe2::custom_fp16_gemv(
USE_ACC_FP16,
USE_CUSTOM_ACC32,
USE_TMP_ACCUMULATOR,
CblasTrans,
N,
M,
1.0f,
B_data,
A_data,
0.0f,
Y_data,
&context_);
} else {
caffe2::custom_fp16_gemm_strided_batched(
USE_ACC_FP16,
USE_CUSTOM_ACC32,
USE_TMP_ACCUMULATOR,
CblasTrans,
CblasNoTrans,
batch_size,
M,
1,
N,
1.0f,
B_data,
M * N,
A_data,
0,
0.0f,
Y_data,
M,
&context_);
}
}
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(Y_data),
Y_data,
Y->numel(),
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
USE_ACC_FP16);
return true;
}
if (B_ndim == 1) {
const int N = B.numel();
if (trans_a_) {
CAFFE_ENFORCE_EQ(A_dims[A_ndim - 2], N);
} else {
CAFFE_ENFORCE_EQ(A_dims[A_ndim - 1], N);
}
const std::vector<std::int64_t> Y_dims(
A_dims.cbegin(), A_dims.cbegin() + A_ndim - 1);
auto* Y = Output(0, Y_dims, at::dtype<T>());
T* Y_data = Y->template mutable_data<T>();
if (trans_a_) {
const int M = A_dims[A_ndim - 1];
const int batch_size = A.numel() / (M * N);
if (batch_size == 1) {
caffe2::custom_fp16_gemv(
USE_ACC_FP16,
USE_CUSTOM_ACC32,
USE_TMP_ACCUMULATOR,
CblasTrans,
N,
M,
1.0f,
A_data,
B_data,
0.0f,
Y_data,
&context_);
} else {
caffe2::custom_fp16_gemm_strided_batched(
USE_ACC_FP16,
USE_CUSTOM_ACC32,
USE_TMP_ACCUMULATOR,
CblasTrans,
CblasNoTrans,
batch_size,
M,
1,
N,
1.0f,
A_data,
M * N,
B_data,
0,
0.0f,
Y_data,
M,
&context_);
}
} else {
const int M = A.numel() / N;
caffe2::custom_fp16_gemv(
USE_ACC_FP16,
USE_CUSTOM_ACC32,
USE_TMP_ACCUMULATOR,
CblasNoTrans,
M,
N,
1.0f,
A_data,
B_data,
0.0f,
Y_data,
&context_);
}
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(Y_data),
Y_data,
Y->numel(),
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
return true;
}
const int M = trans_a_ ? A_dims[A_ndim - 1] : A_dims[A_ndim - 2];
const int K = trans_a_ ? A_dims[A_ndim - 2] : A_dims[A_ndim - 1];
if (trans_b_) {
CAFFE_ENFORCE_EQ(B_dims[B_ndim - 1], K);
} else {
CAFFE_ENFORCE_EQ(B_dims[B_ndim - 2], K);
}
const int N = trans_b_ ? B_dims[B_ndim - 2] : B_dims[B_ndim - 1];
const int ndim = std::max(A_ndim, B_ndim);
std::vector<std::int64_t> A_broadcast_dims(ndim);
std::vector<std::int64_t> B_broadcast_dims(ndim);
std::vector<std::int64_t> Y_broadcast_dims(ndim);
math::utils::ComputeBroadcastBinaryOpDims(
A_ndim - 2,
A_dims.data(),
B_ndim - 2,
B_dims.data(),
A_broadcast_dims.data(),
B_broadcast_dims.data(),
Y_broadcast_dims.data());
Y_broadcast_dims[ndim - 2] = M;
Y_broadcast_dims[ndim - 1] = N;
auto* Y = Output(0, Y_broadcast_dims, at::dtype<T>());
T* Y_data = Y->template mutable_data<T>();
const int batch_dim = ndim - 2;
const bool is_broadcast_dims = !std::equal(
A_broadcast_dims.cbegin(),
A_broadcast_dims.cbegin() + batch_dim,
B_broadcast_dims.cbegin());
if (is_broadcast_dims) {
CAFFE_ENFORCE(broadcast_);
}
const std::int64_t A_batch_size = c10::multiply_integers(
A_broadcast_dims.cbegin(),
A_broadcast_dims.cbegin() + batch_dim);
const std::int64_t B_batch_size = c10::multiply_integers(
B_broadcast_dims.cbegin(),
B_broadcast_dims.cbegin() + batch_dim);
const std::int64_t Y_batch_size = c10::multiply_integers(
Y_broadcast_dims.cbegin(),
Y_broadcast_dims.cbegin() + batch_dim);
if (Y_batch_size == 0) {
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(Y_data),
Y_data,
Y->numel(),
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
return true;
}
if (A_batch_size == 1 && B_batch_size == 1) {
if (USE_ACC_FP16) {
caffe2::custom_fp16_gemm_with_trans(
trans_a_ ? CblasTrans : CblasNoTrans,
trans_b_ ? CblasTrans : CblasNoTrans,
M,
K,
N,
A_data,
B_data,
0.0f,
Y_data,
true, /* use acc16*/
USE_TMP_ACCUMULATOR);
} else if (USE_CUSTOM_ACC32) {
caffe2::custom_fp16_gemm_with_trans(
trans_a_ ? CblasTrans : CblasNoTrans,
trans_b_ ? CblasTrans : CblasNoTrans,
M,
K,
N,
A_data,
B_data,
0.0f,
Y_data,
false, /* use acc32*/
USE_TMP_ACCUMULATOR);
} else {
math::Gemm<T, Context, Engine>(
trans_a_ ? CblasTrans : CblasNoTrans,
trans_b_ ? CblasTrans : CblasNoTrans,
M,
N,
K,
1.0f,
A_data,
B_data,
0.0f,
Y_data,
&context_);
}
} else if (A_batch_size == 1) {
caffe2::custom_fp16_gemm_strided_batched(
USE_ACC_FP16,
USE_CUSTOM_ACC32,
USE_TMP_ACCUMULATOR,
trans_a_ ? CblasTrans : CblasNoTrans,
trans_b_ ? CblasTrans : CblasNoTrans,
Y_batch_size,
M,
N,
K,
1.0f,
A_data,
0,
B_data,
K * N,
0.0f,
Y_data,
M * N,
&context_);
} else if (B_batch_size == 1) {
caffe2::custom_fp16_gemm_strided_batched(
USE_ACC_FP16,
USE_CUSTOM_ACC32,
USE_TMP_ACCUMULATOR,
trans_a_ ? CblasTrans : CblasNoTrans,
trans_b_ ? CblasTrans : CblasNoTrans,
Y_batch_size,
M,
N,
K,
1.0f,
A_data,
M * K,
B_data,
0,
0.0f,
Y_data,
M * N,
&context_);
} else if (!is_broadcast_dims) {
caffe2::custom_fp16_gemm_strided_batched(
USE_ACC_FP16,
USE_CUSTOM_ACC32,
USE_TMP_ACCUMULATOR,
trans_a_ ? CblasTrans : CblasNoTrans,
trans_b_ ? CblasTrans : CblasNoTrans,
Y_batch_size,
M,
N,
K,
1.0f,
A_data,
M * K,
B_data,
K * N,
0.0f,
Y_data,
M * N,
&context_);
} else {
std::vector<const T*> A_ptr(Y_batch_size);
std::vector<const T*> B_ptr(Y_batch_size);
std::vector<T*> Y_ptr(Y_batch_size);
std::vector<std::int64_t> index(batch_dim);
for (std::int64_t i = 0; i < Y_batch_size; ++i) {
const std::int64_t A_index = math::utils::GetIndexFromDims(
batch_dim, A_broadcast_dims.data(), index.data());
const std::int64_t B_index = math::utils::GetIndexFromDims(
batch_dim, B_broadcast_dims.data(), index.data());
A_ptr[i] = A_data + A_index * M * K;
B_ptr[i] = B_data + B_index * K * N;
Y_ptr[i] = Y_data + i * M * N;
math::utils::IncreaseIndexInDims(
batch_dim, Y_broadcast_dims.data(), index.data());
}
caffe2::custom_fp16_gemm_batched(
USE_ACC_FP16,
USE_CUSTOM_ACC32,
USE_TMP_ACCUMULATOR,
trans_a_ ? CblasTrans : CblasNoTrans,
trans_b_ ? CblasTrans : CblasNoTrans,
Y_batch_size,
M,
N,
K,
1.0f,
A_ptr.data(),
B_ptr.data(),
0.0f,
Y_ptr.data(),
&context_);
}
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(Y_data),
Y_data,
Y->numel(),
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
return true;
}
private:
const bool trans_a_;
const bool trans_b_;
const bool broadcast_;
};
} // namespace caffe2
#endif // CAFFE2_OPERATORS_BATCH_MATMUL_OP_H_

View File

@ -1,5 +0,0 @@
#include "caffe2/core/init.h"
C10_DEFINE_bool(caffe2_fbgemm_fake_fp16_clamp, true, "");
C10_DEFINE_bool(caffe2_fbgemm_fake_fp16_clamp_denorms, true, "");

View File

@ -1,5 +0,0 @@
#pragma once
namespace caffe2 {
} // namespace caffe2

View File

@ -1,102 +0,0 @@
#include <fbgemm/FbgemmConvert.h>
#include "caffe2/contrib/fakelowp/sum_fp16_fake_op.h"
#include "caffe2/operators/elementwise_add_op.h"
#include "caffe2/operators/elementwise_div_op.h"
#include "caffe2/operators/elementwise_mul_op.h"
#include "caffe2/operators/elementwise_sub_op.h"
#include "caffe2/operators/utility_ops.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
namespace caffe2 {
namespace {
int getSizeFromDims(const std::vector<int>& dims) {
int tot = 1;
for (auto i = 0; i < dims.size(); i++) {
tot *= dims[i];
}
return tot;
}
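// Wraps a binary elementwise functor; the float/float specialization below
// rounds the inputs and the result to fp16 to emulate low-precision arithmetic.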
template <class Functor>
struct FP16PairWiseCPUFunctor {
template <typename TIn, typename TOut>
bool Forward(
const std::vector<int>& A_dims,
const std::vector<int>& B_dims,
const TIn* A,
const TIn* B,
TOut* C,
CPUContext* context) const {
functor.Forward(A_dims, B_dims, A, B, C, context);
return true;
}
template<>
bool Forward<float, float>(
const std::vector<int>& A_dims,
const std::vector<int>& B_dims,
const float* A,
const float* B,
float* C,
CPUContext* context) const {
auto A_sz = getSizeFromDims(A_dims);
auto B_sz = getSizeFromDims(B_dims);
std::vector<float> A_fp16(A_sz);
std::vector<float> B_fp16(B_sz);
fbgemm::RoundToFloat16(
A, A_fp16.data(), A_sz, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
B, B_fp16.data(), B_sz, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
functor.Forward(A_dims, B_dims, A_fp16.data(), B_fp16.data(), C, context);
fbgemm::RoundToFloat16(C, C, A_sz, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
return true;
}
Functor functor;
};
} // namespace
REGISTER_CPU_OPERATOR(SumFakeFp16, SumFP16FP16AccOp<CPUContext>);
OPERATOR_SCHEMA(SumFakeFp16).NumInputs(1, INT_MAX).NumOutputs(1, INT_MAX);
REGISTER_CPU_OPERATOR(
AddFakeFp16,
BinaryElementwiseOp<
TensorTypes<float, int, long>,
CPUContext,
FP16PairWiseCPUFunctor<AddFunctor<CPUContext>>>);
OPERATOR_SCHEMA(AddFakeFp16).NumInputs(2).NumOutputs(1);
REGISTER_CPU_OPERATOR(
DivFakeFp16,
BinaryElementwiseOp<
TensorTypes<float, double>,
CPUContext,
FP16PairWiseCPUFunctor<DivFunctor<CPUContext>>>);
OPERATOR_SCHEMA(DivFakeFp16).NumInputs(2).NumOutputs(1);
REGISTER_CPU_OPERATOR(
MulFakeFp16,
BinaryElementwiseOp<
TensorTypes<float>,
CPUContext,
FP16PairWiseCPUFunctor<MulFunctor<CPUContext>>>);
OPERATOR_SCHEMA(MulFakeFp16).NumInputs(2).NumOutputs(1);
REGISTER_CPU_OPERATOR(
SubFakeFp16,
BinaryElementwiseOp<
TensorTypes<float>,
CPUContext,
FP16PairWiseCPUFunctor<SubFunctor<CPUContext>>>);
OPERATOR_SCHEMA(SubFakeFp16).NumInputs(2).NumOutputs(1);
} // namespace caffe2

View File

@ -1,110 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <functional>
#include "caffe2/contrib/fakelowp/fp16_fc_acc_op.h"
#include "caffe2/core/init.h"
#include "caffe2/core/tensor.h"
#include "caffe2/operators/fc_inference.h"
namespace caffe2 {
template <>
int Fp16FCAccOp<CPUContext, DefaultEngine, false>::runs = 0;
template <>
float Fp16FCAccOp<CPUContext, DefaultEngine, false>::total_error = 0.0;
template <>
float Fp16FCAccOp<CPUContext, DefaultEngine, false>::total_error_with_bias =
0.0;
template <>
int Fp16FCAccOp<CPUContext, DefaultEngine, true>::runs = 0;
template <>
float Fp16FCAccOp<CPUContext, DefaultEngine, true>::total_error = 0.0;
template <>
float Fp16FCAccOp<CPUContext, DefaultEngine, true>::total_error_with_bias = 0.0;
REGISTER_CPU_OPERATOR(
Fp16FCAcc32,
Fp16FCAccOp<
CPUContext,
DefaultEngine,
false /* USE_ACC_FP16 */,
true /* USE_TMP_ACCUMULATOR */,
false /* ADD_BIAS_FIRST */>);
using namespace std::placeholders;
OPERATOR_SCHEMA(Fp16FCAcc32)
.NumInputs(3)
.NumOutputs(1)
.TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, false))
.CostInferenceFunction(OpSchema::CostInferenceFunctionType(
std::bind(CostInferenceForFC, _1, _2, false)))
.SetDoc(R"DOC(Same as FC)DOC");
REGISTER_CPU_OPERATOR(
Fp16FCAcc16,
Fp16FCAccOp<
CPUContext,
DefaultEngine,
true /* USE_ACC_FP16 */,
true /* USE_TMP_ACCUMULATOR */,
false /* ADD_BIAS_FIRST */>);
OPERATOR_SCHEMA(Fp16FCAcc16)
.NumInputs(3)
.NumOutputs(1)
.TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, false))
.CostInferenceFunction(OpSchema::CostInferenceFunctionType(
std::bind(CostInferenceForFC, _1, _2, false)))
.SetDoc(R"DOC(Same as FC)DOC");
REGISTER_CPU_OPERATOR(
Fp16FCAcc32NNPI,
Fp16FCAccOp<
CPUContext,
DefaultEngine,
false /* USE_ACC_FP16 */,
false /* USE_TMP_ACCUMULATOR */,
true /* ADD_BIAS_FIRST */>);
OPERATOR_SCHEMA(Fp16FCAcc32NNPI)
.NumInputs(3)
.NumOutputs(1)
.TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, false))
.CostInferenceFunction(OpSchema::CostInferenceFunctionType(
std::bind(CostInferenceForFC, _1, _2, false)))
.SetDoc(R"DOC(Same as FC)DOC");
REGISTER_CPU_OPERATOR(
Fp16FCAcc16NNPI,
Fp16FCAccOp<
CPUContext,
DefaultEngine,
true /* USE_ACC_FP16 */,
false /* USE_TMP_ACCUMULATOR */,
true /* ADD_BIAS_FIRST */>);
OPERATOR_SCHEMA(Fp16FCAcc16NNPI)
.NumInputs(3)
.NumOutputs(1)
.TensorInferenceFunction(std::bind(FCShapeInference, _1, _2, false))
.CostInferenceFunction(OpSchema::CostInferenceFunctionType(
std::bind(CostInferenceForFC, _1, _2, false)))
.SetDoc(R"DOC(Same as FC)DOC");
} // namespace caffe2

View File

@ -1,398 +0,0 @@
/**
* Copyright (c) 2016-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <fbgemm/FbgemmConvert.h>
#include <fbgemm/FbgemmFP16.h>
#include <immintrin.h>
#include "caffe2/contrib/fakelowp/fp16_gemm_utils.h"
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor.h"
#include "caffe2/utils/conversions.h"
#include "caffe2/utils/math.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
namespace caffe2 {
using namespace std;
// C2 wrapper for fp16 gemm with fp16 accumulation
template <
class Context,
class Engine = DefaultEngine,
bool USE_ACC_FP16 = false, // Whether use fp16 accumulation
bool USE_TMP_ACCUMULATOR = false,
bool ADD_BIAS_FIRST = false>
class Fp16FCAccOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
Fp16FCAccOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
axis_(OperatorBase::GetSingleArgument<int32_t>("axis", 1)),
axis_w_(OperatorBase::GetSingleArgument<int32_t>("axis_w", 1)) {}
~Fp16FCAccOp() noexcept override {
if (X_fp16_ != nullptr) {
delete[] X_fp16_;
}
if (W_fp16_ != nullptr) {
delete[] W_fp16_;
}
if (b_fp16_ != nullptr) {
delete[] b_fp16_;
}
if (bias_multiplier_fp16_ != nullptr) {
delete[] bias_multiplier_fp16_;
}
}
// template on X, B, W and Y.
template <typename T_X, typename T_B, typename T_W, typename T_Y>
bool DoRunWithType() {
const auto& X = Input(0);
const auto& W_blob = OperatorBase::InputBlob(1);
const auto& b = Input(2);
auto* Y = Output(0);
CAFFE_ENFORCE(b.ndim() == 1, b.ndim());
// batch size
const auto canonical_axis = X.canonical_axis_index(axis_);
const int M = X.size_to_dim(canonical_axis);
const int N = b.size();
const int K = X.size_from_dim(canonical_axis);
Y_shape_cache_ = X.sizes().vec();
// This is an invariant of canonical_axis, so we can DCHECK.
TORCH_DCHECK_LE(canonical_axis + 1, Y_shape_cache_.size());
Y_shape_cache_.resize(canonical_axis + 1);
Y_shape_cache_[canonical_axis] = N;
Y->Resize(Y_shape_cache_);
if (X.size() == 0) {
// skip the rest of the computation if X is empty
Y->template mutable_data<T_Y>();
return true;
}
// Convert X and W to fp16
int X_size = M * K;
int W_size = N * K;
if (X_fp16_ == nullptr) {
X_fp16_ = new float[X_size];
X_size_cached_ = X_size;
} else if (X_size > X_size_cached_) {
delete[] X_fp16_;
X_fp16_ = new float[X_size];
X_size_cached_ = X_size;
}
fbgemm::RoundToFloat16(
X.template data<T_X>(),
X_fp16_,
X_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
if (W_fp16_ == nullptr) {
W_fp16_ = new float[W_size];
const T_W* W_data = nullptr;
if (W_blob.template IsType<
caffe2::unique_ptr<fbgemm::PackedGemmMatrixFP16>>()) {
auto* W_fbgemm =
OperatorBase::Input<
caffe2::unique_ptr<fbgemm::PackedGemmMatrixFP16>>(1)
.get();
if (!W_fbgemm->packed()) {
float* W_fp16_trans = new float[W_size];
fbgemm::Float16ToFloat_avx2(W_fbgemm->pmat(), W_fp16_trans, W_size);
for (const auto i : c10::irange(N)) {
for (const auto j : c10::irange(K)) {
W_fp16_[j * N + i] = W_fp16_trans[i * K + j];
}
}
delete[] W_fp16_trans;
} else {
vector<fbgemm::float16> unpacked_mat;
unpacked_mat.resize(W_size);
W_fbgemm->unpack(
unpacked_mat.data(), fbgemm::matrix_op_t::NoTranspose);
fbgemm::Float16ToFloat_avx2(unpacked_mat.data(), W_fp16_, W_size);
}
} else {
const auto& W = Input(1);
W_data = W.template data<T_W>();
// Transpose W
for (const auto i : c10::irange(N)) {
for (const auto j : c10::irange(K)) {
W_fp16_[j * N + i] = W_data[i * K + j];
}
}
}
fbgemm::RoundToFloat16(
W_fp16_, W_fp16_, W_size, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
auto Y_data = Y->template mutable_data<T_Y>();
int Y_size = M * N;
// Initialize Y
memset(Y_data, 0.0, sizeof(float) * Y_size);
// Add bias term, accumulation is in fp16.
if (bias_multiplier_.size() != M) {
// If the helper bias multiplier is not M, reshape and fill it with one.
bias_multiplier_.Resize(M);
math::Set<T_B, Context>(
M,
convert::To<float, T_B>(1),
bias_multiplier_.template mutable_data<T_B>(),
&context_);
}
if (bias_multiplier_fp16_ == nullptr) {
bias_multiplier_fp16_ = new float[M];
M_cached_ = M;
} else if (M > M_cached_) {
delete[] bias_multiplier_fp16_;
bias_multiplier_fp16_ = new float[M];
M_cached_ = M;
}
fbgemm::RoundToFloat16(
bias_multiplier_.template data<T_B>(),
bias_multiplier_fp16_,
M,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
if (b_fp16_ == nullptr) {
b_fp16_ = new float[N];
}
fbgemm::RoundToFloat16(
b.template data<T_B>(),
b_fp16_,
N,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
if (ADD_BIAS_FIRST) {
custom_fp16_gemm(
M,
1,
N,
bias_multiplier_fp16_,
b_fp16_,
0.f,
Y->template mutable_data<T_Y>(),
USE_ACC_FP16,
USE_TMP_ACCUMULATOR);
#ifdef LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG
float* Y_ref = new float[M * N]();
TensorProto::DataType math_type = TensorProto_DataType_FLOAT;
math::Gemm<T_B, Context, Engine>(
CblasNoTrans,
CblasNoTrans,
M,
N,
1,
1,
bias_multiplier_.template data<T_B>(),
b.template data<T_B>(),
0.f,
Y_ref,
&context_,
math_type);
relative_error =
compute_relative_error(Y->template mutable_data<T_Y>(), Y_ref, M * N);
total_error_with_bias += relative_error;
VLOG(LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG)
<< "Relative error for Y = bias_multiplier_ * b' = " << relative_error
<< ", average error with bias after " << runs
<< " runs = " << total_error_with_bias / runs << endl;
#endif
custom_fp16_gemm(
M,
K,
N,
X_fp16_,
W_fp16_,
1.f,
Y->template mutable_data<T_Y>(),
USE_ACC_FP16,
USE_TMP_ACCUMULATOR);
#ifdef LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG
if (!W_blob.IsType<caffe2::unique_ptr<fbgemm::PackedGemmMatrixFP16>>()) {
const auto& W = Input(1);
math::Gemm<float, Context, Engine>(
CblasNoTrans,
CblasTrans,
M,
N,
K,
1,
X.template data<T_X>(),
W.template data<T_W>(),
1.f,
Y_ref,
&context_,
math_type);
runs++;
float relative_error = compute_relative_error(
Y->template mutable_data<T_Y>(), Y_ref, M * N);
total_error += relative_error;
VLOG(LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG)
<< "Relative error for Y = bias_multiplier_ * b' + X * W' = "
<< relative_error << ", average error after " << runs
<< " runs = " << total_error / runs << endl;
if (Y_ref != nullptr) {
delete[] Y_ref;
}
}
#endif
} else {
custom_fp16_gemm(
M,
K,
N,
X_fp16_,
W_fp16_,
0.f,
Y->template mutable_data<T_Y>(),
USE_ACC_FP16,
USE_TMP_ACCUMULATOR);
#ifdef LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG
if (!W_blob.IsType<caffe2::unique_ptr<fbgemm::PackedGemmMatrixFP16>>()) {
const auto& W = Input(1);
float* Y_ref = new float[M * N]();
TensorProto::DataType math_type = TensorProto_DataType_FLOAT;
math::Gemm<float, Context, Engine>(
CblasNoTrans,
CblasTrans,
M,
N,
K,
1,
X.template data<T_X>(),
W.template data<T_W>(),
0.f,
Y_ref,
&context_,
math_type);
runs++;
float relative_error = compute_relative_error(
Y->template mutable_data<T_Y>(), Y_ref, M * N);
total_error += relative_error;
VLOG(LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG)
<< "Relative error for Y = X * W' = " << relative_error
<< ", average error after " << runs
<< " runs = " << total_error / runs << endl;
}
#endif
custom_fp16_gemm(
M,
1,
N,
bias_multiplier_fp16_,
b_fp16_,
1.f,
Y->template mutable_data<T_Y>(),
USE_ACC_FP16,
USE_TMP_ACCUMULATOR);
#ifdef LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG
math::Gemm<T_B, Context, Engine>(
CblasNoTrans,
CblasNoTrans,
M,
N,
1,
1,
bias_multiplier_.template data<T_B>(),
b.template data<T_B>(),
1,
Y_ref,
&context_,
math_type);
relative_error =
compute_relative_error(Y->template mutable_data<T_Y>(), Y_ref, M * N);
total_error_with_bias += relative_error;
VLOG(LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG)
<< "Relative error for Y = X * W' + bias_multiplier_ * b' = "
<< relative_error << ", average error with bias after " << runs
<< " runs = " << total_error_with_bias / runs << endl;
if (Y_ref != nullptr) {
delete[] Y_ref;
}
#endif
}
return true;
}
#ifdef LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG
float compute_L2_norm(float* A, int size) {
float square_sum = 0.0;
for (const auto i : c10::irange(size)) {
square_sum += A[i] * A[i];
}
return std::sqrt(square_sum);
}
float compute_relative_error(float* A, float* A_ref, int size) {
float error = 0.0;
for (const auto i : c10::irange(size)) {
error += (A[i] - A_ref[i]) * (A[i] - A_ref[i]);
}
error = std::sqrt(error);
float L2_norm = compute_L2_norm(A, size);
return error / L2_norm;
}
#endif
bool RunOnDevice() override {
return DoRunWithType<
float, // X
float, // B
float, // W
float>(); // Y
}
protected:
size_t axis_{1};
size_t axis_w_{1};
size_t X_size_cached_{0};
size_t M_cached_{0};
static int runs;
static float total_error;
static float total_error_with_bias;
float* X_fp16_ = nullptr;
float* W_fp16_ = nullptr;
float* b_fp16_ = nullptr;
float* bias_multiplier_fp16_ = nullptr;
// A local vector to cache the output shape so we don't need to recreate
// a vector object every time we run Run().
vector<int64_t> Y_shape_cache_;
Tensor bias_multiplier_{Context::GetDeviceType()};
};
} // namespace caffe2

View File

@ -1,129 +0,0 @@
#include "fp16_fma.h"
#include <immintrin.h>
#include <cmath>
#include <cstdint>
namespace fake_fp16 {
// Compute fp16 FMA using fp16
// Out = FMA (A, B, Out)
//
// Algorithm:
// Do an FMA in fp64
// Since fp16 has 10 bits of mantissa and fp64 has 52, zero out
// 42 bits.
// Extract the exponent.
// If the exponent ends up in the subnormal range, shift out
// only 42 - (14 + exponent).
// Compute the bounce value ("Bouncer") as a value big enough to
// push out all of the digits except for the ones representable in fp16;
// the objective is to let the machine's own rounding do the truncation.
// Add 42 or the computed number (in case of denormals) to the exponent.
// For negative numbers set the highest bit of the mantissa to 1.
void fma_fp16(int N, const float* A, const float* B, float* Out) {
constexpr int blockSize = 4;
constexpr uint64_t mask = 0x7ff0000000000000;
constexpr uint64_t shift_bits = 52;
constexpr uint64_t offset = 1023;
constexpr uint64_t dbl_threehalf = 0x3ff8000000000000;
uint64_t expo_bouncer;
// It can be proven that in the absence of intermediate overflow
// the desired numerical result can be obtained even with the
// possibility of a double rounding, as follows:
// round-to-fp16-precision( (double)A * (double)B + (double)C )
// This statement is not proved here; but we explain how to round a fp64
// number into fp16 precision using the technique of a "Bouncer"
// Suppose a numerical value in fp64 has exponent value of E
// If -14 <= E <= 15 (the fp16 exponent value for normalized number),
// the lsb of this value in fp16 precision is 2^(E-10).
// Now consider this fp64 number Bouncer which is 2^(52+(E-10)) * 3/2
// The lsb of Bouncer is (by design) 2^(E-10). Because Bouncer is
// very much bigger than the fp16 value, denoted by say x,
// 2^(52+(E-10)) < Bouncer + x < 2^(53+(E-10))
// Thus TMP := Bouncer + x in double precision forces x to be rounded off
// at the lsb position of 2^(E-10).
// Consequently, the subtraction yields the desired result
// x_fp16_precision := TMP - Bouncer;
// If E < -14, we are dealing with the subnormal number range, where the lsb
// of fp16 precision is FIXED at 2^(-24) (definition of fp16).
// Hence the Bouncer is set at 2^(52-24) = 2^(28)
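// Worked example (illustrative values): take x = 1 + 3*2^-12, whose fp64
// exponent is E = 0, so the fp16 lsb is 2^-10 and Bouncer = 1.5 * 2^42.
// TMP := Bouncer + x rounds x at the 2^-10 position (round-to-nearest-even),
// and TMP - Bouncer = 1 + 2^-10 = 1.0009765625, which is exactly x rounded
// to fp16 precision.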
int n = 0;
for (; n + blockSize < N; n += blockSize) {
__m256d mA = _mm256_cvtps_pd(_mm_loadu_ps(A + n));
__m256d mB = _mm256_cvtps_pd(_mm_loadu_ps(B + n));
__m256d mOut = _mm256_cvtps_pd(_mm_loadu_ps(Out + n));
mOut = _mm256_fmadd_pd(mA, mB, mOut);
__m256i mExpv =
_mm256_and_si256(_mm256_castpd_si256(mOut), _mm256_set1_epi64x(mask));
mExpv = _mm256_srli_epi64(mExpv, shift_bits);
mExpv = _mm256_sub_epi64(mExpv, _mm256_set1_epi64x(offset));
__m256i cmp = _mm256_cmpgt_epi64(_mm256_set1_epi64x(-14), mExpv);
__m256i mExpoBouncer = _mm256_and_si256(cmp, _mm256_set1_epi64x(28));
mExpoBouncer = _mm256_or_si256(
mExpoBouncer,
_mm256_andnot_si256(
cmp, _mm256_add_epi64(_mm256_set1_epi64x(42), mExpv)));
__m256i mBouncer = _mm256_add_epi64(
_mm256_set1_epi64x(dbl_threehalf),
_mm256_slli_epi64(mExpoBouncer, shift_bits));
mOut = _mm256_sub_pd(
_mm256_add_pd(_mm256_castsi256_pd(mBouncer), mOut),
_mm256_castsi256_pd(mBouncer));
_mm_storeu_ps(Out + n, _mm256_cvtpd_ps(mOut));
}
// Epilogue
for (; n < N; n++) {
typedef union {
uint64_t I;
double F;
} flint64;
flint64 A_, B_, Out_, Bouncer;
A_.F = A[n];
B_.F = B[n];
Out_.F = Out[n];
// This is FMA in FP64
Out_.F = std::fma(A_.F, B_.F, Out_.F);
// We now round Out_.F to fp16 precision using a Bouncer
// First, figure out the exponent value E of Out_.F
int64_t expv = ((Out_.I & mask) >> shift_bits) - offset;
// Second: create the Bouncer. To do that, we
// first compute its exponent and then add that exponent value
// to the exponent field of the constant 3/2.
if (expv < -14) {
expo_bouncer = 28;
} else {
expo_bouncer = 42 + expv;
}
Bouncer.I = dbl_threehalf + (expo_bouncer << shift_bits);
// This is rounding to fp16 precision; add and subtract Bouncer
Out_.F = (Bouncer.F + Out_.F) - Bouncer.F;
Out[n] = Out_.F;
}
}
float fmafp32_avx_emulation(float v1, float v2, float v3) {
__m256 v1Vec = _mm256_set1_ps(v1);
__m256 v2Vec = _mm256_set1_ps(v2);
__m256 v3Vec = _mm256_set1_ps(v3);
__m256 resVec = _mm256_fmadd_ps(v1Vec, v2Vec, v3Vec);
float *result = (float *)&resVec;
return *result;
}
} // namespace fake_fp16

View File

@ -1,16 +0,0 @@
#pragma once
#include <glog/logging.h>
namespace fake_fp16 {
// Compute FMA using fp16 accumulation
// Out = FMA (A, B, Out)
void fma_fp16(int N, const float* A, const float* B, float* Out);
void fma_fp16_slow(int N, const float* A, const float* B, float* Out);
float fma_fp16_slow(const float A, const float B, float Out);
float fmafp32_avx_emulation(float v1, float v2, float v3);
} // namespace fake_fp16

View File

@ -1,540 +0,0 @@
#include <immintrin.h>
#include "fp16_fma.h"
namespace fp16_fma {
typedef int int16;
typedef char int8;
typedef unsigned short int bits16;
typedef unsigned int bits32;
typedef signed char Word8;
typedef unsigned char UWord8;
typedef signed short Word16;
typedef unsigned short UWord16;
typedef signed int Word32;
typedef unsigned int UWord32;
typedef long long Word64;
typedef unsigned long long UWord64;
typedef unsigned short float16;
typedef signed int sbits32;
typedef signed short int sbits16;
typedef char flag;
#define MAX_U32 (UWord32)0xffffffffL
#define MAX_U16 (UWord16)0xffff
#define BITMASK_T(typ, w) (((typ)1 << (w)) - 1)
#define TESTBIT(x, n) (((x) >> (n)) & 1)
#define float16_default_nan 0x7E00
#define float16_default_nan_pos 0x7E00
#define float16_default_nan_neg 0xFE00
int8 float_exception_flags = 0;
enum {
float_round_nearest_even = 0,
float_round_down = 1,
float_round_up = 2,
float_round_to_zero = 3
};
int8 float_rounding_mode = float_round_nearest_even;
enum { float_tininess_after_rounding = 0, float_tininess_before_rounding = 1 };
int float_detect_tininess = float_tininess_after_rounding;
inline bits16 extractFloat16Frac(float16 a) {
return a & 0x3FF;
}
inline int16 extractFloat16Exp(float16 a) {
return (a >> 10) & 0x1F;
}
inline flag extractFloat16Sign(float16 a) {
return a >> 15;
}
flag float16_is_quiet_nan(float16 a) {
return (0xFC00 <= (bits16)(a << 1));
}
flag float16_is_signaling_nan(float16 a) {
return (((a >> 9) & 0x3F) == 0x3E) && (a & 0x01FF);
}
enum {
float_flag_inexact = 1,
float_flag_divbyzero = 2,
float_flag_underflow = 4,
float_flag_overflow = 8,
float_flag_invalid = 16
};
void float_raise(int8 flags) {
float_exception_flags |= flags;
}
int pickNaNMulAdd(
flag aIsQNaN,
flag aIsSNaN,
flag bIsQNaN,
flag bIsSNaN,
flag cIsQNaN,
flag cIsSNaN,
flag infzero) {
if (infzero) {
float_raise(float_flag_invalid);
return 2;
}
if (cIsSNaN || cIsQNaN) {
return 2;
} else if (bIsSNaN || bIsQNaN) {
return 1;
} else {
return 0;
}
}
inline float16 packFloat16(flag zSign, int16 zExp, bits16 zSig) {
return (((bits16)zSign) << 15) + (((bits16)zExp) << 10) + zSig;
}
float16
propagateFloat16MulAddNaN(float16 a, float16 b, float16 c, flag infzero) {
flag aIsQuietNaN, aIsSignalingNaN, bIsQuietNaN, bIsSignalingNaN, cIsQuietNaN,
cIsSignalingNaN;
int selNaN;
aIsQuietNaN = float16_is_quiet_nan(a);
aIsSignalingNaN = float16_is_signaling_nan(a);
bIsQuietNaN = float16_is_quiet_nan(b);
bIsSignalingNaN = float16_is_signaling_nan(b);
cIsQuietNaN = float16_is_quiet_nan(c);
cIsSignalingNaN = float16_is_signaling_nan(c);
if (aIsSignalingNaN | bIsSignalingNaN | cIsSignalingNaN) {
float_raise(float_flag_invalid);
}
selNaN = pickNaNMulAdd(
aIsQuietNaN,
aIsSignalingNaN,
bIsQuietNaN,
bIsSignalingNaN,
cIsQuietNaN,
cIsSignalingNaN,
infzero);
switch (selNaN) {
case 0:
return a | (1 << 9);
case 1:
return b | (1 << 9);
case 2:
return c | (1 << 9);
case 3:
default:
return float16_default_nan;
}
}
inline void shift32RightJamming(bits32 a, int16 count, bits32* zPtr) {
bits32 z;
if (count == 0) {
z = a;
} else if (count < 32) {
z = (a >> count) | ((a << ((-count) & 31)) != 0);
} else {
z = (a != 0);
}
*zPtr = z;
}
void shift16RightJamming(bits16 a, int16 count, bits16* zPtr) {
bits16 z;
if (count == 0) {
z = a;
} else if (count < 16) {
z = (a >> count) | (((a << ((-count) & 15)) & 0xffff) != 0);
} else {
z = (a != 0);
}
*zPtr = z;
}
Word8 GetRound(Word32 fcr) {
Word8 res, round_mode;
round_mode = fcr & 0x3; // lower 2 bits as rounding mode in FCR
res = (round_mode == 3)
? 1
: ((round_mode == 2)
? 2
: ((round_mode == 1) ? 3 : 0)); // Translate to float_rounding_mode
return res;
}
Word8 GetException(Word32 fsr) {
Word8 res = 0;
if (TESTBIT(fsr, 7) == 1)
res |= 32; // float_flag_inexact
if (TESTBIT(fsr, 8) == 1)
res |= 16; // float_flag_underflow
if (TESTBIT(fsr, 9) == 1)
res |= 8; // float_flag_overflow
if (TESTBIT(fsr, 10) == 1)
res |= 4; // float_flag_divbyzero
if (TESTBIT(fsr, 11) == 1)
res |= 1; // float_flag_invalid
return res;
}
float16 roundAndPackFloat16(flag zSign, int16 zExp, bits16 zSig) {
int8 roundingMode;
flag roundNearestEven;
int8 roundIncrement, roundBits;
flag isTiny;
roundingMode = float_rounding_mode;
roundNearestEven = (roundingMode == float_round_nearest_even);
roundIncrement = 0x8;
if (!roundNearestEven) {
// if ( ( ! roundNearestEven ) && ( roundingMode !=
// float_round_ties_away) ) {
if (roundingMode == float_round_to_zero) {
roundIncrement = 0;
} else {
roundIncrement = 0xF;
if (zSign) {
if (roundingMode == float_round_up)
roundIncrement = 0;
} else {
if (roundingMode == float_round_down)
roundIncrement = 0;
}
}
}
roundBits = zSig & 0xF;
if (0x1D <= (bits16)zExp) {
if ((0x1D < zExp) ||
((zExp == 0x1D) && ((sbits16)(zSig + roundIncrement) < 0))) {
float_raise(float_flag_overflow | float_flag_inexact);
return packFloat16(zSign, 0x1F, 0) - (roundIncrement == 0);
}
if (zExp < 0) {
isTiny = (float_detect_tininess == float_tininess_before_rounding) ||
(zExp < -1) || (zSig + roundIncrement < 0x8000);
shift16RightJamming(zSig, -zExp, &zSig);
zExp = 0;
roundBits = zSig & 0xF;
if (isTiny && roundBits)
float_raise(float_flag_underflow);
}
}
if (roundBits)
float_exception_flags |= float_flag_inexact;
zSig = (zSig + roundIncrement) >> 4;
zSig &= ~(((roundBits ^ 0x8) == 0) & roundNearestEven);
if (zSig == 0)
zExp = 0;
return packFloat16(zSign, zExp, zSig);
}
int8 countLeadingZeros32(bits32 a) {
static const int8 countLeadingZerosHigh[] = {
8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
int8 shiftCount;
shiftCount = 0;
if (a < 0x10000) {
shiftCount += 16;
a <<= 16;
}
if (a < 0x1000000) {
shiftCount += 8;
a <<= 8;
}
shiftCount += countLeadingZerosHigh[a >> 24];
return shiftCount;
}
void normalizeFloat16Subnormal(bits16 aSig, int16* zExpPtr, bits16* zSigPtr) {
int8 shiftCount;
shiftCount = countLeadingZeros32((bits32)aSig) - 16 - 5;
*zSigPtr = aSig << shiftCount;
*zExpPtr = 1 - shiftCount;
}
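// float16 layout assumed throughout this file: 1 sign bit, 5 exponent bits
// (bias 15) and 10 fraction bits; exponent 0x1f encodes inf/NaN and exponent 0
// encodes zeros and subnormals.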
float16 float16_muladd(float16 a, float16 b, float16 c, flag negate_product) {
flag aSign, bSign, cSign, zSign;
int16 aExp, bExp, cExp, pExp, zExp, expDiff;
bits16 aSig, bSig, cSig;
flag pInf, pZero, pSign;
bits32 pSig32, cSig32, zSig32;
bits16 pSig;
int shiftcount;
flag infzero;
/* Extract the sign bit, exponent and significand */
aSig = extractFloat16Frac(a);
aExp = extractFloat16Exp(a);
aSign = extractFloat16Sign(a);
bSig = extractFloat16Frac(b);
bExp = extractFloat16Exp(b);
bSign = extractFloat16Sign(b);
cSig = extractFloat16Frac(c);
cExp = extractFloat16Exp(c);
cSign = extractFloat16Sign(c);
/* Flag to indicate fusedMultiplyAdd(0, inf, c) or fusedMultiplyAdd(inf, 0, c) */
infzero =
((aExp == 0 && aSig == 0 && bExp == 0x1f && bSig == 0) ||
(aExp == 0x1f && aSig == 0 && bExp == 0 && bSig == 0));
/* CASE1: if any input is NaN => NaN propagate */
/* It is implementation-defined whether the cases of (0,inf,qnan)
* and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
* they return if they do), so we have to hand this information
* off to the target-specific pick-a-NaN routine.
*/
/* IEEE754 7.2 - Invalid: fusedMultiplyAdd(0, inf, c) or
* fusedMultiplyAdd(inf, 0 , c) unless c is a quiet NaN; If c is a
* quiet NaN then it is implementation defined whether the invalid operation
* exception is signaled.
*/
if (((aExp == 0x1f) && aSig) || ((bExp == 0x1f) && bSig) ||
((cExp == 0x1f) && cSig)) {
return propagateFloat16MulAddNaN(a, b, c, infzero);
}
/* Work out the sign and type of the product */
pSign = aSign ^ bSign;
if (negate_product) {
pSign ^= 1;
}
/* CASE2: fusedMultiplyAdd(0, inf, c) or fusedMultiplyAdd(inf,0, c) and c is
* not NaN => raise invalid */
if (infzero) {
float_raise(float_flag_invalid);
return float16_default_nan;
}
pInf = (aExp == 0x1f) || (bExp == 0x1f);
pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
/* CASE3 and CASE4: c is inf, p is number or inf*/
if (cExp == 0x1f) {
if (pInf && (pSign ^ cSign)) {
/* CASE3: addition of opposite-signed infinities => InvalidOperation */
float_raise(float_flag_invalid);
return float16_default_nan;
}
/* CASE4: Otherwise generate an infinity of the same sign */
return packFloat16(cSign, 0x1f, 0);
}
/* CASE5: c is number and p is inf */
if (pInf) {
return packFloat16(pSign, 0x1f, 0);
}
/* CASE6: c is number, p is zero */
if (pZero) {
if (cExp == 0) {
if (cSig == 0) {
/* Adding two exact zeroes */
if (pSign == cSign) {
zSign = pSign;
} else if (float_rounding_mode == float_round_down) {
zSign = 1;
} else {
zSign = 0;
}
return packFloat16(zSign, 0, 0);
}
}
/* CASE7: Zero plus something non-zero : just return the something */
return c;
}
if (aExp == 0) {
normalizeFloat16Subnormal(aSig, &aExp, &aSig);
}
if (bExp == 0) {
normalizeFloat16Subnormal(bSig, &bExp, &bSig);
}
/* Calculate the actual result a * b + c */
/* NOTE: we subtract 0xe where float16_mul() subtracts 0xf
* because we want the true exponent, not the "one-less-than"
* flavour that roundAndPackFloat16() takes.
*/
pExp = aExp + bExp - 0xe;
aSig = (aSig | 0x0400) << 4;
bSig = (bSig | 0x0400) << 5;
pSig32 = (bits32)aSig * bSig;
if ((sbits32)(pSig32 << 1) >= 0) {
pSig32 <<= 1;
pExp--;
}
zSign = pSign;
/* Now pSig32 is the significand of the multiply, with the explicit bit in
* position 30.
*/
if (cExp == 0) {
if (!cSig) {
/* Throw out the special case of c being an exact zero now */
shift32RightJamming(pSig32, 16, &pSig32);
pSig = pSig32;
return roundAndPackFloat16(zSign, pExp - 1, pSig);
}
normalizeFloat16Subnormal(cSig, &cExp, &cSig);
}
cSig32 = (bits32)cSig << (30 - 10);
cSig32 |= 0x40000000;
expDiff = pExp - cExp;
if (pSign == cSign) {
/* Addition */
if (expDiff > 0) {
/* scale c to match p */
shift32RightJamming(cSig32, expDiff, &cSig32);
zExp = pExp;
} else if (expDiff < 0) {
/* scale p to match c */
shift32RightJamming(pSig32, -expDiff, &pSig32);
zExp = cExp;
} else {
/* no scaling needed */
zExp = cExp;
}
/* Add significands and make sure explicit bit ends up in posn 30 */
zSig32 = pSig32 + cSig32;
if ((sbits32)zSig32 < 0) {
shift32RightJamming(zSig32, 1, &zSig32);
} else {
zExp--;
}
} else {
/* Subtraction */
if (expDiff > 0) {
shift32RightJamming(cSig32, expDiff, &cSig32);
zSig32 = pSig32 - cSig32;
zExp = pExp;
} else if (expDiff < 0) {
shift32RightJamming(pSig32, -expDiff, &pSig32);
zSig32 = cSig32 - pSig32;
zExp = cExp;
zSign ^= 1;
} else {
zExp = pExp;
if (cSig32 < pSig32) {
zSig32 = pSig32 - cSig32;
} else if (pSig32 < cSig32) {
zSig32 = cSig32 - pSig32;
zSign ^= 1;
} else {
/* Exact zero */
zSign = 0;
if (float_rounding_mode == float_round_down) {
zSign ^= 1;
}
return packFloat16(zSign, 0, 0);
}
}
--zExp;
/* Normalize to put the explicit bit back into bit 30. */
shiftcount = countLeadingZeros32(zSig32) - 1;
zSig32 <<= shiftcount;
zExp -= shiftcount;
}
shift32RightJamming(zSig32, 16, &zSig32);
return roundAndPackFloat16(zSign, zExp, zSig32);
}
void fp_mac_h(
Word16 d0,
Word16 d1,
Word16 d2,
Word32 negate_product,
Word32 fcr,
Word32 fsr_i,
Word16* res,
Word32* fsr_o) {
// Extract rounding mode from FCR/FSR to softfloat
float_rounding_mode = GetRound(fcr);
float_exception_flags = GetException(fsr_i);
// Call softfloat lib
*res = float16_muladd(d1, d2, d0, negate_product);
//*fsr_o = PutException(float_exception_flags, fsr_i);
}
void fma16(
const Word16 input,
const Word16 a,
const Word16 b,
const Word32 fcr,
const Word32 fsr_i,
Word16* result,
Word32* fsr_o) {
Word16 res;
Word32 fsr = 0;
// Call fp utility
fp_mac_h(b, input, a, 0, fcr, fsr_i, &res, &fsr);
// Output result
*fsr_o = fsr;
*result = res;
}
float fake_fma_fp16_slow(float v1, float v2, float v3) {
uint32_t fcr_val = 0;
uint32_t fsr_val = 0x00000F80;
uint32_t exception_flags = 0;
uint16_t hv1, hv2, hv3, hresult;
hv1 = _cvtss_sh(v1, 0);
hv2 = _cvtss_sh(v2, 0);
hv3 = _cvtss_sh(v3, 0);
fma16(
*reinterpret_cast<Word16*>(&hv1),
*reinterpret_cast<Word16*>(&hv2),
*reinterpret_cast<Word16*>(&hv3),
*reinterpret_cast<Word32*>(&fcr_val),
*reinterpret_cast<Word32*>(&fsr_val),
reinterpret_cast<Word16*>(&hresult),
reinterpret_cast<Word32*>(&exception_flags));
return _cvtsh_ss(hresult);
}
void fake_fma_fp16_slow(int N, const float* A, const float* B, float* Out) {
for (int n = 0; n < N; n++) {
Out[n] = fake_fma_fp16_slow(A[n], B[n], Out[n]);
}
}
} // namespace fp16_fma

View File

@ -1,41 +0,0 @@
#include <glog/logging.h>
#include <gtest/gtest.h>
#include <cmath>
#include <vector>
#include "fp16_fma.h"
using namespace std;
using namespace fake_fp16;
TEST(FP16_FMA, Simple) {
int x = 1;
x += 2;
int N = 6;
vector<float> A(N, 1.23);
vector<float> B(N, 2.34);
vector<float> C(N, 3.45);
fma_fp16(N, A.data(), B.data(), C.data());
for (int i = 0; i < N; i++) {
LOG(INFO) << C[i] << " ";
ASSERT_TRUE(abs(C[i] - 6.32812) < 1e-3);
}
}
TEST(FP16_FMA, Comprehensive) {
#if 0
#pragma omp parallel num_threads(30)
for (uint16_t a = 0; a < 1 << 15; a++) {
for (uint16_t b = 0; b < 1 << 15; b++) {
for (uint16_t c = 0; c < 1 << 15; c++) {
uint16_t z = a + b * c;
// fake_fma_fp16_slow(A[0], B[0], C[0]);
}
}
}
fake_fma_fp16_slow(A[0], B[0], C[0]);
#endif
}

View File

@ -1,467 +0,0 @@
#include "caffe2/contrib/fakelowp/fp16_gemm_utils.h"
#include <fbgemm/FbgemmConvert.h>
#include <fbgemm/FbgemmFP16.h>
#include <glog/logging.h>
#include <immintrin.h>
#include "caffe2/core/context.h"
#include "caffe2/utils/math.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
namespace caffe2 {
// dimA(before transpose) = M x N, dimA (after transpose) = N x M.
void transpose(const float* A, std::vector<float>& A_trans, int M, int N) {
CAFFE_ENFORCE_EQ(M * N, A_trans.size());
fbgemm::transpose_simd(M, N, A, N, A_trans.data(), M);
}
void custom_fp16_gemm_with_trans(
const CBLAS_TRANSPOSE trans_A,
const CBLAS_TRANSPOSE trans_B,
const int m,
const int k,
const int n,
const float* A,
const float* B,
const float beta,
float* C,
const bool use_acc_fp16,
const bool use_temp_accumulator) {
switch (trans_A) {
case CblasNoTrans: {
switch (trans_B) {
case CblasNoTrans: {
// A * B
custom_fp16_gemm(
m, k, n, A, B, beta, C, use_acc_fp16, use_temp_accumulator);
break;
}
case CblasTrans: {
// A * B_trans
std::vector<float> B_trans(n * k);
transpose(B, B_trans, n, k);
custom_fp16_gemm(
m,
k,
n,
A,
B_trans.data(),
beta,
C,
use_acc_fp16,
use_temp_accumulator);
break;
}
default:
LOG(FATAL) << "Unexpected CBLAS_TRAnSPOSE for trans_B";
}
} break;
case CblasTrans: {
switch (trans_B) {
case CblasNoTrans: {
// A_trans * B
std::vector<float> A_trans(k * m);
transpose(A, A_trans, k, m);
custom_fp16_gemm(
m,
k,
n,
A_trans.data(),
B,
beta,
C,
use_acc_fp16,
use_temp_accumulator);
break;
}
case CblasTrans: {
// A_trans * B_trans
std::vector<float> A_trans(k * m);
std::vector<float> B_trans(n * k);
transpose(A, A_trans, k, m);
transpose(B, B_trans, n, k);
custom_fp16_gemm(
m,
k,
n,
A_trans.data(),
B_trans.data(),
beta,
C,
use_acc_fp16,
use_temp_accumulator);
break;
}
default:
LOG(FATAL) << "Unexpected CBLAS_TRAnSPOSE for trans_B";
}
} break;
default:
LOG(FATAL) << "Unexpected CBLAS_TRAnSPOSE for trans_A";
}
}
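// clamp_subnormals zeroes out any lane whose magnitude is below epsilon_
// (callers pass 2^-14, the smallest normal fp16 value), emulating
// flush-to-zero of fp16 subnormals.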
static inline __m256 clamp_subnormals(__m256 input, const float epsilon_) {
__m256 epsilon = _mm256_set1_ps(epsilon_);
__m256 nepsilon = _mm256_set1_ps(-epsilon_);
__m256 mask = _mm256_or_ps(
_mm256_cmp_ps(input, nepsilon, _CMP_LE_OS),
_mm256_cmp_ps(input, epsilon, _CMP_GE_OS));
return _mm256_and_ps(input, mask);
}
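// custom_fp16_gemm computes C = A * B + beta * C for row-major A (m x k) and
// B (k x n) whose entries are already fp16-representable floats. It blocks the
// reduction dimension by kb_max = 128 and vectorizes over n in VLEN = 8 lanes;
// with use_acc_fp16 every product and partial sum is rounded back to fp16 (and
// subnormals are flushed), and with use_temp_accumulator each k-block is summed
// into a separate register before being folded into C.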
void custom_fp16_gemm(
const int m,
const int k,
const int n,
const float* A_fp16,
const float* B_fp16,
const float beta,
float* C,
const bool use_acc_fp16,
const bool use_temp_accumulator) {
#ifdef LOG_LEVEL_FOR_FBFCPACKEDACC16_PERFORMANCE_LOG
clock_t begin = clock();
#endif
int C_size = m * n;
if (beta == 0) {
// In Caffe2 we often do a lazy initialization, which may contain NaNs in
// the float values. As a result, if beta is 0, we explicitly do a setzero.
memset(C, 0, C_size * sizeof(C[0]));
} else {
float beta_fp16 = fbgemm::cpu_half2float(fbgemm::cpu_float2half_rn(beta));
__m256 mBetaFp16 = _mm256_broadcast_ss(&beta_fp16);
int i = 0;
for (i = 0; i + 8 <= C_size; i += 8) {
__m256 mC = _mm256_loadu_ps(C + i);
mC = _mm256_mul_ps(mC, mBetaFp16);
_mm256_storeu_ps(C + i, mC);
}
for (; i < C_size; i++) {
C[i] = C[i] * beta_fp16;
}
}
// Encode the smallest normal number in float16
union epsilon_t {
float f;
uint32_t i;
};
union epsilon_t epsilon;
epsilon.i = 0x38800000u; // 1 / 16384
constexpr int VLEN = 8;
const int kb_max = 128;
for (int i = 0; i < m; i++) {
for (int l = 0; l < k; l += kb_max) {
int kb = std::min(kb_max, k - l);
for (int j = 0; j < n; j += VLEN) {
int nb = std::min(VLEN, n - j);
if (nb == VLEN) {
__m256 mC = _mm256_loadu_ps(C + i * n + j);
__m256 mC_temp = _mm256_setzero_ps();
for (int l2 = l; l2 < l + kb; l2++) {
__m256 mA_fp16 = _mm256_broadcast_ss(A_fp16 + i * k + l2);
__m256 mB_fp16 = _mm256_loadu_ps((B_fp16 + l2 * n + j));
if (use_acc_fp16) {
mA_fp16 = clamp_subnormals(mA_fp16, epsilon.f);
mB_fp16 = clamp_subnormals(mB_fp16, epsilon.f);
}
__m256 mAB = _mm256_mul_ps(mA_fp16, mB_fp16);
if (use_acc_fp16) {
__m256 mAB_fp16 = _mm256_cvtph_ps(_mm256_cvtps_ph(mAB, 0));
mAB_fp16 = clamp_subnormals(mAB_fp16, epsilon.f);
if (use_temp_accumulator) {
mC_temp = _mm256_add_ps(mC_temp, mAB_fp16);
mC_temp = _mm256_cvtph_ps(_mm256_cvtps_ph(mC_temp, 0));
} else {
mC = _mm256_add_ps(mC, mAB_fp16);
mC = _mm256_cvtph_ps(_mm256_cvtps_ph(mC, 0));
}
} else {
if (use_temp_accumulator) {
mC_temp = _mm256_add_ps(mC_temp, mAB);
} else {
mC = _mm256_add_ps(mC, mAB);
}
}
if (use_acc_fp16) {
mC = clamp_subnormals(mC, epsilon.f);
}
}
if (use_temp_accumulator) {
if (use_acc_fp16) {
mC = _mm256_cvtph_ps(_mm256_cvtps_ph(mC, 0));
mC = _mm256_add_ps(mC, mC_temp);
mC = _mm256_cvtph_ps(_mm256_cvtps_ph(mC, 0));
} else {
mC = _mm256_add_ps(mC, mC_temp);
}
}
_mm256_storeu_ps(C + i * n + j, mC);
} else {
__m256 mC_temp = _mm256_setzero_ps();
int32_t mask_src[] = {
-1,
-1,
-1,
-1,
-1,
-1,
-1,
-1,
0,
0,
0,
0,
0,
0,
0,
0,
};
__m256i imask =
_mm256_loadu_si256((__m256i const*)(mask_src + 8 - nb));
__m256 mC = _mm256_maskload_ps(C + i * n + j, imask);
for (int l2 = l; l2 < l + kb; l2++) {
__m256 mA_fp16 = _mm256_broadcast_ss(A_fp16 + i * k + l2);
__m256 mB_fp16 = _mm256_maskload_ps(B_fp16 + l2 * n + j, imask);
if (use_acc_fp16) {
mA_fp16 = clamp_subnormals(mA_fp16, epsilon.f);
mB_fp16 = clamp_subnormals(mB_fp16, epsilon.f);
}
__m256 mAB = _mm256_mul_ps(mA_fp16, mB_fp16);
if (use_acc_fp16) {
__m256 mAB_fp16 = _mm256_cvtph_ps(_mm256_cvtps_ph(mAB, 0));
mAB_fp16 = clamp_subnormals(mAB_fp16, epsilon.f);
if (use_temp_accumulator) {
mC_temp = _mm256_add_ps(mC_temp, mAB_fp16);
mC_temp = _mm256_cvtph_ps(_mm256_cvtps_ph(mC_temp, 0));
} else {
mC = _mm256_add_ps(mC, mAB_fp16);
mC = _mm256_cvtph_ps(_mm256_cvtps_ph(mC, 0));
}
} else {
if (use_temp_accumulator) {
mC_temp = _mm256_add_ps(mC_temp, mAB);
} else {
mC = _mm256_add_ps(mC, mAB);
}
}
if (use_acc_fp16) {
mC = clamp_subnormals(mC, epsilon.f);
}
}
if (use_temp_accumulator) {
if (use_acc_fp16) {
mC = _mm256_cvtph_ps(_mm256_cvtps_ph(mC, 0));
mC = _mm256_add_ps(mC, mC_temp);
mC = _mm256_cvtph_ps(_mm256_cvtps_ph(mC, 0));
} else {
mC = _mm256_add_ps(mC, mC_temp);
}
}
_mm256_maskstore_ps(C + i * n + j, imask, mC);
}
}
}
}
if (!use_acc_fp16) {
constexpr int kSize=8;
int i = 0;
for (; i + kSize <= C_size; i+= kSize) {
__m256 mC = _mm256_loadu_ps(C + i);
mC = _mm256_cvtph_ps(_mm256_cvtps_ph(mC, 0));
_mm256_storeu_ps(C + i, mC);
}
if (i < C_size){
vector<float> tmp(8);
for (int kk =0; kk + i < C_size; kk++) {
tmp[kk] = C[i + kk];
}
__m256 mC = _mm256_loadu_ps(tmp.data());
mC = _mm256_cvtph_ps(_mm256_cvtps_ph(mC, 0));
_mm256_storeu_ps(tmp.data(), mC);
for (int kk =0; kk + i < C_size; kk++) {
C[i + kk] = tmp[kk];
}
}
}
#ifdef LOG_LEVEL_FOR_FBFCPACKEDACC16_PERFORMANCE_LOG
clock_t end = clock();
double elapsed_secs = double(end - begin) / CLOCKS_PER_SEC;
VLOG(LOG_LEVEL_FOR_FBFCPACKEDACC16_ACCURACY_LOG)
<< "cblas_gemm_compute_acc16 run time = " << elapsed_secs << endl;
#endif
}
void custom_fp16_gemv(
const bool use_acc_fp16,
const bool use_custom_acc32,
const bool use_temp_accumulator,
const CBLAS_TRANSPOSE trans_A,
const int M,
const int N,
const float alpha,
const float* A,
const float* x,
const float beta,
float* y,
CPUContext* context) {
if (use_acc_fp16) {
custom_fp16_gemm_with_trans(
trans_A,
CblasNoTrans,
M,
1,
N,
A,
x,
beta,
y,
true /* use acc_fp16 */,
use_temp_accumulator);
} else if (use_custom_acc32 && use_temp_accumulator) {
custom_fp16_gemm_with_trans(
trans_A,
CblasNoTrans,
M,
1,
N,
A,
x,
beta,
y,
false /* use acc_fp32 */,
use_temp_accumulator);
} else {
math::Gemv<float, CPUContext>(trans_A, M, N, alpha, A, x, beta, y, context);
}
}
void custom_fp16_gemm_batched(
const bool use_acc_fp16,
const bool use_custom_acc32,
const bool use_temp_accumulator,
const CBLAS_TRANSPOSE trans_A,
const CBLAS_TRANSPOSE trans_B,
const int batch_size,
const int M,
const int N,
const int K,
const float alpha,
const float** A,
const float** B,
const float beta,
float** C,
CPUContext* context) {
if (!use_acc_fp16 && (!use_custom_acc32 || !use_temp_accumulator)) {
math::GemmBatched<float, CPUContext>(
trans_A, trans_B, batch_size, M, N, K, alpha, A, B, beta, C, context);
return;
}
for (int i = 0; i < batch_size; ++i) {
if (use_acc_fp16) {
custom_fp16_gemm_with_trans(
trans_A,
trans_B,
M,
K,
N,
A[i],
B[i],
beta,
C[i],
true /* use acc_fp16 */,
use_temp_accumulator);
} else {
CAFFE_ENFORCE(use_custom_acc32 && use_temp_accumulator);
custom_fp16_gemm_with_trans(
trans_A,
trans_B,
M,
K,
N,
A[i],
B[i],
beta,
C[i],
false /* use acc_fp32 */,
use_temp_accumulator);
}
}
}
void custom_fp16_gemm_strided_batched(
const bool use_acc_fp16,
const bool use_custom_acc32,
const bool use_temp_accumulator,
const CBLAS_TRANSPOSE trans_A,
const CBLAS_TRANSPOSE trans_B,
const int batch_size,
const int M,
const int N,
const int K,
const float alpha /* unused */,
const float* A,
const int A_stride,
const float* B,
const int B_stride,
const float beta,
float* C,
const int C_stride,
CPUContext* context) {
// loop over matrices in the batch
for (int i = 0; i < batch_size; ++i) {
if (use_acc_fp16) {
custom_fp16_gemm_with_trans(
trans_A,
trans_B,
M,
K,
N,
A,
B,
beta,
C,
true /* use_acc_fp16 */,
use_temp_accumulator);
} else {
custom_fp16_gemm_with_trans(
trans_A,
trans_B,
M,
K,
N,
A,
B,
beta,
C,
false /* use acc_fp32*/,
use_temp_accumulator);
}
A += A_stride;
B += B_stride;
C += C_stride;
}
}
} // namespace caffe2

View File

@ -1,81 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#include "caffe2/core/context.h"
#include "caffe2/utils/math.h"
namespace caffe2 {
void custom_fp16_gemm(
const int m,
const int k,
const int n,
const float* A_fp16,
const float* B_fp16,
const float beta,
float* C,
const bool use_acc_fp16,
const bool use_temp_accumulator);
void custom_fp16_gemm_with_trans(
const CBLAS_TRANSPOSE trans_A,
const CBLAS_TRANSPOSE trans_B,
const int m,
const int k,
const int n,
const float* A_fp16,
const float* B_fp16,
const float beta,
float* C,
const bool use_acc_fp16,
const bool use_temp_accumulator);
void transpose(const float* A, float* A_trans, int M, int N);
void custom_fp16_gemv(
const bool use_acc_fp16,
const bool use_custom_acc32,
const bool use_temp_accumulator,
const CBLAS_TRANSPOSE trans_A,
const int M,
const int N,
const float alpha,
const float* A,
const float* x,
const float beta,
float* y,
CPUContext* context);
void custom_fp16_gemm_batched(
const bool use_acc_fp16,
const bool use_custom_acc32,
const bool use_temp_accumulator,
const CBLAS_TRANSPOSE trans_A,
const CBLAS_TRANSPOSE trans_B,
const int batch_size,
const int M,
const int N,
const int K,
const float alpha,
const float** A,
const float** B,
const float beta,
float** C,
CPUContext* context);
void custom_fp16_gemm_strided_batched(
const bool use_acc_fp16,
const bool use_custom_acc32,
const bool use_temp_accumulator,
const CBLAS_TRANSPOSE trans_A,
const CBLAS_TRANSPOSE trans_B,
const int batch_size,
const int M,
const int N,
const int K,
const float alpha /* unused */,
const float* A,
const int A_stride,
const float* B,
const int B_stride,
const float beta,
float* C,
const int C_stride,
CPUContext* context);
} // namespace caffe2

View File

@ -1,14 +0,0 @@
#include "caffe2/contrib/fakelowp/int8_dequantize_op_nnpi.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(Int8DequantizeNNPI, int8::Int8DequantizeNNPIOp);
OPERATOR_SCHEMA(Int8DequantizeNNPI)
.IdenticalTypeAndShape()
.NumInputs(1)
.NumOutputs(1)
.Input(0, "qX", "Int8 Tensor qX.")
.Output(0, "Y", "FP32 Tensor that represents mapped real value of qX.");
} // namespace caffe2

View File

@ -1,57 +0,0 @@
#ifndef CAFFE2_OPERATORS_INT8_DEQUANTIZE_OP_H_
#define CAFFE2_OPERATORS_INT8_DEQUANTIZE_OP_H_
#include <fbgemm/FbgemmConvert.h>
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor_int8.h"
#include "caffe2/operators/quantized/int8_utils.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
namespace caffe2 {
namespace int8 {
namespace {
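// Illustrative example (assumed values): with X_scale = 0.5 and X_offset = 10,
// the quantized byte 14 dequantizes to (14 - 10) / (1 / 0.5) = 2.0, i.e. the
// usual (q - zero_point) * scale mapping expressed via the inverse scale.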
void Int8DequantizeNNPI(
const uint8_t* in,
float* out,
const int64_t N,
const float X_scale,
const int32_t X_offset) {
float X_scale_fp32 = 1.0f / X_scale;
for (const auto i : c10::irange(N)) {
out[i] = (float)(static_cast<int32_t>(in[i]) - X_offset) / X_scale_fp32;
}
}
} // namespace
class Int8DequantizeNNPIOp final : public Operator<CPUContext> {
public:
using Operator<CPUContext>::Operator;
bool RunOnDevice() override {
const auto& X = Inputs()[0]->template Get<Int8TensorCPU>();
auto* Y = Output(0, X.t.sizes(), at::dtype<float>());
int32_t X_offset = X.zero_point;
auto X_scale = X.scale;
Int8DequantizeNNPI(
X.t.data<uint8_t>(),
Y->mutable_data<float>(),
X.t.numel(),
X_scale,
X_offset);
// UsingOneOverScale_);
return true;
}
};
} // namespace int8
} // namespace caffe2
#endif // CAFFE2_OPERATORS_INT8_DEQUANTIZE_OP_H_

View File

@ -1,15 +0,0 @@
#include "caffe2/contrib/fakelowp/int8_quantize_op_nnpi.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(Int8QuantizeNNPI, int8::Int8QuantizeNNPIOp);
OPERATOR_SCHEMA(Int8QuantizeNNPI)
.IdenticalTypeAndShape()
.Arg("Y_scale", "Output tensor quantization scale")
.Arg("Y_zero_point", "Output tensor quantization offset")
.NumInputs(1)
.NumOutputs(1)
.Input(0, "X", "FP32 Tensor X.")
.Output(0, "Y", "Int8 Tensor qX representing X with linear quantization.");
} // namespace caffe2

View File

@ -1,108 +0,0 @@
#ifndef CAFFE2_OPERATORS_INT8_QUANTIZE_OP_H_
#define CAFFE2_OPERATORS_INT8_QUANTIZE_OP_H_
#include <fbgemm/FbgemmConvert.h>
#include <cmath>
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor_int8.h"
#include "caffe2/operators/quantized/int8_utils.h"
#include "fp16_fma.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
namespace caffe2 {
namespace int8 {
namespace {
static float ClampScale(float s)
{
const float MinScale(1e-10f);
if (std::fabs(s) < MinScale) {
LOG_EVERY_N(WARNING, 1000) << "Too small scale detected: "
<< s << " clamping to +/-" << MinScale;
return std::signbit(s) ? -MinScale : MinScale;
} else {
return s;
}
}
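// Illustrative example (assumed values): with Y_scale = 0.25 and
// Y_zero_point = 128, an input of 1.0 quantizes to round(1.0 * 4 + 128) = 132,
// which already lies inside the [0, 255] clamp range.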
void Int8QuantizeNNPI(
const float* in,
uint8_t* out,
const int64_t N,
const float Y_scale,
const int32_t Y_offset) {
const int32_t qmin = std::numeric_limits<uint8_t>::min();
const int32_t qmax = std::numeric_limits<uint8_t>::max();
float inv_scale = ClampScale(1 / Y_scale);
float inv_scale_fp16 = 0;
fbgemm::RoundToFloat16(
&inv_scale, &inv_scale_fp16, 1, false /* no clamping */);
float offset_tmp = -Y_offset;
fbgemm::RoundToFloat16(
&offset_tmp, &offset_tmp, 1, false /* no clamping */);
std::vector<float> in_fp16(N);
fbgemm::RoundToFloat16(
in, in_fp16.data(), N, false /* no clamping */);
std::vector<float> inv_scalev(N, inv_scale_fp16);
std::vector<float> offsetv(N, -offset_tmp);
fake_fp16::fma_fp16(N, in_fp16.data(), inv_scalev.data(), offsetv.data());
for (const auto i : c10::irange(N)) {
offsetv[i] = round(offsetv[i]);
}
fbgemm::RoundToFloat16(
offsetv.data(), offsetv.data(), N, false /* no clamping */);
for (const auto i : c10::irange(N)) {
float halfRes = offsetv[i];
if (std::isinf(halfRes)) {
if (halfRes > 0) {
halfRes = qmax;
} else {
halfRes = qmin;
}
}
if (halfRes > qmax) {
halfRes = qmax;
}
if (halfRes < qmin) {
halfRes = qmin;
}
out[i] = static_cast<uint8_t>(halfRes);
}
}
} // namespace
class Int8QuantizeNNPIOp final : public Operator<CPUContext> {
public:
using Operator<CPUContext>::Operator;
bool RunOnDevice() override {
const auto& X = Input(0);
auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>();
Y->t.ResizeLike(X);
int32_t Y_offset =
this->template GetSingleArgument<int>("Y_zero_point", 0);
auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1);
Y->scale = Y_scale;
Y->zero_point = Y_offset;
Int8QuantizeNNPI(
X.data<float>(),
Y->t.mutable_data<uint8_t>(),
X.numel(),
Y_scale,
Y_offset);
return true;
}
};
} // namespace int8
} // namespace caffe2
#endif // CAFFE2_OPERATORS_INT8_QUANTIZE_OP_H_

View File

@ -1,27 +0,0 @@
#include "caffe2/contrib/fakelowp/int8_swish_op_nnpi.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(SwishFakeInt8NNPI, int8::SwishInt8NNPIOp);
OPERATOR_SCHEMA(SwishFakeInt8NNPI)
.IdenticalTypeAndShape()
.Arg("X_scale", "Inout tensor quantization scale")
.Arg("X_zero_point", "Input tensor quantization offset")
.Arg("Y_scale", "Output tensor quantization scale")
.Arg("Y_zero_point", "Output tensor quantization offset")
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
Apply the Swish function element-wise after dequantizing input tensor.
$$Swish(x) = \frac{x}{1+\exp(-x)}$$
Quantize the Swish function output back to Int8.
The input and output of this operator are converted to fp16 precision
before applying the function.
<details>
</details>
)DOC")
.Input(0, "X", "Int8 Tensor X.")
.Output(0, "Y", "Int8 Tensor Y.");
} // namespace caffe2

View File

@ -1,87 +0,0 @@
#ifndef CAFFE2_OPERATORS_INT8_SWISH_OP_H_
#define CAFFE2_OPERATORS_INT8_SWISH_OP_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor_int8.h"
#include "caffe2/operators/quantized/int8_utils.h"
namespace caffe2 {
namespace int8 {
namespace {
using namespace std;
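// Illustrative example (assumed values): with X_scale = 0.1, X_zero_point = 0,
// Y_scale = 0.05 and Y_zero_point = 0, the input byte 20 dequantizes to 2.0,
// swish(2.0) = 2 / (1 + e^-2) ~= 1.7616, and requantizing gives
// round(1.7616 / 0.05) = 35.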
void SwishFakeInt8NNPI(
const uint8_t* in,
uint8_t* out,
const int64_t N,
const float X_scale,
const int32_t X_offset,
const float Y_scale,
const int32_t Y_offset) {
const uint8_t max_val = std::numeric_limits<uint8_t>::max();
const uint8_t min_val = std::numeric_limits<uint8_t>::min();
float X_scale_fp32 = 1.0f / X_scale;
float deq_val = 0.0f;
float deq_swish = 0.0f;
int32_t quant_val = 0;
uint8_t result = 0;
for (const auto i : c10::irange(N)) {
deq_val = (static_cast<uint8_t>(in[i]) - X_offset) / X_scale_fp32;
deq_swish = deq_val / (1 + exp(-deq_val));
quant_val = round(deq_swish / Y_scale + Y_offset);
result = quant_val;
if (quant_val > max_val) {
result = max_val;
}
if (quant_val < min_val) {
result = min_val;
}
out[i] = static_cast<uint8_t>(result);
}
}
} // namespace
class SwishInt8NNPIOp final : public Operator<CPUContext> {
public:
using Operator<CPUContext>::Operator;
template <class... Args>
explicit SwishInt8NNPIOp(Args&&... args)
: Operator<CPUContext>(std::forward<Args>(args)...) {}
bool RunOnDevice() override {
const auto& X = Inputs()[0]->template Get<Int8TensorCPU>();
auto* Y = Outputs()[0]->template GetMutable<Int8TensorCPU>();
Y->t.ResizeLike(X.t);
int32_t Y_offset_ =
this->template GetSingleArgument<int>("Y_zero_point", 0);
auto Y_scale_ = this->template GetSingleArgument<float>("Y_scale", 1);
Y->scale = Y_scale_;
Y->zero_point = Y_offset_;
SwishFakeInt8NNPI(
X.t.data<uint8_t>(),
Y->t.mutable_data<uint8_t>(),
X.t.numel(),
X.scale,
X.zero_point,
Y_scale_,
Y_offset_);
return true;
}
};
} // namespace int8
} // namespace caffe2
#endif // CAFFE2_OPERATORS_INT8_SWISH_OP_H_

View File

@ -1,201 +0,0 @@
#include <algorithm>
#include "layernorm_fp16_fake_op.h"
#include "caffe2/contrib/fakelowp/common.h"
#include "caffe2/contrib/fakelowp/fp16_fma.h"
namespace caffe2 {
void LayerNormUtils::calcY(
const int M,
const int N,
const float* X,
const float* mean,
const float* std,
const float* gamma,
const float* beta,
float* Y) {
ConstEigenArrayMap<float> X_arr(X, N, M);
ConstEigenVectorArrayMap<float> mean_arr(mean, M);
ConstEigenVectorArrayMap<float> std_arr(std, M);
EigenArrayMap<float> Y_arr(Y, N, M);
std::vector<float> normalized(N);
for (int i = 0; i < M; ++i) {
float normFactor = float(1.0f / std_arr[i]);
fbgemm::RoundToFloat16(&normFactor, &normFactor, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
for (int j = 0; j < N; ++j) {
normalized[j] = X_arr.col(i)[j] - mean[i];
}
fbgemm::RoundToFloat16(normalized.data(), normalized.data(), N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
for (int j = 0; j < N; ++j) {
normalized[j] *= normFactor;
}
fbgemm::RoundToFloat16(normalized.data(), &Y_arr.col(i)[0], N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
if (gamma != nullptr && beta != nullptr) {
ConstEigenVectorArrayMap<float> gamma_arr(gamma, N);
ConstEigenVectorArrayMap<float> beta_arr(beta, N);
for (int i = 0; i < M; ++i) {
vector<float> res(N);
for (int j = 0; j < N; j++) {
res[j] = beta[j];
}
fake_fp16::fma_fp16(N, &Y_arr.col(i)[0], gamma, res.data());
for (int j = 0; j < N; j++) {
Y_arr.col(i)[j] = res[j];
}
}
}
}
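// ReducedAdd performs a five-level pairwise (tree) reduction over a 32-element
// vector, rounding the partial sums back to fp16 after every level so the
// accumulation stays in fp16 precision.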
float LayerNormUtils::ReducedAdd(const std::vector<float>& vec) {
constexpr int VEC_SIZE = 32;
std::vector<float> v(vec.begin(), vec.end());
for (int factor = 2; factor <=32; factor *=2) {
int range = VEC_SIZE / factor;
for (int i = 0; i < range; ++i) { // 16
v[i] = v[2 * i] + v[2 * i + 1];
}
fbgemm::RoundToFloat16(v.data(), v.data(), range, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
return v[0];
}
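// calcMeanStd accumulates mean = sum(x) / N and E[x^2] = sum(x * (x / N)) with
// fp16 FMAs over 32-wide chunks, then derives var = E[x^2] - mean^2 via one
// more fp16 FMA and returns std = sqrt(var + eps), clamping negative variances
// to zero.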
void LayerNormUtils::calcMeanStd(
const int M,
const int N,
const float eps,
const float* X,
float* mean,
float* std) {
ConstEigenArrayMap<float> X_arr(X, N, M);
std::vector<float> sqr(M, 0.0f);
std::vector<float> var(M, 0.0f);
float inv_N_val = 1.0f / N;
fbgemm::RoundToFloat16(&inv_N_val, &inv_N_val, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
constexpr int VEC_SIZE = 32;
std::vector<float> inv_N_vec(VEC_SIZE, inv_N_val);
std::vector<float> inv_N_prod_vec(VEC_SIZE, 0);
std::vector<float> avgVec(VEC_SIZE, 0.0f);
std::vector<float> sqrVec(VEC_SIZE, 0.0f);
std::vector<float> negMeanVec(M, 0.0f);
int numVecs = N / VEC_SIZE;
int tailSize = N - (numVecs * VEC_SIZE);
vector<float> X_fp16(M * N);
fbgemm::RoundToFloat16(
X, X_fp16.data(), M * N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
for (int i = 0; i < M; ++i) {
std::fill(avgVec.begin(), avgVec.end(), 0.0f);
std::fill(sqrVec.begin(), sqrVec.end(), 0.0f);
for (int j = 0; j < numVecs; ++j) {
fake_fp16::fma_fp16(
VEC_SIZE,
&X_fp16[i * N + VEC_SIZE * j],
inv_N_vec.data(),
avgVec.data());
for (int k = 0; k < VEC_SIZE; k++) {
inv_N_prod_vec[k] = X_fp16[i * N + VEC_SIZE * j + k] * inv_N_val;
}
fbgemm::RoundToFloat16(
inv_N_prod_vec.data(),
inv_N_prod_vec.data(),
VEC_SIZE,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fake_fp16::fma_fp16(
VEC_SIZE,
&X_fp16[i * N + VEC_SIZE * j],
inv_N_prod_vec.data(),
sqrVec.data());
}
if (tailSize > 0) {
fake_fp16::fma_fp16(
tailSize,
&X_fp16[i * N + VEC_SIZE * numVecs],
inv_N_vec.data(),
avgVec.data());
for (int k = 0; k < tailSize; k++) {
inv_N_prod_vec[k] = X_fp16[i * N + VEC_SIZE * numVecs + k] * inv_N_val;
}
fbgemm::RoundToFloat16(
inv_N_prod_vec.data(),
inv_N_prod_vec.data(),
tailSize,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fake_fp16::fma_fp16(
tailSize,
&X_fp16[i * N + VEC_SIZE * numVecs],
inv_N_prod_vec.data(),
sqrVec.data());
}
mean[i] = ReducedAdd(avgVec);
sqr[i] = ReducedAdd(sqrVec);
}
// compute variance and std deviation
std::copy(mean, mean + M, negMeanVec.begin());
std::transform(negMeanVec.cbegin(),
negMeanVec.cend(),
negMeanVec.begin(),
std::negate<float>());
fake_fp16::fma_fp16(M, mean, negMeanVec.data(), sqr.data());
std::copy(sqr.cbegin(), sqr.cend(), var.begin());
float teps = eps;
std::vector<float> tmpVec(M, 0.0f);
fbgemm::RoundToFloat16(&teps, &teps, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
int i = 0;
for (auto& v: var) {
if (v < 0.0) {
LOG_EVERY_N(WARNING, 1000) << "Variance " << v
<< " negative, resetting to 0.";
v = 0.0;
}
tmpVec[i] = var[i] + teps;
++i;
}
fbgemm::RoundToFloat16(
tmpVec.data(),
tmpVec.data(),
M,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
i = 0;
for (auto& v: tmpVec) {
if (v < 0) {
LOG_EVERY_N(WARNING, 1000) << "Variance " << v
<< " negative, resetting to 0.";
v = 0.0;
}
std[i] = std::sqrt(v);
++i;
}
fbgemm::RoundToFloat16(
std,
std,
M,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
REGISTER_CPU_OPERATOR(LayerNormFakeFP16NNPI, LayerNormFakeFp16Op<false>);
OPERATOR_SCHEMA(LayerNormFakeFP16NNPI).NumInputs({1, 3}).NumOutputs(3);
REGISTER_CPU_OPERATOR(LayerNormInt8QuantizeFakeNNPI,
LayerNormFakeFp16Op<true>);
OPERATOR_SCHEMA(LayerNormInt8QuantizeFakeNNPI)
.IdenticalTypeAndShape()
.NumInputs({1, 3})
.NumOutputs(3);
} // namespace caffe2

View File

@ -1,207 +0,0 @@
#pragma once
#include <algorithm>
#include <array>
#include <functional>
#include <string>
#include <vector>
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include <fbgemm/FbgemmConvert.h>
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
#include "fp16_fma.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
namespace caffe2 {
class LayerNormUtils {
public:
static void calcY(
const int M,
const int N,
const float* X,
const float* mean,
const float* std,
const float* gamma,
const float* beta,
float* Y);
static void calcMeanStd(
const int M,
const int N,
const float eps,
const float* X,
float* mean,
float* std);
static float ReducedAdd(const std::vector<float>& vec);
};
template <bool quantizeOutput=false>
class LayerNormFakeFp16Op final : public Operator<CPUContext> {
public:
template <class... Args>
explicit LayerNormFakeFp16Op(Args&&... args)
: Operator<CPUContext>(std::forward<Args>(args)...),
OP_SINGLE_ARG(int, "axis", axis_, 1),
OP_SINGLE_ARG(float, "epsilon", epsilon_, 1e-5f),
OP_SINGLE_ARG(bool, "elementwise_affine", elementwise_affine_, false) {}
~LayerNormFakeFp16Op() noexcept override {}
bool RunOnDevice() override {
return DoRunWithType();
}
bool DoRunWithType() {
const auto& X = Input(INPUT);
vector <float> Y_fp16;
Tensor *Y;
if (!quantizeOutput) {
Y = Output(OUTPUT, X.sizes(), at::dtype<float>());
} else {
Y_fp16.resize(X.numel());
}
CAFFE_ENFORCE_GE(X.dim(), 2, "LayerNorm requires input dim >=2.");
const int canonical_axis = X.canonical_axis_index(axis_);
std::vector<int64_t> moments_dims(
X.sizes().cbegin(), X.sizes().cbegin() + canonical_axis);
moments_dims.push_back(1);
auto* mean = Output(MEAN, moments_dims, at::dtype<float>());
auto* sigma = Output(STD, moments_dims, at::dtype<float>());
const int M = X.size_to_dim(canonical_axis);
const int N = X.size_from_dim(canonical_axis);
if (!quantizeOutput) {
Y->ResizeLike(X);
}
const float* X_data = X.template data<float>();
float *Y_data;
if (!quantizeOutput) {
Y_data = Y->template mutable_data<float>();
} else {
Y_data = Y_fp16.data();
}
float* mean_data = mean->template mutable_data<float>();
float* sigma_data = sigma->template mutable_data<float>();
std::vector<float> X_rounded(X.numel());
fbgemm::RoundToFloat16(
X_data,
X_rounded.data(),
X.numel(),
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
false /*USE_ACC_FP16*/);
X_data = X_rounded.data();
// Mean and Standard Deviation computation for the input data
LayerNormUtils::calcMeanStd(M, N, epsilon_, X_data, mean_data, sigma_data);
const float* gamma_data = nullptr;
const float* beta_data = nullptr;
// Layer Normalized Output computation
LayerNormUtils::calcY(
M, N, X_data, mean_data, sigma_data, gamma_data, beta_data, Y_data);
if (InputSize() == 3) {
// handle scale and bias via fp16_fma
std::vector<float> scale_data(N);
std::vector<float> bias_data(N);
fbgemm::RoundToFloat16(
Input(1).template data<float>(),
scale_data.data(),
N,
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
false /*USE_ACC_FP16*/);
fbgemm::RoundToFloat16(
Input(2).template data<float>(),
bias_data.data(),
N,
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
false /*USE_ACC_FP16*/);
for (const auto i : c10::irange(M)) {
// fma_fp16(A, B, Out) -> Out = A * B + Out
std::vector<float> out(N);
std::memcpy(out.data(), bias_data.data(), sizeof(float) * N);
fake_fp16::fma_fp16(N, Y_data + i * N, scale_data.data(), out.data());
std::memcpy(Y_data + i * N, out.data(), sizeof(float) * N);
}
}
// Quantize
// We should be using the same quantization function as int8 quantize,
// but we need to adjust for int8 vs uint8 bias. A simple shift of the output
// is not enough because this causes problems when rounding inside the fma.
// TODO: figure out how to commonize this with int8 quantize
if (quantizeOutput) {
auto* Y_int8 = Outputs()[0]->template GetMutable<int8::Int8TensorCPU>();
Y_int8->t.ResizeLike(X);
int32_t Y_offset =
this->template GetSingleArgument<int>("Y_zero_point", 0);
auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1);
float inv_scale = 1.0f / Y_scale;
fbgemm::RoundToFloat16(
&inv_scale, &inv_scale, 1, false /* no clamping */);
Y_int8->scale = Y_scale;
Y_int8->zero_point = Y_offset;
int Nout = X.numel();
std::vector<float> inv_scalev(Nout, inv_scale);
std::vector<float> offsetv(Nout, Y_offset);
uint8_t* Y_uint8_data = Y_int8->t.template mutable_data<uint8_t>();
fake_fp16::fma_fp16(Nout, Y_fp16.data(), inv_scalev.data(), offsetv.data());
const int32_t qmin = std::numeric_limits<uint8_t>::min();
const int32_t qmax = std::numeric_limits<uint8_t>::max();
for (const auto i : c10::irange(Nout)) {
float halfRes = offsetv[i];
halfRes = round(halfRes);
if (std::isinf(halfRes)) {
if (halfRes > 0) {
halfRes = qmax;
} else {
halfRes = qmin;
}
}
if (halfRes > qmax) {
halfRes = qmax;
}
if (halfRes < qmin) {
halfRes = qmin;
}
Y_uint8_data[i] = static_cast<uint8_t>(halfRes);
}
}
return true;
}
private:
const int axis_;
const float epsilon_;
// LayerNorm FP16 FakeLowP Op applies the scales and biases (or gamma and beta)
// whenever those inputs are provided; otherwise it omits them.
// We are keeping elementwise_affine to keep it consistent with LayerNorm FP32 Op.
const bool elementwise_affine_;
INPUT_TAGS(INPUT);
OUTPUT_TAGS(OUTPUT, MEAN, STD);
};
} // namespace caffe2

View File

@ -1,163 +0,0 @@
#include "lengths_reducer_fused_4bit_rowwise_fp16_fake_op.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(
SparseLengthsSumFused4BitRowwiseFakeFP16NNPI,
SparseLengthsFused4BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/false>);
OPERATOR_SCHEMA(SparseLengthsSumFused4BitRowwiseFakeFP16NNPI)
.NumInputs(3)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, false>::DATA,
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, false>::INDICES,
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, false>::LENGTHS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsSum, but operating on
4-bit rowwise quantized matrices with fused storage (where each row
stores quantized values, and then 2-byte scale and 2-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused4BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Output(0, "output", "output")
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFused4BitRowwiseFakeFP16NNPI);
REGISTER_CPU_OPERATOR(
SparseLengthsSumFused4BitRowwiseFakeFP16EmbeddingOnly,
SparseLengthsFused4BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/false,
/*use_fp16_for_embedding_only=*/true>);
OPERATOR_SCHEMA(SparseLengthsSumFused4BitRowwiseFakeFP16EmbeddingOnly)
.NumInputs(3)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, false, true>::DATA,
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, false, true>::
INDICES,
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, false, true>::
LENGTHS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsSum, but operating on
4-bit rowwise quantized matrices with fused storage (where each row
stores quantized values, and then 2-byte scale and 2-byte bias).
Convert only embedding entries using fake fp16.
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused4BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Output(0, "output", "output")
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFused4BitRowwiseFakeFP16EmbeddingOnly);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI,
SparseLengthsFused4BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/true>);
OPERATOR_SCHEMA(SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI)
.NumInputs(4)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, true>::DATA,
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, true>::INDICES,
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, true>::LENGTHS,
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, true>::WEIGHTS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsWeightedSum,
but operating on 4-bit rowwise quantized matrices with fused storage
(where each row stores quantized values, and then 2-byte scale and 2-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused4BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Input(
3,
"WEIGHTS",
"Vector of weights to scale rows of DATA with before reduction")
.Output(0, "output", "output");
NO_GRADIENT(SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFused4BitRowwiseFakeFP16EmbeddingOnly,
SparseLengthsFused4BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/true,
/*use_fp16_for_embedding_only=*/true>);
OPERATOR_SCHEMA(SparseLengthsWeightedSumFused4BitRowwiseFakeFP16EmbeddingOnly)
.NumInputs(4)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, true, true>::DATA,
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, true, true>::
INDICES,
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, true, true>::
LENGTHS,
SparseLengthsFused4BitRowwiseFakeFP16Op<CPUContext, true, true>::
WEIGHTS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsWeightedSum,
but operating on 4-bit rowwise quantized matrices with fused storage
(where each row stores quantized values, and then 2-byte scale and 2-byte bias).
Convert only embedding entries using fake fp16.
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused4BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Input(
3,
"WEIGHTS",
"Vector of weights to scale rows of DATA with before reduction")
.Output(0, "output", "output");
NO_GRADIENT(SparseLengthsWeightedSumFused4BitRowwiseFakeFP16EmbeddingOnly);
} // namespace caffe2
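The schemas above all refer to the fused 4-bit rowwise format: each row packs two 4-bit values per byte, followed by a 2-byte fp16 scale and a 2-byte fp16 bias. Below is a simplified, self-contained sketch of unpacking and dequantizing one such row; the scale and bias are passed as plain floats here (the real rows store them as fp16 at the end of the row), and the helper name is illustrative only:

#include <cstdint>
#include <cstdio>
#include <vector>

// Dequantize one row of packed 4-bit values: low nibble first, then high
// nibble, with x = q * scale + bias.
std::vector<float> dequantize_4bit_row(const uint8_t* packed, int packed_bytes,
                                       float scale, float bias) {
  std::vector<float> out;
  out.reserve(packed_bytes * 2); // two 4-bit values per byte
  for (int j = 0; j < packed_bytes; ++j) {
    out.push_back((packed[j] & 0x0f) * scale + bias); // low nibble
    out.push_back((packed[j] >> 4) * scale + bias);   // high nibble
  }
  return out;
}

int main() {
  const uint8_t row[2] = {0x21, 0x43}; // packed values 1, 2, 3, 4
  for (float v : dequantize_4bit_row(row, 2, 0.5f, -1.0f)) {
    std::printf("%g ", v); // prints -0.5 0 0.5 1
  }
  return 0;
}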

View File

@ -1,216 +0,0 @@
#pragma once
#include <immintrin.h>
#include "caffe2/perfkernels/fused_8bit_rowwise_embedding_lookup.h"
#include "fp16_fma.h"
#include "lengths_reducer_ops.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp_denorms);
namespace caffe2 {
template <
class Context,
bool with_weights = 0,
bool use_fp16_for_embedding_only = 0>
class SparseLengthsFused4BitRowwiseFakeFP16Op final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
explicit SparseLengthsFused4BitRowwiseFakeFP16Op(
const OperatorDef& operator_def,
Workspace* ws)
: Operator<Context>(operator_def, ws) {}
~SparseLengthsFused4BitRowwiseFakeFP16Op() noexcept override {}
bool RunOnDevice() override {
return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
this, Input(INDICES));
}
template <typename IndexType>
bool DoRunWithType() {
const auto& data = Input(DATA);
const auto& indices = Input(INDICES);
const auto& lengths = Input(LENGTHS);
CAFFE_ENFORCE_EQ(indices.dim(), 1, "INDICES must be a vector");
CAFFE_ENFORCE_EQ(lengths.dim(), 1, "LENGTHS must be a vector");
const float* weights = nullptr;
if (with_weights) {
const auto& weights_input = Input(WEIGHTS);
CAFFE_ENFORCE_EQ(weights_input.dim(), 1, "WEIGHTS must be a vector");
CAFFE_ENFORCE_EQ(
weights_input.numel(),
indices.numel(),
"WEIGHTS should have the same length as INDICES.");
weights = weights_input.template data<float>();
}
CAFFE_ENFORCE_GT(
data.size(1),
sizeof(at::Half) * 2,
"DATA must have more than 4 columns");
constexpr int NUM_ELEM_PER_BYTE = 2;
// Subtract 4 from the #columns of data for the 2-byte scale and 2-byte
// bias that we use in the fused representation (per row).
const std::vector<int64_t> shape = {
lengths.size(0),
static_cast<int64_t>(data.size(1) - 2 * sizeof(at::Half)) *
NUM_ELEM_PER_BYTE};
auto* output = Output(0, shape, at::dtype<float>());
// Copied from Fused8BitRowwiseEmbeddingLookupGenericSlow in
// fused_8bit_rowwise_embedding_lookup.cc
int64_t output_block_size = output->size(1);
CAFFE_ENFORCE_EQ(
output_block_size % NUM_ELEM_PER_BYTE,
0,
"block size must be divisible by 2");
int64_t input_block_size = output_block_size / NUM_ELEM_PER_BYTE;
int64_t output_size = output->size(0);
int64_t index_size = indices.numel();
int64_t data_size = data.size(0);
const uint8_t* input = data.template data<uint8_t>();
const IndexType* indices_data = indices.template data<IndexType>();
const int* lengths_data = lengths.template data<int>();
float* out = output->template mutable_data<float>();
std::vector<float> rowTempSums[2];
rowTempSums[0].resize(output_block_size);
rowTempSums[1].resize(output_block_size);
const auto scale_bias_offset = 2 * sizeof(at::Half);
const int64_t input_fused_block_size = input_block_size + scale_bias_offset;
int64_t current = 0;
for (const auto m : c10::irange(output_size)) {
if (!use_fp16_for_embedding_only) {
memset(rowTempSums[0].data(), 0, sizeof(float) * output_block_size);
memset(rowTempSums[1].data(), 0, sizeof(float) * output_block_size);
}
memset(out, 0, sizeof(float) * output_block_size);
if (current + lengths_data[m] > index_size) {
return false;
}
for (int i = 0; i < lengths_data[m]; ++i) {
int64_t idx = indices_data[current];
int accIdx = 0;
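// Use double-buffered accumulation only when the block size is even, at most
// 96, and the number of data columns is even; otherwise everything
// accumulates into buffer 0.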
if (output_block_size % 2 == 0 && output_block_size <= 96 &&
data.size(1) % 2 == 0) {
accIdx = i % 2;
}
if (idx < 0 || idx >= data_size) {
return false;
}
const at::Half* scale_bias = reinterpret_cast<const at::Half*>(
input + input_fused_block_size * indices_data[current] +
input_block_size);
float weight = 1.0f;
if (weights) {
weight = weights[current];
if (!use_fp16_for_embedding_only) {
// Fake fp16 rounding of weight
fbgemm::RoundToFloat16(
&weight, &weight, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
}
float scale = scale_bias[0];
float bias = scale_bias[1];
if (!use_fp16_for_embedding_only) {
scale *= weight;
fbgemm::RoundToFloat16(
&scale, &scale, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
// Unpack int4 elements
std::vector<float> input_rounded(output_block_size);
int k = 0;
for (const auto j : c10::irange(input_block_size)) {
input_rounded[k++] =
input[input_fused_block_size * indices_data[current] + j] & 0x0f;
input_rounded[k++] =
input[input_fused_block_size * indices_data[current] + j] >> 4;
}
if (use_fp16_for_embedding_only) {
std::vector<float> product_rounded(output_block_size);
TypedAxpy<float, float>(
output_block_size,
scale,
input_rounded.data(),
product_rounded.data());
for (const auto j : c10::irange(output_block_size)) {
product_rounded[j] += bias;
}
// Fake fp16 rounding of scale x input + bias
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(product_rounded.data()),
product_rounded.data(),
output_block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
FLAGS_caffe2_fbgemm_fake_fp16_clamp_denorms);
// Accumulate w x (scale x input + bias) to output
TypedAxpy<float, float>(
output_block_size,
weight,
reinterpret_cast<const float*>(product_rounded.data()),
out);
} else {
std::vector<float> product(output_block_size);
std::vector<float> scalev(output_block_size, scale);
std::vector<float> mBias(output_block_size, bias);
std::vector<float> mWeight(output_block_size, weight);
fake_fp16::fma_fp16(
output_block_size,
mBias.data(),
mWeight.data(),
rowTempSums[accIdx].data());
fake_fp16::fma_fp16(
output_block_size,
scalev.data(),
input_rounded.data(),
rowTempSums[accIdx].data());
}
++current;
}
if (!use_fp16_for_embedding_only) {
for (const auto j : c10::irange(output_block_size)) {
out[j] = rowTempSums[0][j] + rowTempSums[1][j];
}
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(out),
out,
output_block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
out += output_block_size;
}
return current == index_size;
}
enum {
DATA = 0,
WEIGHTS = 1,
INDICES = 1 + with_weights,
LENGTHS = 2 + with_weights,
};
};
} // namespace caffe2
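The accIdx logic above alternates accumulation between two partial sums and combines them once per output row. Below is a minimal sketch of that double-buffering pattern with plain floats; the removed op accumulates with fma_fp16, where shortening each dependent chain of fp16 roundings is the point of the trick:

#include <cstdio>
#include <vector>

int main() {
  const std::vector<std::vector<float>> rows = {
      {1.0f, 2.0f}, {3.0f, 4.0f}, {5.0f, 6.0f}};
  const int block_size = 2;
  // Two partial sums; rows alternate between them.
  std::vector<float> acc[2] = {std::vector<float>(block_size, 0.0f),
                               std::vector<float>(block_size, 0.0f)};
  for (size_t i = 0; i < rows.size(); ++i) {
    const int accIdx = i % 2;
    for (int j = 0; j < block_size; ++j) {
      acc[accIdx][j] += rows[i][j];
    }
  }
  for (int j = 0; j < block_size; ++j) {
    std::printf("%g ", acc[0][j] + acc[1][j]); // prints 9 12
  }
  return 0;
}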

View File

@ -1,722 +0,0 @@
#include "lengths_reducer_fused_8bit_rowwise_fp16_fake_op.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(
SparseLengthsSumFused8BitRowwiseFakeFP16,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext>);
OPERATOR_SCHEMA(SparseLengthsSumFused8BitRowwiseFakeFP16)
.NumInputs(3)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext>::DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext>::INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext>::LENGTHS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsSum, but operating on
8-bit rowwise quantized matrices with fused storage (where each row
stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Output(0, "output", "output")
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFused8BitRowwiseFakeFP16);
REGISTER_CPU_OPERATOR(
SparseLengthsSumFused8BitRowwiseFakeFP16EmbeddingOnly,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/false,
/*is_mean=*/false,
/*use_acc_fp16=*/false,
/*use_inv_scale=*/false,
/*use_nnpi_fma=*/false,
/*use_fp16_for_embedding_only=*/true>);
OPERATOR_SCHEMA(SparseLengthsSumFused8BitRowwiseFakeFP16EmbeddingOnly)
.NumInputs(3)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
false,
false,
false,
true>::DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
false,
false,
false,
true>::INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
false,
false,
false,
true>::LENGTHS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsSum, but operating on
8-bit rowwise quantized matrices with fused storage (where each row
stores quantized values, and then 4-byte scale and 4-byte bias).
Convert only embedding entries using fake fp16.
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Output(0, "output", "output")
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFused8BitRowwiseFakeFP16EmbeddingOnly);
REGISTER_CPU_OPERATOR(
SparseLengthsSumFused8BitRowwiseFakeFP16NNPI,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/false,
/*is_mean=*/false,
/*use_acc_fp16=*/true,
/*use_inv_scale=*/false,
/*use_nnpi_fma=*/true>);
OPERATOR_SCHEMA(SparseLengthsSumFused8BitRowwiseFakeFP16NNPI)
.NumInputs(3)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
true>::DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
true>::INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
true>::LENGTHS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsSum, but operating on
8-bit rowwise quantized matrices with fused storage (where each row
stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Output(0, "output", "output")
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFused8BitRowwiseFakeFP16NNPI);
REGISTER_CPU_OPERATOR(
SparseLengthsSumFused8BitRowwiseFakeFP32NNPI,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/false,
/*is_mean=*/false,
/*use_acc_fp16=*/false,
/*use_inv_scale=*/false,
/*use_nnpi_fp16_fma=*/false,
/*use_fp16_for_embedding_only*/ false,
/*use_acc_fp32*/ true>);
OPERATOR_SCHEMA(SparseLengthsSumFused8BitRowwiseFakeFP32NNPI)
.NumInputs(3)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
false,
true>::DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
false,
true>::INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
false,
true>::LENGTHS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsSum, but operating on
8-bit rowwise quantized matrices with fused storage (where each row
stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Output(0, "output", "output")
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFused8BitRowwiseFakeFP32NNPI);
REGISTER_CPU_OPERATOR(
SparseLengthsSumFused8BitRowwiseFakeFP16AccFP16,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/false,
/*is_mean=*/false,
/*use_acc_fp16=*/true>);
OPERATOR_SCHEMA(SparseLengthsSumFused8BitRowwiseFakeFP16AccFP16)
.NumInputs(3)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
true>::DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
true>::INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
true>::LENGTHS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsSum, but operating on
8-bit rowwise quantized matrices with fused storage (where each row
stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Output(0, "output", "output")
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFused8BitRowwiseFakeFP16AccFP16);
REGISTER_CPU_OPERATOR(
SparseLengthsSumFused8BitRowwiseFakeFP16AccInvScaleFP16,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights*/ false,
/*is_mean*/ 0,
/*use_acc_fp16*/ true,
/*use_inv_scale*/ true>);
OPERATOR_SCHEMA(SparseLengthsSumFused8BitRowwiseFakeFP16AccInvScaleFP16)
.NumInputs(3)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
true>::DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
true>::INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
false,
false,
true>::LENGTHS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsSum, but operating on
8-bit rowwise quantized matrices with fused storage (where each row
stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Output(0, "output", "output")
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFused8BitRowwiseFakeFP16AccInvScaleFP16);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFused8BitRowwiseFakeFP16,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, /*with_weights=*/true>);
OPERATOR_SCHEMA(SparseLengthsWeightedSumFused8BitRowwiseFakeFP16)
.NumInputs(4)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true>::DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true>::INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true>::LENGTHS,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true>::WEIGHTS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsWeightedSum,
but operating on 8-bit rowwise quantized matrices with fused storage
(where each row stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Input(
3,
"WEIGHTS",
"Vector of weights to scale rows of DATA with before reduction")
.Output(0, "output", "output");
NO_GRADIENT(SparseLengthsWeightedSumFused8BitRowwiseFakeFP16);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFused8BitRowwiseFakeFP16EmbeddingOnly,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/true,
/*is_mean=*/false,
/*use_acc_fp16=*/false,
/*use_inv_scale=*/false,
/*use_nnpi_fma=*/false,
/*use_fp16_for_embedding_only=*/true>);
OPERATOR_SCHEMA(SparseLengthsWeightedSumFused8BitRowwiseFakeFP16EmbeddingOnly)
.NumInputs(4)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
true>::DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
true>::INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
true>::LENGTHS,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
true>::WEIGHTS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsWeightedSum,
but operating on 8-bit rowwise quantized matrices with fused storage
(where each row stores quantized values, and then 4-byte scale and 4-byte bias).
Convert only embedding entries using fake fp16.
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Input(
3,
"WEIGHTS",
"Vector of weights to scale rows of DATA with before reduction")
.Output(0, "output", "output");
NO_GRADIENT(SparseLengthsWeightedSumFused8BitRowwiseFakeFP16EmbeddingOnly);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFused8BitRowwiseFakeFP16AccFP16,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/true,
/*is_mean=*/false,
/*use_acc_fp16=*/true>);
OPERATOR_SCHEMA(SparseLengthsWeightedSumFused8BitRowwiseFakeFP16AccFP16)
.NumInputs(4)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
LENGTHS,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
WEIGHTS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsWeightedSum,
but operating on 8-bit rowwise quantized matrices with fused storage
(where each row stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Input(
3,
"WEIGHTS",
"Vector of weights to scale rows of DATA with before reduction")
.Output(0, "output", "output");
NO_GRADIENT(SparseLengthsWeightedSumFused8BitRowwiseFakeFP16AccFP16);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/true,
/*is_mean=*/false,
/*use_acc_fp16=*/true,
/*use_inv_scale=*/false,
/*use_fma=*/true>);
OPERATOR_SCHEMA(SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI)
.NumInputs(4)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
LENGTHS,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
WEIGHTS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsWeightedSum,
but operating on 8-bit rowwise quantized matrices with fused storage
(where each row stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Input(
3,
"WEIGHTS",
"Vector of weights to scale rows of DATA with before reduction")
.Output(0, "output", "output");
NO_GRADIENT(SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/true,
/*is_mean=*/false,
/*use_acc_fp16=*/false,
/*use_inv_scale=*/false,
/*use_nnpi_fp16_fma=*/false,
/*use_fp16_for_embedding_only*/ false,
/*use_acc_fp32*/ true>);
OPERATOR_SCHEMA(SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI)
.NumInputs(4)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
false,
true>::DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
false,
true>::INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
false,
true>::LENGTHS,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
true,
false,
false,
false,
false,
false,
true>::WEIGHTS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsWeightedSum,
but operating on 8-bit rowwise quantized matrices with fused storage
(where each row stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Input(
3,
"WEIGHTS",
"Vector of weights to scale rows of DATA with before reduction")
.Output(0, "output", "output");
NO_GRADIENT(SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFused8BitRowwiseFakeFP16AccInvScaleFP16,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/true,
/*is_mean=*/false,
/*use_acc_fp16=*/true,
/*use_inv_scale=*/true>);
OPERATOR_SCHEMA(SparseLengthsWeightedSumFused8BitRowwiseFakeFP16AccInvScaleFP16)
.NumInputs(4)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
LENGTHS,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, true, false, true>::
WEIGHTS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsWeightedSum,
but operating on 8-bit rowwise quantized matrices with fused storage
(where each row stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Input(
3,
"WEIGHTS",
"Vector of weights to scale rows of DATA with before reduction")
.Output(0, "output", "output");
NO_GRADIENT(SparseLengthsWeightedSumFused8BitRowwiseFakeFP16AccInvScaleFP16);
REGISTER_CPU_OPERATOR(
SparseLengthsMeanFused8BitRowwiseFakeFP16,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/false,
/*is_mean=*/true>);
OPERATOR_SCHEMA(SparseLengthsMeanFused8BitRowwiseFakeFP16)
.NumInputs(3)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, false, true>::DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, false, true>::
INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, false, true>::
LENGTHS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsMean, but
operating on 8-bit rowwise quantized matrices with fused storage
(where each row stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Output(0, "output", "output");
NO_GRADIENT(SparseLengthsMeanFused8BitRowwiseFakeFP16);
REGISTER_CPU_OPERATOR(
SparseLengthsMeanFused8BitRowwiseFakeFP16AccFP16,
SparseLengthsFused8BitRowwiseFakeFP16Op<
CPUContext,
/*with_weights=*/false,
/*is_mean=*/true,
/*use_acc_fp16=*/true>);
OPERATOR_SCHEMA(SparseLengthsMeanFused8BitRowwiseFakeFP16AccFP16)
.NumInputs(3)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, false, true, true>::
DATA,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, false, true, true>::
INDICES,
SparseLengthsFused8BitRowwiseFakeFP16Op<CPUContext, false, true, true>::
LENGTHS)
.SetDoc(R"DOC(
Performs the same operation as SparseLengthsMean, but
operating on 8-bit rowwise quantized matrices with fused storage
(where each row stores quantized values, and then 4-byte scale and 4-byte bias).
)DOC")
.Input(
0,
"DATA",
"uint8 tensor obtained with "
"operator FloatToFused8BitRowwiseQuantized")
.Input(
1,
"INDICES",
"Integer vector containing indices of the first "
"dimension of DATA for the slices that are being aggregated")
.Input(
2,
"LENGTHS",
"Vector with the same sum of elements as the first dimension of DATA")
.Output(0, "output", "output");
NO_GRADIENT(SparseLengthsMeanFused8BitRowwiseFakeFP16AccFP16);
} // namespace caffe2
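The schemas above describe the fused 8-bit rowwise format: each row stores its uint8 quantized values followed by a 4-byte float scale and a 4-byte float bias, and dequantization is x = q * scale + bias. Below is a self-contained sketch of reading one such row; the helper name is illustrative, not part of the removed code:

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Dequantize a fused 8-bit row: block_size uint8 values, then scale, then bias.
std::vector<float> dequantize_8bit_row(const uint8_t* row, int block_size) {
  float scale = 0.0f, bias = 0.0f;
  std::memcpy(&scale, row + block_size, sizeof(float));
  std::memcpy(&bias, row + block_size + sizeof(float), sizeof(float));
  std::vector<float> out(block_size);
  for (int j = 0; j < block_size; ++j) {
    out[j] = row[j] * scale + bias;
  }
  return out;
}

int main() {
  // Two quantized values (0 and 255) with scale 1/255 and bias 0.
  uint8_t row[2 + 2 * sizeof(float)] = {0, 255};
  const float scale = 1.0f / 255.0f, bias = 0.0f;
  std::memcpy(row + 2, &scale, sizeof(float));
  std::memcpy(row + 2 + sizeof(float), &bias, sizeof(float));
  for (float v : dequantize_8bit_row(row, 2)) {
    std::printf("%g ", v); // prints 0 1
  }
  return 0;
}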

View File

@ -1,312 +0,0 @@
#pragma once
#include "caffe2/perfkernels/fused_8bit_rowwise_embedding_lookup.h"
#include "fp16_fma.h"
#include "lengths_reducer_ops.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp_denorms);
namespace caffe2 {
template <
class Context,
bool with_weights = 0,
bool is_mean = 0,
bool use_acc_fp16 = 0,
bool use_inv_scale = 0,
bool use_nnpi_fma = 0,
bool use_fp16_for_embedding_only = 0,
bool use_acc_fp32 = 0>
class SparseLengthsFused8BitRowwiseFakeFP16Op final : public Operator<Context> {
public:
static_assert(
!(with_weights && is_mean),
"Cannot have with_weights and is_mean a the same time");
USE_OPERATOR_CONTEXT_FUNCTIONS;
explicit SparseLengthsFused8BitRowwiseFakeFP16Op(
const OperatorDef& operator_def,
Workspace* ws)
: Operator<Context>(operator_def, ws) {}
~SparseLengthsFused8BitRowwiseFakeFP16Op() noexcept override {}
bool RunOnDevice() override {
return DispatchHelper<TensorTypes<int32_t, int64_t>>::call(
this, Input(INDICES));
}
template <typename IndexType>
bool DoRunWithType() {
const auto& data = Input(DATA);
const auto& indices = Input(INDICES);
const auto& lengths = Input(LENGTHS);
CAFFE_ENFORCE_EQ(indices.dim(), 1, "INDICES must be a vector");
CAFFE_ENFORCE_EQ(lengths.dim(), 1, "LENGTHS must be a vector");
const float* weights = nullptr;
if (with_weights) {
const auto& weights_input = Input(WEIGHTS);
CAFFE_ENFORCE_EQ(weights_input.dim(), 1, "WEIGHTS must be a vector");
CAFFE_ENFORCE_EQ(
weights_input.numel(),
indices.numel(),
"WEIGHTS should have the same length as INDICES.");
weights = weights_input.template data<float>();
}
CAFFE_ENFORCE_GT(data.size(1), 8, "DATA must have more than 8 columns");
// Subtract 8 from the #columns of data for the 4 bytes for scale and 4
// bytes for bias that we use in the fused representation (per row).
const std::vector<int64_t> shape = {lengths.size(0), data.size(1) - 8};
auto* output = Output(0, shape, at::dtype<float>());
// Copied from Fused8BitRowwiseEmbeddingLookupGenericSlow in
// fused_8bit_rowwise_embedding_lookup.cc
int64_t block_size = output->size(1);
int64_t output_size = output->size(0);
int64_t index_size = indices.numel();
int64_t data_size = data.size(0);
const uint8_t* input = data.template data<uint8_t>();
const IndexType* indices_data = indices.template data<IndexType>();
const int* lengths_data = lengths.template data<int>();
bool normalize_by_length = is_mean;
float* out = output->template mutable_data<float>();
std::vector<float> rowTempSums[2];
rowTempSums[0].resize(block_size);
rowTempSums[1].resize(block_size);
// block_size is the number of elements and fused_block_size is the size of
// an entire row, including scale and bias.
const auto scale_bias_offset = 8 / sizeof(uint8_t);
const int64_t fused_block_size = block_size + scale_bias_offset;
int64_t current = 0;
for (const auto m : c10::irange(output_size)) {
memset(out, 0, sizeof(float) * block_size);
memset(rowTempSums[0].data(), 0, sizeof(float) * block_size);
memset(rowTempSums[1].data(), 0, sizeof(float) * block_size);
if (current + lengths_data[m] > index_size) {
return false;
}
for (int i = 0; i < lengths_data[m]; ++i) {
int64_t idx = indices_data[current];
int accIdx = 0;
// Only do double buffer accumulation when block size is even
if (use_nnpi_fma && block_size % 2 == 0 && block_size <= 96) {
accIdx = i % 2;
}
if (idx < 0 || idx >= data_size) {
return false;
}
const float* scale_bias = reinterpret_cast<const float*>(
input + fused_block_size * indices_data[current] + block_size);
float weight = 1.0f;
if (weights) {
weight = weights[current];
if (!use_fp16_for_embedding_only && !use_acc_fp32) {
// Fake fp16 rounding of weight
fbgemm::RoundToFloat16(
&weight, &weight, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
}
float scale = scale_bias[0];
float bias = scale_bias[1];
// The vendor might store the scale as s' = 1 / s, which implies b' = b / s.
// We compute x = x_q * s + b, while the vendor computes x = (x_q + b') / s'.
// Equating the two expressions yields the relations above.
if (use_inv_scale) {
constexpr float kEpsilon = 1e-8;
if (fabs(scale) < kEpsilon) {
if (scale < 0) {
scale = -kEpsilon;
} else {
scale = kEpsilon;
}
}
scale = 1.0 / (1.0 / scale);
bias = (bias / scale) * scale;
}
if (!use_fp16_for_embedding_only && !use_acc_fp32) {
// Fake fp16 rounding of scale and bias
fbgemm::RoundToFloat16(
&scale, &scale, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
&bias, &bias, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
scale *= weight;
// Fake fp16 rounding of scale and bias
fbgemm::RoundToFloat16(
&scale, &scale, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
// No fake fp16 rounding of the input is needed; the values are already integers.
std::vector<float> input_rounded(block_size);
for (const auto j : c10::irange(block_size)) {
input_rounded[j] =
input[fused_block_size * indices_data[current] + j];
}
if (use_fp16_for_embedding_only) {
// bias *= weight;
std::vector<float> product_rounded(block_size);
TypedAxpy<float, float>(
block_size, scale, input_rounded.data(), product_rounded.data());
for (const auto j : c10::irange(block_size)) {
product_rounded[j] += bias;
}
// Fake fp16 rounding of scale x input + bias
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(product_rounded.data()),
product_rounded.data(),
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
FLAGS_caffe2_fbgemm_fake_fp16_clamp_denorms);
// Accumulate w x (scale x input + bias) to output
TypedAxpy<float, float>(
block_size,
weight,
reinterpret_cast<const float*>(product_rounded.data()),
out);
} else if (use_nnpi_fma) {
std::vector<float> mScale(block_size, scale);
std::vector<float> mBias(block_size, bias);
std::vector<float> mWeight(block_size, weight);
fake_fp16::fma_fp16(
block_size,
mBias.data(),
mWeight.data(),
rowTempSums[accIdx].data());
fake_fp16::fma_fp16(
block_size,
mScale.data(),
input_rounded.data(),
rowTempSums[accIdx].data());
} else if (use_acc_fp16) {
bias *= weight;
fbgemm::RoundToFloat16(
&bias, &bias, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
std::vector<float> product_rounded(block_size);
TypedAxpy<float, float>(
block_size, scale, input_rounded.data(), product_rounded.data());
// Fake fp16 rounding of w x scale x input
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(product_rounded.data()),
product_rounded.data(),
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
for (const auto j : c10::irange(block_size)) {
product_rounded[j] += bias;
}
// Fake fp16 rounding of w x scale x input + w x bias
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(product_rounded.data()),
product_rounded.data(),
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
// Accumulate w x scale x input + w x bias to output
TypedAxpy<float, float>(
block_size,
1.0,
reinterpret_cast<const float*>(product_rounded.data()),
out);
// Fake fp16 rounding of out + (w x scale x input + w x bias)
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(out),
out,
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
} else if (use_acc_fp32) {
for (const auto j : c10::irange(block_size)) {
float deqVal = fake_fp16::fmafp32_avx_emulation(
scale,
input_rounded[j],
bias);
rowTempSums[accIdx][j] = fake_fp16::fmafp32_avx_emulation(
deqVal,
weight,
rowTempSums[accIdx][j]);
}
} else {
bias *= weight;
fbgemm::RoundToFloat16(
&bias, &bias, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
TypedAxpy<float, float>(block_size, scale, input_rounded.data(), out);
for (const auto j : c10::irange(block_size)) {
out[j] += bias;
}
}
++current;
}
if (use_nnpi_fma || use_acc_fp32) {
for (const auto j : c10::irange(block_size)) {
out[j] = rowTempSums[0][j] + rowTempSums[1][j];
}
}
if (use_nnpi_fma) {
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(out),
out,
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
if (normalize_by_length && lengths_data[m]) {
float scale = 1.f / lengths_data[m];
if (!use_fp16_for_embedding_only) {
// Fake fp16 rounding of scale and out
fbgemm::RoundToFloat16(
&scale, &scale, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(out),
out,
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
// hack: context is not really used
math::Scale<float, float, CPUContext>(
block_size, scale, out, out, nullptr);
}
out += block_size;
}
return current == index_size;
}
enum {
DATA = 0,
WEIGHTS = 1,
INDICES = 1 + with_weights,
LENGTHS = 2 + with_weights,
};
};
} // namespace caffe2
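The inverse-scale comment in the header above can be checked numerically: if a vendor stores s' = 1/s and b' = b/s, then (x_q + b') / s' recovers the same value as x_q * s + b (up to rounding). A small stand-alone check:

#include <cstdio>

int main() {
  const float s = 0.25f, b = -3.0f;
  const float s_prime = 1.0f / s; // vendor scale
  const float b_prime = b / s;    // vendor bias
  for (int x_q = 0; x_q <= 4; ++x_q) {
    const float ours = x_q * s + b;
    const float vendor = (x_q + b_prime) / s_prime;
    std::printf("x_q=%d ours=%g vendor=%g\n", x_q, ours, vendor);
  }
  return 0;
}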

View File

@ -1,217 +0,0 @@
#include "lengths_reducer_ops.h"
#include "caffe2/operators/segment_reduction_op.h"
namespace caffe2 {
// Use the _STR option because the schema is also declared with the _STR
// version in a generic fashion; otherwise the schema declaration check would break.
// TODO(dzhulgakov): remove _STR when all lengths ops are off the generic version.
using SparseLengthsSumOp =
SparseLengthsReductionFakeFp16Op<TensorTypes<float, at::Half>, 0, 0>;
using SparseLengthsWeightedSumOp =
SparseLengthsReductionFakeFp16Op<TensorTypes<float, at::Half>, 1, 0>;
using SparseLengthsMeanOp =
SparseLengthsReductionFakeFp16Op<TensorTypes<float, at::Half>, 0, 1>;
using SparseLengthsSumAccFP16Op =
SparseLengthsReductionFakeFp16Op<TensorTypes<float, at::Half>, 0, 0, 0, 1>;
using SparseLengthsWeightedSumAccFP16Op =
SparseLengthsReductionFakeFp16Op<TensorTypes<float, at::Half>, 1, 0, 0, 1>;
using SparseLengthsMeanAccFP16Op =
SparseLengthsReductionFakeFp16Op<TensorTypes<float, at::Half>, 0, 1, 0, 1>;
using SparseLengthsSumFakeFP16EmbeddingOnlyOp =
SparseLengthsReductionFakeFp16Op<
TensorTypes<float, at::Half>,
0,
0,
0,
0,
1>;
using SparseLengthsWeightedSumFakeFP16EmbeddingOnlyOp =
SparseLengthsReductionFakeFp16Op<
TensorTypes<float, at::Half>,
1,
0,
0,
0,
1>;
using SparseLengthsMeanFakeFP16EmbeddingOnlyOp =
SparseLengthsReductionFakeFp16Op<
TensorTypes<float, at::Half>,
0,
1,
0,
0,
1>;
REGISTER_CPU_OPERATOR(SparseLengthsSumFakeFP16, SparseLengthsSumOp);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFakeFP16,
SparseLengthsWeightedSumOp);
REGISTER_CPU_OPERATOR(SparseLengthsMeanFakeFP16, SparseLengthsMeanOp);
REGISTER_CPU_OPERATOR(
SparseLengthsSumFakeFP16AccFP16,
SparseLengthsSumAccFP16Op);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFakeFP16AccFP16,
SparseLengthsWeightedSumAccFP16Op);
REGISTER_CPU_OPERATOR(
SparseLengthsMeanFakeFP16AccFP16,
SparseLengthsMeanAccFP16Op);
REGISTER_CPU_OPERATOR(
SparseLengthsSumFakeFP16EmbeddingOnly,
SparseLengthsSumFakeFP16EmbeddingOnlyOp);
REGISTER_CPU_OPERATOR(
SparseLengthsWeightedSumFakeFP16EmbeddingOnly,
SparseLengthsWeightedSumFakeFP16EmbeddingOnlyOp);
REGISTER_CPU_OPERATOR(
SparseLengthsMeanFakeFP16EmbeddingOnly,
SparseLengthsMeanFakeFP16EmbeddingOnlyOp);
template <typename Def>
string FormatDoc() {
string doc = Def::doc;
c10::ReplaceAll(doc, "{op}", Def::OpDef::name);
c10::ReplaceAll(doc, "{op_doc}", Def::OpDef::doc);
auto replaced = c10::ReplaceAll(doc, "{extra}", "");
CAFFE_ENFORCE_EQ(replaced, 0);
return doc;
}
using SparseLengthsSumDef = AbstractSparseLengthsDef<
float,
int,
CPUContext,
SumReducerDef,
true /*GradientNeedIndices*/>;
OPERATOR_SCHEMA(SparseLengthsSumFakeFP16)
.NumInputs(SparseLengthsSumDef::ForwardOp::kNumInputs)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsSumOp::DATA,
SparseLengthsSumOp::INDICES,
SparseLengthsSumOp::LENGTHS)
.SetDoc(FormatDoc<SparseLengthsSumDef>())
.Output(0, "OUTPUT", "Aggregated tensor")
.FillUsing(SparseLengthsSumDef::PopulateSchema)
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFakeFP16);
using SparseLengthsWeightedSumDef = AbstractSparseLengthsDef<
float,
int,
CPUContext,
WeightedSumReducerDef,
true /*GradientNeedIndices*/>;
OPERATOR_SCHEMA(SparseLengthsWeightedSumFakeFP16)
.NumInputs(SparseLengthsWeightedSumDef::ForwardOp::kNumInputs)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsWeightedSumOp::DATA,
SparseLengthsWeightedSumOp::INDICES,
SparseLengthsWeightedSumOp::LENGTHS,
SparseLengthsWeightedSumOp::WEIGHT)
.SetDoc(FormatDoc<SparseLengthsWeightedSumDef>())
.Output(0, "OUTPUT", "Aggregated tensor")
.FillUsing(SparseLengthsWeightedSumDef::PopulateSchema)
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsWeightedSumFakeFP16);
using SparseLengthsMeanDef = AbstractSparseLengthsDef<
float,
int,
CPUContext,
MeanReducerDef,
true /*GradientNeedIndices*/>;
OPERATOR_SCHEMA(SparseLengthsMeanFakeFP16)
.NumInputs(SparseLengthsMeanDef::ForwardOp::kNumInputs)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsMeanOp::DATA,
SparseLengthsMeanOp::INDICES,
SparseLengthsMeanOp::LENGTHS)
.SetDoc(FormatDoc<SparseLengthsMeanDef>())
.Output(0, "OUTPUT", "Aggregated tensor")
.FillUsing(SparseLengthsMeanDef::PopulateSchema);
NO_GRADIENT(SparseLengthsMeanFakeFP16);
OPERATOR_SCHEMA(SparseLengthsSumFakeFP16AccFP16)
.NumInputs(SparseLengthsSumDef::ForwardOp::kNumInputs)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsSumOp::DATA,
SparseLengthsSumOp::INDICES,
SparseLengthsSumOp::LENGTHS)
.SetDoc(FormatDoc<SparseLengthsSumDef>())
.Output(0, "OUTPUT", "Aggregated tensor")
.FillUsing(SparseLengthsSumDef::PopulateSchema)
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFakeFP16AccFP16);
OPERATOR_SCHEMA(SparseLengthsWeightedSumFakeFP16AccFP16)
.NumInputs(SparseLengthsWeightedSumDef::ForwardOp::kNumInputs)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsWeightedSumOp::DATA,
SparseLengthsWeightedSumOp::INDICES,
SparseLengthsWeightedSumOp::LENGTHS,
SparseLengthsWeightedSumOp::WEIGHT)
.SetDoc(FormatDoc<SparseLengthsWeightedSumDef>())
.Output(0, "OUTPUT", "Aggregated tensor")
.FillUsing(SparseLengthsWeightedSumDef::PopulateSchema)
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsWeightedSumFakeFP16AccFP16);
OPERATOR_SCHEMA(SparseLengthsMeanFakeFP16AccFP16)
.NumInputs(SparseLengthsMeanDef::ForwardOp::kNumInputs)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsMeanOp::DATA,
SparseLengthsMeanOp::INDICES,
SparseLengthsMeanOp::LENGTHS)
.SetDoc(FormatDoc<SparseLengthsMeanDef>())
.Output(0, "OUTPUT", "Aggregated tensor")
.FillUsing(SparseLengthsMeanDef::PopulateSchema);
NO_GRADIENT(SparseLengthsMeanFakeFP16AccFP16);
OPERATOR_SCHEMA(SparseLengthsSumFakeFP16EmbeddingOnly)
.NumInputs(SparseLengthsSumDef::ForwardOp::kNumInputs)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsSumFakeFP16EmbeddingOnlyOp::DATA,
SparseLengthsSumFakeFP16EmbeddingOnlyOp::INDICES,
SparseLengthsSumFakeFP16EmbeddingOnlyOp::LENGTHS)
.SetDoc(FormatDoc<SparseLengthsSumDef>())
.Output(0, "OUTPUT", "Aggregated tensor")
.FillUsing(SparseLengthsSumDef::PopulateSchema)
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsSumFakeFP16EmbeddingOnly);
OPERATOR_SCHEMA(SparseLengthsWeightedSumFakeFP16EmbeddingOnly)
.NumInputs(SparseLengthsWeightedSumDef::ForwardOp::kNumInputs)
.NumOutputs(1)
.WeightedValueKeyLengthInputFillers(
SparseLengthsWeightedSumFakeFP16EmbeddingOnlyOp::DATA,
SparseLengthsWeightedSumFakeFP16EmbeddingOnlyOp::INDICES,
SparseLengthsWeightedSumFakeFP16EmbeddingOnlyOp::LENGTHS,
SparseLengthsWeightedSumFakeFP16EmbeddingOnlyOp::WEIGHT)
.SetDoc(FormatDoc<SparseLengthsWeightedSumDef>())
.Output(0, "OUTPUT", "Aggregated tensor")
.FillUsing(SparseLengthsWeightedSumDef::PopulateSchema)
.InheritOnnxSchema();
NO_GRADIENT(SparseLengthsWeightedSumFakeFP16EmbeddingOnly);
OPERATOR_SCHEMA(SparseLengthsMeanFakeFP16EmbeddingOnly)
.NumInputs(SparseLengthsMeanDef::ForwardOp::kNumInputs)
.NumOutputs(1)
.ValueKeyLengthInputFillers(
SparseLengthsMeanFakeFP16EmbeddingOnlyOp::DATA,
SparseLengthsMeanFakeFP16EmbeddingOnlyOp::INDICES,
SparseLengthsMeanFakeFP16EmbeddingOnlyOp::LENGTHS)
.SetDoc(FormatDoc<SparseLengthsMeanDef>())
.Output(0, "OUTPUT", "Aggregated tensor")
.FillUsing(SparseLengthsMeanDef::PopulateSchema);
NO_GRADIENT(SparseLengthsMeanFakeFP16EmbeddingOnly);
} // namespace caffe2

View File

@ -1,268 +0,0 @@
#pragma once
#include <fbgemm/FbgemmConvert.h>
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/perfkernels/typed_axpy.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp_denorms);
namespace caffe2 {
// A templated class that implements SparseLengths[Sum,WeightedSum,Mean].
template <
class InputTypes, // supported input types, such as TensorTypes<float>
bool USE_WEIGHT = 0, // Whether it is SparseLengthsWeightedSum
bool USE_MEAN = 0, // Whether this is SparseLengthsMean
bool USE_POSITIONAL_WEIGHT = 0,
bool USE_ACC_FP16 = 0, // Whether use fp16 accumulation
bool USE_FP16_FOR_EMBEDDING_ONLY =
0 // Whether use fp16 for embedding entries only
// USE_WEIGHT = 1 and USE_POSITIONAL_WEIGHT = 1
// -> SparseLengthsPositionalWeightedSum
>
class SparseLengthsReductionFakeFp16Op final : public Operator<CPUContext> {
public:
USE_OPERATOR_FUNCTIONS(CPUContext);
template <class... Args>
explicit SparseLengthsReductionFakeFp16Op(Args&&... args)
: Operator<CPUContext>(std::forward<Args>(args)...) {
static_assert(
!(USE_WEIGHT & USE_MEAN), "Cannot both specify weight and mean.");
}
~SparseLengthsReductionFakeFp16Op() noexcept override {}
// Currently, we support float and at::Half inputs for input data type, and
// int32_t and int64_t for the index type.
bool RunOnDevice() override {
return DispatchHelper<InputTypes>::call(this, Input(DATA));
}
template <typename InputType>
bool DoRunWithType() {
return DispatchHelper<TensorTypes2<int32_t, int64_t>, InputType>::call(
this, Input(INDICES));
}
template <typename InputType, typename IndexType>
bool DoRunWithType2() {
auto& dataInput = Input(DATA);
auto& indicesInput = Input(INDICES);
auto& lengthsInput = Input(LENGTHS);
CAFFE_ENFORCE_EQ(1, indicesInput.dim(), "INDICES must be a vector");
CAFFE_ENFORCE_EQ(1, lengthsInput.dim(), "LENGTHS must be a vector");
const int64_t N = dataInput.size(0);
const int D = dataInput.size_from_dim(1);
const int64_t M = lengthsInput.size(0);
const int64_t indices_size = indicesInput.numel();
auto shape = dataInput.sizes().vec();
shape[0] = M;
auto* output = Output(0, shape, at::dtype<float>());
float* out_data = output->template mutable_data<float>();
const InputType* in_data = dataInput.template data<InputType>();
const IndexType* indices = indicesInput.template data<IndexType>();
const int* lengths = lengthsInput.template data<int>();
const float* in_weight = nullptr;
if (USE_WEIGHT) {
// static if
auto& weightInput = Input(WEIGHT);
CAFFE_ENFORCE_EQ(1, weightInput.dim(), "WEIGHT must be a vector");
if (!USE_POSITIONAL_WEIGHT) {
CAFFE_ENFORCE_EQ(
weightInput.numel(),
indices_size,
"Weight should have the same length as indices.");
}
in_weight = weightInput.template data<float>();
}
// Copied from EmbeddingLookupGenericSlow in perfkernels/embedding_lookup.cc
int64_t block_size = D;
int64_t output_size = M;
int64_t index_size = indices_size;
int64_t data_size = N;
const InputType* input = in_data;
const float* weights = in_weight;
bool normalize_by_lengths = USE_MEAN;
float* out = out_data;
int64_t current = 0;
for (const auto m : c10::irange(output_size)) {
memset(out, 0, sizeof(float) * block_size);
if (current + lengths[m] > index_size) {
return false;
}
for (int i = 0; i < lengths[m]; ++i) {
int64_t idx = indices[current];
if (idx < 0 || idx >= data_size) {
return false;
}
float w = 1.f;
if (weights) {
w = weights[USE_POSITIONAL_WEIGHT ? i : current];
if (!USE_FP16_FOR_EMBEDDING_ONLY) {
// Fake fp16 rounding of w
fbgemm::RoundToFloat16(
&w, &w, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
}
if (USE_FP16_FOR_EMBEDDING_ONLY) {
std::vector<float> product_rounded(block_size);
if (std::is_same<InputType, at::Half>::value) {
TypedAxpy<InputType, float>(
block_size,
w,
input + block_size * indices[current],
product_rounded.data());
} else {
bool is_float = std::is_same<InputType, float>::value;
assert(is_float);
// Fake fp16 rounding of input
std::vector<float> input_rounded(block_size);
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(
input + block_size * indices[current]),
input_rounded.data(),
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp,
FLAGS_caffe2_fbgemm_fake_fp16_clamp_denorms);
TypedAxpy<float, float>(
block_size,
w,
reinterpret_cast<const float*>(input_rounded.data()),
product_rounded.data());
}
// Accumulate w x input to output
TypedAxpy<float, float>(
block_size,
1.0,
reinterpret_cast<const float*>(product_rounded.data()),
out);
} else if (USE_ACC_FP16) {
std::vector<float> product_rounded(block_size);
if (std::is_same<InputType, at::Half>::value) {
TypedAxpy<InputType, float>(
block_size,
w,
input + block_size * indices[current],
product_rounded.data());
} else {
bool is_float = std::is_same<InputType, float>::value;
assert(is_float);
// Fake fp16 rounding of input
std::vector<float> input_rounded(block_size);
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(
input + block_size * indices[current]),
input_rounded.data(),
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
TypedAxpy<float, float>(
block_size,
w,
reinterpret_cast<const float*>(input_rounded.data()),
product_rounded.data());
}
// Fake fp16 rounding of w x input
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(product_rounded.data()),
product_rounded.data(),
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
// Accumulate w x input to output
TypedAxpy<float, float>(
block_size,
1.0,
reinterpret_cast<const float*>(product_rounded.data()),
out);
// Fake fp16 rounding of out + w x input
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(out),
out,
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
} else {
if (std::is_same<InputType, at::Half>::value) {
TypedAxpy<InputType, float>(
block_size, w, input + block_size * indices[current], out);
} else {
bool is_float = std::is_same<InputType, float>::value;
assert(is_float);
// Fake fp16 rounding of input
std::vector<float> input_rounded(block_size);
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(
input + block_size * indices[current]),
input_rounded.data(),
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
TypedAxpy<float, float>(
block_size,
w,
reinterpret_cast<const float*>(input_rounded.data()),
out);
}
}
++current;
}
if (normalize_by_lengths && lengths[m]) {
float scale = 1.f / lengths[m];
if (!USE_FP16_FOR_EMBEDDING_ONLY) {
// Fake fp16 rounding of scale and out
fbgemm::RoundToFloat16(
&scale, &scale, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(out),
out,
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
// hack: context is not really used
math::Scale<float, float, CPUContext>(
block_size, scale, out, out, nullptr);
}
if (!USE_FP16_FOR_EMBEDDING_ONLY) {
// Fake fp16 rounding of out
fbgemm::RoundToFloat16(
reinterpret_cast<const float*>(out),
reinterpret_cast<float*>(out),
block_size,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
out += block_size;
}
return current == index_size;
}
enum {
DATA = 0, // Data input.
WEIGHT = 1, // Weight input used in SparseLengthsWeightedSum
INDICES = 1 + USE_WEIGHT, // 1 in SparseLengths[Sum,Mean] and
// 2 in SparseLengthsWeightedSum
LENGTHS = 2 + USE_WEIGHT, // 2 in SparseLengths[Sum, Mean],
// 3 in SparseLengthsWeightedSum
};
};
} // namespace caffe2
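The USE_ACC_FP16 path above rounds every intermediate (the weighted product and the running sum) back to fp16 before the next step. Below is a rough sketch of that accumulation pattern; fbgemm::RoundToFloat16 is stood in for by a crude 10-bit mantissa truncation, which ignores rounding mode, exponent range, and denormals:

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <vector>

// Crude fp16 emulation: keep the sign, exponent, and top 10 mantissa bits.
float fake_fp16(float v) {
  uint32_t bits;
  std::memcpy(&bits, &v, sizeof(bits));
  bits &= 0xFFFFE000u;
  std::memcpy(&v, &bits, sizeof(bits));
  return v;
}

int main() {
  const std::vector<float> x = {0.1f, 0.2f, 0.3f};
  const float w = 0.7f;
  float acc = 0.0f;
  for (float xi : x) {
    const float prod = fake_fp16(w * xi); // truncate the product
    acc = fake_fp16(acc + prod);          // truncate the running sum
  }
  std::printf("fp16-style sum = %g, fp32 sum = %g\n",
              acc, w * (0.1f + 0.2f + 0.3f));
  return 0;
}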

View File

@ -1,20 +0,0 @@
#include "caffe2/contrib/fakelowp/quant_lut_fp16_fake_op.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(TanhQuantFakeFp16NNPI, TanhInt8QuantizeNNPIOp);
OPERATOR_SCHEMA(TanhQuantFakeFp16NNPI)
.Arg("Y_scale", "Output tensor quantization scale")
.Arg("Y_zero_point", "Output tensor quantization offset")
.NumInputs(1)
.NumOutputs(1)
.SetDoc(R"DOC(
Apply TanH and convert the result to Int8.
<details>
</details>
)DOC")
.Input(0, "X", "Float Tensor X.")
.Output(0, "Y", "Int8 Tensor Y.");
} // namespace caffe2

View File

@ -1,91 +0,0 @@
#pragma once
#include <array>
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/tensor_int8.h"
#include "caffe2/operators/quantized/int8_utils.h"
#include <immintrin.h>
#include <emmintrin.h>
namespace caffe2 {
namespace {
class TanhInt8QuantizeNNPIOp final : public Operator<CPUContext> {
public:
using Operator<CPUContext>::Operator;
bool RunOnDevice() override {
const auto& X = Input(0);
auto* Y = Outputs()[0]->template GetMutable<int8::Int8TensorCPU>();
Y->t.ResizeLike(X);
int32_t Y_offset = this->template GetSingleArgument<int>("Y_zero_point", 0);
auto Y_scale = this->template GetSingleArgument<float>("Y_scale", 1);
Y->scale = Y_scale;
Y->zero_point = Y_offset;
constexpr int tanhLUTMinOffset = 0;
constexpr int tanhLUTMaxOffset = 18000;
constexpr int lutSize = tanhLUTMaxOffset - tanhLUTMinOffset;
std::array<uint8_t, lutSize> tanhLUT;
Y_scale = 1.0f / Y_scale;
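// Y_scale now holds the inverse scale, so each table entry below stores
// round(tanh(x) * (1 / scale) + zero_point).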
// Build the lookup table once, indexed by the raw fp16 bit pattern of the input.
for (const auto i : c10::irange(lutSize)) {
short input = i + tanhLUTMinOffset;
float x = _cvtsh_ss(input);
float tanh_x = tanh(x);
tanh_x = round(tanh_x * Y_scale + Y_offset);
if (tanh_x < 0 || tanh_x > 255.0) {
tanh_x = 255.0;
}
uint32_t tanh_quant = (uint32_t)(tanh_x);
tanhLUT[i] = (uint8_t)tanh_quant;
}
const float* X_data = X.template data<float>();
for (const auto i : c10::irange(X.numel())) {
short val = _cvtss_sh(X_data[i], 0);
unsigned short max16BitPositive = 0x7FFF;
unsigned short input16Bit = (*(unsigned short*)& val);
short shortAbsInput = input16Bit & max16BitPositive; // mask out the sign bit
short clampShortAbsInput = shortAbsInput;
if (shortAbsInput < (short)tanhLUTMinOffset) {
clampShortAbsInput = (short)tanhLUTMinOffset;
}
if (shortAbsInput > (short)(tanhLUTMaxOffset - 1)) {
clampShortAbsInput = (short)(tanhLUTMaxOffset - 1);
}
short inputInLutRange = clampShortAbsInput - tanhLUTMinOffset;
short temp = tanhLUT[inputInLutRange];
if (input16Bit > max16BitPositive) { // negative value
temp = temp - Y_offset;
temp = temp * (-1);
temp = temp + Y_offset;
}
uint8_t output = (uint8_t)temp;
if (temp < 0) {
output = 0;
}
Y->t.mutable_data<uint8_t>()[i] = output;
}
return true;
}
};
} // namespace
} // namespace caffe2
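The LUT op above only tabulates non-negative fp16 inputs and reflects negative inputs around the zero point using tanh(-x) = -tanh(x), i.e. q(-x) = 2 * zero_point - q(x). Below is a simplified sketch of that sign handling without the fp16 bit-pattern table; the numbers are illustrative:

#include <cmath>
#include <cstdint>
#include <cstdio>

// Quantize tanh(x) to uint8 with the same reflection trick as the LUT op.
uint8_t quantize_tanh(float x, float inv_scale, int32_t zero_point) {
  const float ax = std::fabs(x);
  int32_t q = static_cast<int32_t>(
      std::round(std::tanh(ax) * inv_scale + zero_point));
  if (x < 0) {
    q = zero_point - (q - zero_point); // reflect around the zero point
  }
  if (q < 0) q = 0;
  if (q > 255) q = 255;
  return static_cast<uint8_t>(q);
}

int main() {
  const float inv_scale = 127.0f; // Y_scale = 1/127
  const int32_t zero_point = 128;
  std::printf("%d %d %d\n",
              quantize_tanh(-1.0f, inv_scale, zero_point),
              quantize_tanh(0.0f, inv_scale, zero_point),
              quantize_tanh(1.0f, inv_scale, zero_point)); // prints 31 128 225
  return 0;
}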

View File

@ -1,15 +0,0 @@
#include "spatial_batch_norm_fp16_fake_op.h"
#include <array>
#include "caffe2/utils/eigen_utils.h"
namespace caffe2 {
REGISTER_CPU_OPERATOR(SpatialBNFakeLoweredFp16NNPI, SpatialBNFakeLoweredFp16Op);
OPERATOR_SCHEMA(SpatialBNFakeLoweredFp16NNPI).NumInputs({1, 5}).NumOutputs(1);
REGISTER_CPU_OPERATOR(SpatialBNFakeFp16NNPI, SpatialBNFakeFp16Op);
OPERATOR_SCHEMA(SpatialBNFakeFp16NNPI).NumInputs({1, 5}).NumOutputs(1);
} // namespace caffe2

View File

@ -1,395 +0,0 @@
#pragma once
#include <algorithm>
#include <array>
#include <functional>
#include <string>
#include <vector>
#include <fbgemm/FbgemmConvert.h>
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
#include "fp16_fma.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
namespace caffe2 {
class SpatialBNFakeLoweredFp16Op : public Operator<CPUContext> {
public:
USE_OPERATOR_FUNCTIONS(CPUContext);
template <class... Args>
explicit SpatialBNFakeLoweredFp16Op(Args&&... args)
: Operator<CPUContext>(std::forward<Args>(args)...),
OP_SINGLE_ARG(bool, OpSchema::Arg_IsTest, is_test_, false),
OP_SINGLE_ARG(double, "epsilon", epsilon_, 1e-5),
order_(StringToStorageOrder(
this->template GetSingleArgument<std::string>("order", "NCHW"))),
OP_SINGLE_ARG(int, "num_batches", num_batches_, 1) {
// TODO: only support NCHW for now
CAFFE_ENFORCE_EQ(order_, StorageOrder::NCHW);
CAFFE_ENFORCE(
(is_test_ && OutputSize() == 1) || (!is_test_ && OutputSize() == 5));
CAFFE_ENFORCE_GT(epsilon_, 0);
}
~SpatialBNFakeLoweredFp16Op() override = default;
bool RunOnDevice() override {
return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
}
template <typename T>
bool DoRunWithType() {
const auto& X = Input(INPUT);
const auto& scale = Input(SCALE);
const auto& bias = Input(BIAS);
const int ndim = X.dim();
CAFFE_ENFORCE_GE(ndim, 2);
const int N = X.dim32(0);
const int C =
(order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(ndim - 1));
const std::vector<int> X_dims(X.sizes().cbegin(), X.sizes().cend());
const int HxW =
std::accumulate(
X_dims.cbegin() + 1, X_dims.cend(), 1, std::multiplies<int>()) /
C;
CAFFE_ENFORCE_EQ(scale.numel(), C);
CAFFE_ENFORCE_EQ(bias.numel(), C);
auto* Y = Output(OUTPUT, X.sizes(), at::dtype<T>());
T* Y_data = Y->template mutable_data<T>();
ReinitializeTensor(
&alpha_, {C}, at::dtype<T>().device(CPUContext::GetDeviceType()));
T* alpha_data = alpha_.template mutable_data<T>();
// We only support this case at the moment
CAFFE_ENFORCE(is_test_);
std::vector<float> X_fp16(X.numel());
fbgemm::RoundToFloat16(
X.template data<T>(),
X_fp16.data(),
N * C * HxW,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
if (N == 0) {
return true;
}
const auto& mean = Input(EST_MEAN);
const auto& var = Input(EST_VAR);
CAFFE_ENFORCE_EQ(mean.numel(), C);
CAFFE_ENFORCE_EQ(var.numel(), C);
std::vector<float> mean_fp16(C), var_fp16(C);
std::vector<float> scale_fp16(C), bias_fp16(C);
fbgemm::RoundToFloat16(
scale.template data<T>(),
scale_fp16.data(),
C,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
bias.template data<T>(),
bias_fp16.data(),
C,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
mean.template data<T>(),
mean_fp16.data(),
C,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
var.template data<T>(),
var_fp16.data(),
C,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
EigenVectorArrayMap<T> alpha_arr(alpha_data, C);
std::vector<float> tmp(C);
EigenVectorArrayMap<T> tmp_arr(tmp.data(), C);
auto epsilon = static_cast<T>(epsilon_);
fbgemm::RoundToFloat16(
&epsilon, &epsilon, 1, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
tmp_arr = (ConstEigenVectorArrayMap<T>(var_fp16.data(), C) + epsilon);
fbgemm::RoundToFloat16(
tmp.data(), tmp.data(), C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
tmp_arr = tmp_arr.pow(0.5);
fbgemm::RoundToFloat16(
tmp.data(), tmp.data(), C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
alpha_arr = ConstEigenVectorArrayMap<T>(scale_fp16.data(), C) / tmp_arr;
fbgemm::RoundToFloat16(
alpha_data, alpha_data, C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
AffineChannel_NCHW(
N,
C,
HxW,
X_fp16.data(),
alpha_data,
bias_fp16.data(),
mean_fp16.data(),
Y_data);
fbgemm::RoundToFloat16(
Y_data, Y_data, N * HxW * C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
return true;
}
protected:
void AffineChannel_NCHW(
const int N,
const int C,
const int HxW,
const float* X,
const float* scale,
const float* bias,
const float* mean,
float* Y) {
ConstEigenVectorArrayMap<float> scale_arr(scale, C);
ConstEigenVectorArrayMap<float> bias_arr(bias, C);
ConstEigenVectorArrayMap<float> mean_arr(mean, C);
const int stride = C * HxW;
const float* X_ptr = X;
float* Y_ptr = Y;
for ([[maybe_unused]] const auto i : c10::irange(N)) {
EigenArrayMap<float>(Y_ptr, HxW, C) =
ConstEigenArrayMap<float>(X_ptr, HxW, C).rowwise() -
mean_arr.transpose();
fbgemm::RoundToFloat16(
Y_ptr, Y_ptr, HxW * C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
EigenArrayMap<float>(Y_ptr, HxW, C).rowwise() *= scale_arr.transpose();
fbgemm::RoundToFloat16(
Y_ptr, Y_ptr, HxW * C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
EigenArrayMap<float>(Y_ptr, HxW, C).rowwise() += bias_arr.transpose();
X_ptr += stride;
Y_ptr += stride;
}
fbgemm::RoundToFloat16(
Y, Y, N * HxW * C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
const bool is_test_;
double epsilon_;
const StorageOrder order_;
const int num_batches_;
Tensor alpha_;
INPUT_TAGS(
INPUT,
SCALE,
BIAS,
EST_MEAN,
EST_VAR,
BATCH_MEAN_SUM,
BATCH_VAR_SUM);
OUTPUT_TAGS(OUTPUT, RUNNING_MEAN, RUNNING_VAR, SAVED_MEAN, SAVED_INV_STD);
};
// Emulation of the NNPI SpatialBN kernel
class SpatialBNFakeFp16Op : public Operator<CPUContext> {
public:
USE_OPERATOR_FUNCTIONS(CPUContext);
template <class... Args>
explicit SpatialBNFakeFp16Op(Args&&... args)
: Operator<CPUContext>(std::forward<Args>(args)...),
OP_SINGLE_ARG(bool, OpSchema::Arg_IsTest, is_test_, false),
OP_SINGLE_ARG(float, "epsilon", epsilon_, 1e-5),
order_(StringToStorageOrder(
this->template GetSingleArgument<std::string>("order", "NCHW"))),
OP_SINGLE_ARG(int, "num_batches", num_batches_, 1) {
// TODO: only support NCHW for now
CAFFE_ENFORCE_EQ(order_, StorageOrder::NCHW);
// We only support this case at the moment
CAFFE_ENFORCE(is_test_);
CAFFE_ENFORCE_GT(epsilon_, 0);
}
~SpatialBNFakeFp16Op() override = default;
bool RunOnDevice() override {
return DispatchHelper<TensorTypes<float>>::call(this, Input(0));
}
template <typename T>
bool DoRunWithType() {
LOG(INFO) << "Running with " << sizeof(T);
const auto& X = Input(INPUT);
const auto& scale = Input(SCALE);
const auto& bias = Input(BIAS);
const int ndim = X.dim();
CAFFE_ENFORCE_GE(ndim, 2);
const int N = X.dim32(0);
const int C =
(order_ == StorageOrder::NCHW ? X.dim32(1) : X.dim32(ndim - 1));
const std::vector<int> X_dims(X.sizes().cbegin(), X.sizes().cend());
const int HxW =
std::accumulate(
X_dims.cbegin() + 1, X_dims.cend(), 1, std::multiplies<int>()) /
C;
CAFFE_ENFORCE_EQ(scale.numel(), C);
CAFFE_ENFORCE_EQ(bias.numel(), C);
auto* Y = Output(OUTPUT, X.sizes(), at::dtype<T>());
T* Y_data = Y->template mutable_data<T>();
ReinitializeTensor(
&alpha_, {C}, at::dtype<T>().device(CPUContext::GetDeviceType()));
ReinitializeTensor(
&beta_, {C}, at::dtype<T>().device(CPUContext::GetDeviceType()));
T* alpha_data = alpha_.template mutable_data<T>();
T* beta_data = beta_.template mutable_data<T>();
std::vector<float> X_fp16(X.numel());
fbgemm::RoundToFloat16(
X.template data<T>(),
X_fp16.data(),
N * C * HxW,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
const auto& mean = Input(EST_MEAN);
const auto& var = Input(EST_VAR);
CAFFE_ENFORCE_EQ(mean.numel(), C);
CAFFE_ENFORCE_EQ(var.numel(), C);
std::vector<float> mean_fp16(C), var_fp16(C);
std::vector<float> scale_fp16(C), bias_fp16(C);
fbgemm::RoundToFloat16(
scale.template data<T>(),
scale_fp16.data(),
C,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
bias.template data<T>(),
bias_fp16.data(),
C,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
mean.template data<T>(),
mean_fp16.data(),
C,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
var.template data<T>(),
var_fp16.data(),
C,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
// This part is run on the CPU/x86 core
ComputeFusedParam<T>(
C,
scale_fp16.data(),
bias_fp16.data(),
mean_fp16.data(),
var_fp16.data(),
alpha_data,
beta_data);
AffineChannel_NCHW(N, C, HxW, X_fp16.data(), alpha_data, beta_data, Y_data);
fbgemm::RoundToFloat16(
Y_data, Y_data, N * HxW * C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
return true;
}
protected:
template <typename T>
void ComputeFusedParam(
const int C,
const T* scale,
const T* bias,
const T* mean,
const T* var,
T* alpha,
T* beta) {
// alpha = scale / sqrt(var + epsilon)
// beta = bias - alpha * mean
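// Folding the normalization into one affine transform lets the per-element
// kernel below compute Y = alpha * X + beta with a single fused multiply-add.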
EigenVectorArrayMap<T> alpha_arr(alpha, C);
EigenVectorArrayMap<T> beta_arr(beta, C);
std::vector<T> tmp(C, 0.0);
EigenVectorArrayMap<T> tmp_arr(tmp.data(), C);
tmp_arr = ConstEigenVectorArrayMap<T>(var, C) + static_cast<T>(epsilon_);
// sqrt using intrinsics
int i = 0;
constexpr int blockSize = 8;
for (i = 0; i + blockSize <= C; i += blockSize) {
__m256 t = _mm256_loadu_ps(&tmp[i]);
_mm256_storeu_ps(&tmp[i], _mm256_sqrt_ps(t));
}
for (; i < C; i++) {
tmp[i] = sqrt(tmp[i]);
}
alpha_arr = ConstEigenVectorArrayMap<T>(scale, C) / tmp_arr;
beta_arr = ConstEigenVectorArrayMap<T>(bias, C) -
alpha_arr * ConstEigenVectorArrayMap<T>(mean, C);
fbgemm::RoundToFloat16(
alpha, alpha, C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(beta, beta, C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
void AffineChannel_NCHW(
const int N,
const int C,
const int HxW,
const float* X,
const float* scale,
const float* bias,
float* Y) {
ConstEigenVectorArrayMap<float> scale_arr(scale, C);
ConstEigenVectorArrayMap<float> bias_arr(bias, C);
const int stride = C * HxW;
const float* X_ptr = X;
float* Y_ptr = Y;
// Do Y = X * scale + bias
for ([[maybe_unused]] const auto i : c10::irange(N)) {
for (const auto j : c10::irange(C)) {
for (const auto k : c10::irange(HxW)) {
Y_ptr[HxW * j + k] = bias[j];
}
std::vector<float> s2(HxW, scale[j]);
fake_fp16::fma_fp16(
HxW, X_ptr + j * HxW, s2.data(), Y_ptr + HxW * j);
}
X_ptr += stride;
Y_ptr += stride;
}
fbgemm::RoundToFloat16(
Y, Y, N * HxW * C, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
}
const bool is_test_;
float epsilon_;
const StorageOrder order_;
const int num_batches_;
Tensor alpha_;
Tensor beta_;
INPUT_TAGS(
INPUT,
SCALE,
BIAS,
EST_MEAN,
EST_VAR,
BATCH_MEAN_SUM,
BATCH_VAR_SUM);
OUTPUT_TAGS(OUTPUT, RUNNING_MEAN, RUNNING_VAR, SAVED_MEAN, SAVED_INV_STD);
}; // class SpatialBNFakeFp16Op
} // namespace caffe2

View File

@ -1,69 +0,0 @@
#pragma once
#include <caffe2/core/operator.h>
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
namespace caffe2 {
template <class Context>
class SumFP16FP16AccOp : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
USE_SIMPLE_CTOR_DTOR(SumFP16FP16AccOp);
bool DoRunWithFloat() {
auto& input0 = Input(0);
size_t N = input0.numel();
auto* output = Output(0, input0.sizes(), at::dtype<float>());
// Dimension checking
for (const auto i : c10::irange(1, InputSize())) {
if (output->sizes() != Input(i).sizes()) {
CAFFE_THROW(
"Check failed: output->sizes() == Input(i).sizes().",
"Description: Input #",
i,
", input dimension:",
Input(i).sizes(),
" should match output dimension: ",
output->sizes());
}
}
float* output_data = output->template mutable_data<float>();
memset(output_data, 0, sizeof(float) * input0.numel());
std::vector<float> t1(N);
std::vector<float> t2(N);
for (const auto i : c10::irange(InputSize())) {
fbgemm::RoundToFloat16(
Input(i).template data<float>(),
t1.data(),
N,
FLAGS_caffe2_fbgemm_fake_fp16_clamp);
fbgemm::RoundToFloat16(
output_data, t2.data(), N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
math::Add(N, t1.data(), t2.data(), output_data, &context_);
}
fbgemm::RoundToFloat16(
output_data, output_data, N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
return true;
}
bool RunOnDevice() override {
if (Input(0).template IsType<float>()) {
return DoRunWithFloat();
} else {
CAFFE_THROW(
"Sum operator only supports 32-bit float, but",
" input was of type ",
Input(0).dtype().name());
}
}
};
} // namespace caffe2

View File

@ -1,53 +0,0 @@
# How to run FakeLowP vs Glow tests
This was tested on Ubuntu 16.04 LTS but should work on Linux systems in general. The tested compiler is Clang-8.
## Build Glow Onnxifi Library
Follow https://github.com/pytorch/glow/blob/master/README.md to install the dependencies of Glow. Then, at the Glow root, run
```
mkdir build && cd build
cmake -G Ninja -DGLOW_BUILD_ONNXIFI_DYNLIB=ON ..
ninja all
```
Note that you probably want to add other flags like `-DGLOW_WITH_NNPI=1` to enable a specific backend if you have that flow set up. Also, make sure `LD_LIBRARY_PATH` points to the libomp.so path when compiling with `-DGLOW_WITH_NNPI=1`.
```
export LD_LIBRARY_PATH=/usr/lib/llvm-8/lib
```
Once the build succeeds, you will get a dynamic library at `build/lib/Onnxifi/libonnxifi.so`. We will use it later.
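If you want a quick sanity check that the library is loadable before wiring it into the tests, a minimal sketch is shown below. This is just a convenience, not part of the official flow, and the path is an example; point it at your own Glow checkout.
```
import ctypes
import os

# Example path; replace with the location of your Glow build.
lib = "glow/build/lib/Onnxifi/libonnxifi.so"
assert os.path.exists(lib), "libonnxifi.so not found at " + lib
ctypes.CDLL(lib)  # raises OSError if the library or its dependencies fail to load
print("libonnxifi.so loaded OK")
```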
## Build and Install PyTorch
Follow https://github.com/pytorch/pytorch/blob/main/README.md to install the dependencies of PyTorch. It is easiest to
set up a Python virtualenv or conda environment. Please use Python > 3.5.2, because the hypothesis library exposes a Python bug
that was only fixed after 3.5.2; something like Python 3.7 is good enough. You can install Python 3.7 with
```
sudo apt-get install -y build-essential checkinstall libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev zlib1g-dev openssl libffi-dev python3-dev python3-setuptools wget
wget https://www.python.org/ftp/python/3.7.4/Python-3.7.4.tgz && tar -xf Python-3.7.4.tgz
cd Python-3.7.4
./configure && make -j 8 && sudo make altinstall
```
Once Python 3.7 is installed, here is a virtualenv-based flow:
```
sudo pip3.7 install virtualenv
python3.7 -m venv venv3
source venv3/bin/activate
cd pytorch
pip install -r requirements.txt
pip install pytest hypothesis protobuf
```
You probably also need to install gflags-dev with
```
sudo apt-get install libgflags-dev
```
Once you have all the dependencies installed, build PyTorch with FakeLowP operator support
```
USE_CUDA=0 USE_ROCM=0 USE_FAKELOWP=ON DEBUG=1 CMAKE_BUILD_TYPE=Debug USE_GFLAGS=1 USE_GLOG=1 USE_MKLDNN=0 BUILD_TEST=0 python setup.py install
```
The key options here are `USE_FAKELOWP=ON`, which enables building the FakeLowP operators, and `USE_GFLAGS=1`, which enables gflags (we
use gflags in Glow to pass options). The other flags are mostly for faster build times and easier debugging.
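To confirm that the FakeLowP operators actually made it into the build, one quick check (a convenience sketch, assuming the caffe2 Python bindings import cleanly in your environment) is to query the operator registry; `SpatialBNFakeFp16NNPI` is one of the FakeLowP operators, and any of them works here.
```
from caffe2.python import core

# Prints True only if the FakeLowP operators were compiled into this build.
print(core.IsOperator("SpatialBNFakeFp16NNPI"))
```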
## Run the test
You can now run the tests with a command like the following from inside the Python virtual environment:
```
OSS_ONNXIFI_LIB=${PATH_TO_GLOW}/build/lib/Onnxifi/libonnxifi.so pytest pytorch/caffe2/contrib/fakelowp/test --hypothesis-show-statistics
```
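The same run can also be driven from a small Python script if that is more convenient. The paths below are placeholders for your own checkouts; the script only sets `OSS_ONNXIFI_LIB` before handing off to pytest.
```
import os
import pytest

# The fakelowp tests locate the Glow onnxifi library through this variable.
os.environ["OSS_ONNXIFI_LIB"] = "/path/to/glow/build/lib/Onnxifi/libonnxifi.so"
raise SystemExit(pytest.main([
    "pytorch/caffe2/contrib/fakelowp/test",
    "--hypothesis-show-statistics",
]))
```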

View File

@ -1,108 +0,0 @@
# mypy: ignore-errors
import numpy as np
import unittest
import caffe2.python.fakelowp.init_shared_libs # noqa
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
import datetime
from hypothesis import given, settings
import hypothesis.strategies as st
import caffe2.python.serialized_test.serialized_test_util as serial
core.GlobalInit(["caffe2", "--caffe2_log_level=-3", "--glow_global_fp16=1"])
class TestBatchMatMul(serial.SerializedTestCase):
@given(
C=st.integers(min_value=1, max_value=10),
M=st.integers(min_value=1, max_value=50),
K=st.integers(min_value=1, max_value=512),
N=st.integers(min_value=1, max_value=50),
rand_seed=st.integers(0, 65534),
trans_a=st.booleans(),
trans_b=st.booleans(),
run_ints=st.booleans()
)
@settings(deadline=datetime.timedelta(seconds=10))
def test_batch_matmul(self, M, K, N, C, rand_seed, trans_a, trans_b, run_ints):
np.random.seed(rand_seed)
workspace.ResetWorkspace()
batch_dims = [C]
if run_ints:
X = np.random.randint(low=1, high=3, size=((C, M, K))).astype(np.float32)
else:
X = 100 * (np.random.rand(*(batch_dims + [M, K])).astype(np.float32) - 0.5)
if trans_a:
X = X.swapaxes(-1, -2)
if run_ints:
Y = np.random.randint(low=1, high=3, size=((C, K, N))).astype(np.float32)
else:
Y = 100 * (np.random.rand(*(batch_dims + [K, N])).astype(np.float32) - 0.5)
if trans_b:
Y = Y.swapaxes(-1, -2)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["X", "Y"])
pred_net.external_output.append("out")
pred_net.op.add().CopyFrom(
core.CreateOperator(
'BatchMatMul', ['X', 'Y'], 'out', trans_a=trans_a, trans_b=trans_b
)
)
pred_net_ref = core.Net("pred_net_ref")
# Reference updated to fp16 with fp32 accumulation
pred_net_ref.BatchMatMulFP16Acc32Fake(
["X", "Y"], ['out'], trans_a=trans_a, trans_b=trans_b)
print("dims", batch_dims, X.shape, Y.shape)
pred_net_onnxified = onnxifi_caffe2_net(pred_net,
{"X": X.shape, "Y": Y.shape},
debug=True,
adjust_batch=False,
use_onnx=False)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("X", X)
workspace.FeedBlob("Y", Y)
workspace.CreateNet(pred_net_onnxified)
workspace.CreateNet(pred_net_ref)
# Run Glow net
workspace.RunNet(pred_net_onnxified.name)
out_glow = workspace.FetchBlob('out')
# Run caffe2 net
workspace.RunNet(pred_net_ref)
out_c2_fakefp16 = workspace.FetchBlob('out')
diff = np.abs(out_c2_fakefp16 - out_glow)
if not np.allclose(out_glow, out_c2_fakefp16):
print_test_debug_info("bmm", {
"seed": rand_seed,
"m": M, "k": K,
"n": N, "X": X.shape, "Y": Y.shape,
"trans_a": trans_a,
"trans_b": trans_b,
"run_ints": run_ints,
"out_glow": out_glow,
"out_c2_fakefp16": out_c2_fakefp16,
"diff": diff
})
assert(0)
if __name__ == "__main__":
unittest.main()

View File

@ -1,143 +0,0 @@
import numpy as np
import unittest
import caffe2.python.fakelowp.init_shared_libs # noqa
from hypothesis import given, settings
from hypothesis import strategies as st
from caffe2.proto import caffe2_pb2
from caffe2.python import core
from caffe2.python import workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
import caffe2.python.serialized_test.serialized_test_util as serial
import datetime
core.GlobalInit(["caffe2", "--glow_global_fp16=1",
"--glow_global_fused_scale_offset_fp16=1",
"--glow_global_force_sls_fp16_accum=1"])
GLOW_LOWERED_BATCHNORM = False
def reference_spatialbn_test16(X, scale, bias, mean, var, epsilon, order):
X = X.astype(np.float16)
scale = scale.astype(np.float16)
bias = bias.astype(np.float16)
mean = mean.astype(np.float16)
# var = var.astype(np.float16)
assert(order == "NCHW")
scale = scale[np.newaxis, :, np.newaxis, np.newaxis]
bias = bias[np.newaxis, :, np.newaxis, np.newaxis]
mean = mean[np.newaxis, :, np.newaxis, np.newaxis]
var = var[np.newaxis, :, np.newaxis, np.newaxis]
Y = ((X - mean) * (scale / np.sqrt(var + epsilon).astype(np.float16))) + bias
return Y.astype(np.float32)
# Test the lowered BN op
class BatchnormTest(serial.SerializedTestCase):
# TODO: using hypothesis seed, sweep dimensions
@given(seed=st.integers(0, 65535),
size=st.integers(2, 30),
input_channels=st.integers(2, 40),
batch_size=st.integers(2, 20))
@settings(deadline=datetime.timedelta(seconds=10))
def test_bn(self, seed, size, input_channels, batch_size):
workspace.ResetWorkspace()
np.random.seed(seed)
order = "NCHW"
epsilon = 1e-3
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["X", "scale", "bias", "mean", "var"])
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"SpatialBN",
["X", "scale", "bias", "mean", "var"],
["Y"],
order=order,
is_test=True,
epsilon=epsilon
)
)
if GLOW_LOWERED_BATCHNORM:
refopname = "SpatialBNFakeLoweredFp16NNPI"
else:
refopname = "SpatialBNFakeFp16NNPI"
pred_net_ref = caffe2_pb2.NetDef()
pred_net_ref.name = "pred"
pred_net_ref.external_input.extend(["X", "scale", "bias", "mean", "var"])
pred_net_ref.external_output.append("X")
pred_net_ref.op.add().CopyFrom(
core.CreateOperator(
refopname,
["X", "scale", "bias", "mean", "var"],
["Y"],
order=order,
is_test=True,
epsilon=epsilon
)
)
scale = np.random.rand(input_channels).astype(np.float32) + 0.5
bias = np.random.rand(input_channels).astype(np.float32) - 0.5
mean = np.random.randn(input_channels).astype(np.float32)
var = np.random.rand(input_channels).astype(np.float32) + 0.5
X = np.random.rand(
batch_size, input_channels, size, size).astype(np.float32) - 0.5
workspace.FeedBlob("scale", scale)
workspace.FeedBlob("bias", bias)
workspace.FeedBlob("mean", mean)
workspace.FeedBlob("var", var)
# Use for reference to debug
# Y_np = reference_spatialbn_test16(X, scale, bias, mean, var, epsilon, order)
pred_net_onnxified = onnxifi_caffe2_net(
pred_net,
{"X": [batch_size, input_channels, size, size],
"scale": [input_channels],
"bias": [input_channels],
"mean": [input_channels],
"var": [input_channels]},
debug=True,
adjust_batch=False,
use_onnx=False
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("X", X)
workspace.CreateNet(pred_net_onnxified)
workspace.CreateNet(pred_net_ref)
workspace.RunNet(pred_net_ref.name)
Y_c2 = workspace.FetchBlob("Y")
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob("Y")
if not np.allclose(Y_glow.astype(np.float16), Y_c2.astype(np.float16)):
diff = np.abs(Y_glow - Y_c2).astype(np.float16)
print_test_debug_info(
"bn",
{
"seed": seed,
"scale": scale,
"bias": bias,
"mean": mean,
"var": var,
"Y_np": Y_c2,
"Y_glow": Y_glow,
"diff": diff,
"rowwise_diff": np.max(np.abs(diff), -1)})
assert(0)

View File

@ -1,142 +0,0 @@
# Must happen before importing caffe2.python.*
import caffe2.python.fakelowp.init_shared_libs # noqa
import datetime
import numpy as np
from hypothesis import given, settings, example
from hypothesis import strategies as st
from caffe2.python import core, workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
import caffe2.python.serialized_test.serialized_test_util as serial
# Test that parallel chunks behave the same way as the serial one
workspace.GlobalInit(
[
"caffe2",
"--glow_global_fp16=1",
"--glow_global_fused_scale_offset_fp16=1",
"--glow_global_force_sls_fp16_accum=1",
"--glow_nnpi_num_parallel_chunks=2",
"--glow_use_dag_optimizer=false",
"--glow_dump_graph=true",
]
)
class Fusions(serial.SerializedTestCase):
def _get_scale_zp(self, tensor):
tensor_max = np.max(tensor)
tensor_min = min(0, np.min(tensor))
scale = np.float32(np.float16((tensor_max - tensor_min) / 255.0))
if scale < 1e-6:
scale = np.float32(1e-6)
zero_point = 0 - tensor_min / scale
zero_point = int(round(np.clip(zero_point, 0, 255.0)))
return (scale, zero_point)
@given(
scale=st.floats(1e-4, 1e2),
zp=st.integers(-128, 128),
rand_seed=st.integers(0, 65534),
m=st.integers(32, 64),
k=st.integers(1000, 6000),
n=st.integers(200, 600),
)
# @example(m=64, k=5423, n=553, scale=1e-3, zp=120, rand_seed=1)
@settings(deadline=datetime.timedelta(seconds=1000), max_examples=1)
def test_ParallelFC(self, m, k, n, scale, zp, rand_seed):
np.random.seed(rand_seed)
workspace.ResetWorkspace()
# Y = W_T * X + b
X_fp32 = np.random.uniform(-1, 1, size=(m, k)).astype(np.float16) \
.astype(np.float32)
W_fp32 = np.random.uniform(-1, 1, size=(n, k)).astype(np.float32)
b_fp32 = np.zeros((n,), dtype=np.float32)
X_scale, X_zero_point = self._get_scale_zp(X_fp32)
workspace.FeedBlob("X", X_fp32)
workspace.FeedBlob("W", W_fp32)
workspace.FeedBlob("b", b_fp32)
workspace.RunOperatorOnce(
core.CreateOperator(
"Int8FCPackWeight",
["W"],
["W_int8"],
engine="DNNLOWP",
save_unpacked_weights=True,
in_scale=X_scale,
)
)
ref_net = core.Net("net")
ref_net.Int8QuantizeNNPI(
["X"],
["X_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point
)
ref_net.Int8FCFakeAcc32NNPI(
["X_int8", "W_int8", "b"],
["Y_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point,
)
ref_net.Int8Relu(
["Y_int8"],
["Y_relu"],
Y_zero_point=X_zero_point,
Y_scale=X_scale,
)
ref_net.Int8DequantizeNNPI(
["Y_relu"],
["Y"]
)
ref_net.Proto().external_output.append("Y")
# run ref_net
workspace.RunNetOnce(ref_net)
Y_fbgemm = workspace.FetchBlob("Y")
# run onnxifi net
ref_net.Proto().op[0].type = "Int8Quantize"
ref_net.Proto().op[1].type = "Int8FC"
ref_net.Proto().op[2].type = "Int8Relu"
ref_net.Proto().op[3].type = "Int8Dequantize"
net_onnxified = onnxifi_caffe2_net(
ref_net.Proto(),
{},
debug=True,
adjust_batch=False,
use_onnx=False,
weight_names=["W_int8", "b"],
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in net_onnxified.op
)
print(net_onnxified)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.CreateNet(net_onnxified)
workspace.RunNet(net_onnxified.name)
Y_glow = workspace.FetchBlob("Y")
if not np.allclose(Y_glow, Y_fbgemm):
diff_Y = np.abs(Y_glow - Y_fbgemm)
print_test_debug_info(
"int8_fc",
{
"seed": rand_seed,
"n": n,
"X": X_fp32,
"W": W_fp32,
"b": b_fp32,
"Y_fbgemm": Y_fbgemm,
"Y_glow": Y_glow,
"diff": diff_Y,
"maxdiff": diff_Y.max(axis=1),
},
)
assert 0

View File

@ -1,159 +0,0 @@
import numpy as np
import caffe2.python.fakelowp.init_shared_libs # noqa
from caffe2.python import core, workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
import caffe2.python.serialized_test.serialized_test_util as serial
import datetime
from hypothesis import settings
core.GlobalInit(["caffe2", "--caffe2_log_level=-3", "--glow_global_fp16=1"])
class DeqSwishQuantTest(serial.SerializedTestCase):
def _get_scale_zp(self, tensor):
tensor_max = np.max(tensor)
tensor_min = min(0, np.min(tensor))
scale = np.float32(np.float16((tensor_max - tensor_min) / 255.))
zero_point = -tensor_min / scale
zero_point = int(round(np.clip(zero_point, 0, 255.0)))
return (scale, zero_point)
def _sigmoid(self, x):
return 1. / (1. + np.exp(np.float32(-x)))
def _swish(self, x):
return np.float32(x) * self._sigmoid(x)
@settings(deadline=datetime.timedelta(seconds=10))
def test_swish_int8(self):
np.random.seed(0)
workspace.ResetWorkspace()
n = 256
X_fp32 = np.linspace(-20.5, 8., num=n).astype(np.float32).reshape(1, n)
Y_fp32 = self._swish(X_fp32)
X_scale, X_zero_point = self._get_scale_zp(X_fp32)
Y_scale, Y_zero_point = self._get_scale_zp(Y_fp32)
W_fp32 = np.identity(n, dtype=np.float32)
b_fp32 = np.zeros((n,), dtype=np.float32)
workspace.FeedBlob("X", X_fp32)
workspace.FeedBlob("W", W_fp32)
workspace.FeedBlob("b", b_fp32)
workspace.RunOperatorOnce(
core.CreateOperator(
"Int8FCPackWeight",
["W"],
["W_int8"],
engine="DNNLOWP",
save_unpacked_weights=True,
in_scale=X_scale,
)
)
ref_net1 = core.Net("net")
ref_net1.Int8QuantizeNNPI(
["X"],
["X_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point
)
ref_net1.Int8FCFakeAcc32NNPI(
["X_int8", "W_int8", "b"],
["U_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point,
)
ref_net1.SwishFakeInt8NNPI(
["U_int8"],
["Y"],
X_scale=X_scale,
X_zero_point=X_zero_point,
Y_scale=Y_scale,
Y_zero_point=Y_zero_point
)
ref_net1.Proto().external_output.append("Y")
ref_net = core.Net("net")
ref_net.Int8QuantizeNNPI(
["X"],
["X_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point
)
ref_net.Int8FCFakeAcc32NNPI(
["X_int8", "W_int8", "b"],
["U_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point,
)
ref_net.Int8DequantizeNNPI(
["U_int8"],
["U_fp16"],
UsingOneOverScale=False
)
ref_net.SwishFakeFp16NNPI(
["U_fp16"],
["Y_fp16"]
)
ref_net.Int8QuantizeNNPI(
["Y_fp16"],
["Y"],
Y_scale=Y_scale,
Y_zero_point=Y_zero_point
)
ref_net.Proto().external_output.append("Y")
# run ref_net
workspace.RunNetOnce(ref_net1)
Y_fbgemm = workspace.FetchInt8Blob("Y")
# run onnxifi net
ref_net.Proto().op[0].type = "Int8Quantize"
ref_net.Proto().op[1].type = "Int8FC"
ref_net.Proto().op[2].type = "Int8Dequantize"
ref_net.Proto().op[3].type = "Swish"
ref_net.Proto().op[4].type = "Int8Quantize"
net_onnxified = onnxifi_caffe2_net(
ref_net.Proto(),
{},
debug=True,
adjust_batch=False,
use_onnx=False,
weight_names=["W_int8", "b"],
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in net_onnxified.op
)
np.testing.assert_equal(num_onnxified_ops, 1)
# TODO: add an assertion to check the optimized net
# fused Dequantize->Swish->Quantize to QuantizedSwish
workspace.CreateNet(net_onnxified)
workspace.RunNet(net_onnxified.name)
Y_glow = workspace.FetchInt8Blob("Y")
U_int8 = workspace.FetchInt8Blob("U_int8")
diff_Y = np.abs(Y_glow.data - Y_fbgemm.data)
num_mismatches = np.count_nonzero(diff_Y)
max_diff = np.max(diff_Y)
if max_diff > 0 or Y_glow.scale != Y_fbgemm.scale or \
Y_glow.zero_point != Y_fbgemm.zero_point:
print_test_debug_info(
"QuantizedSwish",
{
"X": X_fp32,
"X_scale": X_scale,
"X_zero_point": X_zero_point,
"Y_scale": Y_scale,
"Y_zero_point": Y_zero_point,
"U_int8": U_int8,
"Y_fbgemm": Y_fbgemm,
"Y_glow": Y_glow,
"diff": diff_Y,
"max_diff": max_diff,
"num_mismatches": num_mismatches,
},
)
assert 0

View File

@ -1,357 +0,0 @@
import numpy as np
import unittest
import caffe2.python.fakelowp.init_shared_libs # noqa
from hypothesis import given, settings
from hypothesis import strategies as st
from caffe2.proto import caffe2_pb2
from caffe2.python import core
from caffe2.python import workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
import datetime
import caffe2.python.serialized_test.serialized_test_util as serial
core.GlobalInit(["caffe2", "--caffe2_log_level=-3", "--glow_global_fp16=1"])
GLOW_MATMUL_RTOL = 0
class FCTest(serial.SerializedTestCase):
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=10))
def test_clip(self, seed):
np.random.seed(seed)
m, n, k = 8, 8, 8
dtype = np.float32
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["X", "W0", "b0", "W1", "b1"])
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"FC",
["X", "W0", "b0"],
["X1"],
)
)
pred_net.op.add().CopyFrom(
core.CreateOperator(
"FC",
["X1", "W1", "b1"],
["Y"],
)
)
workspace.GlobalInit(
['caffe2', '--caffe2_log_level=0', '--glow_global_fp16=1',
'--glow_clip_fp16', '--glow_global_fp16_constants=1'])
workspace.SwitchWorkspace("glow_test_ws", True)
workspace.ResetWorkspace()
W0 = np.full((n, k), 65536.0, dtype)
b0 = np.random.randint(low=1, high=3, size=(n)).astype(dtype)
W1 = np.random.randint(low=1, high=3, size=(n, k)).astype(dtype)
b1 = np.random.randint(low=1, high=3, size=(n)).astype(dtype)
workspace.FeedBlob("W0", W0)
workspace.FeedBlob("b0", b0)
workspace.FeedBlob("W1", W1)
workspace.FeedBlob("b1", b1)
pred_net_onnxified = onnxifi_caffe2_net(
pred_net,
{"X": (m, k)},
debug=True,
adjust_batch=False,
use_onnx=False
)
X = np.random.randint(low=1, high=3, size=(m, k)).astype(dtype)
workspace.FeedBlob("X", X)
workspace.CreateNet(pred_net_onnxified)
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob("Y")
np.testing.assert_allclose(Y_glow, np.full((m, n), 65504.0, dtype))
@given(
m=st.integers(4, 50),
k=st.integers(4, 50),
n=st.integers(4, 50),
seed=st.integers(0, 65534)
)
@settings(deadline=datetime.timedelta(seconds=10))
def test_fc_exercise(self, m, k, n, seed):
""" Test that the matmul engine is working, this doesn't test
precision
"""
np.random.seed(seed)
dtype = np.float32
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["X", "W0", "b0"])
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"FC",
["X", "W0", "b0"],
["Y"],
)
)
workspace.SwitchWorkspace("glow_test_ws", True)
workspace.ResetWorkspace()
W0 = np.random.randint(low=1, high=3, size=(n, k)).astype(dtype)
b0 = np.random.randint(low=1, high=3, size=(n)).astype(dtype)
workspace.FeedBlob("W0", W0)
workspace.FeedBlob("b0", b0)
pred_net_onnxified = onnxifi_caffe2_net(pred_net,
{"X": (m, k)},
debug=True,
adjust_batch=False,
use_onnx=False)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
X0 = np.random.randint(low=1, high=3, size=(m, k)).astype(dtype)
workspace.FeedBlob("X", X0)
workspace.CreateNet(pred_net_onnxified)
workspace.CreateNet(pred_net)
num_iterations = 2
for _ in range(num_iterations):
X0 = np.random.randint(low=1, high=3, size=(m, k)).astype(dtype)
workspace.FeedBlob("X", X0)
# Run Glow net
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob('Y')
# Run caffe2 net
workspace.RunNet(pred_net.name)
Y_c2 = workspace.FetchBlob('Y')
if not np.allclose(Y_c2, Y_glow):
print_test_debug_info("fc", {
"seed": seed,
"m": m,
"k": k,
"n": n,
"X": X0,
"W0": W0,
"b0": b0,
"Y_glow": Y_glow,
"Y_c2": Y_c2,
"diff": np.abs((Y_c2 - Y_glow) / Y_c2)})
assert(0)
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=10))
def test_fc_numeric_cases(self, seed):
""" Test numerics, use examples found from the unit test.
Use Fp16FCAcc16NNPI as a reference.
"""
np.random.seed(seed)
m = 1
k = 20
n = 1
dtype = np.float32
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["X", "W0", "b0"])
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"FC",
["X", "W0", "b0"],
["Y"],
)
)
pred_net_ref = caffe2_pb2.NetDef()
pred_net_ref.name = "pred"
pred_net_ref.external_input.extend(["X", "W0", "b0"])
pred_net_ref.external_output.append("Y")
pred_net_ref.op.add().CopyFrom(
core.CreateOperator(
"Fp16FCAcc32NNPI",
["X", "W0", "b0"],
["Y"],
)
)
workspace.SwitchWorkspace("glow_test_ws", True)
workspace.ResetWorkspace()
W0 = np.array([[0.04882812, 0.21520996, 0.1027832, 0.04489136,
-0.07635498, 0.14587402,
-0.06240845, 0.3918457, 0.46362305, -0.11657715,
0.29174805, 0.02890015,
0.0680542, 0.4255371, -0.42895508, -0.4128418,
-0.47973633, 0.33251953,
0.27807617, 0.3701172]], dtype=np.float32)
b0 = np.array([0.47851562], dtype=np.float32)
workspace.FeedBlob("W0", W0)
workspace.FeedBlob("b0", b0)
pred_net_onnxified = onnxifi_caffe2_net(pred_net,
{"X": (m, k)},
debug=True,
adjust_batch=False,
use_onnx=False)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
X_inputs = [
np.array([[
-2.94921875e-01, -3.58642578e-01, -1.92871094e-01,
2.81250000e-01, -1.30126953e-01, 2.32696533e-02,
-4.55566406e-01, -2.31811523e-01, -1.95190430e-01,
-7.76977539e-02, -1.29394531e-01, 2.94677734e-01,
8.96453857e-04, 4.97314453e-01, -6.07604980e-02,
2.55371094e-01, 3.49853516e-01, -1.37695312e-01,
2.95410156e-01, -3.67187500e-01]], dtype=np.float32),
np.array([[
-0.4494629, -0.22192383, -0.1640625, 0.11480713,
-0.09851074, -0.02084351,
0.19091797, -0.17468262, -0.47485352, 0.07489014,
0.03897095, 0.00197601,
0.02835083, -0.27294922, 0.26757812, -0.20996094,
-0.31103516, -0.41601562,
0.09918213, -0.07696533]], dtype=np.float32),
np.array([[
0.01150513, -0.20507812, 0.46704102, 0.00906372,
0.19848633, 0.3720703,
0.46557617, -0.47436523, -0.35107422, -0.0362854,
-0.20812988, 0.41918945,
0.09716797, 0.19897461, 0.3876953, -0.0165863,
0.23535156, 0.29956055,
0.24389648, -0.23486328]], dtype=np.float32)
]
# keep onnxifi happy by feeding something with a shape
workspace.FeedBlob("X", X_inputs[0])
workspace.CreateNet(pred_net_onnxified)
workspace.CreateNet(pred_net_ref)
for i in range(len(X_inputs)):
workspace.FeedBlob("X", X_inputs[i])
# Run Glow net
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob('Y')
workspace.RunNet(pred_net_ref.name)
Y_c2 = workspace.FetchBlob('Y')
diff = np.abs((Y_c2 - Y_glow) / (Y_c2 + 1e-8))
rowdiff = np.max(diff, axis=1)
n_offenders = np.count_nonzero(rowdiff[rowdiff > GLOW_MATMUL_RTOL])
if n_offenders > 0:
print_test_debug_info("fc", {
"seed": seed,
"iter": i,
"m": m,
"k": k,
"n": n,
"W0": W0,
"b0": b0,
"Y_glow": Y_glow,
"Y_c2": Y_c2,
"diff": diff,
"rowdiff": rowdiff})
assert(0)
@given(
m=st.integers(1, 50),
k=st.integers(1, 1000),
n=st.integers(1, 50),
seed=st.integers(0, 65534),
use_packed=st.integers(0, 2)
)
@settings(deadline=datetime.timedelta(seconds=10))
def test_fc_num0(self, seed, m, k, n, use_packed):
""" Test numerics, fix a dimension and determine the ranges of error.
Use Fp16FCAcc16 as a reference.
"""
W = "W_packed" if use_packed else "W0"
dtype = np.float32
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["X", W, "b0"])
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"FbFCPacked" if use_packed else "FC",
["X", W, "b0"],
["Y"],
)
)
pred_net_ref = caffe2_pb2.NetDef()
pred_net_ref.name = "pred"
pred_net_ref.external_input.extend(["X", W, "b0"])
pred_net_ref.external_output.append("Y")
pred_net_ref.op.add().CopyFrom(
core.CreateOperator(
"Fp16FCAcc32NNPI",
["X", W, "b0"],
["Y"],
)
)
workspace.SwitchWorkspace("glow_test_ws", True)
workspace.ResetWorkspace()
W0 = 10 * (np.random.rand(n, k) - 0.5).astype(np.float16).astype(np.float32)
b0 = 1 * (np.random.rand(n) - 0.5).astype(np.float16).astype(np.float32)
workspace.FeedBlob("W0", W0)
workspace.FeedBlob("b0", b0)
workspace.RunOperatorOnce(
core.CreateOperator(
"FbGemmPack",
['W0'],
['W_packed'],
no_packing=True,
)
)
pred_net_onnxified = onnxifi_caffe2_net(pred_net,
{"X": (m, k)},
debug=True,
adjust_batch=False,
use_onnx=False)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
X0 = np.random.rand(m, k).astype(dtype) - 0.5
workspace.FeedBlob("X", X0)
workspace.CreateNet(pred_net_onnxified)
workspace.CreateNet(pred_net_ref)
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob('Y')
# Run caffe2 net
workspace.RunNet(pred_net_ref.name)
Y_c2 = workspace.FetchBlob('Y')
diff = np.abs((Y_c2 - Y_glow) / (Y_c2 + 1e-8))
rowdiff = np.max(diff, axis=1)
n_offenders = np.count_nonzero(rowdiff[rowdiff > GLOW_MATMUL_RTOL])
if n_offenders > 0:
print_test_debug_info("fc", {
"seed": seed,
"use_packed": use_packed,
"m": m,
"k": k,
"n": n,
"X": X0.shape,
"W0": W0.shape,
"b0": b0.shape,
"Y_glow": Y_glow,
"Y_c2": Y_c2,
"diff": diff,
"rowdiff": rowdiff})
assert(0)
if __name__ == '__main__':
unittest.main()

View File

@ -1,99 +0,0 @@
# Must happen before importing caffe2.python.*
import caffe2.python.fakelowp.init_shared_libs # noqa
import datetime
import numpy as np
from hypothesis import given, settings
from hypothesis import strategies as st
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
import caffe2.python.serialized_test.serialized_test_util as serial
workspace.GlobalInit(
[
"caffe2",
"--glow_global_fp16=1",
"--glow_global_fused_scale_offset_fp16=1",
"--glow_global_force_sls_fp16_accum=1",
]
)
class Fusions(serial.SerializedTestCase):
@given(
scale=st.floats(1e-4, 1e2),
zp=st.integers(-128, 128),
size=st.integers(1, 100000),
rand_seed=st.integers(0, 65534),
)
@settings(deadline=datetime.timedelta(seconds=10))
def test_tanhquantize(self, scale, zp, size, rand_seed):
np.random.seed(rand_seed)
workspace.ResetWorkspace()
pred_net = caffe2_pb2.NetDef()
pred_net.name = "ref"
pred_net.external_input.append("X")
pred_net.external_output.append("Y_q")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"Tanh", ["X"], ["Y"]
)
)
pred_net.op.add().CopyFrom(
core.CreateOperator(
"Int8Quantize", ["Y"], ["Y_q"], Y_scale=scale, Y_zero_point=zp
)
)
X = np.linspace(-1, 1, size).astype(np.float16).astype(np.float32)
pred_net_onnxified = onnxifi_caffe2_net(
pred_net,
{"X": X.shape},
debug=True,
adjust_batch=False,
use_onnx=False,
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op
)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("X", X)
workspace.CreateNet(pred_net_onnxified)
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchInt8Blob("Y_q")
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.append("X")
ref_net.external_output.append("Y_q")
ref_net.op.add().CopyFrom(
core.CreateOperator(
"TanhQuantFakeFp16NNPI", ["X"], ["Y_q"], Y_scale=scale, Y_zero_point=zp
)
)
workspace.CreateNet(ref_net)
workspace.RunNet(ref_net.name)
Y_ref = workspace.FetchInt8Blob("Y_q")
if not np.array_equal(Y_ref.data, Y_glow.data) or \
not Y_ref.scale == Y_glow.scale or \
not Y_ref.zero_point == Y_glow.zero_point:
print_test_debug_info(
"tanhfusion",
{
"scale": scale,
"zp": zp,
"input": X,
"ideal nonquant": np.tanh(X),
"Y_glow": Y_glow,
"Y_c2": Y_ref,
}
)
assert(0)

View File

@ -1,322 +0,0 @@
import caffe2.python.fakelowp.init_shared_libs # noqa
import numpy as np
from caffe2.python import core, workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from hypothesis import given, strategies as st, settings
from caffe2.python.fakelowp.test_utils import print_test_debug_info
import caffe2.python.serialized_test.serialized_test_util as serial
import datetime
core.GlobalInit(["caffe2",
"--caffe2_log_level=-3",
"--glow_global_fp16=1",
"--glow_clip_quant_range_to_fp16=1",
"--glow_global_fp16_constants=1"
])
class Int8OpsTest(serial.SerializedTestCase):
def _get_scale_zp(self, tensor):
tensor_max = np.max(tensor)
tensor_min = min(0, np.min(tensor))
scale = np.float32(np.float16((tensor_max - tensor_min) / 255.0))
if scale < 1e-6:
scale = np.float32(1e-6)
zero_point = 0 - tensor_min / scale
zero_point = int(round(np.clip(zero_point, 0, 255.0)))
return (scale, zero_point)
@given(
n=st.integers(2, 1024),
rand_seed=st.integers(0, 65534),
non_zero_offset=st.booleans()
)
@settings(deadline=datetime.timedelta(seconds=50))
def test_int8_quantize(self, n, rand_seed, non_zero_offset):
print("n={}, rand_seed={}".format(n, rand_seed))
np.random.seed(rand_seed)
workspace.ResetWorkspace()
if non_zero_offset:
X_fp32 = np.random.uniform(-1, 1, size=(n, n)).astype(np.float16) \
.astype(np.float32)
else:
X_fp32 = np.random.rand(n, n).astype(np.float16).astype(np.float32)
W_fp32 = np.identity(n, dtype=np.float32)
b_fp32 = np.zeros((n,), dtype=np.float32)
X_scale, X_zero_point = self._get_scale_zp(X_fp32)
workspace.FeedBlob("X", X_fp32)
workspace.FeedBlob("W", W_fp32)
workspace.FeedBlob("b", b_fp32)
workspace.RunOperatorOnce(
core.CreateOperator(
"Int8FCPackWeight",
["W"],
["W_int8"],
engine="DNNLOWP",
save_unpacked_weights=True,
in_scale=X_scale,
)
)
ref_net = core.Net("net")
ref_net.Int8QuantizeNNPI(
["X"],
["X_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point
)
ref_net.Int8FCFakeAcc32NNPI(
["X_int8", "W_int8", "b"],
["Y_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point,
)
ref_net.Int8DequantizeNNPI(
["Y_int8"],
["Y"]
)
ref_net.Proto().external_output.append("Y")
# run ref_net
workspace.RunNetOnce(ref_net)
Y_fbgemm = workspace.FetchBlob("Y")
# run onnxifi net
ref_net.Proto().op[0].type = "Int8Quantize"
ref_net.Proto().op[1].type = "Int8FC"
ref_net.Proto().op[2].type = "Int8Dequantize"
net_onnxified = onnxifi_caffe2_net(
ref_net.Proto(),
{},
debug=True,
adjust_batch=False,
use_onnx=False,
weight_names=["W_int8", "b"],
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in net_onnxified.op
)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.CreateNet(net_onnxified)
workspace.RunNet(net_onnxified.name)
Y_glow = workspace.FetchBlob("Y")
if not np.allclose(Y_glow, Y_fbgemm):
diff_Y = np.abs(Y_glow - Y_fbgemm)
print_test_debug_info(
"int8_fc",
{
"seed": rand_seed,
"n": n,
"X": X_fp32,
"W": W_fp32,
"b": b_fp32,
"Y_fbgemm": Y_fbgemm,
"Y_glow": Y_glow,
"diff": diff_Y,
"maxdiff": diff_Y.max(axis=1),
},
)
assert 0
@given(
n=st.integers(1, 1024),
m=st.integers(1, 1024),
k=st.integers(1, 1024),
f=st.integers(1, 1),  # TODO: figure out a safe number to increase
rand_seed=st.integers(0, 65534),
quantize_bias=st.sampled_from([False]),
)
@settings(deadline=datetime.timedelta(seconds=50))
def test_int8_fc(
self, n, m, k, rand_seed, quantize_bias, f
):
print(
f"n={n}, m={m}, k={k}, rand_seed={rand_seed}, quantize_bias={quantize_bias}"
)
np.random.seed(rand_seed)
workspace.ResetWorkspace()
ff = float(f)
X_fp32 = np.random.uniform(-ff, ff, size=(m, k)).astype(np.float32)
W_fp32 = np.random.uniform(-ff, ff, size=(n, k)).astype(np.float32)
b_fp32 = np.random.uniform(-ff, ff, size=(n)).astype(np.float32)
X_scale, X_zero_point = self._get_scale_zp(X_fp32)
Y_fp32 = np.dot(X_fp32, W_fp32.T) + b_fp32
Y_scale, Y_zero_point = self._get_scale_zp(Y_fp32)
workspace.FeedBlob("X", X_fp32)
workspace.FeedBlob("W", W_fp32)
workspace.FeedBlob("b", b_fp32)
workspace.RunOperatorOnce(
core.CreateOperator(
"Int8FCPackWeight",
["W", "b"] if quantize_bias else ["W"],
["W_int8", "b_int32"] if quantize_bias else ["W_int8"],
engine="DNNLOWP",
save_unpacked_weights=True,
in_scale=X_scale,
)
)
ref_net = core.Net("net")
ref_net.Int8QuantizeNNPI(
["X"],
["X_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point
)
ref_net.Int8FCFakeAcc32NNPI(
["X_int8", "W_int8", "b_int32" if quantize_bias else "b"],
["Y_int8"],
Y_scale=Y_scale,
Y_zero_point=Y_zero_point,
)
ref_net.Int8DequantizeNNPI(
["Y_int8"],
["Y"]
)
ref_net.Proto().external_output.append("Y")
# run ref_net
workspace.RunNetOnce(ref_net)
Y_fbgemm = workspace.FetchBlob("Y")
# run onnxifi net
ref_net.Proto().op[0].type = "Int8Quantize"
ref_net.Proto().op[1].type = "Int8FC"
ref_net.Proto().op[2].type = "Int8Dequantize"
net_onnxified = onnxifi_caffe2_net(
ref_net.Proto(),
{},
debug=True,
adjust_batch=False,
use_onnx=False,
weight_names=["W_int8", "b_int32"] if quantize_bias else ["W_int8", "b"],
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in net_onnxified.op
)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.CreateNet(net_onnxified)
workspace.RunNet(net_onnxified.name)
Y_glow = workspace.FetchBlob("Y")
if not np.allclose(Y_glow, Y_fbgemm):
diff_Y = np.abs(Y_glow - Y_fbgemm)
print_test_debug_info(
"int8_fc",
{
"seed": rand_seed,
"n": n,
"m": m,
"k": k,
"X": X_fp32,
"W": W_fp32,
"b": b_fp32,
"Y_fbgemm": Y_fbgemm,
"Y_glow": Y_glow,
"diff": diff_Y,
"maxdiff": diff_Y.max(axis=1),
},
)
assert 0
@given(
n=st.integers(1, 4),
rand_seed=st.integers(0, 65534)
)
@settings(deadline=datetime.timedelta(seconds=10))
def test_int8_small_input(self, n, rand_seed):
print("n={}, rand_seed={}".format(n, rand_seed))
np.random.seed(rand_seed)
workspace.ResetWorkspace()
X_fp32 = np.random.uniform(0.01, 0.03, size=(n, n)).astype(np.float32)
W_fp32 = np.identity(n, dtype=np.float32)
b_fp32 = np.zeros((n,), dtype=np.float32)
X_scale, X_zero_point = self._get_scale_zp(X_fp32)
workspace.FeedBlob("X", X_fp32)
workspace.FeedBlob("W", W_fp32)
workspace.FeedBlob("b", b_fp32)
workspace.RunOperatorOnce(
core.CreateOperator(
"Int8FCPackWeight",
["W"],
["W_int8"],
engine="DNNLOWP",
save_unpacked_weights=True,
in_scale=X_scale,
)
)
ref_net = core.Net("net")
ref_net.Int8QuantizeNNPI(
["X"],
["X_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point
)
ref_net.Int8FCFakeAcc32NNPI(
["X_int8", "W_int8", "b"],
["Y_int8"],
Y_scale=X_scale,
Y_zero_point=X_zero_point,
)
ref_net.Int8DequantizeNNPI(
["Y_int8"],
["Y"]
)
ref_net.Proto().external_output.append("Y")
# run ref_net
workspace.RunNetOnce(ref_net)
Y_fbgemm = workspace.FetchBlob("Y")
# run onnxifi net
ref_net.Proto().op[0].type = "Int8Quantize"
ref_net.Proto().op[1].type = "Int8FC"
ref_net.Proto().op[2].type = "Int8Dequantize"
net_onnxified = onnxifi_caffe2_net(
ref_net.Proto(),
{},
debug=True,
adjust_batch=False,
use_onnx=False,
weight_names=["W_int8", "b"],
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in net_onnxified.op
)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.CreateNet(net_onnxified)
workspace.RunNet(net_onnxified.name)
Y_glow = workspace.FetchBlob("Y")
if not np.allclose(Y_glow, Y_fbgemm):
diff_Y = np.abs(Y_glow - Y_fbgemm)
print_test_debug_info(
"int8_fc",
{
"seed": rand_seed,
"n": n,
"X": X_fp32,
"W": W_fp32,
"b": b_fp32,
"Y_fbgemm": Y_fbgemm,
"Y_glow": Y_glow,
"diff": diff_Y,
"maxdiff": diff_Y.max(axis=1),
},
)
assert 0

View File

@ -1,97 +0,0 @@
# Must happen before importing caffe2.python.*
import caffe2.python.fakelowp.init_shared_libs # noqa
import datetime
import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
import caffe2.python.serialized_test.serialized_test_util as serial
from hypothesis import settings
workspace.GlobalInit(
[
"caffe2",
"--glow_global_fp16=0",
"--glow_global_fused_scale_offset_fp16=0",
"--glow_global_force_sls_fp16_accum=0",
]
)
class QuantTest(serial.SerializedTestCase):
@settings(deadline=datetime.timedelta(seconds=10))
def test_dequantize(self):
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.append("X")
pred_net.external_output.append("Y")
x_scale = 0.10000000149011612
pred_net.op.add().CopyFrom(
core.CreateOperator(
"Int8Quantize", ["X"], ["I"], Y_scale=x_scale, Y_zero_point=0
)
)
pred_net.op.add().CopyFrom(
core.CreateOperator(
"Int8Dequantize", ["I"], ["Y"],
)
)
print(pred_net)
X = np.asarray([[1, 0], [0, 1]]).astype(np.float32)
workspace.FeedBlob("X", X)
workspace.CreateNet(pred_net)
workspace.RunNet(pred_net.name)
Y_ref = workspace.FetchBlob("Y")
workspace.ResetWorkspace()
pred_net_onnxified = onnxifi_caffe2_net(
pred_net,
{"X": [5, 2]},
debug=True,
adjust_batch=True,
block_list=[0],
use_onnx=False,
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op
)
np.testing.assert_equal(len(pred_net_onnxified.op), 2)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("X", X)
workspace.CreateNet(pred_net_onnxified)
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob("Y")
np.testing.assert_equal(Y_ref, Y_glow)
@settings(deadline=datetime.timedelta(seconds=20))
def test_quantize(self):
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.append("X")
pred_net.external_output.append("Y")
x_scale = 0.10000000149011612
pred_net.op.add().CopyFrom(
core.CreateOperator(
"Int8Quantize", ["X"], ["Y"], Y_scale=x_scale, Y_zero_point=0
)
)
print(pred_net)
X = np.asarray([[1, 0], [0, 1]]).astype(np.float32)
workspace.FeedBlob("X", X)
workspace.RunNetOnce(pred_net)
Y_ref = workspace.FetchInt8Blob("Y")
workspace.ResetWorkspace()
pred_net_onnxified = onnxifi_caffe2_net(
pred_net,
{"X": [2, 2]},
debug=True,
adjust_batch=False,
use_onnx=False,
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op
)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("X", X)
workspace.CreateNet(pred_net_onnxified)
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchInt8Blob("Y")
np.testing.assert_equal(Y_ref.data, Y_glow.data)

View File

@ -1,240 +0,0 @@
import numpy as np
import caffe2.python.fakelowp.init_shared_libs # noqa
from caffe2.proto import caffe2_pb2
from caffe2.python import core
from caffe2.python import workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
from hypothesis import given, settings
from hypothesis import strategies as st
import caffe2.python.serialized_test.serialized_test_util as serial
import datetime
core.GlobalInit(["caffe2",
"--glow_global_fp16=1",
"--glow_global_fused_scale_offset_fp16=1",
"--glow_global_force_sls_fp16_accum=1"])
GLOW_LOWERED_BATCHNORM = False
# Test the lowered LayerNorm op
class LayerNorm(serial.SerializedTestCase):
@given(seed=st.integers(0, 65535),
batch_size=st.integers(min_value=1, max_value=50),
size=st.integers(min_value=2, max_value=128),
epsilon=st.floats(min_value=1e-4, max_value=1e-3),
elementwise_affine=st.booleans())
@settings(deadline=datetime.timedelta(seconds=10))
def test_layernorm(self, seed, batch_size, size, epsilon, elementwise_affine):
np.random.seed(seed)
# Reset the workspace
workspace.ResetWorkspace()
axis = 1
dims = np.array(([batch_size, size]))
X = np.random.uniform(size=dims).astype(np.float32) - 0.5
gamma = np.random.randn(*X.shape[axis:]).astype(np.float32)
beta = np.random.randn(*X.shape[axis:]).astype(np.float32)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["X", "gamma", "beta"])
pred_net.external_output.extend(["Y", "mean", "rstd"])
pred_net.op.add().CopyFrom(
core.CreateOperator(
"LayerNorm",
["X", "gamma", "beta"] if elementwise_affine else ["X"],
["Y", "mean", "rstd"],
axis=axis,
epsilon=epsilon,
elementwise_affine=elementwise_affine
)
)
pred_net_ref = caffe2_pb2.NetDef()
pred_net_ref.name = "pred_ref"
pred_net_ref.external_input.extend(["X", "gamma", "beta"])
pred_net_ref.external_output.extend(["Y", "mean", "rstd"])
pred_net_ref.op.add().CopyFrom(
core.CreateOperator(
"LayerNormFakeFP16NNPI",
["X", "gamma", "beta"] if elementwise_affine else ["X"],
["Y", "mean", "rstd"],
axis=axis,
epsilon=epsilon,
elementwise_affine=elementwise_affine
)
)
shape_hits = {"X": X.shape, "gamma": gamma.shape, "beta": beta.shape}
pred_net_onnxified = onnxifi_caffe2_net(
pred_net,
shape_hits,
debug=True,
adjust_batch=True,
use_onnx=False
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("X", X)
workspace.FeedBlob("gamma", gamma)
workspace.FeedBlob("beta", beta)
workspace.CreateNet(pred_net_ref)
workspace.CreateNet(pred_net_onnxified)
workspace.RunNet(pred_net_ref.name)
Y_c2 = workspace.FetchBlob("Y")
dims1 = np.array(([1, *dims]))
X_glow = X.reshape(dims1)
workspace.FeedBlob("X", X_glow)
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob("Y")
if not np.allclose(Y_glow, Y_c2):
diff_Y = np.abs(Y_glow - Y_c2)
print_test_debug_info(
"layernorm",
{
"seed": seed,
"size": size,
"batch_size": batch_size,
"epsilon": epsilon,
"gamma": gamma,
"beta": beta,
"elementwise_affine": elementwise_affine,
"X": X,
"Y_glow": Y_glow,
"Y_c2": Y_c2,
"diff_Y": diff_Y,
}
)
assert(0)
def _get_scale_zp(self, tensor):
tensor_max = np.max(tensor)
tensor_min = min(0, np.min(tensor))
scale = np.float32(np.float16((tensor_max - tensor_min) / 255.0))
if scale < 1e-6:
scale = np.float32(1e-6)
zero_point = 0 - tensor_min / scale
zero_point = int(round(np.clip(zero_point, 0, 255.0)))
return (scale, zero_point)
def _layernorm_transform(self, X):
mean = np.mean(X, axis=1)
mean_exp = np.outer(mean, np.ones(X.shape[1]))
std = np.std(X, axis=1)
std_exp = np.outer(std, np.ones(X.shape[1]))
Y = (X - mean_exp) / std_exp
return Y
@given(seed=st.integers(0, 65535),
batch_size=st.integers(min_value=1, max_value=50),
size=st.integers(min_value=2, max_value=128),
epsilon=st.floats(min_value=1e-4, max_value=1e-3),
elementwise_affine=st.booleans())
@settings(deadline=datetime.timedelta(seconds=10))
# re-enable when T74553975 gets fixed
def test_fused_ln_quantize(self, seed, batch_size, size, epsilon, elementwise_affine):
np.random.seed(seed)
# Reset the workspace
workspace.ResetWorkspace()
axis = 1
dims = np.array(([batch_size, size]))
X = np.random.uniform(size=dims).astype(np.float32) - 0.5
gamma = np.random.randn(*X.shape[axis:]).astype(np.float32)
beta = np.random.randn(*X.shape[axis:]).astype(np.float32)
Y = self._layernorm_transform(X)
scale, zp = self._get_scale_zp(Y)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["X", "gamma", "beta"])
pred_net.external_output.extend(["Y_q"])
pred_net.op.add().CopyFrom(
core.CreateOperator(
"LayerNorm",
["X", "gamma", "beta"] if elementwise_affine else ["X"],
["Y", "mean", "rstd"],
axis=axis,
epsilon=epsilon,
elementwise_affine=elementwise_affine
)
)
pred_net.op.add().CopyFrom(
core.CreateOperator(
"Int8Quantize", ["Y"], ["Y_q"], Y_scale=scale, Y_zero_point=zp
)
)
print(pred_net)
pred_net_ref = caffe2_pb2.NetDef()
pred_net_ref.name = "pred_ref"
pred_net_ref.external_input.extend(["X", "gamma", "beta"])
pred_net_ref.external_output.extend(["Y_q"])
pred_net_ref.op.add().CopyFrom(
core.CreateOperator(
"LayerNormInt8QuantizeFakeNNPI",
["X", "gamma", "beta"] if elementwise_affine else ["X"],
["Y_q", "mean", "rstd"],
axis=axis,
epsilon=epsilon,
elementwise_affine=elementwise_affine,
Y_scale=scale, Y_zero_point=zp
)
)
shape_hits = {"X": X.shape, "gamma": gamma.shape, "beta": beta.shape}
pred_net_onnxified = onnxifi_caffe2_net(
pred_net,
shape_hits,
debug=True,
adjust_batch=True,
use_onnx=False
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("X", X)
workspace.FeedBlob("gamma", gamma)
workspace.FeedBlob("beta", beta)
workspace.CreateNet(pred_net_ref)
workspace.CreateNet(pred_net_onnxified)
workspace.RunNet(pred_net_ref.name)
Y_c2 = workspace.FetchInt8Blob("Y_q")
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchInt8Blob("Y_q")
if not np.allclose(Y_glow.data, Y_c2.data) or \
Y_glow.scale != Y_c2.scale or Y_glow.zero_point != Y_c2.zero_point:
diff_Y = np.abs(Y_glow.data.astype(np.float32) - Y_c2.data.astype(np.float32))
print_test_debug_info(
"layernorm",
{
"seed": seed,
"size": size,
"batch_size": batch_size,
"epsilon": epsilon,
"gamma": gamma,
"beta": beta,
"elementwise_affine": elementwise_affine,
"X": X,
"Y_glow": Y_glow,
"Y_c2": Y_c2,
"diff_Y": diff_Y,
}
)
assert(0)

View File

@ -1,368 +0,0 @@
import numpy as np
import caffe2.python.fakelowp.init_shared_libs # noqa
import datetime
from hypothesis import given, settings
from hypothesis import strategies as st
from caffe2.proto import caffe2_pb2
from caffe2.python import core
from caffe2.python import workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
from caffe2.python.fakelowp.test_utils import compute_ulp_error
import caffe2.python.serialized_test.serialized_test_util as serial
core.GlobalInit(["caffe2", "--caffe2_log_level=-3", "--glow_global_fp16=1"])
kEpsilon = 1e-8
class ArithmeticOpsTest(serial.SerializedTestCase):
def _test_binary_op_graph(self, name, seed):
np.random.seed(seed)
workspace.ResetWorkspace()
# First dimension is the batch size
dims = np.concatenate((np.array([1]), np.random.randint(1, 20, size=3)))
A = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32)
B = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32)
# Avoid dividing by 0
B[np.abs(B) < 1e-3] = 1e-3
print(A.shape, B.shape)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["A", "B"])
pred_net.external_output.append("C")
pred_net.op.add().CopyFrom(
core.CreateOperator(
name,
["A", "B"],
["C"]
)
)
pred_net_ref = caffe2_pb2.NetDef()
pred_net_ref.name = "ref"
pred_net_ref.external_input.extend(["A", "B"])
pred_net_ref.external_output.append("C_ref")
pred_net_ref.op.add().CopyFrom(
core.CreateOperator(
name + "FakeFp16",
["A", "B"],
["C_ref"],
)
)
shape_hints = {"A": A.shape, "B": B.shape}
pred_net_onnxified = onnxifi_caffe2_net(pred_net,
shape_hints,
debug=True,
adjust_batch=True,
use_onnx=False)
print(pred_net_onnxified)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.SwitchWorkspace("glow_test_ws", True)
workspace.FeedBlob("A", A)
workspace.FeedBlob("B", B)
workspace.CreateNet(pred_net_ref)
workspace.CreateNet(pred_net_onnxified)
num_iterations = 10
for _ in range(num_iterations):
A = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32)
B = np.random.uniform(low=-100.0, high=100.0, size=dims).astype(np.float32)
# Avoid dividing by 0
B[np.abs(B) < 1e-3] = 1e-3
workspace.FeedBlob("A", A)
workspace.FeedBlob("B", B)
# Run caffe2 net
workspace.RunNet(pred_net_ref.name)
Y_c2 = workspace.FetchBlob("C_ref")
# Run Glow net
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob("C")
Y_glow[Y_glow == np.Inf] = np.finfo(np.float16).max
Y_glow[Y_glow == np.NINF] = np.finfo(np.float16).min
# Ignore mismatches solely due to difference in precision
fp16_finite = np.isfinite(A.astype(np.float16) / B.astype(np.float16))
# Results should be identical since we are comparing with the C2 emulation
if not np.allclose(Y_c2[fp16_finite], Y_glow[fp16_finite]):
diff = np.abs((Y_glow - Y_c2) / (Y_c2 + kEpsilon))
print_test_debug_info(name, {
"dims": dims, "iter": _, "seed": seed, "A": A, "B": B,
"Y_glow": Y_glow, "Y_c2": Y_c2, "diff": diff})
assert(0)
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=10))
def test_add_graph(self, seed):
self._test_binary_op_graph("Add", seed)
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=10))
def test_sub_graph(self, seed):
self._test_binary_op_graph("Sub", seed)
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=10))
def test_mul_graph(self, seed):
self._test_binary_op_graph("Mul", seed)
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=10))
def test_div_graph(self, seed):
self._test_binary_op_graph("Div", seed)
class UnaryOpTest(serial.SerializedTestCase):
def _test_unary_op(self, opname, X, rtol=1e-5, atol=1e-8):
workspace.ResetWorkspace()
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.append("X")
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
opname,
['X'],
['Y'])
)
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.append("X")
ref_net.external_output.append("Y")
ref_net.op.add().CopyFrom(
core.CreateOperator(
opname + 'FakeFp16NNPI',
['X'],
['Y'])
)
print("REF NET = {}".format(ref_net))
shape_hints = {"X": X.shape}
pred_net_onnxified = onnxifi_caffe2_net(pred_net,
shape_hints,
debug=True,
adjust_batch=False,
use_onnx=False)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.SwitchWorkspace("glow_test_ws", True)
workspace.FeedBlob("X", X)
workspace.CreateNet(ref_net)
workspace.CreateNet(pred_net_onnxified)
# Run Glow net
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob('Y')
# Run caffe2 reference net
workspace.RunNet(ref_net.name)
Y_c2 = workspace.FetchBlob('Y')
if not np.allclose(Y_c2, Y_glow, rtol=rtol, atol=atol):
diff = np.abs(Y_c2 - Y_glow)
np.save('/tmp/' + opname + 'diff', diff)
np.save('/tmp/' + opname + 'result', Y_c2)
print_test_debug_info(opname, {
"X": X,
"Y_c2": Y_c2,
"Y_glow": Y_glow,
"diff": diff
})
assert(0)
return Y_glow
def _test_op_w_ulp_error(self, seed, opname, regions, atol=0, err_threshold=2):
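# Sweeps each [x0, x1] region with 1025 fp16-representable points, runs the op
# through Glow via _test_unary_op, and tracks the worst-case ULP (unit in the
# last place) error reported by compute_ulp_error; the test fails once that
# error exceeds err_threshold.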
ulp_err = 0
for x0, x1 in regions:
X = np.linspace(x0, x1, num=1025, dtype=np.float16).astype(np.float32)
Y_glow = self._test_unary_op(opname, X, atol=atol)
region_err = compute_ulp_error(opname, X, Y_glow)
ulp_err = max(np.max(np.abs(region_err)), ulp_err)
if (ulp_err > err_threshold):
print(r'{} Op detected ulp_err={}'.format(opname, ulp_err))
assert(0)
# These tests don't need to run multiple times given that they are
# linear sweeps and therefore deterministic.
# Once hypothesis.testing version is updated, we can re-enable
# testing with different hypothesis examples.
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=20))
def test_sigmoid(self, seed):
np.random.seed(seed)
opname = "Sigmoid"
regions = [[-8., -4.], [-4., -2.], [-2., -1.], [-1., -.5], [-.5, -.25],
[-.25, .25], [.25, .5], [.5, 1.], [1., 2.], [2., 4.],
[4., 8.]]
self._test_op_w_ulp_error(seed, opname, regions, atol=0, err_threshold=2.5)
# These tests don't need to run multiple times given that they are
# linear sweeps and therefore deterministic.
# Once hypothesis.testing version is updated, we can re-enable
# testing with different hypothesis examples.
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=20))
def test_tanh(self, seed):
np.random.seed(seed)
opname = "Tanh"
regions = [[2.**(-9), 2.**(-8)], [2.**(-8), 2.**(-7)],
[2.**(-7), 2.**(-6)], [2.**(-6), 2.**(-5)],
[2.**(-5), 2.**(-4)], [2.**(-4), 2.**(-3)],
[2.**(-3), 2.**(-2)], [2.**(-2), 2.**(-1)],
[2.**(-1), 1.], [1., 2.], [2., 4.], [4., 8.]]
self._test_op_w_ulp_error(seed, opname, regions, atol=0, err_threshold=2)
# These tests don't need to run multiple times given that they are
# linear sweeps and therefore deterministic.
# Once hypothesis.testing version is updated, we can re-enable
# testing with different hypothesis examples.
# TODO: move atol to 1e-8 once we get a non-lowered swish implementation
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=10))
def test_swish(self, seed):
np.random.seed(seed)
opname = "Swish"
regions = [[-20.5, -11.], [-11., -8.], [-8., -1.], [-1., -0.1],
[-1. / 8., 1. / 8.], [1. / 8, 5.], [5., 8.]]
self._test_op_w_ulp_error(seed, opname, regions, atol=0.008, err_threshold=384)
# These tests don't need to run multiple times given that they are
# linear sweeps and therefore deterministic.
# Once hypothesis.testing version is updated, we can re-enable
# testing with different hypothesis examples.
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=10))
def test_logit(self, seed):
np.random.seed(seed)
workspace.ResetWorkspace()
n = 1
m = 15361
X = np.linspace(0, 1, num=m, dtype=np.float32)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.append("X")
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
'Logit',
['X'],
['Y'],
eps=1e-6)
)
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.append("X")
ref_net.external_output.append("Y")
ref_net.op.add().CopyFrom(
core.CreateOperator(
'LogitFakeFp16NNPI',
['X'],
['Y'],
eps=1e-6)
)
print("REF NET = {}".format(ref_net))
shape_hints = {"X": (n, m)}
pred_net_onnxified = onnxifi_caffe2_net(pred_net,
shape_hints,
debug=True,
adjust_batch=False,
use_onnx=False)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.SwitchWorkspace("glow_test_ws", True)
workspace.FeedBlob("X", X)
workspace.CreateNet(ref_net)
workspace.CreateNet(pred_net_onnxified)
# Run Glow net
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob('Y')
# Run caffe2 reference net
workspace.RunNet(ref_net.name)
Y_c2 = workspace.FetchBlob('Y')
diff = np.abs(Y_c2 - Y_glow)
if np.nanmax(diff) > 9e-3:
np.save('/tmp/logit_diff', diff)
np.save('/tmp/logit_result', Y_c2)
print_test_debug_info('Logit', {
"X": X,
"Y_c2": Y_c2,
"Y_glow": Y_glow,
"diff": diff
})
assert(0)
class ReluTest(serial.SerializedTestCase):
@given(seed=st.integers(0, 65534))
@settings(deadline=datetime.timedelta(seconds=10))
def test_relu(self, seed):
np.random.seed(seed)
inputs = np.random.rand(1).astype(np.float32)
X = inputs[0]
# First dimension is the batch size
print(X.shape)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["X"])
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"Relu",
["X"],
["Y"]
)
)
pred_net_ref = caffe2_pb2.NetDef()
pred_net_ref.name = "ref"
pred_net_ref.external_input.extend(["X"])
pred_net_ref.external_output.append("Y_ref")
pred_net_ref.op.add().CopyFrom(
core.CreateOperator(
"ReluFakeFp16",
["X"],
["Y_ref"],
)
)
shape_hints = {"X": X.shape}
pred_net_onnxified = onnxifi_caffe2_net(pred_net,
shape_hints,
debug=True,
adjust_batch=True,
use_onnx=False)
print(pred_net_onnxified)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.SwitchWorkspace("glow_test_ws", True)
workspace.FeedBlob("X", X)
workspace.CreateNet(pred_net_ref)
workspace.CreateNet(pred_net_onnxified)
workspace.FeedBlob("X", X)
# Run caffe2 net
workspace.RunNet(pred_net_ref.name)
Y_c2 = workspace.FetchBlob("Y_ref")
# Run Glow net
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob("Y")
# Results should be identical since we are comparing with the C2 emulation
if not np.allclose(Y_c2, Y_glow):
diff = np.abs((Y_glow - Y_c2) / (Y_c2 + kEpsilon))
print_test_debug_info("Relu", {
"seed": seed, "X": X,
"Y_glow": Y_glow, "Y_c2": Y_c2, "diff": diff})
assert(0)

View File

@ -1,215 +0,0 @@
import numpy as np
import unittest
# Must happen before importing caffe2.python.*
import caffe2.python.fakelowp.init_shared_libs # noqa
from hypothesis import given, settings
from hypothesis import strategies as st
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
import caffe2.python.serialized_test.serialized_test_util as serial
import datetime
workspace.GlobalInit(["caffe2", "--glow_global_fp16=1",
"--glow_global_fused_scale_offset_fp16=1",
"--glow_global_force_sls_fp16_accum=1"])
class SparseLengthsSum4BitFakeNNPIFp16Test(serial.SerializedTestCase):
@given(seed=st.integers(0, 65535))
@settings(deadline=datetime.timedelta(seconds=10))
def test_slws_fused_4bit_rowwise_all_same(self, seed):
np.random.seed(seed)
workspace.ResetWorkspace()
n = 1
m = 2
data = np.ones((n, m)).astype(np.float32) * 0.2 - 0.1
max_segments = 5
max_segment_length = 100
num_lengths = np.random.randint(1, max_segments + 1)
# number of segments to run
lengths = np.random.randint(0, max_segment_length + 1,
size=num_lengths).astype(np.int32)
num_indices = np.sum(lengths)
indices = np.zeros(num_indices, dtype=np.int64)
weights = np.random.uniform(low=-0.5, high=0.5, size=[len(indices)])\
.astype(np.float32)
weights = np.ones(len(indices)).astype(np.float32)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"])
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused4BitRowwise",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"])
ref_net.external_output.append("Y")
ref_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
workspace.FeedBlob("data", data)
workspace.RunOperatorOnce(
core.CreateOperator(
"FloatToFused4BitRowwiseQuantized",
['data'],
['quantized_data']
)
)
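# Note: FloatToFused4BitRowwiseQuantized packs two 4-bit values per byte and
# appends a per-row fp16 scale and bias, so the reference NNPI emulation and
# the Glow-lowered net below both consume the same fused "quantized_data" blob.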
print("quantized", workspace.FetchBlob("quantized_data"))
pred_net_onnxified = onnxifi_caffe2_net(
pred_net,
{},
max_batch_size=max_segments,
max_seq_size=max_segment_length,
debug=True,
adjust_batch=True,
use_onnx=False
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("indices", indices)
workspace.FeedBlob("lengths", lengths)
workspace.FeedBlob("weights", weights)
workspace.CreateNet(pred_net_onnxified)
workspace.CreateNet(ref_net)
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob('Y')
workspace.RunNet(ref_net.name)
Y_c2 = workspace.FetchBlob('Y')
if not np.allclose(Y_c2, Y_glow):
print_test_debug_info(
"slws_fused_4bit_rowwise",
{"seed": seed,
"indices": indices,
"data": data,
"lengths": lengths,
"weights": weights,
"Y_c2": Y_c2,
"Y_glow": Y_glow,
"diff": Y_glow - Y_c2,
"rowwise_diff": (Y_glow - Y_c2)[:, 0]})
assert(0)
@given(
seed=st.integers(0, 65535),
num_rows=st.integers(2, 20),
embedding_dim=st.sampled_from([8, 12, 16, 24, 32, 54, 64, 72, 128]),
batch_size=st.integers(1, 32),
max_weight=st.integers(0, 1),
)
@settings(deadline=datetime.timedelta(seconds=10))
def test_slws_fused_4bit_rowwise(self, seed, num_rows, embedding_dim, batch_size, max_weight):
workspace.ResetWorkspace()
np.random.seed(seed)
data = np.random.rand(num_rows, embedding_dim).astype(np.float32)
data = data * 1e-3
lengths = np.random.choice(np.arange(1, num_rows), batch_size).astype(np.int32)
_indices = []
for length in lengths:
_indices.extend(np.random.choice(np.arange(1, num_rows), length))
indices = np.asarray(_indices).astype(np.int64)
weights = np.random.uniform(
low=0,
high=max_weight,
size=[len(indices)]
).astype(np.float32) - max_weight / 2.0
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"])
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused4BitRowwise",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"])
ref_net.external_output.append("Y")
ref_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused4BitRowwiseFakeFP16NNPI",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
workspace.FeedBlob("data", data)
workspace.RunOperatorOnce(
core.CreateOperator(
"FloatToFused4BitRowwiseQuantized",
["data"],
["quantized_data"]
)
)
pred_net_onnxified = onnxifi_caffe2_net(
pred_net,
{},
max_batch_size=batch_size,
max_seq_size=np.max(lengths),
debug=True,
adjust_batch=True,
use_onnx=False
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("indices", indices)
workspace.FeedBlob("lengths", lengths)
workspace.FeedBlob("weights", weights)
workspace.CreateNet(pred_net_onnxified)
workspace.CreateNet(ref_net)
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob('Y')
workspace.RunNet(ref_net.name)
Y_c2 = workspace.FetchBlob('Y')
if not np.allclose(Y_c2, Y_glow):
print_test_debug_info(
"slws_fused_4bit_rowwise",
{
"seed": seed,
"indices": indices,
"data": data.shape,
"lengths": lengths,
"weights": weights,
"Y_c2": Y_c2.shape,
"Y_glow": Y_glow.shape,
"diff": Y_glow - Y_c2,
"rowwise_diff": (Y_glow - Y_c2)[:, 0]
}
)
assert(0)
if __name__ == '__main__':
unittest.main()

View File

@ -1,566 +0,0 @@
import unittest
from typing import Dict, Any
# Must happen before importing caffe2.python.*
import caffe2.python.fakelowp.init_shared_libs # noqa
import datetime
import numpy as np
from hypothesis import given, settings
from hypothesis import strategies as st
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
import caffe2.python.serialized_test.serialized_test_util as serial
workspace.GlobalInit(
[
"caffe2",
"--glow_global_fp16=1",
"--glow_global_fused_scale_offset_fp16=1",
"--glow_global_force_sls_fp16_accum=1",
]
)
GLOW_MATMUL_ATOL = 1e-5
GLOW_MATMUL_RTOL = 1e-3
class SparseLengthsSum8BitFakeNNPIFp16Test(serial.SerializedTestCase):
def Skip_test_SLS_NonQuantized_fp16(self):
N = 20000
DIM = 64
D = (4 * np.random.random_sample((N, DIM)) + 1).astype(np.float32)
I = (np.random.randint(0, N, size=12)).astype(np.int64)
L = np.asarray([4, 4, 4]).astype(np.int32)
workspace.FeedBlob("D", D)
ref_c2_net = core.Net("test_ref_c2")
ref_c2_net.SparseLengthsSum(["D", "I", "L"], "ref_out")
ref_c2_net.Proto().external_input.extend(["D", "I", "L"])
ref_c2_net.Proto().external_output.extend(["ref_out"])
fp16_c2_net = core.Net("test_fp16_c2")
fp16_c2_net.SparseLengthsSumFakeFP16AccFP16(["D", "I", "L"], "fp16_out")
input_dict: Dict[Any, Any] = {}
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(["D", "I", "L"])
pred_net.external_output.append("glow_out")
pred_net.op.add().CopyFrom(
core.CreateOperator("SparseLengthsSum", ["D", "I", "L"], ["glow_out"])
)
onnxified_net = onnxifi_caffe2_net(
pred_net,
input_dict,
max_batch_size=3,
max_seq_size=16,
debug=True,
adjust_batch=False,
use_onnx=False,
)
num_onnxified_ops = sum(
1 if op.type == "Onnxifi" else 0 for op in onnxified_net.op
)
print(onnxified_net)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("I", I)
workspace.FeedBlob("L", L)
workspace.RunNetOnce(ref_c2_net)
ref_c2_out = workspace.FetchBlob("ref_out")
workspace.RunNetOnce(fp16_c2_net)
fp16_c2_out = workspace.FetchBlob("fp16_out")
np.testing.assert_allclose(fp16_c2_out, ref_c2_out, atol=1e-3, rtol=1e-3)
workspace.RunNetOnce(onnxified_net)
fp16_glow_out = workspace.FetchBlob("glow_out")
if not np.allclose(fp16_glow_out, fp16_c2_out):
diff = np.abs(fp16_glow_out - fp16_c2_out)
print_test_debug_info(
"sls",
{
"indices": I,
"data": D,
"lengths": L,
"Y_c2": fp16_c2_out,
"Y_glow": fp16_glow_out,
"diff": diff,
"rowwise_diff": diff[:, 0],
},
)
assert 0
@given(seed=st.integers(0, 65535))
@settings(deadline=datetime.timedelta(seconds=10))
def test_slws_fused_8bit_rowwise_all_same(self, seed):
# Comment out for predictable debugging
np.random.seed(seed)
workspace.ResetWorkspace()
n = 1
m = 2
data = np.ones((n, m)).astype(np.float32) * 0.2 - 0.1
max_segments = 5
max_segment_length = 200
num_lengths = np.random.randint(1, max_segments + 1)
# number of segments to run
lengths = np.random.randint(0, max_segment_length + 1, size=num_lengths).astype(
np.int32
)
num_indices = np.sum(lengths)
indices = np.zeros(num_indices, dtype=np.int64)
weights = np.random.uniform(low=-0.5, high=0.5, size=[len(indices)]).astype(
np.float32
)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwise",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
ref_net.external_output.append("Y")
ref_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
workspace.FeedBlob("data", data)
workspace.RunOperatorOnce(
core.CreateOperator(
"FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
)
)
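# Note: the fused 8-bit rowwise format produced above stores, per row, the
# uint8-quantized values followed by a per-row scale and bias, so the reference
# and Glow nets below read the same "quantized_data" representation.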
pred_net_onnxified = onnxifi_caffe2_net(
pred_net,
{},
max_batch_size=max_segments,
max_seq_size=max_segment_length,
debug=True,
adjust_batch=True,
use_onnx=False,
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in pred_net_onnxified.op
)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("indices", indices)
workspace.FeedBlob("lengths", lengths)
workspace.FeedBlob("weights", weights)
workspace.CreateNet(pred_net_onnxified)
workspace.CreateNet(ref_net)
workspace.RunNet(pred_net_onnxified.name)
Y_glow = workspace.FetchBlob("Y")
workspace.RunNet(ref_net.name)
Y_c2 = workspace.FetchBlob("Y")
if not np.allclose(Y_c2, Y_glow):
print_test_debug_info(
"slws_fused_8bit_rowwise",
{
"seed": seed,
"indices": indices,
"data": data,
"lengths": lengths,
"weights": weights,
"Y_c2": Y_c2,
"Y_glow": Y_glow,
"diff": Y_glow - Y_c2,
"rowwise_diff": (Y_glow - Y_c2)[:, 0],
},
)
assert 0
@given(
seed=st.integers(0, 65535),
num_rows=st.integers(2, 20),
embedding_dim=st.sampled_from([8, 12, 16, 24, 32, 54, 64, 128]),
batch_size=st.integers(1, 5),
max_weight=st.integers(0, 100),
)
@settings(deadline=datetime.timedelta(seconds=10))
def test_slws_fused_8bit_rowwise(self, seed, num_rows, embedding_dim, batch_size, max_weight):
np.random.seed(seed)
workspace.ResetWorkspace()
data = np.random.rand(num_rows, embedding_dim).astype(np.float32)
lengths = np.random.choice(np.arange(1, num_rows), batch_size).astype(np.int32)
_indices = []
for length in lengths:
_indices.extend(np.random.choice(np.arange(1, num_rows), length))
indices = np.asarray(_indices).astype(np.int64)
weights = np.random.uniform(
low=0,
high=max_weight,
size=[len(indices)]
).astype(np.float32)
assert(len(weights) < 64000)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwise",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
ref_net.external_output.append("Y")
ref_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
workspace.FeedBlob("data", data)
workspace.RunOperatorOnce(
core.CreateOperator(
"FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
)
)
onnxified_net = onnxifi_caffe2_net(
pred_net,
{},
max_batch_size=batch_size,
max_seq_size=np.max(lengths),
debug=True,
adjust_batch=True,
use_onnx=False,
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in onnxified_net.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("indices", indices)
workspace.FeedBlob("lengths", lengths)
workspace.FeedBlob("weights", weights)
workspace.CreateNet(onnxified_net)
workspace.CreateNet(ref_net)
workspace.RunNet(onnxified_net.name)
Y_glow = workspace.FetchBlob("Y")
workspace.RunNet(ref_net.name)
Y_ref = workspace.FetchBlob("Y")
diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
max_err = np.max(diff, axis=1)
num_offenders = (max_err > 0).sum()
if num_offenders > 0:
print_test_debug_info(
"slws_fused_8bit_rowwise_inv_scale",
{
"seed": seed,
"num_rows": num_rows,
"embedding_dim": embedding_dim,
"batch_size": batch_size,
"max_weight": max_weight,
"indices": indices,
"data": data.shape,
"lengths": lengths,
"weights": weights,
"Y_glow": Y_glow,
"Y_ref": Y_ref,
"diff": diff,
"rowwise_diff": np.max(diff, axis=1),
},
)
assert 0
# Simple test to aid debugging order of operations
# Minimize the case to an SLS that adds two rows
@given(seed=st.integers(0, 65535))
@settings(deadline=datetime.timedelta(seconds=10))
def test_small_sls(self, seed):
np.random.seed(seed)
workspace.ResetWorkspace()
n = 2
DIM = 3
data = 4 * (np.random.random_sample((n, DIM)) + 1).astype(np.float32)
lengths = np.array([n], dtype=np.int32)
indices = np.array(range(n), dtype=np.int64)
weights = np.random.uniform(low=0.01, high=0.5, size=[n]).astype(np.float32)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwise",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
ref_net.external_output.append("Y")
ref_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
workspace.FeedBlob("data", data)
workspace.RunOperatorOnce(
core.CreateOperator(
"FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
)
)
quantized_data = workspace.FetchBlob("quantized_data")
onnxified_net = onnxifi_caffe2_net(
pred_net,
{},
max_batch_size=1,
max_seq_size=n,
debug=True,
adjust_batch=True,
use_onnx=False,
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in onnxified_net.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("indices", indices)
workspace.FeedBlob("lengths", lengths)
workspace.FeedBlob("weights", weights)
workspace.CreateNet(onnxified_net)
workspace.CreateNet(ref_net)
workspace.RunNet(onnxified_net.name)
Y_glow = workspace.FetchBlob("Y")
workspace.RunNet(ref_net.name)
Y_ref = workspace.FetchBlob("Y")
diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
max_err = np.max(diff, axis=1)
num_offenders = (max_err > 0).sum()
if num_offenders > 0:
np.set_printoptions(precision=12)
print(
"ref",
Y_ref.astype(np.float16).astype(np.float32),
"glow",
Y_glow.astype(np.float16).astype(np.float32),
)
print_test_debug_info(
"slws_fused_8bit_rowwise_inv_scale",
{
"seed": seed,
"indices": indices,
"data": data,
"quantized_data": quantized_data,
"lengths": lengths,
"weights": weights,
"Y_glow": Y_glow,
"Y_ref": Y_ref,
"diff": diff,
"rowwise_diff": np.max(diff, axis=1),
},
)
assert 0
@given(seed=st.integers(0, 65535))
@settings(deadline=datetime.timedelta(seconds=10))
def test_sls_layernorm(self, seed):
np.random.seed(seed)
workspace.ResetWorkspace()
n = 2
DIM = 3
data = 4 * (np.random.random_sample((n, DIM)) + 1).astype(np.float32)
lengths = np.array([n], dtype=np.int32)
indices = np.array(range(n), dtype=np.int64)
weights = np.random.uniform(low=0.01, high=0.5, size=[n]).astype(np.float32)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
pred_net.external_output.append("Y_norm")
pred_net.external_output.append("Y_mean")
pred_net.external_output.append("Y_std")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwise",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
pred_net.op.add().CopyFrom(
core.CreateOperator(
"LayerNorm",
["Y"],
["Y_norm", "Y_mean", "Y_std"],
epsilon=1e-4,
)
)
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
ref_net.external_output.append("Y_norm")
ref_net.external_output.append("Y_mean")
ref_net.external_output.append("Y_std")
ref_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwiseFakeFP16NNPI",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
ref_net.op.add().CopyFrom(
core.CreateOperator(
"LayerNormFakeFP16NNPI",
["Y"],
["Y_norm", "Y_mean", "Y_std"],
epsilon=1e-4,
axis=1,
elementwise_affine=False
)
)
workspace.FeedBlob("data", data)
workspace.RunOperatorOnce(
core.CreateOperator(
"FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
)
)
quantized_data = workspace.FetchBlob("quantized_data")
onnxified_net = onnxifi_caffe2_net(
pred_net,
{},
max_batch_size=1,
max_seq_size=n,
debug=True,
adjust_batch=True,
use_onnx=False,
)
print("before", pred_net)
print("after", onnxified_net)
workspace.FeedBlob("indices", indices)
workspace.FeedBlob("lengths", lengths)
workspace.FeedBlob("weights", weights)
workspace.CreateNet(onnxified_net)
workspace.CreateNet(ref_net)
workspace.RunNet(onnxified_net.name)
Y_glow = workspace.FetchBlob("Y_norm")
Y_mean_glow = workspace.FetchBlob("Y_mean")
Y_std_glow = workspace.FetchBlob("Y_std")
workspace.RunNet(ref_net.name)
Y = workspace.FetchBlob("Y")
print("pre normalization", Y)
Y_ref = workspace.FetchBlob("Y_norm")
Y_mean_ref = workspace.FetchBlob("Y_mean")
Y_std_ref = workspace.FetchBlob("Y_std")
# print(Y_ref, Y_glow)
# print(Y_ref.shape, Y_glow.shape)
diff = np.abs(Y_ref - Y_glow)
max_err = np.max(diff, axis=1)
num_offenders = (max_err > 0).sum()
if num_offenders > 0:
np.set_printoptions(precision=12)
print(
"ref",
Y_ref.astype(np.float16).astype(np.float32),
"glow",
Y_glow.astype(np.float16).astype(np.float32),
)
print_test_debug_info(
"slws_fused_8bit_rowwise_inv_scale",
{
"seed": seed,
"indices": indices,
"data": data,
"quantized_data": quantized_data,
"lengths": lengths,
"weights": weights,
"Y_norm_glow": Y_glow,
"Y_norm_ref": Y_ref,
"Y_mean_glow": Y_mean_glow,
"Y_std_glow": Y_std_glow,
"Y_mean_ref": Y_mean_ref,
"Y_std_ref": Y_std_ref,
"diff": diff,
"rowwise_diff": np.max(diff, axis=1),
},
)
assert 0
if __name__ == '__main__':
unittest.main()

View File

@ -1,264 +0,0 @@
import unittest
# Must happen before importing caffe2.python.*
import caffe2.python.fakelowp.init_shared_libs # noqa
import datetime
import numpy as np
from hypothesis import given, settings
from hypothesis import strategies as st
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace
from caffe2.python.onnx.onnxifi import onnxifi_caffe2_net
from caffe2.python.fakelowp.test_utils import print_test_debug_info
import caffe2.python.serialized_test.serialized_test_util as serial
workspace.GlobalInit(
[
"caffe2",
"--glow_global_fp16=0",
"--glow_global_fused_scale_offset_fp16=0",
"--glow_global_force_sls_fp16_accum=0",
]
)
GLOW_MATMUL_ATOL = 1e-5
GLOW_MATMUL_RTOL = 1e-3
class SparseLengthsSum8BitFakeNNPIFp32Test(serial.SerializedTestCase):
@given(
seed=st.integers(0, 65535),
num_rows=st.integers(2, 20),
embedding_dim=st.sampled_from([8, 12, 16, 24, 32, 54, 64, 128]),
batch_size=st.integers(1, 5),
max_weight=st.integers(0, 100),
)
@settings(deadline=datetime.timedelta(seconds=10))
def test_slws_fused_8bit_rowwise_acc32_nnpi(
self, seed, num_rows, embedding_dim, batch_size, max_weight
):
workspace.GlobalInit(
[
"caffe2",
"--glow_global_fp16=0",
"--glow_global_fused_scale_offset_fp16=0",
"--glow_global_force_sls_fp16_accum=0",
]
)
workspace.ResetWorkspace()
np.random.seed(seed)
data = np.random.rand(num_rows, embedding_dim).astype(np.float32)
lengths = np.random.choice(np.arange(1, num_rows), batch_size).astype(np.int32)
_indices = []
for length in lengths:
_indices.extend(np.random.choice(np.arange(1, num_rows), length))
indices = np.asarray(_indices).astype(np.int64)
weights = np.random.uniform(
low=0,
high=max_weight,
size=[len(indices)]
).astype(np.float32)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwise",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
ref_net.external_output.append("Y")
ref_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
workspace.FeedBlob("data", data)
workspace.RunOperatorOnce(
core.CreateOperator(
"FloatToFused8BitRowwiseQuantized",
["data"],
["quantized_data"]
)
)
onnxified_net = onnxifi_caffe2_net(
pred_net,
{},
max_batch_size=batch_size,
max_seq_size=np.max(lengths),
debug=True,
adjust_batch=True,
use_onnx=False,
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in onnxified_net.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("indices", indices)
workspace.FeedBlob("lengths", lengths)
workspace.FeedBlob("weights", weights)
workspace.CreateNet(onnxified_net)
workspace.CreateNet(ref_net)
workspace.RunNet(onnxified_net.name)
Y_glow = workspace.FetchBlob("Y")
workspace.RunNet(ref_net.name)
Y_ref = workspace.FetchBlob("Y")
diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
max_err = np.max(diff, axis=1)
num_offenders = (max_err > 0).sum()
if num_offenders > 0:
print_test_debug_info(
"test_slws_fused_8bit_rowwise_acc32_nnpi",
{
"seed": seed,
"num_rows": num_rows,
"embedding_dim": embedding_dim,
"batch_size": batch_size,
"indices": indices,
"data": data.shape,
"lengths": lengths,
"weights": weights,
"Y_glow": Y_glow,
"Y_ref": Y_ref,
"diff": diff,
"rowwise_diff": np.max(diff, axis=1),
},
)
assert 0
@given(seed=st.integers(0, 65535))
@settings(deadline=datetime.timedelta(seconds=10))
def test_small_sls_acc32(self, seed):
workspace.GlobalInit(
[
"caffe2",
"--glow_global_fp16=0",
"--glow_global_fused_scale_offset_fp16=0",
"--glow_global_force_sls_fp16_accum=0",
]
)
np.random.seed(seed)
workspace.ResetWorkspace()
n = 2
DIM = 3
data = 4 * (np.random.random_sample((n, DIM)) + 1).astype(np.float32)
lengths = np.array([n], dtype=np.int32)
indices = np.array(range(n), dtype=np.int64)
weights = np.random.uniform(low=0.01, high=0.5, size=[n]).astype(np.float32)
pred_net = caffe2_pb2.NetDef()
pred_net.name = "pred"
pred_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
pred_net.external_output.append("Y")
pred_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwise",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
ref_net = caffe2_pb2.NetDef()
ref_net.name = "ref"
ref_net.external_input.extend(
["quantized_data", "weights", "indices", "lengths"]
)
ref_net.external_output.append("Y")
ref_net.op.add().CopyFrom(
core.CreateOperator(
"SparseLengthsWeightedSumFused8BitRowwiseFakeFP32NNPI",
["quantized_data", "weights", "indices", "lengths"],
["Y"],
)
)
workspace.FeedBlob("data", data)
workspace.RunOperatorOnce(
core.CreateOperator(
"FloatToFused8BitRowwiseQuantized", ["data"], ["quantized_data"]
)
)
quantized_data = workspace.FetchBlob("quantized_data")
onnxified_net = onnxifi_caffe2_net(
pred_net,
{},
max_batch_size=1,
max_seq_size=n,
debug=True,
adjust_batch=True,
use_onnx=False,
)
num_onnxified_ops = sum(
1 if o.type == "Onnxifi" else 0 for o in onnxified_net.op)
np.testing.assert_equal(num_onnxified_ops, 1)
workspace.FeedBlob("indices", indices)
workspace.FeedBlob("lengths", lengths)
workspace.FeedBlob("weights", weights)
workspace.CreateNet(onnxified_net)
workspace.CreateNet(ref_net)
workspace.RunNet(onnxified_net.name)
Y_glow = workspace.FetchBlob("Y")
workspace.RunNet(ref_net.name)
Y_ref = workspace.FetchBlob("Y")
diff = np.abs((Y_ref - Y_glow) / (Y_ref + 1e-8))
max_err = np.max(diff, axis=1)
num_offenders = (max_err > 0).sum()
if num_offenders > 0:
np.set_printoptions(precision=12)
print(
"ref",
Y_ref.astype(np.float16).astype(np.float32),
"glow",
Y_glow.astype(np.float16).astype(np.float32),
)
print_test_debug_info(
"test_small_sls_acc32",
{
"seed": seed,
"indices": indices,
"data": data,
"quantized_data": quantized_data,
"lengths": lengths,
"weights": weights,
"Y_glow": Y_glow,
"Y_ref": Y_ref,
"diff": diff,
"rowwise_diff": np.max(diff, axis=1),
},
)
assert 0
if __name__ == '__main__':
unittest.main()

File diff suppressed because it is too large

View File

@ -1,74 +0,0 @@
#pragma once
#include <vector>
#include <fbgemm/FbgemmConvert.h>
#include "caffe2/operators/elementwise_ops.h"
#include "caffe2/utils/eigen_utils.h"
#include "caffe2/utils/math.h"
C10_DECLARE_bool(caffe2_fbgemm_fake_fp16_clamp);
namespace caffe2 {
using namespace std;
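// Each functor below emulates reduced-precision NNPI behavior: inputs are
// rounded to fp16 via fbgemm::RoundToFloat16, the arithmetic itself runs in
// fp32, and most functors round the result back to fp16 as well.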
template <class Context>
struct ReluFakeFp16Functor {
template <typename T>
bool operator()(const int N, const T* X, T* Y, Context* /* unused */) const {
std::vector<float> X_fp16(N);
fbgemm::RoundToFloat16(
X, X_fp16.data(), N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
EigenVectorMap<T>(Y, N) =
ConstEigenVectorMap<float>(X_fp16.data(), N).cwiseMax(T(0));
return true;
}
};
template <class Context>
struct SqrFakeFp16Functor {
template <typename T>
bool operator()(const int N, const T* X, T* Y, Context* context) const {
std::vector<float> X_fp16(N);
fbgemm::RoundToFloat16(
X, X_fp16.data(), N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
math::Sqr(N, X_fp16.data(), Y, context);
fbgemm::RoundToFloat16(Y, Y, N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
return true;
}
};
struct SigmoidFakeIdealFp16Functor {
template <typename T>
bool operator()(const int N, const T* X, T* Y, CPUContext* /* unused */)
const {
std::vector<float> X_fp16(N);
fbgemm::RoundToFloat16(X, X_fp16.data(), N);
EigenVectorArrayMap<T>(Y, N) =
T(1) / (T(1) + (-ConstEigenVectorArrayMap<T>(X_fp16.data(), N)).exp());
fbgemm::RoundToFloat16(Y, Y, N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
return true;
}
};
struct TanhFakeIdealFp16Functor {
template <typename T>
bool operator()(const int N, const T* X, T* Y, CPUContext* context) const {
std::vector<float> X_fp16(N);
fbgemm::RoundToFloat16(
X, X_fp16.data(), N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
math::Tanh<T, CPUContext>(N, X_fp16.data(), Y, context);
fbgemm::RoundToFloat16(Y, Y, N, FLAGS_caffe2_fbgemm_fake_fp16_clamp);
return true;
}
};
} // namespace caffe2
namespace fake_fp16 {
at::Half CalcSigmoidByLUT(at::Half x);
at::Half CalcSwishByLUT(at::Half x);
at::Half CalcSwishByLUTCubic(at::Half x);
at::Half CalcTanhByLUT(at::Half input);
} // namespace fake_fp16

View File

@ -1,33 +0,0 @@
if(USE_GLOO)
set(Caffe2_CONTRIB_GLOO_CPU_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/allgather_ops.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/allreduce_ops.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/barrier_ops.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/broadcast_ops.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/common.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/common_world_ops.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/context.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/reduce_scatter_ops.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/store_handler.cc"
)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${Caffe2_CONTRIB_GLOO_CPU_SRC} PARENT_SCOPE)
if(USE_CUDA)
set(Caffe2_CONTRIB_GLOO_GPU_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/allreduce_ops_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/broadcast_ops_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/common_world_ops_gpu.cc"
)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${Caffe2_CONTRIB_GLOO_GPU_SRC} PARENT_SCOPE)
endif(USE_CUDA)
if(USE_ROCM)
set(Caffe2_CONTRIB_GLOO_HIP_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/hip/allreduce_ops_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/hip/broadcast_ops_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/hip/common_world_ops_gpu.cc"
)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${Caffe2_CONTRIB_GLOO_HIP_SRC} PARENT_SCOPE)
set(Caffe2_HIP_INCLUDE ${GLOO_HIP_INCLUDE} ${Caffe2_HIP_INCLUDE} PARENT_SCOPE)
endif(USE_ROCM)
endif()

View File

@ -1,64 +0,0 @@
/**
* Copyright (c) 2017-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "allgather_ops.h"
#include <gloo/allgather_ring.h>
namespace caffe2 {
namespace gloo {
template <class Context>
void AllgatherOp<Context>::initializeAlgorithm() {
if (init_.template IsType<float>()) {
algorithm_.reset(new ::gloo::AllgatherRing<float>(
init_.context,
init_.template getInputs<float>(),
init_.template getOutput<float>(),
init_.size));
} else if (init_.template IsType<long>()) {
algorithm_.reset(new ::gloo::AllgatherRing<long>(
init_.context,
init_.template getInputs<long>(),
init_.template getOutput<long>(),
init_.size));
} else if (init_.template IsType<int>()) {
algorithm_.reset(new ::gloo::AllgatherRing<int>(
init_.context,
init_.template getInputs<int>(),
init_.template getOutput<int>(),
init_.size));
} else if (init_.template IsType<at::Half>()) {
algorithm_.reset(new ::gloo::AllgatherRing<::gloo::float16>(
init_.context,
init_.template getInputs<::gloo::float16>(),
init_.template getOutput<::gloo::float16>(),
init_.size));
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
// Used outside of the translation unit
template void AllgatherOp<CPUContext>::initializeAlgorithm();
namespace {
REGISTER_CPU_OPERATOR_WITH_ENGINE(Allgather, GLOO, AllgatherOp<CPUContext>);
} // namespace
} // namespace gloo
} // namespace caffe2

View File

@ -1,130 +0,0 @@
/**
* Copyright (c) 2017-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <algorithm>
#include "caffe2/contrib/gloo/common.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/types.h"
#include <gloo/algorithm.h>
#include <gloo/common/error.h>
#include <gloo/context.h>
namespace caffe2 {
namespace gloo {
template <class Context>
class AllgatherOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
AllgatherOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
ws_(ws),
status_blob_(
OperatorBase::GetSingleArgument<std::string>("status_blob", "")) {
if (status_blob_ != "") {
ws_->CreateBlob(status_blob_);
}
}
~AllgatherOp() override {}
bool RunOnDevice() override {
std::call_once(once_, [&] { initialize(); });
// If any parameter has changed in between runs, the initialized
// algorithm is invalid and cannot be used.
update(current_);
CAFFE_ENFORCE(current_ == init_, "Inputs/outputs have changed");
try {
algorithm_->run();
} catch (::gloo::IoException& ioe) {
LOG(ERROR) << "Caught gloo IO exception: " << ioe.what();
if (status_blob_ != "") {
signalFailure(ws_->GetBlob(status_blob_), ioe);
return false;
} else {
throw;
}
}
return true;
}
protected:
void initialize() {
// Allocate output tensor
CAFFE_ENFORCE_EQ(OutputSize(), 1);
auto comm_size =
OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0)->size;
const auto dims = std::vector<int64_t>(
1, (InputSize() - 1) * Input(1).numel() * comm_size);
Output(0)->Resize(dims);
// Store which inputs/outputs this instance initialized with
update(init_);
CAFFE_ENFORCE_EQ(init_.outputs.size(), 1);
// Verify tensors all have same size
size_t size = Input(1).numel();
for (const auto i : c10::irange(2, InputSize())) {
CAFFE_ENFORCE_EQ(Input(i).numel(), size);
}
// Verify tensors all have same type
TypeMeta meta = Input(1).dtype();
for (const auto i : c10::irange(2, InputSize())) {
CAFFE_ENFORCE(Input(i).dtype() == meta);
}
// Finally initialize the algorithm
initializeAlgorithm();
}
void initializeAlgorithm();
std::once_flag once_;
std::unique_ptr<::gloo::Algorithm> algorithm_;
// Captures the parameters passed to Gloo when first initialized.
// An instance is updated every time this op runs and is compared
// to the reference instance for equality. If any parameter has
// changed from run to run, the initialized algorithm is invalid.
void update(GlooParameters& params) {
params.context = OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
params.inputs.resize(InputSize() - 1);
params.size = Input(1).numel();
params.meta = Input(1).dtype();
for (const auto i : c10::irange(params.inputs.size())) {
params.inputs[i] = Input(i + 1).raw_data();
}
params.outputs.resize(OutputSize());
params.outputs[0] = Output(0)->raw_mutable_data(params.meta);
}
GlooParameters init_;
GlooParameters current_;
Workspace* ws_;
std::string status_blob_;
};
} // namespace gloo
} // namespace caffe2

View File

@ -1,123 +0,0 @@
#include "allreduce_ops.h"
#include <math.h>
#include <gloo/allreduce_bcube.h>
#include <gloo/allreduce_halving_doubling.h>
#include <gloo/allreduce_ring.h>
#include <gloo/allreduce_ring_chunked.h>
#include <gloo/types.h>
namespace {
/**
* This is a helper function which attempts to pick a base value depending on
* the # of nodes. The larger the base, the better the performance (up to 4)
* we have observed in gloo benchmarks. At the moment bcube only works if
* # nodes = base ^ x for some constant x, so if the # of nodes doesn't match
* this expectation we simply return -1, which tells the caller to switch to
* another algorithm such as halving-doubling.
*/
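// For example: nodes = 4 or 16 yields base 4 (an exact power); nodes = 8 also
// yields 4 (a multiple of 4 below 4^2); nodes = 12 yields 6; nodes = 7 yields
// -1, which makes the caller fall back to halving-doubling.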
static int getAllrduceBcubeBase(int nodes) {
auto getExponent = [](int n, int b) -> int {
float lg2n = log2(n);
float lg2b = log2(b);
return ceil(lg2n / lg2b);
};
auto baseCheck = [&](int n, int b) -> bool {
int e = getExponent(n, b);
return n == pow(b, e);
};
for (const auto base : {6, 5, 4, 3, 2}) {
if (baseCheck(nodes, base)) {
return base;
}
/*
* Base could work if # nodes is multiple of the base yet smaller than
* base^2
*/
if (nodes < base * base && 0 == nodes % base) {
return base;
}
}
return -1;
}
} // namespace
namespace caffe2 {
namespace gloo {
template <class Context>
void AllreduceOp<Context>::initializeBcube() {
int base = getAllrduceBcubeBase(init_.size);
if (-1 == base) {
return initializeHalvingDoubling();
}
init_.context->base = base;
if (init_.template IsType<float>()) {
algorithm_.reset(new ::gloo::AllreduceBcube<float>(
init_.context, init_.template getOutputs<float>(), init_.size));
} else if (init_.template IsType<::at::Half>()) {
algorithm_.reset(new ::gloo::AllreduceBcube<::gloo::float16>(
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size));
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
template <class Context>
void AllreduceOp<Context>::initializeHalvingDoubling() {
if (init_.template IsType<float>()) {
algorithm_.reset(new ::gloo::AllreduceHalvingDoubling<float>(
init_.context, init_.template getOutputs<float>(), init_.size));
} else if (init_.template IsType<::at::Half>()) {
algorithm_.reset(new ::gloo::AllreduceHalvingDoubling<::gloo::float16>(
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size));
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
// Used outside of the translation unit
template void AllreduceOp<CPUContext>::initializeHalvingDoubling();
template <class Context>
void AllreduceOp<Context>::initializeRingFull() {
if (init_.template IsType<float>()) {
algorithm_.reset(new ::gloo::AllreduceRing<float>(
init_.context, init_.template getOutputs<float>(), init_.size));
} else if (init_.template IsType<::at::Half>()) {
algorithm_.reset(new ::gloo::AllreduceRing<::gloo::float16>(
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size));
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
template <class Context>
void AllreduceOp<Context>::initializeRingChunked() {
if (init_.template IsType<float>()) {
algorithm_.reset(new ::gloo::AllreduceRingChunked<float>(
init_.context, init_.template getOutputs<float>(), init_.size));
} else if (init_.template IsType<::at::Half>()) {
algorithm_.reset(new ::gloo::AllreduceRingChunked<::gloo::float16>(
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size));
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
namespace {
REGISTER_CPU_OPERATOR_WITH_ENGINE(Allreduce, GLOO, AllreduceOp<CPUContext>);
} // namespace
} // namespace gloo
} // namespace caffe2

View File

@ -1,134 +0,0 @@
#pragma once
#include <algorithm>
#include "caffe2/contrib/gloo/common.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#include <gloo/algorithm.h>
#include <gloo/common/error.h>
#include <gloo/context.h>
namespace caffe2 {
namespace gloo {
template <class Context>
class AllreduceOp final : public Operator<Context> {
enum Mode { RING_FULL, RING_CHUNKED, HALVING_DOUBLING, BCUBE };
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
AllreduceOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
ws_(ws),
status_blob_(
OperatorBase::GetSingleArgument<std::string>("status_blob", "")),
gpu_direct_(
OperatorBase::GetSingleArgument<bool>("gpu_direct", false)) {
if (status_blob_ != "") {
ws_->CreateBlob(status_blob_);
}
}
~AllreduceOp() override {}
bool RunOnDevice() override {
std::call_once(once_, [&] { initialize(); });
// If any parameter has changed in between runs, the initialized
// algorithm is invalid and cannot be used.
update(current_);
CAFFE_ENFORCE(current_ == init_, "Inputs/outputs have changed");
try {
algorithm_->run();
} catch (::gloo::IoException& ioe) {
LOG(ERROR) << "Caught gloo IO exception: " << ioe.what();
if (status_blob_ != "") {
signalFailure(ws_->GetBlob(status_blob_), ioe);
return false;
} else {
throw;
}
}
return true;
}
protected:
void initialize() {
Mode mode = HALVING_DOUBLING;
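// The algorithm is currently hard-coded to halving-doubling; the other modes
// in the enum are only reachable by changing this default.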
// Store which inputs/outputs this instance initialized with
update(init_);
// Verify inputs == outputs
CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size());
for (const auto i : c10::irange(0U, init_.inputs.size())) {
CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]);
}
// Verify tensors all have same size
auto size = Input(1).numel();
for (const auto i : c10::irange(2, InputSize())) {
CAFFE_ENFORCE_EQ(Input(i).numel(), size);
}
// Verify tensors all have same type
TypeMeta meta = Input(1).dtype();
for (const auto i : c10::irange(2, InputSize())) {
CAFFE_ENFORCE(Input(i).dtype() == meta);
}
switch (mode) {
case RING_FULL:
initializeRingFull();
return;
case RING_CHUNKED:
initializeRingChunked();
return;
case HALVING_DOUBLING:
initializeHalvingDoubling();
return;
case BCUBE:
initializeBcube();
return;
}
CAFFE_ENFORCE(false, "Unreachable code");
}
void initializeBcube();
void initializeHalvingDoubling();
void initializeRingFull();
void initializeRingChunked();
std::once_flag once_;
std::unique_ptr<::gloo::Algorithm> algorithm_;
// Captures the parameters passed to Gloo when first initialized.
// An instance is updated every time this op runs and is compared
// to the reference instance for equality. If any parameter has
// changed from run to run, the initialized algorithm is invalid.
void update(GlooParameters& params) {
params.context = OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
params.inputs.resize(InputSize() - 1);
params.outputs.resize(OutputSize());
for (const auto i : c10::irange(0U, params.inputs.size())) {
params.inputs[i] = Input(i + 1).raw_data();
params.outputs[i] = Output(i)->raw_mutable_data();
}
params.size = Output(0)->numel();
params.meta = Output(0)->dtype();
}
GlooParameters init_;
GlooParameters current_;
Workspace* ws_;
std::string status_blob_;
const bool gpu_direct_;
};
} // namespace gloo
} // namespace caffe2

View File

@ -1,168 +0,0 @@
#include "caffe2/contrib/gloo/allreduce_ops.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/logging.h"
#include <gloo/cuda_allreduce_bcube.h>
#include <gloo/cuda_allreduce_halving_doubling.h>
#include <gloo/cuda_allreduce_ring.h>
#include <gloo/cuda_allreduce_ring_chunked.h>
#include <gloo/types.h>
namespace caffe2 {
namespace gloo {
namespace {
// Decides on using GPUDirect based on device support.
template <template <typename T, typename W> class A, typename T>
std::unique_ptr<::gloo::Algorithm> initializeAlgorithm(
bool gpu_direct_,
std::shared_ptr<::gloo::Context> context,
std::vector<T*> ptrs,
size_t size) {
if (gpu_direct_) {
if (context->getDevice()->hasGPUDirect()) {
return std::unique_ptr<::gloo::Algorithm>(
new A<T, ::gloo::CudaDeviceWorkspace<T>>(context, ptrs, size));
} else {
LOG(WARNING)
<< "GPUDirect not available; "
<< "Gloo communication will go through system memory instead.";
}
}
return std::unique_ptr<::gloo::Algorithm>(
new A<T, ::gloo::CudaHostWorkspace<T>>(context, ptrs, size));
}
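// Note: with gpu_direct_ set and a transport that supports GPUDirect, the
// algorithm operates directly on device memory (CudaDeviceWorkspace);
// otherwise it stages through host memory (CudaHostWorkspace), as the warning
// above indicates.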
/**
* This is a helper function which attempts to pick a base value depending on
* the # of nodes. The larger the base, the better the performance (up to 4)
* we have observed in gloo benchmarks. At the moment bcube only works if
* # nodes = base ^ x for some constant x, so if the # of nodes doesn't match
* this expectation we simply return -1, which tells the caller to switch to
* another algorithm such as halving-doubling.
*/
static int getAllrduceBcubeBase(int nodes) {
auto getExponent = [](int n, int b) -> int {
float lg2n = log2(n);
float lg2b = log2(b);
return ceil(lg2n / lg2b);
};
auto baseCheck = [&](int n, int b) -> bool {
int e = getExponent(n, b);
return n == pow(b, e);
};
for (const auto base : {6, 5, 4, 3, 2}) {
if (baseCheck(nodes, base)) {
return base;
}
/*
* Base could work if # nodes is multiple of the base yet smaller than
* base^2
*/
if (nodes < base * base && 0 == nodes % base) {
return base;
}
}
return -1;
}
} // namespace
template <class Context>
void AllreduceOp<Context>::initializeBcube() {
int base = getAllrduceBcubeBase(init_.size);
if (-1 == base) {
return initializeHalvingDoubling();
}
init_.context->base = base;
if (init_.template IsType<float>()) {
algorithm_ = initializeAlgorithm<::gloo::CudaAllreduceBcube, float>(
gpu_direct_,
init_.context,
init_.template getOutputs<float>(),
init_.size);
} else if (init_.template IsType<at::Half>()) {
algorithm_ =
initializeAlgorithm<::gloo::CudaAllreduceBcube, ::gloo::float16>(
gpu_direct_,
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size);
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
template <class Context>
void AllreduceOp<Context>::initializeHalvingDoubling() {
if (init_.template IsType<float>()) {
algorithm_ =
initializeAlgorithm<::gloo::CudaAllreduceHalvingDoubling, float>(
gpu_direct_,
init_.context,
init_.template getOutputs<float>(),
init_.size);
} else if (init_.template IsType<at::Half>()) {
algorithm_ =
initializeAlgorithm<::gloo::CudaAllreduceHalvingDoubling, ::gloo::float16>(
gpu_direct_,
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size);
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
template <class Context>
void AllreduceOp<Context>::initializeRingFull() {
if (init_.template IsType<float>()) {
algorithm_ =
initializeAlgorithm<::gloo::CudaAllreduceRing, float>(
gpu_direct_,
init_.context,
init_.template getOutputs<float>(),
init_.size);
} else if (init_.template IsType<at::Half>()) {
algorithm_ =
initializeAlgorithm<::gloo::CudaAllreduceRing, ::gloo::float16>(
gpu_direct_,
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size);
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
template <class Context>
void AllreduceOp<Context>::initializeRingChunked() {
if (init_.template IsType<float>()) {
algorithm_ =
initializeAlgorithm<::gloo::CudaAllreduceRingChunked, float>(
gpu_direct_,
init_.context,
init_.template getOutputs<float>(),
init_.size);
} else if (init_.template IsType<at::Half>()) {
algorithm_ =
initializeAlgorithm<::gloo::CudaAllreduceRingChunked, ::gloo::float16>(
gpu_direct_,
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size);
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
namespace {
REGISTER_CUDA_OPERATOR_WITH_ENGINE(Allreduce, GLOO, AllreduceOp<CUDAContext>);
} // namespace
} // namespace gloo
} // namespace caffe2

View File

@ -1,11 +0,0 @@
#include "barrier_ops.h"
namespace caffe2 {
namespace gloo {
namespace {
REGISTER_CPU_OPERATOR_WITH_ENGINE(Barrier, GLOO, BarrierOp<CPUContext>);
} // namespace
} // namespace gloo
} // namespace caffe2

View File

@ -1,63 +0,0 @@
#pragma once
#include "caffe2/contrib/gloo/common.h"
#include "caffe2/core/operator.h"
#include <gloo/algorithm.h>
#include <gloo/barrier_all_to_one.h>
#include <gloo/common/error.h>
#include <gloo/context.h>
namespace caffe2 {
namespace gloo {
template <class Context>
class BarrierOp final : public Operator<Context> {
public:
BarrierOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
ws_(ws),
status_blob_(
OperatorBase::GetSingleArgument<std::string>("status_blob", "")) {
if (status_blob_ != "") {
ws_->CreateBlob(status_blob_);
}
}
~BarrierOp() override {}
bool RunOnDevice() override {
auto context = OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
std::call_once(once_, [&] {
initContext_ = context;
// Use an all-to-one barrier synchronizing against rank 0
algorithm_.reset(new ::gloo::BarrierAllToOne(initContext_, 0));
});
// If any parameter has changed in between runs, the initialized
// algorithm is invalid and cannot be used.
CAFFE_ENFORCE(context == initContext_, "Context has changed");
try {
algorithm_->run();
} catch (::gloo::IoException& ioe) {
LOG(ERROR) << "Caught gloo IO exception: " << ioe.what();
if (status_blob_ != "") {
signalFailure(ws_->GetBlob(status_blob_), ioe);
return false;
} else {
throw;
}
}
return true;
}
protected:
std::once_flag once_;
std::shared_ptr<::gloo::Context> initContext_;
std::unique_ptr<::gloo::Algorithm> algorithm_;
Workspace* ws_;
std::string status_blob_;
};
} // namespace gloo
} // namespace caffe2

View File

@ -1,36 +0,0 @@
#include "broadcast_ops.h"
#include <gloo/broadcast_one_to_all.h>
namespace caffe2 {
namespace gloo {
template <class Context>
void BroadcastOp<Context>::initializeAlgorithm() {
if (init_.template IsType<float>()) {
algorithm_.reset(new ::gloo::BroadcastOneToAll<float>(
init_.context, init_.template getOutputs<float>(), init_.size, root_));
} else if (init_.template IsType<long>()) {
algorithm_.reset(new ::gloo::BroadcastOneToAll<long>(
init_.context, init_.template getOutputs<long>(), init_.size, root_));
} else if (init_.template IsType<int>()) {
algorithm_.reset(new ::gloo::BroadcastOneToAll<int>(
init_.context, init_.template getOutputs<int>(), init_.size, root_));
} else if (init_.template IsType<at::Half>()) {
algorithm_.reset(new ::gloo::BroadcastOneToAll<::gloo::float16>(
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size,
root_));
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
namespace {
REGISTER_CPU_OPERATOR_WITH_ENGINE(Broadcast, GLOO, BroadcastOp<CPUContext>);
} // namespace
} // namespace gloo
} // namespace caffe2

View File

@ -1,112 +0,0 @@
#pragma once
#include <algorithm>
#include "caffe2/contrib/gloo/common.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/types.h"
#include <gloo/algorithm.h>
#include <gloo/common/error.h>
#include <gloo/context.h>
namespace caffe2 {
namespace gloo {
template <class Context>
class BroadcastOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
BroadcastOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
root_(OperatorBase::template GetSingleArgument<int>("root", 0)),
ws_(ws),
status_blob_(
OperatorBase::GetSingleArgument<std::string>("status_blob", "")) {
if (status_blob_ != "") {
ws_->CreateBlob(status_blob_);
}
}
~BroadcastOp() override {}
bool RunOnDevice() override {
std::call_once(once_, [&] { initialize(); });
// If any parameter has changed in between runs, the initialized
// algorithm is invalid and cannot be used.
update(current_);
CAFFE_ENFORCE(current_ == init_, "Inputs/outputs have changed");
try {
algorithm_->run();
} catch (::gloo::IoException& ioe) {
LOG(ERROR) << "Caught gloo IO exception: " << ioe.what();
if (status_blob_ != "") {
signalFailure(ws_->GetBlob(status_blob_), ioe);
return false;
} else {
throw;
}
}
return true;
}
protected:
void initialize() {
// Store which inputs/outputs this instance initialized with
update(init_);
// Verify inputs == outputs
CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size());
for (const auto i : c10::irange(init_.inputs.size())) {
CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]);
}
// Verify tensors all have same size
size_t size = Input(1).numel();
for (const auto i : c10::irange(2, InputSize())) {
CAFFE_ENFORCE_EQ(Input(i).numel(), size);
}
    // Verify tensors all have same type
TypeMeta meta = Input(1).dtype();
for (const auto i : c10::irange(2, InputSize())) {
CAFFE_ENFORCE(Input(i).dtype() == meta);
}
// Finally initialize the algorithm
initializeAlgorithm();
}
void initializeAlgorithm();
const int root_;
std::once_flag once_;
std::unique_ptr<::gloo::Algorithm> algorithm_;
// Captures the parameters passed to Gloo when first initialized.
// An instance is updated every time this op runs and is compared
// to the reference instance for equality. If any parameter has
// changed from run to run, the initialized algorithm is invalid.
void update(GlooParameters& params) {
params.context = OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
params.inputs.resize(InputSize() - 1);
params.outputs.resize(OutputSize());
for (const auto i : c10::irange(params.inputs.size())) {
params.inputs[i] = Input(i + 1).raw_data();
params.outputs[i] = Output(i)->raw_mutable_data();
}
params.size = Output(0)->numel();
params.meta = Output(0)->dtype();
}
GlooParameters init_;
GlooParameters current_;
Workspace* ws_;
std::string status_blob_;
};
} // namespace gloo
} // namespace caffe2
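
BroadcastOp, like the allreduce and reduce-scatter ops elsewhere in this directory, builds its Gloo algorithm once under std::call_once and then demands that the context, buffers and sizes it captured never change between runs. A self-contained sketch of that once-initialized, snapshot-compare idiom, with illustrative names (Snapshot, LazilyInitializedOp) that do not exist in Caffe2:

#include <cstddef>
#include <mutex>
#include <stdexcept>
#include <utility>
#include <vector>

// Illustrative stand-in for GlooParameters: whatever the algorithm was built from.
struct Snapshot {
  std::vector<void*> buffers;
  size_t size = 0;
  bool operator==(const Snapshot& other) const {
    return buffers == other.buffers && size == other.size;
  }
};

class LazilyInitializedOp {
 public:
  bool run(std::vector<void*> buffers, size_t size) {
    Snapshot current;
    current.buffers = std::move(buffers);
    current.size = size;
    std::call_once(once_, [&] {
      init_ = current;  // the real op would build its Gloo algorithm here
    });
    // Re-running with different buffers would invalidate the cached algorithm.
    if (!(current == init_)) {
      throw std::runtime_error("Inputs/outputs have changed");
    }
    // algorithm->run() would go here.
    return true;
  }

 private:
  std::once_flag once_;
  Snapshot init_;
};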

View File

@ -1,38 +0,0 @@
#include "caffe2/contrib/gloo/broadcast_ops.h"
#include "caffe2/core/context_gpu.h"
#include <gloo/cuda_broadcast_one_to_all.h>
namespace caffe2 {
namespace gloo {
template <class Context>
void BroadcastOp<Context>::initializeAlgorithm() {
if (init_.template IsType<float>()) {
algorithm_.reset(new ::gloo::CudaBroadcastOneToAll<float>(
init_.context, init_.template getOutputs<float>(), init_.size, root_));
} else if (init_.template IsType<long>()) {
algorithm_.reset(new ::gloo::CudaBroadcastOneToAll<long>(
init_.context, init_.template getOutputs<long>(), init_.size, root_));
} else if (init_.template IsType<int>()) {
algorithm_.reset(new ::gloo::CudaBroadcastOneToAll<int>(
init_.context, init_.template getOutputs<int>(), init_.size, root_));
} else if (init_.template IsType<at::Half>()) {
algorithm_.reset(new ::gloo::CudaBroadcastOneToAll<::gloo::float16>(
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size,
root_));
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
namespace {
REGISTER_CUDA_OPERATOR_WITH_ENGINE(Broadcast, GLOO, BroadcastOp<CUDAContext>);
} // namespace
} // namespace gloo
} // namespace caffe2

View File

@ -1,48 +0,0 @@
#include "caffe2/contrib/gloo/common.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/tensor.h"
#include <gloo/transport/tcp/device.h>
#if defined(GLOO_USE_IBVERBS) && GLOO_USE_IBVERBS
#include <gloo/transport/ibverbs/device.h>
#endif
namespace caffe2 {
namespace gloo {
void signalFailure(Blob* status_blob, std::exception& /* unused */) {
auto* res = BlobGetMutableTensor(status_blob, CPU);
res->Resize(1);
res->template mutable_data<int32_t>()[0] = 1;
}
std::shared_ptr<::gloo::transport::Device> createDevice(
const createDeviceAttr attr) {
if (attr.transport == "tcp") {
::gloo::transport::tcp::attr tcpAttr;
if (attr.interface.size() > 0) {
tcpAttr.iface = attr.interface;
}
return ::gloo::transport::tcp::CreateDevice(tcpAttr);
} else if (attr.transport == "ibverbs") {
#if defined(GLOO_USE_IBVERBS) && GLOO_USE_IBVERBS
::gloo::transport::ibverbs::attr ibverbsAttr;
ibverbsAttr.port = 1;
ibverbsAttr.index = 0;
if (attr.interface.size() > 0) {
ibverbsAttr.name = attr.interface;
}
return ::gloo::transport::ibverbs::CreateDevice(ibverbsAttr);
#else
CAFFE_THROW(
"Gloo was not compiled with ibverbs support. ",
"Please recompile with -DUSE_IBVERBS=1.");
#endif
}
CAFFE_THROW("Invalid transport: ", attr.transport);
}
} // namespace gloo
} // namespace caffe2
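
A minimal sketch of how a caller might use createDevice above, assuming the Caffe2 and Gloo headers are available; makeTcpDevice and the example interface name "eth0" are illustrative only:

#include <memory>
#include <string>

#include "caffe2/contrib/gloo/common.h"

#include <gloo/transport/device.h>

// Sketch only: pick the TCP transport and optionally pin it to an interface.
std::shared_ptr<::gloo::transport::Device> makeTcpDevice(
    const std::string& iface = "") {
  caffe2::gloo::createDeviceAttr attr;
  attr.transport = "tcp";
  attr.interface = iface;  // empty lets Gloo choose; e.g. "eth0" to force one
  return caffe2::gloo::createDevice(attr);
}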

View File

@ -1,73 +0,0 @@
#pragma once
#include <exception>
#include "caffe2/core/blob.h"
#include <gloo/config.h>
#include <gloo/context.h>
#include <gloo/transport/device.h>
namespace caffe2 {
namespace gloo {
TORCH_API void signalFailure(Blob* status_blob, std::exception& exception);
struct createDeviceAttr {
// "tcp" or "ibverbs"
std::string transport;
// E.g. "eth0" (tcp), or "mlx5_0" (ibverbs).
// This may be empty to make Gloo figure it out.
std::string interface;
};
TORCH_API std::shared_ptr<::gloo::transport::Device> createDevice(
const createDeviceAttr attr);
// Captures the parameters passed to Gloo.
struct GlooParameters {
std::shared_ptr<::gloo::Context> context;
std::vector<const void*> inputs;
std::vector<void*> outputs;
size_t size;
TypeMeta meta;
template <typename T>
std::vector<const T*> getInputs() {
std::vector<const T*> result;
result.reserve(inputs.size());
for (auto& input : inputs) {
result.push_back(reinterpret_cast<const T*>(input));
}
return result;
}
template <typename T>
std::vector<T*> getOutputs() {
std::vector<T*> result;
result.reserve(outputs.size());
for (auto& output : outputs) {
result.push_back(reinterpret_cast<T*>(output));
}
return result;
}
template <typename T>
T* getOutput() {
return reinterpret_cast<T*>(outputs[0]);
}
template <typename T>
bool IsType() const {
return meta.Match<T>();
}
bool operator==(GlooParameters const& other) const {
return context == other.context && inputs == other.inputs &&
outputs == other.outputs && size == other.size;
}
};
} // namespace gloo
} // namespace caffe2
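
GlooParameters is essentially a typed view over raw buffers. A hedged sketch of the accessors in isolation, assuming the Caffe2/Gloo headers are available; exampleGlooParameters and the stack buffers are made up for illustration:

#include <c10/util/typeid.h>

#include "caffe2/contrib/gloo/common.h"

// Sketch only: fill the struct by hand rather than from operator inputs.
void exampleGlooParameters() {
  float a[8] = {0};
  float b[8] = {0};

  caffe2::gloo::GlooParameters params;
  params.inputs = {a, b};    // stored as const void*
  params.outputs = {a, b};   // stored as void*
  params.size = 8;           // elements per buffer
  params.meta = c10::TypeMeta::Make<float>();

  if (params.IsType<float>()) {
    auto outs = params.getOutputs<float>();  // std::vector<float*> view
    outs[0][0] = 1.0f;
  }
}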

View File

@ -1,29 +0,0 @@
#include "caffe2/contrib/gloo/common_world_ops.h"
#include <gloo/transport/tcp/device.h>
namespace caffe2 {
namespace gloo {
template <>
void CreateCommonWorld<CPUContext>::initializeForContext() {
// Nothing to initialize for CPUContext.
}
namespace {
REGISTER_CPU_OPERATOR_WITH_ENGINE(
CreateCommonWorld,
GLOO,
CreateCommonWorld<CPUContext>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(
CloneCommonWorld,
GLOO,
CloneCommonWorld<CPUContext>);
REGISTER_CPU_OPERATOR_WITH_ENGINE(DestroyCommonWorld, GLOO, DestroyCommonWorld);
} // namespace
} // namespace gloo
} // namespace caffe2

View File

@ -1,249 +0,0 @@
#pragma once
#include "caffe2/contrib/gloo/common.h"
#include "caffe2/contrib/gloo/store_handler.h"
#include "caffe2/core/operator.h"
#include "caffe2/distributed/store_handler.h"
#include <gloo/common/error.h>
#include <gloo/config.h>
#include <gloo/rendezvous/context.h>
#include <gloo/rendezvous/prefix_store.h>
#if defined(GLOO_USE_MPI) && GLOO_USE_MPI
#include <gloo/mpi/context.h>
#endif
namespace caffe2 {
namespace gloo {
template <class Context>
class CreateCommonWorld final : public Operator<Context> {
public:
using CommonWorld = std::shared_ptr<::gloo::Context>;
USE_OPERATOR_CONTEXT_FUNCTIONS;
CreateCommonWorld(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
size_(OperatorBase::template GetSingleArgument<int>("size", 0)),
rank_(OperatorBase::template GetSingleArgument<int>("rank", 0)),
sync_(OperatorBase::template GetSingleArgument<bool>("sync", false)),
transport_(OperatorBase::template GetSingleArgument<std::string>(
"transport", "tcp")),
interface_(OperatorBase::template GetSingleArgument<std::string>(
"interface", "")),
mpi_rendezvous_(OperatorBase::template GetSingleArgument<bool>(
"mpi_rendezvous", false)),
status_blob_(
OperatorBase::GetSingleArgument<std::string>("status_blob", "")),
timeout_ms_(OperatorBase::GetSingleArgument<int>("timeout_ms", -1)),
ws_(ws) {
CAFFE_ENFORCE(
operator_def.has_name(), "CreateCommonWorld operator requires name");
CAFFE_ENFORCE(rank_ >= 0 && rank_ < size_);
name_ = operator_def.name();
if (status_blob_ != "") {
ws_->CreateBlob(status_blob_);
}
initialize();
}
~CreateCommonWorld() override {
}
CommonWorld rendezvousWithMPI() {
#if defined(GLOO_USE_MPI) && GLOO_USE_MPI
auto context = ::gloo::mpi::Context::createManaged();
if (timeout_ms_ != -1) {
context->setTimeout(std::chrono::milliseconds(timeout_ms_));
}
context->connectFullMesh(device_);
return context;
#else
CAFFE_THROW(
"Gloo was not compiled with MPI support. ",
"Please recompile with -DUSE_MPI=1.");
#endif
}
CommonWorld rendezvousWithStore(
const std::unique_ptr<StoreHandler>& handler) {
// Use PrefixStore to isolate different CreateCommonWorld instances
StoreHandlerWrapper wrapper(*handler);
::gloo::rendezvous::PrefixStore store(name_, wrapper);
auto context = std::make_shared<::gloo::rendezvous::Context>(rank_, size_);
if (timeout_ms_ != -1) {
context->setTimeout(std::chrono::milliseconds(timeout_ms_));
}
context->connectFullMesh(store, device_);
return context;
}
bool RunOnDevice() override {
try {
CommonWorld context;
if (mpi_rendezvous_) {
context = rendezvousWithMPI();
} else {
CAFFE_ENFORCE_EQ(InputSize(), 1, "Expected store handler input");
const auto& handler =
OperatorBase::Input<std::unique_ptr<StoreHandler>>(STORE_HANDLER);
context = rendezvousWithStore(handler);
}
// Switch pairs to synchronous mode if configured to do so
if (sync_) {
for (int i = 0; i < context->size; i++) {
auto& pair = context->getPair(i);
if (pair) {
pair->setSync(true, false);
}
}
}
*OperatorBase::Output<CommonWorld>(COMM) = std::move(context);
} catch (::gloo::IoException& ioe) {
LOG(ERROR) << "Caught gloo IO exception: " << ioe.what();
return handleException(ioe);
} catch (::caffe2::StoreHandlerTimeoutException& te) {
LOG(ERROR) << "Caught store handler timeout exception: " << te.what();
return handleException(te);
}
return true;
}
private:
bool handleException(std::exception& ex) {
if (status_blob_ != "") {
signalFailure(ws_->GetBlob(status_blob_), ex);
return false;
} else {
throw;
}
}
void initialize() {
// Share single device between all common worlds.
static std::once_flag once;
static std::shared_ptr<::gloo::transport::Device> device;
std::call_once(once, [&]() {
createDeviceAttr attr;
attr.transport = transport_;
attr.interface = interface_;
device = createDevice(attr);
});
device_ = device;
// Context specific initialization.
initializeForContext();
}
void initializeForContext();
const int size_;
const int rank_;
const bool sync_;
const std::string transport_;
const std::string interface_;
const bool mpi_rendezvous_;
const std::string status_blob_;
const int timeout_ms_;
Workspace* ws_;
std::string name_;
std::shared_ptr<::gloo::transport::Device> device_;
INPUT_TAGS(STORE_HANDLER);
OUTPUT_TAGS(COMM);
};
template <class Context>
class CloneCommonWorld final : public Operator<Context> {
public:
using CommonWorld = std::shared_ptr<::gloo::Context>;
USE_OPERATOR_CONTEXT_FUNCTIONS;
CloneCommonWorld(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
sync_(OperatorBase::template GetSingleArgument<bool>("sync", false)),
ws_(ws),
status_blob_(
OperatorBase::GetSingleArgument<std::string>("status_blob", "")) {
if (status_blob_ != "") {
ws_->CreateBlob(status_blob_);
}
}
~CloneCommonWorld() override {}
bool RunOnDevice() override {
try {
auto existing = OperatorBase::Input<CommonWorld>(EXISTING_COMM);
::gloo::rendezvous::ContextFactory factory(existing);
auto clone = factory.makeContext(existing->getDevice());
// Switch pairs to synchronous mode if configured to do so
if (sync_) {
for (int i = 0; i < clone->size; i++) {
auto& pair = clone->getPair(i);
if (pair) {
pair->setSync(true, false);
}
}
}
*OperatorBase::Output<CommonWorld>(CLONED_COMM) = std::move(clone);
} catch (::gloo::IoException& ioe) {
LOG(ERROR) << "Caught gloo IO exception: " << ioe.what();
return handleException(ioe);
}
return true;
}
private:
bool handleException(std::exception& ex) {
if (status_blob_ != "") {
signalFailure(ws_->GetBlob(status_blob_), ex);
return false;
} else {
throw;
}
}
const bool sync_;
Workspace* ws_;
std::string status_blob_;
INPUT_TAGS(EXISTING_COMM);
OUTPUT_TAGS(CLONED_COMM);
};
class DestroyCommonWorld final : public Operator<CPUContext> {
public:
DestroyCommonWorld(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws) {
cw_name_ = operator_def.input(0);
}
bool RunOnDevice() override {
if (OperatorBase::InputBlob(0).GetRaw() == nullptr) {
return true;
}
const auto& context =
OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
if (context) {
LOG(INFO) << "Closing connections: " << cw_name_;
context->closeConnections();
}
return true;
}
private:
std::string cw_name_;
};
} // namespace gloo
} // namespace caffe2
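
Stripped of the operator plumbing, rendezvousWithStore above reduces to a handful of Gloo calls. The sketch below mirrors it with a file-system store; makeCommonWorld, FileStore, the /tmp path and the prefix string are illustrative choices, not what the operator itself uses:

#include <memory>
#include <string>

#include <gloo/rendezvous/context.h>
#include <gloo/rendezvous/file_store.h>
#include <gloo/rendezvous/prefix_store.h>
#include <gloo/transport/tcp/device.h>

// Sketch only: full-mesh rendezvous of `size` processes through a shared
// file-system store, mirroring rendezvousWithStore above.
std::shared_ptr<::gloo::Context> makeCommonWorld(int rank, int size) {
  ::gloo::transport::tcp::attr attr;  // default interface selection
  auto device = ::gloo::transport::tcp::CreateDevice(attr);

  ::gloo::rendezvous::FileStore fileStore("/tmp/gloo_rendezvous");  // example path
  ::gloo::rendezvous::PrefixStore store("my_common_world", fileStore);

  auto context = std::make_shared<::gloo::rendezvous::Context>(rank, size);
  context->connectFullMesh(store, device);
  return context;
}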

View File

@ -1,35 +0,0 @@
#include "caffe2/contrib/gloo/common_world_ops.h"
#include "caffe2/core/context_gpu.h"
#include <gloo/cuda.h>
#include <gloo/transport/tcp/device.h>
namespace caffe2 {
namespace gloo {
template <>
void CreateCommonWorld<CUDAContext>::initializeForContext() {
static std::once_flag once;
std::call_once(once, [&]() {
// This is the first time we call Gloo code for a CUDAContext.
// Share Caffe2 CUDA mutex with Gloo.
::gloo::CudaShared::setMutex(&CUDAContext::mutex());
});
}
namespace {
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
CreateCommonWorld,
GLOO,
CreateCommonWorld<CUDAContext>);
REGISTER_CUDA_OPERATOR_WITH_ENGINE(
CloneCommonWorld,
GLOO,
CloneCommonWorld<CUDAContext>);
} // namespace
} // namespace gloo
} // namespace caffe2

View File

@ -1,12 +0,0 @@
#include "context.h"
#include <c10/util/typeid.h>
#include <gloo/types.h>
namespace caffe2 {
CAFFE_KNOWN_TYPE(::gloo::float16);
CAFFE_KNOWN_TYPE(std::shared_ptr<::gloo::Context>);
} // namespace caffe2

View File

@ -1,3 +0,0 @@
#pragma once
#include <gloo/context.h>

View File

@ -1,706 +0,0 @@
#!/usr/bin/env python3
from hypothesis import given, settings
import hypothesis.strategies as st
from multiprocessing import Process, Queue
import numpy as np
import os
import pickle
import tempfile
import shutil
from caffe2.python import core, workspace, dyndep
import caffe2.python.hypothesis_test_util as hu
from gloo.python import IoError
dyndep.InitOpsLibrary("@/caffe2/caffe2/distributed:file_store_handler_ops")
dyndep.InitOpsLibrary("@/caffe2/caffe2/distributed:redis_store_handler_ops")
dyndep.InitOpsLibrary("@/caffe2/caffe2/distributed:store_ops")
dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/gloo:gloo_ops")
dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/gloo:gloo_ops_gpu")
op_engine = 'GLOO'
class TemporaryDirectory:
def __enter__(self):
self.tmpdir = tempfile.mkdtemp()
return self.tmpdir
def __exit__(self, type, value, traceback):
shutil.rmtree(self.tmpdir)
class TestCase(hu.HypothesisTestCase):
test_counter = 0
sync_counter = 0
def run_test_locally(self, fn, device_option=None, **kwargs):
# Queue for assertion errors on subprocesses
queue = Queue()
# Capture any exception thrown by the subprocess
def run_fn(*args, **kwargs):
try:
with core.DeviceScope(device_option):
fn(*args, **kwargs)
workspace.ResetWorkspace()
queue.put(True)
except Exception as ex:
queue.put(ex)
# Start N processes in the background
procs = []
for i in range(kwargs['comm_size']):
kwargs['comm_rank'] = i
proc = Process(
target=run_fn,
kwargs=kwargs)
proc.start()
procs.append(proc)
# Test complete, join background processes
while len(procs) > 0:
proc = procs.pop(0)
while proc.is_alive():
proc.join(10)
# Raise exception if we find any. Otherwise each worker
# should put a True into the queue
# Note that the following is executed ALSO after
# the last process was joined, so if ANY exception
# was raised, it will be re-raised here.
self.assertFalse(queue.empty(), "Job failed without a result")
o = queue.get()
if isinstance(o, Exception):
raise o
else:
self.assertTrue(o)
def run_test_distributed(self, fn, device_option=None, **kwargs):
comm_rank = os.getenv('COMM_RANK')
self.assertIsNotNone(comm_rank)
comm_size = os.getenv('COMM_SIZE')
self.assertIsNotNone(comm_size)
kwargs['comm_rank'] = int(comm_rank)
kwargs['comm_size'] = int(comm_size)
with core.DeviceScope(device_option):
fn(**kwargs)
workspace.ResetWorkspace()
def create_common_world(self, comm_rank, comm_size, tmpdir=None, existing_cw=None):
store_handler = "store_handler"
# If REDIS_HOST is set, use RedisStoreHandler for rendezvous.
if existing_cw is None:
redis_host = os.getenv("REDIS_HOST")
redis_port = int(os.getenv("REDIS_PORT", 6379))
if redis_host is not None:
workspace.RunOperatorOnce(
core.CreateOperator(
"RedisStoreHandlerCreate",
[],
[store_handler],
prefix=str(TestCase.test_counter) + "/",
host=redis_host,
port=redis_port))
else:
workspace.RunOperatorOnce(
core.CreateOperator(
"FileStoreHandlerCreate",
[],
[store_handler],
path=tmpdir))
common_world = "common_world"
else:
common_world = str(existing_cw) + ".forked"
if existing_cw is not None:
workspace.RunOperatorOnce(
core.CreateOperator(
"CloneCommonWorld",
[existing_cw],
[common_world],
sync=True,
engine=op_engine))
else:
workspace.RunOperatorOnce(
core.CreateOperator(
"CreateCommonWorld",
[store_handler],
[common_world],
size=comm_size,
rank=comm_rank,
sync=True,
engine=op_engine))
return (store_handler, common_world)
def synchronize(self, store_handler, value, comm_rank=None):
TestCase.sync_counter += 1
blob = "sync_{}".format(TestCase.sync_counter)
if comm_rank == 0:
workspace.FeedBlob(blob, pickle.dumps(value))
workspace.RunOperatorOnce(
core.CreateOperator(
"StoreSet",
[store_handler, blob],
[]))
else:
workspace.RunOperatorOnce(
core.CreateOperator(
"StoreGet",
[store_handler],
[blob]))
return pickle.loads(workspace.FetchBlob(blob))
def _test_broadcast(self,
comm_rank=None,
comm_size=None,
blob_size=None,
num_blobs=None,
tmpdir=None,
use_float16=False,
):
store_handler, common_world = self.create_common_world(
comm_rank=comm_rank,
comm_size=comm_size,
tmpdir=tmpdir)
blob_size = self.synchronize(
store_handler,
blob_size,
comm_rank=comm_rank)
num_blobs = self.synchronize(
store_handler,
num_blobs,
comm_rank=comm_rank)
for i in range(comm_size):
blobs = []
for j in range(num_blobs):
blob = "blob_{}".format(j)
offset = (comm_rank * num_blobs) + j
value = np.full(blob_size, offset,
np.float16 if use_float16 else np.float32)
workspace.FeedBlob(blob, value)
blobs.append(blob)
net = core.Net("broadcast")
net.Broadcast(
[common_world] + blobs,
blobs,
root=i,
engine=op_engine)
workspace.CreateNet(net)
workspace.RunNet(net.Name())
for j in range(num_blobs):
np.testing.assert_array_equal(
workspace.FetchBlob(blobs[j]),
i * num_blobs)
# Run the net a few more times to check the operator
# works not just the first time it's called
for _tmp in range(4):
workspace.RunNet(net.Name())
@given(comm_size=st.integers(min_value=2, max_value=8),
blob_size=st.integers(min_value=int(1e3), max_value=int(1e6)),
num_blobs=st.integers(min_value=1, max_value=4),
device_option=st.sampled_from([hu.cpu_do]),
use_float16=st.booleans())
@settings(deadline=10000)
def test_broadcast(self, comm_size, blob_size, num_blobs, device_option,
use_float16):
TestCase.test_counter += 1
if os.getenv('COMM_RANK') is not None:
self.run_test_distributed(
self._test_broadcast,
blob_size=blob_size,
num_blobs=num_blobs,
use_float16=use_float16,
device_option=device_option)
else:
with TemporaryDirectory() as tmpdir:
self.run_test_locally(
self._test_broadcast,
comm_size=comm_size,
blob_size=blob_size,
num_blobs=num_blobs,
device_option=device_option,
tmpdir=tmpdir,
use_float16=use_float16)
def _test_allreduce(self,
comm_rank=None,
comm_size=None,
blob_size=None,
num_blobs=None,
tmpdir=None,
use_float16=False
):
store_handler, common_world = self.create_common_world(
comm_rank=comm_rank,
comm_size=comm_size,
tmpdir=tmpdir)
blob_size = self.synchronize(
store_handler,
blob_size,
comm_rank=comm_rank)
num_blobs = self.synchronize(
store_handler,
num_blobs,
comm_rank=comm_rank)
blobs = []
for i in range(num_blobs):
blob = "blob_{}".format(i)
value = np.full(blob_size, (comm_rank * num_blobs) + i,
np.float16 if use_float16 else np.float32)
workspace.FeedBlob(blob, value)
blobs.append(blob)
net = core.Net("allreduce")
net.Allreduce(
[common_world] + blobs,
blobs,
engine=op_engine)
workspace.CreateNet(net)
workspace.RunNet(net.Name())
for i in range(num_blobs):
np.testing.assert_array_equal(
workspace.FetchBlob(blobs[i]),
(num_blobs * comm_size) * (num_blobs * comm_size - 1) / 2)
# Run the net a few more times to check the operator
# works not just the first time it's called
for _tmp in range(4):
workspace.RunNet(net.Name())
def _test_allreduce_multicw(self,
comm_rank=None,
comm_size=None,
tmpdir=None
):
_store_handler, common_world = self.create_common_world(
comm_rank=comm_rank,
comm_size=comm_size,
tmpdir=tmpdir)
_, common_world2 = self.create_common_world(
comm_rank=comm_rank,
comm_size=comm_size,
tmpdir=tmpdir,
existing_cw=common_world)
blob_size = int(1e4)
num_blobs = 4
for cw in [common_world, common_world2]:
blobs = []
for i in range(num_blobs):
blob = "blob_{}".format(i)
value = np.full(blob_size, (comm_rank * num_blobs) + i, np.float32)
workspace.FeedBlob(blob, value)
blobs.append(blob)
net = core.Net("allreduce_multicw")
net.Allreduce(
[cw] + blobs,
blobs,
engine=op_engine)
workspace.RunNetOnce(net)
for i in range(num_blobs):
np.testing.assert_array_equal(
workspace.FetchBlob(blobs[i]),
(num_blobs * comm_size) * (num_blobs * comm_size - 1) / 2)
@given(comm_size=st.integers(min_value=2, max_value=8),
blob_size=st.integers(min_value=int(1e3), max_value=int(1e6)),
num_blobs=st.integers(min_value=1, max_value=4),
device_option=st.sampled_from([hu.cpu_do]),
use_float16=st.booleans())
@settings(deadline=10000)
def test_allreduce(self, comm_size, blob_size, num_blobs, device_option,
use_float16):
TestCase.test_counter += 1
if os.getenv('COMM_RANK') is not None:
self.run_test_distributed(
self._test_allreduce,
blob_size=blob_size,
num_blobs=num_blobs,
use_float16=use_float16,
device_option=device_option)
else:
with TemporaryDirectory() as tmpdir:
self.run_test_locally(
self._test_allreduce,
comm_size=comm_size,
blob_size=blob_size,
num_blobs=num_blobs,
device_option=device_option,
tmpdir=tmpdir,
use_float16=use_float16)
def _test_reduce_scatter(self,
comm_rank=None,
comm_size=None,
blob_size=None,
num_blobs=None,
tmpdir=None,
use_float16=False
):
store_handler, common_world = self.create_common_world(
comm_rank=comm_rank,
comm_size=comm_size,
tmpdir=tmpdir)
blob_size = self.synchronize(
store_handler,
blob_size,
comm_rank=comm_rank)
num_blobs = self.synchronize(
store_handler,
num_blobs,
comm_rank=comm_rank)
blobs = []
for i in range(num_blobs):
blob = "blob_{}".format(i)
value = np.full(blob_size, (comm_rank * num_blobs) + i,
np.float16 if use_float16 else np.float32)
workspace.FeedBlob(blob, value)
blobs.append(blob)
# Specify distribution among ranks i.e. number of elements
# scattered/distributed to each process.
recv_counts = np.zeros(comm_size, dtype=np.int32)
remaining = blob_size
        chunk_size = (blob_size + comm_size - 1) // comm_size
for i in range(comm_size):
recv_counts[i] = min(chunk_size, remaining)
remaining = remaining - chunk_size if remaining > chunk_size else 0
recv_counts_blob = "recvCounts"
workspace.FeedBlob(recv_counts_blob, recv_counts)
blobs.append(recv_counts_blob)
net = core.Net("reduce_scatter")
net.ReduceScatter(
[common_world] + blobs,
blobs,
engine=op_engine)
workspace.CreateNet(net)
workspace.RunNet(net.Name())
for i in range(num_blobs):
np.testing.assert_array_equal(
np.resize(workspace.FetchBlob(blobs[i]), recv_counts[comm_rank]),
(num_blobs * comm_size) * (num_blobs * comm_size - 1) / 2)
# Run the net a few more times to check the operator
# works not just the first time it's called
for _tmp in range(4):
workspace.RunNet(net.Name())
@given(comm_size=st.integers(min_value=2, max_value=8),
blob_size=st.integers(min_value=int(1e3), max_value=int(1e6)),
num_blobs=st.integers(min_value=1, max_value=4),
device_option=st.sampled_from([hu.cpu_do]),
use_float16=st.booleans())
@settings(deadline=10000)
def test_reduce_scatter(self, comm_size, blob_size, num_blobs,
device_option, use_float16):
TestCase.test_counter += 1
if os.getenv('COMM_RANK') is not None:
self.run_test_distributed(
self._test_reduce_scatter,
blob_size=blob_size,
num_blobs=num_blobs,
use_float16=use_float16,
device_option=device_option)
else:
with TemporaryDirectory() as tmpdir:
self.run_test_locally(
self._test_reduce_scatter,
comm_size=comm_size,
blob_size=blob_size,
num_blobs=num_blobs,
device_option=device_option,
tmpdir=tmpdir,
use_float16=use_float16)
def _test_allgather(self,
comm_rank=None,
comm_size=None,
blob_size=None,
num_blobs=None,
tmpdir=None,
use_float16=False
):
store_handler, common_world = self.create_common_world(
comm_rank=comm_rank,
comm_size=comm_size,
tmpdir=tmpdir)
blob_size = self.synchronize(
store_handler,
blob_size,
comm_rank=comm_rank)
num_blobs = self.synchronize(
store_handler,
num_blobs,
comm_rank=comm_rank)
blobs = []
for i in range(num_blobs):
blob = "blob_{}".format(i)
value = np.full(blob_size, (comm_rank * num_blobs) + i,
np.float16 if use_float16 else np.float32)
workspace.FeedBlob(blob, value)
blobs.append(blob)
net = core.Net("allgather")
net.Allgather(
[common_world] + blobs,
["Gathered"],
engine=op_engine)
workspace.CreateNet(net)
workspace.RunNet(net.Name())
# create expected output
expected_output = np.array([])
for i in range(comm_size):
for j in range(num_blobs):
value = np.full(blob_size, (i * num_blobs) + j,
np.float16 if use_float16 else np.float32)
expected_output = np.concatenate((expected_output, value))
np.testing.assert_array_equal(
workspace.FetchBlob("Gathered"), expected_output)
# Run the net a few more times to check the operator
# works not just the first time it's called
for _tmp in range(4):
workspace.RunNet(net.Name())
@given(comm_size=st.integers(min_value=2, max_value=8),
blob_size=st.integers(min_value=int(1e3), max_value=int(1e6)),
num_blobs=st.integers(min_value=1, max_value=4),
device_option=st.sampled_from([hu.cpu_do]),
use_float16=st.booleans())
@settings(max_examples=10, deadline=None)
def test_allgather(self, comm_size, blob_size, num_blobs, device_option,
use_float16):
TestCase.test_counter += 1
if os.getenv('COMM_RANK') is not None:
self.run_test_distributed(
self._test_allgather,
blob_size=blob_size,
num_blobs=num_blobs,
use_float16=use_float16,
device_option=device_option)
else:
with TemporaryDirectory() as tmpdir:
self.run_test_locally(
self._test_allgather,
comm_size=comm_size,
blob_size=blob_size,
num_blobs=num_blobs,
device_option=device_option,
tmpdir=tmpdir,
use_float16=use_float16)
@given(device_option=st.sampled_from([hu.cpu_do]))
@settings(deadline=10000)
def test_forked_cw(self, device_option):
TestCase.test_counter += 1
if os.getenv('COMM_RANK') is not None:
self.run_test_distributed(
self._test_allreduce_multicw,
device_option=device_option)
else:
# Note: this test exercises the path where we fork a common world.
# We therefore don't need a comm size larger than 2. It used to be
# run with comm_size=8, which causes flaky results in a stress run.
# The flakiness was caused by too many listening sockets being
# created by Gloo context initialization (8 processes times
# 7 sockets times 20-way concurrency, plus TIME_WAIT).
with TemporaryDirectory() as tmpdir:
self.run_test_locally(
self._test_allreduce_multicw,
comm_size=2,
device_option=device_option,
tmpdir=tmpdir)
def _test_barrier(
self,
comm_rank=None,
comm_size=None,
tmpdir=None,
):
store_handler, common_world = self.create_common_world(
comm_rank=comm_rank, comm_size=comm_size, tmpdir=tmpdir
)
net = core.Net("barrier")
net.Barrier(
[common_world],
[],
engine=op_engine)
workspace.CreateNet(net)
workspace.RunNet(net.Name())
# Run the net a few more times to check the operator
# works not just the first time it's called
for _tmp in range(4):
workspace.RunNet(net.Name())
@given(comm_size=st.integers(min_value=2, max_value=8),
device_option=st.sampled_from([hu.cpu_do]))
@settings(deadline=10000)
def test_barrier(self, comm_size, device_option):
TestCase.test_counter += 1
if os.getenv('COMM_RANK') is not None:
self.run_test_distributed(
self._test_barrier,
device_option=device_option)
else:
with TemporaryDirectory() as tmpdir:
self.run_test_locally(
self._test_barrier,
comm_size=comm_size,
device_option=device_option,
tmpdir=tmpdir)
def _test_close_connection(
self,
comm_rank=None,
comm_size=None,
tmpdir=None,
):
'''
        One node closes its connections while the others wait on a barrier.
        The test checks that every process still exits eventually.
'''
# Caffe's for closers only:
# https://www.youtube.com/watch?v=QMFwFgG9NE8
        closer = comm_rank == comm_size // 2
store_handler, common_world = self.create_common_world(
comm_rank=comm_rank, comm_size=comm_size, tmpdir=tmpdir
)
net = core.Net("barrier_or_close")
if not closer:
net.Barrier(
[common_world],
[],
engine=op_engine)
else:
net.DestroyCommonWorld(
[common_world], [common_world], engine=op_engine)
# Sleep a bit to ensure others start the barrier
import time
time.sleep(0.1)
workspace.CreateNet(net)
workspace.RunNet(net.Name())
@given(comm_size=st.integers(min_value=2, max_value=8),
device_option=st.sampled_from([hu.cpu_do]))
@settings(deadline=10000)
def test_close_connection(self, comm_size, device_option):
import time
start_time = time.time()
TestCase.test_counter += 1
if os.getenv('COMM_RANK') is not None:
self.run_test_distributed(
self._test_close_connection,
device_option=device_option)
else:
with TemporaryDirectory() as tmpdir:
self.run_test_locally(
self._test_close_connection,
comm_size=comm_size,
device_option=device_option,
tmpdir=tmpdir)
# Check that test finishes quickly because connections get closed.
# This assert used to check that the end to end runtime was less
# than 2 seconds, but this may not always be the case if there
# is significant overhead in starting processes. Ideally, this
# assert is replaced by one that doesn't depend on time but rather
# checks the success/failure status of the barrier that is run.
self.assertLess(time.time() - start_time, 20.0)
def _test_io_error(
self,
comm_rank=None,
comm_size=None,
tmpdir=None,
):
'''
Only one node will participate in allreduce, resulting in an IoError
'''
store_handler, common_world = self.create_common_world(
comm_rank=comm_rank,
comm_size=comm_size,
tmpdir=tmpdir)
if comm_rank == 0:
blob_size = 1000
num_blobs = 1
blobs = []
for i in range(num_blobs):
blob = "blob_{}".format(i)
value = np.full(
blob_size, (comm_rank * num_blobs) + i, np.float32
)
workspace.FeedBlob(blob, value)
blobs.append(blob)
net = core.Net("allreduce")
net.Allreduce(
[common_world] + blobs,
blobs,
engine=op_engine)
workspace.CreateNet(net)
workspace.RunNet(net.Name())
@given(comm_size=st.integers(min_value=2, max_value=8),
device_option=st.sampled_from([hu.cpu_do]))
@settings(deadline=10000)
def test_io_error(self, comm_size, device_option):
TestCase.test_counter += 1
with self.assertRaises(IoError):
if os.getenv('COMM_RANK') is not None:
self.run_test_distributed(
self._test_io_error,
device_option=device_option)
else:
with TemporaryDirectory() as tmpdir:
self.run_test_locally(
self._test_io_error,
comm_size=comm_size,
device_option=device_option,
tmpdir=tmpdir)
if __name__ == "__main__":
import unittest
unittest.main()

View File

@ -1,15 +0,0 @@
#include <gloo/common/error.h>
#include <pybind11/pybind11.h>
namespace gloo {
namespace python {
namespace py = pybind11;
PYBIND11_MODULE(python, m) {
m.doc() = "Python interface for Gloo";
py::register_exception<IoException>(m, "IoError");
}
} // namespace python
} // namespace gloo

View File

@ -1,53 +0,0 @@
/**
* Copyright (c) 2018-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "reduce_scatter_ops.h"
#include <gloo/reduce_scatter.h>
#include <gloo/types.h>
namespace caffe2 {
namespace gloo {
template <class Context>
void ReduceScatterOp<Context>::initializeHalvingDoubling() {
if (init_.template IsType<float>()) {
algorithm_.reset(new ::gloo::ReduceScatterHalvingDoubling<float>(
init_.context,
init_.template getOutputs<float>(),
init_.size,
recvCounts_));
} else if (init_.template IsType<::at::Half>()) {
algorithm_.reset(new ::gloo::ReduceScatterHalvingDoubling<::gloo::float16>(
init_.context,
init_.template getOutputs<::gloo::float16>(),
init_.size,
recvCounts_));
} else {
CAFFE_ENFORCE(false, "Unhandled type: ", init_.meta.name());
}
}
namespace {
REGISTER_CPU_OPERATOR_WITH_ENGINE(
ReduceScatter,
GLOO,
ReduceScatterOp<CPUContext>);
} // namespace
} // namespace gloo
} // namespace caffe2

View File

@ -1,131 +0,0 @@
/**
* Copyright (c) 2018-present, Facebook, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <algorithm>
#include "caffe2/contrib/gloo/common.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#include <gloo/algorithm.h>
#include <gloo/common/error.h>
#include <gloo/context.h>
namespace caffe2 {
namespace gloo {
template <class Context>
class ReduceScatterOp final : public Operator<Context> {
public:
USE_OPERATOR_CONTEXT_FUNCTIONS;
ReduceScatterOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<Context>(operator_def, ws),
ws_(ws),
status_blob_(
OperatorBase::GetSingleArgument<std::string>("status_blob", "")) {
if (status_blob_ != "") {
ws_->CreateBlob(status_blob_);
}
}
~ReduceScatterOp() override {}
bool RunOnDevice() override {
std::call_once(once_, [&] { initialize(); });
// If any parameter has changed in between runs, the initialized
// algorithm is invalid and cannot be used.
update(current_);
CAFFE_ENFORCE(current_ == init_, "Inputs/outputs have changed");
try {
algorithm_->run();
} catch (::gloo::IoException& ioe) {
LOG(ERROR) << "Caught gloo IO exception: " << ioe.what();
if (status_blob_ != "") {
signalFailure(ws_->GetBlob(status_blob_), ioe);
return false;
} else {
throw;
}
}
return true;
}
protected:
void initialize() {
// Store which inputs/outputs this instance initialized with
update(init_);
// Verify inputs == outputs
CAFFE_ENFORCE_EQ(init_.inputs.size(), init_.outputs.size());
for (const auto i : c10::irange(init_.inputs.size())) {
CAFFE_ENFORCE_EQ(init_.inputs[i], init_.outputs[i]);
}
// Verify tensors all have same size
size_t size = Input(1).numel();
for (auto i = 2; i < InputSize() - 1; i++) {
CAFFE_ENFORCE_EQ(Input(i).numel(), size);
}
// Verify tensors all have same type
TypeMeta meta = Input(1).dtype();
for (auto i = 2; i < InputSize() - 1; i++) {
CAFFE_ENFORCE(Input(i).dtype() == meta);
}
initializeHalvingDoubling();
}
void initializeHalvingDoubling();
std::once_flag once_;
std::unique_ptr<::gloo::Algorithm> algorithm_;
// Captures the parameters passed to Gloo when first initialized.
// An instance is updated every time this op runs and is compared
// to the reference instance for equality. If any parameter has
// changed from run to run, the initialized algorithm is invalid.
void update(GlooParameters& params) {
params.context = OperatorBase::Input<std::shared_ptr<::gloo::Context>>(0);
params.inputs.resize(InputSize() - 2);
params.outputs.resize(OutputSize() - 1);
for (const auto i : c10::irange(params.inputs.size())) {
params.inputs[i] = Input(i + 1).raw_data();
params.outputs[i] = Output(i)->raw_mutable_data();
}
params.size = Output(0)->numel();
params.meta = Output(0)->dtype();
// Verify recvCountsSize == comm_size
CAFFE_ENFORCE_EQ(Input(InputSize() - 1).numel(), params.context->size);
int* recvCounts = (int*)Input(InputSize() - 1).raw_data();
recvCounts_.assign(recvCounts, recvCounts + Input(InputSize() - 1).numel());
}
GlooParameters init_;
GlooParameters current_;
Workspace* ws_;
std::string status_blob_;
std::vector<int> recvCounts_;
};
} // namespace gloo
} // namespace caffe2

View File

@ -1,25 +0,0 @@
#include "store_handler.h"
namespace caffe2 {
namespace gloo {
void StoreHandlerWrapper::set(
const std::string& key,
const std::vector<char>& data) {
std::string stringValue(data.data(), data.size());
handler_.set(key, stringValue);
}
std::vector<char> StoreHandlerWrapper::get(const std::string& key) {
std::string str = handler_.get(key);
return std::vector<char>(str.begin(), str.end());
}
void StoreHandlerWrapper::wait(
const std::vector<std::string>& keys,
const std::chrono::milliseconds& timeout) {
handler_.wait(keys, timeout);
}
} // namespace gloo
} // namespace caffe2

View File

@ -1,35 +0,0 @@
#pragma once
#include "caffe2/core/common.h"
#include "caffe2/distributed/store_handler.h"
#include <gloo/rendezvous/store.h>
namespace caffe2 {
namespace gloo {
class TORCH_API StoreHandlerWrapper : public ::gloo::rendezvous::Store {
public:
explicit StoreHandlerWrapper(StoreHandler& handler) : handler_(handler) {}
virtual ~StoreHandlerWrapper() override {}
virtual void set(const std::string& key, const std::vector<char>& data)
override;
std::vector<char> get(const std::string& key) override;
void wait(const std::vector<std::string>& keys) override {
wait(keys, ::gloo::rendezvous::Store::kDefaultTimeout);
}
virtual void wait(
const std::vector<std::string>& keys,
const std::chrono::milliseconds& timeout) override;
protected:
StoreHandler& handler_;
};
} // namespace gloo
} // namespace caffe2

View File

@ -1,20 +0,0 @@
if(USE_MKLDNN)
message(STATUS "Including IDEEP operators")
# ---[ CPU files.
file(GLOB_RECURSE tmp *.cc)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp})
  # exclude test files
file(GLOB_RECURSE tmp *_test.cc)
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp})
# ---[ CPU test files - currently none but just to be safe
file(GLOB_RECURSE tmp *_test.cc)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp})
# ---[ Send the lists to the parent scope.
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
else()
message(STATUS "Excluding ideep operators as we are not using ideep")
endif()

View File

@ -1,25 +0,0 @@
if(USE_NCCL)
if(USE_CUDA)
message(STATUS "Include NCCL operators")
set(Caffe2_CONTRIB_NCCL_GPU_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/cuda_nccl_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/cuda_nccl_op_gpu.cc"
)
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} ${Caffe2_CONTRIB_NCCL_GPU_SRC})
set(Caffe2_GPU_SRCS ${Caffe2_GPU_SRCS} PARENT_SCOPE)
endif(USE_CUDA)
if(USE_ROCM)
message(STATUS "Include AMD RCCL operators")
set(Caffe2_CONTRIB_NCCL_HIP_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/hip/hip_nccl_gpu.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/hip/hip_nccl_op_gpu.cc"
)
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} ${Caffe2_CONTRIB_NCCL_HIP_SRC})
set(Caffe2_HIP_SRCS ${Caffe2_HIP_SRCS} PARENT_SCOPE)
endif(USE_ROCM)
else()
message(STATUS "NCCL operators skipped due to no CUDA support")
endif()

View File

@ -1,322 +0,0 @@
#include "caffe2/contrib/nccl/cuda_nccl_gpu.h"
namespace caffe2 {
namespace nccl {
namespace {
std::vector<int> getDevices(const NCCLExecution& ex) {
std::vector<int> result;
result.reserve(ex.elements.size());
for (const auto& el : ex.elements) {
result.push_back(el.device);
}
return result;
}
class NCCLContext {
public:
explicit NCCLContext(const NCCLExecution& ex)
: devices_(getDevices(ex)), master_gpu_id_(ex.stream_gpu_id) {
comms_.resize(devices_.size());
CAFFE_NCCL_CHECK(
ncclCommInitAll(comms_.data(), devices_.size(), devices_.data()));
streams_.resize(devices_.size());
events_.resize(devices_.size());
for (auto i = 0U; i < devices_.size(); ++i) {
CUDAGuard g(devices_[i]);
// get stream priorities
int lo_pri, hi_pri;
CUDA_ENFORCE(cudaDeviceGetStreamPriorityRange(&lo_pri, &hi_pri));
CUDA_ENFORCE(cudaStreamCreateWithPriority(
&streams_[i], cudaStreamNonBlocking, hi_pri));
CUDA_ENFORCE(cudaEventCreateWithFlags(
&events_[i], cudaEventDefault | cudaEventDisableTiming));
}
CUDAGuard g(master_gpu_id_);
CUDA_ENFORCE(cudaEventCreateWithFlags(
&master_event_, cudaEventDefault | cudaEventDisableTiming));
}
~NCCLContext() {
for (auto i = 0U; i < devices_.size(); ++i) {
CUDAGuard g(devices_[i]);
CUDA_ENFORCE(cudaStreamDestroy(streams_[i]));
CUDA_ENFORCE(cudaEventDestroy(events_[i]));
}
CUDAGuard g(master_gpu_id_);
CUDA_ENFORCE(cudaEventDestroy(master_event_));
for (auto& comm : comms_) {
ncclCommDestroy(comm);
}
}
std::vector<int> devices_;
std::vector<ncclComm_t> comms_;
std::vector<cudaStream_t> streams_;
int master_gpu_id_;
cudaEvent_t master_event_;
std::vector<cudaEvent_t> events_;
C10_DISABLE_COPY_AND_ASSIGN(NCCLContext);
};
// We share the contexts across multiple operators, hence the cache.
static std::mutex& gContextsMutex() {
static std::mutex m;
return m;
}
std::unordered_map<std::string, std::unique_ptr<NCCLContext>>& gContexts() {
static std::unordered_map<std::string, std::unique_ptr<NCCLContext>> m;
return m;
}
std::string ncclKey(const NCCLExecution& ex) {
std::string result;
int curr_device;
CUDA_CHECK(cudaGetDevice(&curr_device));
result += to_string(curr_device) + ":";
for (const auto& el : ex.elements) {
result += to_string(el.device) + ",";
}
return result;
}
NCCLContext* getNCCLContext(const NCCLExecution& ex) {
auto& contexts = gContexts();
const auto key = ncclKey(ex);
if (!contexts[key]) {
LOG(INFO) << "Creating NCCLContext for key: " << key;
contexts[key].reset(new NCCLContext(ex));
}
return TORCH_CHECK_NOTNULL(contexts[key].get());
}
template <typename T>
class ncclTypeWrapper;
template <>
class ncclTypeWrapper<float> {
public:
static const ncclDataType_t type = ncclFloat;
};
template <>
class ncclTypeWrapper<int> {
public:
static const ncclDataType_t type = ncclInt;
};
#ifdef CAFFE_HAS_CUDA_FP16
template <>
class ncclTypeWrapper<at::Half> {
public:
static const ncclDataType_t type = ncclHalf;
};
#endif
template <typename T, typename InitF, typename F>
void runNCCL(const NCCLExecution& ex, InitF&& init_f, F&& f) {
// do initialization
for (auto i = 0U; i < ex.elements.size(); ++i) {
auto& ctx = ex.elements[i];
CUDAGuard g(ctx.device);
init_f(ex.elements[i]);
}
std::lock_guard<std::mutex> g(gContextsMutex());
auto* context = getNCCLContext(ex);
auto& comms = context->comms_;
auto& streams = context->streams_;
auto& events = context->events_;
// Record an event on the master context, wait on it in each of the
// children streams, so the children streams are synchronized WRT
// the original stream.
{
CUDAGuard g(ex.stream_gpu_id);
CUDA_ENFORCE(cudaEventRecord(context->master_event_, ex.stream));
}
{
// lock out alloc / free while NCCL launches
std::lock_guard<std::mutex> lock(CUDAContext::mutex());
#if NCCL_VERSION_MIN(2, 0, 0)
CAFFE_NCCL_CHECK(ncclGroupStart());
#endif
for (auto i = 0U; i < ex.elements.size(); ++i) {
auto& ctx = ex.elements[i];
CUDAGuard g(ctx.device);
auto& comm = comms[i];
auto& stream = streams[i];
TORCH_DCHECK_EQ(ctx.device, GetGPUIDForPointer(ctx.src->raw_data()));
CUDA_ENFORCE(cudaStreamWaitEvent(stream, context->master_event_, 0));
f(ctx, comm, stream);
}
#if NCCL_VERSION_MIN(2, 0, 0)
CAFFE_NCCL_CHECK(ncclGroupEnd());
#endif
for (auto i = 0U; i < ex.elements.size(); ++i) {
auto& ctx = ex.elements[i];
CUDAGuard g(ctx.device);
auto& stream = streams[i];
auto& event = events[i];
// Record an event on each children stream that we have finished
// our computation
CUDA_ENFORCE(cudaEventRecord(event, stream));
}
}
// Now, wait on all the events in the original stream.
CUDAGuard dg(ex.stream_gpu_id);
for (auto& event : events) {
CUDA_ENFORCE(cudaStreamWaitEvent(TORCH_CHECK_NOTNULL(ex.stream), event, 0));
}
}
} // namespace
void destroyContexts() {
std::lock_guard<std::mutex> g(gContextsMutex());
auto& contexts = gContexts();
contexts.clear();
}
template <typename T>
void NCCL<T>::AllReduce(const NCCLExecution& ex) {
return runNCCL<T>(
ex,
[](const NCCLElement& ctx) {
ctx.dst->Resize(ctx.src->sizes());
ctx.dst->template mutable_data<T>();
},
[](const NCCLElement& ctx, ncclComm_t comm, cudaStream_t stream) {
CAFFE_NCCL_CHECK(ncclAllReduce(
ctx.src->raw_data(),
ctx.dst->raw_mutable_data(),
ctx.dst->numel(),
ncclTypeWrapper<T>::type,
ncclSum,
comm,
stream));
});
}
template <typename T>
void NCCL<T>::Broadcast(const NCCLExecution& ex) {
return runNCCL<T>(
ex,
[](const NCCLElement& ctx) {
ctx.dst->Resize(ctx.src->sizes());
ctx.dst->template mutable_data<T>();
},
[&ex](const NCCLElement& ctx, ncclComm_t comm, cudaStream_t stream) {
CAFFE_NCCL_CHECK(ncclBcast(
ctx.dst->raw_mutable_data(),
ctx.dst->numel(),
ncclTypeWrapper<T>::type,
ex.root,
comm,
stream));
});
}
template <typename T>
void NCCL<T>::Reduce(const NCCLExecution& ex) {
return runNCCL<T>(
ex,
[](const NCCLElement& ctx) {
if (ctx.dst) {
ctx.dst->Resize(ctx.src->sizes());
ctx.dst->template mutable_data<T>();
}
},
[&ex](const NCCLElement& ctx, ncclComm_t comm, cudaStream_t stream) {
CAFFE_NCCL_CHECK(ncclReduce(
ctx.src->raw_data(),
ctx.dst ? ctx.dst->raw_mutable_data() : nullptr,
ctx.src->numel(),
ncclTypeWrapper<T>::type,
ncclSum,
ex.root,
comm,
stream));
});
}
template <typename T>
void NCCL<T>::AllGather(const NCCLExecution& ex) {
const auto n = ex.elements.size();
return runNCCL<T>(
ex,
[n](const NCCLElement& ctx) {
CAFFE_ENFORCE_NE(ctx.src, ctx.dst);
std::vector<int64_t> dims;
dims.reserve(ctx.src->dim() + 1);
dims.push_back(n);
for (auto d : ctx.src->sizes()) {
dims.push_back(d);
}
ctx.dst->Resize(dims);
ctx.dst->template mutable_data<T>();
},
[](const NCCLElement& ctx, ncclComm_t comm, cudaStream_t stream) {
#if NCCL_VERSION_MIN(2, 0, 0)
CAFFE_NCCL_CHECK(ncclAllGather(
ctx.src->raw_data(),
ctx.dst->raw_mutable_data(),
ctx.src->numel(),
ncclTypeWrapper<T>::type,
comm,
stream));
#else
CAFFE_NCCL_CHECK(ncclAllGather(
ctx.src->raw_data(),
ctx.src->size(),
ncclTypeWrapper<T>::type,
ctx.dst->raw_mutable_data(),
comm,
stream));
#endif
});
}
template <typename T>
void NCCL<T>::ReduceScatter(const NCCLExecution& ex) {
return runNCCL<T>(
ex,
[](const NCCLElement& ctx) {
CAFFE_ENFORCE_NE(ctx.src, ctx.dst);
const auto& srcDims = ctx.src->sizes();
std::vector<int64_t> dstDims(srcDims.begin() + 1, srcDims.end());
ctx.dst->Resize(dstDims);
ctx.dst->template mutable_data<T>();
},
[](const NCCLElement& ctx, ncclComm_t comm, cudaStream_t stream) {
CAFFE_NCCL_CHECK(ncclReduceScatter(
ctx.src->raw_data(),
ctx.dst->raw_mutable_data(),
ctx.dst->numel(),
ncclTypeWrapper<T>::type,
ncclSum,
comm,
stream));
});
}
// Explicit instantiation
template class NCCL<float>;
template class NCCL<int>;
#ifdef CAFFE_HAS_CUDA_FP16
template class NCCL<at::Half>;
#endif
} // namespace nccl
} // namespace caffe2

View File

@ -1,63 +0,0 @@
#pragma once
#include <cstddef>
#include "caffe2/core/common_gpu.h"
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/logging.h"
#include <nccl.h>
#include <unordered_map>
#define NCCL_VERSION_MIN(major, minor, patch) \
((NCCL_MAJOR > major) || \
((NCCL_MAJOR == major) && \
((NCCL_MINOR > minor) || \
((NCCL_MINOR == minor) && (NCCL_PATCH >= patch)))))
namespace caffe2 {
namespace nccl {
#define CAFFE_NCCL_CHECK(condition) \
do { \
ncclResult_t status = (condition); \
CAFFE_ENFORCE_EQ( \
status, \
ncclSuccess, \
" ", \
"Error at: ", \
__FILE__, \
__LINE__, \
": ", \
ncclGetErrorString(status)); \
} while (0)
struct NCCLElement {
const TensorCUDA* src{nullptr};
TensorCUDA* dst{nullptr};
int device{0};
};
struct NCCLExecution {
int stream_gpu_id{0};
cudaStream_t stream{nullptr};
std::vector<NCCLElement> elements;
size_t root{0};
};
// Called when the last NCCL op is destructed and all lazily created
// NCCLContext instances can safely be destroyed.
void destroyContexts();
template <typename T>
class NCCL {
public:
static void AllReduce(const NCCLExecution& ex);
static void Broadcast(const NCCLExecution& ex);
static void Reduce(const NCCLExecution& ex);
static void AllGather(const NCCLExecution& ex);
static void ReduceScatter(const NCCLExecution& ex);
};
} // namespace nccl
} // namespace caffe2
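
A hedged sketch of how CAFFE_NCCL_CHECK and NCCL_VERSION_MIN are meant to be used around raw NCCL calls, assuming the NCCL headers and at least two visible GPUs; initTwoGpuComms and the device list are illustrative:

#include "caffe2/contrib/nccl/cuda_nccl_gpu.h"

// Sketch only: initialize communicators for two devices and guard a grouped
// call with the version macro, much like the implementation file does.
void initTwoGpuComms() {
  int devices[2] = {0, 1};
  ncclComm_t comms[2];
  CAFFE_NCCL_CHECK(ncclCommInitAll(comms, 2, devices));  // throws via CAFFE_ENFORCE_EQ on error
#if NCCL_VERSION_MIN(2, 0, 0)
  CAFFE_NCCL_CHECK(ncclGroupStart());
  // ... enqueue per-device collectives here ...
  CAFFE_NCCL_CHECK(ncclGroupEnd());
#endif
  for (auto& comm : comms) {
    ncclCommDestroy(comm);
  }
}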

View File

@ -1,275 +0,0 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/core/operator.h"
#include "caffe2/contrib/nccl/cuda_nccl_gpu.h"
namespace caffe2 {
nccl::NCCLExecution getNCCLElements(
OperatorBase* op,
const CUDAContext& context) {
// We either do an N-N op, or an N-1 op.
CAFFE_ENFORCE(op->InputSize() == op->OutputSize() || op->OutputSize() == 1);
nccl::NCCLExecution ex;
ex.stream_gpu_id = context.device_id();
ex.stream = context.cuda_stream();
ex.root = op->template GetSingleArgument<int>("root", 0);
ex.elements.resize(op->InputSize());
for (auto i = 0; i < op->InputSize(); ++i) {
auto& el = ex.elements[i];
el.src = &(op->Input<Tensor>(i, CUDA));
if (op->OutputSize() == 1) {
// Reduce op
if (i == ex.root) {
el.dst = op->Output<Tensor>(0, CUDA);
}
} else if (i < op->OutputSize()) {
el.dst = op->Output<Tensor>(i, CUDA);
}
// TODO - expensive (>1ms) - cache these.
el.device = GetGPUIDForPointer(op->Input<Tensor>(i, CUDA).raw_data());
}
return ex;
}
namespace {
// Check if all inputs are of type T
template <typename T>
bool AllInputsAre(OperatorBase* op) {
for (auto i = 0; i < op->InputSize(); ++i) {
if (op->Input<Tensor>(i, CUDA).IsType<T>()) {
continue;
} else {
return false;
}
}
return true;
}
// Manual count of all instantiated NCCL ops.
// If this drops to zero after destructing the last NCCL op,
// it means we can safely destroy all lazily created NCCL contexts.
std::atomic<int> kNCCLOpCounter(0);
}; // namespace
class NCCLBaseOp : public Operator<CUDAContext> {
public:
using Operator::Operator;
NCCLBaseOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CUDAContext>(operator_def, ws) {
kNCCLOpCounter++;
}
~NCCLBaseOp() {
if (--kNCCLOpCounter == 0) {
nccl::destroyContexts();
}
}
};
class NCCLAllreduceOp final : public NCCLBaseOp {
public:
using NCCLBaseOp::NCCLBaseOp;
bool RunOnDevice() override {
if (InputSize() == 1)
return true;
if (AllInputsAre<float>(this)) {
nccl::NCCL<float>::AllReduce(getNCCLElements(this, context_));
return true;
} else if (AllInputsAre<at::Half>(this)) {
nccl::NCCL<at::Half>::AllReduce(getNCCLElements(this, context_));
return true;
} else {
return false;
}
}
static std::vector<TensorShape> ShapeInference(
const OperatorDef& def,
const std::vector<TensorShape>& in) {
auto n_outputs = def.output_size();
CAFFE_ENFORCE(
n_outputs == 1 || n_outputs == in.size(),
"NCCLAllreduce only supports N-1 or N-N reductions");
for (auto i = 0; i < in.size(); i++) {
CAFFE_ENFORCE(
in[0].dims_size() == in[i].dims_size(),
"NCCLAllreduce requires inputs of same dimension");
for (auto j = 0; j < in[0].dims_size(); j++) {
CAFFE_ENFORCE(
in[0].dims(j) == in[i].dims(j),
"NCCLAllreduce requires inputs to be of same shape");
}
}
std::vector<TensorShape> out(n_outputs);
for (auto i = 0; i < out.size(); i++) {
out[i] = in[0];
}
return out;
}
static struct OpSchema::Cost CostInference(
const OperatorDef& def,
const vector<TensorShape>& inputs) {
    CAFFE_ENFORCE_GE(inputs.size(), 1, "NCCLAllreduce requires at least 1 input");
const TensorShape X0 = inputs[0];
const auto nElem = nElemFromDim(inputs[0]);
struct OpSchema::Cost c;
c.flops = (inputs.size() - 1) * nElem;
c.bytes_read = inputs.size() * nElem;
c.bytes_written = def.output_size() * nElem;
c.params_bytes = 0;
return c;
}
};
class NCCLBroadcastOp final : public NCCLBaseOp {
public:
using NCCLBaseOp::NCCLBaseOp;
bool RunOnDevice() override {
if (InputSize() == 1)
return true;
if (AllInputsAre<float>(this)) {
nccl::NCCL<float>::Broadcast(getNCCLElements(this, context_));
return true;
} else if (AllInputsAre<at::Half>(this)) {
nccl::NCCL<at::Half>::Broadcast(getNCCLElements(this, context_));
return true;
} else {
return false;
}
}
};
class NCCLReduceOp final : public NCCLBaseOp {
public:
using NCCLBaseOp::NCCLBaseOp;
bool RunOnDevice() override {
if (InputSize() == 1)
return true;
const auto& ex = getNCCLElements(this, context_);
if (AllInputsAre<float>(this)) {
nccl::NCCL<float>::Reduce(ex);
return true;
} else if (AllInputsAre<at::Half>(this)) {
nccl::NCCL<at::Half>::Reduce(ex);
return true;
} else {
return false;
}
}
};
class NCCLAllGatherOp final : public NCCLBaseOp {
public:
using NCCLBaseOp::NCCLBaseOp;
bool RunOnDevice() override {
if (InputSize() == 1)
return true;
if (AllInputsAre<float>(this)) {
nccl::NCCL<float>::AllGather(getNCCLElements(this, context_));
return true;
} else if (AllInputsAre<at::Half>(this)) {
nccl::NCCL<at::Half>::AllGather(getNCCLElements(this, context_));
return true;
} else {
return false;
}
}
};
class NCCLReduceScatterOp final : public NCCLBaseOp {
public:
using NCCLBaseOp::NCCLBaseOp;
bool RunOnDevice() override {
if (AllInputsAre<float>(this)) {
nccl::NCCL<float>::ReduceScatter(getNCCLElements(this, context_));
return true;
} else if (AllInputsAre<at::Half>(this)) {
nccl::NCCL<at::Half>::ReduceScatter(getNCCLElements(this, context_));
return true;
} else {
return false;
}
}
};
namespace {
std::pair<std::vector<DeviceOption>, std::vector<DeviceOption>> ncclOpDevInfer(
const OperatorDef& def) {
std::vector<DeviceOption> opt;
for (int i = 0; i < def.input().size(); ++i) {
DeviceOption dev;
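    // Device type 1 is CUDA (PROTO_CUDA in caffe2.proto); input i is pinned to GPU i.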
dev.set_device_type(1);
dev.set_device_id(i);
opt.push_back(dev);
}
return std::make_pair(opt, opt);
}
REGISTER_CUDA_OPERATOR(NCCLAllreduce, NCCLAllreduceOp);
OPERATOR_SCHEMA(NCCLAllreduce)
.NumInputs(1, C10_COMPILE_TIME_MAX_GPUS)
.NumOutputs(1, C10_COMPILE_TIME_MAX_GPUS)
.CostInferenceFunction(NCCLAllreduceOp::CostInference)
.TensorInferenceFunction(NCCLAllreduceOp::ShapeInference)
.IdenticalTypeAndShape()
.InputsCanCrossDevices()
.AllowOneToOneInplace()
.DeviceInferenceFunction(ncclOpDevInfer);
SHOULD_NOT_DO_GRADIENT(NCCLAllreduce);
REGISTER_CUDA_OPERATOR(NCCLBroadcast, NCCLBroadcastOp);
OPERATOR_SCHEMA(NCCLBroadcast)
.NumInputs(1, C10_COMPILE_TIME_MAX_GPUS)
.NumOutputs(1, C10_COMPILE_TIME_MAX_GPUS)
.IdenticalTypeAndShape()
.InputsCanCrossDevices()
.EnforceOneToOneInplace()
.DeviceInferenceFunction(ncclOpDevInfer);
SHOULD_NOT_DO_GRADIENT(NCCLBroadcast);
REGISTER_CUDA_OPERATOR(NCCLReduce, NCCLReduceOp);
OPERATOR_SCHEMA(NCCLReduce)
.NumInputs(1, C10_COMPILE_TIME_MAX_GPUS)
.NumOutputs(1)
.IdenticalTypeAndShapeOfInput(0)
.InputsCanCrossDevices()
.AllowInplace([](int /*in*/, int out) -> bool { return (out == 0); })
.DeviceInferenceFunction(ncclOpDevInfer);
SHOULD_NOT_DO_GRADIENT(NCCLReduce);
REGISTER_CUDA_OPERATOR(NCCLAllGather, NCCLAllGatherOp);
OPERATOR_SCHEMA(NCCLAllGather)
.NumInputs(1, C10_COMPILE_TIME_MAX_GPUS)
.NumOutputs(1, C10_COMPILE_TIME_MAX_GPUS)
.InputsCanCrossDevices()
.DeviceInferenceFunction(ncclOpDevInfer);
SHOULD_NOT_DO_GRADIENT(NCCLAllGather);
REGISTER_CUDA_OPERATOR(NCCLReduceScatter, NCCLReduceScatterOp);
OPERATOR_SCHEMA(NCCLReduceScatter)
.NumInputs(1, C10_COMPILE_TIME_MAX_GPUS)
.NumOutputs(1, C10_COMPILE_TIME_MAX_GPUS)
.InputsCanCrossDevices()
.DeviceInferenceFunction(ncclOpDevInfer);
SHOULD_NOT_DO_GRADIENT(NCCLReduceScatter);
} // namespace
} // namespace caffe2
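For reference, the operators deleted above were driven from the caffe2 Python front end, and the ncclOpDevInfer hook registered above pins input/output i to CUDA device i, which is why callers fed each blob from a matching GPU. Below is a minimal sketch of an in-place NCCLAllreduce across two GPUs, distilled from the test file that follows; the blob names and tensor size are illustrative, and it assumes a caffe2 build that still ships caffe2/contrib/nccl plus at least two CUDA devices.

import numpy as np
from caffe2.proto import caffe2_pb2
from caffe2.python import core, dyndep, workspace

# Load the (now removed) NCCL operator library.
dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/nccl:nccl_ops")

def gpu_device(i):
    # DeviceOption pinning a blob or op to CUDA device i.
    opt = caffe2_pb2.DeviceOption()
    opt.device_type = workspace.GpuDeviceType
    opt.device_id = i
    return opt

inputs = ["x_0", "x_1"]
for i, name in enumerate(inputs):
    # Each input lives on its own GPU, matching ncclOpDevInfer above.
    workspace.FeedBlob(name, np.random.randn(16).astype(np.float32),
                       device_option=gpu_device(i))

# NCCLAllreduce is registered for CUDA only, so the op itself also carries a
# CUDA device option; running it in place leaves the elementwise sum in every x_i.
op = core.CreateOperator("NCCLAllreduce", inputs, inputs,
                         device_option=gpu_device(0))
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("x_0"))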

View File

@ -1,192 +0,0 @@
import unittest
import hypothesis.strategies as st
from hypothesis import given, assume
import numpy as np
import time
import os
from caffe2.proto import caffe2_pb2
from caffe2.python import core, workspace, muji, dyndep
import caffe2.python.hypothesis_test_util as hu
np.random.seed(1)
dyndep.InitOpsLibrary('@/caffe2/caffe2/contrib/nccl:nccl_ops')
def gpu_device(i):
device_option = caffe2_pb2.DeviceOption()
device_option.device_type = workspace.GpuDeviceType
device_option.device_id = i
return device_option
def benchmark(ws, net, warmups=5, iters=100):
for _ in range(warmups):
ws.run(net)
plan = core.Plan("plan")
plan.AddStep(core.ExecutionStep("test-step", net, iters))
before = time.time()
ws.run(plan)
after = time.time()
print("Timing network, time taken per-iteration: {:.6f}ms".format((
after - before) / float(iters) * 1000.0))
return after - before
@unittest.skipIf(not workspace.has_cuda_support, "NCCL only on CUDA GPU")
class NCCLOpsTest(hu.HypothesisTestCase):
@given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
m=st.integers(min_value=1, max_value=1000),
in_place=st.booleans())
def test_nccl_allreduce(self, n, m, in_place):
xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
inputs = [str("x_{}".format(i)) for i in range(n)]
prefix = "" if in_place else "o"
outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]
op = core.CreateOperator("NCCLAllreduce", inputs, outputs)
input_device_options = {name: gpu_device(i) for i, name in enumerate(inputs)}
def allreduce(*args):
assert len(args) == n
output = np.sum(args, axis=0)
return [output for _ in range(n)]
outputs = self.assertReferenceChecks(
hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)],
allreduce, input_device_options)
for output in outputs:
np.testing.assert_array_equal(outputs[0], output)
self.assertEqual(outputs[0].tobytes(), output.tobytes())
@given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
m=st.integers(min_value=1, max_value=1000),
root=st.integers(min_value=0,
max_value=workspace.NumGpuDevices() - 1))
def test_nccl_broadcast(self, n, m, root):
assume(root < n)
xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
inputs = [str("x_{}".format(i)) for i in range(n)]
op = core.CreateOperator("NCCLBroadcast", inputs, inputs, root=root)
input_device_options = {name: gpu_device(i) for i, name in enumerate(inputs)}
def broadcast(*args):
assert len(args) == n
return [args[root] for _ in range(n)]
self.assertReferenceChecks(
hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)],
broadcast, input_device_options)
@given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
m=st.integers(min_value=1, max_value=1000),
# NCCL Reduce seems to deadlock for non-zero roots.
root=st.integers(min_value=0, max_value=0),
in_place=st.booleans())
def test_nccl_reduce(self, n, m, root, in_place):
assume(in_place is False or root == 0)
xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
inputs = [str("x_{}".format(i)) for i in range(n)]
op = core.CreateOperator(
"NCCLReduce", inputs,
inputs[root] if in_place else b"o", root=root)
input_device_options = {name: gpu_device(i) for i, name in enumerate(inputs)}
def reduce(*args):
assert len(args) == n
return [np.sum(args, axis=0)]
self.assertReferenceChecks(
hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)],
reduce, input_device_options)
@given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
m=st.integers(min_value=1, max_value=1000))
def test_nccl_allgather(self, n, m):
xs = [np.random.randn(m).astype(np.float32) for i in range(n)]
inputs = [str("x_{}".format(i)) for i in range(n)]
outputs = [str("o_{}".format(i)) for i in range(n)]
op = core.CreateOperator("NCCLAllGather", inputs, outputs)
input_device_options = {name: gpu_device(i) for i, name in enumerate(inputs)}
def allgather(*args):
assert len(args) == n
return [np.stack(args, axis=0) for _ in range(n)]
outputs = self.assertReferenceChecks(
hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)],
allgather, input_device_options)
for output in outputs:
np.testing.assert_array_equal(outputs[0], output)
self.assertEqual(outputs[0].tobytes(), output.tobytes())
@given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
m=st.integers(min_value=1, max_value=1000))
def test_nccl_reduce_scatter(self, n, m):
xs = [np.random.randn(n, m).astype(np.float32) for i in range(n)]
inputs = [str("x_{}".format(i)) for i in range(n)]
outputs = [str("o_{}".format(i)) for i in range(n)]
op = core.CreateOperator("NCCLReduceScatter", inputs, outputs)
input_device_options = {name: gpu_device(i) for i, name in enumerate(inputs)}
def reduce_scatter(*args):
assert len(args) == n
reduced = sum(args)
assert len(reduced.shape) > 1
ref = [reduced[i, :] for i in range(n)]
return ref
self.assertReferenceChecks(
hu.gpu_do, op, [xs[i] for i, _ in enumerate(inputs)],
reduce_scatter, input_device_options)
@given(n=st.integers(min_value=2, max_value=workspace.NumGpuDevices()),
m=st.integers(min_value=100000, max_value=100000),
iters=st.integers(min_value=1, max_value=100),
net_type=st.sampled_from(["dag", "async_dag", "simple"]))
def _test_nccl_sync(self, n, m, iters, net_type):
inputs = [str("x_{}".format(i)) for i in range(n)]
extra_inputs = [str("xe_{}".format(i)) for i in range(n)]
net = core.Net("asdf")
net.Proto().type = net_type
net.Proto().num_workers = n
for i in range(n):
net.ConstantFill([], inputs[i], shape=[m], value=0.0,
device_option=gpu_device(i))
net.ConstantFill([], extra_inputs[i], shape=[m], value=1.0,
device_option=gpu_device(i))
for _ in range(iters):
net.Sum([inputs[i], extra_inputs[i]], [inputs[i]],
device_option=gpu_device(i))
net.NCCLReduce(inputs, [inputs[0]], device_option=gpu_device(0))
self.ws.run(net)
np.testing.assert_array_equal(
self.ws.blobs[inputs[0]].fetch(),
np.full(shape=(m,), fill_value=iters * n, dtype=np.float32))
@unittest.skipIf(not os.environ.get("CAFFE2_BENCHMARK"), "Benchmark")
def test_timings(self):
for n in range(2, workspace.NumGpuDevices()):
for in_place in [False, True]:
xs = [np.random.randn(int(1e7)).astype(np.float32)
for i in range(n)]
inputs = [str("x_{}".format(i)) for i in range(n)]
prefix = "" if in_place else "o"
outputs = [str("{}x_{}".format(prefix, i)) for i in range(n)]
net = core.Net("test")
net.NCCLAllreduce(inputs, outputs)
net.RunAllOnGPU()
for i in range(n):
self.ws.create_blob(inputs[i]).feed(xs[i], gpu_device(i))
self.ws.run(net)
net_time = benchmark(self.ws, net)
vanilla = core.Net("vanilla")
muji.Allreduce(vanilla, inputs)
vanilla_time = benchmark(self.ws, vanilla)
print("Speedup for NCCL: {:.2f}".format(
vanilla_time / net_time))

View File

@ -1,352 +0,0 @@
#include "caffe2/core/common.h"
#ifdef CAFFE2_USE_MKL
#include <mkl.h>
#endif
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/operators/leaky_relu_op.h"
#include "caffe2/utils/cpuid.h"
#include "caffe2/utils/math.h"
#include "nnpack.h"
C10_DEFINE_int(
caffe2_nnpack_num_threads,
1,
"The number of nnpack pthreadpool threads.");
C10_DEFINE_bool(
caffe2_nnpack_use_mkl_num_threads,
true,
"If MKL is built, this sets nnpack to use the same number of threads as "
"MKL does. This overrides caffe2_nnpack_num_threads if set.");
namespace caffe2 {
////////////////////////////////////////////////////////////////////////////////
// Helper Functions
////////////////////////////////////////////////////////////////////////////////
namespace {
bool has_nnpack() {
// nnp_initialize is a noop after the first call so it's safe to invoke it
// repeatedly
auto nnpack_status = nnp_initialize();
return nnpack_status == nnp_status_success;
}
nnp_convolution_algorithm get_nnp_convolution_algorithm(
const std::string& algo) {
if (algo == "AUTO") {
return nnp_convolution_algorithm_auto;
}
if (algo == "WINOGRAD") {
return nnp_convolution_algorithm_wt8x8;
}
if (algo == "FT16") {
return nnp_convolution_algorithm_ft16x16;
}
if (algo == "FT8") {
return nnp_convolution_algorithm_ft8x8;
}
return nnp_convolution_algorithm_auto;
}
nnp_convolution_transform_strategy get_nnp_convolution_transform_strategy(
const std::string& kts) {
if (kts == "BLOCK") {
return nnp_convolution_transform_strategy_block_based;
}
if (kts == "TUPLE") {
return nnp_convolution_transform_strategy_tuple_based;
}
return nnp_convolution_transform_strategy_block_based;
}
////////////////////////////////////////////////////////////////////////////////
// Thread Pool
////////////////////////////////////////////////////////////////////////////////
static pthreadpool_t nnpack_threadpool_ = nullptr;
pthreadpool_t nnpack_threadpool() {
if (nnpack_threadpool_ == nullptr) {
enum nnp_status nnpack_status = nnp_initialize();
CAFFE_ENFORCE(
nnpack_status == nnp_status_success, "NNPack is not supported here!");
int num_threads = FLAGS_caffe2_nnpack_num_threads;
if (FLAGS_caffe2_nnpack_use_mkl_num_threads) {
#ifdef CAFFE2_USE_MKL
num_threads = mkl_get_max_threads();
#else
VLOG(1) << "I am asked to use MKL num of threads for NNPACK but this "
"Caffe2 is not built with MKL. Skipping.";
#endif
}
nnpack_threadpool_ = pthreadpool_create(num_threads);
}
return nnpack_threadpool_;
}
} // namespace
////////////////////////////////////////////////////////////////////////////////
// NNPACK Ops
////////////////////////////////////////////////////////////////////////////////
class NNPACKConvOp final : public ConvPoolOpBase<CPUContext> {
public:
NNPACKConvOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<CPUContext>(operator_def, ws),
algo_(get_nnp_convolution_algorithm(
OperatorBase::GetSingleArgument<std::string>("algo", "AUTO"))),
kts_(get_nnp_convolution_transform_strategy(
OperatorBase::GetSingleArgument<std::string>("kts", "TUPLE"))) {
OPERATOR_NEEDS_FEATURE(
this->order_ == StorageOrder::NCHW,
"NNPack only supports NCHW order. Please consider adding "
"TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
OPERATOR_NEEDS_FEATURE(
dilation_h() == 1 && dilation_w() == 1,
"The NNPack convolution does not support dilation yet.");
// NNPACK can be built with avx2 support only and might not be able to run
// on a given machine.
OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?");
}
bool RunOnDeviceWithOrderNCHW() override {
auto& X = Input(0);
auto& filter = Input(1);
auto& bias = Input(2);
auto* Y = Output(0);
const int N = X.dim32(0), C = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
const int M = filter.dim32(0);
CAFFE_ENFORCE(X.dim() == 4, "Input dim should be 4");
CAFFE_ENFORCE_EQ(filter.dim(), 4, "Filter dim should be 4");
CAFFE_ENFORCE(C % this->group_ == 0, "");
CAFFE_ENFORCE(M % this->group_ == 0, "");
CAFFE_ENFORCE(filter.dim32(1) == C / this->group_, "");
CAFFE_ENFORCE(filter.dim32(2) == this->kernel_h(), "");
CAFFE_ENFORCE(filter.dim32(3) == this->kernel_w(), "");
CAFFE_ENFORCE(bias.numel() == M, "");
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, filter.dim32(0));
const int oH = Y->dim32(2), oW = Y->dim32(3);
if (N > 1) {
CAFFE_ENFORCE_EQ(
this->stride_h(),
1,
"NNPack only supports stride = 1 when doing batch feedforward");
CAFFE_ENFORCE_EQ(
this->stride_w(),
1,
"NNPack only supports stride = 1 when doing batch feedforward");
}
std::vector<int> pads(
{this->pad_t(), this->pad_b(), this->pad_l(), this->pad_r()});
std::vector<int> stride({this->stride_h(), this->stride_w()});
const nnp_size input_size = {.width = static_cast<size_t>(X.dim32(3)),
.height = static_cast<size_t>(X.dim32(2))};
// filter is MCHW
const nnp_size kernel_size = {
.width = static_cast<size_t>(filter.dim32(3)),
.height = static_cast<size_t>(filter.dim32(2))};
// pad is tblr
const nnp_padding padding = {.top = static_cast<size_t>(pads[0]),
.right = static_cast<size_t>(pads[3]),
.bottom = static_cast<size_t>(pads[1]),
.left = static_cast<size_t>(pads[2])};
const nnp_size output_subsample = {
.width = static_cast<size_t>(stride[1]),
.height = static_cast<size_t>(stride[0])};
if (N == 1) {
VLOG(1) << "Running inference mode";
for (auto g = 0; g < group_; ++g) {
const auto status = nnp_convolution_inference(
algo_,
kts_,
C / group_,
M / group_,
input_size,
padding,
kernel_size,
output_subsample,
X.template data<float>() + g * H * W * (C / group_),
filter.template data<float>() + filter.numel() / group_ * g,
bias.template data<float>() + bias.numel() / group_ * g,
Y->template mutable_data<float>() + g * oH * oW * (M / group_),
nnpack_threadpool(),
nullptr);
CAFFE_ENFORCE(nnp_status_success == status, "");
}
} else {
VLOG(1) << "Running batched mode";
for (auto g = 0; g < group_; ++g) {
const auto status = nnp_convolution_output(
algo_,
N,
C / group_,
M / group_,
input_size,
padding,
kernel_size,
X.template data<float>() + g * H * W * (C / group_),
filter.template data<float>() + filter.numel() / group_ * g,
bias.template data<float>() + bias.numel() / group_ * g,
Y->template mutable_data<float>() + g * oH * oW * (M / group_),
nnpack_threadpool(),
nullptr);
CAFFE_ENFORCE(nnp_status_success == status, "");
}
}
return true;
}
private:
const nnp_convolution_algorithm algo_;
const nnp_convolution_transform_strategy kts_;
};
class NNPACKMaxPoolOp final : public ConvPoolOpBase<CPUContext> {
public:
NNPACKMaxPoolOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<CPUContext>(operator_def, ws) {
OPERATOR_NEEDS_FEATURE(
this->order_ == StorageOrder::NCHW,
"NNPack only supports NCHW order. Please consider add "
"TransposeOp with axes=[0, 3, 1, 2] before NNPack Conv.");
OPERATOR_NEEDS_FEATURE(
this->kernel_h() == 2, "NNPack only supports MaxPool kernel size 2*2!");
OPERATOR_NEEDS_FEATURE(
this->kernel_w() == 2, "NNPack only supports MaxPool kernel size 2*2!");
OPERATOR_NEEDS_FEATURE(
this->stride_h() == 2, "NNPack only supports MaxPool stride size 2*2!");
OPERATOR_NEEDS_FEATURE(
this->stride_w() == 2, "NNPack only supports MaxPool stride size 2*2!");
OPERATOR_NEEDS_FEATURE(
this->pad_t() == 0,
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
OPERATOR_NEEDS_FEATURE(
this->pad_l() == 0,
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
OPERATOR_NEEDS_FEATURE(
this->pad_r() == 0,
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
OPERATOR_NEEDS_FEATURE(
this->pad_b() == 0,
"NNPack Pooling differs from Caffe2 Pooling when pad > 0!");
// NNPACK can be built with avx2 support only and might not be able to run
// on a given machine.
OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?");
}
bool RunOnDeviceWithOrderNCHW() override {
auto& X = Input(0);
auto* Y = Output(0);
CAFFE_ENFORCE(X.dim() == 4, "Input dim should be 4");
const int H = X.dim32(2), W = X.dim32(3);
ConvPoolOpBase<CPUContext>::SetOutputSize(X, Y, X.dim32(1));
std::vector<int> pads(
{this->pad_t(), this->pad_b(), this->pad_l(), this->pad_r()});
std::vector<int> stride({this->stride_h(), this->stride_w()});
std::vector<int> pooling({this->kernel_h(), this->kernel_w()});
// Input X is in NCHW order
const size_t batch_size = X.dim32(0);
const size_t input_channels = X.dim32(1);
const nnp_size input_size = {.width = static_cast<size_t>(X.dim32(3)),
.height = static_cast<size_t>(X.dim32(2))};
// pooling kernel
const nnp_size pooling_size = {.width = static_cast<size_t>(pooling[1]),
.height = static_cast<size_t>(pooling[0])};
// pad is tblr
const nnp_padding padding = {.top = static_cast<size_t>(pads[0]),
.right = static_cast<size_t>(pads[3]),
.bottom = static_cast<size_t>(pads[1]),
.left = static_cast<size_t>(pads[2])};
const nnp_size pooling_stride = {.width = static_cast<size_t>(stride[1]),
.height = static_cast<size_t>(stride[0])};
const auto status = nnp_max_pooling_output(
batch_size,
input_channels,
input_size,
padding,
pooling_size,
pooling_stride,
X.template data<float>(),
Y->template mutable_data<float>(),
nnpack_threadpool());
CAFFE_ENFORCE(nnp_status_success == status, "");
return true;
}
};
class NNPACKReluOp final : public Operator<CPUContext> {
public:
NNPACKReluOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<CPUContext>(operator_def, ws) {
// NNPACK can be built with avx2 support only and might not be able to run
// on a given machine.
OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?");
}
bool RunOnDevice() override {
auto& X = Input(0);
auto* Y = Output(0);
const auto status = nnp_relu_output(
1,
X.numel(),
X.template data<float>(),
Y->template mutable_data<float>(),
0.0,
nnpack_threadpool());
CAFFE_ENFORCE(nnp_status_success == status, "");
return true;
}
};
class NNPACKLeakyReluOp final : public LeakyReluOp<float, CPUContext> {
public:
NNPACKLeakyReluOp(const OperatorDef& operator_def, Workspace* ws)
: LeakyReluOp<float, CPUContext>(operator_def, ws) {
// NNPACK can be built with avx2 support only and might not be able to run
// on a given machine.
OPERATOR_NEEDS_FEATURE(has_nnpack(), "NNPack can't run here. No AVX2?");
}
bool RunOnDevice() override {
auto& X = Input(0);
auto* Y = Output(0);
const auto status = nnp_relu_output(
1,
X.numel(),
X.template data<float>(),
Y->template mutable_data<float>(),
alpha_,
nnpack_threadpool());
CAFFE_ENFORCE(nnp_status_success == status, "");
return true;
}
};
REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv, NNPACK, NNPACKConvOp);
REGISTER_CPU_OPERATOR_WITH_ENGINE(MaxPool, NNPACK, NNPACKMaxPoolOp);
REGISTER_CPU_OPERATOR_WITH_ENGINE(Relu, NNPACK, NNPACKReluOp);
REGISTER_CPU_OPERATOR_WITH_ENGINE(LeakyRelu, NNPACK, NNPACKLeakyReluOp);
} // namespace caffe2
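The NNPACK kernels above are wired into the standard Conv, MaxPool, Relu, and LeakyRelu schemas and are selected through the operator's engine field, with kts choosing the kernel-transform strategy parsed by get_nnp_convolution_transform_strategy. Below is a minimal sketch of a single NNPACK convolution, distilled from the test file that follows; the shapes are illustrative, and it assumes an AVX2-capable CPU and a caffe2 build that still ships the caffe2/contrib/nnpack ops.

import numpy as np
from caffe2.python import core, dyndep, workspace

# Load the (now removed) NNPACK operator library.
dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/nnpack:nnpack_ops")

# NCHW input, 8 -> 8 channels, 3x3 kernel; NNPACK only supports NCHW here.
X = np.random.rand(1, 8, 16, 16).astype(np.float32) - 0.5
w = np.random.rand(8, 8, 3, 3).astype(np.float32) - 0.5
b = np.random.rand(8).astype(np.float32) - 0.5
workspace.FeedBlob("X", X)
workspace.FeedBlob("w", w)
workspace.FeedBlob("b", b)

# engine="NNPACK" routes this Conv to NNPACKConvOp instead of the default CPU
# implementation; the result should match the default engine within tolerance.
op = core.CreateOperator(
    "Conv", ["X", "w", "b"], ["Y"],
    kernel=3, stride=1, pad=1, order="NCHW",
    engine="NNPACK", kts="TUPLE",
)
workspace.RunOperatorOnce(op)
print(workspace.FetchBlob("Y").shape)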

View File

@ -1,237 +0,0 @@
import unittest
import hypothesis.strategies as st
from hypothesis import given, assume, settings
import numpy as np
import time
import os
from caffe2.python import core, dyndep
import caffe2.python.hypothesis_test_util as hu
dyndep.InitOpsLibrary("@/caffe2/caffe2/contrib/nnpack:nnpack_ops")
np.random.seed(1)
def benchmark(ws, net, warmups=5, iters=100):
for _ in range(warmups):
ws.run(net)
plan = core.Plan("plan")
plan.AddStep(core.ExecutionStep("test-step", net, iters))
before = time.time()
ws.run(plan)
after = time.time()
print("Timing network, time taken per-iteration: {:.6f}ms".format((
after - before) / float(iters) * 1000.0))
return after - before
def has_avx2():
import subprocess
try:
subprocess.check_output(["grep", "avx2", "/proc/cpuinfo"])
return True
except subprocess.CalledProcessError:
# grep exits with rc 1 on no matches
return False
@unittest.skipIf(not has_avx2(), "NNPACK requires AVX2")
class NNPackOpsTest(hu.HypothesisTestCase):
@given(stride=st.integers(1, 3),
pad=st.integers(0, 2),
kernel=st.integers(3, 5),
size=st.integers(5, 10),
input_channels=st.integers(1, 8),
batch_size=st.integers(1, 5),
groups=st.integers(1, 2))
def test_convolution_correctness(self, stride, pad, kernel, size,
input_channels,
batch_size, groups):
input_channels *= groups
output_channels = int(input_channels / groups)
assume(input_channels % groups == 0)
assume(output_channels % groups == 0)
assume(output_channels == input_channels / groups)
assume(stride <= kernel)
if stride != 1:
assume(batch_size == 1)
X = np.random.rand(
batch_size, input_channels, size, size).astype(np.float32) - 0.5
w = np.random.rand(
input_channels, output_channels, kernel, kernel).astype(np.float32)\
- 0.5
b = np.random.rand(output_channels).astype(np.float32) - 0.5
order = "NCHW"
outputs = {}
for engine in ["", "NNPACK"]:
op = core.CreateOperator(
"Conv",
["X", "w", "b"],
["Y"],
stride=stride,
kernel=kernel,
pad=pad,
order=order,
kts="TUPLE",
engine=engine,
group=groups,
)
self.ws.create_blob("X").feed(X)
self.ws.create_blob("w").feed(w)
self.ws.create_blob("b").feed(b)
self.ws.run(op)
outputs[engine] = self.ws.blobs["Y"].fetch()
np.testing.assert_allclose(
outputs[""],
outputs["NNPACK"],
atol=1e-4,
rtol=1e-4)
@given(size=st.sampled_from([6, 8]),
input_channels=st.integers(1, 8),
batch_size=st.integers(1, 5))
def test_max_pool_correctness(self, size, input_channels, batch_size):
X = np.random.rand(
batch_size, input_channels, size, size).astype(np.float32) - 0.5
order = "NCHW"
outputs = {}
# only 2 * 2 stride and 2 * 2 pool is supported in NNPack now
stride = 2
kernel = 2
# The pooling strategy of NNPack is different from caffe2 pooling
pad = 0
for engine in ["", "NNPACK"]:
op = core.CreateOperator(
"MaxPool",
["X"],
["Y"],
stride=stride,
kernel=kernel,
pad=pad,
order=order,
engine=engine,
)
self.ws.create_blob("X").feed(X)
self.ws.run(op)
outputs[engine] = self.ws.blobs["Y"].fetch()
np.testing.assert_allclose(
outputs[""],
outputs["NNPACK"],
atol=1e-4,
rtol=1e-4)
@given(size=st.sampled_from([6, 8]),
input_channels=st.integers(1, 8),
batch_size=st.integers(1, 5))
def test_relu_correctness(self, size, input_channels, batch_size):
X = np.random.rand(
batch_size, input_channels, size, size).astype(np.float32) - 0.5
outputs = {}
for engine in ["", "NNPACK"]:
op = core.CreateOperator(
"Relu",
["X"],
["Y"],
engine=engine,
)
self.ws.create_blob("X").feed(X)
self.ws.run(op)
outputs[engine] = self.ws.blobs["Y"].fetch()
np.testing.assert_allclose(
outputs[""],
outputs["NNPACK"],
atol=1e-4,
rtol=1e-4)
@given(size=st.sampled_from([6, 8]),
input_channels=st.integers(1, 8),
batch_size=st.integers(1, 5),
alpha=st.floats(0, 1))
def test_leaky_relu_correctness(self, size, input_channels, batch_size,
alpha):
X = np.random.rand(
batch_size, input_channels, size, size).astype(np.float32) - 0.5
outputs = {}
for engine in ["", "NNPACK"]:
op = core.CreateOperator(
"LeakyRelu",
["X"],
["Y"],
alpha=alpha,
engine=engine,
)
self.ws.create_blob("X").feed(X)
self.ws.run(op)
outputs[engine] = self.ws.blobs["Y"].fetch()
np.testing.assert_allclose(
outputs[""],
outputs["NNPACK"],
atol=1e-4,
rtol=1e-4)
@settings(deadline=3600)
@unittest.skipIf(not os.environ.get("CAFFE2_BENCHMARK"), "Benchmark")
@given(stride=st.integers(1, 1),
pad=st.integers(0, 2),
kernel=st.sampled_from([3, 5, 7]),
size=st.integers(30, 90),
input_channels=st.sampled_from([3, 64, 256]),
output_channels=st.sampled_from([32, 96, 256]),
batch_size=st.sampled_from([32, 64, 96, 128]))
def test_timings(self, stride, pad, kernel, size,
input_channels, output_channels, batch_size):
assume(stride <= kernel)
X = np.random.rand(
batch_size, input_channels, size, size).astype(np.float32) - 0.5
w = np.random.rand(output_channels, input_channels,
kernel, kernel).astype(np.float32) - 0.5
b = np.random.rand(output_channels).astype(np.float32) - 0.5
order = "NCHW"
times = {}
for engine in ["", "NNPACK"]:
net = core.Net(engine + "_test")
net.Conv(
["X", "W", "b"], "Y",
order=order,
kernel=kernel,
stride=stride,
pad=pad,
kts="TUPLE",
engine=engine,
)
self.ws.create_blob("X").feed(X)
self.ws.create_blob("W").feed(w)
self.ws.create_blob("b").feed(b)
self.ws.run(net)
times[engine] = benchmark(self.ws, net)
print("Speedup for NNPACK: {:.2f}".format(
times[""] / times["NNPACK"]))
@settings(deadline=3600)
@unittest.skipIf(not os.environ.get("CAFFE2_BENCHMARK"), "Benchmark")
@given(size=st.integers(30, 90),
input_channels=st.sampled_from([3, 64, 256]),
batch_size=st.sampled_from([32, 64, 96, 128]))
def test_relu_timings(self, size, input_channels, batch_size):
X = np.random.rand(
batch_size, input_channels, size, size).astype(np.float32) - 0.5
times = {}
for engine in ["", "NNPACK"]:
net = core.Net(engine + "_test")
net.Relu(
["X"],
["Y"],
engine=engine,
)
self.ws.create_blob("X").feed(X)
self.ws.run(net)
times[engine] = benchmark(self.ws, net)
print("Speedup for NNPACK: {:.2f}".format(
times[""] / times["NNPACK"]))

Some files were not shown because too many files have changed in this diff.