Fix metal build after sync

Summary: While I was trying to make a quick oss cmakefile, I found that some of the ios source files are out of sync with the most code changes. This diff should fix the issues. I manually ran cmake on the oss side with scripts/build_ios.sh to make sure things pass. Reviewed By: ajtulloch Differential Revision: D5582265 fbshipit-source-id: 2636d353d32fcd8fb7087385b9bbed8476e33e74
2025-10-20 21:14:14 +08:00 · 2017-08-08 10:10:18 -07:00
parent 9fcf676cfa
commit 679a586d53
42 changed files with 28 additions and 4423 deletions
--- a/caffe2/contrib/CMakeLists.txt
+++ b/caffe2/contrib/CMakeLists.txt
@ -1,4 +1,5 @@
 add_subdirectory(gloo)
+add_subdirectory(ios)
 add_subdirectory(nccl)
 add_subdirectory(nnpack)
 add_subdirectory(observers)
--- a/caffe2/contrib/ios/CMakeLists.txt
+++ b/caffe2/contrib/ios/CMakeLists.txt
@ -0,0 +1,15 @@
+# TODO: figure out conflict between contrib/nnpack/nnpack_ops.cc and mobile_nnpack.cc
+if(IOS)
+  # Basic ios srcs.
+  set(Caffe2_CONTRIB_IOS_SRC
+    "${CMAKE_CURRENT_SOURCE_DIR}/ios_caffe.cc"
+    "${CMAKE_CURRENT_SOURCE_DIR}/ios_caffe_predictor.cc"
+    # "${CMAKE_CURRENT_SOURCE_DIR}/mobile_nnpack.cc"
+    )
+  set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${Caffe2_CONTRIB_IOS_SRC})
+
+  # mpscnn files
+  add_subdirectory(mpscnn)
+endif()
+
+set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
--- a/caffe2/contrib/ios/metal/CopyOps.mm
+++ b/caffe2/contrib/ios/metal/CopyOps.mm
@ -1,99 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#include "caffe2/core/common.h"
-
-#ifndef CAFFE2_MOBILE
-#error "Caffe2 mobile state not defined"
-#endif
-
-#if CAFFE2_MOBILE
-
-#include "caffe2/core/operator.h"
-#import "data_conversion.h"
-#import "MetalCaffeContext.h"
-#import "MetalImageFilter.h"
-#import "metal_sync_op.h"
-
-namespace caffe2 {
-class CopyToMetalGPUOp final : public Operator<MetalCaffeContext> {
- public:
-  CopyToMetalGPUOp(const OperatorDef &operator_def, Workspace *ws) : Operator<MetalCaffeContext>(operator_def, ws) {}
-
-  bool RunOnDevice() override {
-    const Blob *blob = Inputs()[0];
-    const TensorCPU &X = blob->Get<TensorCPU>();
-
-    CAFFE_ENFORCE(X.dim32(0) == 1);
-    int input_channels = X.dim32(1);
-    int input_width = X.dim32(3);
-    int input_height = X.dim32(2);
-
-    auto *Y = Output(0);
-    Y->Resize(X.dim32(0), input_channels, input_height, input_width);
-
-    const float *input = X.template data<float>();
-    float *output_data = (float *)[GetMetalAllocator()->Buffer((void *)Y->template mutable_data<float>()) contents];
-
-    memcpycvt(output_data, input, Y->size());
-
-    return true;
-  }
-};
-REGISTER_CPU_OPERATOR_WITH_ENGINE(CopyToMetalGPU, METAL, CopyToMetalGPUOp);
-OPERATOR_SCHEMA(CopyToMetalGPU).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
-
-class CopyToMetalGPUFloat16Op final : public Operator<MetalCaffeContext> {
- public:
-  CopyToMetalGPUFloat16Op(const OperatorDef &operator_def, Workspace *ws)
-      : Operator<MetalCaffeContext>(operator_def, ws) {}
-
-  bool RunOnDevice() override {
-    const Blob *blob = Inputs()[0];
-    const TensorCPU &X = blob->Get<TensorCPU>();
-
-    auto *Y = Output(0);
-    Y->ResizeLike(X);
-
-    const float *input = X.template data<float>();
-    float16_t *output_data =
-        (float16_t *)[GetMetalAllocator()->Buffer((void *)Y->template mutable_data<uint16_t>()) contents];
-
-    memcpycvt(output_data, input, Y->size());
-
-    return true;
-  }
-};
-REGISTER_CPU_OPERATOR_WITH_ENGINE(CopyToMetalGPUFloat16, METAL, CopyToMetalGPUFloat16Op);
-OPERATOR_SCHEMA(CopyToMetalGPUFloat16).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
-
-class CopyFromMetalGPUOp final : public Operator<CPUContext> {
- public:
-  CopyFromMetalGPUOp(const OperatorDef &operator_def, Workspace *ws) : Operator<CPUContext>(operator_def, ws) {}
-
-  bool RunOnDevice() override {
-    const Blob *blob = Inputs()[0];
-    const TensorMetal &X = blob->Get<TensorMetal>();
-
-    CAFFE_ENFORCE(X.dim32(0) == 1);
-    int input_channels = X.dim32(1);
-    int input_width = X.dim32(3);
-    int input_height = X.dim32(2);
-
-    auto *Y = Output(0);
-    Y->Resize(X.dim32(0), input_channels, input_height, input_width);
-
-    const float *input = (float *)[GetMetalAllocator()->Buffer((void *)X.template data<float>()) contents];
-    float *output_data = Y->template mutable_data<float>();
-
-    metal_sync_op();
-
-    memcpycvt(output_data, input, Y->size());
-
-    return true;
-  }
-};
-REGISTER_CPU_OPERATOR(CopyFromMetalGPU, CopyFromMetalGPUOp);
-OPERATOR_SCHEMA(CopyFromMetalGPU).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
-} // namespace caffe2
-
-#endif // CAFFE2_MOBILE
--- a/caffe2/contrib/ios/metal/FBMetalCNNConvolution.h
+++ b/caffe2/contrib/ios/metal/FBMetalCNNConvolution.h
@ -1,119 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#pragma once
-
-#include "arm_neon_support.h"
-
-#import "MetalImageFilter.h"
-#import "FBMetalConstantValues.h"
-
-class FBMetalCNNConstantValues : public FBMetalConstantValues {
-public:
-  ushort input_width;
-  ushort input_height;
-  ushort input_channels;
-  ushort input_stride_x;
-  ushort input_stride_y;
-  ushort input_pad_t;
-  ushort input_pad_l;
-  ushort input_pad_b;
-  ushort input_pad_r;
-  ushort filter_width;
-  ushort filter_height;
-  ushort output_width;
-  ushort output_height;
-  ushort output_channels;
-  bool transposed;
-
-  FBMetalCNNConstantValues(
-      ushort _input_width,
-      ushort _input_height,
-      ushort _input_channels,
-      ushort _input_stride_x,
-      ushort _input_stride_y,
-      ushort _input_pad_t,
-      ushort _input_pad_l,
-      ushort _input_pad_b,
-      ushort _input_pad_r,
-      ushort _filter_width,
-      ushort _filter_height,
-      ushort _output_width,
-      ushort _output_height,
-      ushort _output_channels,
-      bool _transposed) :
-        input_width(_input_width),
-        input_height(_input_height),
-        input_channels(_input_channels),
-        input_stride_x(_input_stride_x),
-        input_stride_y(_input_stride_y),
-        input_pad_t(_input_pad_t),
-        input_pad_l(_input_pad_l),
-        input_pad_b(_input_pad_b),
-        input_pad_r(_input_pad_r),
-        filter_width(_filter_width),
-        filter_height(_filter_height),
-        output_width(_output_width),
-        output_height(_output_height),
-        output_channels(_output_channels),
-        transposed(_transposed) { }
-
-  std::string to_string();
-};
-
-constexpr int MAX_KERNELS_PER_CONVOLUTION = 32;
-
-typedef float32_t data_buffer_type;
-
-typedef float16_t weight_buffer_type;
-
-@interface FBMetalCNNConvolution : MetalImageFilter
-
-@property (nonatomic, strong) id<MTLBuffer> dataBuffer;
-
-@property (nonatomic, strong) id<MTLBuffer> outputBuffer;
-
-@property (nonatomic, strong) id<MTLBuffer> weightBuffer;
-
-@property (nonatomic, strong) id<MTLBuffer> biasBuffer;
-
-+ (id<MTLBuffer>)loadFilterWithImage:(const float*)weight_data
-                        weightBuffer:(id<MTLBuffer>)weightBuffer
-                             kernels:(NSUInteger)kernels
-                       input_kernels:(NSUInteger)input_kernels
-                       kernel_offset:(NSUInteger)kernel_offset
-                       kernel_stride:(NSUInteger)kernel_stride
-                            channels:(NSUInteger)channels
-                               width:(NSUInteger)width
-                              height:(NSUInteger)height
-                          transposed:(bool)transposed
-                             context:(MetalContext*)context;
-
- (void)loadFilterWithImage:(const float*)weight_data
-                    kernels:(NSUInteger)kernels
-              input_kernels:(NSUInteger)input_kernels
-              kernel_offset:(NSUInteger)kernel_offset
-              kernel_stride:(NSUInteger)kernel_stride
-                   channels:(NSUInteger)channels
-                      width:(NSUInteger)image_width
-                     height:(NSUInteger)image_height
-                 transposed:(bool)transposed;
-
-+ (id<MTLBuffer>)loadDataWithImage:(const float*)imageData
-                   imageDataBuffer:(id<MTLBuffer>)imageDataBuffer
-                          channels:(NSUInteger)channels
-                             width:(NSUInteger)width
-                            height:(NSUInteger)height
-                           context:(MetalContext*)context;
-
- (void)loadBiasData:(const float *)bias
-              length:(NSUInteger)length;
-
-+ (instancetype)filterWithContext:(MetalContext*)context
-                         channels:(NSUInteger)channels
-                      kernel_size:(NSUInteger)kernel_size
-                   constantValues:(FBMetalCNNConstantValues*)constantValues
-                            width:(NSUInteger)width
-                           height:(NSUInteger)height
-                         stride_x:(NSUInteger)stride_x
-                         stride_y:(NSUInteger)stride_y;
-@end
--- a/caffe2/contrib/ios/metal/FBMetalCNNConvolution.mm
+++ b/caffe2/contrib/ios/metal/FBMetalCNNConvolution.mm
@ -1,538 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#import "FBMetalConstantValues.h"
-#import "FBMetalCNNConvolution.h"
-#import "data_conversion.h"
-#import "MetalShaderUtilities.h"
-
-#include "caffe2/core/logging.h"
-
-static constexpr size_t kThreadGroupSize_x = 4;
-static constexpr size_t kThreadGroupSize_y = 8;
-
-std::string FBMetalCNNConstantValues::to_string() {
-  std::ostringstream ss;
-
-  ss << "X:" <<
-  input_channels << "x" <<
-  input_width << "x" <<
-  input_height << "[" <<
-  input_stride_x << ":" <<
-  input_stride_y << ":" <<
-  input_pad_t << ":" <<
-  input_pad_l << ":" <<
-  input_pad_b << ":" <<
-  input_pad_r << "]-Y:" <<
-  output_channels << "x" <<
-  output_width << "x" <<
-  output_height << "-W:" <<
-  filter_width << "x" <<
-  filter_height << ":" <<
-  (transposed ? "T" : "D");
-
-  return ss.str();
-}
-
-namespace {
-  extern const char *metalCode;
-}
-
-@interface FBMetalCNNConvolution () {
-  int stride_x;
-  int stride_y;
-  uint input_batch_size;
-  uint output_channels;
-}
-@end
-
-@implementation FBMetalCNNConvolution
-
-+ (id<MTLBuffer>)loadFilterWithImage:(const float*)weight_data
-                        weightBuffer:(id<MTLBuffer>)weightBuffer
-                             kernels:(NSUInteger)kernels
-                       input_kernels:(NSUInteger)input_kernels
-                       kernel_offset:(NSUInteger)kernel_offset
-                       kernel_stride:(NSUInteger)kernel_stride
-                            channels:(NSUInteger)channels
-                               width:(NSUInteger)width
-                              height:(NSUInteger)height
-                          transposed:(bool)transposed
-                             context:(MetalContext*)context
-
-{
-  reformatKernelImage<weight_buffer_type>(
-      weight_data,
-      kernels,
-      input_kernels,
-      kernel_offset,
-      kernel_stride,
-      channels,
-      width,
-      height,
-      transposed,
-      [&](size_t buffer_size) -> weight_buffer_type* {
-        if (weightBuffer == nil || [weightBuffer length] != sizeof(weight_buffer_type) * buffer_size) {
-          weightBuffer = [context.device newBufferWithLength:sizeof(weight_buffer_type) * buffer_size
-                                                     options:MTLResourceOptionCPUCacheModeDefault];
-          if (weightBuffer == nil) {
-            LOG(ERROR) << "couldn't create weight buffer of size: " << buffer_size;
-          }
-        }
-
-        return (weight_buffer_type*)(weightBuffer ? [weightBuffer contents] : NULL);
-      });
-
-  return weightBuffer;
-}
-
- (void)loadFilterWithImage:(const float*)weight_data
-                    kernels:(NSUInteger)kernels
-              input_kernels:(NSUInteger)input_kernels
-              kernel_offset:(NSUInteger)kernel_offset
-              kernel_stride:(NSUInteger)kernel_stride
-                   channels:(NSUInteger)channels
-                      width:(NSUInteger)width
-                     height:(NSUInteger)height
-                 transposed:(bool)transposed {
-  _weightBuffer = [FBMetalCNNConvolution loadFilterWithImage:weight_data
-                                                weightBuffer:_weightBuffer
-                                                     kernels:kernels
-                                               input_kernels:input_kernels
-                                               kernel_offset:kernel_offset
-                                               kernel_stride:kernel_stride
-                                                    channels:channels
-                                                       width:width
-                                                      height:height
-                                                  transposed:transposed
-                                                     context:self.context];
-}
-
- (void)loadBiasData:(const float *)bias
-              length:(NSUInteger)length {
-  if (_biasBuffer == nil || [_biasBuffer length] != sizeof(weight_buffer_type) * length) {
-    _biasBuffer = [self.context.device newBufferWithLength:sizeof(weight_buffer_type) * length
-                                                   options:MTLResourceOptionCPUCacheModeDefault];
-  }
-  if (_biasBuffer) {
-    weight_buffer_type *bias_data = (weight_buffer_type *) [_biasBuffer contents];
-    for (int i = 0; i < length; i++) {
-      bias_data[i] = bias[i];
-    }
-  }
-}
-
-+ (id<MTLBuffer>)loadDataWithImage:(const float*)imageData
-                   imageDataBuffer:(id<MTLBuffer>)imageDataBuffer
-                          channels:(NSUInteger)channels
-                             width:(NSUInteger)width
-                            height:(NSUInteger)height
-                           context:(MetalContext*)context {
-  id<MTLBuffer> newDataBuffer = imageDataBuffer;
-
-  reformatInputImage<data_buffer_type>(
-      imageData, channels, width, height, [&](size_t buffer_size) -> data_buffer_type* {
-        if (newDataBuffer == nil || [newDataBuffer length] != sizeof(data_buffer_type) * buffer_size) {
-          newDataBuffer = [context.device newBufferWithLength:sizeof(data_buffer_type) * buffer_size
-                                                      options:MTLResourceOptionCPUCacheModeDefault];
-          if (newDataBuffer == nil) {
-            VLOG(0) << "couldn't create data buffer of size: " << buffer_size;
-          }
-        }
-
-        return newDataBuffer ? (data_buffer_type*)[newDataBuffer contents] : NULL;
-      });
-
-  return newDataBuffer;
-}
-
-+ (instancetype)filterWithContext:(MetalContext*)context
-                         channels:(NSUInteger)channels
-                      kernel_size:(NSUInteger)kernel_size
-                   constantValues:(FBMetalCNNConstantValues*)constantValues
-                            width:(NSUInteger)width
-                           height:(NSUInteger)height
-                         stride_x:(NSUInteger)stride_x
-                         stride_y:(NSUInteger)stride_y {
-  return [[self alloc] initWithContext:context
-                              channels:(NSUInteger)channels
-                           kernel_size:kernel_size
-                        constantValues:constantValues
-                                 width:width
-                                height:height
-                              stride_x:stride_x
-                              stride_y:stride_y];
-}
-
- (MTLSize)threadsPerThreadgroup {
-  if (input_batch_size > 1)
-    return MTLSizeMake(input_batch_size * kThreadGroupSize_x, kThreadGroupSize_y / input_batch_size, 1);
-  else
-    return MTLSizeMake(kThreadGroupSize_x, kThreadGroupSize_y, 1);
-}
-
- (MTLSize)threadgroupsPerGrid {
-  MTLSize threadsPerThreadgroup = [self threadsPerThreadgroup];
-
-  return MTLSizeMake(
-                     ((input_batch_size * self.outputTextureDescriptor.width + stride_x - 1) / stride_x + threadsPerThreadgroup.width - 1) /
-                     threadsPerThreadgroup.width,
-                     ((self.outputTextureDescriptor.height + stride_y - 1) / stride_y + threadsPerThreadgroup.height - 1) /
-                     threadsPerThreadgroup.height,
-                     1);
-}
-
-// TODO: this code is temporary, we need to find the optimal strategy for large channel numbers
-
-static uint input_channel_batching(uint input_channels) {
-  return input_channels <= 32 ? 1 :
-         input_channels % 8 == 0 ? 8 :
-         input_channels % 4 == 0 ? 4 :
-         input_channels % 3 == 0 ? 3 :
-         input_channels % 2 == 0 ? 2 : 1;
-}
-
- (instancetype)initWithContext:(MetalContext*)context
-                       channels:(NSUInteger)channels
-                    kernel_size:(NSUInteger)kernel_size
-                 constantValues:(FBMetalCNNConstantValues*)constantValues
-                          width:(NSUInteger)width
-                         height:(NSUInteger)height
-                       stride_x:(NSUInteger)_stride_x
-                       stride_y:(NSUInteger)_stride_y {
-  if ((self = [super initWithFunctionName:@"cnn_convolution_kern"
-                              libraryName:@"Convolution"
-                            librarySource:[NSString stringWithCString:metalCode encoding:NSUTF8StringEncoding]
-                                  context:context
-                           constantValues:constantValues])) {
-    stride_x = _stride_x;
-    stride_y = _stride_y;
-
-    input_batch_size = input_channel_batching(constantValues->input_channels);
-    output_channels = constantValues->output_channels;
-
-    super.outputTextureDescriptor = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA16Float
-                                                                                       width:width
-                                                                                      height:height
-                                                                                   mipmapped:NO];
-  }
-  return self;
-}
-
- (NSString*)replaceConstantValues:(FBMetalConstantValues *)constantValues
-                     librarySource:(NSString*)librarySource {
-  FBMetalCNNConstantValues* convolutionConstantValues = (FBMetalCNNConstantValues *) constantValues;
-  std::string source = [librarySource UTF8String];
-
-  REPLACE_CONSTANT(source, convolutionConstantValues->output_width,     0);
-  REPLACE_CONSTANT(source, convolutionConstantValues->output_height,    1);
-  REPLACE_CONSTANT(source, convolutionConstantValues->input_width,      2);
-  REPLACE_CONSTANT(source, convolutionConstantValues->input_height,     3);
-  REPLACE_CONSTANT(source, convolutionConstantValues->input_stride_x,   4);
-  REPLACE_CONSTANT(source, convolutionConstantValues->input_stride_y,   5);
-  REPLACE_CONSTANT(source, convolutionConstantValues->input_pad_t,      6);
-  REPLACE_CONSTANT(source, convolutionConstantValues->input_pad_l,      7);
-  REPLACE_CONSTANT(source, convolutionConstantValues->filter_width,     8);
-  REPLACE_CONSTANT(source, convolutionConstantValues->filter_height,    9);
-  REPLACE_CONSTANT(source, convolutionConstantValues->input_channels,   10);
-  REPLACE_CONSTANT(source, convolutionConstantValues->output_channels,  11);
-  REPLACE_CONSTANT(source, convolutionConstantValues->transposed,       12);
-  REPLACE_CONSTANT(source, input_channel_batching(convolutionConstantValues->input_channels),  13);
-
-  return [NSString stringWithUTF8String:source.c_str()];
-}
-
-// Bind data between C and Metal
-
- (void) configureArgumentTableWithCommandEncoder:(id<MTLComputeCommandEncoder>)commandEncoder
-                               weightBufferOffset:(NSInteger)weightBufferOffset
-                               outputBufferOffset:(NSInteger)outputBufferOffset {
-  [commandEncoder setBuffer:_biasBuffer offset:0 atIndex:0];
-  [commandEncoder setBuffer:_weightBuffer offset:weightBufferOffset atIndex:1];
-  [commandEncoder setBuffer:_dataBuffer offset:0 atIndex:2];
-  [commandEncoder setBuffer:_outputBuffer offset:outputBufferOffset atIndex:3];
-}
-@end
-
-namespace {
-const char *metalCode = R"Metal(
-//  Copyright 2004-present Facebook. All Rights Reserved.
-
-#include <metal_stdlib>
-
-using namespace metal;
-
-// These function_constant expressions are replaced at compile time with actual values
-
-constant constexpr ushort output_size_x     [[ function_constant(0) ]];
-constant constexpr ushort output_size_y     [[ function_constant(1) ]];
-constant constexpr ushort input_size_x      [[ function_constant(2) ]];
-constant constexpr ushort input_size_y      [[ function_constant(3) ]];
-constant constexpr ushort input_stride_x    [[ function_constant(4) ]];
-constant constexpr ushort input_stride_y    [[ function_constant(5) ]];
-constant constexpr ushort input_pad_t       [[ function_constant(6) ]];
-constant constexpr ushort input_pad_l       [[ function_constant(7) ]];
-constant constexpr ushort filter_width      [[ function_constant(8) ]];
-constant constexpr ushort filter_height     [[ function_constant(9) ]];
-constant constexpr ushort input_channels    [[ function_constant(10) ]];
-constant constexpr ushort output_channels   [[ function_constant(11) ]];
-constant constexpr bool transposed          [[ function_constant(12) ]];
-constant constexpr ushort input_batch_size  [[ function_constant(13) ]];
-
-constant constexpr ushort2 input_padding = {
-  transposed ? (filter_width - 1 - input_pad_l) : input_pad_l,
-  transposed ? (filter_height - 1 - input_pad_t) : input_pad_t,
-};
-
-constant constexpr ushort DataSize      = output_channels <= 2 ? output_channels : 4;
-constant constexpr ushort DataLength    = (output_channels + DataSize - 1) / DataSize;
-
-typedef float data_buffer_type;
-
-typedef vec<half, DataSize> vec_t;
-
-typedef struct {
-  data_buffer_type data[input_channels][input_size_y][input_size_x];
-} input_data;
-
-typedef struct {
-  data_buffer_type data[output_channels][output_size_y][output_size_x];
-} output_data;
-
-typedef struct {
-  vec_t data[input_channels][filter_height][filter_width][DataLength];
-} filter_data;
-
-typedef struct {
-  half data[output_channels];
-} bias_data;
-
-typedef vec_t (thread_storage)[input_stride_y][input_stride_x][DataLength];
-
-// Filter data sampler - returns an aligned array of vectors for the filter interleaved output channel data
-
-template <typename T>
-class planar_filter_sampler {
-  const constant T (&channel_data)[filter_height][filter_width][DataLength];
-
- public:
-  planar_filter_sampler(constant T data[input_channels][filter_height][filter_width][DataLength], ushort channel)
-      : channel_data(data[channel]) {}
-
-  typedef constant T (&return_type)[DataLength];
-
-  return_type operator()(ushort2 off) {
-    return channel_data[off.y][off.x];
-  }
-};
-
-// Input data sampler - returns a scalar sample
-
-template <typename T>
-class planar_data_sampler {
-  const constant T (&channel_data)[input_size_y][input_size_x];
-  const ushort2     base;
-
- public:
-  planar_data_sampler(const constant T data[input_channels][input_size_y][input_size_x], ushort2 _base, ushort channel)
-      : channel_data(data[channel]), base(_base) {}
-
-  inline T operator()(ushort2 off) {
-    /*
-     *  Note: Even though the documentation states otherwise, Metal will execute both sides of ternary statements,
-     *        and, if the array index is out of bounds, it will crash.
-     */
-
-    const ushort2 idx = base + off;
-    bool in_bounds = all(idx >= input_padding) &&
-                     all(idx < (ushort2(input_size_x, input_size_y) * ushort2(input_stride_x, input_stride_y) + input_padding));
-    const ushort2 padded_idx = (idx - (in_bounds ? input_padding : 0)) / ushort2(input_stride_x, input_stride_y);
-    return in_bounds ? channel_data[padded_idx.y][padded_idx.x] : T(0);
-  }
-};
-
-/*
- * To speed up data access in the convolution inner loop we use vector data access for the filter.
- * Every input pixel is multiplied by all the kernel values to generate all the output channels at once.
- */
-
-template <ushort DataRemainder, typename T>
-void accumulate(const ushort DataLength, T dst[], const constant T w[], const half sample) {
-  for (ushort i = 0; i < DataLength; i++) {
-    dst[i] += sample * w[i];
-  }
-}
-
-template <>
-void accumulate<3, half4>(const ushort DataLength, half4 dst[], const constant half4 w[], const half sample) {
-  accumulate<0, half4>(DataLength - 1, dst, w, sample);
-  dst[DataLength - 1].xyz += sample * w[DataLength - 1].xyz;
-}
-
-template <>
-void accumulate<2, half4>(const ushort DataLength, half4 dst[], const constant half4 w[], const half sample) {
-  accumulate<0, half4>(DataLength - 1, dst, w, sample);
-  dst[DataLength - 1].xy += sample * w[DataLength - 1].xy;
-}
-
-template <>
-void accumulate<1, half4>(const ushort DataLength, half4 dst[], const constant half4 w[], const half sample) {
-  accumulate<0, half4>(DataLength - 1, dst, w, sample);
-  dst[DataLength - 1].x += sample * w[DataLength - 1].x;
-}
-
-/*
- * Convolution inner loop: works for direct convolution (stride == 1) and for the transposed (stride > 1)
- */
-
-template <typename T, typename S1, typename S2>
-void convolution(T acc[input_stride_y][input_stride_x][DataLength], S1 in, S2 weights) {
-  for (ushort y = 0; y < filter_height; y++) {
-    for (ushort x = 0; x < filter_width; x++) {
-      const ushort2 off(x, y);
-      const ushort2 p0     = (input_padding + ushort2(input_stride_x, input_stride_y) - off) % ushort2(input_stride_x, input_stride_y);
-      const half    sample = in(off + p0);
-      auto          w      = weights(off);
-
-      accumulate<output_channels % DataSize, T>(DataLength, acc[p0.y][p0.x], w, sample);
-    }
-  }
-}
-
-template <typename T>
-void array_zero(T data[], ushort size) {
-  for (ushort i = 0; i < size; i++) {
-    data[i] = T(0);
-  }
-}
-
-template <typename T1, typename T2>
-void array_copy(T1 dst[], const T2 src[], ushort size) {
-  for (ushort i = 0; i < size; i++) {
-    dst[i] = src[i];
-  }
-}
-
-template <typename T1, typename T2>
-void array_add(T1 dst[], const T2 src[], ushort size) {
-  for (ushort i = 0; i < size; i++) {
-    dst[i] += src[i];
-  }
-}
-
-template <ushort DataSize, typename T = vec<half, DataSize>>
-const half element(const T data[], ushort i) {
-  return data[i / DataSize][i % DataSize];
-}
-
-template <ushort DataSize, typename T = vec<half, DataSize>>
-const half element(const threadgroup T data[], ushort i) {
-  return data[i / DataSize][i % DataSize];
-}
-
-template <>
-const half element<1, half>(const half data[], ushort i) {
-  return data[i];
-}
-
-static void cnn_convolution_full(constant bias_data &bias,
-                                 constant filter_data &filters,
-                                 constant input_data &input,
-                                 device output_data &output,
-                                 ushort2 xgid)
-{
-  ushort2 gid = ushort2(input_stride_x, input_stride_y) * xgid;
-
-  if (all(gid < ushort2(output_size_x, output_size_y))) {
-    thread_storage storage;
-    array_zero((thread vec_t *) storage, sizeof(thread_storage) / sizeof(vec_t));
-
-    for (ushort channel = 0; channel < input_channels; channel++) {
-      planar_data_sampler<data_buffer_type> in(input.data, gid, channel);
-      planar_filter_sampler<vec_t> weights(filters.data, channel);
-
-      convolution(storage, in, weights);
-    }
-
-    for (ushort c = 0; c < output_channels; c++) {
-      for (ushort y = 0; y < input_stride_y; y++) {
-        for (ushort x = 0; x < input_stride_x; x++) {
-          ushort2 idx = gid + ushort2(x, y);
-          if (all(idx < ushort2(output_size_x, output_size_y)))
-            output.data[c][idx.y][idx.x] = element<DataSize, vec_t>(storage[y][x], c) + bias.data[c];
-        }
-      }
-    }
-  }
-}
-
-static void cnn_convolution_batch(constant bias_data &bias,
-                                  constant filter_data &filters,
-                                  constant input_data &input,
-                                  device output_data &output,
-                                  threadgroup thread_storage *batch_storage,
-                                  ushort2 xgid)
-{
-  ushort2 gid = ushort2(input_stride_x, input_stride_y) * ushort2(xgid.x / input_batch_size, xgid.y);
-  ushort batch = xgid.x % input_batch_size;
-
-  if (gid.x < output_size_x && gid.y < output_size_y) {
-    thread_storage storage;
-    const ushort data_size = sizeof(thread_storage) / sizeof(vec_t);
-    array_zero((thread vec_t *) storage, data_size);
-
-    ushort base_channel = batch * input_channels / input_batch_size;
-    for (ushort channel = 0; channel < input_channels / input_batch_size; channel++) {
-      planar_data_sampler<data_buffer_type> in(input.data, gid, base_channel + channel);
-      planar_filter_sampler<vec_t> weights(filters.data, base_channel + channel);
-
-      convolution(storage, in, weights);
-    }
-
-    for (ushort b = 0; b < input_batch_size; b++) {
-      threadgroup_barrier(mem_flags::mem_device);
-
-      if (b == batch) {
-        for (ushort y = 0; y < input_stride_y; y++) {
-          for (ushort x = 0; x < input_stride_x; x++) {
-            thread half *h_storage = (thread half *) storage[y][x];
-
-            for (ushort c = 0; c < output_channels; c++) {
-              ushort2 idx = gid + ushort2(x, y);
-              if (all(idx < ushort2(output_size_x, output_size_y))) {
-                device data_buffer_type *out_channel = &output.data[c][gid.y + y][gid.x + x];
-                *out_channel = h_storage[c] + (b == 0 ? bias.data[c] : *out_channel);
-              }
-            }
-          }
-        }
-      }
-    }
-  } else {
-    for (ushort b = 0; b < input_batch_size; b++) {
-      threadgroup_barrier(mem_flags::mem_device);
-    }
-  }
-}
-
-// clang-format off
-
-kernel void cnn_convolution_kern(constant bias_data &bias             [[ buffer(0) ]],
-                                 constant filter_data &filters        [[ buffer(1) ]],
-                                 constant input_data &input           [[ buffer(2) ]],
-                                 device output_data &output           [[ buffer(3) ]],
-                                 threadgroup thread_storage *storage  [[ threadgroup(0) ]],
-                                 ushort tid                           [[ thread_index_in_threadgroup ]],
-                                 ushort2 xgid                         [[ thread_position_in_grid ]])
-
-// clang-format on
-
-{
-  if (input_batch_size > 1) {
-    ushort batch_index = (input_batch_size-1) * (tid / (input_batch_size));
-    cnn_convolution_batch(bias, filters, input, output, &storage[batch_index], xgid);
-  } else {
-    cnn_convolution_full(bias, filters, input, output, xgid);
-  }
-}
-)Metal";
-}
--- a/caffe2/contrib/ios/metal/FBMetalCNNInstanceNorm.h
+++ b/caffe2/contrib/ios/metal/FBMetalCNNInstanceNorm.h
@ -1,61 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#pragma once
-
-#import "arm_neon_support.h"
-#import "MetalImageFilter.h"
-
-class FBMetalInstanceNormConstantValues : public FBMetalConstantValues {
-  public:
-  ushort input_width;
-  ushort input_height;
-  ushort input_channels;
-  ushort prelu_size;
-
-  FBMetalInstanceNormConstantValues(
-                                    ushort _input_width,
-                                    ushort _input_height,
-                                    ushort _input_channels,
-                                    ushort _prelu_size)
-  : input_width(_input_width),
-  input_height(_input_height),
-  input_channels(_input_channels),
-  prelu_size(_prelu_size) {}
-
-  std::string to_string();
-};
-
-@interface FBMetalCNNInstanceNormBase : MetalImageFilter
-
-@property (nonatomic, strong) id<MTLBuffer> dataBuffer;
-@property (nonatomic, strong) id<MTLBuffer> outputBuffer;
-@property (nonatomic, strong) id<MTLBuffer> scaleBuffer;
-@property (nonatomic, strong) id<MTLBuffer> biasBuffer;
-@property (nonatomic, strong) id<MTLBuffer> avgBuffer;
-@property (nonatomic, strong) id<MTLBuffer> stdevBuffer;
-@property (nonatomic, strong) id<MTLBuffer> preluBuffer;
-@property (nonatomic, strong) id<MTLBuffer> epsilonBuffer;
-
-@property (nonatomic, copy) NSString *functionName;
-
-+ (instancetype)filterWithContext:(MetalContext*)context
-                     functionName:(NSString*)functionName
-                   constantValues:(FBMetalInstanceNormConstantValues*)constantValues
-                            width:(NSUInteger)width
-                           height:(NSUInteger)height
-                          channel:(NSUInteger)channel;
-@end
-
-
-@interface FBMetalCNNInstanceNorm: FBMetalCNNInstanceNormBase
-
-+ (instancetype)filterWithContext:(MetalContext*)context
-                   constantValues:(FBMetalInstanceNormConstantValues*)constantValues
-                            width:(NSUInteger)width
-                           height:(NSUInteger)height
-                          channel:(NSUInteger)channel
-                        withPRelu:(BOOL)withPRelu;
-
- (void)loadEpsilon:(const float)epsilon;
-
-@end
--- a/caffe2/contrib/ios/metal/FBMetalCNNInstanceNorm.mm
+++ b/caffe2/contrib/ios/metal/FBMetalCNNInstanceNorm.mm
@ -1,407 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#import "FBMetalCNNInstanceNorm.h"
-#import "MetalShaderUtilities.h"
-
-
-std::string FBMetalInstanceNormConstantValues::to_string() {
-  std::ostringstream ss;
-  ss << ":" << input_width << ":" << input_height << ":" << input_channels << ":" << prelu_size;
-  return ss.str();
-}
-
-@interface FBMetalCNNInstanceNormBase () {
-  struct {
-    ushort channel;
-    ushort width;
-    ushort height;
-  } configuration;
-}
-@end
-
-namespace {
-  extern const char *metalCode;
-}
-
-@implementation FBMetalCNNInstanceNormBase
-
-static constexpr size_t kThreadGroupSize_x = 4;
-static constexpr size_t kThreadGroupSize_y = 8;
-
-static NSArray* kernelNames = @[
-                                @"cnn_instance_norm_avg_kern",
-                                @"cnn_instance_norm_stdev_kern",
-                                @"cnn_instance_norm_last_kern",
-                                @"cnn_instance_norm_prelu_kern"
-                                ];
-
-+ (instancetype)filterWithContext:(MetalContext*)context
-                     functionName:(NSString*)functionName
-                   constantValues:(FBMetalInstanceNormConstantValues*)constantValues
-                            width:(NSUInteger)width
-                           height:(NSUInteger)height
-                          channel:(NSUInteger)channel {
-  return [[self alloc] initWithContext:context
-                          functionName:functionName
-                        constantValues:constantValues
-                                 width:width
-                                height:height
-                               channel:channel];
-}
-
- (instancetype)initWithContext:(MetalContext*)context
-                   functionName:(NSString*)functionName
-                 constantValues:(FBMetalInstanceNormConstantValues*)constantValues
-                          width:(NSUInteger)width
-                         height:(NSUInteger)height
-                        channel:(NSUInteger)channel {
-  if ((self = [super initWithFunctionName:functionName
-                              libraryName:@"InstanceNorm"
-                            librarySource:[NSString stringWithCString:metalCode encoding:NSUTF8StringEncoding]
-                                  context:context
-                           constantValues:constantValues])) {
-    _functionName = functionName;
-
-    configuration.channel = channel;
-    configuration.width   = width;
-    configuration.height  = height;
-
-    super.outputTextureDescriptor = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA16Float
-                                                                                       width:width
-                                                                                      height:height
-                                                                                   mipmapped:NO];
-  }
-  return self;
-}
-
- (MTLSize)threadsPerThreadgroup {
-  if (_functionName == kernelNames[0] || _functionName == kernelNames[1]) {
-    NSUInteger maxTotalThreadsPerThreadgroup = [self.pipeline maxTotalThreadsPerThreadgroup];
-    return MTLSizeMake(maxTotalThreadsPerThreadgroup, 1, 1);
-  } else {
-    return MTLSizeMake(kThreadGroupSize_x, kThreadGroupSize_y, 1);
-  }
-}
-
- (MTLSize)threadgroupsPerGrid {
-  if (_functionName == kernelNames[0] || _functionName == kernelNames[1]) {
-    return MTLSizeMake(configuration.channel, 1, 1);
-  } else {
-    MTLSize threadsPerThreadgroup = [self threadsPerThreadgroup];
-    return MTLSizeMake(
-        (self.outputTextureDescriptor.width + threadsPerThreadgroup.width - 1) / threadsPerThreadgroup.width,
-        (self.outputTextureDescriptor.height + threadsPerThreadgroup.height - 1) / threadsPerThreadgroup.height,
-        1);
-  }
-}
-
- (NSString*)replaceConstantValues:(FBMetalConstantValues*)constantValues librarySource:(NSString*)librarySource {
-  FBMetalInstanceNormConstantValues* convolutionConstantValues = (FBMetalInstanceNormConstantValues*)constantValues;
-  std::string source                                           = [librarySource UTF8String];
-
-  REPLACE_CONSTANT(source, convolutionConstantValues->input_width * convolutionConstantValues->input_height, 0);
-  REPLACE_CONSTANT(source, convolutionConstantValues->input_width, 1);
-  REPLACE_CONSTANT(source, convolutionConstantValues->input_height, 2);
-  REPLACE_CONSTANT(source, convolutionConstantValues->input_channels, 3);
-  REPLACE_CONSTANT(source, convolutionConstantValues->prelu_size, 4);
-
-  return [NSString stringWithUTF8String:source.c_str()];
-}
-
- (void)configureArgumentTableWithCommandEncoder:(id<MTLComputeCommandEncoder>)commandEncoder
-                              weightBufferOffset:(NSInteger)weightBufferOffset
-                              outputBufferOffset:(NSInteger)outputBufferOffset {
-  [commandEncoder setBuffer:_scaleBuffer offset:0 atIndex:0];
-  [commandEncoder setBuffer:_biasBuffer offset:0 atIndex:1];
-  [commandEncoder setBuffer:_dataBuffer offset:0 atIndex:2];
-  [commandEncoder setBuffer:_outputBuffer offset:0 atIndex:3];
-  [commandEncoder setBuffer:_avgBuffer offset:0 atIndex:4];
-  [commandEncoder setBuffer:_stdevBuffer offset:0 atIndex:5];
-  [commandEncoder setBuffer:_epsilonBuffer offset:0 atIndex:6];
-  if (_preluBuffer != nil) {
-    [commandEncoder setBuffer:_preluBuffer offset:0 atIndex:7];
-  }
-
-  if (_functionName == kernelNames[0] || _functionName == kernelNames[1]) {
-    MTLSize threads                   = [self threadsPerThreadgroup];
-    const int threadGroupMemoryLength = threads.width * sizeof(float);
-    [commandEncoder setThreadgroupMemoryLength:threadGroupMemoryLength atIndex:0];
-  }
-}
-@end
-
-@implementation FBMetalCNNInstanceNorm {
-  NSMutableArray<FBMetalCNNInstanceNormBase*>* instanceNorm;
-  NSMutableArray<NSString*>* functionNames;
-}
-
-+ (instancetype)filterWithContext:(MetalContext*)context
-                   constantValues:(FBMetalInstanceNormConstantValues*)constantValues
-                            width:(NSUInteger)width
-                           height:(NSUInteger)height
-                          channel:(NSUInteger)channel
-                        withPRelu:(BOOL)withPRelu {
-  return [[self alloc] initWithContext:context
-                        constantValues:constantValues
-                                 width:width
-                                height:height
-                               channel:channel
-                             withPRelu:withPRelu];
-}
-
- (instancetype)initWithContext:(MetalContext*)context
-                 constantValues:(FBMetalInstanceNormConstantValues*)constantValues
-                          width:(NSUInteger)width
-                         height:(NSUInteger)height
-                        channel:(NSUInteger)channel
-                      withPRelu:(BOOL)withPRelu{
-  self.context = context;
-  if (withPRelu) {
-    functionNames = [[NSMutableArray alloc]  initWithObjects:kernelNames[0], kernelNames[1], kernelNames[3], nil];
-  } else {
-    functionNames = [[NSMutableArray alloc]  initWithObjects:kernelNames[0], kernelNames[1], kernelNames[2], nil];
-  }
-  instanceNorm = [NSMutableArray<FBMetalCNNInstanceNormBase*> arrayWithCapacity:functionNames.count];
-  for (int i = 0; i < functionNames.count; i++) {
-    FBMetalCNNInstanceNormBase* instanceNormChunk = [FBMetalCNNInstanceNormBase filterWithContext:context
-                                                                                     functionName:functionNames[i]
-                                                                                   constantValues:constantValues
-                                                                                            width:width
-                                                                                           height:height
-                                                                                          channel:channel];
-
-    instanceNorm[i] = instanceNormChunk;
-  }
-  return self;
-}
-
- (void)loadEpsilon:(const float)epsilon {
-  int length = 1;
-  if (self.epsilonBuffer == nil || [self.epsilonBuffer length] < sizeof(float) * length) {
-    self.epsilonBuffer =
-    [self.context.device newBufferWithLength:sizeof(float) * length options:MTLResourceOptionCPUCacheModeDefault];
-  }
-  if (self.epsilonBuffer) {
-    float* bias_data = (float*)[self.epsilonBuffer contents];
-    bias_data[0]     = epsilon;
-  }
-}
-
- (void)applyFilter:(void (^)(NSError*))completionHandler {
-  for (int i = 0; i < functionNames.count; i++) {
-    FBMetalCNNInstanceNormBase* instanceNormChunk = instanceNorm[i];
-    instanceNormChunk.avgBuffer                   = self.avgBuffer;
-    instanceNormChunk.stdevBuffer                 = self.stdevBuffer;
-    instanceNormChunk.dataBuffer                  = self.dataBuffer;
-    instanceNormChunk.outputBuffer                = self.outputBuffer;
-    instanceNormChunk.scaleBuffer                 = self.scaleBuffer;
-    instanceNormChunk.biasBuffer                  = self.biasBuffer;
-    instanceNormChunk.preluBuffer                 = self.preluBuffer;
-    instanceNormChunk.epsilonBuffer               = self.epsilonBuffer;
-
-    [instanceNorm[i] applyFilter:completionHandler];
-  }
-}
-
-@end
-
-namespace {
-const char *metalCode = R"Metal(
-//  Copyright 2004-present Facebook. All Rights Reserved.
-
-#include <metal_stdlib>
-
-using namespace metal;
-
-constant constexpr int input_size                     [[ function_constant(0) ]];
-constant constexpr ushort input_size_x                [[ function_constant(1) ]];
-constant constexpr ushort input_size_y                [[ function_constant(2) ]];
-constant constexpr ushort channels                    [[ function_constant(3) ]];
-constant constexpr ushort prelu_size                  [[ function_constant(4) ]];
-
-typedef float data_buffer_type;
-
-typedef struct {
-  data_buffer_type data[channels * input_size];
-} input_output_data;
-
-typedef struct {
-  half data[channels];
-} stats_data;
-
-typedef struct {
-  half data[channels];
-} channel_data;
-
-typedef struct {
-  float data[1];
-} epsilon_data;
-
-kernel void cnn_instance_norm_avg_kern(
-                                       constant input_output_data &inputBuffer    [[ buffer(2) ]],
-                                       device stats_data &avgBuffer               [[ buffer(4) ]],
-                                       threadgroup float *per_thread_sum          [[ threadgroup(0) ]],
-                                       ushort channel                             [[ threadgroup_position_in_grid ]],
-                                       ushort num_threads                         [[ threads_per_threadgroup ]],
-                                       ushort tid                                 [[ thread_index_in_threadgroup ]]
-                                       ) {
-  const int chunk_size = (input_size + num_threads - 1) / num_threads;
-
-  constant float* input  = &inputBuffer.data[channel * input_size + tid * chunk_size];
-
-  const int max_index = min(chunk_size, input_size - tid * chunk_size);
-
-  float4 sum = 0;
-  int i = 0;
-  for (; i < max_index-16; i+=16) {
-    sum += ((constant float4 *) input)[i/4];
-    sum += ((constant float4 *) input)[i/4+1];
-    sum += ((constant float4 *) input)[i/4+2];
-    sum += ((constant float4 *) input)[i/4+3];
-  }
-  for (; i < max_index-8; i+=8) {
-    sum += ((constant float4 *) input)[i/4];
-    sum += ((constant float4 *) input)[i/4+1];
-  }
-  for (; i < max_index-4; i+=4) {
-    sum += ((constant float4 *) input)[i/4];
-  }
-  for (; i < max_index; i++) {
-    sum[0] += input[i];
-  }
-  per_thread_sum[tid] = sum[0] + sum[1] + sum[2] + sum[3];
-
-  threadgroup_barrier(mem_flags::mem_threadgroup);
-
-  const int last_bit_size = 32;
-  const int last_bit_section = num_threads/last_bit_size;
-
-  if (tid < last_bit_size) {
-    sum = 0;
-    for (int t = 0; t < last_bit_section/4; t++) {
-      sum += ((threadgroup float4 *) per_thread_sum)[tid * last_bit_section/4 + t];
-    }
-    per_thread_sum[tid] = sum[0] + sum[1] + sum[2] + sum[3];
-  }
-
-  threadgroup_barrier(mem_flags::mem_threadgroup);
-
-  if (tid == 0) {
-    sum = 0;
-    for (int t = 0; t < last_bit_size/4; t++) {
-      sum += ((threadgroup float4 *) per_thread_sum)[t];
-    }
-    avgBuffer.data[channel] = (sum[0] + sum[1] + sum[2] + sum[3]) / input_size;
-  }
-}
-
-kernel void cnn_instance_norm_stdev_kern(
-                                         constant channel_data &scaleBuffer         [[ buffer(0) ]],
-                                         constant channel_data &biasBuffer          [[ buffer(1) ]],
-                                         constant input_output_data &inputBuffer    [[ buffer(2) ]],
-                                         device stats_data &avgBuffer               [[ buffer(4) ]],
-                                         device stats_data &stdevBuffer             [[ buffer(5) ]],
-                                         constant epsilon_data &epsilon             [[ buffer(6) ]],
-                                         threadgroup float *per_thread_sq_norm      [[ threadgroup(0) ]],
-                                         ushort channel                             [[ threadgroup_position_in_grid ]],
-                                         ushort num_threads                         [[ threads_per_threadgroup ]],
-                                         ushort tid                                 [[ thread_index_in_threadgroup ]]
-                                         ) {
-  const int chunk_size = (input_size + num_threads - 1) / num_threads;
-
-  constant float* input  = &inputBuffer.data[channel * input_size + tid * chunk_size];
-
-  const int max_index = min(chunk_size, input_size - tid * chunk_size);
-
-  float4 sum = 0;
-  float mean = avgBuffer.data[channel];
-
-  int i = 0;
-  for (; i < max_index-16; i+=16) {
-    float4 delta = ((constant float4 *) input)[i/4] - mean;
-    sum += delta * delta;
-    delta = ((constant float4 *) input)[i/4+1] - mean;
-    sum += delta * delta;
-    delta = ((constant float4 *) input)[i/4+2] - mean;
-    sum += delta * delta;
-    delta = ((constant float4 *) input)[i/4+3] - mean;
-    sum += delta * delta;
-  }
-  for (; i < max_index-8; i+=8) {
-    float4 delta = ((constant float4 *) input)[i/4] - mean;
-    sum += delta * delta;
-    delta = ((constant float4 *) input)[i/4+1] - mean;
-    sum += delta * delta;
-  }
-  for (; i < max_index-4; i+=4) {
-    float4 delta = ((constant float4 *) input)[i/4] - mean;
-    sum += delta * delta;
-  }
-  for (; i < max_index; i++) {
-    half delta = input[i] - mean;
-    sum[0] += delta * delta;
-  }
-  per_thread_sq_norm[tid] = sum[0] + sum[1] + sum[2] + sum[3];
-
-  threadgroup_barrier(mem_flags::mem_threadgroup);
-
-  const int last_bit_size = 32;
-  const int last_bit_section = num_threads/last_bit_size;
-
-  if (tid < last_bit_size) {
-    sum = 0;
-    for (int t = 0; t < last_bit_section/4; t++) {
-      sum += ((threadgroup float4 *) per_thread_sq_norm)[tid * last_bit_section/4 + t];
-    }
-    per_thread_sq_norm[tid] = sum[0] + sum[1] + sum[2] + sum[3];
-  }
-
-  threadgroup_barrier(mem_flags::mem_threadgroup);
-
-  if (tid == 0) {
-    sum = 0;
-    for (int t = 0; t < last_bit_size/4; t++) {
-      sum += ((threadgroup float4 *) per_thread_sq_norm)[t];
-    }
-
-    float inv_stdev = 1.0h / sqrt((sum[0] + sum[1] + sum[2] + sum[3]) / input_size + epsilon.data[0]);
-    float scale = inv_stdev * scaleBuffer.data[channel];
-    float shift = biasBuffer.data[channel] - mean * scale;
-
-    avgBuffer.data[channel] = scale; //scale
-    stdevBuffer.data[channel] = shift; // shift
-  }
-}
-
-kernel void cnn_instance_norm_last_kern(
-                                        constant input_output_data &inputBuffer           [[ buffer(2) ]],
-                                        device input_output_data &outputBuffer            [[ buffer(3) ]],
-                                        constant stats_data &avgBuffer                    [[ buffer(4) ]], //scale
-                                        constant stats_data &stdevBuffer                  [[ buffer(5) ]], //shift
-                                        uint2 gid                                         [[ thread_position_in_grid ]]) {
-  if (gid.x < input_size_x && gid.y < input_size_y) {
-    for (int c = 0; c < channels; c++) {
-      int idx = c * input_size + gid.y * input_size_x + gid.x;
-      outputBuffer.data[idx] = half(inputBuffer.data[idx]) * avgBuffer.data[c] + stdevBuffer.data[c];
-    }
-  }
-}
-
-kernel void cnn_instance_norm_prelu_kern(
-                                         constant input_output_data &inputBuffer           [[ buffer(2) ]],
-                                         device input_output_data &outputBuffer            [[ buffer(3) ]],
-                                         constant stats_data &avgBuffer                    [[ buffer(4) ]], //scale
-                                         constant stats_data &stdevBuffer                  [[ buffer(5) ]], //shift
-                                         constant channel_data &preluBuffer                [[ buffer(7) ]], //prelu weights
-                                         uint2 gid                                         [[ thread_position_in_grid ]]) {
-  if (gid.x < input_size_x && gid.y < input_size_y) {
-    for (int c = 0; c < channels; c++) {
-      int idx = c * input_size + gid.y * input_size_x + gid.x;
-      half weight = preluBuffer.data[prelu_size > 1 ? c : 0];
-      half value = half(inputBuffer.data[idx]) * avgBuffer.data[c] + stdevBuffer.data[c];
-      outputBuffer.data[idx] = value > 0 ? value : value * weight;
-    }
-  }
-}
-)Metal";
-}
--- a/caffe2/contrib/ios/metal/FBMetalCNNNoOp.h
+++ b/caffe2/contrib/ios/metal/FBMetalCNNNoOp.h
@ -1,14 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#pragma once
-
-#import "arm_neon_support.h"
-#import "MetalImageFilter.h"
-
-@interface FBMetalCNNNoOp: MetalImageFilter
-
-+ (instancetype)filterWithContext:(MetalContext*)context
-                            width:(NSUInteger)width
-                           height:(NSUInteger)height;
-
-@end
--- a/caffe2/contrib/ios/metal/FBMetalCNNNoOp.mm
+++ b/caffe2/contrib/ios/metal/FBMetalCNNNoOp.mm
@ -1,54 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#import "FBMetalCNNNoOp.h"
-#import "MetalShaderUtilities.h"
-
-namespace {
-  extern const char *metalCode;
-}
-
-@implementation FBMetalCNNNoOp
-
- (MTLSize)threadsPerThreadgroup {
-    return MTLSizeMake(1, 1, 1);
-}
-
- (MTLSize)threadgroupsPerGrid {
-    return MTLSizeMake(1, 1, 1);
-}
-
-+ (instancetype)filterWithContext:(MetalContext*)context
-                            width:(NSUInteger)width
-                           height:(NSUInteger)height {
-  return [[self alloc] initWithContext:context width:width height:height];
-}
-
- (instancetype)initWithContext:(MetalContext*)context
-                          width:(NSUInteger)width
-                         height:(NSUInteger)height {
-  if ((self = [super initWithFunctionName:@"cnn_no_op_kern"
-                              libraryName:@"NoOp"
-                            librarySource:[NSString stringWithCString:metalCode encoding:NSUTF8StringEncoding]
-                                  context:context
-                           constantValues:nil])) {
-    super.outputTextureDescriptor = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA16Float
-                                                                                       width:width
-                                                                                      height:height
-                                                                                   mipmapped:NO];
-  }
-  return self;
-}
-@end
-
-namespace {
-const char *metalCode = R"Metal(
-//  Copyright 2004-present Facebook. All Rights Reserved.
-
-#include <metal_stdlib>
-using namespace metal;
-
-kernel void cnn_no_op_kern() {
-  return;
-}
-)Metal";
-}
--- a/caffe2/contrib/ios/metal/FBMetalCNNPRelu.h
+++ b/caffe2/contrib/ios/metal/FBMetalCNNPRelu.h
@ -1,40 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#pragma once
-
-#import "arm_neon_support.h"
-#import "MetalImageFilter.h"
-
-class FBMetalPReluConstantValues : public FBMetalConstantValues {
-public:
-  ushort input_width;
-  ushort input_height;
-  ushort input_channels;
-  ushort weight_length;
-
-  FBMetalPReluConstantValues(
-                             ushort _input_width,
-                             ushort _input_height,
-                             ushort _input_channels,
-                             ushort _weight_length)
-  : input_width(_input_width),
-  input_height(_input_height),
-  input_channels(_input_channels),
-  weight_length(_weight_length) {}
-
-  std::string to_string();
-};
-
-@interface FBMetalCNNPRelu : MetalImageFilter
-
-@property (nonatomic, strong) id<MTLBuffer> dataBuffer;
-@property (nonatomic, strong) id<MTLBuffer> outputBuffer;
-@property (nonatomic, strong) id<MTLBuffer> weightBuffer;
-
-+ (instancetype)filterWithContext:(MetalContext*)context
-                   constantValues:(FBMetalPReluConstantValues*)constantValues
-                            width:(NSUInteger)width
-                           height:(NSUInteger)height
-                          channel:(NSUInteger)channel;
-
-@end
--- a/caffe2/contrib/ios/metal/FBMetalCNNPRelu.mm
+++ b/caffe2/contrib/ios/metal/FBMetalCNNPRelu.mm
@ -1,131 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#import "FBMetalCNNPRelu.h"
-#import "MetalShaderUtilities.h"
-
-#include "caffe2/core/logging.h"
-
-std::string FBMetalPReluConstantValues::to_string() {
-  std::ostringstream ss;
-
-  ss  << ":" <<
-  input_width << ":" <<
-  input_height << ":" <<
-  weight_length;
-
-  return ss.str();
-}
-
-@interface FBMetalCNNPRelu() {
-  struct {
-    ushort channel;
-    ushort width;
-    ushort height;
-  } configuration;
-}
-@end
-
-namespace {
-  extern const char *metalCode;
-}
-
-@implementation FBMetalCNNPRelu
-
-static constexpr size_t kThreadGroupSize_x = 4;
-static constexpr size_t kThreadGroupSize_y = 8;
-
-+ (instancetype)filterWithContext:(MetalContext*)context
-                   constantValues:(FBMetalPReluConstantValues*)constantValues
-                            width:(NSUInteger)width
-                           height:(NSUInteger)height
-                          channel:(NSUInteger)channel {
-  return [[self alloc] initWithContext:context constantValues:constantValues width:width height:height channel:channel];
-}
-
- (instancetype)initWithContext:(MetalContext*)context
-                 constantValues:(FBMetalPReluConstantValues*)constantValues
-                          width:(NSUInteger)width
-                         height:(NSUInteger)height
-                        channel:(NSUInteger)channel {
-  if ((self = [super initWithFunctionName:@"cnn_prelu_kern"
-                              libraryName:@"PRelu"
-                            librarySource:[NSString stringWithCString:metalCode encoding:NSUTF8StringEncoding]
-                                  context:context
-                           constantValues:constantValues])) {
-    configuration.channel = channel;
-    configuration.width = width;
-    configuration.height = height;
-
-    super.outputTextureDescriptor = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA16Float
-                                                                                       width:width
-                                                                                      height:height
-                                                                                   mipmapped:NO];
-  }
-  return self;
-}
-
- (MTLSize) threadsPerThreadgroup {
-  return MTLSizeMake(kThreadGroupSize_x, kThreadGroupSize_y, 1);
-}
-
- (NSString*)replaceConstantValues:(FBMetalConstantValues *)constantValues
-                     librarySource:(NSString*)librarySource {
-  FBMetalPReluConstantValues* convolutionConstantValues = (FBMetalPReluConstantValues *) constantValues;
-  std::string source = [librarySource UTF8String];
-
-  REPLACE_CONSTANT(source, convolutionConstantValues->input_width, 0);
-  REPLACE_CONSTANT(source, convolutionConstantValues->input_height, 1);
-  REPLACE_CONSTANT(source, convolutionConstantValues->input_channels, 2);
-  REPLACE_CONSTANT(source, convolutionConstantValues->weight_length,  3);
-
-  return [NSString stringWithUTF8String:source.c_str()];
-}
-
- (void)configureArgumentTableWithCommandEncoder:(id<MTLComputeCommandEncoder>)commandEncoder
-                              weightBufferOffset:(NSInteger)weightBufferOffset
-                              outputBufferOffset:(NSInteger)outputBufferOffset {
-  [commandEncoder setBuffer:_weightBuffer offset:0 atIndex:0];
-  [commandEncoder setBuffer:_outputBuffer offset:0 atIndex:1];
-}
-
-@end
-
-namespace {
-const char *metalCode = R"Metal(
-//  Copyright 2004-present Facebook. All Rights Reserved.
-
-#include <metal_stdlib>
-
-using namespace metal;
-
-constant constexpr ushort input_size_x      [[ function_constant(0) ]];
-constant constexpr ushort input_size_y      [[ function_constant(1) ]];
-constant constexpr ushort input_channels    [[ function_constant(2) ]];
-constant constexpr ushort kernels           [[ function_constant(3) ]];
-
-constant constexpr int input_size = input_size_x * input_size_y;
-
-typedef float data_buffer_type;
-typedef struct {
-  data_buffer_type data[input_channels][input_size];
-} output_data;
-
-typedef struct {
-  half data[kernels];
-} filter_data;
-
-kernel void cnn_prelu_kern(
-                           constant filter_data &filter             [[ buffer(0) ]],
-                           device output_data &output               [[ buffer(1) ]],
-                           uint2 gid                                [[thread_position_in_grid]]) {
-  if (gid.x < input_size_x && gid.y < input_size_y) {
-    int idx = gid.y * input_size_x + gid.x;
-    for (int c = 0; c < input_channels; c++) {
-      half             weight = filter.data[kernels > 1 ? c : 0];
-      data_buffer_type value  = output.data[c][idx];
-      output.data[c][idx]     = value > 0 ? value : value * weight;
-    }
-  }
-}
-)Metal";
-}
--- a/caffe2/contrib/ios/metal/FBMetalConstantValues.h
+++ b/caffe2/contrib/ios/metal/FBMetalConstantValues.h
@ -1,10 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#pragma once
-
-#import <string>
-
-class FBMetalConstantValues {
-public:
-  virtual std::string to_string() = 0;
-};
--- a/caffe2/contrib/ios/metal/GivenTensorMetalFillOp.mm
+++ b/caffe2/contrib/ios/metal/GivenTensorMetalFillOp.mm
@ -1,38 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#include "caffe2/operators/filler_op.h"
-#import "MetalCaffeContext.h"
-#import "data_conversion.h"
-
-namespace caffe2 {
-
-template <typename T1, typename T2>
-class GivenTensorMetalFillOp final : public FillerOp<MetalCaffeContext> {
- public:
-  GivenTensorMetalFillOp(const OperatorDef& operator_def, Workspace* ws)
-      : FillerOp<MetalCaffeContext>(operator_def, ws) {
-    auto source_values = OperatorBase::template GetRepeatedArgument<float>("values");
-    for (float f : source_values) {
-      values_.push_back(f);
-    }
-  }
-
-  bool Fill(TensorMetal* output) override {
-    DCHECK_EQ(output->size(), values_.size()) << "output size: " << output->size() << " given size: " << values_.size();
-
-    id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void*)output->template mutable_data<T1>());
-    T2* output_data            = (T2*)[weightBuffer contents];
-
-    CAFFE_ENFORCE(output_data != NULL);
-    memcpycvt(output_data, values_.data(), output->size());
-    return true;
-  }
-
- private:
-  vector<float> values_;
-};
-
-// uint16_t is used because caffe2 does not support float16_t
-REGISTER_CPU_OPERATOR_WITH_ENGINE(GivenTensorFloat16MetalFill, METAL, GivenTensorMetalFillOp<uint16_t, float16_t>);
-OPERATOR_SCHEMA(GivenTensorFloat16MetalFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
-}
--- a/caffe2/contrib/ios/metal/GivenWeightTensorFillOp.mm
+++ b/caffe2/contrib/ios/metal/GivenWeightTensorFillOp.mm
@ -1,200 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#include "caffe2/core/common.h"
-
-#ifndef CAFFE2_MOBILE
-#error "Caffe2 mobile state not defined"
-#endif
-
-#if CAFFE2_MOBILE
-
-#include "caffe2/operators/filler_op.h"
-#import "data_conversion.h"
-#import "FBMetalCNNConvolution.h"
-#import "MetalCaffeContext.h"
-
-namespace caffe2 {
-class GivenWeightTensorFillOp final : public FillerOp<MetalCaffeContext> {
- public:
-  GivenWeightTensorFillOp(const OperatorDef &operator_def, Workspace *ws)
-      : FillerOp<MetalCaffeContext>(operator_def, ws) {
-    auto source_values = OperatorBase::template GetRepeatedArgument<float>("values");
-    for (float f : source_values) {
-      values_.push_back(f);
-    }
-  }
-
-  bool Fill(Tensor<MetalCaffeContext> *output) override { return true; }
-
-  bool RunOnDevice() override {
-    auto *output = Operator<MetalCaffeContext>::Output(0);
-    CAFFE_ENFORCE_EQ(shape_.size(), 4);
-    auto shape = shape_;
-
-    int kernels = shape[0];
-    int kernel_channels = shape[1];
-    int kernel_height = shape[2];
-    int kernel_width = shape[3];
-
-    reformatKernelImage<weight_buffer_type>(
-        (const float *)values_.data(),
-        kernels,
-        kernel_channels,
-        kernel_width,
-        kernel_height,
-        false,
-        [&](int multiplier) -> weight_buffer_type * {
-          // buffer_size = kernels * kernel_channels * kernel_width * kernel_height * multiplier;
-          shape.push_back(multiplier); // multiplier = ceil(aligned_kernel_stride / kernel_stride)
-          output->Resize(shape);
-
-          id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void *)output->template mutable_data<uint16_t>());
-          weight_buffer_type *output_data = (weight_buffer_type *)[weightBuffer contents];
-          return output_data;
-        });
-    return true;
-  }
-
- private:
-  vector<float> values_;
-};
-
-REGISTER_CPU_OPERATOR_WITH_ENGINE(GivenWeightTensorFill, METAL, GivenWeightTensorFillOp);
-OPERATOR_SCHEMA(GivenWeightTensorFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
-
-class GivenTransposeWeightTensorFillOp final : public FillerOp<MetalCaffeContext> {
- public:
-  GivenTransposeWeightTensorFillOp(const OperatorDef &operator_def, Workspace *ws)
-      : FillerOp<MetalCaffeContext>(operator_def, ws) {
-    auto source_values = OperatorBase::template GetRepeatedArgument<float>("values");
-    for (float f : source_values) {
-      values_.push_back(f);
-    }
-  }
-
-  bool Fill(Tensor<MetalCaffeContext> *output) override { return true; }
-
-  bool RunOnDevice() override {
-    auto *output = Operator<MetalCaffeContext>::Output(0);
-    CAFFE_ENFORCE_EQ(shape_.size(), 4);
-    auto shape = shape_;
-
-    int kernels = shape[0];
-    int kernel_channels = shape[1];
-    int kernel_height = shape[2];
-    int kernel_width = shape[3];
-
-    std::swap(kernels, kernel_channels);
-
-    reformatKernelImage<weight_buffer_type>(
-        (const float *)values_.data(),
-        kernels,
-        kernel_channels,
-        kernel_width,
-        kernel_height,
-        true,
-        [&](int multiplier) -> weight_buffer_type * {
-          // buffer_size = kernels * kernel_channels * kernel_width * kernel_height * multiplier;
-          shape_.push_back(multiplier); // multiplier = ceil(aligned_kernel_stride / kernel_stride)
-          output->Resize(shape_);
-
-          id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void *)output->template mutable_data<uint16_t>());
-          weight_buffer_type *output_data = (weight_buffer_type *)[weightBuffer contents];
-          return output_data;
-        });
-    return true;
-  }
-
- private:
-  vector<float> values_;
-};
-
-REGISTER_CPU_OPERATOR_WITH_ENGINE(GivenTransposeWeightTensorFill, METAL, GivenTransposeWeightTensorFillOp);
-OPERATOR_SCHEMA(GivenTransposeWeightTensorFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
-
-class CopyWeightTensorToMetalGPUOp final : public Operator<MetalCaffeContext> {
- public:
-  CopyWeightTensorToMetalGPUOp(const OperatorDef &operator_def, Workspace *ws)
-      : Operator<MetalCaffeContext>(operator_def, ws) {}
-
-  bool RunOnDevice() override {
-    const Blob *blob = Inputs()[0];
-    const TensorCPU &X = blob->Get<TensorCPU>();
-    auto *output = Operator<MetalCaffeContext>::Output(0);
-    auto shape = X.dims();
-    CAFFE_ENFORCE_EQ(shape.size(), 4);
-
-    int kernels = shape[0];
-    int kernel_channels = shape[1];
-    int kernel_height = shape[2];
-    int kernel_width = shape[3];
-
-    reformatKernelImage<weight_buffer_type>(
-        (const float *)X.template data<float>(),
-        kernels,
-        kernel_channels,
-        kernel_width,
-        kernel_height,
-        false,
-        [&](int multiplier) -> weight_buffer_type * {
-          // buffer_size = kernels * kernel_channels * kernel_width * kernel_height * multiplier;
-          shape.push_back(multiplier); // multiplier = ceil(aligned_kernel_stride / kernel_stride)
-          output->Resize(shape);
-
-          id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void *)output->template mutable_data<uint16_t>());
-          weight_buffer_type *output_data = (weight_buffer_type *)[weightBuffer contents];
-          return output_data;
-        });
-    return true;
-  }
-};
-
-REGISTER_CPU_OPERATOR_WITH_ENGINE(CopyWeightTensorToMetalGPU, METAL, CopyWeightTensorToMetalGPUOp);
-OPERATOR_SCHEMA(CopyWeightTensorToMetalGPU).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
-
-class CopyTransposeWeightTensorToMetalGPUOp final : public Operator<MetalCaffeContext> {
- public:
-  CopyTransposeWeightTensorToMetalGPUOp(const OperatorDef &operator_def, Workspace *ws)
-      : Operator<MetalCaffeContext>(operator_def, ws) {}
-
-  bool RunOnDevice() override {
-    const Blob *blob = Inputs()[0];
-    const TensorCPU &X = blob->Get<TensorCPU>();
-    auto *output = Operator<MetalCaffeContext>::Output(0);
-    auto shape_ = X.dims();
-    CAFFE_ENFORCE_EQ(shape_.size(), 4);
-    auto shape = shape_;
-
-    int kernels = shape[0];
-    int kernel_channels = shape[1];
-    int kernel_height = shape[2];
-    int kernel_width = shape[3];
-
-    std::swap(kernels, kernel_channels);
-
-    reformatKernelImage<weight_buffer_type>(
-        (const float *)X.template data<float>(),
-        kernels,
-        kernel_channels,
-        kernel_width,
-        kernel_height,
-        true,
-        [&](int multiplier) -> weight_buffer_type * {
-          // buffer_size = kernels * kernel_channels * kernel_width * kernel_height * multiplier;
-          shape_.push_back(multiplier); // multiplier = ceil(aligned_kernel_stride / kernel_stride)
-          output->Resize(shape_);
-
-          id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void *)output->template mutable_data<uint16_t>());
-          weight_buffer_type *output_data = (weight_buffer_type *)[weightBuffer contents];
-          return output_data;
-        });
-    return true;
-  }
-};
-
-REGISTER_CPU_OPERATOR_WITH_ENGINE(CopyTransposeWeightTensorToMetalGPU, METAL, CopyTransposeWeightTensorToMetalGPUOp);
-OPERATOR_SCHEMA(CopyTransposeWeightTensorToMetalGPU).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
-
-} // namespace caffe2
-
-#endif // CAFFE2_MOBILE
--- a/caffe2/contrib/ios/metal/MetalCaffeContext.h
+++ b/caffe2/contrib/ios/metal/MetalCaffeContext.h
@ -1,87 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#pragma once
-
-#include "caffe2/core/context.h"
-#include "caffe2/core/operator.h"
-#include "caffe2/utils/math.h"
-
-#import <Metal/Metal.h>
-
-namespace caffe2 {
-
-struct MetalAllocator final : CPUAllocator {
-  id<MTLDevice> device;
-
-  MetalAllocator(id<MTLDevice> _device);
-
-  ~MetalAllocator();
-
-  void *New(size_t nbytes) override;
-
-  void Delete(void *data) override;
-
-  id<MTLBuffer> Buffer(void *data);
-};
-
-MetalAllocator *GetMetalAllocator();
-
-class MetalCaffeContext final {
- public:
-  MetalCaffeContext() : random_seed_(math::randomNumberSeed()) {}
-  explicit MetalCaffeContext(const DeviceOption &option)
-      : random_seed_(option.has_random_seed() ? option.random_seed() : math::randomNumberSeed()) {
-    CHECK_EQ(option.device_type(), CPU);
-  }
-
-  ~MetalCaffeContext() {}
-
-  inline void SwitchToDevice(int stream_id) {}
-  inline void SwitchToDevice() {
-    SwitchToDevice(0);
-  }
-
-  inline bool FinishDeviceComputation() { return true; }
-
-  inline std::mt19937 &RandGenerator() {
-    if (!random_generator_.get()) {
-      random_generator_.reset(new std::mt19937(random_seed_));
-    }
-    return *random_generator_.get();
-  }
-
-  static void *New(size_t nbytes);
-
-  static void Delete(void *data);
-
-  // Two copy functions that deals with cross-device copies.
-  template <class SrcContext, class DstContext>
-  inline void CopyBytes(size_t nbytes, const void *src, void *dst);
-
-  template <typename T, class SrcContext, class DstContext>
-  inline void Copy(size_t n, const T *src, T *dst) {
-    if (std::is_fundamental<T>::value) {
-      CopyBytes<SrcContext, DstContext>(n * sizeof(T), static_cast<const void *>(src), static_cast<void *>(dst));
-    } else {
-      for (int i = 0; i < n; ++i) {
-        dst[i] = src[i];
-      }
-    }
-  }
-
-  template <class SrcContext, class DstContext>
-  inline void CopyItems(const TypeMeta &meta, size_t n, const void *src, void *dst) {
-    if (meta.copy()) {
-      meta.copy()(src, dst, n);
-    } else {
-      CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
-    }
-  }
-
- protected:
-  int random_seed_{1701};
-  std::unique_ptr<std::mt19937> random_generator_;
-};
-
-typedef Tensor<MetalCaffeContext> TensorMetal;
-}
--- a/caffe2/contrib/ios/metal/MetalCaffeContext.mm
+++ b/caffe2/contrib/ios/metal/MetalCaffeContext.mm
@ -1,59 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#import "MetalCaffeContext.h"
-#import "MetalContext.h"
-
-namespace caffe2 {
-CAFFE_KNOWN_TYPE(TensorMetal);
-
-static NSMutableDictionary<NSNumber *, id<MTLBuffer>> *buffer_cache = nil;
-
-// class MetalAllocator
-MetalAllocator::MetalAllocator(id<MTLDevice> _device) : device(_device) {
-  buffer_cache = [NSMutableDictionary<NSNumber *, id<MTLBuffer>> dictionary];
-}
-
-MetalAllocator::~MetalAllocator() {
-  for (id key in buffer_cache) {
-    id<MTLBuffer> buffer = buffer_cache[key];
-    [buffer_cache removeObjectForKey:key];
-  }
-}
-
-void *MetalAllocator::New(size_t nbytes) {
-  id<MTLBuffer> buffer = [device newBufferWithLength:nbytes options:MTLResourceCPUCacheModeDefaultCache];
-  void *data = [buffer contents];
-  NSNumber *key = @((unsigned long long)data);
-  buffer_cache[key] = buffer;
-  return data;
-}
-
-void MetalAllocator::Delete(void *data) {
-  NSNumber *key = @((unsigned long long)data);
-  id<MTLBuffer> buffer = buffer_cache[key];
-  [buffer_cache removeObjectForKey:key];
-  buffer = nil;
-}
-
-id<MTLBuffer> MetalAllocator::Buffer(void *data) {
-  NSNumber *key = @((unsigned long long)data);
-  return buffer_cache[key];
-}
-
-// the Metal Allocator
-static MetalAllocator *MetalAllocatorInstance = NULL;
-
-// Get the Metal Allocator
-MetalAllocator *GetMetalAllocator() {
-  if (MetalAllocatorInstance == NULL) {
-    MetalAllocatorInstance = new MetalAllocator([MetalContext getContext].device);
-  }
-  CAFFE_ENFORCE(MetalAllocatorInstance != NULL);
-  return MetalAllocatorInstance;
-}
-
-// class MetalCaffeContext
-void *MetalCaffeContext::New(size_t nbytes) { return MetalAllocatorInstance->New(nbytes); }
-
-void MetalCaffeContext::Delete(void *data) { MetalAllocatorInstance->Delete(data); }
-}
--- a/caffe2/contrib/ios/metal/MetalContext.h
+++ b/caffe2/contrib/ios/metal/MetalContext.h
@ -1,17 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#pragma once
-
-#import <Foundation/Foundation.h>
-
-@protocol MTLDevice, MTLLibrary, MTLCommandQueue;
-
-@interface MetalContext : NSObject
-
-@property (atomic, strong) id<MTLDevice>       device;
-@property (atomic, strong) id<MTLLibrary>      library;
-@property (atomic, strong) id<MTLCommandQueue> commandQueue;
-
-+ (instancetype)getContext;
-
-@end
--- a/caffe2/contrib/ios/metal/MetalContext.m
+++ b/caffe2/contrib/ios/metal/MetalContext.m
@ -1,27 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#import "MetalContext.h"
-
-#import <Metal/Metal.h>
-
-static MetalContext* metalContext = NULL;
-
-@implementation MetalContext
-
-+ (instancetype)getContext {
-  if (metalContext == NULL) {
-    metalContext = [[self alloc] initWithDevice:nil];
-  }
-  return metalContext;
-}
-
- (instancetype)initWithDevice:(id<MTLDevice>)device {
-  if ((self = [super init])) {
-    _device       = device ?: MTLCreateSystemDefaultDevice();
-    _library      = [_device newDefaultLibrary];
-    _commandQueue = [_device newCommandQueue];
-  }
-  return self;
-}
-
-@end
--- a/caffe2/contrib/ios/metal/MetalConvMTLBufferOp.mm
+++ b/caffe2/contrib/ios/metal/MetalConvMTLBufferOp.mm
@ -1,163 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#include "caffe2/core/common.h"
-
-#ifndef CAFFE2_MOBILE
-#error "Caffe2 mobile state not defined"
-#endif
-
-#if CAFFE2_MOBILE
-
-#include "caffe2/operators/conv_pool_op_base.h"
-#include "caffe2/operators/conv_transpose_unpool_op_base.h"
-#import "metal_convolution.h"
-#import "MetalCaffeContext.h"
-
-namespace caffe2 {
-
-class MetalConvMTLBufferOp final : public ConvPoolOpBase<MetalCaffeContext> {
- public:
-  MetalConvMTLBufferOp(const OperatorDef& operator_def, Workspace* ws)
-      : ConvPoolOpBase<MetalCaffeContext>(operator_def, ws) {
-    OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "Metal only supports NCHW order.");
-  }
-
-  bool RunOnDeviceWithOrderNCHW() override;
-
-  const TensorCPU& InputFromTensorCPU(int idx) {
-    const Blob*                           blob = Inputs()[idx];
-    return blob->Get<TensorCPU>();
-  }
-
-  // Input: X, W, b
-  // Output: Y
-  INPUT_TAGS(INPUT, FILTER, BIAS);
-};
-
-bool MetalConvMTLBufferOp::RunOnDeviceWithOrderNCHW() {
-  const TensorMetal& X      = Input(INPUT);
-  auto&              filter = Input(FILTER);
-  auto&              bias   = InputFromTensorCPU(BIAS);
-  TensorMetal*       Y      = Output(0);
-
-  CAFFE_ENFORCE(X.ndim() == 4, "Input dim should be 4");
-  const int N = X.dim32(0), C = X.dim32(1);
-  CAFFE_ENFORCE(filter.ndim(), 5);
-  const int M = filter.dim32(0);
-
-  CAFFE_ENFORCE(filter.dim32(1) == C, "");
-  CAFFE_ENFORCE(filter.dim32(2) == this->kernel_h_, "");
-  CAFFE_ENFORCE(filter.dim32(3) == this->kernel_w_, "");
-  CAFFE_ENFORCE(bias.ndim() == 1, "");
-  CAFFE_ENFORCE(bias.dim32(0) == M, "");
-
-  ConvPoolOpBase<MetalCaffeContext>::SetOutputSize(X, Y, filter.dim32(0));
-
-  id<MTLBuffer> inputDataBuffer = GetMetalAllocator()->Buffer((void*)X.template data<float>());
-  id<MTLBuffer> outputDataBuffer = GetMetalAllocator()->Buffer((void*)Y->template mutable_data<float>());
-  id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void*)filter.template data<uint16_t>());
-  const float* biasData = bias.template data<float>();
-
-  metal_convolution(
-      inputDataBuffer,
-      X.dim32(1),
-      X.dim32(3),
-      X.dim32(2),
-      stride_h_,
-      stride_w_,
-      pad_t_,
-      pad_l_,
-      pad_b_,
-      pad_r_,
-      weightBuffer,
-      filter.dim32(0),
-      filter.dim32(1),
-      filter.dim32(3),
-      filter.dim32(2),
-      outputDataBuffer,
-      Y->dim32(1),
-      Y->dim32(3),
-      Y->dim32(2),
-      biasData,
-      bias.dim32(0),
-      false);
-
-  return true;
-}
-
-REGISTER_CPU_OPERATOR_WITH_ENGINE(MetalConv, METAL, MetalConvMTLBufferOp);
-OPERATOR_SCHEMA(MetalConv).NumInputs(3).NumOutputs(1);
-
-class MetalConvTransposeMTLBufferOp final : public ConvTransposeUnpoolBase<MetalCaffeContext> {
- public:
-  MetalConvTransposeMTLBufferOp(const OperatorDef& operator_def, Workspace* ws)
-      : ConvTransposeUnpoolBase<MetalCaffeContext>(operator_def, ws) {
-    OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "Metal only supports NCHW order.");
-  }
-
-  bool RunOnDeviceWithOrderNCHW() override;
-
- private:
-  // Input: X, W, b
-  // Output: Y
-  INPUT_TAGS(INPUT, FILTER, BIAS);
-
-  const TensorCPU& InputFromTensorCPU(int idx) {
-    const Blob*                           blob = Inputs()[idx];
-    return blob->Get<TensorCPU>();
-  }
-};
-
-bool MetalConvTransposeMTLBufferOp::RunOnDeviceWithOrderNCHW() {
-  const TensorMetal& X      = Input(INPUT);
-  auto&              filter = Input(FILTER);
-  auto&              bias   = InputFromTensorCPU(BIAS);
-  TensorMetal*       Y      = Output(0);
-
-  const int N = X.dim32(0), M = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
-  CAFFE_ENFORCE(filter.ndim() == 5, "filter must be 4D tensor");
-  CAFFE_ENFORCE(filter.dim32(0) == M, "filter number must be equal to input channel number");
-  const int C = filter.dim32(1);
-  CAFFE_ENFORCE(filter.dim32(2) == kernel_h_, "filter height must be equal to kernel height");
-  CAFFE_ENFORCE(filter.dim32(3) == kernel_w_, "filter width must be equal to kernel width");
-  CAFFE_ENFORCE(bias.ndim() == 1, "bias must be 1D tensor");
-  CAFFE_ENFORCE(bias.dim32(0) == C, "bias dimension must be equal to output channel number");
-
-  ConvTransposeUnpoolBase<MetalCaffeContext>::SetOutputSize(X, Y, C);
-
-  id<MTLBuffer> inputDataBuffer = GetMetalAllocator()->Buffer((void*)X.template data<float>());
-  id<MTLBuffer> outputDataBuffer = GetMetalAllocator()->Buffer((void*)Y->template mutable_data<float>());
-  id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void*)filter.template data<uint16_t>());
-  const float* biasData = bias.template data<float>();
-
-  metal_convolution(
-      inputDataBuffer,
-      X.dim32(1),
-      X.dim32(3),
-      X.dim32(2),
-      stride_h_,
-      stride_w_,
-      pad_t_,
-      pad_l_,
-      pad_b_,
-      pad_r_,
-      weightBuffer,
-      filter.dim32(0),
-      filter.dim32(1),
-      filter.dim32(3),
-      filter.dim32(2),
-      outputDataBuffer,
-      Y->dim32(1),
-      Y->dim32(3),
-      Y->dim32(2),
-      biasData,
-      bias.dim32(0),
-      true);
-  return true;
-}
-
-REGISTER_CPU_OPERATOR_WITH_ENGINE(MetalConvTranspose, METAL, MetalConvTransposeMTLBufferOp);
-OPERATOR_SCHEMA(MetalConvTranspose).NumInputs(3).NumOutputs(1);
-} // namespace caffe2
-
-#endif // CAFFE2_MOBILE
--- a/caffe2/contrib/ios/metal/MetalImageFilter.h
+++ b/caffe2/contrib/ios/metal/MetalImageFilter.h
@ -1,39 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#pragma once
-
-#import "MetalContext.h"
-#import "FBMetalConstantValues.h"
-
-#import <Foundation/Foundation.h>
-#import <Metal/Metal.h>
-
-@protocol MTLBuffer, MTLComputeCommandEncoder, MTLComputePipelineState;
-
-@interface MetalImageFilter : NSObject
-
-@property (nonatomic, strong) MetalContext*               context;
-@property (nonatomic, strong) id<MTLComputePipelineState> pipeline;
-
-@property (atomic, strong) MTLTextureDescriptor* outputTextureDescriptor;
-
- (instancetype)initWithFunctionName:(NSString*)functionName
-                         libraryName:(NSString*)libraryName
-                       librarySource:(NSString*)librarySource
-                             context:(MetalContext*)context
-                      constantValues:(FBMetalConstantValues*)constantValues;
-
- (NSString*)replaceConstantValues:(FBMetalConstantValues *)constantValues
-                     librarySource:(NSString*)librarySource;
-
- (void)configureArgumentTableWithCommandEncoder:(id<MTLComputeCommandEncoder>)commandEncoder
-                              weightBufferOffset:(NSInteger)weightBufferOffset
-                              outputBufferOffset:(NSInteger)outputBufferOffset;
-
- (void)applyFilter:(void (^)(NSError*))completionHandler;
-
- (void) applyFilter:(void(^)(NSError*))completionHandler
-  weightBufferOffset:(NSInteger)weightBufferOffset
-  outputBufferOffset:(NSInteger)outputBufferOffset;
-
-@end
--- a/caffe2/contrib/ios/metal/MetalImageFilter.mm
+++ b/caffe2/contrib/ios/metal/MetalImageFilter.mm
@ -1,171 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#import "MetalImageFilter.h"
-#import "MetalContext.h"
-
-#import <TargetConditionals.h>
-#import <Metal/Metal.h>
-
-#include "caffe2/core/logging.h"
-
-@interface MetalImageFilter ()
-@property (nonatomic, strong) id<MTLFunction> kernelFunction;
-@end
-
-@implementation MetalImageFilter {
-  NSString* _functionName;
-}
-static constexpr size_t kThreadGroupSize_x = 4;
-static constexpr size_t kThreadGroupSize_y = 8;
-static constexpr bool   kEnableGPUProfiling     = false; // GPU profiling is expensive, have it off by default
-
-#import "MetalShaderUtilities.h"
-
-@synthesize outputTextureDescriptor = _outputTextureDescriptor;
-
-static NSMutableDictionary<NSString*, id<MTLLibrary>>* functionLibraryCache = NULL;
-
- (id<MTLFunction>)newFunctionWithName:(NSString*)functionName
-                           libraryName:(NSString*)libraryName
-                         librarySource:(NSString*)librarySource
-                               context:(MetalContext*)context
-                        constantValues:(FBMetalConstantValues*)constantValues {
-  NSString* library_version =
-      constantValues ? [libraryName stringByAppendingString:[[NSString alloc]
-                                                             initWithUTF8String:constantValues->to_string().c_str()]]
-                     : libraryName;
-
-  if (functionLibraryCache == NULL) {
-    functionLibraryCache = [NSMutableDictionary<NSString*, id<MTLLibrary>> dictionary];
-  }
-
-  id<MTLLibrary> library = functionLibraryCache[library_version];
-
-  if (library == nil) {
-    librarySource = [self replaceConstantValues:constantValues librarySource:librarySource];
-
-    MTLCompileOptions* options = [MTLCompileOptions alloc];
-    options.fastMathEnabled    = TRUE;
-    options.languageVersion    = MTLLanguageVersion1_0;
-
-    NSError* error = NULL;
-    library        = [context.device newLibraryWithSource:librarySource options:options error:&error];
-
-    if (error != nil) {
-      NSString* description = [[NSString alloc] init];
-      LOG(ERROR) << "Problems with library for " << [library_version UTF8String] << " : "
-      << [[error localizedDescription] UTF8String];
-    }
-
-    functionLibraryCache[library_version] = library;
-  }
-
-  return library != nil ? [library newFunctionWithName:functionName] : nil;
-}
-
- (NSString*)replaceConstantValues:(FBMetalConstantValues*)constantValues librarySource:(NSString*)librarySource {
-  return librarySource;
-}
-
- (instancetype)initWithFunctionName:(NSString*)functionName
-                         libraryName:(NSString*)libraryName
-                       librarySource:(NSString*)librarySource
-                             context:(MetalContext*)context
-                      constantValues:(FBMetalConstantValues*)constantValues {
-  if ((self = [super init])) {
-    NSError* error = nil;
-
-    _context = context;
-    _kernelFunction =
-        [self newFunctionWithName:functionName libraryName:libraryName librarySource:librarySource context:context constantValues:constantValues];
-    _functionName = functionName;
-    _pipeline     = [context.device newComputePipelineStateWithFunction:_kernelFunction error:&error];
-
-    if (!_pipeline) {
-      LOG(ERROR) << "Error occurred when building compute pipeline for function " << [functionName UTF8String];
-      return nil;
-    }
-  }
-
-  return self;
-}
-
- (void)configureArgumentTableWithCommandEncoder:(id<MTLComputeCommandEncoder>)commandEncoder
-                              weightBufferOffset:(NSInteger)weightBufferOffset
-                              outputBufferOffset:(NSInteger)outputBufferOffset {
-}
-
- (bool)checkExecution:(id<MTLCommandBuffer>)commandBuffer {
-  NSError* error = [commandBuffer error];
-
-  if (error != nil) {
-    LOG(ERROR) << "Problems with " << [self->_functionName UTF8String] << " : "
-    << [[error localizedDescription] UTF8String];
-    return false;
-  }
-
-  return true;
-}
-
- (MTLSize)threadsPerThreadgroup {
-  return MTLSizeMake(kThreadGroupSize_x, kThreadGroupSize_y, 1);
-}
-
- (MTLSize)threadgroupsPerGrid {
-  MTLSize threadsPerThreadgroup = [self threadsPerThreadgroup];
-
-  return MTLSizeMake(
-                     (self.outputTextureDescriptor.width + threadsPerThreadgroup.width - 1) /
-                     threadsPerThreadgroup.width,
-                     (self.outputTextureDescriptor.height + threadsPerThreadgroup.height - 1) /
-                     threadsPerThreadgroup.height,
-                     1);
-}
-
- (void)applyFilter:(void (^)(NSError*))completionHandler {
-  [self applyFilter:completionHandler weightBufferOffset:0 outputBufferOffset:0];
-}
-
- (void)applyFilter:(void (^)(NSError*))completionHandler
- weightBufferOffset:(NSInteger)weightBufferOffset
- outputBufferOffset:(NSInteger)outputBufferOffset {
-  MTLTextureDescriptor* textureDescriptor = self.outputTextureDescriptor;
-
-  if (kEnableGPUProfiling) {
-    [self.context.commandQueue insertDebugCaptureBoundary];
-  }
-
-  id<MTLCommandBuffer> commandBuffer = [self.context.commandQueue commandBuffer];
-
-  /*
-   * It is not obvious which grid strategy is best, maximizing the number of threadgroups
-   * seems to give better results, but more investigation is needed
-   */
-
-  id<MTLComputeCommandEncoder> commandEncoder = [commandBuffer computeCommandEncoder];
-  [commandEncoder setComputePipelineState:self.pipeline];
-
-  [self configureArgumentTableWithCommandEncoder:commandEncoder
-                              weightBufferOffset:weightBufferOffset
-                              outputBufferOffset:outputBufferOffset];
-
-  [commandEncoder dispatchThreadgroups:[self threadgroupsPerGrid] threadsPerThreadgroup:[self threadsPerThreadgroup]];
-
-  [commandEncoder endEncoding];
-
-  [commandBuffer addCompletedHandler:^(id<MTLCommandBuffer> commandBuffer) {
-    if (completionHandler != NULL) {
-      completionHandler([commandBuffer error]);
-    } else {
-      [self checkExecution:commandBuffer];
-    }
-  }];
-
-  [commandBuffer commit];
-
-  if (kEnableGPUProfiling) {
-    [self.context.commandQueue insertDebugCaptureBoundary];
-  }
-}
-
-@end
--- a/caffe2/contrib/ios/metal/MetalInstanceNormOp.mm
+++ b/caffe2/contrib/ios/metal/MetalInstanceNormOp.mm
@ -1,134 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#include "caffe2/core/context.h"
-#include "caffe2/core/logging.h"
-#include "caffe2/core/operator.h"
-#import "MetalCaffeContext.h"
-#import "metal_instance_norm.h"
-
-#ifndef CAFFE2_MOBILE
-#error "Caffe2 mobile state not defined"
-#endif
-
-#if CAFFE2_MOBILE
-
-namespace caffe2 {
-
-class MetalInstanceNormOp final : public Operator<MetalCaffeContext> {
- public:
-  MetalInstanceNormOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<MetalCaffeContext>(operator_def, ws),
-        epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5)),
-        order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
-    CAFFE_ENFORCE(epsilon_ >= 0, "Must pass a nonnegative epsilon.");
-    OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "Metal only supports NCHW order.");
-  }
-
-  bool RunOnDevice() override {
-    return RunOnDeviceWithOrderNCHW();
-  }
-
-  bool RunOnDeviceWithOrderNCHW();
-
- protected:
-  float        epsilon_;
-  StorageOrder order_;
-  INPUT_TAGS(INPUT, SCALE, BIAS);
-  OUTPUT_TAGS(OUTPUT);
-};
-
-bool MetalInstanceNormOp::RunOnDeviceWithOrderNCHW() {
-  const auto& X     = Input(INPUT);
-  const auto& scale = Input(SCALE);
-  const auto& bias  = Input(BIAS);
-  auto*       Y     = Output(OUTPUT);
-
-  const int C = X.dim32(1);
-  const int H = X.dim32(2);
-  const int W = X.dim32(3);
-  Y->ResizeLike(X);
-
-  const auto* Xdata = X.template data<float>();
-  const auto* Sdata = scale.template data<uint16_t>();
-  const auto* Bdata = bias.template data<uint16_t>();
-  auto*       Ydata = Y->template mutable_data<float>();
-
-  id<MTLBuffer> inputDataBuffer  = GetMetalAllocator()->Buffer((void*)Xdata);
-  id<MTLBuffer> scaleDataBuffer  = GetMetalAllocator()->Buffer((void*)Sdata);
-  id<MTLBuffer> biasDataBuffer   = GetMetalAllocator()->Buffer((void*)Bdata);
-  id<MTLBuffer> outputDataBuffer = GetMetalAllocator()->Buffer((void*)Ydata);
-
-  metal_instance_norm(inputDataBuffer, C, H, W, scaleDataBuffer, biasDataBuffer, outputDataBuffer, nil, 0, epsilon_);
-
-  return true;
-}
-
-REGISTER_CPU_OPERATOR_WITH_ENGINE(MetalInstanceNorm, METAL, MetalInstanceNormOp);
-OPERATOR_SCHEMA(MetalInstanceNorm).NumInputs(3, 4).NumOutputs(1, 3).AllowInplace({{0, 0}});
-
-class MetalInstanceNormPReluOp final : public Operator<MetalCaffeContext> {
- public:
-  MetalInstanceNormPReluOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<MetalCaffeContext>(operator_def, ws),
-        epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5)),
-        order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
-    CAFFE_ENFORCE(epsilon_ >= 0, "Must pass a nonnegative epsilon.");
-    OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "Metal only supports NCHW order.");
-  }
-
-  bool RunOnDevice() override {
-    return RunOnDeviceWithOrderNCHW();
-  }
-
-  bool RunOnDeviceWithOrderNCHW();
-
- protected:
-  float        epsilon_;
-  StorageOrder order_;
-  INPUT_TAGS(INPUT, SCALE, BIAS, PRELU);
-  OUTPUT_TAGS(OUTPUT);
-};
-
-bool MetalInstanceNormPReluOp::RunOnDeviceWithOrderNCHW() {
-  const auto& X     = Input(INPUT);
-  const auto& scale = Input(SCALE);
-  const auto& bias  = Input(BIAS);
-  const auto& prelu = Input(PRELU);
-  auto*       Y     = Output(OUTPUT);
-
-  const int C = X.dim32(1);
-  const int H = X.dim32(2);
-  const int W = X.dim32(3);
-  Y->ResizeLike(X);
-
-  const auto* Xdata = X.template data<float>();
-  const auto* Sdata = scale.template data<uint16_t>();
-  const auto* Bdata = bias.template data<uint16_t>();
-  const auto* Pdata = prelu.template data<uint16_t>();
-  auto*       Ydata = Y->template mutable_data<float>();
-
-  id<MTLBuffer> inputDataBuffer  = GetMetalAllocator()->Buffer((void*)Xdata);
-  id<MTLBuffer> scaleDataBuffer  = GetMetalAllocator()->Buffer((void*)Sdata);
-  id<MTLBuffer> biasDataBuffer   = GetMetalAllocator()->Buffer((void*)Bdata);
-  id<MTLBuffer> preluDataBuffer  = GetMetalAllocator()->Buffer((void*)Pdata);
-  id<MTLBuffer> outputDataBuffer = GetMetalAllocator()->Buffer((void*)Ydata);
-
-  metal_instance_norm(
-      inputDataBuffer,
-      C,
-      H,
-      W,
-      scaleDataBuffer,
-      biasDataBuffer,
-      outputDataBuffer,
-      preluDataBuffer,
-      prelu.size(),
-      epsilon_);
-
-  return true;
-}
-
-REGISTER_CPU_OPERATOR_WITH_ENGINE(MetalInstanceNormPRelu, METAL, MetalInstanceNormPReluOp);
-OPERATOR_SCHEMA(MetalInstanceNormPRelu).NumInputs(3, 4).NumOutputs(1, 3).AllowInplace({{0, 0}});
-} // namespace caffe2
-#endif // CAFFE2_MOBILE
--- a/caffe2/contrib/ios/metal/MetalPReluOp.mm
+++ b/caffe2/contrib/ios/metal/MetalPReluOp.mm
@ -1,52 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#include "caffe2/core/operator.h"
-#import "MetalCaffeContext.h"
-#import "metal_prelu.h"
-
-#ifndef CAFFE2_MOBILE
-#error "Caffe2 mobile state not defined"
-#endif
-
-#if CAFFE2_MOBILE
-
-namespace caffe2 {
-
-class MetalPReluOp final : public Operator<MetalCaffeContext> {
- public:
-  MetalPReluOp(const OperatorDef& operator_def, Workspace* ws)
-      : Operator<MetalCaffeContext>(operator_def, ws),
-        order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
-    OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "Metal only supports NCHW order.");
-  }
-
-  bool RunOnDevice() override;
-
- protected:
-  StorageOrder order_;
-};
-
-bool MetalPReluOp::RunOnDevice() {
-  const auto& X = Input(0);
-  const auto& W = Input(1);
-  auto*       Y = Output(0);
-
-  Y->ResizeLike(X);
-
-  const auto* Xdata = X.template data<float>();
-  const auto* Wdata = W.template data<uint16_t>();
-  auto*       Ydata = Y->template mutable_data<float>();
-
-  id<MTLBuffer> inputDataBuffer  = GetMetalAllocator()->Buffer((void*)Xdata);
-  id<MTLBuffer> weightBuffer     = GetMetalAllocator()->Buffer((void*)Wdata);
-  id<MTLBuffer> outputDataBuffer = GetMetalAllocator()->Buffer((void*)Ydata);
-
-  metal_prelu(inputDataBuffer, X.dim32(1), X.dim32(3), X.dim32(2), weightBuffer, W.size(), outputDataBuffer);
-
-  return true;
-}
-
-REGISTER_CPU_OPERATOR_WITH_ENGINE(MetalPRelu, METAL, MetalPReluOp);
-OPERATOR_SCHEMA(MetalPRelu).NumInputs(2).NumOutputs(1).AllowInplace({{0, 0}}).IdenticalTypeAndShape();
-} // namespace caffe2
-#endif // CAFFE2_MOBILE
--- a/caffe2/contrib/ios/metal/MetalShaderUtilities.h
+++ b/caffe2/contrib/ios/metal/MetalShaderUtilities.h
@ -1,11 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#pragma once
-
-#import <string>
-#import <sstream>
-#import <functional>
-
-std::string replace_first(std::string input, std::function<void(std::stringstream &fs)> fmt, int index);
-
-#define REPLACE_CONSTANT(src, val, idx) { src = replace_first(src, [&](std::stringstream &fmt) { fmt << val; }, idx); }
--- a/caffe2/contrib/ios/metal/MetalShaderUtilities.mm
+++ b/caffe2/contrib/ios/metal/MetalShaderUtilities.mm
@ -1,20 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#import "MetalShaderUtilities.h"
-
-std::string replace_first(std::string input, std::function<void(std::stringstream &fs)> fmt, int index) {
-  std::stringstream value_format_stream;
-  value_format_stream << "= ";
-  fmt(value_format_stream);
-
-  std::stringstream token_format_stream;
-  token_format_stream << "[[ function_constant(" << index << ") ]]";
-
-  size_t position = input.find(token_format_stream.str());
-  if (position != std::string::npos) {
-    size_t length = token_format_stream.str().length();
-    return input.replace(position, length, value_format_stream.str());
-  } else {
-    return input;
-  }
-}
--- a/caffe2/contrib/ios/metal/arm_neon_support.h
+++ b/caffe2/contrib/ios/metal/arm_neon_support.h
@ -1,11 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#pragma once
-
-#ifdef __ARM_NEON__
-#import <arm_neon.h>
-#else
-typedef unsigned short uint16_t;
-typedef uint16_t       float16_t;
-typedef float          float32_t;
-#endif
--- a/caffe2/contrib/ios/metal/data_conversion.h
+++ b/caffe2/contrib/ios/metal/data_conversion.h
@ -1,67 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#pragma once
-
-#import <stdlib.h>
-#import <functional>
-
-#import "arm_neon_support.h"
-#import <dispatch/dispatch.h>
-
-template <typename T1, typename T2, void process_data(T1*, const T2*, size_t), int P>
-void parallelize(T1* dst, const T2* src, size_t items, dispatch_queue_t queue) {
-  process_data(dst, src, items);
-  dispatch_apply(P, queue, ^(size_t it) {
-    const size_t chunk_size = it == P - 1 ? items - (P - 1) * (items / P) : items / P;
-    const size_t offset     = it * chunk_size;
-
-    process_data(dst + offset, src + offset, chunk_size);
-  });
-}
-
-template <typename filter_type>
-filter_type* reformatKernelImage(
-    const float*                        input_data,
-    int                                 kernels,
-    int                                 input_kernels,
-    int                                 kernel_offset,
-    int                                 kernel_stride,
-    int                                 channels,
-    int                                 width,
-    int                                 height,
-    bool                                transposed,
-    std::function<filter_type*(size_t)> allocator);
-
-template <typename filter_type>
-bool reformatKernelImage(
-    const float* input_data,
-    filter_type* output_data,
-    int          kernels,
-    int          input_kernels,
-    int          kernel_offset,
-    int          kernel_stride,
-    int          channels,
-    int          width,
-    int          height,
-    bool         transposed);
-
-template <typename filter_type>
-bool reformatKernelImage(
-    const float*                     input_data,
-    int                              kernels,
-    int                              kernel_channels,
-    int                              kernel_width,
-    int                              kernel_height,
-    bool                             transposed,
-    std::function<filter_type*(int)> allocator);
-
-template <typename out_buffer_type>
-out_buffer_type* reformatInputImage(
-    const float*                            data,
-    int                                     channels,
-    int                                     width,
-    int                                     height,
-    std::function<out_buffer_type*(size_t)> allocator);
-
-template <typename T1, typename T2>
-void memcpycvt(T1* dst, const T2* src, size_t n);
--- a/caffe2/contrib/ios/metal/data_conversion.mm
+++ b/caffe2/contrib/ios/metal/data_conversion.mm
@ -1,265 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#import "data_conversion.h"
-#import "metal_convolution.h"
-
-#import <stdio.h>
-#import <string.h>
-
-void memcvt_F32_F16(float32_t* dst, const float16_t* src, size_t n) {
-  int i = 0;
-#if defined(__ARM_NEON__)
-  for (; i < 4 * (n / 4); i += 4) {
-    *((float32x4_t*)&dst[i]) = vcvt_f32_f16(*((float16x4_t*)&src[i]));
-  }
-#endif
-  for (; i < n; i++) {
-    dst[i] = src[i];
-  }
-}
-
-void memcvt_F16_F32(float16_t* dst, const float32_t* src, size_t n) {
-  int i = 0;
-#if defined(__ARM_NEON__)
-  for (; i < 4 * (n / 4); i += 4) {
-    *((float16x4_t*)&dst[i]) = vcvt_f16_f32(*((float32x4_t*)&src[i]));
-  }
-#endif
-  for (; i < n; i++) {
-    dst[i] = src[i];
-  }
-}
-
-template <typename T1, typename T2>
-void memcpycvt(T1* dst, const T2* src, size_t n);
-
-template <>
-void memcpycvt(float* dst, const float* src, size_t n) {
-  memcpy(dst, src, n * sizeof(float));
-}
-
-template <>
-void memcpycvt(float16_t* dst, const float* src, size_t n) {
-  parallelize<float16_t, float, memcvt_F16_F32, 2>(
-      dst, src, n, dispatch_get_global_queue(QOS_CLASS_USER_INTERACTIVE, 0));
-}
-
-template <>
-void memcpycvt(float* dst, const float16_t* src, size_t n) {
-  parallelize<float, float16_t, memcvt_F32_F16, 2>(
-      dst, src, n, dispatch_get_global_queue(QOS_CLASS_USER_INTERACTIVE, 0));
-}
-
-template <typename filter_type>
-bool reformatKernelImage(
-    const float* input_data,
-    filter_type* output_data,
-    int          kernels,
-    int          input_kernels,
-    int          kernel_offset,
-    int          kernel_stride,
-    int          channels,
-    int          width,
-    int          height,
-    bool         transposed) {
-  const int aligned_kernel_stride = kernel_stride <= 2 ? kernel_stride : 4 * ((kernel_stride + 3) / 4);
-
-  if (output_data) {
-    filter_type* buffer = output_data;
-
-    for (int ks = 0; ks < kernels / kernel_stride; ks++) {
-      for (int kb = 0; kb < kernel_stride; kb++) {
-        int k = kernel_offset + kernel_stride * ks + kb;
-
-        for (int c = 0; c < channels; c++) {
-          for (int y = 0; y < height; y++) {
-            for (int x = 0; x < width; x++) {
-              buffer[aligned_kernel_stride * (width * height * (ks * channels + c) + y * width + x) + kb] =
-                  transposed
-                      ? input_data
-                            [(c * input_kernels + k) * width * height + (height - 1 - y) * height + (width - 1 - x)]
-                      : input_data[(k * channels + c) * width * height + y * width + x];
-            }
-          }
-        }
-      }
-    }
-    return true;
-  }
-  return false;
-}
-
-template bool reformatKernelImage<float16_t>(
-    const float* input_data,
-    float16_t*   output_data,
-    int          kernels,
-    int          input_kernels,
-    int          kernel_offset,
-    int          kernel_stride,
-    int          channels,
-    int          width,
-    int          height,
-    bool         transposed);
-
-template bool reformatKernelImage<float>(
-    const float* input_data,
-    float*       output_data,
-    int          kernels,
-    int          input_kernels,
-    int          kernel_offset,
-    int          kernel_stride,
-    int          channels,
-    int          width,
-    int          height,
-    bool         transposed);
-
-template <typename filter_type>
-filter_type* reformatKernelImage(
-    const float*                        input_data,
-    int                                 kernels,
-    int                                 input_kernels,
-    int                                 kernel_offset,
-    int                                 kernel_stride,
-    int                                 channels,
-    int                                 width,
-    int                                 height,
-    bool                                transposed,
-    std::function<filter_type*(size_t)> allocator) {
-  const int aligned_kernel_stride = kernel_stride <= 2 ? kernel_stride : 4 * ((kernel_stride + 3) / 4);
-
-  const int buffer_size = aligned_kernel_stride * (kernels / kernel_stride) * channels * width * height;
-
-  filter_type* output_data = allocator(sizeof(filter_type) * buffer_size);
-
-  reformatKernelImage(
-      input_data,
-      output_data,
-      kernels,
-      input_kernels,
-      kernel_offset,
-      kernel_stride,
-      channels,
-      width,
-      height,
-      transposed);
-
-  return output_data;
-}
-template float16_t* reformatKernelImage<float16_t>(
-    const float*                      input_data,
-    int                               kernels,
-    int                               input_kernels,
-    int                               kernel_offset,
-    int                               kernel_stride,
-    int                               channels,
-    int                               width,
-    int                               height,
-    bool                              transposed,
-    std::function<float16_t*(size_t)> allocator);
-
-template float* reformatKernelImage<float>(
-    const float*                  input_data,
-    int                           kernels,
-    int                           input_kernels,
-    int                           kernel_offset,
-    int                           kernel_stride,
-    int                           channels,
-    int                           width,
-    int                           height,
-    bool                          transposed,
-    std::function<float*(size_t)> allocator);
-
-template <typename filter_type>
-bool reformatKernelImage(
-    const float*                     input_data,
-    int                              kernels,
-    int                              kernel_channels,
-    int                              kernel_width,
-    int                              kernel_height,
-    bool                             transposed,
-    std::function<filter_type*(int)> allocator) {
-  int kernels_per_convolution = kernels;
-
-  if (!calculate_kernels_per_convolution(kernels_per_convolution)) {
-    return false;
-  }
-
-  const int convolutions = kernels / kernels_per_convolution;
-
-  const int kernel_stride = kernels_per_convolution;
-
-  const int aligned_kernel_stride = kernel_stride <= 2 ? kernel_stride : 4 * ((kernel_stride + 3) / 4);
-
-  const int chunk_size = aligned_kernel_stride * (kernels_per_convolution / kernel_stride) * kernel_channels *
-                         kernel_width * kernel_height;
-
-  // This will allocate more memory than needed, but I couldn't find a better solution
-  filter_type* output_data = allocator((aligned_kernel_stride + kernel_stride - 1) / kernel_stride);
-
-  for (int c = 0; c < convolutions; c++) {
-    if (!reformatKernelImage(
-            input_data,
-            output_data + c * chunk_size,
-            kernels_per_convolution,
-            kernels,
-            c * kernels_per_convolution,
-            kernel_stride,
-            kernel_channels,
-            kernel_width,
-            kernel_height,
-            transposed))
-      return false;
-  }
-
-  return true;
-}
-
-template bool reformatKernelImage<float16_t>(
-    const float*                   input_data,
-    int                            kernels,
-    int                            kernel_channels,
-    int                            kernel_width,
-    int                            kernel_height,
-    bool                           transposed,
-    std::function<float16_t*(int)> allocator);
-
-template bool reformatKernelImage<float>(
-    const float*               input_data,
-    int                        kernels,
-    int                        kernel_channels,
-    int                        kernel_width,
-    int                        kernel_height,
-    bool                       transposed,
-    std::function<float*(int)> allocator);
-
-template <typename out_buffer_type>
-out_buffer_type* reformatInputImage(
-    const float*                            data,
-    int                                     channels,
-    int                                     width,
-    int                                     height,
-    std::function<out_buffer_type*(size_t)> allocator) {
-  const int buffer_size = channels * width * height;
-
-  out_buffer_type* output_data = allocator(sizeof(out_buffer_type) * buffer_size);
-
-  if (output_data) {
-    memcpycvt(output_data, data, buffer_size);
-  }
-
-  return output_data;
-}
-
-template float16_t* reformatInputImage<float16_t>(
-    const float*                      data,
-    int                               channels,
-    int                               width,
-    int                               height,
-    std::function<float16_t*(size_t)> allocator);
-
-template float* reformatInputImage<float>(
-    const float*                  data,
-    int                           channels,
-    int                           width,
-    int                           height,
-    std::function<float*(size_t)> allocator);
--- a/caffe2/contrib/ios/metal/metal_convolution.h
+++ b/caffe2/contrib/ios/metal/metal_convolution.h
@ -1,31 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#pragma once
-
-#import <Metal/Metal.h>
-
-bool calculate_kernels_per_convolution(int& kernels_per_convolution);
-
-bool metal_convolution(
-    id<MTLBuffer> inputBuffer,
-    int           input_channels,
-    int           input_width,
-    int           input_height,
-    int           input_stride_x,
-    int           input_stride_y,
-    int           input_pad_t,
-    int           input_pad_l,
-    int           input_pad_b,
-    int           input_pad_r,
-    id<MTLBuffer> weightBuffer,
-    int           output_channels,
-    int           kernel_channels,
-    int           kernel_width,
-    int           kernel_height,
-    id<MTLBuffer> outputBuffer,
-    int           output_number,
-    int           output_width,
-    int           output_height,
-    const float*  bias,
-    int           bias_length,
-    bool          transposed);
--- a/caffe2/contrib/ios/metal/metal_convolution.mm
+++ b/caffe2/contrib/ios/metal/metal_convolution.mm
@ -1,140 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#import "arm_neon_support.h"
-#import "FBMetalCNNConvolution.h"
-#import "metal_convolution.h"
-
-#include "caffe2/core/logging.h"
-
-@interface MetalConvolutionCacheEntry : NSObject
-@property (nonatomic, strong) NSMutableArray<FBMetalCNNConvolution*>* convolutions;
-@end
-@implementation MetalConvolutionCacheEntry
-@end
-
-static MetalContext* metalContext = NULL;
-static NSMutableDictionary<NSString*, MetalConvolutionCacheEntry*>* convolutionCache = NULL;
-
-static void init_metal_pipeline() {
-  if (metalContext == NULL) {
-    metalContext     = [MetalContext getContext];
-    convolutionCache = [NSMutableDictionary<NSString*, MetalConvolutionCacheEntry*> dictionary];
-  }
-}
-
-bool calculate_kernels_per_convolution(int& kernels_per_convolution) {
-  while (kernels_per_convolution > MAX_KERNELS_PER_CONVOLUTION) {
-    if (kernels_per_convolution % 3 == 0)
-    kernels_per_convolution /= 3;
-    else if (kernels_per_convolution % 2 == 0)
-    kernels_per_convolution /= 2;
-    else {
-      LOG(ERROR) << "The number of output channels must be a multiple of 2 or 3\n";
-      return false;
-    }
-  }
-  return true;
-}
-
-bool metal_convolution(
-    id<MTLBuffer> inputBuffer,
-    int           input_channels,
-    int           input_width,
-    int           input_height,
-    int           input_stride_x,
-    int           input_stride_y,
-    int           input_pad_t,
-    int           input_pad_l,
-    int           input_pad_b,
-    int           input_pad_r,
-    id<MTLBuffer> weightBuffer,
-    int           output_channels,
-    int           kernel_channels,
-    int           kernel_width,
-    int           kernel_height,
-    id<MTLBuffer> outputBuffer,
-    int           output_number,
-    int           output_width,
-    int           output_height,
-    const float*  bias,
-    int           bias_length,
-    bool          transposed) {
-  init_metal_pipeline();
-
-  if (transposed) {
-    int t           = output_channels;
-    output_channels         = kernel_channels;
-    kernel_channels = t;
-  }
-
-  int output_batch_size = output_channels;
-
-  if (!calculate_kernels_per_convolution(output_batch_size)) {
-    return false;
-  }
-
-  FBMetalCNNConstantValues constantValues = FBMetalCNNConstantValues(
-                                                                     input_width,
-                                                                     input_height,
-                                                                     input_channels,
-                                                                     input_stride_x,
-                                                                     input_stride_y,
-                                                                     input_pad_t,
-                                                                     input_pad_l,
-                                                                     input_pad_b,
-                                                                     input_pad_r,
-                                                                     kernel_width,
-                                                                     kernel_height,
-                                                                     output_width,
-                                                                     output_height,
-                                                                     output_batch_size,
-                                                                     transposed);
-
-  NSString* key = [NSString stringWithUTF8String:constantValues.to_string().c_str()];
-
-  const int batches = output_channels / output_batch_size;
-
-  MetalConvolutionCacheEntry* cc = convolutionCache[key];
-
-  if (cc == NULL) {
-//    printf("metal_convolution_mtlbuffer: %s\n", [key UTF8String]);
-    convolutionCache[key] = cc = [[MetalConvolutionCacheEntry alloc] init];
-    cc.convolutions            = [NSMutableArray<FBMetalCNNConvolution*> arrayWithCapacity:batches];
-
-
-    for (int c = 0; c < batches; c++) {
-      FBMetalCNNConvolution* convolution = [FBMetalCNNConvolution filterWithContext:metalContext
-                                                                           channels:input_channels
-                                                                        kernel_size:kernel_height
-                                                                     constantValues:&constantValues
-                                                                              width:output_width
-                                                                             height:output_height
-                                                                           stride_x:input_stride_x
-                                                                           stride_y:input_stride_y];
-      cc.convolutions[c] = convolution;
-    }
-  }
-
-  for (int c = 0; c < batches; c++) {
-    FBMetalCNNConvolution* convolution = cc.convolutions[c];
-
-    [convolution loadBiasData:&bias[output_batch_size * c] length:(NSUInteger)output_batch_size];
-
-    convolution.dataBuffer   = inputBuffer;
-    convolution.outputBuffer = outputBuffer;
-    convolution.weightBuffer = weightBuffer;
-
-    const int aligned_kernel_stride = output_batch_size <= 2 ? output_batch_size : 4 * ((output_batch_size + 3) / 4);
-    const int buffer_size = aligned_kernel_stride * (output_batch_size / output_batch_size) * kernel_channels *
-                            kernel_width * kernel_height;
-
-    int weightBuffer_offset = c * sizeof(weight_buffer_type) * buffer_size;
-    int outputBuffer_offset = c * sizeof(data_buffer_type) * output_batch_size * output_width * output_height;
-
-    [convolution applyFilter:(void (^)(NSError* error)) nullptr
-          weightBufferOffset:weightBuffer_offset
-          outputBufferOffset:outputBuffer_offset];
-  }
-
-  return true;
-}
--- a/caffe2/contrib/ios/metal/metal_instance_norm.h
+++ b/caffe2/contrib/ios/metal/metal_instance_norm.h
@ -1,17 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#pragma once
-
-#import <Metal/Metal.h>
-
-bool metal_instance_norm(
-                         id<MTLBuffer> inputBuffer,
-                         int           input_channels,
-                         int           input_width,
-                         int           input_height,
-                         id<MTLBuffer> scaleDataBuffer,
-                         id<MTLBuffer> biasDataBuffer,
-                         id<MTLBuffer> outputBuffer,
-                         id<MTLBuffer> preluBuffer,
-                         int           prelu_length,
-                         float         epsilon_);
--- a/caffe2/contrib/ios/metal/metal_instance_norm.mm
+++ b/caffe2/contrib/ios/metal/metal_instance_norm.mm
@ -1,77 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#import "arm_neon_support.h"
-#import "MetalContext.h"
-
-#import "FBMetalCNNInstanceNorm.h"
-#import <Metal/Metal.h>
-
-@interface InstanceNormCacheEntry : NSObject
-@property (nonatomic, strong) FBMetalCNNInstanceNorm* instanceNorm;
-@end
-@implementation InstanceNormCacheEntry
-@end
-
-static MetalContext* metalContext = NULL;
-static NSMutableDictionary<NSString*, InstanceNormCacheEntry*>* instanceNormCache = NULL;
-
-static void init_metal_pipeline() {
-  if (metalContext == NULL) {
-    metalContext      = [MetalContext getContext];
-    instanceNormCache = [NSMutableDictionary<NSString*, InstanceNormCacheEntry*> dictionary];
-  }
-}
-
-bool metal_instance_norm(
-                         id<MTLBuffer> inputBuffer,
-                         int           input_channels,
-                         int           input_width,
-                         int           input_height,
-                         id<MTLBuffer> scaleDataBuffer,
-                         id<MTLBuffer> biasDataBuffer,
-                         id<MTLBuffer> outputBuffer,
-                         id<MTLBuffer> preluBuffer,
-                         int           prelu_size,
-                         float         epsilon_) {
-  init_metal_pipeline();
-
-  NSString* key = [NSString stringWithFormat:@"X:%d:%d:%d-P:%d", input_channels, input_width, input_height, prelu_size];
-
-  InstanceNormCacheEntry* cc = instanceNormCache[key];
-
-  if (cc == NULL) {
-    cc                     = [[InstanceNormCacheEntry alloc] init];
-    instanceNormCache[key] = cc;
-
-    FBMetalInstanceNormConstantValues constantValues =
-    FBMetalInstanceNormConstantValues(input_width, input_height, input_channels, prelu_size);
-
-    id<MTLBuffer> avgBuffer =
-    [metalContext.device newBufferWithLength:sizeof(float) * input_channels options:MTLStorageModeShared];
-    id<MTLBuffer> stdevBuffer =
-    [metalContext.device newBufferWithLength:sizeof(float) * input_channels options:MTLStorageModeShared];
-
-    FBMetalCNNInstanceNorm* instanceNorm = [FBMetalCNNInstanceNorm filterWithContext:metalContext
-                                                                      constantValues:&constantValues
-                                                                               width:input_width
-                                                                              height:input_height
-                                                                             channel:input_channels
-                                                                           withPRelu:preluBuffer != nil];
-
-    instanceNorm.avgBuffer   = avgBuffer;
-    instanceNorm.stdevBuffer = stdevBuffer;
-    [instanceNorm loadEpsilon:epsilon_];
-    cc.instanceNorm = instanceNorm;
-  }
-
-  FBMetalCNNInstanceNorm* instanceNorm = cc.instanceNorm;
-  instanceNorm.dataBuffer              = inputBuffer;
-  instanceNorm.outputBuffer            = outputBuffer;
-  instanceNorm.scaleBuffer             = scaleDataBuffer;
-  instanceNorm.biasBuffer              = biasDataBuffer;
-  instanceNorm.preluBuffer             = preluBuffer;
-
-  [instanceNorm applyFilter:(void (^)(NSError* error)) nullptr];
-
-  return true;
-}
--- a/caffe2/contrib/ios/metal/metal_prelu.h
+++ b/caffe2/contrib/ios/metal/metal_prelu.h
@ -1,12 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#pragma once
-
-bool metal_prelu(
-    id<MTLBuffer> inputBuffer,
-    int           input_channels,
-    int           input_width,
-    int           input_height,
-    id<MTLBuffer> weightBuffer,
-    int           weight_length,
-    id<MTLBuffer> outputBuffer);
--- a/caffe2/contrib/ios/metal/metal_prelu.mm
+++ b/caffe2/contrib/ios/metal/metal_prelu.mm
@ -1,63 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#import "arm_neon_support.h"
-#import "MetalContext.h"
-#import "FBMetalCNNPRelu.h"
-#import "metal_prelu.h"
-
-#include "caffe2/core/logging.h"
-
-@interface PReluCacheEntry : NSObject
-@property (nonatomic, strong) FBMetalCNNPRelu* prelu;
-@end
-@implementation PReluCacheEntry
-@end
-
-static MetalContext* metalContext = NULL;
-static NSMutableDictionary<NSString*, PReluCacheEntry*>* preluCache = NULL;
-
-static void init_metal_pipeline() {
-  if (metalContext == NULL) {
-    metalContext = [MetalContext getContext];
-    preluCache   = [NSMutableDictionary<NSString*, PReluCacheEntry*> dictionary];
-  }
-}
-
-bool metal_prelu(
-    id<MTLBuffer> inputBuffer,
-    int           input_channels,
-    int           input_width,
-    int           input_height,
-    id<MTLBuffer> weightBuffer,
-    int           weight_length,
-    id<MTLBuffer> outputBuffer) {
-  init_metal_pipeline();
-
-  NSString* key =
-      [NSString stringWithFormat:@"X:%d:%d:%d-F:%d", input_channels, input_width, input_height, weight_length];
-
-  PReluCacheEntry* cc = preluCache[key];
-
-  if (cc == NULL) {
-    preluCache[key] = cc = [[PReluCacheEntry alloc] init];
-
-    FBMetalPReluConstantValues constantValues =
-        FBMetalPReluConstantValues(input_width, input_height, input_channels, weight_length);
-
-    cc.prelu = [FBMetalCNNPRelu filterWithContext:metalContext
-                                   constantValues:&constantValues
-                                            width:input_width
-                                           height:input_height
-                                          channel:input_channels];
-  }
-
-  FBMetalCNNPRelu* prelu = cc.prelu;
-
-  prelu.dataBuffer   = inputBuffer;
-  prelu.outputBuffer = outputBuffer;
-  prelu.weightBuffer = weightBuffer;
-
-  [prelu applyFilter:(void (^)(NSError* error)) nullptr];
-
-  return true;
-}
--- a/caffe2/contrib/ios/metal/metal_sync_op.h
+++ b/caffe2/contrib/ios/metal/metal_sync_op.h
@ -1,5 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#pragma once
-
-bool metal_sync_op();
--- a/caffe2/contrib/ios/metal/metal_sync_op.mm
+++ b/caffe2/contrib/ios/metal/metal_sync_op.mm
@ -1,36 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#import "arm_neon_support.h"
-#import "MetalContext.h"
-#import "FBMetalCNNNoOp.h"
-#import "metal_sync_op.h"
-
-static MetalContext* metalContext = NULL;
-static FBMetalCNNNoOp* noOpCache = NULL;
-
-static void init_metal_pipeline() {
-  if (metalContext == NULL) {
-    metalContext = [MetalContext getContext];
-  }
-}
-
-bool metal_sync_op() {
-  init_metal_pipeline();
-
-  if (noOpCache == NULL) {
-    noOpCache = [FBMetalCNNNoOp filterWithContext:metalContext
-                                            width:1
-                                           height:1];
-  }
-
-  static dispatch_semaphore_t gpu_execution_done = NULL;
-  if (gpu_execution_done == NULL)
-    gpu_execution_done = dispatch_semaphore_create(0);
-
-  [noOpCache applyFilter:^(NSError* error) {
-    dispatch_semaphore_signal(gpu_execution_done);
-  }];
-  dispatch_semaphore_wait(gpu_execution_done, DISPATCH_TIME_FOREVER);
-
-  return true;
-}
--- a/caffe2/contrib/ios/metal/metal_test.cc
+++ b/caffe2/contrib/ios/metal/metal_test.cc
@ -1,883 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#include "caffe2/core/logging.h"
-#include "caffe2/core/workspace.h"
-#include "metal_test.h"
-#include "rewrite_net.h"
-
-#define DEBUGGING false
-
-namespace caffe2 {
-void testMetalCopyOps(int N, int C, int H, int W, float error) {
-  LOG(INFO) << "MetalCopyFrom/To Test";
-  Workspace ws;
-  {
-    auto *t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
-    t->Resize(N, C, H, W);
-    CPUContext ctx;
-    math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-  }
-
-  NetDef netdef;
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("CopyToMetalGPU");
-    op.add_input("X_cpu");
-    op.add_output("X_mtl");
-    op.set_engine("METAL");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("CopyFromMetalGPU");
-    op.add_input("X_mtl");
-    op.add_output("Y_cpu");
-  }
-
-  ws.RunNetOnce(netdef);
-  const auto &t1 = ws.GetBlob("X_cpu")->Get<TensorCPU>();
-  const auto &t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
-  CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
-
-//  for (auto i = 0; i < t1.size(); ++i) {
-//    LOG(INFO) << "i: " << i << ", CPU: " << t1.data<float>()[i] << ", MTL: " << t2.data<float>()[i];
-//  }
-
-  for (auto i = 0; i < t1.size(); ++i) {
-    CHECK_NEAR(t1.data<float>()[i], t2.data<float>()[i], error);
-  }
-}
-
-void testMetalInstanceNorm(int N, int C, int H, int W, float error) {
-  LOG(INFO) << "MetalInstanceNorm Test";
-  Workspace ws;
-  {
-    auto *t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
-    t->Resize(N, C, H, W);
-    CPUContext ctx;
-    // Too noisy.
-    math::RandGaussian<float, CPUContext>(t->size(), 0, 3, t->mutable_data<float>(), &ctx);
-  }
-
-  {
-    auto *t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
-    t->Resize(C);
-    CPUContext ctx;
-    for (auto i = 0; i < t->size(); ++i) {
-      t->mutable_data<float>()[i] = i;
-    }
-  }
-  {
-    auto *t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
-    t->Resize(C);
-    CPUContext ctx;
-    for (auto i = 0; i < t->size(); ++i) {
-      t->mutable_data<float>()[i] = 8 - 2 * i;
-    }
-  }
-
-  NetDef netdef;
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("CopyToMetalGPU");
-    op.add_input("X_cpu");
-    op.add_output("X_mtl");
-    op.set_engine("METAL");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("CopyToMetalGPUFloat16");
-    op.add_input("W");
-    op.add_output("W_mtl");
-    op.set_engine("METAL");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("CopyToMetalGPUFloat16");
-    op.add_input("b");
-    op.add_output("b_mtl");
-    op.set_engine("METAL");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("MetalInstanceNorm");
-    op.add_input("X_mtl");
-    op.add_input("W_mtl");
-    op.add_input("b_mtl");
-    op.add_output("Y_mtl");
-    op.set_engine("METAL");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("CopyFromMetalGPU");
-    op.add_input("Y_mtl");
-    op.add_output("Y_cpu");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("InstanceNorm");
-    op.add_input("X_cpu");
-    op.add_input("W");
-    op.add_input("b");
-    auto &arg = *(op.add_arg());
-    arg.set_name("order");
-    arg.set_s("NCHW");
-    op.add_output("Y_ref");
-  }
-
-  ws.RunNetOnce(netdef);
-  const auto &t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
-  const auto &t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
-
-  CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
-  for (auto i = 0; i < t1.size(); ++i) {
-    const float t1_i = t1.data<float>()[i];
-    const float t2_i = t2.data<float>()[i];
-    CHECK_NEAR(t1_i, t2_i, error);
-  }
-}
-
-void testMetalPRelu(int N, int C, int H, int W, int K, float error) {
-  LOG(INFO) << "MetalPRelu Test";
-  Workspace ws;
-  {
-    auto *t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
-    t->Resize(N, C, H, W);
-    CPUContext ctx;
-    math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-  }
-
-  {
-    auto *t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
-    t->Resize(K);
-    CPUContext ctx;
-    math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-  }
-
-  // The MetalPRelu is an in-place operator
-  NetDef netdef;
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("CopyToMetalGPU");
-    op.add_input("X_cpu");
-    op.add_output("X_mtl");
-    op.set_engine("METAL");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("CopyToMetalGPUFloat16");
-    op.add_input("b");
-    op.add_output("b_mtl");
-    op.set_engine("METAL");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("MetalPRelu");
-    op.add_input("X_mtl");
-    op.add_input("b_mtl");
-    op.add_output("X_mtl");
-    op.set_engine("METAL");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("CopyFromMetalGPU");
-    op.add_input("X_mtl");
-    op.add_output("Y_cpu");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("PRelu");
-    op.add_input("X_cpu");
-    op.add_input("b");
-    auto &arg = *(op.add_arg());
-    arg.set_name("order");
-    arg.set_s("NCHW");
-    op.add_output("Y_ref");
-  }
-
-  ws.RunNetOnce(netdef);
-  const auto &t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
-  const auto &t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
-
-  CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
-
-  for (auto i = 0; i < t1.size(); ++i) {
-    const float t1_i = t1.data<float>()[i];
-    const float t2_i = t2.data<float>()[i];
-    CHECK_NEAR(t1_i, t2_i, error);
-  }
-}
-
-void testMetalInstanceNormPRelu(int N, int C, int H, int W, float error) {
-  LOG(INFO) << "MetalInstanceNormPRelu Test";
-  Workspace ws;
-  {
-    auto *t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
-    t->Resize(N, C, H, W);
-    CPUContext ctx;
-    // Too noisy.
-    math::RandGaussian<float, CPUContext>(t->size(), 0, 3, t->mutable_data<float>(), &ctx);
-  }
-
-  {
-    auto *t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
-    t->Resize(C);
-    CPUContext ctx;
-    for (auto i = 0; i < t->size(); ++i) {
-      t->mutable_data<float>()[i] = i;
-    }
-  }
-  {
-    auto *t = ws.CreateBlob("b1")->GetMutable<TensorCPU>();
-    t->Resize(C);
-    CPUContext ctx;
-    for (auto i = 0; i < t->size(); ++i) {
-      t->mutable_data<float>()[i] = 8 - 2 * i;
-    }
-  }
-  // bias for PRelu
-  {
-    auto *t = ws.CreateBlob("b2")->GetMutable<TensorCPU>();
-    t->Resize(C);
-    CPUContext ctx;
-
-    math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-  }
-
-  NetDef netdef;
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("CopyToMetalGPU");
-    op.add_input("X_cpu");
-    op.add_output("X_mtl");
-    op.set_engine("METAL");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("CopyToMetalGPUFloat16");
-    op.add_input("W");
-    op.add_output("W_mtl");
-    op.set_engine("METAL");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("CopyToMetalGPUFloat16");
-    op.add_input("b1");
-    op.add_output("b1_mtl");
-    op.set_engine("METAL");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("CopyToMetalGPUFloat16");
-    op.add_input("b2");
-    op.add_output("b2_mtl");
-    op.set_engine("METAL");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("MetalInstanceNormPRelu");
-    op.add_input("X_mtl");
-    op.add_input("W_mtl");
-    op.add_input("b1_mtl");
-    op.add_input("b2_mtl");
-    op.add_output("Y_mtl");
-    op.set_engine("METAL");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("CopyFromMetalGPU");
-    op.add_input("Y_mtl");
-    op.add_output("Y_cpu");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("InstanceNorm");
-    op.add_input("X_cpu");
-    op.add_input("W");
-    op.add_input("b1");
-    auto &arg = *(op.add_arg());
-    arg.set_name("order");
-    arg.set_s("NCHW");
-    op.add_output("Y_mid");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("PRelu");
-    op.add_input("Y_mid");
-    op.add_input("b2");
-    auto &arg = *(op.add_arg());
-    arg.set_name("order");
-    arg.set_s("NCHW");
-    op.add_output("Y_ref");
-  }
-
-  ws.RunNetOnce(netdef);
-  const auto &t1 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
-  const auto &t2 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
-
-  CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
-
-  for (auto i = 0; i < t1.size(); ++i) {
-    const float t1_i = t1.data<float>()[i];
-    const float t2_i = t2.data<float>()[i];
-    CHECK_NEAR(t1_i, t2_i, error);
-  }
-}
-
-void testMetalConv(int N, int C, int H, int W, int K, int kernel_h, int kernel_w, int pad, float error) {
-  LOG(INFO) << "MetalConv Test";
-  Workspace ws;
-  {
-    auto *t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
-    t->Resize(N, C, H, W);
-    CPUContext ctx;
-    math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-  }
-
-  {
-    auto *t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
-    t->Resize(K, C, kernel_h, kernel_w);
-    CPUContext ctx;
-    math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-  }
-
-  {
-    auto *t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
-    t->Resize(K);
-    CPUContext ctx;
-    math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-  }
-
-  NetDef netdef;
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("CopyToMetalGPU");
-    op.add_input("X_cpu");
-    op.add_output("X_mtl");
-    op.set_engine("METAL");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("CopyWeightTensorToMetalGPU");
-    op.add_input("W");
-    op.add_output("W_mtl");
-    op.set_engine("METAL");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("MetalConv");
-    op.add_input("X_mtl");
-    op.add_input("W_mtl");
-    op.add_input("b");
-    op.set_engine("METAL");
-    {
-      auto &arg = *(op.add_arg());
-      arg.set_name("order");
-      arg.set_s("NCHW");
-    }
-    {
-      auto &arg = *(op.add_arg());
-      arg.set_name("kernel");
-      arg.set_i(kernel_h);
-    }
-    {
-      auto &arg = *(op.add_arg());
-      arg.set_name("pad");
-      arg.set_i(pad);
-    }
-    op.add_output("Y_mtl");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("CopyFromMetalGPU");
-    op.add_input("Y_mtl");
-    op.add_output("Y_cpu");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("Conv");
-    op.add_input("X_cpu");
-    op.add_input("W");
-    op.add_input("b");
-    {
-      auto &arg = *(op.add_arg());
-      arg.set_name("order");
-      arg.set_s("NCHW");
-    }
-    {
-      auto &arg = *(op.add_arg());
-      arg.set_name("kernel");
-      arg.set_i(kernel_h);
-    }
-    {
-      auto &arg = *(op.add_arg());
-      arg.set_name("pad");
-      arg.set_i(pad);
-    }
-    op.add_output("Y_ref");
-  }
-
-  ws.RunNetOnce(netdef);
-  const auto &t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
-  const auto &t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
-
-  CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
-
-//  for (auto i = 0; i < t1.size(); ++i) {
-//    const float t1_i = t1.data<float>()[i];
-//    const float t2_i = t2.data<float>()[i];
-//    if (std::abs(t1_i - t2_i) > 0.5) {
-//      LOG(INFO) << "i: " << i << ", CPU: " << t1_i << ", MTL: " << t2_i << ", error: " << std::abs(t1_i - t2_i) / t1_i;
-//    }
-//  }
-
-  for (auto i = 0; i < t1.size(); ++i) {
-    // FP16 <-> FP32 round trip, accumulation, etc.
-    const float t1_i = t1.data<float>()[i];
-    const float t2_i = t2.data<float>()[i];
-    CHECK_NEAR(t1_i, t2_i, error);
-  }
-}
-
-void testMetalConvTranspose(
-    int N, int C, int H, int W, int K, int kernel_h, int kernel_w, int pad, int stride, float error) {
-  LOG(INFO) << "MetalConvTranspose Test";
-  Workspace ws;
-  {
-    auto *t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
-    t->Resize(N, C, H, W);
-    CPUContext ctx;
-    math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-  }
-
-  {
-    auto *t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
-    t->Resize(K, C, kernel_h, kernel_w);
-    CPUContext ctx;
-    math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-  }
-
-  {
-    auto *t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
-    t->Resize(C);
-    CPUContext ctx;
-    math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
-  }
-
-  NetDef netdef;
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("CopyToMetalGPU");
-    op.add_input("X_cpu");
-    op.add_output("X_mtl");
-    op.set_engine("METAL");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("CopyTransposeWeightTensorToMetalGPU");
-    op.add_input("W");
-    op.add_output("W_mtl");
-    op.set_engine("METAL");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("MetalConvTranspose");
-    op.add_input("X_mtl");
-    op.add_input("W_mtl");
-    op.add_input("b");
-    op.set_engine("METAL");
-    {
-      auto &arg = *(op.add_arg());
-      arg.set_name("order");
-      arg.set_s("NCHW");
-    }
-    {
-      auto &arg = *(op.add_arg());
-      arg.set_name("kernel");
-      arg.set_i(kernel_h);
-    }
-    {
-      auto &arg = *(op.add_arg());
-      arg.set_name("pad");
-      arg.set_i(pad);
-    }
-    {
-      auto &arg = *(op.add_arg());
-      arg.set_name("stride");
-      arg.set_i(stride);
-    }
-    op.add_output("Y_mtl");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("CopyFromMetalGPU");
-    op.add_input("Y_mtl");
-    op.add_output("Y_cpu");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("ConvTranspose");
-    op.add_input("X_cpu");
-    op.add_input("W");
-    op.add_input("b");
-    {
-      auto &arg = *(op.add_arg());
-      arg.set_name("order");
-      arg.set_s("NCHW");
-    }
-    {
-      auto &arg = *(op.add_arg());
-      arg.set_name("kernel");
-      arg.set_i(kernel_h);
-    }
-    {
-      auto &arg = *(op.add_arg());
-      arg.set_name("pad");
-      arg.set_i(pad);
-    }
-    {
-      auto &arg = *(op.add_arg());
-      arg.set_name("stride");
-      arg.set_i(stride);
-    }
-    op.add_output("Y_ref");
-  }
-
-  ws.RunNetOnce(netdef);
-  const auto &t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
-  const auto &t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
-  CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
-  LOG(INFO) << "N: " << t1.dim(0) << " C: " << t1.dim(1) << " H: " << t1.dim(2) << " W: " << t1.dim(3);
-
-#if DEBUGGING
-  for (auto i = 0; i < t1.size(); ++i) {
-    const float t1_i = t1.data<float>()[i];
-    const float t2_i = t2.data<float>()[i];
-    if (std::abs(t1_i - t2_i) > error) {
-      LOG(INFO) << "i: " << i << ", CPU: " << t1_i << ", MTL: " << t2_i
-                << ", relative error: " << 100 * std::abs(t1_i - t2_i) / t1_i << "%";
-    }
-  }
-
-  printf("CPU:");
-  for (int i = 0; i < t1.size(); i++) {
-    const float t1_i = t1.data<float>()[i];
-    if (i % t1.dim(2) == 0)
-      printf("\n");
-    printf("%f\t", t1_i);
-  }
-
-  printf("\nMETAL:");
-  for (int i = 0; i < t1.size(); i++) {
-    const float t2_i = t2.data<float>()[i];
-    if (i % t2.dim(2) == 0)
-      printf("\n");
-    printf("%f\t", t2_i);
-  }
-  printf("\n");
-#endif
-
-  for (auto i = 0; i < t1.size(); ++i) {
-    // FP16 <-> FP32 round trip, accumulation, etc.
-    const float t1_i = t1.data<float>()[i];
-    const float t2_i = t2.data<float>()[i];
-    CHECK_NEAR(t1_i, t2_i, error);
-  }
-}
-void testMetalRewriteWithFusion() {
-  for (const auto &computeOp : std::vector<std::string>{"InstanceNorm"}) {
-    LOG(INFO) << "RewriteForMetal Fusion/Copy Test";
-    NetDef netdef;
-    netdef.add_external_input("X");
-    netdef.add_external_output("Y");
-    // These two ops can be fused.
-    {
-      auto &op = *(netdef.add_op());
-      op.set_type(computeOp);
-      op.add_input("X");
-      op.add_input("W");
-      op.add_input("b1");
-      op.add_output("Y");
-    }
-    {
-      auto &op = *(netdef.add_op());
-      op.set_type("PRelu");
-      op.add_input("Y");
-      op.add_input("b2");
-      op.add_output("Y");
-    }
-    // Can't fuse these as not in-place (can fix by using SSA).
-    {
-      auto &op = *(netdef.add_op());
-      op.set_type(computeOp);
-      op.add_input("X2");
-      op.add_input("W");
-      op.add_input("b1");
-      op.add_output("Y2");
-    }
-    {
-      auto &op = *(netdef.add_op());
-      op.set_type("PRelu");
-      op.add_input("Y2");
-      op.add_input("b2");
-      op.add_output("Y");
-    }
-
-    netdef = rewritePredictNetForMetal(netdef, "METAL");
-    // dumpDef(netdef);
-
-    auto ty = [&](size_t i) { return netdef.op(i).type(); };
-    auto i0 = [&](size_t i) { return netdef.op(i).input(0); };
-    auto o0 = [&](size_t i) { return netdef.op(i).output(0); };
-    CHECK_EQ(netdef.op_size(), 5);
-    CHECK_EQ(ty(0), "CopyToMetalGPU");
-    CHECK_EQ(ty(1), std::string("Metal") + computeOp + std::string("PRelu"));
-    CHECK_EQ(ty(2), std::string("Metal") + computeOp);
-    CHECK_EQ(ty(3), "MetalPRelu");
-    CHECK_EQ(ty(4), "CopyFromMetalGPU");
-    CHECK_EQ(i0(0), "X");
-    CHECK_EQ(i0(1), o0(0));
-    CHECK_EQ(o0(2), "Y2_M");
-    CHECK_EQ(i0(3), o0(2));
-    CHECK_EQ(i0(4), o0(3));
-    CHECK_NE(o0(4), i0(4));
-    CHECK_EQ(netdef.external_input(0), "X");
-    CHECK_EQ(netdef.external_output(0), "Y");
-  }
-}
-
-void testMetalRewriteWithMultiInputCPUOps() {
-
-  LOG(INFO) << "RewriteForMetal Test";
-  NetDef netdef;
-  netdef.add_external_input("X");
-  netdef.add_external_output("Y");
-  // These two ops can be fused.
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("Conv");
-    op.add_input("X");
-    op.add_input("W");
-    op.add_input("b");
-    op.add_output("Y1");
-  }
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("Conv");
-    op.add_input("Y1");
-    op.add_input("W");
-    op.add_input("b");
-    op.add_output("Y2");
-  }
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("Add");
-    op.add_input("Y1");
-    op.add_input("Y2");
-    op.add_output("Y");
-  }
-
-  netdef = rewritePredictNetForMetal(netdef, "METAL");
-  // dumpDef(netdef);
-
-  auto ty = [&](size_t i) { return netdef.op(i).type(); };
-  auto i0 = [&](size_t i) { return netdef.op(i).input(0); };
-  auto i1 = [&](size_t i) { return netdef.op(i).input(1); };
-  auto o0 = [&](size_t i) { return netdef.op(i).output(0); };
-  CHECK_EQ(netdef.op_size(), 6);
-  CHECK_EQ(ty(0), "CopyToMetalGPU");
-  CHECK_EQ(ty(1), "MetalConv");
-  CHECK_EQ(ty(2), "MetalConv");
-  CHECK_EQ(ty(3), "CopyFromMetalGPU");
-  CHECK_EQ(ty(4), "CopyFromMetalGPU");
-  CHECK_EQ(ty(5), "Add");
-  CHECK_EQ(i0(0), "X");
-  CHECK_EQ(i0(1), o0(0));
-  CHECK_EQ(o0(2), "Y2_M");
-  CHECK_EQ(i0(3), o0(1));
-  CHECK_EQ(i0(4), o0(2));
-  CHECK_EQ(i0(5), o0(3));
-  CHECK_EQ(i1(5), o0(4));
-  CHECK_EQ(o0(5), "Y");
-  CHECK_EQ(netdef.external_input(0), "X");
-  CHECK_EQ(netdef.external_output(0), "Y");
-}
-
-void testMetalRewriteFailure() {
-  LOG(INFO) << "RewriteForMetal Failure Test";
-  NetDef netdef;
-  netdef.add_external_input("X");
-  netdef.add_external_output("Y");
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("Conv");
-    op.add_input("X");
-    op.add_input("W");
-    op.add_input("b");
-    op.add_output("Y1");
-  }
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("Conv");
-    op.add_input("X");
-    op.add_input("W");
-    op.add_input("b");
-    op.add_output("Y2");
-  }
-
-  {
-    auto &op = *(netdef.add_op());
-    op.set_type("Concat");
-    op.add_input("Y1");
-    op.add_input("Y2");
-    op.add_output("Y");
-  }
-  try {;
-    netdef = rewritePredictNetForMetal(netdef, "METAL");
-    // dumpDef(netdef);
-    CHECK(false) << "Shouldn't reach here, due to multiple usages of X";
-  } catch (const std::exception &e) {
-    LOG(INFO) << "RewriteForMetal failed";
-  }
-}
-
-void testMetal() {
-  testMetalCopyOps(1, 3, 2, 1, 1e-2);
-  testMetalCopyOps(1, 4, 1, 1, 1e-2);
-  testMetalCopyOps(1, 4, 8, 3, 1e-2);
-  testMetalCopyOps(1, 6, 8, 3, 1e-2);
-  testMetalCopyOps(1, 4, 1, 2, 1e-2);
-  testMetalCopyOps(1, 8, 6, 1, 1e-2);
-  testMetalCopyOps(1, 12, 13, 18, 1e-2);
-
-  testMetalInstanceNorm(1, 3, 120, 140, 0.05);
-  testMetalInstanceNorm(1, 12, 120, 140, 0.05);
-
-  testMetalPRelu(1, 3, 8, 13, 3, 0.1);
-  testMetalPRelu(1, 3, 8, 13, 1, 0.1);
-
-  testMetalInstanceNormPRelu(1, 12, 120, 140, 0.2);
-
-  testMetalConv(1, 12, 57, 72, 8, 3, 3, 1, 1.5);
-  testMetalConv(1, 12, 57, 72, 8, 3, 3, 2, 1.5);
-  testMetalConv(1, 12, 57, 72, 8, 3, 3, 0, 1.5);
-  testMetalConv(1, 12, 57, 72, 8, 2, 2, 0, 1.5);
-
-#if DEBUGGING
-  testMetalConvTranspose(1, 1, 6, 6, 1, 1, 1, 0, 2, 0.1);
-  testMetalConvTranspose(1, 1, 6, 6, 1, 2, 2, 0, 2, 0.1);
-  testMetalConvTranspose(1, 1, 6, 6, 1, 3, 3, 0, 2, 0.1);
-  testMetalConvTranspose(1, 1, 6, 6, 1, 4, 4, 0, 2, 0.1);
-  testMetalConvTranspose(1, 1, 6, 6, 1, 5, 5, 0, 2, 0.1);
-  testMetalConvTranspose(1, 1, 6, 6, 1, 6, 6, 0, 2, 0.1);
-#endif
-  
-  testMetalConvTranspose(1, 16, 320, 180, 16, 2, 2, 0, 2, 1.5);
-  testMetalConvTranspose(1, 4, 320, 180, 4, 4, 4, 1, 2, 1.5);
-  testMetalConvTranspose(1, 4, 320, 180, 4, 4, 4, 0, 4, 1.5);
-
-  testMetalRewriteWithFusion();
-  testMetalRewriteWithMultiInputCPUOps();
-  testMetalRewriteFailure();
-  
-}
-
-NetDef truncateAfter(NetDef def, size_t idx) {
-  // idx = 0, net = 10 -> remove 9
-  // idx = 0, net = 1 -> remove 0
-  const auto toRemove = def.op_size() - idx - 1;
-  for (auto i = 0; i < toRemove; ++i) {
-    def.mutable_op()->RemoveLast();
-  }
-  CHECK_EQ(def.op_size(), idx + 1);
-  return def;
-}
-
-void compareModels(const NetDef &initNet, NetDef predictNet) {
-  auto *arg = predictNet.mutable_op(0)->mutable_arg(0);
-  CHECK_EQ(arg->name(), "noise_std");
-  arg->set_f(0.000001);
-
-  for (auto i = 0; i < predictNet.op_size(); ++i) {
-    auto truncatedPredictNet = truncateAfter(predictNet, i);
-
-    // The copyFromMetalGPUop is added in the rewriting process
-    NetDef truncatedMetalPredictNet = rewritePredictNetForMetal(truncatedMetalPredictNet, "METAL");
-    NetDef metalInitNet = rewriteInitNetForMetal(metalInitNet, truncatedMetalPredictNet, "METAL");
-
-    // dumpDef(truncatedPredictNet);
-    // dumpDef(truncatedMetalPredictNet);
-
-    Workspace cws;
-    cws.RunNetOnce(initNet);
-    {
-      auto *t = cws.CreateBlob(predictNet.external_input(0))->GetMutable<TensorCPU>();
-      t->Resize(1, 224, 224, 4);
-      for (auto i = 0; i < t->size(); ++i) {
-        t->mutable_data<uint8_t>()[i] = i % 225;
-      }
-    }
-    cws.RunNetOnce(truncatedPredictNet);
-
-    Workspace mws;
-    mws.RunNetOnce(metalInitNet);
-    {
-      auto *t = mws.CreateBlob(predictNet.external_input(0))->GetMutable<TensorCPU>();
-      t->Resize(1, 224, 224, 4);
-      for (auto i = 0; i < t->size(); ++i) {
-        t->mutable_data<uint8_t>()[i] = i % 225;
-      }
-    }
-    mws.RunNetOnce(truncatedMetalPredictNet);
-
-    const auto name = truncatedPredictNet.op(truncatedPredictNet.op_size() - 1).output(0);
-
-    LOG(INFO) << "Checking correspondence for name: " << name << ", idx: " << i;
-    {
-      const auto &mt = mws.GetBlob(name)->Get<TensorCPU>();
-      const auto &ct = cws.GetBlob(name)->Get<TensorCPU>();
-      CHECK_EQ(mt.dims(), ct.dims());
-      for (auto j = 0; j < mt.size(); ++j) {
-        if (mt.IsType<float>()) {
-          if (j < 10) {
-            LOG(INFO) << "i: " << i << ", j: " << j << ", CPU: " << ct.data<float>()[j]
-                      << ", MTL: " << mt.data<float>()[j];
-          }
-          CHECK_NEAR(mt.data<float>()[j], ct.data<float>()[j], 5);
-        } else {
-          CHECK(mt.IsType<uint8_t>());
-          if (j < 10) {
-            LOG(INFO) << "i: " << i << ", j: " << j << ", CPU: " << ct.data<uint8_t>()[j]
-                      << ", MTL: " << mt.data<uint8_t>()[j];
-          }
-          CHECK_NEAR(mt.data<uint8_t>()[j], ct.data<uint8_t>()[j], 5);
-        }
-      }
-    }
-  }
-}
-} // namespace caffe2
--- a/caffe2/contrib/ios/metal/metal_test.h
+++ b/caffe2/contrib/ios/metal/metal_test.h
@ -1,8 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#include "caffe2/proto/caffe2.pb.h"
-
-namespace caffe2 {
-  void testMetal();
-  void compareModels(const NetDef& initNet, NetDef predictNet);
-}
--- a/caffe2/contrib/ios/metal/rewrite_net.cc
+++ b/caffe2/contrib/ios/metal/rewrite_net.cc
@ -1,304 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#include "rewrite_net.h"
-#include "caffe2/utils/proto_utils.h"
-
-namespace caffe2 {
-
-struct Analysis {
-  struct SSA {
-    using BlobVersions = std::unordered_map<std::string, size_t>;
-    BlobVersions inVersions;
-    BlobVersions outVersions;
-  };
-  std::vector<SSA> ssa;
-  std::unordered_map<std::string, std::unordered_map<size_t, std::vector<size_t>>> inUsages;
-};
-
-Analysis analyzeNet(const NetDef &net) {
-  Analysis::SSA::BlobVersions frontier;
-  Analysis analysis;
-
-  auto play = [&](size_t i, const OperatorDef &op) {
-    Analysis::SSA::BlobVersions inVersions;
-    for (const auto &s : op.input()) {
-      inVersions[s] = frontier[s];
-      analysis.inUsages[s][frontier[s]].push_back(i);
-    }
-    Analysis::SSA::BlobVersions outVersions;
-    for (const auto &s : op.output()) {
-      if (frontier.find(s) != frontier.end()) {
-        frontier[s] += 1;
-      }
-      outVersions[s] = frontier[s];
-    }
-    analysis.ssa.push_back(Analysis::SSA{inVersions, outVersions});
-  };
-
-  for (auto i = 0; i < net.op_size(); ++i) {
-    play(i, net.op(i));
-  }
-  return analysis;
-}
-  
-void insertCopyToMetalGPUOp(NetDef &predictNet, const std::string &cpu_blob) {
-  auto *op = predictNet.add_op();
-  op->set_name("CopyFromCPUToGPU");
-  op->set_type("CopyToMetalGPU");
-  op->set_engine("METAL");
-  op->add_input(cpu_blob);
-  op->add_output(cpu_blob + "_M");
-}
-
-void insertCopyFromMetalGPUOp(NetDef &predictNet, const std::string &cpu_blob) {
-  auto *op = predictNet.add_op();
-  op->set_name("CopyFromGPUToCPU");
-  op->set_type("CopyFromMetalGPU");
-  op->add_input(cpu_blob + "_M");
-  op->add_output(cpu_blob);
-}
-
-NetDef insertInputOutputCopyOps(const NetDef &def, std::string engine) {
-  // Do some validation of the outputs. For this version, we require:
-  // - a single input (first element of external_input()) is consumed by the NetDef
-  // - a single output (first element of external_output()) is produced by the NetDef.
-  // - the input is consumed by def.op(0), and this is the only consumer.
-  // - the output is produced by def.op(-1).
-  CAFFE_ENFORCE_GE(def.external_input_size(), 1);
-  CAFFE_ENFORCE_GE(def.external_output_size(), 1);
-  auto analysis = analyzeNet(def);
-  // enforce a single use of the input blob.
-  CAFFE_ENFORCE_GE(def.op_size(), 1);
-
-  const auto &inputBlob = def.external_input(0);
-  // Enforce that the input blob has a single usage - in the first operator.
-  CAFFE_ENFORCE(analysis.inUsages[inputBlob][0] == (std::vector<size_t>{0}));
-  // Enforce that the external_output(0) blob is produced by the last operator in this sequence.
-  const auto &outputBlob = def.external_output(0);
-  CAFFE_ENFORCE(analysis.ssa.back().outVersions.find(outputBlob) != analysis.ssa.back().outVersions.end());
-  const auto &outputBlobVersion = analysis.ssa.back().outVersions[outputBlob];
-  // This should hold true by definition of the SSA analysis.
-  CAFFE_ENFORCE(analysis.inUsages[outputBlob].find(outputBlobVersion) == analysis.inUsages[outputBlob].end());
-
-  NetDef mdef;
-  mdef.CopyFrom(def);
-  mdef.clear_op();
-
-  std::unordered_map<std::string, std::set<size_t>> cpu_blobs, metal_blobs;
-  cpu_blobs[def.external_input(0)].insert(0);
-
-  for (auto i = 0; i < def.op_size(); i++) {
-    const auto &currentOp = def.op(i);
-    if (currentOp.engine() == engine) {
-      // Metal Op
-      // insert copyToMetalOp
-      for (auto j = 0; j < currentOp.input_size(); j++) {
-        auto &input = currentOp.input(j);
-        auto version = analysis.ssa[i].inVersions[input];
-        if (cpu_blobs[input].count(version) > 0) {
-          insertCopyToMetalGPUOp(mdef, input);
-          metal_blobs[input].insert(version);
-          cpu_blobs[input].erase(version);
-        }
-      }
-
-      auto *op = mdef.add_op();
-      op->CopyFrom(currentOp);
-
-      // swap input blob
-      for (auto j = 0; j < currentOp.input_size(); j++) {
-        auto &input = currentOp.input(j);
-        auto version = analysis.ssa[i].inVersions[input];
-        if (metal_blobs[input].count(version) > 0) {
-          op->set_input(j, input + "_M");
-        }
-      }
-
-      // swap output blob
-      for (auto j = 0; j < currentOp.output_size(); j++) {
-        auto &output = currentOp.output(j);
-        auto version = analysis.ssa[i].outVersions[output];
-        op->set_output(j, output + "_M");
-        metal_blobs[output].insert(version);
-      }
-      // insert copyFromMetalOp after the last op if the last op is a metal op
-      if (i == def.op_size() - 1) {
-        insertCopyFromMetalGPUOp(mdef, currentOp.output(0));
-      }
-    } else {
-      // CPU Op
-      // insert copyFromMetalOp
-      for (auto j = 0; j < currentOp.input_size(); j++) {
-        auto &input = currentOp.input(j);
-        auto version = analysis.ssa[i].inVersions[input];
-        if (metal_blobs[input].count(version) > 0) {
-          insertCopyFromMetalGPUOp(mdef, input);
-        }
-      }
-      auto *op = mdef.add_op();
-      op->CopyFrom(currentOp);
-      for (auto j = 0; j < currentOp.output_size(); j++) {
-        auto &output = currentOp.output(j);
-        auto version = analysis.ssa[i].outVersions[output];
-        cpu_blobs[output].insert(version);
-      }
-    }
-  }
-  return mdef;
-}
-
-bool tryFuseAdjacentOps(const OperatorDef &currentOp, const OperatorDef &nextOp, OperatorDef *fusedOp) {
-  // Check for possible invalid opportunities.
-  // Must be identical outputs, with in-place usage for nextOp.
-  if (currentOp.output_size() != 1 || nextOp.output_size() != 1) {
-    return false;
-  }
-  if (currentOp.output(0) != nextOp.input(0) || nextOp.input(0) != nextOp.output(0)) {
-    return false;
-  }
-
-  static const std::map<std::pair<std::string, std::string>, std::string> fusionOpportunities = {{
-      {{"MetalInstanceNorm", "MetalPRelu"}, "MetalInstanceNormPRelu"},
-  }};
-  auto it = fusionOpportunities.find({currentOp.type(), nextOp.type()});
-  if (it == fusionOpportunities.end()) {
-    return false;
-  }
-  LOG(INFO) << "Found a fusion between adjacent ops: (" << currentOp.type() << ", " << nextOp.type() << ") -> "
-            << it->second;
-  fusedOp->CopyFrom(currentOp);
-  fusedOp->set_type(it->second);
-  for (auto i = 1; i < nextOp.input_size(); i++) {
-    fusedOp->add_input(nextOp.input(i));
-  }
-  return true;
-}
-
-NetDef runMetalFusion(const NetDef &def) {
-  CHECK_GE(def.op_size(), 1);
-  NetDef mdef;
-  mdef.CopyFrom(def);
-  mdef.clear_op();
-  auto i = 0;
-
-  while (i < def.op_size()) {
-    if (i == def.op_size() - 1) {
-      VLOG(2) << "Last operator, skipping";
-      auto *op = mdef.add_op();
-      op->CopyFrom(def.op(i));
-      i += 1;
-      continue;
-    }
-
-    const auto &currentOp = def.op(i);
-    const auto &nextOp = def.op(i + 1);
-    OperatorDef fusedOp;
-    if (tryFuseAdjacentOps(currentOp, nextOp, &fusedOp)) {
-      VLOG(2) << "Found an adjacent fusion at: " << i;
-      // We can fuse.
-      auto *op = mdef.add_op();
-      op->CopyFrom(fusedOp);
-      i += 2;
-      continue;
-    }
-    VLOG(2) << "No fusion available";
-    // Just emit the current type.
-    auto *op = mdef.add_op();
-    op->CopyFrom(currentOp);
-    i += 1;
-  }
-  return mdef;
-}
-
-void dumpDef(NetDef &net) {
-  for (const auto &op : net.op()) {
-    printf("***Operator: %s\n", op.type().c_str());
-    for (auto input : op.input()) {
-      printf("\tInput: %s\n", input.c_str());
-    }
-
-    for (auto output : op.output()) {
-      printf("\tOutput: %s\n", output.c_str());
-    }
-  }
-}
-
-NetDef rewritePredictNetForMetal(const NetDef &predictNet, const std::string engine) {
-  CAFFE_ENFORCE_GE(predictNet.op_size(), 1);
-  NetDef net;
-  net.CopyFrom(predictNet);
-
-  std::unordered_map<std::string, std::string> replacements({
-      {"Conv", "MetalConv"},
-      {"InstanceNorm", "MetalInstanceNorm"},
-      {"PRelu", "MetalPRelu"},
-      {"ConvTranspose", "MetalConvTranspose"},
-  });
-
-  for (OperatorDef &op : *net.mutable_op()) {
-    if (op.has_type() && replacements.count(op.type()) > 0) {
-      op.set_type(replacements[op.type()]);
-      op.set_engine("METAL");
-    }
-  }
-
-  net = insertInputOutputCopyOps(net, engine);
-  net = runMetalFusion(net);
-  return net;
-}
-
-NetDef rewriteInitNetForMetal(const NetDef &initNet, const NetDef &predictNet, const std::string engine) {
-  // Find the GivenTensorFill operators for weight tensors and change to Metal GivenTensorFill ops
-  NetDef net;
-  net.CopyFrom(initNet);
-  std::set<std::string> conv_weights, conv_tranpose_weights, weights_and_biases;
-  for (auto &op : predictNet.op()) {
-    if (op.engine() == engine) {
-      if (op.type() == "MetalConv") {
-        conv_weights.insert(op.input(1));
-      } else if (op.type() == "MetalConvTranspose") {
-        conv_tranpose_weights.insert(op.input(1));
-      } else {
-        // Need to add support for operators with > 1 input tensors such as Add
-        for (int i = 1; i < op.input_size(); i++) {
-          weights_and_biases.insert(op.input(i));
-        }
-      }
-    }
-  }
-
-  for (auto &op : *net.mutable_op()) {
-    if (op.type() == "GivenTensorFill" && op.output_size() == 1) {
-      if (conv_weights.count(op.output(0)) > 0) {
-        op.set_type("GivenWeightTensorFill");
-        op.set_engine(engine);
-      } else if (conv_tranpose_weights.count(op.output(0)) > 0) {
-        op.set_type("GivenTransposeWeightTensorFill");
-        op.set_engine(engine);
-      } else if (weights_and_biases.count(op.output(0)) > 0) {
-        op.set_type("GivenTensorFloat16MetalFill");
-        op.set_engine(engine);
-      }
-    }
-  }
-  return net;
-}
-
-bool tryConvertToMetal(const NetDef &initNet, const NetDef &predictNet, NetDef *metalInitNet, NetDef *metalPredictNet) {
-  try {
-    // Throws if unsupported operators are found.
-    *metalPredictNet = rewritePredictNetForMetal(predictNet, "METAL");
-    *metalInitNet = rewriteInitNetForMetal(initNet, *metalPredictNet, "METAL");
-
-    // Throws if unsupported parameters are found.
-    Workspace ws;
-    ws.RunNetOnce(*metalInitNet);
-    ws.CreateNet(*metalPredictNet);
-    LOG(INFO) << "Metal is successfully enabled";
-    return true;
-  } catch (const std::exception &e) {
-    LOG(ERROR) << "Caught exception trying to convert NetDef to Metal: " << e.what();
-    return false;
-  }
-}
-} // namespace caffe2
--- a/caffe2/contrib/ios/metal/rewrite_net.h
+++ b/caffe2/contrib/ios/metal/rewrite_net.h
@ -1,13 +0,0 @@
-// Copyright 2004-present Facebook. All Rights Reserved.
-
-#pragma once
-#include "caffe2/core/predictor.h"
-
-namespace caffe2 {
-bool tryConvertToMetal(const NetDef &initNet, const NetDef &predictNet, NetDef *metalInitNet, NetDef *metalPredictNet);
-
-// Exposed for testing
-NetDef rewritePredictNetForMetal(const NetDef &predictNet, const std::string engine);
-NetDef rewriteInitNetForMetal(const NetDef &initNet, const NetDef &predictNet, const std::string engine);
-void dumpDef(NetDef &net);
-}
--- a/caffe2/contrib/ios/mpscnn/CMakeLists.txt
+++ b/caffe2/contrib/ios/mpscnn/CMakeLists.txt
@ -0,0 +1,7 @@
+if(IOS)
+  file(GLOB_RECURSE tmp *.mm *.cc)
+  # exclude test files
+  file(GLOB_RECURSE test_files *_test.cc)
+  exclude(tmp "${tmp}" ${test_files})
+  set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)
+endif()
--- a/cmake/MiscCheck.cmake
+++ b/cmake/MiscCheck.cmake
@ -85,3 +85,8 @@ if (${CMAKE_CXX_COMPILER_ID} STREQUAL "MSVC")
      /wd4996 # (3): Use of a deprecated member
  )
 endif()
+
+# ---[ If we are building on ios, we will enable -mfpu=neon-fp16 for iOS Metal build.
+if (IOS)
+  add_definitions("-mfpu=neon-fp16")
+endif()