mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00
Fix metal build after sync
Summary: While I was trying to make a quick oss cmakefile, I found that some of the ios source files are out of sync with the most code changes. This diff should fix the issues. I manually ran cmake on the oss side with scripts/build_ios.sh to make sure things pass. Reviewed By: ajtulloch Differential Revision: D5582265 fbshipit-source-id: 2636d353d32fcd8fb7087385b9bbed8476e33e74
This commit is contained in:
committed by
Facebook Github Bot
parent
9fcf676cfa
commit
679a586d53
@ -1,4 +1,5 @@
|
||||
add_subdirectory(gloo)
|
||||
add_subdirectory(ios)
|
||||
add_subdirectory(nccl)
|
||||
add_subdirectory(nnpack)
|
||||
add_subdirectory(observers)
|
||||
|
15
caffe2/contrib/ios/CMakeLists.txt
Normal file
15
caffe2/contrib/ios/CMakeLists.txt
Normal file
@ -0,0 +1,15 @@
|
||||
# TODO: figure out conflict between contrib/nnpack/nnpack_ops.cc and mobile_nnpack.cc
|
||||
if(IOS)
|
||||
# Basic ios srcs.
|
||||
set(Caffe2_CONTRIB_IOS_SRC
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/ios_caffe.cc"
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/ios_caffe_predictor.cc"
|
||||
# "${CMAKE_CURRENT_SOURCE_DIR}/mobile_nnpack.cc"
|
||||
)
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${Caffe2_CONTRIB_IOS_SRC})
|
||||
|
||||
# mpscnn files
|
||||
add_subdirectory(mpscnn)
|
||||
endif()
|
||||
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
|
@ -1,99 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
|
||||
#ifndef CAFFE2_MOBILE
|
||||
#error "Caffe2 mobile state not defined"
|
||||
#endif
|
||||
|
||||
#if CAFFE2_MOBILE
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
#import "data_conversion.h"
|
||||
#import "MetalCaffeContext.h"
|
||||
#import "MetalImageFilter.h"
|
||||
#import "metal_sync_op.h"
|
||||
|
||||
namespace caffe2 {
|
||||
class CopyToMetalGPUOp final : public Operator<MetalCaffeContext> {
|
||||
public:
|
||||
CopyToMetalGPUOp(const OperatorDef &operator_def, Workspace *ws) : Operator<MetalCaffeContext>(operator_def, ws) {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const Blob *blob = Inputs()[0];
|
||||
const TensorCPU &X = blob->Get<TensorCPU>();
|
||||
|
||||
CAFFE_ENFORCE(X.dim32(0) == 1);
|
||||
int input_channels = X.dim32(1);
|
||||
int input_width = X.dim32(3);
|
||||
int input_height = X.dim32(2);
|
||||
|
||||
auto *Y = Output(0);
|
||||
Y->Resize(X.dim32(0), input_channels, input_height, input_width);
|
||||
|
||||
const float *input = X.template data<float>();
|
||||
float *output_data = (float *)[GetMetalAllocator()->Buffer((void *)Y->template mutable_data<float>()) contents];
|
||||
|
||||
memcpycvt(output_data, input, Y->size());
|
||||
|
||||
return true;
|
||||
}
|
||||
};
|
||||
REGISTER_CPU_OPERATOR_WITH_ENGINE(CopyToMetalGPU, METAL, CopyToMetalGPUOp);
|
||||
OPERATOR_SCHEMA(CopyToMetalGPU).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
|
||||
|
||||
class CopyToMetalGPUFloat16Op final : public Operator<MetalCaffeContext> {
|
||||
public:
|
||||
CopyToMetalGPUFloat16Op(const OperatorDef &operator_def, Workspace *ws)
|
||||
: Operator<MetalCaffeContext>(operator_def, ws) {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const Blob *blob = Inputs()[0];
|
||||
const TensorCPU &X = blob->Get<TensorCPU>();
|
||||
|
||||
auto *Y = Output(0);
|
||||
Y->ResizeLike(X);
|
||||
|
||||
const float *input = X.template data<float>();
|
||||
float16_t *output_data =
|
||||
(float16_t *)[GetMetalAllocator()->Buffer((void *)Y->template mutable_data<uint16_t>()) contents];
|
||||
|
||||
memcpycvt(output_data, input, Y->size());
|
||||
|
||||
return true;
|
||||
}
|
||||
};
|
||||
REGISTER_CPU_OPERATOR_WITH_ENGINE(CopyToMetalGPUFloat16, METAL, CopyToMetalGPUFloat16Op);
|
||||
OPERATOR_SCHEMA(CopyToMetalGPUFloat16).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
|
||||
|
||||
class CopyFromMetalGPUOp final : public Operator<CPUContext> {
|
||||
public:
|
||||
CopyFromMetalGPUOp(const OperatorDef &operator_def, Workspace *ws) : Operator<CPUContext>(operator_def, ws) {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const Blob *blob = Inputs()[0];
|
||||
const TensorMetal &X = blob->Get<TensorMetal>();
|
||||
|
||||
CAFFE_ENFORCE(X.dim32(0) == 1);
|
||||
int input_channels = X.dim32(1);
|
||||
int input_width = X.dim32(3);
|
||||
int input_height = X.dim32(2);
|
||||
|
||||
auto *Y = Output(0);
|
||||
Y->Resize(X.dim32(0), input_channels, input_height, input_width);
|
||||
|
||||
const float *input = (float *)[GetMetalAllocator()->Buffer((void *)X.template data<float>()) contents];
|
||||
float *output_data = Y->template mutable_data<float>();
|
||||
|
||||
metal_sync_op();
|
||||
|
||||
memcpycvt(output_data, input, Y->size());
|
||||
|
||||
return true;
|
||||
}
|
||||
};
|
||||
REGISTER_CPU_OPERATOR(CopyFromMetalGPU, CopyFromMetalGPUOp);
|
||||
OPERATOR_SCHEMA(CopyFromMetalGPU).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_MOBILE
|
@ -1,119 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "arm_neon_support.h"
|
||||
|
||||
#import "MetalImageFilter.h"
|
||||
#import "FBMetalConstantValues.h"
|
||||
|
||||
class FBMetalCNNConstantValues : public FBMetalConstantValues {
|
||||
public:
|
||||
ushort input_width;
|
||||
ushort input_height;
|
||||
ushort input_channels;
|
||||
ushort input_stride_x;
|
||||
ushort input_stride_y;
|
||||
ushort input_pad_t;
|
||||
ushort input_pad_l;
|
||||
ushort input_pad_b;
|
||||
ushort input_pad_r;
|
||||
ushort filter_width;
|
||||
ushort filter_height;
|
||||
ushort output_width;
|
||||
ushort output_height;
|
||||
ushort output_channels;
|
||||
bool transposed;
|
||||
|
||||
FBMetalCNNConstantValues(
|
||||
ushort _input_width,
|
||||
ushort _input_height,
|
||||
ushort _input_channels,
|
||||
ushort _input_stride_x,
|
||||
ushort _input_stride_y,
|
||||
ushort _input_pad_t,
|
||||
ushort _input_pad_l,
|
||||
ushort _input_pad_b,
|
||||
ushort _input_pad_r,
|
||||
ushort _filter_width,
|
||||
ushort _filter_height,
|
||||
ushort _output_width,
|
||||
ushort _output_height,
|
||||
ushort _output_channels,
|
||||
bool _transposed) :
|
||||
input_width(_input_width),
|
||||
input_height(_input_height),
|
||||
input_channels(_input_channels),
|
||||
input_stride_x(_input_stride_x),
|
||||
input_stride_y(_input_stride_y),
|
||||
input_pad_t(_input_pad_t),
|
||||
input_pad_l(_input_pad_l),
|
||||
input_pad_b(_input_pad_b),
|
||||
input_pad_r(_input_pad_r),
|
||||
filter_width(_filter_width),
|
||||
filter_height(_filter_height),
|
||||
output_width(_output_width),
|
||||
output_height(_output_height),
|
||||
output_channels(_output_channels),
|
||||
transposed(_transposed) { }
|
||||
|
||||
std::string to_string();
|
||||
};
|
||||
|
||||
constexpr int MAX_KERNELS_PER_CONVOLUTION = 32;
|
||||
|
||||
typedef float32_t data_buffer_type;
|
||||
|
||||
typedef float16_t weight_buffer_type;
|
||||
|
||||
@interface FBMetalCNNConvolution : MetalImageFilter
|
||||
|
||||
@property (nonatomic, strong) id<MTLBuffer> dataBuffer;
|
||||
|
||||
@property (nonatomic, strong) id<MTLBuffer> outputBuffer;
|
||||
|
||||
@property (nonatomic, strong) id<MTLBuffer> weightBuffer;
|
||||
|
||||
@property (nonatomic, strong) id<MTLBuffer> biasBuffer;
|
||||
|
||||
+ (id<MTLBuffer>)loadFilterWithImage:(const float*)weight_data
|
||||
weightBuffer:(id<MTLBuffer>)weightBuffer
|
||||
kernels:(NSUInteger)kernels
|
||||
input_kernels:(NSUInteger)input_kernels
|
||||
kernel_offset:(NSUInteger)kernel_offset
|
||||
kernel_stride:(NSUInteger)kernel_stride
|
||||
channels:(NSUInteger)channels
|
||||
width:(NSUInteger)width
|
||||
height:(NSUInteger)height
|
||||
transposed:(bool)transposed
|
||||
context:(MetalContext*)context;
|
||||
|
||||
- (void)loadFilterWithImage:(const float*)weight_data
|
||||
kernels:(NSUInteger)kernels
|
||||
input_kernels:(NSUInteger)input_kernels
|
||||
kernel_offset:(NSUInteger)kernel_offset
|
||||
kernel_stride:(NSUInteger)kernel_stride
|
||||
channels:(NSUInteger)channels
|
||||
width:(NSUInteger)image_width
|
||||
height:(NSUInteger)image_height
|
||||
transposed:(bool)transposed;
|
||||
|
||||
+ (id<MTLBuffer>)loadDataWithImage:(const float*)imageData
|
||||
imageDataBuffer:(id<MTLBuffer>)imageDataBuffer
|
||||
channels:(NSUInteger)channels
|
||||
width:(NSUInteger)width
|
||||
height:(NSUInteger)height
|
||||
context:(MetalContext*)context;
|
||||
|
||||
- (void)loadBiasData:(const float *)bias
|
||||
length:(NSUInteger)length;
|
||||
|
||||
+ (instancetype)filterWithContext:(MetalContext*)context
|
||||
channels:(NSUInteger)channels
|
||||
kernel_size:(NSUInteger)kernel_size
|
||||
constantValues:(FBMetalCNNConstantValues*)constantValues
|
||||
width:(NSUInteger)width
|
||||
height:(NSUInteger)height
|
||||
stride_x:(NSUInteger)stride_x
|
||||
stride_y:(NSUInteger)stride_y;
|
||||
@end
|
@ -1,538 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#import "FBMetalConstantValues.h"
|
||||
#import "FBMetalCNNConvolution.h"
|
||||
#import "data_conversion.h"
|
||||
#import "MetalShaderUtilities.h"
|
||||
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
static constexpr size_t kThreadGroupSize_x = 4;
|
||||
static constexpr size_t kThreadGroupSize_y = 8;
|
||||
|
||||
std::string FBMetalCNNConstantValues::to_string() {
|
||||
std::ostringstream ss;
|
||||
|
||||
ss << "X:" <<
|
||||
input_channels << "x" <<
|
||||
input_width << "x" <<
|
||||
input_height << "[" <<
|
||||
input_stride_x << ":" <<
|
||||
input_stride_y << ":" <<
|
||||
input_pad_t << ":" <<
|
||||
input_pad_l << ":" <<
|
||||
input_pad_b << ":" <<
|
||||
input_pad_r << "]-Y:" <<
|
||||
output_channels << "x" <<
|
||||
output_width << "x" <<
|
||||
output_height << "-W:" <<
|
||||
filter_width << "x" <<
|
||||
filter_height << ":" <<
|
||||
(transposed ? "T" : "D");
|
||||
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
namespace {
|
||||
extern const char *metalCode;
|
||||
}
|
||||
|
||||
@interface FBMetalCNNConvolution () {
|
||||
int stride_x;
|
||||
int stride_y;
|
||||
uint input_batch_size;
|
||||
uint output_channels;
|
||||
}
|
||||
@end
|
||||
|
||||
@implementation FBMetalCNNConvolution
|
||||
|
||||
+ (id<MTLBuffer>)loadFilterWithImage:(const float*)weight_data
|
||||
weightBuffer:(id<MTLBuffer>)weightBuffer
|
||||
kernels:(NSUInteger)kernels
|
||||
input_kernels:(NSUInteger)input_kernels
|
||||
kernel_offset:(NSUInteger)kernel_offset
|
||||
kernel_stride:(NSUInteger)kernel_stride
|
||||
channels:(NSUInteger)channels
|
||||
width:(NSUInteger)width
|
||||
height:(NSUInteger)height
|
||||
transposed:(bool)transposed
|
||||
context:(MetalContext*)context
|
||||
|
||||
{
|
||||
reformatKernelImage<weight_buffer_type>(
|
||||
weight_data,
|
||||
kernels,
|
||||
input_kernels,
|
||||
kernel_offset,
|
||||
kernel_stride,
|
||||
channels,
|
||||
width,
|
||||
height,
|
||||
transposed,
|
||||
[&](size_t buffer_size) -> weight_buffer_type* {
|
||||
if (weightBuffer == nil || [weightBuffer length] != sizeof(weight_buffer_type) * buffer_size) {
|
||||
weightBuffer = [context.device newBufferWithLength:sizeof(weight_buffer_type) * buffer_size
|
||||
options:MTLResourceOptionCPUCacheModeDefault];
|
||||
if (weightBuffer == nil) {
|
||||
LOG(ERROR) << "couldn't create weight buffer of size: " << buffer_size;
|
||||
}
|
||||
}
|
||||
|
||||
return (weight_buffer_type*)(weightBuffer ? [weightBuffer contents] : NULL);
|
||||
});
|
||||
|
||||
return weightBuffer;
|
||||
}
|
||||
|
||||
- (void)loadFilterWithImage:(const float*)weight_data
|
||||
kernels:(NSUInteger)kernels
|
||||
input_kernels:(NSUInteger)input_kernels
|
||||
kernel_offset:(NSUInteger)kernel_offset
|
||||
kernel_stride:(NSUInteger)kernel_stride
|
||||
channels:(NSUInteger)channels
|
||||
width:(NSUInteger)width
|
||||
height:(NSUInteger)height
|
||||
transposed:(bool)transposed {
|
||||
_weightBuffer = [FBMetalCNNConvolution loadFilterWithImage:weight_data
|
||||
weightBuffer:_weightBuffer
|
||||
kernels:kernels
|
||||
input_kernels:input_kernels
|
||||
kernel_offset:kernel_offset
|
||||
kernel_stride:kernel_stride
|
||||
channels:channels
|
||||
width:width
|
||||
height:height
|
||||
transposed:transposed
|
||||
context:self.context];
|
||||
}
|
||||
|
||||
- (void)loadBiasData:(const float *)bias
|
||||
length:(NSUInteger)length {
|
||||
if (_biasBuffer == nil || [_biasBuffer length] != sizeof(weight_buffer_type) * length) {
|
||||
_biasBuffer = [self.context.device newBufferWithLength:sizeof(weight_buffer_type) * length
|
||||
options:MTLResourceOptionCPUCacheModeDefault];
|
||||
}
|
||||
if (_biasBuffer) {
|
||||
weight_buffer_type *bias_data = (weight_buffer_type *) [_biasBuffer contents];
|
||||
for (int i = 0; i < length; i++) {
|
||||
bias_data[i] = bias[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+ (id<MTLBuffer>)loadDataWithImage:(const float*)imageData
|
||||
imageDataBuffer:(id<MTLBuffer>)imageDataBuffer
|
||||
channels:(NSUInteger)channels
|
||||
width:(NSUInteger)width
|
||||
height:(NSUInteger)height
|
||||
context:(MetalContext*)context {
|
||||
id<MTLBuffer> newDataBuffer = imageDataBuffer;
|
||||
|
||||
reformatInputImage<data_buffer_type>(
|
||||
imageData, channels, width, height, [&](size_t buffer_size) -> data_buffer_type* {
|
||||
if (newDataBuffer == nil || [newDataBuffer length] != sizeof(data_buffer_type) * buffer_size) {
|
||||
newDataBuffer = [context.device newBufferWithLength:sizeof(data_buffer_type) * buffer_size
|
||||
options:MTLResourceOptionCPUCacheModeDefault];
|
||||
if (newDataBuffer == nil) {
|
||||
VLOG(0) << "couldn't create data buffer of size: " << buffer_size;
|
||||
}
|
||||
}
|
||||
|
||||
return newDataBuffer ? (data_buffer_type*)[newDataBuffer contents] : NULL;
|
||||
});
|
||||
|
||||
return newDataBuffer;
|
||||
}
|
||||
|
||||
+ (instancetype)filterWithContext:(MetalContext*)context
|
||||
channels:(NSUInteger)channels
|
||||
kernel_size:(NSUInteger)kernel_size
|
||||
constantValues:(FBMetalCNNConstantValues*)constantValues
|
||||
width:(NSUInteger)width
|
||||
height:(NSUInteger)height
|
||||
stride_x:(NSUInteger)stride_x
|
||||
stride_y:(NSUInteger)stride_y {
|
||||
return [[self alloc] initWithContext:context
|
||||
channels:(NSUInteger)channels
|
||||
kernel_size:kernel_size
|
||||
constantValues:constantValues
|
||||
width:width
|
||||
height:height
|
||||
stride_x:stride_x
|
||||
stride_y:stride_y];
|
||||
}
|
||||
|
||||
- (MTLSize)threadsPerThreadgroup {
|
||||
if (input_batch_size > 1)
|
||||
return MTLSizeMake(input_batch_size * kThreadGroupSize_x, kThreadGroupSize_y / input_batch_size, 1);
|
||||
else
|
||||
return MTLSizeMake(kThreadGroupSize_x, kThreadGroupSize_y, 1);
|
||||
}
|
||||
|
||||
- (MTLSize)threadgroupsPerGrid {
|
||||
MTLSize threadsPerThreadgroup = [self threadsPerThreadgroup];
|
||||
|
||||
return MTLSizeMake(
|
||||
((input_batch_size * self.outputTextureDescriptor.width + stride_x - 1) / stride_x + threadsPerThreadgroup.width - 1) /
|
||||
threadsPerThreadgroup.width,
|
||||
((self.outputTextureDescriptor.height + stride_y - 1) / stride_y + threadsPerThreadgroup.height - 1) /
|
||||
threadsPerThreadgroup.height,
|
||||
1);
|
||||
}
|
||||
|
||||
// TODO: this code is temporary, we need to find the optimal strategy for large channel numbers
|
||||
|
||||
static uint input_channel_batching(uint input_channels) {
|
||||
return input_channels <= 32 ? 1 :
|
||||
input_channels % 8 == 0 ? 8 :
|
||||
input_channels % 4 == 0 ? 4 :
|
||||
input_channels % 3 == 0 ? 3 :
|
||||
input_channels % 2 == 0 ? 2 : 1;
|
||||
}
|
||||
|
||||
- (instancetype)initWithContext:(MetalContext*)context
|
||||
channels:(NSUInteger)channels
|
||||
kernel_size:(NSUInteger)kernel_size
|
||||
constantValues:(FBMetalCNNConstantValues*)constantValues
|
||||
width:(NSUInteger)width
|
||||
height:(NSUInteger)height
|
||||
stride_x:(NSUInteger)_stride_x
|
||||
stride_y:(NSUInteger)_stride_y {
|
||||
if ((self = [super initWithFunctionName:@"cnn_convolution_kern"
|
||||
libraryName:@"Convolution"
|
||||
librarySource:[NSString stringWithCString:metalCode encoding:NSUTF8StringEncoding]
|
||||
context:context
|
||||
constantValues:constantValues])) {
|
||||
stride_x = _stride_x;
|
||||
stride_y = _stride_y;
|
||||
|
||||
input_batch_size = input_channel_batching(constantValues->input_channels);
|
||||
output_channels = constantValues->output_channels;
|
||||
|
||||
super.outputTextureDescriptor = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA16Float
|
||||
width:width
|
||||
height:height
|
||||
mipmapped:NO];
|
||||
}
|
||||
return self;
|
||||
}
|
||||
|
||||
- (NSString*)replaceConstantValues:(FBMetalConstantValues *)constantValues
|
||||
librarySource:(NSString*)librarySource {
|
||||
FBMetalCNNConstantValues* convolutionConstantValues = (FBMetalCNNConstantValues *) constantValues;
|
||||
std::string source = [librarySource UTF8String];
|
||||
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->output_width, 0);
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->output_height, 1);
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->input_width, 2);
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->input_height, 3);
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->input_stride_x, 4);
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->input_stride_y, 5);
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->input_pad_t, 6);
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->input_pad_l, 7);
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->filter_width, 8);
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->filter_height, 9);
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->input_channels, 10);
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->output_channels, 11);
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->transposed, 12);
|
||||
REPLACE_CONSTANT(source, input_channel_batching(convolutionConstantValues->input_channels), 13);
|
||||
|
||||
return [NSString stringWithUTF8String:source.c_str()];
|
||||
}
|
||||
|
||||
// Bind data between C and Metal
|
||||
|
||||
- (void) configureArgumentTableWithCommandEncoder:(id<MTLComputeCommandEncoder>)commandEncoder
|
||||
weightBufferOffset:(NSInteger)weightBufferOffset
|
||||
outputBufferOffset:(NSInteger)outputBufferOffset {
|
||||
[commandEncoder setBuffer:_biasBuffer offset:0 atIndex:0];
|
||||
[commandEncoder setBuffer:_weightBuffer offset:weightBufferOffset atIndex:1];
|
||||
[commandEncoder setBuffer:_dataBuffer offset:0 atIndex:2];
|
||||
[commandEncoder setBuffer:_outputBuffer offset:outputBufferOffset atIndex:3];
|
||||
}
|
||||
@end
|
||||
|
||||
namespace {
|
||||
const char *metalCode = R"Metal(
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#include <metal_stdlib>
|
||||
|
||||
using namespace metal;
|
||||
|
||||
// These function_constant expressions are replaced at compile time with actual values
|
||||
|
||||
constant constexpr ushort output_size_x [[ function_constant(0) ]];
|
||||
constant constexpr ushort output_size_y [[ function_constant(1) ]];
|
||||
constant constexpr ushort input_size_x [[ function_constant(2) ]];
|
||||
constant constexpr ushort input_size_y [[ function_constant(3) ]];
|
||||
constant constexpr ushort input_stride_x [[ function_constant(4) ]];
|
||||
constant constexpr ushort input_stride_y [[ function_constant(5) ]];
|
||||
constant constexpr ushort input_pad_t [[ function_constant(6) ]];
|
||||
constant constexpr ushort input_pad_l [[ function_constant(7) ]];
|
||||
constant constexpr ushort filter_width [[ function_constant(8) ]];
|
||||
constant constexpr ushort filter_height [[ function_constant(9) ]];
|
||||
constant constexpr ushort input_channels [[ function_constant(10) ]];
|
||||
constant constexpr ushort output_channels [[ function_constant(11) ]];
|
||||
constant constexpr bool transposed [[ function_constant(12) ]];
|
||||
constant constexpr ushort input_batch_size [[ function_constant(13) ]];
|
||||
|
||||
constant constexpr ushort2 input_padding = {
|
||||
transposed ? (filter_width - 1 - input_pad_l) : input_pad_l,
|
||||
transposed ? (filter_height - 1 - input_pad_t) : input_pad_t,
|
||||
};
|
||||
|
||||
constant constexpr ushort DataSize = output_channels <= 2 ? output_channels : 4;
|
||||
constant constexpr ushort DataLength = (output_channels + DataSize - 1) / DataSize;
|
||||
|
||||
typedef float data_buffer_type;
|
||||
|
||||
typedef vec<half, DataSize> vec_t;
|
||||
|
||||
typedef struct {
|
||||
data_buffer_type data[input_channels][input_size_y][input_size_x];
|
||||
} input_data;
|
||||
|
||||
typedef struct {
|
||||
data_buffer_type data[output_channels][output_size_y][output_size_x];
|
||||
} output_data;
|
||||
|
||||
typedef struct {
|
||||
vec_t data[input_channels][filter_height][filter_width][DataLength];
|
||||
} filter_data;
|
||||
|
||||
typedef struct {
|
||||
half data[output_channels];
|
||||
} bias_data;
|
||||
|
||||
typedef vec_t (thread_storage)[input_stride_y][input_stride_x][DataLength];
|
||||
|
||||
// Filter data sampler - returns an aligned array of vectors for the filter interleaved output channel data
|
||||
|
||||
template <typename T>
|
||||
class planar_filter_sampler {
|
||||
const constant T (&channel_data)[filter_height][filter_width][DataLength];
|
||||
|
||||
public:
|
||||
planar_filter_sampler(constant T data[input_channels][filter_height][filter_width][DataLength], ushort channel)
|
||||
: channel_data(data[channel]) {}
|
||||
|
||||
typedef constant T (&return_type)[DataLength];
|
||||
|
||||
return_type operator()(ushort2 off) {
|
||||
return channel_data[off.y][off.x];
|
||||
}
|
||||
};
|
||||
|
||||
// Input data sampler - returns a scalar sample
|
||||
|
||||
template <typename T>
|
||||
class planar_data_sampler {
|
||||
const constant T (&channel_data)[input_size_y][input_size_x];
|
||||
const ushort2 base;
|
||||
|
||||
public:
|
||||
planar_data_sampler(const constant T data[input_channels][input_size_y][input_size_x], ushort2 _base, ushort channel)
|
||||
: channel_data(data[channel]), base(_base) {}
|
||||
|
||||
inline T operator()(ushort2 off) {
|
||||
/*
|
||||
* Note: Even though the documentation states otherwise, Metal will execute both sides of ternary statements,
|
||||
* and, if the array index is out of bounds, it will crash.
|
||||
*/
|
||||
|
||||
const ushort2 idx = base + off;
|
||||
bool in_bounds = all(idx >= input_padding) &&
|
||||
all(idx < (ushort2(input_size_x, input_size_y) * ushort2(input_stride_x, input_stride_y) + input_padding));
|
||||
const ushort2 padded_idx = (idx - (in_bounds ? input_padding : 0)) / ushort2(input_stride_x, input_stride_y);
|
||||
return in_bounds ? channel_data[padded_idx.y][padded_idx.x] : T(0);
|
||||
}
|
||||
};
|
||||
|
||||
/*
|
||||
* To speed up data access in the convolution inner loop we use vector data access for the filter.
|
||||
* Every input pixel is multiplied by all the kernel values to generate all the output channels at once.
|
||||
*/
|
||||
|
||||
template <ushort DataRemainder, typename T>
|
||||
void accumulate(const ushort DataLength, T dst[], const constant T w[], const half sample) {
|
||||
for (ushort i = 0; i < DataLength; i++) {
|
||||
dst[i] += sample * w[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <>
|
||||
void accumulate<3, half4>(const ushort DataLength, half4 dst[], const constant half4 w[], const half sample) {
|
||||
accumulate<0, half4>(DataLength - 1, dst, w, sample);
|
||||
dst[DataLength - 1].xyz += sample * w[DataLength - 1].xyz;
|
||||
}
|
||||
|
||||
template <>
|
||||
void accumulate<2, half4>(const ushort DataLength, half4 dst[], const constant half4 w[], const half sample) {
|
||||
accumulate<0, half4>(DataLength - 1, dst, w, sample);
|
||||
dst[DataLength - 1].xy += sample * w[DataLength - 1].xy;
|
||||
}
|
||||
|
||||
template <>
|
||||
void accumulate<1, half4>(const ushort DataLength, half4 dst[], const constant half4 w[], const half sample) {
|
||||
accumulate<0, half4>(DataLength - 1, dst, w, sample);
|
||||
dst[DataLength - 1].x += sample * w[DataLength - 1].x;
|
||||
}
|
||||
|
||||
/*
|
||||
* Convolution inner loop: works for direct convolution (stride == 1) and for the transposed (stride > 1)
|
||||
*/
|
||||
|
||||
template <typename T, typename S1, typename S2>
|
||||
void convolution(T acc[input_stride_y][input_stride_x][DataLength], S1 in, S2 weights) {
|
||||
for (ushort y = 0; y < filter_height; y++) {
|
||||
for (ushort x = 0; x < filter_width; x++) {
|
||||
const ushort2 off(x, y);
|
||||
const ushort2 p0 = (input_padding + ushort2(input_stride_x, input_stride_y) - off) % ushort2(input_stride_x, input_stride_y);
|
||||
const half sample = in(off + p0);
|
||||
auto w = weights(off);
|
||||
|
||||
accumulate<output_channels % DataSize, T>(DataLength, acc[p0.y][p0.x], w, sample);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void array_zero(T data[], ushort size) {
|
||||
for (ushort i = 0; i < size; i++) {
|
||||
data[i] = T(0);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
void array_copy(T1 dst[], const T2 src[], ushort size) {
|
||||
for (ushort i = 0; i < size; i++) {
|
||||
dst[i] = src[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
void array_add(T1 dst[], const T2 src[], ushort size) {
|
||||
for (ushort i = 0; i < size; i++) {
|
||||
dst[i] += src[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <ushort DataSize, typename T = vec<half, DataSize>>
|
||||
const half element(const T data[], ushort i) {
|
||||
return data[i / DataSize][i % DataSize];
|
||||
}
|
||||
|
||||
template <ushort DataSize, typename T = vec<half, DataSize>>
|
||||
const half element(const threadgroup T data[], ushort i) {
|
||||
return data[i / DataSize][i % DataSize];
|
||||
}
|
||||
|
||||
template <>
|
||||
const half element<1, half>(const half data[], ushort i) {
|
||||
return data[i];
|
||||
}
|
||||
|
||||
static void cnn_convolution_full(constant bias_data &bias,
|
||||
constant filter_data &filters,
|
||||
constant input_data &input,
|
||||
device output_data &output,
|
||||
ushort2 xgid)
|
||||
{
|
||||
ushort2 gid = ushort2(input_stride_x, input_stride_y) * xgid;
|
||||
|
||||
if (all(gid < ushort2(output_size_x, output_size_y))) {
|
||||
thread_storage storage;
|
||||
array_zero((thread vec_t *) storage, sizeof(thread_storage) / sizeof(vec_t));
|
||||
|
||||
for (ushort channel = 0; channel < input_channels; channel++) {
|
||||
planar_data_sampler<data_buffer_type> in(input.data, gid, channel);
|
||||
planar_filter_sampler<vec_t> weights(filters.data, channel);
|
||||
|
||||
convolution(storage, in, weights);
|
||||
}
|
||||
|
||||
for (ushort c = 0; c < output_channels; c++) {
|
||||
for (ushort y = 0; y < input_stride_y; y++) {
|
||||
for (ushort x = 0; x < input_stride_x; x++) {
|
||||
ushort2 idx = gid + ushort2(x, y);
|
||||
if (all(idx < ushort2(output_size_x, output_size_y)))
|
||||
output.data[c][idx.y][idx.x] = element<DataSize, vec_t>(storage[y][x], c) + bias.data[c];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void cnn_convolution_batch(constant bias_data &bias,
|
||||
constant filter_data &filters,
|
||||
constant input_data &input,
|
||||
device output_data &output,
|
||||
threadgroup thread_storage *batch_storage,
|
||||
ushort2 xgid)
|
||||
{
|
||||
ushort2 gid = ushort2(input_stride_x, input_stride_y) * ushort2(xgid.x / input_batch_size, xgid.y);
|
||||
ushort batch = xgid.x % input_batch_size;
|
||||
|
||||
if (gid.x < output_size_x && gid.y < output_size_y) {
|
||||
thread_storage storage;
|
||||
const ushort data_size = sizeof(thread_storage) / sizeof(vec_t);
|
||||
array_zero((thread vec_t *) storage, data_size);
|
||||
|
||||
ushort base_channel = batch * input_channels / input_batch_size;
|
||||
for (ushort channel = 0; channel < input_channels / input_batch_size; channel++) {
|
||||
planar_data_sampler<data_buffer_type> in(input.data, gid, base_channel + channel);
|
||||
planar_filter_sampler<vec_t> weights(filters.data, base_channel + channel);
|
||||
|
||||
convolution(storage, in, weights);
|
||||
}
|
||||
|
||||
for (ushort b = 0; b < input_batch_size; b++) {
|
||||
threadgroup_barrier(mem_flags::mem_device);
|
||||
|
||||
if (b == batch) {
|
||||
for (ushort y = 0; y < input_stride_y; y++) {
|
||||
for (ushort x = 0; x < input_stride_x; x++) {
|
||||
thread half *h_storage = (thread half *) storage[y][x];
|
||||
|
||||
for (ushort c = 0; c < output_channels; c++) {
|
||||
ushort2 idx = gid + ushort2(x, y);
|
||||
if (all(idx < ushort2(output_size_x, output_size_y))) {
|
||||
device data_buffer_type *out_channel = &output.data[c][gid.y + y][gid.x + x];
|
||||
*out_channel = h_storage[c] + (b == 0 ? bias.data[c] : *out_channel);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (ushort b = 0; b < input_batch_size; b++) {
|
||||
threadgroup_barrier(mem_flags::mem_device);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// clang-format off
|
||||
|
||||
kernel void cnn_convolution_kern(constant bias_data &bias [[ buffer(0) ]],
|
||||
constant filter_data &filters [[ buffer(1) ]],
|
||||
constant input_data &input [[ buffer(2) ]],
|
||||
device output_data &output [[ buffer(3) ]],
|
||||
threadgroup thread_storage *storage [[ threadgroup(0) ]],
|
||||
ushort tid [[ thread_index_in_threadgroup ]],
|
||||
ushort2 xgid [[ thread_position_in_grid ]])
|
||||
|
||||
// clang-format on
|
||||
|
||||
{
|
||||
if (input_batch_size > 1) {
|
||||
ushort batch_index = (input_batch_size-1) * (tid / (input_batch_size));
|
||||
cnn_convolution_batch(bias, filters, input, output, &storage[batch_index], xgid);
|
||||
} else {
|
||||
cnn_convolution_full(bias, filters, input, output, xgid);
|
||||
}
|
||||
}
|
||||
)Metal";
|
||||
}
|
@ -1,61 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#import "arm_neon_support.h"
|
||||
#import "MetalImageFilter.h"
|
||||
|
||||
class FBMetalInstanceNormConstantValues : public FBMetalConstantValues {
|
||||
public:
|
||||
ushort input_width;
|
||||
ushort input_height;
|
||||
ushort input_channels;
|
||||
ushort prelu_size;
|
||||
|
||||
FBMetalInstanceNormConstantValues(
|
||||
ushort _input_width,
|
||||
ushort _input_height,
|
||||
ushort _input_channels,
|
||||
ushort _prelu_size)
|
||||
: input_width(_input_width),
|
||||
input_height(_input_height),
|
||||
input_channels(_input_channels),
|
||||
prelu_size(_prelu_size) {}
|
||||
|
||||
std::string to_string();
|
||||
};
|
||||
|
||||
@interface FBMetalCNNInstanceNormBase : MetalImageFilter
|
||||
|
||||
@property (nonatomic, strong) id<MTLBuffer> dataBuffer;
|
||||
@property (nonatomic, strong) id<MTLBuffer> outputBuffer;
|
||||
@property (nonatomic, strong) id<MTLBuffer> scaleBuffer;
|
||||
@property (nonatomic, strong) id<MTLBuffer> biasBuffer;
|
||||
@property (nonatomic, strong) id<MTLBuffer> avgBuffer;
|
||||
@property (nonatomic, strong) id<MTLBuffer> stdevBuffer;
|
||||
@property (nonatomic, strong) id<MTLBuffer> preluBuffer;
|
||||
@property (nonatomic, strong) id<MTLBuffer> epsilonBuffer;
|
||||
|
||||
@property (nonatomic, copy) NSString *functionName;
|
||||
|
||||
+ (instancetype)filterWithContext:(MetalContext*)context
|
||||
functionName:(NSString*)functionName
|
||||
constantValues:(FBMetalInstanceNormConstantValues*)constantValues
|
||||
width:(NSUInteger)width
|
||||
height:(NSUInteger)height
|
||||
channel:(NSUInteger)channel;
|
||||
@end
|
||||
|
||||
|
||||
@interface FBMetalCNNInstanceNorm: FBMetalCNNInstanceNormBase
|
||||
|
||||
+ (instancetype)filterWithContext:(MetalContext*)context
|
||||
constantValues:(FBMetalInstanceNormConstantValues*)constantValues
|
||||
width:(NSUInteger)width
|
||||
height:(NSUInteger)height
|
||||
channel:(NSUInteger)channel
|
||||
withPRelu:(BOOL)withPRelu;
|
||||
|
||||
- (void)loadEpsilon:(const float)epsilon;
|
||||
|
||||
@end
|
@ -1,407 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#import "FBMetalCNNInstanceNorm.h"
|
||||
#import "MetalShaderUtilities.h"
|
||||
|
||||
|
||||
std::string FBMetalInstanceNormConstantValues::to_string() {
|
||||
std::ostringstream ss;
|
||||
ss << ":" << input_width << ":" << input_height << ":" << input_channels << ":" << prelu_size;
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
@interface FBMetalCNNInstanceNormBase () {
|
||||
struct {
|
||||
ushort channel;
|
||||
ushort width;
|
||||
ushort height;
|
||||
} configuration;
|
||||
}
|
||||
@end
|
||||
|
||||
namespace {
|
||||
extern const char *metalCode;
|
||||
}
|
||||
|
||||
@implementation FBMetalCNNInstanceNormBase
|
||||
|
||||
static constexpr size_t kThreadGroupSize_x = 4;
|
||||
static constexpr size_t kThreadGroupSize_y = 8;
|
||||
|
||||
static NSArray* kernelNames = @[
|
||||
@"cnn_instance_norm_avg_kern",
|
||||
@"cnn_instance_norm_stdev_kern",
|
||||
@"cnn_instance_norm_last_kern",
|
||||
@"cnn_instance_norm_prelu_kern"
|
||||
];
|
||||
|
||||
+ (instancetype)filterWithContext:(MetalContext*)context
|
||||
functionName:(NSString*)functionName
|
||||
constantValues:(FBMetalInstanceNormConstantValues*)constantValues
|
||||
width:(NSUInteger)width
|
||||
height:(NSUInteger)height
|
||||
channel:(NSUInteger)channel {
|
||||
return [[self alloc] initWithContext:context
|
||||
functionName:functionName
|
||||
constantValues:constantValues
|
||||
width:width
|
||||
height:height
|
||||
channel:channel];
|
||||
}
|
||||
|
||||
- (instancetype)initWithContext:(MetalContext*)context
|
||||
functionName:(NSString*)functionName
|
||||
constantValues:(FBMetalInstanceNormConstantValues*)constantValues
|
||||
width:(NSUInteger)width
|
||||
height:(NSUInteger)height
|
||||
channel:(NSUInteger)channel {
|
||||
if ((self = [super initWithFunctionName:functionName
|
||||
libraryName:@"InstanceNorm"
|
||||
librarySource:[NSString stringWithCString:metalCode encoding:NSUTF8StringEncoding]
|
||||
context:context
|
||||
constantValues:constantValues])) {
|
||||
_functionName = functionName;
|
||||
|
||||
configuration.channel = channel;
|
||||
configuration.width = width;
|
||||
configuration.height = height;
|
||||
|
||||
super.outputTextureDescriptor = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA16Float
|
||||
width:width
|
||||
height:height
|
||||
mipmapped:NO];
|
||||
}
|
||||
return self;
|
||||
}
|
||||
|
||||
- (MTLSize)threadsPerThreadgroup {
|
||||
if (_functionName == kernelNames[0] || _functionName == kernelNames[1]) {
|
||||
NSUInteger maxTotalThreadsPerThreadgroup = [self.pipeline maxTotalThreadsPerThreadgroup];
|
||||
return MTLSizeMake(maxTotalThreadsPerThreadgroup, 1, 1);
|
||||
} else {
|
||||
return MTLSizeMake(kThreadGroupSize_x, kThreadGroupSize_y, 1);
|
||||
}
|
||||
}
|
||||
|
||||
- (MTLSize)threadgroupsPerGrid {
|
||||
if (_functionName == kernelNames[0] || _functionName == kernelNames[1]) {
|
||||
return MTLSizeMake(configuration.channel, 1, 1);
|
||||
} else {
|
||||
MTLSize threadsPerThreadgroup = [self threadsPerThreadgroup];
|
||||
return MTLSizeMake(
|
||||
(self.outputTextureDescriptor.width + threadsPerThreadgroup.width - 1) / threadsPerThreadgroup.width,
|
||||
(self.outputTextureDescriptor.height + threadsPerThreadgroup.height - 1) / threadsPerThreadgroup.height,
|
||||
1);
|
||||
}
|
||||
}
|
||||
|
||||
- (NSString*)replaceConstantValues:(FBMetalConstantValues*)constantValues librarySource:(NSString*)librarySource {
|
||||
FBMetalInstanceNormConstantValues* convolutionConstantValues = (FBMetalInstanceNormConstantValues*)constantValues;
|
||||
std::string source = [librarySource UTF8String];
|
||||
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->input_width * convolutionConstantValues->input_height, 0);
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->input_width, 1);
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->input_height, 2);
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->input_channels, 3);
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->prelu_size, 4);
|
||||
|
||||
return [NSString stringWithUTF8String:source.c_str()];
|
||||
}
|
||||
|
||||
- (void)configureArgumentTableWithCommandEncoder:(id<MTLComputeCommandEncoder>)commandEncoder
|
||||
weightBufferOffset:(NSInteger)weightBufferOffset
|
||||
outputBufferOffset:(NSInteger)outputBufferOffset {
|
||||
[commandEncoder setBuffer:_scaleBuffer offset:0 atIndex:0];
|
||||
[commandEncoder setBuffer:_biasBuffer offset:0 atIndex:1];
|
||||
[commandEncoder setBuffer:_dataBuffer offset:0 atIndex:2];
|
||||
[commandEncoder setBuffer:_outputBuffer offset:0 atIndex:3];
|
||||
[commandEncoder setBuffer:_avgBuffer offset:0 atIndex:4];
|
||||
[commandEncoder setBuffer:_stdevBuffer offset:0 atIndex:5];
|
||||
[commandEncoder setBuffer:_epsilonBuffer offset:0 atIndex:6];
|
||||
if (_preluBuffer != nil) {
|
||||
[commandEncoder setBuffer:_preluBuffer offset:0 atIndex:7];
|
||||
}
|
||||
|
||||
if (_functionName == kernelNames[0] || _functionName == kernelNames[1]) {
|
||||
MTLSize threads = [self threadsPerThreadgroup];
|
||||
const int threadGroupMemoryLength = threads.width * sizeof(float);
|
||||
[commandEncoder setThreadgroupMemoryLength:threadGroupMemoryLength atIndex:0];
|
||||
}
|
||||
}
|
||||
@end
|
||||
|
||||
@implementation FBMetalCNNInstanceNorm {
|
||||
NSMutableArray<FBMetalCNNInstanceNormBase*>* instanceNorm;
|
||||
NSMutableArray<NSString*>* functionNames;
|
||||
}
|
||||
|
||||
+ (instancetype)filterWithContext:(MetalContext*)context
|
||||
constantValues:(FBMetalInstanceNormConstantValues*)constantValues
|
||||
width:(NSUInteger)width
|
||||
height:(NSUInteger)height
|
||||
channel:(NSUInteger)channel
|
||||
withPRelu:(BOOL)withPRelu {
|
||||
return [[self alloc] initWithContext:context
|
||||
constantValues:constantValues
|
||||
width:width
|
||||
height:height
|
||||
channel:channel
|
||||
withPRelu:withPRelu];
|
||||
}
|
||||
|
||||
- (instancetype)initWithContext:(MetalContext*)context
|
||||
constantValues:(FBMetalInstanceNormConstantValues*)constantValues
|
||||
width:(NSUInteger)width
|
||||
height:(NSUInteger)height
|
||||
channel:(NSUInteger)channel
|
||||
withPRelu:(BOOL)withPRelu{
|
||||
self.context = context;
|
||||
if (withPRelu) {
|
||||
functionNames = [[NSMutableArray alloc] initWithObjects:kernelNames[0], kernelNames[1], kernelNames[3], nil];
|
||||
} else {
|
||||
functionNames = [[NSMutableArray alloc] initWithObjects:kernelNames[0], kernelNames[1], kernelNames[2], nil];
|
||||
}
|
||||
instanceNorm = [NSMutableArray<FBMetalCNNInstanceNormBase*> arrayWithCapacity:functionNames.count];
|
||||
for (int i = 0; i < functionNames.count; i++) {
|
||||
FBMetalCNNInstanceNormBase* instanceNormChunk = [FBMetalCNNInstanceNormBase filterWithContext:context
|
||||
functionName:functionNames[i]
|
||||
constantValues:constantValues
|
||||
width:width
|
||||
height:height
|
||||
channel:channel];
|
||||
|
||||
instanceNorm[i] = instanceNormChunk;
|
||||
}
|
||||
return self;
|
||||
}
|
||||
|
||||
- (void)loadEpsilon:(const float)epsilon {
|
||||
int length = 1;
|
||||
if (self.epsilonBuffer == nil || [self.epsilonBuffer length] < sizeof(float) * length) {
|
||||
self.epsilonBuffer =
|
||||
[self.context.device newBufferWithLength:sizeof(float) * length options:MTLResourceOptionCPUCacheModeDefault];
|
||||
}
|
||||
if (self.epsilonBuffer) {
|
||||
float* bias_data = (float*)[self.epsilonBuffer contents];
|
||||
bias_data[0] = epsilon;
|
||||
}
|
||||
}
|
||||
|
||||
- (void)applyFilter:(void (^)(NSError*))completionHandler {
|
||||
for (int i = 0; i < functionNames.count; i++) {
|
||||
FBMetalCNNInstanceNormBase* instanceNormChunk = instanceNorm[i];
|
||||
instanceNormChunk.avgBuffer = self.avgBuffer;
|
||||
instanceNormChunk.stdevBuffer = self.stdevBuffer;
|
||||
instanceNormChunk.dataBuffer = self.dataBuffer;
|
||||
instanceNormChunk.outputBuffer = self.outputBuffer;
|
||||
instanceNormChunk.scaleBuffer = self.scaleBuffer;
|
||||
instanceNormChunk.biasBuffer = self.biasBuffer;
|
||||
instanceNormChunk.preluBuffer = self.preluBuffer;
|
||||
instanceNormChunk.epsilonBuffer = self.epsilonBuffer;
|
||||
|
||||
[instanceNorm[i] applyFilter:completionHandler];
|
||||
}
|
||||
}
|
||||
|
||||
@end
|
||||
|
||||
namespace {
|
||||
const char *metalCode = R"Metal(
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#include <metal_stdlib>
|
||||
|
||||
using namespace metal;
|
||||
|
||||
constant constexpr int input_size [[ function_constant(0) ]];
|
||||
constant constexpr ushort input_size_x [[ function_constant(1) ]];
|
||||
constant constexpr ushort input_size_y [[ function_constant(2) ]];
|
||||
constant constexpr ushort channels [[ function_constant(3) ]];
|
||||
constant constexpr ushort prelu_size [[ function_constant(4) ]];
|
||||
|
||||
typedef float data_buffer_type;
|
||||
|
||||
typedef struct {
|
||||
data_buffer_type data[channels * input_size];
|
||||
} input_output_data;
|
||||
|
||||
typedef struct {
|
||||
half data[channels];
|
||||
} stats_data;
|
||||
|
||||
typedef struct {
|
||||
half data[channels];
|
||||
} channel_data;
|
||||
|
||||
typedef struct {
|
||||
float data[1];
|
||||
} epsilon_data;
|
||||
|
||||
kernel void cnn_instance_norm_avg_kern(
|
||||
constant input_output_data &inputBuffer [[ buffer(2) ]],
|
||||
device stats_data &avgBuffer [[ buffer(4) ]],
|
||||
threadgroup float *per_thread_sum [[ threadgroup(0) ]],
|
||||
ushort channel [[ threadgroup_position_in_grid ]],
|
||||
ushort num_threads [[ threads_per_threadgroup ]],
|
||||
ushort tid [[ thread_index_in_threadgroup ]]
|
||||
) {
|
||||
const int chunk_size = (input_size + num_threads - 1) / num_threads;
|
||||
|
||||
constant float* input = &inputBuffer.data[channel * input_size + tid * chunk_size];
|
||||
|
||||
const int max_index = min(chunk_size, input_size - tid * chunk_size);
|
||||
|
||||
float4 sum = 0;
|
||||
int i = 0;
|
||||
for (; i < max_index-16; i+=16) {
|
||||
sum += ((constant float4 *) input)[i/4];
|
||||
sum += ((constant float4 *) input)[i/4+1];
|
||||
sum += ((constant float4 *) input)[i/4+2];
|
||||
sum += ((constant float4 *) input)[i/4+3];
|
||||
}
|
||||
for (; i < max_index-8; i+=8) {
|
||||
sum += ((constant float4 *) input)[i/4];
|
||||
sum += ((constant float4 *) input)[i/4+1];
|
||||
}
|
||||
for (; i < max_index-4; i+=4) {
|
||||
sum += ((constant float4 *) input)[i/4];
|
||||
}
|
||||
for (; i < max_index; i++) {
|
||||
sum[0] += input[i];
|
||||
}
|
||||
per_thread_sum[tid] = sum[0] + sum[1] + sum[2] + sum[3];
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
const int last_bit_size = 32;
|
||||
const int last_bit_section = num_threads/last_bit_size;
|
||||
|
||||
if (tid < last_bit_size) {
|
||||
sum = 0;
|
||||
for (int t = 0; t < last_bit_section/4; t++) {
|
||||
sum += ((threadgroup float4 *) per_thread_sum)[tid * last_bit_section/4 + t];
|
||||
}
|
||||
per_thread_sum[tid] = sum[0] + sum[1] + sum[2] + sum[3];
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (tid == 0) {
|
||||
sum = 0;
|
||||
for (int t = 0; t < last_bit_size/4; t++) {
|
||||
sum += ((threadgroup float4 *) per_thread_sum)[t];
|
||||
}
|
||||
avgBuffer.data[channel] = (sum[0] + sum[1] + sum[2] + sum[3]) / input_size;
|
||||
}
|
||||
}
|
||||
|
||||
kernel void cnn_instance_norm_stdev_kern(
|
||||
constant channel_data &scaleBuffer [[ buffer(0) ]],
|
||||
constant channel_data &biasBuffer [[ buffer(1) ]],
|
||||
constant input_output_data &inputBuffer [[ buffer(2) ]],
|
||||
device stats_data &avgBuffer [[ buffer(4) ]],
|
||||
device stats_data &stdevBuffer [[ buffer(5) ]],
|
||||
constant epsilon_data &epsilon [[ buffer(6) ]],
|
||||
threadgroup float *per_thread_sq_norm [[ threadgroup(0) ]],
|
||||
ushort channel [[ threadgroup_position_in_grid ]],
|
||||
ushort num_threads [[ threads_per_threadgroup ]],
|
||||
ushort tid [[ thread_index_in_threadgroup ]]
|
||||
) {
|
||||
const int chunk_size = (input_size + num_threads - 1) / num_threads;
|
||||
|
||||
constant float* input = &inputBuffer.data[channel * input_size + tid * chunk_size];
|
||||
|
||||
const int max_index = min(chunk_size, input_size - tid * chunk_size);
|
||||
|
||||
float4 sum = 0;
|
||||
float mean = avgBuffer.data[channel];
|
||||
|
||||
int i = 0;
|
||||
for (; i < max_index-16; i+=16) {
|
||||
float4 delta = ((constant float4 *) input)[i/4] - mean;
|
||||
sum += delta * delta;
|
||||
delta = ((constant float4 *) input)[i/4+1] - mean;
|
||||
sum += delta * delta;
|
||||
delta = ((constant float4 *) input)[i/4+2] - mean;
|
||||
sum += delta * delta;
|
||||
delta = ((constant float4 *) input)[i/4+3] - mean;
|
||||
sum += delta * delta;
|
||||
}
|
||||
for (; i < max_index-8; i+=8) {
|
||||
float4 delta = ((constant float4 *) input)[i/4] - mean;
|
||||
sum += delta * delta;
|
||||
delta = ((constant float4 *) input)[i/4+1] - mean;
|
||||
sum += delta * delta;
|
||||
}
|
||||
for (; i < max_index-4; i+=4) {
|
||||
float4 delta = ((constant float4 *) input)[i/4] - mean;
|
||||
sum += delta * delta;
|
||||
}
|
||||
for (; i < max_index; i++) {
|
||||
half delta = input[i] - mean;
|
||||
sum[0] += delta * delta;
|
||||
}
|
||||
per_thread_sq_norm[tid] = sum[0] + sum[1] + sum[2] + sum[3];
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
const int last_bit_size = 32;
|
||||
const int last_bit_section = num_threads/last_bit_size;
|
||||
|
||||
if (tid < last_bit_size) {
|
||||
sum = 0;
|
||||
for (int t = 0; t < last_bit_section/4; t++) {
|
||||
sum += ((threadgroup float4 *) per_thread_sq_norm)[tid * last_bit_section/4 + t];
|
||||
}
|
||||
per_thread_sq_norm[tid] = sum[0] + sum[1] + sum[2] + sum[3];
|
||||
}
|
||||
|
||||
threadgroup_barrier(mem_flags::mem_threadgroup);
|
||||
|
||||
if (tid == 0) {
|
||||
sum = 0;
|
||||
for (int t = 0; t < last_bit_size/4; t++) {
|
||||
sum += ((threadgroup float4 *) per_thread_sq_norm)[t];
|
||||
}
|
||||
|
||||
float inv_stdev = 1.0h / sqrt((sum[0] + sum[1] + sum[2] + sum[3]) / input_size + epsilon.data[0]);
|
||||
float scale = inv_stdev * scaleBuffer.data[channel];
|
||||
float shift = biasBuffer.data[channel] - mean * scale;
|
||||
|
||||
avgBuffer.data[channel] = scale; //scale
|
||||
stdevBuffer.data[channel] = shift; // shift
|
||||
}
|
||||
}
|
||||
|
||||
kernel void cnn_instance_norm_last_kern(
|
||||
constant input_output_data &inputBuffer [[ buffer(2) ]],
|
||||
device input_output_data &outputBuffer [[ buffer(3) ]],
|
||||
constant stats_data &avgBuffer [[ buffer(4) ]], //scale
|
||||
constant stats_data &stdevBuffer [[ buffer(5) ]], //shift
|
||||
uint2 gid [[ thread_position_in_grid ]]) {
|
||||
if (gid.x < input_size_x && gid.y < input_size_y) {
|
||||
for (int c = 0; c < channels; c++) {
|
||||
int idx = c * input_size + gid.y * input_size_x + gid.x;
|
||||
outputBuffer.data[idx] = half(inputBuffer.data[idx]) * avgBuffer.data[c] + stdevBuffer.data[c];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
kernel void cnn_instance_norm_prelu_kern(
|
||||
constant input_output_data &inputBuffer [[ buffer(2) ]],
|
||||
device input_output_data &outputBuffer [[ buffer(3) ]],
|
||||
constant stats_data &avgBuffer [[ buffer(4) ]], //scale
|
||||
constant stats_data &stdevBuffer [[ buffer(5) ]], //shift
|
||||
constant channel_data &preluBuffer [[ buffer(7) ]], //prelu weights
|
||||
uint2 gid [[ thread_position_in_grid ]]) {
|
||||
if (gid.x < input_size_x && gid.y < input_size_y) {
|
||||
for (int c = 0; c < channels; c++) {
|
||||
int idx = c * input_size + gid.y * input_size_x + gid.x;
|
||||
half weight = preluBuffer.data[prelu_size > 1 ? c : 0];
|
||||
half value = half(inputBuffer.data[idx]) * avgBuffer.data[c] + stdevBuffer.data[c];
|
||||
outputBuffer.data[idx] = value > 0 ? value : value * weight;
|
||||
}
|
||||
}
|
||||
}
|
||||
)Metal";
|
||||
}
|
@ -1,14 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#import "arm_neon_support.h"
|
||||
#import "MetalImageFilter.h"
|
||||
|
||||
@interface FBMetalCNNNoOp: MetalImageFilter
|
||||
|
||||
+ (instancetype)filterWithContext:(MetalContext*)context
|
||||
width:(NSUInteger)width
|
||||
height:(NSUInteger)height;
|
||||
|
||||
@end
|
@ -1,54 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#import "FBMetalCNNNoOp.h"
|
||||
#import "MetalShaderUtilities.h"
|
||||
|
||||
namespace {
|
||||
extern const char *metalCode;
|
||||
}
|
||||
|
||||
@implementation FBMetalCNNNoOp
|
||||
|
||||
- (MTLSize)threadsPerThreadgroup {
|
||||
return MTLSizeMake(1, 1, 1);
|
||||
}
|
||||
|
||||
- (MTLSize)threadgroupsPerGrid {
|
||||
return MTLSizeMake(1, 1, 1);
|
||||
}
|
||||
|
||||
+ (instancetype)filterWithContext:(MetalContext*)context
|
||||
width:(NSUInteger)width
|
||||
height:(NSUInteger)height {
|
||||
return [[self alloc] initWithContext:context width:width height:height];
|
||||
}
|
||||
|
||||
- (instancetype)initWithContext:(MetalContext*)context
|
||||
width:(NSUInteger)width
|
||||
height:(NSUInteger)height {
|
||||
if ((self = [super initWithFunctionName:@"cnn_no_op_kern"
|
||||
libraryName:@"NoOp"
|
||||
librarySource:[NSString stringWithCString:metalCode encoding:NSUTF8StringEncoding]
|
||||
context:context
|
||||
constantValues:nil])) {
|
||||
super.outputTextureDescriptor = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA16Float
|
||||
width:width
|
||||
height:height
|
||||
mipmapped:NO];
|
||||
}
|
||||
return self;
|
||||
}
|
||||
@end
|
||||
|
||||
namespace {
|
||||
const char *metalCode = R"Metal(
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#include <metal_stdlib>
|
||||
using namespace metal;
|
||||
|
||||
kernel void cnn_no_op_kern() {
|
||||
return;
|
||||
}
|
||||
)Metal";
|
||||
}
|
@ -1,40 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#import "arm_neon_support.h"
|
||||
#import "MetalImageFilter.h"
|
||||
|
||||
class FBMetalPReluConstantValues : public FBMetalConstantValues {
|
||||
public:
|
||||
ushort input_width;
|
||||
ushort input_height;
|
||||
ushort input_channels;
|
||||
ushort weight_length;
|
||||
|
||||
FBMetalPReluConstantValues(
|
||||
ushort _input_width,
|
||||
ushort _input_height,
|
||||
ushort _input_channels,
|
||||
ushort _weight_length)
|
||||
: input_width(_input_width),
|
||||
input_height(_input_height),
|
||||
input_channels(_input_channels),
|
||||
weight_length(_weight_length) {}
|
||||
|
||||
std::string to_string();
|
||||
};
|
||||
|
||||
@interface FBMetalCNNPRelu : MetalImageFilter
|
||||
|
||||
@property (nonatomic, strong) id<MTLBuffer> dataBuffer;
|
||||
@property (nonatomic, strong) id<MTLBuffer> outputBuffer;
|
||||
@property (nonatomic, strong) id<MTLBuffer> weightBuffer;
|
||||
|
||||
+ (instancetype)filterWithContext:(MetalContext*)context
|
||||
constantValues:(FBMetalPReluConstantValues*)constantValues
|
||||
width:(NSUInteger)width
|
||||
height:(NSUInteger)height
|
||||
channel:(NSUInteger)channel;
|
||||
|
||||
@end
|
@ -1,131 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#import "FBMetalCNNPRelu.h"
|
||||
#import "MetalShaderUtilities.h"
|
||||
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
std::string FBMetalPReluConstantValues::to_string() {
|
||||
std::ostringstream ss;
|
||||
|
||||
ss << ":" <<
|
||||
input_width << ":" <<
|
||||
input_height << ":" <<
|
||||
weight_length;
|
||||
|
||||
return ss.str();
|
||||
}
|
||||
|
||||
@interface FBMetalCNNPRelu() {
|
||||
struct {
|
||||
ushort channel;
|
||||
ushort width;
|
||||
ushort height;
|
||||
} configuration;
|
||||
}
|
||||
@end
|
||||
|
||||
namespace {
|
||||
extern const char *metalCode;
|
||||
}
|
||||
|
||||
@implementation FBMetalCNNPRelu
|
||||
|
||||
static constexpr size_t kThreadGroupSize_x = 4;
|
||||
static constexpr size_t kThreadGroupSize_y = 8;
|
||||
|
||||
+ (instancetype)filterWithContext:(MetalContext*)context
|
||||
constantValues:(FBMetalPReluConstantValues*)constantValues
|
||||
width:(NSUInteger)width
|
||||
height:(NSUInteger)height
|
||||
channel:(NSUInteger)channel {
|
||||
return [[self alloc] initWithContext:context constantValues:constantValues width:width height:height channel:channel];
|
||||
}
|
||||
|
||||
- (instancetype)initWithContext:(MetalContext*)context
|
||||
constantValues:(FBMetalPReluConstantValues*)constantValues
|
||||
width:(NSUInteger)width
|
||||
height:(NSUInteger)height
|
||||
channel:(NSUInteger)channel {
|
||||
if ((self = [super initWithFunctionName:@"cnn_prelu_kern"
|
||||
libraryName:@"PRelu"
|
||||
librarySource:[NSString stringWithCString:metalCode encoding:NSUTF8StringEncoding]
|
||||
context:context
|
||||
constantValues:constantValues])) {
|
||||
configuration.channel = channel;
|
||||
configuration.width = width;
|
||||
configuration.height = height;
|
||||
|
||||
super.outputTextureDescriptor = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA16Float
|
||||
width:width
|
||||
height:height
|
||||
mipmapped:NO];
|
||||
}
|
||||
return self;
|
||||
}
|
||||
|
||||
- (MTLSize) threadsPerThreadgroup {
|
||||
return MTLSizeMake(kThreadGroupSize_x, kThreadGroupSize_y, 1);
|
||||
}
|
||||
|
||||
- (NSString*)replaceConstantValues:(FBMetalConstantValues *)constantValues
|
||||
librarySource:(NSString*)librarySource {
|
||||
FBMetalPReluConstantValues* convolutionConstantValues = (FBMetalPReluConstantValues *) constantValues;
|
||||
std::string source = [librarySource UTF8String];
|
||||
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->input_width, 0);
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->input_height, 1);
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->input_channels, 2);
|
||||
REPLACE_CONSTANT(source, convolutionConstantValues->weight_length, 3);
|
||||
|
||||
return [NSString stringWithUTF8String:source.c_str()];
|
||||
}
|
||||
|
||||
- (void)configureArgumentTableWithCommandEncoder:(id<MTLComputeCommandEncoder>)commandEncoder
|
||||
weightBufferOffset:(NSInteger)weightBufferOffset
|
||||
outputBufferOffset:(NSInteger)outputBufferOffset {
|
||||
[commandEncoder setBuffer:_weightBuffer offset:0 atIndex:0];
|
||||
[commandEncoder setBuffer:_outputBuffer offset:0 atIndex:1];
|
||||
}
|
||||
|
||||
@end
|
||||
|
||||
namespace {
|
||||
const char *metalCode = R"Metal(
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#include <metal_stdlib>
|
||||
|
||||
using namespace metal;
|
||||
|
||||
constant constexpr ushort input_size_x [[ function_constant(0) ]];
|
||||
constant constexpr ushort input_size_y [[ function_constant(1) ]];
|
||||
constant constexpr ushort input_channels [[ function_constant(2) ]];
|
||||
constant constexpr ushort kernels [[ function_constant(3) ]];
|
||||
|
||||
constant constexpr int input_size = input_size_x * input_size_y;
|
||||
|
||||
typedef float data_buffer_type;
|
||||
typedef struct {
|
||||
data_buffer_type data[input_channels][input_size];
|
||||
} output_data;
|
||||
|
||||
typedef struct {
|
||||
half data[kernels];
|
||||
} filter_data;
|
||||
|
||||
kernel void cnn_prelu_kern(
|
||||
constant filter_data &filter [[ buffer(0) ]],
|
||||
device output_data &output [[ buffer(1) ]],
|
||||
uint2 gid [[thread_position_in_grid]]) {
|
||||
if (gid.x < input_size_x && gid.y < input_size_y) {
|
||||
int idx = gid.y * input_size_x + gid.x;
|
||||
for (int c = 0; c < input_channels; c++) {
|
||||
half weight = filter.data[kernels > 1 ? c : 0];
|
||||
data_buffer_type value = output.data[c][idx];
|
||||
output.data[c][idx] = value > 0 ? value : value * weight;
|
||||
}
|
||||
}
|
||||
}
|
||||
)Metal";
|
||||
}
|
@ -1,10 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#import <string>
|
||||
|
||||
class FBMetalConstantValues {
|
||||
public:
|
||||
virtual std::string to_string() = 0;
|
||||
};
|
@ -1,38 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#include "caffe2/operators/filler_op.h"
|
||||
#import "MetalCaffeContext.h"
|
||||
#import "data_conversion.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
template <typename T1, typename T2>
|
||||
class GivenTensorMetalFillOp final : public FillerOp<MetalCaffeContext> {
|
||||
public:
|
||||
GivenTensorMetalFillOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: FillerOp<MetalCaffeContext>(operator_def, ws) {
|
||||
auto source_values = OperatorBase::template GetRepeatedArgument<float>("values");
|
||||
for (float f : source_values) {
|
||||
values_.push_back(f);
|
||||
}
|
||||
}
|
||||
|
||||
bool Fill(TensorMetal* output) override {
|
||||
DCHECK_EQ(output->size(), values_.size()) << "output size: " << output->size() << " given size: " << values_.size();
|
||||
|
||||
id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void*)output->template mutable_data<T1>());
|
||||
T2* output_data = (T2*)[weightBuffer contents];
|
||||
|
||||
CAFFE_ENFORCE(output_data != NULL);
|
||||
memcpycvt(output_data, values_.data(), output->size());
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
vector<float> values_;
|
||||
};
|
||||
|
||||
// uint16_t is used because caffe2 does not support float16_t
|
||||
REGISTER_CPU_OPERATOR_WITH_ENGINE(GivenTensorFloat16MetalFill, METAL, GivenTensorMetalFillOp<uint16_t, float16_t>);
|
||||
OPERATOR_SCHEMA(GivenTensorFloat16MetalFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
|
||||
}
|
@ -1,200 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
|
||||
#ifndef CAFFE2_MOBILE
|
||||
#error "Caffe2 mobile state not defined"
|
||||
#endif
|
||||
|
||||
#if CAFFE2_MOBILE
|
||||
|
||||
#include "caffe2/operators/filler_op.h"
|
||||
#import "data_conversion.h"
|
||||
#import "FBMetalCNNConvolution.h"
|
||||
#import "MetalCaffeContext.h"
|
||||
|
||||
namespace caffe2 {
|
||||
class GivenWeightTensorFillOp final : public FillerOp<MetalCaffeContext> {
|
||||
public:
|
||||
GivenWeightTensorFillOp(const OperatorDef &operator_def, Workspace *ws)
|
||||
: FillerOp<MetalCaffeContext>(operator_def, ws) {
|
||||
auto source_values = OperatorBase::template GetRepeatedArgument<float>("values");
|
||||
for (float f : source_values) {
|
||||
values_.push_back(f);
|
||||
}
|
||||
}
|
||||
|
||||
bool Fill(Tensor<MetalCaffeContext> *output) override { return true; }
|
||||
|
||||
bool RunOnDevice() override {
|
||||
auto *output = Operator<MetalCaffeContext>::Output(0);
|
||||
CAFFE_ENFORCE_EQ(shape_.size(), 4);
|
||||
auto shape = shape_;
|
||||
|
||||
int kernels = shape[0];
|
||||
int kernel_channels = shape[1];
|
||||
int kernel_height = shape[2];
|
||||
int kernel_width = shape[3];
|
||||
|
||||
reformatKernelImage<weight_buffer_type>(
|
||||
(const float *)values_.data(),
|
||||
kernels,
|
||||
kernel_channels,
|
||||
kernel_width,
|
||||
kernel_height,
|
||||
false,
|
||||
[&](int multiplier) -> weight_buffer_type * {
|
||||
// buffer_size = kernels * kernel_channels * kernel_width * kernel_height * multiplier;
|
||||
shape.push_back(multiplier); // multiplier = ceil(aligned_kernel_stride / kernel_stride)
|
||||
output->Resize(shape);
|
||||
|
||||
id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void *)output->template mutable_data<uint16_t>());
|
||||
weight_buffer_type *output_data = (weight_buffer_type *)[weightBuffer contents];
|
||||
return output_data;
|
||||
});
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
vector<float> values_;
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR_WITH_ENGINE(GivenWeightTensorFill, METAL, GivenWeightTensorFillOp);
|
||||
OPERATOR_SCHEMA(GivenWeightTensorFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
|
||||
|
||||
class GivenTransposeWeightTensorFillOp final : public FillerOp<MetalCaffeContext> {
|
||||
public:
|
||||
GivenTransposeWeightTensorFillOp(const OperatorDef &operator_def, Workspace *ws)
|
||||
: FillerOp<MetalCaffeContext>(operator_def, ws) {
|
||||
auto source_values = OperatorBase::template GetRepeatedArgument<float>("values");
|
||||
for (float f : source_values) {
|
||||
values_.push_back(f);
|
||||
}
|
||||
}
|
||||
|
||||
bool Fill(Tensor<MetalCaffeContext> *output) override { return true; }
|
||||
|
||||
bool RunOnDevice() override {
|
||||
auto *output = Operator<MetalCaffeContext>::Output(0);
|
||||
CAFFE_ENFORCE_EQ(shape_.size(), 4);
|
||||
auto shape = shape_;
|
||||
|
||||
int kernels = shape[0];
|
||||
int kernel_channels = shape[1];
|
||||
int kernel_height = shape[2];
|
||||
int kernel_width = shape[3];
|
||||
|
||||
std::swap(kernels, kernel_channels);
|
||||
|
||||
reformatKernelImage<weight_buffer_type>(
|
||||
(const float *)values_.data(),
|
||||
kernels,
|
||||
kernel_channels,
|
||||
kernel_width,
|
||||
kernel_height,
|
||||
true,
|
||||
[&](int multiplier) -> weight_buffer_type * {
|
||||
// buffer_size = kernels * kernel_channels * kernel_width * kernel_height * multiplier;
|
||||
shape_.push_back(multiplier); // multiplier = ceil(aligned_kernel_stride / kernel_stride)
|
||||
output->Resize(shape_);
|
||||
|
||||
id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void *)output->template mutable_data<uint16_t>());
|
||||
weight_buffer_type *output_data = (weight_buffer_type *)[weightBuffer contents];
|
||||
return output_data;
|
||||
});
|
||||
return true;
|
||||
}
|
||||
|
||||
private:
|
||||
vector<float> values_;
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR_WITH_ENGINE(GivenTransposeWeightTensorFill, METAL, GivenTransposeWeightTensorFillOp);
|
||||
OPERATOR_SCHEMA(GivenTransposeWeightTensorFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
|
||||
|
||||
class CopyWeightTensorToMetalGPUOp final : public Operator<MetalCaffeContext> {
|
||||
public:
|
||||
CopyWeightTensorToMetalGPUOp(const OperatorDef &operator_def, Workspace *ws)
|
||||
: Operator<MetalCaffeContext>(operator_def, ws) {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const Blob *blob = Inputs()[0];
|
||||
const TensorCPU &X = blob->Get<TensorCPU>();
|
||||
auto *output = Operator<MetalCaffeContext>::Output(0);
|
||||
auto shape = X.dims();
|
||||
CAFFE_ENFORCE_EQ(shape.size(), 4);
|
||||
|
||||
int kernels = shape[0];
|
||||
int kernel_channels = shape[1];
|
||||
int kernel_height = shape[2];
|
||||
int kernel_width = shape[3];
|
||||
|
||||
reformatKernelImage<weight_buffer_type>(
|
||||
(const float *)X.template data<float>(),
|
||||
kernels,
|
||||
kernel_channels,
|
||||
kernel_width,
|
||||
kernel_height,
|
||||
false,
|
||||
[&](int multiplier) -> weight_buffer_type * {
|
||||
// buffer_size = kernels * kernel_channels * kernel_width * kernel_height * multiplier;
|
||||
shape.push_back(multiplier); // multiplier = ceil(aligned_kernel_stride / kernel_stride)
|
||||
output->Resize(shape);
|
||||
|
||||
id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void *)output->template mutable_data<uint16_t>());
|
||||
weight_buffer_type *output_data = (weight_buffer_type *)[weightBuffer contents];
|
||||
return output_data;
|
||||
});
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR_WITH_ENGINE(CopyWeightTensorToMetalGPU, METAL, CopyWeightTensorToMetalGPUOp);
|
||||
OPERATOR_SCHEMA(CopyWeightTensorToMetalGPU).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
|
||||
|
||||
class CopyTransposeWeightTensorToMetalGPUOp final : public Operator<MetalCaffeContext> {
|
||||
public:
|
||||
CopyTransposeWeightTensorToMetalGPUOp(const OperatorDef &operator_def, Workspace *ws)
|
||||
: Operator<MetalCaffeContext>(operator_def, ws) {}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
const Blob *blob = Inputs()[0];
|
||||
const TensorCPU &X = blob->Get<TensorCPU>();
|
||||
auto *output = Operator<MetalCaffeContext>::Output(0);
|
||||
auto shape_ = X.dims();
|
||||
CAFFE_ENFORCE_EQ(shape_.size(), 4);
|
||||
auto shape = shape_;
|
||||
|
||||
int kernels = shape[0];
|
||||
int kernel_channels = shape[1];
|
||||
int kernel_height = shape[2];
|
||||
int kernel_width = shape[3];
|
||||
|
||||
std::swap(kernels, kernel_channels);
|
||||
|
||||
reformatKernelImage<weight_buffer_type>(
|
||||
(const float *)X.template data<float>(),
|
||||
kernels,
|
||||
kernel_channels,
|
||||
kernel_width,
|
||||
kernel_height,
|
||||
true,
|
||||
[&](int multiplier) -> weight_buffer_type * {
|
||||
// buffer_size = kernels * kernel_channels * kernel_width * kernel_height * multiplier;
|
||||
shape_.push_back(multiplier); // multiplier = ceil(aligned_kernel_stride / kernel_stride)
|
||||
output->Resize(shape_);
|
||||
|
||||
id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void *)output->template mutable_data<uint16_t>());
|
||||
weight_buffer_type *output_data = (weight_buffer_type *)[weightBuffer contents];
|
||||
return output_data;
|
||||
});
|
||||
return true;
|
||||
}
|
||||
};
|
||||
|
||||
REGISTER_CPU_OPERATOR_WITH_ENGINE(CopyTransposeWeightTensorToMetalGPU, METAL, CopyTransposeWeightTensorToMetalGPUOp);
|
||||
OPERATOR_SCHEMA(CopyTransposeWeightTensorToMetalGPU).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_MOBILE
|
@ -1,87 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#include "caffe2/utils/math.h"
|
||||
|
||||
#import <Metal/Metal.h>
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
struct MetalAllocator final : CPUAllocator {
|
||||
id<MTLDevice> device;
|
||||
|
||||
MetalAllocator(id<MTLDevice> _device);
|
||||
|
||||
~MetalAllocator();
|
||||
|
||||
void *New(size_t nbytes) override;
|
||||
|
||||
void Delete(void *data) override;
|
||||
|
||||
id<MTLBuffer> Buffer(void *data);
|
||||
};
|
||||
|
||||
MetalAllocator *GetMetalAllocator();
|
||||
|
||||
class MetalCaffeContext final {
|
||||
public:
|
||||
MetalCaffeContext() : random_seed_(math::randomNumberSeed()) {}
|
||||
explicit MetalCaffeContext(const DeviceOption &option)
|
||||
: random_seed_(option.has_random_seed() ? option.random_seed() : math::randomNumberSeed()) {
|
||||
CHECK_EQ(option.device_type(), CPU);
|
||||
}
|
||||
|
||||
~MetalCaffeContext() {}
|
||||
|
||||
inline void SwitchToDevice(int stream_id) {}
|
||||
inline void SwitchToDevice() {
|
||||
SwitchToDevice(0);
|
||||
}
|
||||
|
||||
inline bool FinishDeviceComputation() { return true; }
|
||||
|
||||
inline std::mt19937 &RandGenerator() {
|
||||
if (!random_generator_.get()) {
|
||||
random_generator_.reset(new std::mt19937(random_seed_));
|
||||
}
|
||||
return *random_generator_.get();
|
||||
}
|
||||
|
||||
static void *New(size_t nbytes);
|
||||
|
||||
static void Delete(void *data);
|
||||
|
||||
// Two copy functions that deals with cross-device copies.
|
||||
template <class SrcContext, class DstContext>
|
||||
inline void CopyBytes(size_t nbytes, const void *src, void *dst);
|
||||
|
||||
template <typename T, class SrcContext, class DstContext>
|
||||
inline void Copy(size_t n, const T *src, T *dst) {
|
||||
if (std::is_fundamental<T>::value) {
|
||||
CopyBytes<SrcContext, DstContext>(n * sizeof(T), static_cast<const void *>(src), static_cast<void *>(dst));
|
||||
} else {
|
||||
for (int i = 0; i < n; ++i) {
|
||||
dst[i] = src[i];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <class SrcContext, class DstContext>
|
||||
inline void CopyItems(const TypeMeta &meta, size_t n, const void *src, void *dst) {
|
||||
if (meta.copy()) {
|
||||
meta.copy()(src, dst, n);
|
||||
} else {
|
||||
CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
|
||||
}
|
||||
}
|
||||
|
||||
protected:
|
||||
int random_seed_{1701};
|
||||
std::unique_ptr<std::mt19937> random_generator_;
|
||||
};
|
||||
|
||||
typedef Tensor<MetalCaffeContext> TensorMetal;
|
||||
}
|
@ -1,59 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#import "MetalCaffeContext.h"
|
||||
#import "MetalContext.h"
|
||||
|
||||
namespace caffe2 {
|
||||
CAFFE_KNOWN_TYPE(TensorMetal);
|
||||
|
||||
static NSMutableDictionary<NSNumber *, id<MTLBuffer>> *buffer_cache = nil;
|
||||
|
||||
// class MetalAllocator
|
||||
MetalAllocator::MetalAllocator(id<MTLDevice> _device) : device(_device) {
|
||||
buffer_cache = [NSMutableDictionary<NSNumber *, id<MTLBuffer>> dictionary];
|
||||
}
|
||||
|
||||
MetalAllocator::~MetalAllocator() {
|
||||
for (id key in buffer_cache) {
|
||||
id<MTLBuffer> buffer = buffer_cache[key];
|
||||
[buffer_cache removeObjectForKey:key];
|
||||
}
|
||||
}
|
||||
|
||||
void *MetalAllocator::New(size_t nbytes) {
|
||||
id<MTLBuffer> buffer = [device newBufferWithLength:nbytes options:MTLResourceCPUCacheModeDefaultCache];
|
||||
void *data = [buffer contents];
|
||||
NSNumber *key = @((unsigned long long)data);
|
||||
buffer_cache[key] = buffer;
|
||||
return data;
|
||||
}
|
||||
|
||||
void MetalAllocator::Delete(void *data) {
|
||||
NSNumber *key = @((unsigned long long)data);
|
||||
id<MTLBuffer> buffer = buffer_cache[key];
|
||||
[buffer_cache removeObjectForKey:key];
|
||||
buffer = nil;
|
||||
}
|
||||
|
||||
id<MTLBuffer> MetalAllocator::Buffer(void *data) {
|
||||
NSNumber *key = @((unsigned long long)data);
|
||||
return buffer_cache[key];
|
||||
}
|
||||
|
||||
// the Metal Allocator
|
||||
static MetalAllocator *MetalAllocatorInstance = NULL;
|
||||
|
||||
// Get the Metal Allocator
|
||||
MetalAllocator *GetMetalAllocator() {
|
||||
if (MetalAllocatorInstance == NULL) {
|
||||
MetalAllocatorInstance = new MetalAllocator([MetalContext getContext].device);
|
||||
}
|
||||
CAFFE_ENFORCE(MetalAllocatorInstance != NULL);
|
||||
return MetalAllocatorInstance;
|
||||
}
|
||||
|
||||
// class MetalCaffeContext
|
||||
void *MetalCaffeContext::New(size_t nbytes) { return MetalAllocatorInstance->New(nbytes); }
|
||||
|
||||
void MetalCaffeContext::Delete(void *data) { MetalAllocatorInstance->Delete(data); }
|
||||
}
|
@ -1,17 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#import <Foundation/Foundation.h>
|
||||
|
||||
@protocol MTLDevice, MTLLibrary, MTLCommandQueue;
|
||||
|
||||
@interface MetalContext : NSObject
|
||||
|
||||
@property (atomic, strong) id<MTLDevice> device;
|
||||
@property (atomic, strong) id<MTLLibrary> library;
|
||||
@property (atomic, strong) id<MTLCommandQueue> commandQueue;
|
||||
|
||||
+ (instancetype)getContext;
|
||||
|
||||
@end
|
@ -1,27 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#import "MetalContext.h"
|
||||
|
||||
#import <Metal/Metal.h>
|
||||
|
||||
static MetalContext* metalContext = NULL;
|
||||
|
||||
@implementation MetalContext
|
||||
|
||||
+ (instancetype)getContext {
|
||||
if (metalContext == NULL) {
|
||||
metalContext = [[self alloc] initWithDevice:nil];
|
||||
}
|
||||
return metalContext;
|
||||
}
|
||||
|
||||
- (instancetype)initWithDevice:(id<MTLDevice>)device {
|
||||
if ((self = [super init])) {
|
||||
_device = device ?: MTLCreateSystemDefaultDevice();
|
||||
_library = [_device newDefaultLibrary];
|
||||
_commandQueue = [_device newCommandQueue];
|
||||
}
|
||||
return self;
|
||||
}
|
||||
|
||||
@end
|
@ -1,163 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
|
||||
#ifndef CAFFE2_MOBILE
|
||||
#error "Caffe2 mobile state not defined"
|
||||
#endif
|
||||
|
||||
#if CAFFE2_MOBILE
|
||||
|
||||
#include "caffe2/operators/conv_pool_op_base.h"
|
||||
#include "caffe2/operators/conv_transpose_unpool_op_base.h"
|
||||
#import "metal_convolution.h"
|
||||
#import "MetalCaffeContext.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
class MetalConvMTLBufferOp final : public ConvPoolOpBase<MetalCaffeContext> {
|
||||
public:
|
||||
MetalConvMTLBufferOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: ConvPoolOpBase<MetalCaffeContext>(operator_def, ws) {
|
||||
OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "Metal only supports NCHW order.");
|
||||
}
|
||||
|
||||
bool RunOnDeviceWithOrderNCHW() override;
|
||||
|
||||
const TensorCPU& InputFromTensorCPU(int idx) {
|
||||
const Blob* blob = Inputs()[idx];
|
||||
return blob->Get<TensorCPU>();
|
||||
}
|
||||
|
||||
// Input: X, W, b
|
||||
// Output: Y
|
||||
INPUT_TAGS(INPUT, FILTER, BIAS);
|
||||
};
|
||||
|
||||
bool MetalConvMTLBufferOp::RunOnDeviceWithOrderNCHW() {
|
||||
const TensorMetal& X = Input(INPUT);
|
||||
auto& filter = Input(FILTER);
|
||||
auto& bias = InputFromTensorCPU(BIAS);
|
||||
TensorMetal* Y = Output(0);
|
||||
|
||||
CAFFE_ENFORCE(X.ndim() == 4, "Input dim should be 4");
|
||||
const int N = X.dim32(0), C = X.dim32(1);
|
||||
CAFFE_ENFORCE(filter.ndim(), 5);
|
||||
const int M = filter.dim32(0);
|
||||
|
||||
CAFFE_ENFORCE(filter.dim32(1) == C, "");
|
||||
CAFFE_ENFORCE(filter.dim32(2) == this->kernel_h_, "");
|
||||
CAFFE_ENFORCE(filter.dim32(3) == this->kernel_w_, "");
|
||||
CAFFE_ENFORCE(bias.ndim() == 1, "");
|
||||
CAFFE_ENFORCE(bias.dim32(0) == M, "");
|
||||
|
||||
ConvPoolOpBase<MetalCaffeContext>::SetOutputSize(X, Y, filter.dim32(0));
|
||||
|
||||
id<MTLBuffer> inputDataBuffer = GetMetalAllocator()->Buffer((void*)X.template data<float>());
|
||||
id<MTLBuffer> outputDataBuffer = GetMetalAllocator()->Buffer((void*)Y->template mutable_data<float>());
|
||||
id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void*)filter.template data<uint16_t>());
|
||||
const float* biasData = bias.template data<float>();
|
||||
|
||||
metal_convolution(
|
||||
inputDataBuffer,
|
||||
X.dim32(1),
|
||||
X.dim32(3),
|
||||
X.dim32(2),
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
pad_b_,
|
||||
pad_r_,
|
||||
weightBuffer,
|
||||
filter.dim32(0),
|
||||
filter.dim32(1),
|
||||
filter.dim32(3),
|
||||
filter.dim32(2),
|
||||
outputDataBuffer,
|
||||
Y->dim32(1),
|
||||
Y->dim32(3),
|
||||
Y->dim32(2),
|
||||
biasData,
|
||||
bias.dim32(0),
|
||||
false);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
REGISTER_CPU_OPERATOR_WITH_ENGINE(MetalConv, METAL, MetalConvMTLBufferOp);
|
||||
OPERATOR_SCHEMA(MetalConv).NumInputs(3).NumOutputs(1);
|
||||
|
||||
class MetalConvTransposeMTLBufferOp final : public ConvTransposeUnpoolBase<MetalCaffeContext> {
|
||||
public:
|
||||
MetalConvTransposeMTLBufferOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: ConvTransposeUnpoolBase<MetalCaffeContext>(operator_def, ws) {
|
||||
OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "Metal only supports NCHW order.");
|
||||
}
|
||||
|
||||
bool RunOnDeviceWithOrderNCHW() override;
|
||||
|
||||
private:
|
||||
// Input: X, W, b
|
||||
// Output: Y
|
||||
INPUT_TAGS(INPUT, FILTER, BIAS);
|
||||
|
||||
const TensorCPU& InputFromTensorCPU(int idx) {
|
||||
const Blob* blob = Inputs()[idx];
|
||||
return blob->Get<TensorCPU>();
|
||||
}
|
||||
};
|
||||
|
||||
bool MetalConvTransposeMTLBufferOp::RunOnDeviceWithOrderNCHW() {
|
||||
const TensorMetal& X = Input(INPUT);
|
||||
auto& filter = Input(FILTER);
|
||||
auto& bias = InputFromTensorCPU(BIAS);
|
||||
TensorMetal* Y = Output(0);
|
||||
|
||||
const int N = X.dim32(0), M = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
|
||||
CAFFE_ENFORCE(filter.ndim() == 5, "filter must be 4D tensor");
|
||||
CAFFE_ENFORCE(filter.dim32(0) == M, "filter number must be equal to input channel number");
|
||||
const int C = filter.dim32(1);
|
||||
CAFFE_ENFORCE(filter.dim32(2) == kernel_h_, "filter height must be equal to kernel height");
|
||||
CAFFE_ENFORCE(filter.dim32(3) == kernel_w_, "filter width must be equal to kernel width");
|
||||
CAFFE_ENFORCE(bias.ndim() == 1, "bias must be 1D tensor");
|
||||
CAFFE_ENFORCE(bias.dim32(0) == C, "bias dimension must be equal to output channel number");
|
||||
|
||||
ConvTransposeUnpoolBase<MetalCaffeContext>::SetOutputSize(X, Y, C);
|
||||
|
||||
id<MTLBuffer> inputDataBuffer = GetMetalAllocator()->Buffer((void*)X.template data<float>());
|
||||
id<MTLBuffer> outputDataBuffer = GetMetalAllocator()->Buffer((void*)Y->template mutable_data<float>());
|
||||
id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void*)filter.template data<uint16_t>());
|
||||
const float* biasData = bias.template data<float>();
|
||||
|
||||
metal_convolution(
|
||||
inputDataBuffer,
|
||||
X.dim32(1),
|
||||
X.dim32(3),
|
||||
X.dim32(2),
|
||||
stride_h_,
|
||||
stride_w_,
|
||||
pad_t_,
|
||||
pad_l_,
|
||||
pad_b_,
|
||||
pad_r_,
|
||||
weightBuffer,
|
||||
filter.dim32(0),
|
||||
filter.dim32(1),
|
||||
filter.dim32(3),
|
||||
filter.dim32(2),
|
||||
outputDataBuffer,
|
||||
Y->dim32(1),
|
||||
Y->dim32(3),
|
||||
Y->dim32(2),
|
||||
biasData,
|
||||
bias.dim32(0),
|
||||
true);
|
||||
return true;
|
||||
}
|
||||
|
||||
REGISTER_CPU_OPERATOR_WITH_ENGINE(MetalConvTranspose, METAL, MetalConvTransposeMTLBufferOp);
|
||||
OPERATOR_SCHEMA(MetalConvTranspose).NumInputs(3).NumOutputs(1);
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_MOBILE
|
@ -1,39 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#import "MetalContext.h"
|
||||
#import "FBMetalConstantValues.h"
|
||||
|
||||
#import <Foundation/Foundation.h>
|
||||
#import <Metal/Metal.h>
|
||||
|
||||
@protocol MTLBuffer, MTLComputeCommandEncoder, MTLComputePipelineState;
|
||||
|
||||
@interface MetalImageFilter : NSObject
|
||||
|
||||
@property (nonatomic, strong) MetalContext* context;
|
||||
@property (nonatomic, strong) id<MTLComputePipelineState> pipeline;
|
||||
|
||||
@property (atomic, strong) MTLTextureDescriptor* outputTextureDescriptor;
|
||||
|
||||
- (instancetype)initWithFunctionName:(NSString*)functionName
|
||||
libraryName:(NSString*)libraryName
|
||||
librarySource:(NSString*)librarySource
|
||||
context:(MetalContext*)context
|
||||
constantValues:(FBMetalConstantValues*)constantValues;
|
||||
|
||||
- (NSString*)replaceConstantValues:(FBMetalConstantValues *)constantValues
|
||||
librarySource:(NSString*)librarySource;
|
||||
|
||||
- (void)configureArgumentTableWithCommandEncoder:(id<MTLComputeCommandEncoder>)commandEncoder
|
||||
weightBufferOffset:(NSInteger)weightBufferOffset
|
||||
outputBufferOffset:(NSInteger)outputBufferOffset;
|
||||
|
||||
- (void)applyFilter:(void (^)(NSError*))completionHandler;
|
||||
|
||||
- (void) applyFilter:(void(^)(NSError*))completionHandler
|
||||
weightBufferOffset:(NSInteger)weightBufferOffset
|
||||
outputBufferOffset:(NSInteger)outputBufferOffset;
|
||||
|
||||
@end
|
@ -1,171 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#import "MetalImageFilter.h"
|
||||
#import "MetalContext.h"
|
||||
|
||||
#import <TargetConditionals.h>
|
||||
#import <Metal/Metal.h>
|
||||
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
@interface MetalImageFilter ()
|
||||
@property (nonatomic, strong) id<MTLFunction> kernelFunction;
|
||||
@end
|
||||
|
||||
@implementation MetalImageFilter {
|
||||
NSString* _functionName;
|
||||
}
|
||||
static constexpr size_t kThreadGroupSize_x = 4;
|
||||
static constexpr size_t kThreadGroupSize_y = 8;
|
||||
static constexpr bool kEnableGPUProfiling = false; // GPU profiling is expensive, have it off by default
|
||||
|
||||
#import "MetalShaderUtilities.h"
|
||||
|
||||
@synthesize outputTextureDescriptor = _outputTextureDescriptor;
|
||||
|
||||
static NSMutableDictionary<NSString*, id<MTLLibrary>>* functionLibraryCache = NULL;
|
||||
|
||||
- (id<MTLFunction>)newFunctionWithName:(NSString*)functionName
|
||||
libraryName:(NSString*)libraryName
|
||||
librarySource:(NSString*)librarySource
|
||||
context:(MetalContext*)context
|
||||
constantValues:(FBMetalConstantValues*)constantValues {
|
||||
NSString* library_version =
|
||||
constantValues ? [libraryName stringByAppendingString:[[NSString alloc]
|
||||
initWithUTF8String:constantValues->to_string().c_str()]]
|
||||
: libraryName;
|
||||
|
||||
if (functionLibraryCache == NULL) {
|
||||
functionLibraryCache = [NSMutableDictionary<NSString*, id<MTLLibrary>> dictionary];
|
||||
}
|
||||
|
||||
id<MTLLibrary> library = functionLibraryCache[library_version];
|
||||
|
||||
if (library == nil) {
|
||||
librarySource = [self replaceConstantValues:constantValues librarySource:librarySource];
|
||||
|
||||
MTLCompileOptions* options = [MTLCompileOptions alloc];
|
||||
options.fastMathEnabled = TRUE;
|
||||
options.languageVersion = MTLLanguageVersion1_0;
|
||||
|
||||
NSError* error = NULL;
|
||||
library = [context.device newLibraryWithSource:librarySource options:options error:&error];
|
||||
|
||||
if (error != nil) {
|
||||
NSString* description = [[NSString alloc] init];
|
||||
LOG(ERROR) << "Problems with library for " << [library_version UTF8String] << " : "
|
||||
<< [[error localizedDescription] UTF8String];
|
||||
}
|
||||
|
||||
functionLibraryCache[library_version] = library;
|
||||
}
|
||||
|
||||
return library != nil ? [library newFunctionWithName:functionName] : nil;
|
||||
}
|
||||
|
||||
- (NSString*)replaceConstantValues:(FBMetalConstantValues*)constantValues librarySource:(NSString*)librarySource {
|
||||
return librarySource;
|
||||
}
|
||||
|
||||
- (instancetype)initWithFunctionName:(NSString*)functionName
|
||||
libraryName:(NSString*)libraryName
|
||||
librarySource:(NSString*)librarySource
|
||||
context:(MetalContext*)context
|
||||
constantValues:(FBMetalConstantValues*)constantValues {
|
||||
if ((self = [super init])) {
|
||||
NSError* error = nil;
|
||||
|
||||
_context = context;
|
||||
_kernelFunction =
|
||||
[self newFunctionWithName:functionName libraryName:libraryName librarySource:librarySource context:context constantValues:constantValues];
|
||||
_functionName = functionName;
|
||||
_pipeline = [context.device newComputePipelineStateWithFunction:_kernelFunction error:&error];
|
||||
|
||||
if (!_pipeline) {
|
||||
LOG(ERROR) << "Error occurred when building compute pipeline for function " << [functionName UTF8String];
|
||||
return nil;
|
||||
}
|
||||
}
|
||||
|
||||
return self;
|
||||
}
|
||||
|
||||
- (void)configureArgumentTableWithCommandEncoder:(id<MTLComputeCommandEncoder>)commandEncoder
|
||||
weightBufferOffset:(NSInteger)weightBufferOffset
|
||||
outputBufferOffset:(NSInteger)outputBufferOffset {
|
||||
}
|
||||
|
||||
- (bool)checkExecution:(id<MTLCommandBuffer>)commandBuffer {
|
||||
NSError* error = [commandBuffer error];
|
||||
|
||||
if (error != nil) {
|
||||
LOG(ERROR) << "Problems with " << [self->_functionName UTF8String] << " : "
|
||||
<< [[error localizedDescription] UTF8String];
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
- (MTLSize)threadsPerThreadgroup {
|
||||
return MTLSizeMake(kThreadGroupSize_x, kThreadGroupSize_y, 1);
|
||||
}
|
||||
|
||||
- (MTLSize)threadgroupsPerGrid {
|
||||
MTLSize threadsPerThreadgroup = [self threadsPerThreadgroup];
|
||||
|
||||
return MTLSizeMake(
|
||||
(self.outputTextureDescriptor.width + threadsPerThreadgroup.width - 1) /
|
||||
threadsPerThreadgroup.width,
|
||||
(self.outputTextureDescriptor.height + threadsPerThreadgroup.height - 1) /
|
||||
threadsPerThreadgroup.height,
|
||||
1);
|
||||
}
|
||||
|
||||
- (void)applyFilter:(void (^)(NSError*))completionHandler {
|
||||
[self applyFilter:completionHandler weightBufferOffset:0 outputBufferOffset:0];
|
||||
}
|
||||
|
||||
- (void)applyFilter:(void (^)(NSError*))completionHandler
|
||||
weightBufferOffset:(NSInteger)weightBufferOffset
|
||||
outputBufferOffset:(NSInteger)outputBufferOffset {
|
||||
MTLTextureDescriptor* textureDescriptor = self.outputTextureDescriptor;
|
||||
|
||||
if (kEnableGPUProfiling) {
|
||||
[self.context.commandQueue insertDebugCaptureBoundary];
|
||||
}
|
||||
|
||||
id<MTLCommandBuffer> commandBuffer = [self.context.commandQueue commandBuffer];
|
||||
|
||||
/*
|
||||
* It is not obvious which grid strategy is best, maximizing the number of threadgroups
|
||||
* seems to give better results, but more investigation is needed
|
||||
*/
|
||||
|
||||
id<MTLComputeCommandEncoder> commandEncoder = [commandBuffer computeCommandEncoder];
|
||||
[commandEncoder setComputePipelineState:self.pipeline];
|
||||
|
||||
[self configureArgumentTableWithCommandEncoder:commandEncoder
|
||||
weightBufferOffset:weightBufferOffset
|
||||
outputBufferOffset:outputBufferOffset];
|
||||
|
||||
[commandEncoder dispatchThreadgroups:[self threadgroupsPerGrid] threadsPerThreadgroup:[self threadsPerThreadgroup]];
|
||||
|
||||
[commandEncoder endEncoding];
|
||||
|
||||
[commandBuffer addCompletedHandler:^(id<MTLCommandBuffer> commandBuffer) {
|
||||
if (completionHandler != NULL) {
|
||||
completionHandler([commandBuffer error]);
|
||||
} else {
|
||||
[self checkExecution:commandBuffer];
|
||||
}
|
||||
}];
|
||||
|
||||
[commandBuffer commit];
|
||||
|
||||
if (kEnableGPUProfiling) {
|
||||
[self.context.commandQueue insertDebugCaptureBoundary];
|
||||
}
|
||||
}
|
||||
|
||||
@end
|
@ -1,134 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/operator.h"
|
||||
#import "MetalCaffeContext.h"
|
||||
#import "metal_instance_norm.h"
|
||||
|
||||
#ifndef CAFFE2_MOBILE
|
||||
#error "Caffe2 mobile state not defined"
|
||||
#endif
|
||||
|
||||
#if CAFFE2_MOBILE
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
class MetalInstanceNormOp final : public Operator<MetalCaffeContext> {
|
||||
public:
|
||||
MetalInstanceNormOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<MetalCaffeContext>(operator_def, ws),
|
||||
epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5)),
|
||||
order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
|
||||
CAFFE_ENFORCE(epsilon_ >= 0, "Must pass a nonnegative epsilon.");
|
||||
OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "Metal only supports NCHW order.");
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
return RunOnDeviceWithOrderNCHW();
|
||||
}
|
||||
|
||||
bool RunOnDeviceWithOrderNCHW();
|
||||
|
||||
protected:
|
||||
float epsilon_;
|
||||
StorageOrder order_;
|
||||
INPUT_TAGS(INPUT, SCALE, BIAS);
|
||||
OUTPUT_TAGS(OUTPUT);
|
||||
};
|
||||
|
||||
bool MetalInstanceNormOp::RunOnDeviceWithOrderNCHW() {
|
||||
const auto& X = Input(INPUT);
|
||||
const auto& scale = Input(SCALE);
|
||||
const auto& bias = Input(BIAS);
|
||||
auto* Y = Output(OUTPUT);
|
||||
|
||||
const int C = X.dim32(1);
|
||||
const int H = X.dim32(2);
|
||||
const int W = X.dim32(3);
|
||||
Y->ResizeLike(X);
|
||||
|
||||
const auto* Xdata = X.template data<float>();
|
||||
const auto* Sdata = scale.template data<uint16_t>();
|
||||
const auto* Bdata = bias.template data<uint16_t>();
|
||||
auto* Ydata = Y->template mutable_data<float>();
|
||||
|
||||
id<MTLBuffer> inputDataBuffer = GetMetalAllocator()->Buffer((void*)Xdata);
|
||||
id<MTLBuffer> scaleDataBuffer = GetMetalAllocator()->Buffer((void*)Sdata);
|
||||
id<MTLBuffer> biasDataBuffer = GetMetalAllocator()->Buffer((void*)Bdata);
|
||||
id<MTLBuffer> outputDataBuffer = GetMetalAllocator()->Buffer((void*)Ydata);
|
||||
|
||||
metal_instance_norm(inputDataBuffer, C, H, W, scaleDataBuffer, biasDataBuffer, outputDataBuffer, nil, 0, epsilon_);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
REGISTER_CPU_OPERATOR_WITH_ENGINE(MetalInstanceNorm, METAL, MetalInstanceNormOp);
|
||||
OPERATOR_SCHEMA(MetalInstanceNorm).NumInputs(3, 4).NumOutputs(1, 3).AllowInplace({{0, 0}});
|
||||
|
||||
class MetalInstanceNormPReluOp final : public Operator<MetalCaffeContext> {
|
||||
public:
|
||||
MetalInstanceNormPReluOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<MetalCaffeContext>(operator_def, ws),
|
||||
epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5)),
|
||||
order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
|
||||
CAFFE_ENFORCE(epsilon_ >= 0, "Must pass a nonnegative epsilon.");
|
||||
OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "Metal only supports NCHW order.");
|
||||
}
|
||||
|
||||
bool RunOnDevice() override {
|
||||
return RunOnDeviceWithOrderNCHW();
|
||||
}
|
||||
|
||||
bool RunOnDeviceWithOrderNCHW();
|
||||
|
||||
protected:
|
||||
float epsilon_;
|
||||
StorageOrder order_;
|
||||
INPUT_TAGS(INPUT, SCALE, BIAS, PRELU);
|
||||
OUTPUT_TAGS(OUTPUT);
|
||||
};
|
||||
|
||||
bool MetalInstanceNormPReluOp::RunOnDeviceWithOrderNCHW() {
|
||||
const auto& X = Input(INPUT);
|
||||
const auto& scale = Input(SCALE);
|
||||
const auto& bias = Input(BIAS);
|
||||
const auto& prelu = Input(PRELU);
|
||||
auto* Y = Output(OUTPUT);
|
||||
|
||||
const int C = X.dim32(1);
|
||||
const int H = X.dim32(2);
|
||||
const int W = X.dim32(3);
|
||||
Y->ResizeLike(X);
|
||||
|
||||
const auto* Xdata = X.template data<float>();
|
||||
const auto* Sdata = scale.template data<uint16_t>();
|
||||
const auto* Bdata = bias.template data<uint16_t>();
|
||||
const auto* Pdata = prelu.template data<uint16_t>();
|
||||
auto* Ydata = Y->template mutable_data<float>();
|
||||
|
||||
id<MTLBuffer> inputDataBuffer = GetMetalAllocator()->Buffer((void*)Xdata);
|
||||
id<MTLBuffer> scaleDataBuffer = GetMetalAllocator()->Buffer((void*)Sdata);
|
||||
id<MTLBuffer> biasDataBuffer = GetMetalAllocator()->Buffer((void*)Bdata);
|
||||
id<MTLBuffer> preluDataBuffer = GetMetalAllocator()->Buffer((void*)Pdata);
|
||||
id<MTLBuffer> outputDataBuffer = GetMetalAllocator()->Buffer((void*)Ydata);
|
||||
|
||||
metal_instance_norm(
|
||||
inputDataBuffer,
|
||||
C,
|
||||
H,
|
||||
W,
|
||||
scaleDataBuffer,
|
||||
biasDataBuffer,
|
||||
outputDataBuffer,
|
||||
preluDataBuffer,
|
||||
prelu.size(),
|
||||
epsilon_);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
REGISTER_CPU_OPERATOR_WITH_ENGINE(MetalInstanceNormPRelu, METAL, MetalInstanceNormPReluOp);
|
||||
OPERATOR_SCHEMA(MetalInstanceNormPRelu).NumInputs(3, 4).NumOutputs(1, 3).AllowInplace({{0, 0}});
|
||||
} // namespace caffe2
|
||||
#endif // CAFFE2_MOBILE
|
@ -1,52 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#include "caffe2/core/operator.h"
|
||||
#import "MetalCaffeContext.h"
|
||||
#import "metal_prelu.h"
|
||||
|
||||
#ifndef CAFFE2_MOBILE
|
||||
#error "Caffe2 mobile state not defined"
|
||||
#endif
|
||||
|
||||
#if CAFFE2_MOBILE
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
class MetalPReluOp final : public Operator<MetalCaffeContext> {
|
||||
public:
|
||||
MetalPReluOp(const OperatorDef& operator_def, Workspace* ws)
|
||||
: Operator<MetalCaffeContext>(operator_def, ws),
|
||||
order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
|
||||
OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "Metal only supports NCHW order.");
|
||||
}
|
||||
|
||||
bool RunOnDevice() override;
|
||||
|
||||
protected:
|
||||
StorageOrder order_;
|
||||
};
|
||||
|
||||
bool MetalPReluOp::RunOnDevice() {
|
||||
const auto& X = Input(0);
|
||||
const auto& W = Input(1);
|
||||
auto* Y = Output(0);
|
||||
|
||||
Y->ResizeLike(X);
|
||||
|
||||
const auto* Xdata = X.template data<float>();
|
||||
const auto* Wdata = W.template data<uint16_t>();
|
||||
auto* Ydata = Y->template mutable_data<float>();
|
||||
|
||||
id<MTLBuffer> inputDataBuffer = GetMetalAllocator()->Buffer((void*)Xdata);
|
||||
id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void*)Wdata);
|
||||
id<MTLBuffer> outputDataBuffer = GetMetalAllocator()->Buffer((void*)Ydata);
|
||||
|
||||
metal_prelu(inputDataBuffer, X.dim32(1), X.dim32(3), X.dim32(2), weightBuffer, W.size(), outputDataBuffer);
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
REGISTER_CPU_OPERATOR_WITH_ENGINE(MetalPRelu, METAL, MetalPReluOp);
|
||||
OPERATOR_SCHEMA(MetalPRelu).NumInputs(2).NumOutputs(1).AllowInplace({{0, 0}}).IdenticalTypeAndShape();
|
||||
} // namespace caffe2
|
||||
#endif // CAFFE2_MOBILE
|
@ -1,11 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#import <string>
|
||||
#import <sstream>
|
||||
#import <functional>
|
||||
|
||||
std::string replace_first(std::string input, std::function<void(std::stringstream &fs)> fmt, int index);
|
||||
|
||||
#define REPLACE_CONSTANT(src, val, idx) { src = replace_first(src, [&](std::stringstream &fmt) { fmt << val; }, idx); }
|
@ -1,20 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#import "MetalShaderUtilities.h"
|
||||
|
||||
std::string replace_first(std::string input, std::function<void(std::stringstream &fs)> fmt, int index) {
|
||||
std::stringstream value_format_stream;
|
||||
value_format_stream << "= ";
|
||||
fmt(value_format_stream);
|
||||
|
||||
std::stringstream token_format_stream;
|
||||
token_format_stream << "[[ function_constant(" << index << ") ]]";
|
||||
|
||||
size_t position = input.find(token_format_stream.str());
|
||||
if (position != std::string::npos) {
|
||||
size_t length = token_format_stream.str().length();
|
||||
return input.replace(position, length, value_format_stream.str());
|
||||
} else {
|
||||
return input;
|
||||
}
|
||||
}
|
@ -1,11 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#ifdef __ARM_NEON__
|
||||
#import <arm_neon.h>
|
||||
#else
|
||||
typedef unsigned short uint16_t;
|
||||
typedef uint16_t float16_t;
|
||||
typedef float float32_t;
|
||||
#endif
|
@ -1,67 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#import <stdlib.h>
|
||||
#import <functional>
|
||||
|
||||
#import "arm_neon_support.h"
|
||||
#import <dispatch/dispatch.h>
|
||||
|
||||
template <typename T1, typename T2, void process_data(T1*, const T2*, size_t), int P>
|
||||
void parallelize(T1* dst, const T2* src, size_t items, dispatch_queue_t queue) {
|
||||
process_data(dst, src, items);
|
||||
dispatch_apply(P, queue, ^(size_t it) {
|
||||
const size_t chunk_size = it == P - 1 ? items - (P - 1) * (items / P) : items / P;
|
||||
const size_t offset = it * chunk_size;
|
||||
|
||||
process_data(dst + offset, src + offset, chunk_size);
|
||||
});
|
||||
}
|
||||
|
||||
template <typename filter_type>
|
||||
filter_type* reformatKernelImage(
|
||||
const float* input_data,
|
||||
int kernels,
|
||||
int input_kernels,
|
||||
int kernel_offset,
|
||||
int kernel_stride,
|
||||
int channels,
|
||||
int width,
|
||||
int height,
|
||||
bool transposed,
|
||||
std::function<filter_type*(size_t)> allocator);
|
||||
|
||||
template <typename filter_type>
|
||||
bool reformatKernelImage(
|
||||
const float* input_data,
|
||||
filter_type* output_data,
|
||||
int kernels,
|
||||
int input_kernels,
|
||||
int kernel_offset,
|
||||
int kernel_stride,
|
||||
int channels,
|
||||
int width,
|
||||
int height,
|
||||
bool transposed);
|
||||
|
||||
template <typename filter_type>
|
||||
bool reformatKernelImage(
|
||||
const float* input_data,
|
||||
int kernels,
|
||||
int kernel_channels,
|
||||
int kernel_width,
|
||||
int kernel_height,
|
||||
bool transposed,
|
||||
std::function<filter_type*(int)> allocator);
|
||||
|
||||
template <typename out_buffer_type>
|
||||
out_buffer_type* reformatInputImage(
|
||||
const float* data,
|
||||
int channels,
|
||||
int width,
|
||||
int height,
|
||||
std::function<out_buffer_type*(size_t)> allocator);
|
||||
|
||||
template <typename T1, typename T2>
|
||||
void memcpycvt(T1* dst, const T2* src, size_t n);
|
@ -1,265 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#import "data_conversion.h"
|
||||
#import "metal_convolution.h"
|
||||
|
||||
#import <stdio.h>
|
||||
#import <string.h>
|
||||
|
||||
void memcvt_F32_F16(float32_t* dst, const float16_t* src, size_t n) {
|
||||
int i = 0;
|
||||
#if defined(__ARM_NEON__)
|
||||
for (; i < 4 * (n / 4); i += 4) {
|
||||
*((float32x4_t*)&dst[i]) = vcvt_f32_f16(*((float16x4_t*)&src[i]));
|
||||
}
|
||||
#endif
|
||||
for (; i < n; i++) {
|
||||
dst[i] = src[i];
|
||||
}
|
||||
}
|
||||
|
||||
void memcvt_F16_F32(float16_t* dst, const float32_t* src, size_t n) {
|
||||
int i = 0;
|
||||
#if defined(__ARM_NEON__)
|
||||
for (; i < 4 * (n / 4); i += 4) {
|
||||
*((float16x4_t*)&dst[i]) = vcvt_f16_f32(*((float32x4_t*)&src[i]));
|
||||
}
|
||||
#endif
|
||||
for (; i < n; i++) {
|
||||
dst[i] = src[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename T1, typename T2>
|
||||
void memcpycvt(T1* dst, const T2* src, size_t n);
|
||||
|
||||
template <>
|
||||
void memcpycvt(float* dst, const float* src, size_t n) {
|
||||
memcpy(dst, src, n * sizeof(float));
|
||||
}
|
||||
|
||||
template <>
|
||||
void memcpycvt(float16_t* dst, const float* src, size_t n) {
|
||||
parallelize<float16_t, float, memcvt_F16_F32, 2>(
|
||||
dst, src, n, dispatch_get_global_queue(QOS_CLASS_USER_INTERACTIVE, 0));
|
||||
}
|
||||
|
||||
template <>
|
||||
void memcpycvt(float* dst, const float16_t* src, size_t n) {
|
||||
parallelize<float, float16_t, memcvt_F32_F16, 2>(
|
||||
dst, src, n, dispatch_get_global_queue(QOS_CLASS_USER_INTERACTIVE, 0));
|
||||
}
|
||||
|
||||
template <typename filter_type>
|
||||
bool reformatKernelImage(
|
||||
const float* input_data,
|
||||
filter_type* output_data,
|
||||
int kernels,
|
||||
int input_kernels,
|
||||
int kernel_offset,
|
||||
int kernel_stride,
|
||||
int channels,
|
||||
int width,
|
||||
int height,
|
||||
bool transposed) {
|
||||
const int aligned_kernel_stride = kernel_stride <= 2 ? kernel_stride : 4 * ((kernel_stride + 3) / 4);
|
||||
|
||||
if (output_data) {
|
||||
filter_type* buffer = output_data;
|
||||
|
||||
for (int ks = 0; ks < kernels / kernel_stride; ks++) {
|
||||
for (int kb = 0; kb < kernel_stride; kb++) {
|
||||
int k = kernel_offset + kernel_stride * ks + kb;
|
||||
|
||||
for (int c = 0; c < channels; c++) {
|
||||
for (int y = 0; y < height; y++) {
|
||||
for (int x = 0; x < width; x++) {
|
||||
buffer[aligned_kernel_stride * (width * height * (ks * channels + c) + y * width + x) + kb] =
|
||||
transposed
|
||||
? input_data
|
||||
[(c * input_kernels + k) * width * height + (height - 1 - y) * height + (width - 1 - x)]
|
||||
: input_data[(k * channels + c) * width * height + y * width + x];
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
template bool reformatKernelImage<float16_t>(
|
||||
const float* input_data,
|
||||
float16_t* output_data,
|
||||
int kernels,
|
||||
int input_kernels,
|
||||
int kernel_offset,
|
||||
int kernel_stride,
|
||||
int channels,
|
||||
int width,
|
||||
int height,
|
||||
bool transposed);
|
||||
|
||||
template bool reformatKernelImage<float>(
|
||||
const float* input_data,
|
||||
float* output_data,
|
||||
int kernels,
|
||||
int input_kernels,
|
||||
int kernel_offset,
|
||||
int kernel_stride,
|
||||
int channels,
|
||||
int width,
|
||||
int height,
|
||||
bool transposed);
|
||||
|
||||
template <typename filter_type>
|
||||
filter_type* reformatKernelImage(
|
||||
const float* input_data,
|
||||
int kernels,
|
||||
int input_kernels,
|
||||
int kernel_offset,
|
||||
int kernel_stride,
|
||||
int channels,
|
||||
int width,
|
||||
int height,
|
||||
bool transposed,
|
||||
std::function<filter_type*(size_t)> allocator) {
|
||||
const int aligned_kernel_stride = kernel_stride <= 2 ? kernel_stride : 4 * ((kernel_stride + 3) / 4);
|
||||
|
||||
const int buffer_size = aligned_kernel_stride * (kernels / kernel_stride) * channels * width * height;
|
||||
|
||||
filter_type* output_data = allocator(sizeof(filter_type) * buffer_size);
|
||||
|
||||
reformatKernelImage(
|
||||
input_data,
|
||||
output_data,
|
||||
kernels,
|
||||
input_kernels,
|
||||
kernel_offset,
|
||||
kernel_stride,
|
||||
channels,
|
||||
width,
|
||||
height,
|
||||
transposed);
|
||||
|
||||
return output_data;
|
||||
}
|
||||
template float16_t* reformatKernelImage<float16_t>(
|
||||
const float* input_data,
|
||||
int kernels,
|
||||
int input_kernels,
|
||||
int kernel_offset,
|
||||
int kernel_stride,
|
||||
int channels,
|
||||
int width,
|
||||
int height,
|
||||
bool transposed,
|
||||
std::function<float16_t*(size_t)> allocator);
|
||||
|
||||
template float* reformatKernelImage<float>(
|
||||
const float* input_data,
|
||||
int kernels,
|
||||
int input_kernels,
|
||||
int kernel_offset,
|
||||
int kernel_stride,
|
||||
int channels,
|
||||
int width,
|
||||
int height,
|
||||
bool transposed,
|
||||
std::function<float*(size_t)> allocator);
|
||||
|
||||
template <typename filter_type>
|
||||
bool reformatKernelImage(
|
||||
const float* input_data,
|
||||
int kernels,
|
||||
int kernel_channels,
|
||||
int kernel_width,
|
||||
int kernel_height,
|
||||
bool transposed,
|
||||
std::function<filter_type*(int)> allocator) {
|
||||
int kernels_per_convolution = kernels;
|
||||
|
||||
if (!calculate_kernels_per_convolution(kernels_per_convolution)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
const int convolutions = kernels / kernels_per_convolution;
|
||||
|
||||
const int kernel_stride = kernels_per_convolution;
|
||||
|
||||
const int aligned_kernel_stride = kernel_stride <= 2 ? kernel_stride : 4 * ((kernel_stride + 3) / 4);
|
||||
|
||||
const int chunk_size = aligned_kernel_stride * (kernels_per_convolution / kernel_stride) * kernel_channels *
|
||||
kernel_width * kernel_height;
|
||||
|
||||
// This will allocate more memory than needed, but I couldn't find a better solution
|
||||
filter_type* output_data = allocator((aligned_kernel_stride + kernel_stride - 1) / kernel_stride);
|
||||
|
||||
for (int c = 0; c < convolutions; c++) {
|
||||
if (!reformatKernelImage(
|
||||
input_data,
|
||||
output_data + c * chunk_size,
|
||||
kernels_per_convolution,
|
||||
kernels,
|
||||
c * kernels_per_convolution,
|
||||
kernel_stride,
|
||||
kernel_channels,
|
||||
kernel_width,
|
||||
kernel_height,
|
||||
transposed))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
template bool reformatKernelImage<float16_t>(
|
||||
const float* input_data,
|
||||
int kernels,
|
||||
int kernel_channels,
|
||||
int kernel_width,
|
||||
int kernel_height,
|
||||
bool transposed,
|
||||
std::function<float16_t*(int)> allocator);
|
||||
|
||||
template bool reformatKernelImage<float>(
|
||||
const float* input_data,
|
||||
int kernels,
|
||||
int kernel_channels,
|
||||
int kernel_width,
|
||||
int kernel_height,
|
||||
bool transposed,
|
||||
std::function<float*(int)> allocator);
|
||||
|
||||
template <typename out_buffer_type>
|
||||
out_buffer_type* reformatInputImage(
|
||||
const float* data,
|
||||
int channels,
|
||||
int width,
|
||||
int height,
|
||||
std::function<out_buffer_type*(size_t)> allocator) {
|
||||
const int buffer_size = channels * width * height;
|
||||
|
||||
out_buffer_type* output_data = allocator(sizeof(out_buffer_type) * buffer_size);
|
||||
|
||||
if (output_data) {
|
||||
memcpycvt(output_data, data, buffer_size);
|
||||
}
|
||||
|
||||
return output_data;
|
||||
}
|
||||
|
||||
template float16_t* reformatInputImage<float16_t>(
|
||||
const float* data,
|
||||
int channels,
|
||||
int width,
|
||||
int height,
|
||||
std::function<float16_t*(size_t)> allocator);
|
||||
|
||||
template float* reformatInputImage<float>(
|
||||
const float* data,
|
||||
int channels,
|
||||
int width,
|
||||
int height,
|
||||
std::function<float*(size_t)> allocator);
|
@ -1,31 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#import <Metal/Metal.h>
|
||||
|
||||
bool calculate_kernels_per_convolution(int& kernels_per_convolution);
|
||||
|
||||
bool metal_convolution(
|
||||
id<MTLBuffer> inputBuffer,
|
||||
int input_channels,
|
||||
int input_width,
|
||||
int input_height,
|
||||
int input_stride_x,
|
||||
int input_stride_y,
|
||||
int input_pad_t,
|
||||
int input_pad_l,
|
||||
int input_pad_b,
|
||||
int input_pad_r,
|
||||
id<MTLBuffer> weightBuffer,
|
||||
int output_channels,
|
||||
int kernel_channels,
|
||||
int kernel_width,
|
||||
int kernel_height,
|
||||
id<MTLBuffer> outputBuffer,
|
||||
int output_number,
|
||||
int output_width,
|
||||
int output_height,
|
||||
const float* bias,
|
||||
int bias_length,
|
||||
bool transposed);
|
@ -1,140 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#import "arm_neon_support.h"
|
||||
#import "FBMetalCNNConvolution.h"
|
||||
#import "metal_convolution.h"
|
||||
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
@interface MetalConvolutionCacheEntry : NSObject
|
||||
@property (nonatomic, strong) NSMutableArray<FBMetalCNNConvolution*>* convolutions;
|
||||
@end
|
||||
@implementation MetalConvolutionCacheEntry
|
||||
@end
|
||||
|
||||
static MetalContext* metalContext = NULL;
|
||||
static NSMutableDictionary<NSString*, MetalConvolutionCacheEntry*>* convolutionCache = NULL;
|
||||
|
||||
static void init_metal_pipeline() {
|
||||
if (metalContext == NULL) {
|
||||
metalContext = [MetalContext getContext];
|
||||
convolutionCache = [NSMutableDictionary<NSString*, MetalConvolutionCacheEntry*> dictionary];
|
||||
}
|
||||
}
|
||||
|
||||
bool calculate_kernels_per_convolution(int& kernels_per_convolution) {
|
||||
while (kernels_per_convolution > MAX_KERNELS_PER_CONVOLUTION) {
|
||||
if (kernels_per_convolution % 3 == 0)
|
||||
kernels_per_convolution /= 3;
|
||||
else if (kernels_per_convolution % 2 == 0)
|
||||
kernels_per_convolution /= 2;
|
||||
else {
|
||||
LOG(ERROR) << "The number of output channels must be a multiple of 2 or 3\n";
|
||||
return false;
|
||||
}
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
bool metal_convolution(
|
||||
id<MTLBuffer> inputBuffer,
|
||||
int input_channels,
|
||||
int input_width,
|
||||
int input_height,
|
||||
int input_stride_x,
|
||||
int input_stride_y,
|
||||
int input_pad_t,
|
||||
int input_pad_l,
|
||||
int input_pad_b,
|
||||
int input_pad_r,
|
||||
id<MTLBuffer> weightBuffer,
|
||||
int output_channels,
|
||||
int kernel_channels,
|
||||
int kernel_width,
|
||||
int kernel_height,
|
||||
id<MTLBuffer> outputBuffer,
|
||||
int output_number,
|
||||
int output_width,
|
||||
int output_height,
|
||||
const float* bias,
|
||||
int bias_length,
|
||||
bool transposed) {
|
||||
init_metal_pipeline();
|
||||
|
||||
if (transposed) {
|
||||
int t = output_channels;
|
||||
output_channels = kernel_channels;
|
||||
kernel_channels = t;
|
||||
}
|
||||
|
||||
int output_batch_size = output_channels;
|
||||
|
||||
if (!calculate_kernels_per_convolution(output_batch_size)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
FBMetalCNNConstantValues constantValues = FBMetalCNNConstantValues(
|
||||
input_width,
|
||||
input_height,
|
||||
input_channels,
|
||||
input_stride_x,
|
||||
input_stride_y,
|
||||
input_pad_t,
|
||||
input_pad_l,
|
||||
input_pad_b,
|
||||
input_pad_r,
|
||||
kernel_width,
|
||||
kernel_height,
|
||||
output_width,
|
||||
output_height,
|
||||
output_batch_size,
|
||||
transposed);
|
||||
|
||||
NSString* key = [NSString stringWithUTF8String:constantValues.to_string().c_str()];
|
||||
|
||||
const int batches = output_channels / output_batch_size;
|
||||
|
||||
MetalConvolutionCacheEntry* cc = convolutionCache[key];
|
||||
|
||||
if (cc == NULL) {
|
||||
// printf("metal_convolution_mtlbuffer: %s\n", [key UTF8String]);
|
||||
convolutionCache[key] = cc = [[MetalConvolutionCacheEntry alloc] init];
|
||||
cc.convolutions = [NSMutableArray<FBMetalCNNConvolution*> arrayWithCapacity:batches];
|
||||
|
||||
|
||||
for (int c = 0; c < batches; c++) {
|
||||
FBMetalCNNConvolution* convolution = [FBMetalCNNConvolution filterWithContext:metalContext
|
||||
channels:input_channels
|
||||
kernel_size:kernel_height
|
||||
constantValues:&constantValues
|
||||
width:output_width
|
||||
height:output_height
|
||||
stride_x:input_stride_x
|
||||
stride_y:input_stride_y];
|
||||
cc.convolutions[c] = convolution;
|
||||
}
|
||||
}
|
||||
|
||||
for (int c = 0; c < batches; c++) {
|
||||
FBMetalCNNConvolution* convolution = cc.convolutions[c];
|
||||
|
||||
[convolution loadBiasData:&bias[output_batch_size * c] length:(NSUInteger)output_batch_size];
|
||||
|
||||
convolution.dataBuffer = inputBuffer;
|
||||
convolution.outputBuffer = outputBuffer;
|
||||
convolution.weightBuffer = weightBuffer;
|
||||
|
||||
const int aligned_kernel_stride = output_batch_size <= 2 ? output_batch_size : 4 * ((output_batch_size + 3) / 4);
|
||||
const int buffer_size = aligned_kernel_stride * (output_batch_size / output_batch_size) * kernel_channels *
|
||||
kernel_width * kernel_height;
|
||||
|
||||
int weightBuffer_offset = c * sizeof(weight_buffer_type) * buffer_size;
|
||||
int outputBuffer_offset = c * sizeof(data_buffer_type) * output_batch_size * output_width * output_height;
|
||||
|
||||
[convolution applyFilter:(void (^)(NSError* error)) nullptr
|
||||
weightBufferOffset:weightBuffer_offset
|
||||
outputBufferOffset:outputBuffer_offset];
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
@ -1,17 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
#import <Metal/Metal.h>
|
||||
|
||||
bool metal_instance_norm(
|
||||
id<MTLBuffer> inputBuffer,
|
||||
int input_channels,
|
||||
int input_width,
|
||||
int input_height,
|
||||
id<MTLBuffer> scaleDataBuffer,
|
||||
id<MTLBuffer> biasDataBuffer,
|
||||
id<MTLBuffer> outputBuffer,
|
||||
id<MTLBuffer> preluBuffer,
|
||||
int prelu_length,
|
||||
float epsilon_);
|
@ -1,77 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#import "arm_neon_support.h"
|
||||
#import "MetalContext.h"
|
||||
|
||||
#import "FBMetalCNNInstanceNorm.h"
|
||||
#import <Metal/Metal.h>
|
||||
|
||||
@interface InstanceNormCacheEntry : NSObject
|
||||
@property (nonatomic, strong) FBMetalCNNInstanceNorm* instanceNorm;
|
||||
@end
|
||||
@implementation InstanceNormCacheEntry
|
||||
@end
|
||||
|
||||
static MetalContext* metalContext = NULL;
|
||||
static NSMutableDictionary<NSString*, InstanceNormCacheEntry*>* instanceNormCache = NULL;
|
||||
|
||||
static void init_metal_pipeline() {
|
||||
if (metalContext == NULL) {
|
||||
metalContext = [MetalContext getContext];
|
||||
instanceNormCache = [NSMutableDictionary<NSString*, InstanceNormCacheEntry*> dictionary];
|
||||
}
|
||||
}
|
||||
|
||||
bool metal_instance_norm(
|
||||
id<MTLBuffer> inputBuffer,
|
||||
int input_channels,
|
||||
int input_width,
|
||||
int input_height,
|
||||
id<MTLBuffer> scaleDataBuffer,
|
||||
id<MTLBuffer> biasDataBuffer,
|
||||
id<MTLBuffer> outputBuffer,
|
||||
id<MTLBuffer> preluBuffer,
|
||||
int prelu_size,
|
||||
float epsilon_) {
|
||||
init_metal_pipeline();
|
||||
|
||||
NSString* key = [NSString stringWithFormat:@"X:%d:%d:%d-P:%d", input_channels, input_width, input_height, prelu_size];
|
||||
|
||||
InstanceNormCacheEntry* cc = instanceNormCache[key];
|
||||
|
||||
if (cc == NULL) {
|
||||
cc = [[InstanceNormCacheEntry alloc] init];
|
||||
instanceNormCache[key] = cc;
|
||||
|
||||
FBMetalInstanceNormConstantValues constantValues =
|
||||
FBMetalInstanceNormConstantValues(input_width, input_height, input_channels, prelu_size);
|
||||
|
||||
id<MTLBuffer> avgBuffer =
|
||||
[metalContext.device newBufferWithLength:sizeof(float) * input_channels options:MTLStorageModeShared];
|
||||
id<MTLBuffer> stdevBuffer =
|
||||
[metalContext.device newBufferWithLength:sizeof(float) * input_channels options:MTLStorageModeShared];
|
||||
|
||||
FBMetalCNNInstanceNorm* instanceNorm = [FBMetalCNNInstanceNorm filterWithContext:metalContext
|
||||
constantValues:&constantValues
|
||||
width:input_width
|
||||
height:input_height
|
||||
channel:input_channels
|
||||
withPRelu:preluBuffer != nil];
|
||||
|
||||
instanceNorm.avgBuffer = avgBuffer;
|
||||
instanceNorm.stdevBuffer = stdevBuffer;
|
||||
[instanceNorm loadEpsilon:epsilon_];
|
||||
cc.instanceNorm = instanceNorm;
|
||||
}
|
||||
|
||||
FBMetalCNNInstanceNorm* instanceNorm = cc.instanceNorm;
|
||||
instanceNorm.dataBuffer = inputBuffer;
|
||||
instanceNorm.outputBuffer = outputBuffer;
|
||||
instanceNorm.scaleBuffer = scaleDataBuffer;
|
||||
instanceNorm.biasBuffer = biasDataBuffer;
|
||||
instanceNorm.preluBuffer = preluBuffer;
|
||||
|
||||
[instanceNorm applyFilter:(void (^)(NSError* error)) nullptr];
|
||||
|
||||
return true;
|
||||
}
|
@ -1,12 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
bool metal_prelu(
|
||||
id<MTLBuffer> inputBuffer,
|
||||
int input_channels,
|
||||
int input_width,
|
||||
int input_height,
|
||||
id<MTLBuffer> weightBuffer,
|
||||
int weight_length,
|
||||
id<MTLBuffer> outputBuffer);
|
@ -1,63 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#import "arm_neon_support.h"
|
||||
#import "MetalContext.h"
|
||||
#import "FBMetalCNNPRelu.h"
|
||||
#import "metal_prelu.h"
|
||||
|
||||
#include "caffe2/core/logging.h"
|
||||
|
||||
@interface PReluCacheEntry : NSObject
|
||||
@property (nonatomic, strong) FBMetalCNNPRelu* prelu;
|
||||
@end
|
||||
@implementation PReluCacheEntry
|
||||
@end
|
||||
|
||||
static MetalContext* metalContext = NULL;
|
||||
static NSMutableDictionary<NSString*, PReluCacheEntry*>* preluCache = NULL;
|
||||
|
||||
static void init_metal_pipeline() {
|
||||
if (metalContext == NULL) {
|
||||
metalContext = [MetalContext getContext];
|
||||
preluCache = [NSMutableDictionary<NSString*, PReluCacheEntry*> dictionary];
|
||||
}
|
||||
}
|
||||
|
||||
bool metal_prelu(
|
||||
id<MTLBuffer> inputBuffer,
|
||||
int input_channels,
|
||||
int input_width,
|
||||
int input_height,
|
||||
id<MTLBuffer> weightBuffer,
|
||||
int weight_length,
|
||||
id<MTLBuffer> outputBuffer) {
|
||||
init_metal_pipeline();
|
||||
|
||||
NSString* key =
|
||||
[NSString stringWithFormat:@"X:%d:%d:%d-F:%d", input_channels, input_width, input_height, weight_length];
|
||||
|
||||
PReluCacheEntry* cc = preluCache[key];
|
||||
|
||||
if (cc == NULL) {
|
||||
preluCache[key] = cc = [[PReluCacheEntry alloc] init];
|
||||
|
||||
FBMetalPReluConstantValues constantValues =
|
||||
FBMetalPReluConstantValues(input_width, input_height, input_channels, weight_length);
|
||||
|
||||
cc.prelu = [FBMetalCNNPRelu filterWithContext:metalContext
|
||||
constantValues:&constantValues
|
||||
width:input_width
|
||||
height:input_height
|
||||
channel:input_channels];
|
||||
}
|
||||
|
||||
FBMetalCNNPRelu* prelu = cc.prelu;
|
||||
|
||||
prelu.dataBuffer = inputBuffer;
|
||||
prelu.outputBuffer = outputBuffer;
|
||||
prelu.weightBuffer = weightBuffer;
|
||||
|
||||
[prelu applyFilter:(void (^)(NSError* error)) nullptr];
|
||||
|
||||
return true;
|
||||
}
|
@ -1,5 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#pragma once
|
||||
|
||||
bool metal_sync_op();
|
@ -1,36 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#import "arm_neon_support.h"
|
||||
#import "MetalContext.h"
|
||||
#import "FBMetalCNNNoOp.h"
|
||||
#import "metal_sync_op.h"
|
||||
|
||||
static MetalContext* metalContext = NULL;
|
||||
static FBMetalCNNNoOp* noOpCache = NULL;
|
||||
|
||||
static void init_metal_pipeline() {
|
||||
if (metalContext == NULL) {
|
||||
metalContext = [MetalContext getContext];
|
||||
}
|
||||
}
|
||||
|
||||
bool metal_sync_op() {
|
||||
init_metal_pipeline();
|
||||
|
||||
if (noOpCache == NULL) {
|
||||
noOpCache = [FBMetalCNNNoOp filterWithContext:metalContext
|
||||
width:1
|
||||
height:1];
|
||||
}
|
||||
|
||||
static dispatch_semaphore_t gpu_execution_done = NULL;
|
||||
if (gpu_execution_done == NULL)
|
||||
gpu_execution_done = dispatch_semaphore_create(0);
|
||||
|
||||
[noOpCache applyFilter:^(NSError* error) {
|
||||
dispatch_semaphore_signal(gpu_execution_done);
|
||||
}];
|
||||
dispatch_semaphore_wait(gpu_execution_done, DISPATCH_TIME_FOREVER);
|
||||
|
||||
return true;
|
||||
}
|
@ -1,883 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#include "caffe2/core/logging.h"
|
||||
#include "caffe2/core/workspace.h"
|
||||
#include "metal_test.h"
|
||||
#include "rewrite_net.h"
|
||||
|
||||
#define DEBUGGING false
|
||||
|
||||
namespace caffe2 {
|
||||
void testMetalCopyOps(int N, int C, int H, int W, float error) {
|
||||
LOG(INFO) << "MetalCopyFrom/To Test";
|
||||
Workspace ws;
|
||||
{
|
||||
auto *t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
|
||||
t->Resize(N, C, H, W);
|
||||
CPUContext ctx;
|
||||
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
|
||||
}
|
||||
|
||||
NetDef netdef;
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("CopyToMetalGPU");
|
||||
op.add_input("X_cpu");
|
||||
op.add_output("X_mtl");
|
||||
op.set_engine("METAL");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("CopyFromMetalGPU");
|
||||
op.add_input("X_mtl");
|
||||
op.add_output("Y_cpu");
|
||||
}
|
||||
|
||||
ws.RunNetOnce(netdef);
|
||||
const auto &t1 = ws.GetBlob("X_cpu")->Get<TensorCPU>();
|
||||
const auto &t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
|
||||
CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
|
||||
|
||||
// for (auto i = 0; i < t1.size(); ++i) {
|
||||
// LOG(INFO) << "i: " << i << ", CPU: " << t1.data<float>()[i] << ", MTL: " << t2.data<float>()[i];
|
||||
// }
|
||||
|
||||
for (auto i = 0; i < t1.size(); ++i) {
|
||||
CHECK_NEAR(t1.data<float>()[i], t2.data<float>()[i], error);
|
||||
}
|
||||
}
|
||||
|
||||
void testMetalInstanceNorm(int N, int C, int H, int W, float error) {
|
||||
LOG(INFO) << "MetalInstanceNorm Test";
|
||||
Workspace ws;
|
||||
{
|
||||
auto *t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
|
||||
t->Resize(N, C, H, W);
|
||||
CPUContext ctx;
|
||||
// Too noisy.
|
||||
math::RandGaussian<float, CPUContext>(t->size(), 0, 3, t->mutable_data<float>(), &ctx);
|
||||
}
|
||||
|
||||
{
|
||||
auto *t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
|
||||
t->Resize(C);
|
||||
CPUContext ctx;
|
||||
for (auto i = 0; i < t->size(); ++i) {
|
||||
t->mutable_data<float>()[i] = i;
|
||||
}
|
||||
}
|
||||
{
|
||||
auto *t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
|
||||
t->Resize(C);
|
||||
CPUContext ctx;
|
||||
for (auto i = 0; i < t->size(); ++i) {
|
||||
t->mutable_data<float>()[i] = 8 - 2 * i;
|
||||
}
|
||||
}
|
||||
|
||||
NetDef netdef;
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("CopyToMetalGPU");
|
||||
op.add_input("X_cpu");
|
||||
op.add_output("X_mtl");
|
||||
op.set_engine("METAL");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("CopyToMetalGPUFloat16");
|
||||
op.add_input("W");
|
||||
op.add_output("W_mtl");
|
||||
op.set_engine("METAL");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("CopyToMetalGPUFloat16");
|
||||
op.add_input("b");
|
||||
op.add_output("b_mtl");
|
||||
op.set_engine("METAL");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("MetalInstanceNorm");
|
||||
op.add_input("X_mtl");
|
||||
op.add_input("W_mtl");
|
||||
op.add_input("b_mtl");
|
||||
op.add_output("Y_mtl");
|
||||
op.set_engine("METAL");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("CopyFromMetalGPU");
|
||||
op.add_input("Y_mtl");
|
||||
op.add_output("Y_cpu");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("InstanceNorm");
|
||||
op.add_input("X_cpu");
|
||||
op.add_input("W");
|
||||
op.add_input("b");
|
||||
auto &arg = *(op.add_arg());
|
||||
arg.set_name("order");
|
||||
arg.set_s("NCHW");
|
||||
op.add_output("Y_ref");
|
||||
}
|
||||
|
||||
ws.RunNetOnce(netdef);
|
||||
const auto &t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
|
||||
const auto &t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
|
||||
|
||||
CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
|
||||
for (auto i = 0; i < t1.size(); ++i) {
|
||||
const float t1_i = t1.data<float>()[i];
|
||||
const float t2_i = t2.data<float>()[i];
|
||||
CHECK_NEAR(t1_i, t2_i, error);
|
||||
}
|
||||
}
|
||||
|
||||
void testMetalPRelu(int N, int C, int H, int W, int K, float error) {
|
||||
LOG(INFO) << "MetalPRelu Test";
|
||||
Workspace ws;
|
||||
{
|
||||
auto *t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
|
||||
t->Resize(N, C, H, W);
|
||||
CPUContext ctx;
|
||||
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
|
||||
}
|
||||
|
||||
{
|
||||
auto *t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
|
||||
t->Resize(K);
|
||||
CPUContext ctx;
|
||||
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
|
||||
}
|
||||
|
||||
// The MetalPRelu is an in-place operator
|
||||
NetDef netdef;
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("CopyToMetalGPU");
|
||||
op.add_input("X_cpu");
|
||||
op.add_output("X_mtl");
|
||||
op.set_engine("METAL");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("CopyToMetalGPUFloat16");
|
||||
op.add_input("b");
|
||||
op.add_output("b_mtl");
|
||||
op.set_engine("METAL");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("MetalPRelu");
|
||||
op.add_input("X_mtl");
|
||||
op.add_input("b_mtl");
|
||||
op.add_output("X_mtl");
|
||||
op.set_engine("METAL");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("CopyFromMetalGPU");
|
||||
op.add_input("X_mtl");
|
||||
op.add_output("Y_cpu");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("PRelu");
|
||||
op.add_input("X_cpu");
|
||||
op.add_input("b");
|
||||
auto &arg = *(op.add_arg());
|
||||
arg.set_name("order");
|
||||
arg.set_s("NCHW");
|
||||
op.add_output("Y_ref");
|
||||
}
|
||||
|
||||
ws.RunNetOnce(netdef);
|
||||
const auto &t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
|
||||
const auto &t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
|
||||
|
||||
CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
|
||||
|
||||
for (auto i = 0; i < t1.size(); ++i) {
|
||||
const float t1_i = t1.data<float>()[i];
|
||||
const float t2_i = t2.data<float>()[i];
|
||||
CHECK_NEAR(t1_i, t2_i, error);
|
||||
}
|
||||
}
|
||||
|
||||
void testMetalInstanceNormPRelu(int N, int C, int H, int W, float error) {
|
||||
LOG(INFO) << "MetalInstanceNormPRelu Test";
|
||||
Workspace ws;
|
||||
{
|
||||
auto *t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
|
||||
t->Resize(N, C, H, W);
|
||||
CPUContext ctx;
|
||||
// Too noisy.
|
||||
math::RandGaussian<float, CPUContext>(t->size(), 0, 3, t->mutable_data<float>(), &ctx);
|
||||
}
|
||||
|
||||
{
|
||||
auto *t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
|
||||
t->Resize(C);
|
||||
CPUContext ctx;
|
||||
for (auto i = 0; i < t->size(); ++i) {
|
||||
t->mutable_data<float>()[i] = i;
|
||||
}
|
||||
}
|
||||
{
|
||||
auto *t = ws.CreateBlob("b1")->GetMutable<TensorCPU>();
|
||||
t->Resize(C);
|
||||
CPUContext ctx;
|
||||
for (auto i = 0; i < t->size(); ++i) {
|
||||
t->mutable_data<float>()[i] = 8 - 2 * i;
|
||||
}
|
||||
}
|
||||
// bias for PRelu
|
||||
{
|
||||
auto *t = ws.CreateBlob("b2")->GetMutable<TensorCPU>();
|
||||
t->Resize(C);
|
||||
CPUContext ctx;
|
||||
|
||||
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
|
||||
}
|
||||
|
||||
NetDef netdef;
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("CopyToMetalGPU");
|
||||
op.add_input("X_cpu");
|
||||
op.add_output("X_mtl");
|
||||
op.set_engine("METAL");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("CopyToMetalGPUFloat16");
|
||||
op.add_input("W");
|
||||
op.add_output("W_mtl");
|
||||
op.set_engine("METAL");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("CopyToMetalGPUFloat16");
|
||||
op.add_input("b1");
|
||||
op.add_output("b1_mtl");
|
||||
op.set_engine("METAL");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("CopyToMetalGPUFloat16");
|
||||
op.add_input("b2");
|
||||
op.add_output("b2_mtl");
|
||||
op.set_engine("METAL");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("MetalInstanceNormPRelu");
|
||||
op.add_input("X_mtl");
|
||||
op.add_input("W_mtl");
|
||||
op.add_input("b1_mtl");
|
||||
op.add_input("b2_mtl");
|
||||
op.add_output("Y_mtl");
|
||||
op.set_engine("METAL");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("CopyFromMetalGPU");
|
||||
op.add_input("Y_mtl");
|
||||
op.add_output("Y_cpu");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("InstanceNorm");
|
||||
op.add_input("X_cpu");
|
||||
op.add_input("W");
|
||||
op.add_input("b1");
|
||||
auto &arg = *(op.add_arg());
|
||||
arg.set_name("order");
|
||||
arg.set_s("NCHW");
|
||||
op.add_output("Y_mid");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("PRelu");
|
||||
op.add_input("Y_mid");
|
||||
op.add_input("b2");
|
||||
auto &arg = *(op.add_arg());
|
||||
arg.set_name("order");
|
||||
arg.set_s("NCHW");
|
||||
op.add_output("Y_ref");
|
||||
}
|
||||
|
||||
ws.RunNetOnce(netdef);
|
||||
const auto &t1 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
|
||||
const auto &t2 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
|
||||
|
||||
CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
|
||||
|
||||
for (auto i = 0; i < t1.size(); ++i) {
|
||||
const float t1_i = t1.data<float>()[i];
|
||||
const float t2_i = t2.data<float>()[i];
|
||||
CHECK_NEAR(t1_i, t2_i, error);
|
||||
}
|
||||
}
|
||||
|
||||
void testMetalConv(int N, int C, int H, int W, int K, int kernel_h, int kernel_w, int pad, float error) {
|
||||
LOG(INFO) << "MetalConv Test";
|
||||
Workspace ws;
|
||||
{
|
||||
auto *t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
|
||||
t->Resize(N, C, H, W);
|
||||
CPUContext ctx;
|
||||
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
|
||||
}
|
||||
|
||||
{
|
||||
auto *t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
|
||||
t->Resize(K, C, kernel_h, kernel_w);
|
||||
CPUContext ctx;
|
||||
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
|
||||
}
|
||||
|
||||
{
|
||||
auto *t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
|
||||
t->Resize(K);
|
||||
CPUContext ctx;
|
||||
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
|
||||
}
|
||||
|
||||
NetDef netdef;
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("CopyToMetalGPU");
|
||||
op.add_input("X_cpu");
|
||||
op.add_output("X_mtl");
|
||||
op.set_engine("METAL");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("CopyWeightTensorToMetalGPU");
|
||||
op.add_input("W");
|
||||
op.add_output("W_mtl");
|
||||
op.set_engine("METAL");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("MetalConv");
|
||||
op.add_input("X_mtl");
|
||||
op.add_input("W_mtl");
|
||||
op.add_input("b");
|
||||
op.set_engine("METAL");
|
||||
{
|
||||
auto &arg = *(op.add_arg());
|
||||
arg.set_name("order");
|
||||
arg.set_s("NCHW");
|
||||
}
|
||||
{
|
||||
auto &arg = *(op.add_arg());
|
||||
arg.set_name("kernel");
|
||||
arg.set_i(kernel_h);
|
||||
}
|
||||
{
|
||||
auto &arg = *(op.add_arg());
|
||||
arg.set_name("pad");
|
||||
arg.set_i(pad);
|
||||
}
|
||||
op.add_output("Y_mtl");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("CopyFromMetalGPU");
|
||||
op.add_input("Y_mtl");
|
||||
op.add_output("Y_cpu");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("Conv");
|
||||
op.add_input("X_cpu");
|
||||
op.add_input("W");
|
||||
op.add_input("b");
|
||||
{
|
||||
auto &arg = *(op.add_arg());
|
||||
arg.set_name("order");
|
||||
arg.set_s("NCHW");
|
||||
}
|
||||
{
|
||||
auto &arg = *(op.add_arg());
|
||||
arg.set_name("kernel");
|
||||
arg.set_i(kernel_h);
|
||||
}
|
||||
{
|
||||
auto &arg = *(op.add_arg());
|
||||
arg.set_name("pad");
|
||||
arg.set_i(pad);
|
||||
}
|
||||
op.add_output("Y_ref");
|
||||
}
|
||||
|
||||
ws.RunNetOnce(netdef);
|
||||
const auto &t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
|
||||
const auto &t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
|
||||
|
||||
CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
|
||||
|
||||
// for (auto i = 0; i < t1.size(); ++i) {
|
||||
// const float t1_i = t1.data<float>()[i];
|
||||
// const float t2_i = t2.data<float>()[i];
|
||||
// if (std::abs(t1_i - t2_i) > 0.5) {
|
||||
// LOG(INFO) << "i: " << i << ", CPU: " << t1_i << ", MTL: " << t2_i << ", error: " << std::abs(t1_i - t2_i) / t1_i;
|
||||
// }
|
||||
// }
|
||||
|
||||
for (auto i = 0; i < t1.size(); ++i) {
|
||||
// FP16 <-> FP32 round trip, accumulation, etc.
|
||||
const float t1_i = t1.data<float>()[i];
|
||||
const float t2_i = t2.data<float>()[i];
|
||||
CHECK_NEAR(t1_i, t2_i, error);
|
||||
}
|
||||
}
|
||||
|
||||
void testMetalConvTranspose(
|
||||
int N, int C, int H, int W, int K, int kernel_h, int kernel_w, int pad, int stride, float error) {
|
||||
LOG(INFO) << "MetalConvTranspose Test";
|
||||
Workspace ws;
|
||||
{
|
||||
auto *t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
|
||||
t->Resize(N, C, H, W);
|
||||
CPUContext ctx;
|
||||
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
|
||||
}
|
||||
|
||||
{
|
||||
auto *t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
|
||||
t->Resize(K, C, kernel_h, kernel_w);
|
||||
CPUContext ctx;
|
||||
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
|
||||
}
|
||||
|
||||
{
|
||||
auto *t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
|
||||
t->Resize(C);
|
||||
CPUContext ctx;
|
||||
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
|
||||
}
|
||||
|
||||
NetDef netdef;
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("CopyToMetalGPU");
|
||||
op.add_input("X_cpu");
|
||||
op.add_output("X_mtl");
|
||||
op.set_engine("METAL");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("CopyTransposeWeightTensorToMetalGPU");
|
||||
op.add_input("W");
|
||||
op.add_output("W_mtl");
|
||||
op.set_engine("METAL");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("MetalConvTranspose");
|
||||
op.add_input("X_mtl");
|
||||
op.add_input("W_mtl");
|
||||
op.add_input("b");
|
||||
op.set_engine("METAL");
|
||||
{
|
||||
auto &arg = *(op.add_arg());
|
||||
arg.set_name("order");
|
||||
arg.set_s("NCHW");
|
||||
}
|
||||
{
|
||||
auto &arg = *(op.add_arg());
|
||||
arg.set_name("kernel");
|
||||
arg.set_i(kernel_h);
|
||||
}
|
||||
{
|
||||
auto &arg = *(op.add_arg());
|
||||
arg.set_name("pad");
|
||||
arg.set_i(pad);
|
||||
}
|
||||
{
|
||||
auto &arg = *(op.add_arg());
|
||||
arg.set_name("stride");
|
||||
arg.set_i(stride);
|
||||
}
|
||||
op.add_output("Y_mtl");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("CopyFromMetalGPU");
|
||||
op.add_input("Y_mtl");
|
||||
op.add_output("Y_cpu");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("ConvTranspose");
|
||||
op.add_input("X_cpu");
|
||||
op.add_input("W");
|
||||
op.add_input("b");
|
||||
{
|
||||
auto &arg = *(op.add_arg());
|
||||
arg.set_name("order");
|
||||
arg.set_s("NCHW");
|
||||
}
|
||||
{
|
||||
auto &arg = *(op.add_arg());
|
||||
arg.set_name("kernel");
|
||||
arg.set_i(kernel_h);
|
||||
}
|
||||
{
|
||||
auto &arg = *(op.add_arg());
|
||||
arg.set_name("pad");
|
||||
arg.set_i(pad);
|
||||
}
|
||||
{
|
||||
auto &arg = *(op.add_arg());
|
||||
arg.set_name("stride");
|
||||
arg.set_i(stride);
|
||||
}
|
||||
op.add_output("Y_ref");
|
||||
}
|
||||
|
||||
ws.RunNetOnce(netdef);
|
||||
const auto &t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
|
||||
const auto &t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
|
||||
CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
|
||||
LOG(INFO) << "N: " << t1.dim(0) << " C: " << t1.dim(1) << " H: " << t1.dim(2) << " W: " << t1.dim(3);
|
||||
|
||||
#if DEBUGGING
|
||||
for (auto i = 0; i < t1.size(); ++i) {
|
||||
const float t1_i = t1.data<float>()[i];
|
||||
const float t2_i = t2.data<float>()[i];
|
||||
if (std::abs(t1_i - t2_i) > error) {
|
||||
LOG(INFO) << "i: " << i << ", CPU: " << t1_i << ", MTL: " << t2_i
|
||||
<< ", relative error: " << 100 * std::abs(t1_i - t2_i) / t1_i << "%";
|
||||
}
|
||||
}
|
||||
|
||||
printf("CPU:");
|
||||
for (int i = 0; i < t1.size(); i++) {
|
||||
const float t1_i = t1.data<float>()[i];
|
||||
if (i % t1.dim(2) == 0)
|
||||
printf("\n");
|
||||
printf("%f\t", t1_i);
|
||||
}
|
||||
|
||||
printf("\nMETAL:");
|
||||
for (int i = 0; i < t1.size(); i++) {
|
||||
const float t2_i = t2.data<float>()[i];
|
||||
if (i % t2.dim(2) == 0)
|
||||
printf("\n");
|
||||
printf("%f\t", t2_i);
|
||||
}
|
||||
printf("\n");
|
||||
#endif
|
||||
|
||||
for (auto i = 0; i < t1.size(); ++i) {
|
||||
// FP16 <-> FP32 round trip, accumulation, etc.
|
||||
const float t1_i = t1.data<float>()[i];
|
||||
const float t2_i = t2.data<float>()[i];
|
||||
CHECK_NEAR(t1_i, t2_i, error);
|
||||
}
|
||||
}
|
||||
void testMetalRewriteWithFusion() {
|
||||
for (const auto &computeOp : std::vector<std::string>{"InstanceNorm"}) {
|
||||
LOG(INFO) << "RewriteForMetal Fusion/Copy Test";
|
||||
NetDef netdef;
|
||||
netdef.add_external_input("X");
|
||||
netdef.add_external_output("Y");
|
||||
// These two ops can be fused.
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type(computeOp);
|
||||
op.add_input("X");
|
||||
op.add_input("W");
|
||||
op.add_input("b1");
|
||||
op.add_output("Y");
|
||||
}
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("PRelu");
|
||||
op.add_input("Y");
|
||||
op.add_input("b2");
|
||||
op.add_output("Y");
|
||||
}
|
||||
// Can't fuse these as not in-place (can fix by using SSA).
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type(computeOp);
|
||||
op.add_input("X2");
|
||||
op.add_input("W");
|
||||
op.add_input("b1");
|
||||
op.add_output("Y2");
|
||||
}
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("PRelu");
|
||||
op.add_input("Y2");
|
||||
op.add_input("b2");
|
||||
op.add_output("Y");
|
||||
}
|
||||
|
||||
netdef = rewritePredictNetForMetal(netdef, "METAL");
|
||||
// dumpDef(netdef);
|
||||
|
||||
auto ty = [&](size_t i) { return netdef.op(i).type(); };
|
||||
auto i0 = [&](size_t i) { return netdef.op(i).input(0); };
|
||||
auto o0 = [&](size_t i) { return netdef.op(i).output(0); };
|
||||
CHECK_EQ(netdef.op_size(), 5);
|
||||
CHECK_EQ(ty(0), "CopyToMetalGPU");
|
||||
CHECK_EQ(ty(1), std::string("Metal") + computeOp + std::string("PRelu"));
|
||||
CHECK_EQ(ty(2), std::string("Metal") + computeOp);
|
||||
CHECK_EQ(ty(3), "MetalPRelu");
|
||||
CHECK_EQ(ty(4), "CopyFromMetalGPU");
|
||||
CHECK_EQ(i0(0), "X");
|
||||
CHECK_EQ(i0(1), o0(0));
|
||||
CHECK_EQ(o0(2), "Y2_M");
|
||||
CHECK_EQ(i0(3), o0(2));
|
||||
CHECK_EQ(i0(4), o0(3));
|
||||
CHECK_NE(o0(4), i0(4));
|
||||
CHECK_EQ(netdef.external_input(0), "X");
|
||||
CHECK_EQ(netdef.external_output(0), "Y");
|
||||
}
|
||||
}
|
||||
|
||||
void testMetalRewriteWithMultiInputCPUOps() {
|
||||
|
||||
LOG(INFO) << "RewriteForMetal Test";
|
||||
NetDef netdef;
|
||||
netdef.add_external_input("X");
|
||||
netdef.add_external_output("Y");
|
||||
// These two ops can be fused.
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("Conv");
|
||||
op.add_input("X");
|
||||
op.add_input("W");
|
||||
op.add_input("b");
|
||||
op.add_output("Y1");
|
||||
}
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("Conv");
|
||||
op.add_input("Y1");
|
||||
op.add_input("W");
|
||||
op.add_input("b");
|
||||
op.add_output("Y2");
|
||||
}
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("Add");
|
||||
op.add_input("Y1");
|
||||
op.add_input("Y2");
|
||||
op.add_output("Y");
|
||||
}
|
||||
|
||||
netdef = rewritePredictNetForMetal(netdef, "METAL");
|
||||
// dumpDef(netdef);
|
||||
|
||||
auto ty = [&](size_t i) { return netdef.op(i).type(); };
|
||||
auto i0 = [&](size_t i) { return netdef.op(i).input(0); };
|
||||
auto i1 = [&](size_t i) { return netdef.op(i).input(1); };
|
||||
auto o0 = [&](size_t i) { return netdef.op(i).output(0); };
|
||||
CHECK_EQ(netdef.op_size(), 6);
|
||||
CHECK_EQ(ty(0), "CopyToMetalGPU");
|
||||
CHECK_EQ(ty(1), "MetalConv");
|
||||
CHECK_EQ(ty(2), "MetalConv");
|
||||
CHECK_EQ(ty(3), "CopyFromMetalGPU");
|
||||
CHECK_EQ(ty(4), "CopyFromMetalGPU");
|
||||
CHECK_EQ(ty(5), "Add");
|
||||
CHECK_EQ(i0(0), "X");
|
||||
CHECK_EQ(i0(1), o0(0));
|
||||
CHECK_EQ(o0(2), "Y2_M");
|
||||
CHECK_EQ(i0(3), o0(1));
|
||||
CHECK_EQ(i0(4), o0(2));
|
||||
CHECK_EQ(i0(5), o0(3));
|
||||
CHECK_EQ(i1(5), o0(4));
|
||||
CHECK_EQ(o0(5), "Y");
|
||||
CHECK_EQ(netdef.external_input(0), "X");
|
||||
CHECK_EQ(netdef.external_output(0), "Y");
|
||||
}
|
||||
|
||||
void testMetalRewriteFailure() {
|
||||
LOG(INFO) << "RewriteForMetal Failure Test";
|
||||
NetDef netdef;
|
||||
netdef.add_external_input("X");
|
||||
netdef.add_external_output("Y");
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("Conv");
|
||||
op.add_input("X");
|
||||
op.add_input("W");
|
||||
op.add_input("b");
|
||||
op.add_output("Y1");
|
||||
}
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("Conv");
|
||||
op.add_input("X");
|
||||
op.add_input("W");
|
||||
op.add_input("b");
|
||||
op.add_output("Y2");
|
||||
}
|
||||
|
||||
{
|
||||
auto &op = *(netdef.add_op());
|
||||
op.set_type("Concat");
|
||||
op.add_input("Y1");
|
||||
op.add_input("Y2");
|
||||
op.add_output("Y");
|
||||
}
|
||||
try {;
|
||||
netdef = rewritePredictNetForMetal(netdef, "METAL");
|
||||
// dumpDef(netdef);
|
||||
CHECK(false) << "Shouldn't reach here, due to multiple usages of X";
|
||||
} catch (const std::exception &e) {
|
||||
LOG(INFO) << "RewriteForMetal failed";
|
||||
}
|
||||
}
|
||||
|
||||
void testMetal() {
|
||||
testMetalCopyOps(1, 3, 2, 1, 1e-2);
|
||||
testMetalCopyOps(1, 4, 1, 1, 1e-2);
|
||||
testMetalCopyOps(1, 4, 8, 3, 1e-2);
|
||||
testMetalCopyOps(1, 6, 8, 3, 1e-2);
|
||||
testMetalCopyOps(1, 4, 1, 2, 1e-2);
|
||||
testMetalCopyOps(1, 8, 6, 1, 1e-2);
|
||||
testMetalCopyOps(1, 12, 13, 18, 1e-2);
|
||||
|
||||
testMetalInstanceNorm(1, 3, 120, 140, 0.05);
|
||||
testMetalInstanceNorm(1, 12, 120, 140, 0.05);
|
||||
|
||||
testMetalPRelu(1, 3, 8, 13, 3, 0.1);
|
||||
testMetalPRelu(1, 3, 8, 13, 1, 0.1);
|
||||
|
||||
testMetalInstanceNormPRelu(1, 12, 120, 140, 0.2);
|
||||
|
||||
testMetalConv(1, 12, 57, 72, 8, 3, 3, 1, 1.5);
|
||||
testMetalConv(1, 12, 57, 72, 8, 3, 3, 2, 1.5);
|
||||
testMetalConv(1, 12, 57, 72, 8, 3, 3, 0, 1.5);
|
||||
testMetalConv(1, 12, 57, 72, 8, 2, 2, 0, 1.5);
|
||||
|
||||
#if DEBUGGING
|
||||
testMetalConvTranspose(1, 1, 6, 6, 1, 1, 1, 0, 2, 0.1);
|
||||
testMetalConvTranspose(1, 1, 6, 6, 1, 2, 2, 0, 2, 0.1);
|
||||
testMetalConvTranspose(1, 1, 6, 6, 1, 3, 3, 0, 2, 0.1);
|
||||
testMetalConvTranspose(1, 1, 6, 6, 1, 4, 4, 0, 2, 0.1);
|
||||
testMetalConvTranspose(1, 1, 6, 6, 1, 5, 5, 0, 2, 0.1);
|
||||
testMetalConvTranspose(1, 1, 6, 6, 1, 6, 6, 0, 2, 0.1);
|
||||
#endif
|
||||
|
||||
testMetalConvTranspose(1, 16, 320, 180, 16, 2, 2, 0, 2, 1.5);
|
||||
testMetalConvTranspose(1, 4, 320, 180, 4, 4, 4, 1, 2, 1.5);
|
||||
testMetalConvTranspose(1, 4, 320, 180, 4, 4, 4, 0, 4, 1.5);
|
||||
|
||||
testMetalRewriteWithFusion();
|
||||
testMetalRewriteWithMultiInputCPUOps();
|
||||
testMetalRewriteFailure();
|
||||
|
||||
}
|
||||
|
||||
NetDef truncateAfter(NetDef def, size_t idx) {
|
||||
// idx = 0, net = 10 -> remove 9
|
||||
// idx = 0, net = 1 -> remove 0
|
||||
const auto toRemove = def.op_size() - idx - 1;
|
||||
for (auto i = 0; i < toRemove; ++i) {
|
||||
def.mutable_op()->RemoveLast();
|
||||
}
|
||||
CHECK_EQ(def.op_size(), idx + 1);
|
||||
return def;
|
||||
}
|
||||
|
||||
void compareModels(const NetDef &initNet, NetDef predictNet) {
|
||||
auto *arg = predictNet.mutable_op(0)->mutable_arg(0);
|
||||
CHECK_EQ(arg->name(), "noise_std");
|
||||
arg->set_f(0.000001);
|
||||
|
||||
for (auto i = 0; i < predictNet.op_size(); ++i) {
|
||||
auto truncatedPredictNet = truncateAfter(predictNet, i);
|
||||
|
||||
// The copyFromMetalGPUop is added in the rewriting process
|
||||
NetDef truncatedMetalPredictNet = rewritePredictNetForMetal(truncatedMetalPredictNet, "METAL");
|
||||
NetDef metalInitNet = rewriteInitNetForMetal(metalInitNet, truncatedMetalPredictNet, "METAL");
|
||||
|
||||
// dumpDef(truncatedPredictNet);
|
||||
// dumpDef(truncatedMetalPredictNet);
|
||||
|
||||
Workspace cws;
|
||||
cws.RunNetOnce(initNet);
|
||||
{
|
||||
auto *t = cws.CreateBlob(predictNet.external_input(0))->GetMutable<TensorCPU>();
|
||||
t->Resize(1, 224, 224, 4);
|
||||
for (auto i = 0; i < t->size(); ++i) {
|
||||
t->mutable_data<uint8_t>()[i] = i % 225;
|
||||
}
|
||||
}
|
||||
cws.RunNetOnce(truncatedPredictNet);
|
||||
|
||||
Workspace mws;
|
||||
mws.RunNetOnce(metalInitNet);
|
||||
{
|
||||
auto *t = mws.CreateBlob(predictNet.external_input(0))->GetMutable<TensorCPU>();
|
||||
t->Resize(1, 224, 224, 4);
|
||||
for (auto i = 0; i < t->size(); ++i) {
|
||||
t->mutable_data<uint8_t>()[i] = i % 225;
|
||||
}
|
||||
}
|
||||
mws.RunNetOnce(truncatedMetalPredictNet);
|
||||
|
||||
const auto name = truncatedPredictNet.op(truncatedPredictNet.op_size() - 1).output(0);
|
||||
|
||||
LOG(INFO) << "Checking correspondence for name: " << name << ", idx: " << i;
|
||||
{
|
||||
const auto &mt = mws.GetBlob(name)->Get<TensorCPU>();
|
||||
const auto &ct = cws.GetBlob(name)->Get<TensorCPU>();
|
||||
CHECK_EQ(mt.dims(), ct.dims());
|
||||
for (auto j = 0; j < mt.size(); ++j) {
|
||||
if (mt.IsType<float>()) {
|
||||
if (j < 10) {
|
||||
LOG(INFO) << "i: " << i << ", j: " << j << ", CPU: " << ct.data<float>()[j]
|
||||
<< ", MTL: " << mt.data<float>()[j];
|
||||
}
|
||||
CHECK_NEAR(mt.data<float>()[j], ct.data<float>()[j], 5);
|
||||
} else {
|
||||
CHECK(mt.IsType<uint8_t>());
|
||||
if (j < 10) {
|
||||
LOG(INFO) << "i: " << i << ", j: " << j << ", CPU: " << ct.data<uint8_t>()[j]
|
||||
<< ", MTL: " << mt.data<uint8_t>()[j];
|
||||
}
|
||||
CHECK_NEAR(mt.data<uint8_t>()[j], ct.data<uint8_t>()[j], 5);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace caffe2
|
@ -1,8 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
|
||||
namespace caffe2 {
|
||||
void testMetal();
|
||||
void compareModels(const NetDef& initNet, NetDef predictNet);
|
||||
}
|
@ -1,304 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#include "rewrite_net.h"
|
||||
#include "caffe2/utils/proto_utils.h"
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
struct Analysis {
|
||||
struct SSA {
|
||||
using BlobVersions = std::unordered_map<std::string, size_t>;
|
||||
BlobVersions inVersions;
|
||||
BlobVersions outVersions;
|
||||
};
|
||||
std::vector<SSA> ssa;
|
||||
std::unordered_map<std::string, std::unordered_map<size_t, std::vector<size_t>>> inUsages;
|
||||
};
|
||||
|
||||
Analysis analyzeNet(const NetDef &net) {
|
||||
Analysis::SSA::BlobVersions frontier;
|
||||
Analysis analysis;
|
||||
|
||||
auto play = [&](size_t i, const OperatorDef &op) {
|
||||
Analysis::SSA::BlobVersions inVersions;
|
||||
for (const auto &s : op.input()) {
|
||||
inVersions[s] = frontier[s];
|
||||
analysis.inUsages[s][frontier[s]].push_back(i);
|
||||
}
|
||||
Analysis::SSA::BlobVersions outVersions;
|
||||
for (const auto &s : op.output()) {
|
||||
if (frontier.find(s) != frontier.end()) {
|
||||
frontier[s] += 1;
|
||||
}
|
||||
outVersions[s] = frontier[s];
|
||||
}
|
||||
analysis.ssa.push_back(Analysis::SSA{inVersions, outVersions});
|
||||
};
|
||||
|
||||
for (auto i = 0; i < net.op_size(); ++i) {
|
||||
play(i, net.op(i));
|
||||
}
|
||||
return analysis;
|
||||
}
|
||||
|
||||
void insertCopyToMetalGPUOp(NetDef &predictNet, const std::string &cpu_blob) {
|
||||
auto *op = predictNet.add_op();
|
||||
op->set_name("CopyFromCPUToGPU");
|
||||
op->set_type("CopyToMetalGPU");
|
||||
op->set_engine("METAL");
|
||||
op->add_input(cpu_blob);
|
||||
op->add_output(cpu_blob + "_M");
|
||||
}
|
||||
|
||||
void insertCopyFromMetalGPUOp(NetDef &predictNet, const std::string &cpu_blob) {
|
||||
auto *op = predictNet.add_op();
|
||||
op->set_name("CopyFromGPUToCPU");
|
||||
op->set_type("CopyFromMetalGPU");
|
||||
op->add_input(cpu_blob + "_M");
|
||||
op->add_output(cpu_blob);
|
||||
}
|
||||
|
||||
NetDef insertInputOutputCopyOps(const NetDef &def, std::string engine) {
|
||||
// Do some validation of the outputs. For this version, we require:
|
||||
// - a single input (first element of external_input()) is consumed by the NetDef
|
||||
// - a single output (first element of external_output()) is produced by the NetDef.
|
||||
// - the input is consumed by def.op(0), and this is the only consumer.
|
||||
// - the output is produced by def.op(-1).
|
||||
CAFFE_ENFORCE_GE(def.external_input_size(), 1);
|
||||
CAFFE_ENFORCE_GE(def.external_output_size(), 1);
|
||||
auto analysis = analyzeNet(def);
|
||||
// enforce a single use of the input blob.
|
||||
CAFFE_ENFORCE_GE(def.op_size(), 1);
|
||||
|
||||
const auto &inputBlob = def.external_input(0);
|
||||
// Enforce that the input blob has a single usage - in the first operator.
|
||||
CAFFE_ENFORCE(analysis.inUsages[inputBlob][0] == (std::vector<size_t>{0}));
|
||||
// Enforce that the external_output(0) blob is produced by the last operator in this sequence.
|
||||
const auto &outputBlob = def.external_output(0);
|
||||
CAFFE_ENFORCE(analysis.ssa.back().outVersions.find(outputBlob) != analysis.ssa.back().outVersions.end());
|
||||
const auto &outputBlobVersion = analysis.ssa.back().outVersions[outputBlob];
|
||||
// This should hold true by definition of the SSA analysis.
|
||||
CAFFE_ENFORCE(analysis.inUsages[outputBlob].find(outputBlobVersion) == analysis.inUsages[outputBlob].end());
|
||||
|
||||
NetDef mdef;
|
||||
mdef.CopyFrom(def);
|
||||
mdef.clear_op();
|
||||
|
||||
std::unordered_map<std::string, std::set<size_t>> cpu_blobs, metal_blobs;
|
||||
cpu_blobs[def.external_input(0)].insert(0);
|
||||
|
||||
for (auto i = 0; i < def.op_size(); i++) {
|
||||
const auto ¤tOp = def.op(i);
|
||||
if (currentOp.engine() == engine) {
|
||||
// Metal Op
|
||||
// insert copyToMetalOp
|
||||
for (auto j = 0; j < currentOp.input_size(); j++) {
|
||||
auto &input = currentOp.input(j);
|
||||
auto version = analysis.ssa[i].inVersions[input];
|
||||
if (cpu_blobs[input].count(version) > 0) {
|
||||
insertCopyToMetalGPUOp(mdef, input);
|
||||
metal_blobs[input].insert(version);
|
||||
cpu_blobs[input].erase(version);
|
||||
}
|
||||
}
|
||||
|
||||
auto *op = mdef.add_op();
|
||||
op->CopyFrom(currentOp);
|
||||
|
||||
// swap input blob
|
||||
for (auto j = 0; j < currentOp.input_size(); j++) {
|
||||
auto &input = currentOp.input(j);
|
||||
auto version = analysis.ssa[i].inVersions[input];
|
||||
if (metal_blobs[input].count(version) > 0) {
|
||||
op->set_input(j, input + "_M");
|
||||
}
|
||||
}
|
||||
|
||||
// swap output blob
|
||||
for (auto j = 0; j < currentOp.output_size(); j++) {
|
||||
auto &output = currentOp.output(j);
|
||||
auto version = analysis.ssa[i].outVersions[output];
|
||||
op->set_output(j, output + "_M");
|
||||
metal_blobs[output].insert(version);
|
||||
}
|
||||
// insert copyFromMetalOp after the last op if the last op is a metal op
|
||||
if (i == def.op_size() - 1) {
|
||||
insertCopyFromMetalGPUOp(mdef, currentOp.output(0));
|
||||
}
|
||||
} else {
|
||||
// CPU Op
|
||||
// insert copyFromMetalOp
|
||||
for (auto j = 0; j < currentOp.input_size(); j++) {
|
||||
auto &input = currentOp.input(j);
|
||||
auto version = analysis.ssa[i].inVersions[input];
|
||||
if (metal_blobs[input].count(version) > 0) {
|
||||
insertCopyFromMetalGPUOp(mdef, input);
|
||||
}
|
||||
}
|
||||
auto *op = mdef.add_op();
|
||||
op->CopyFrom(currentOp);
|
||||
for (auto j = 0; j < currentOp.output_size(); j++) {
|
||||
auto &output = currentOp.output(j);
|
||||
auto version = analysis.ssa[i].outVersions[output];
|
||||
cpu_blobs[output].insert(version);
|
||||
}
|
||||
}
|
||||
}
|
||||
return mdef;
|
||||
}
|
||||
|
||||
bool tryFuseAdjacentOps(const OperatorDef ¤tOp, const OperatorDef &nextOp, OperatorDef *fusedOp) {
|
||||
// Check for possible invalid opportunities.
|
||||
// Must be identical outputs, with in-place usage for nextOp.
|
||||
if (currentOp.output_size() != 1 || nextOp.output_size() != 1) {
|
||||
return false;
|
||||
}
|
||||
if (currentOp.output(0) != nextOp.input(0) || nextOp.input(0) != nextOp.output(0)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
static const std::map<std::pair<std::string, std::string>, std::string> fusionOpportunities = {{
|
||||
{{"MetalInstanceNorm", "MetalPRelu"}, "MetalInstanceNormPRelu"},
|
||||
}};
|
||||
auto it = fusionOpportunities.find({currentOp.type(), nextOp.type()});
|
||||
if (it == fusionOpportunities.end()) {
|
||||
return false;
|
||||
}
|
||||
LOG(INFO) << "Found a fusion between adjacent ops: (" << currentOp.type() << ", " << nextOp.type() << ") -> "
|
||||
<< it->second;
|
||||
fusedOp->CopyFrom(currentOp);
|
||||
fusedOp->set_type(it->second);
|
||||
for (auto i = 1; i < nextOp.input_size(); i++) {
|
||||
fusedOp->add_input(nextOp.input(i));
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
NetDef runMetalFusion(const NetDef &def) {
|
||||
CHECK_GE(def.op_size(), 1);
|
||||
NetDef mdef;
|
||||
mdef.CopyFrom(def);
|
||||
mdef.clear_op();
|
||||
auto i = 0;
|
||||
|
||||
while (i < def.op_size()) {
|
||||
if (i == def.op_size() - 1) {
|
||||
VLOG(2) << "Last operator, skipping";
|
||||
auto *op = mdef.add_op();
|
||||
op->CopyFrom(def.op(i));
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
|
||||
const auto ¤tOp = def.op(i);
|
||||
const auto &nextOp = def.op(i + 1);
|
||||
OperatorDef fusedOp;
|
||||
if (tryFuseAdjacentOps(currentOp, nextOp, &fusedOp)) {
|
||||
VLOG(2) << "Found an adjacent fusion at: " << i;
|
||||
// We can fuse.
|
||||
auto *op = mdef.add_op();
|
||||
op->CopyFrom(fusedOp);
|
||||
i += 2;
|
||||
continue;
|
||||
}
|
||||
VLOG(2) << "No fusion available";
|
||||
// Just emit the current type.
|
||||
auto *op = mdef.add_op();
|
||||
op->CopyFrom(currentOp);
|
||||
i += 1;
|
||||
}
|
||||
return mdef;
|
||||
}
|
||||
|
||||
void dumpDef(NetDef &net) {
|
||||
for (const auto &op : net.op()) {
|
||||
printf("***Operator: %s\n", op.type().c_str());
|
||||
for (auto input : op.input()) {
|
||||
printf("\tInput: %s\n", input.c_str());
|
||||
}
|
||||
|
||||
for (auto output : op.output()) {
|
||||
printf("\tOutput: %s\n", output.c_str());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
NetDef rewritePredictNetForMetal(const NetDef &predictNet, const std::string engine) {
|
||||
CAFFE_ENFORCE_GE(predictNet.op_size(), 1);
|
||||
NetDef net;
|
||||
net.CopyFrom(predictNet);
|
||||
|
||||
std::unordered_map<std::string, std::string> replacements({
|
||||
{"Conv", "MetalConv"},
|
||||
{"InstanceNorm", "MetalInstanceNorm"},
|
||||
{"PRelu", "MetalPRelu"},
|
||||
{"ConvTranspose", "MetalConvTranspose"},
|
||||
});
|
||||
|
||||
for (OperatorDef &op : *net.mutable_op()) {
|
||||
if (op.has_type() && replacements.count(op.type()) > 0) {
|
||||
op.set_type(replacements[op.type()]);
|
||||
op.set_engine("METAL");
|
||||
}
|
||||
}
|
||||
|
||||
net = insertInputOutputCopyOps(net, engine);
|
||||
net = runMetalFusion(net);
|
||||
return net;
|
||||
}
|
||||
|
||||
NetDef rewriteInitNetForMetal(const NetDef &initNet, const NetDef &predictNet, const std::string engine) {
|
||||
// Find the GivenTensorFill operators for weight tensors and change to Metal GivenTensorFill ops
|
||||
NetDef net;
|
||||
net.CopyFrom(initNet);
|
||||
std::set<std::string> conv_weights, conv_tranpose_weights, weights_and_biases;
|
||||
for (auto &op : predictNet.op()) {
|
||||
if (op.engine() == engine) {
|
||||
if (op.type() == "MetalConv") {
|
||||
conv_weights.insert(op.input(1));
|
||||
} else if (op.type() == "MetalConvTranspose") {
|
||||
conv_tranpose_weights.insert(op.input(1));
|
||||
} else {
|
||||
// Need to add support for operators with > 1 input tensors such as Add
|
||||
for (int i = 1; i < op.input_size(); i++) {
|
||||
weights_and_biases.insert(op.input(i));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for (auto &op : *net.mutable_op()) {
|
||||
if (op.type() == "GivenTensorFill" && op.output_size() == 1) {
|
||||
if (conv_weights.count(op.output(0)) > 0) {
|
||||
op.set_type("GivenWeightTensorFill");
|
||||
op.set_engine(engine);
|
||||
} else if (conv_tranpose_weights.count(op.output(0)) > 0) {
|
||||
op.set_type("GivenTransposeWeightTensorFill");
|
||||
op.set_engine(engine);
|
||||
} else if (weights_and_biases.count(op.output(0)) > 0) {
|
||||
op.set_type("GivenTensorFloat16MetalFill");
|
||||
op.set_engine(engine);
|
||||
}
|
||||
}
|
||||
}
|
||||
return net;
|
||||
}
|
||||
|
||||
bool tryConvertToMetal(const NetDef &initNet, const NetDef &predictNet, NetDef *metalInitNet, NetDef *metalPredictNet) {
|
||||
try {
|
||||
// Throws if unsupported operators are found.
|
||||
*metalPredictNet = rewritePredictNetForMetal(predictNet, "METAL");
|
||||
*metalInitNet = rewriteInitNetForMetal(initNet, *metalPredictNet, "METAL");
|
||||
|
||||
// Throws if unsupported parameters are found.
|
||||
Workspace ws;
|
||||
ws.RunNetOnce(*metalInitNet);
|
||||
ws.CreateNet(*metalPredictNet);
|
||||
LOG(INFO) << "Metal is successfully enabled";
|
||||
return true;
|
||||
} catch (const std::exception &e) {
|
||||
LOG(ERROR) << "Caught exception trying to convert NetDef to Metal: " << e.what();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
} // namespace caffe2
|
@ -1,13 +0,0 @@
|
||||
// Copyright 2004-present Facebook. All Rights Reserved.
|
||||
|
||||
#pragma once
|
||||
#include "caffe2/core/predictor.h"
|
||||
|
||||
namespace caffe2 {
|
||||
bool tryConvertToMetal(const NetDef &initNet, const NetDef &predictNet, NetDef *metalInitNet, NetDef *metalPredictNet);
|
||||
|
||||
// Exposed for testing
|
||||
NetDef rewritePredictNetForMetal(const NetDef &predictNet, const std::string engine);
|
||||
NetDef rewriteInitNetForMetal(const NetDef &initNet, const NetDef &predictNet, const std::string engine);
|
||||
void dumpDef(NetDef &net);
|
||||
}
|
7
caffe2/contrib/ios/mpscnn/CMakeLists.txt
Normal file
7
caffe2/contrib/ios/mpscnn/CMakeLists.txt
Normal file
@ -0,0 +1,7 @@
|
||||
if(IOS)
|
||||
file(GLOB_RECURSE tmp *.mm *.cc)
|
||||
# exclude test files
|
||||
file(GLOB_RECURSE test_files *_test.cc)
|
||||
exclude(tmp "${tmp}" ${test_files})
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)
|
||||
endif()
|
@ -85,3 +85,8 @@ if (${CMAKE_CXX_COMPILER_ID} STREQUAL "MSVC")
|
||||
/wd4996 # (3): Use of a deprecated member
|
||||
)
|
||||
endif()
|
||||
|
||||
# ---[ If we are building on ios, we will enable -mfpu=neon-fp16 for iOS Metal build.
|
||||
if (IOS)
|
||||
add_definitions("-mfpu=neon-fp16")
|
||||
endif()
|
||||
|
Reference in New Issue
Block a user