Fix metal build after sync

Summary:
While I was trying to make a quick oss cmakefile, I found that some of the
ios source files are out of sync with the most code changes. This diff should
fix the issues.

I manually ran cmake on the oss side with scripts/build_ios.sh to make sure
things pass.

Reviewed By: ajtulloch

Differential Revision: D5582265

fbshipit-source-id: 2636d353d32fcd8fb7087385b9bbed8476e33e74
This commit is contained in:
Yangqing Jia
2017-08-08 10:10:18 -07:00
committed by Facebook Github Bot
parent 9fcf676cfa
commit 679a586d53
42 changed files with 28 additions and 4423 deletions

View File

@ -1,4 +1,5 @@
add_subdirectory(gloo)
add_subdirectory(ios)
add_subdirectory(nccl)
add_subdirectory(nnpack)
add_subdirectory(observers)

View File

@ -0,0 +1,15 @@
# TODO: figure out conflict between contrib/nnpack/nnpack_ops.cc and mobile_nnpack.cc
if(IOS)
# Basic ios srcs.
set(Caffe2_CONTRIB_IOS_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/ios_caffe.cc"
"${CMAKE_CURRENT_SOURCE_DIR}/ios_caffe_predictor.cc"
# "${CMAKE_CURRENT_SOURCE_DIR}/mobile_nnpack.cc"
)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${Caffe2_CONTRIB_IOS_SRC})
# mpscnn files
add_subdirectory(mpscnn)
endif()
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)

View File

@ -1,99 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#include "caffe2/core/common.h"
#ifndef CAFFE2_MOBILE
#error "Caffe2 mobile state not defined"
#endif
#if CAFFE2_MOBILE
#include "caffe2/core/operator.h"
#import "data_conversion.h"
#import "MetalCaffeContext.h"
#import "MetalImageFilter.h"
#import "metal_sync_op.h"
namespace caffe2 {
class CopyToMetalGPUOp final : public Operator<MetalCaffeContext> {
public:
CopyToMetalGPUOp(const OperatorDef &operator_def, Workspace *ws) : Operator<MetalCaffeContext>(operator_def, ws) {}
bool RunOnDevice() override {
const Blob *blob = Inputs()[0];
const TensorCPU &X = blob->Get<TensorCPU>();
CAFFE_ENFORCE(X.dim32(0) == 1);
int input_channels = X.dim32(1);
int input_width = X.dim32(3);
int input_height = X.dim32(2);
auto *Y = Output(0);
Y->Resize(X.dim32(0), input_channels, input_height, input_width);
const float *input = X.template data<float>();
float *output_data = (float *)[GetMetalAllocator()->Buffer((void *)Y->template mutable_data<float>()) contents];
memcpycvt(output_data, input, Y->size());
return true;
}
};
REGISTER_CPU_OPERATOR_WITH_ENGINE(CopyToMetalGPU, METAL, CopyToMetalGPUOp);
OPERATOR_SCHEMA(CopyToMetalGPU).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
class CopyToMetalGPUFloat16Op final : public Operator<MetalCaffeContext> {
public:
CopyToMetalGPUFloat16Op(const OperatorDef &operator_def, Workspace *ws)
: Operator<MetalCaffeContext>(operator_def, ws) {}
bool RunOnDevice() override {
const Blob *blob = Inputs()[0];
const TensorCPU &X = blob->Get<TensorCPU>();
auto *Y = Output(0);
Y->ResizeLike(X);
const float *input = X.template data<float>();
float16_t *output_data =
(float16_t *)[GetMetalAllocator()->Buffer((void *)Y->template mutable_data<uint16_t>()) contents];
memcpycvt(output_data, input, Y->size());
return true;
}
};
REGISTER_CPU_OPERATOR_WITH_ENGINE(CopyToMetalGPUFloat16, METAL, CopyToMetalGPUFloat16Op);
OPERATOR_SCHEMA(CopyToMetalGPUFloat16).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
class CopyFromMetalGPUOp final : public Operator<CPUContext> {
public:
CopyFromMetalGPUOp(const OperatorDef &operator_def, Workspace *ws) : Operator<CPUContext>(operator_def, ws) {}
bool RunOnDevice() override {
const Blob *blob = Inputs()[0];
const TensorMetal &X = blob->Get<TensorMetal>();
CAFFE_ENFORCE(X.dim32(0) == 1);
int input_channels = X.dim32(1);
int input_width = X.dim32(3);
int input_height = X.dim32(2);
auto *Y = Output(0);
Y->Resize(X.dim32(0), input_channels, input_height, input_width);
const float *input = (float *)[GetMetalAllocator()->Buffer((void *)X.template data<float>()) contents];
float *output_data = Y->template mutable_data<float>();
metal_sync_op();
memcpycvt(output_data, input, Y->size());
return true;
}
};
REGISTER_CPU_OPERATOR(CopyFromMetalGPU, CopyFromMetalGPUOp);
OPERATOR_SCHEMA(CopyFromMetalGPU).NumInputs(1).NumOutputs(1).AllowInplace({{0, 0}});
} // namespace caffe2
#endif // CAFFE2_MOBILE

View File

@ -1,119 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#include "arm_neon_support.h"
#import "MetalImageFilter.h"
#import "FBMetalConstantValues.h"
class FBMetalCNNConstantValues : public FBMetalConstantValues {
public:
ushort input_width;
ushort input_height;
ushort input_channels;
ushort input_stride_x;
ushort input_stride_y;
ushort input_pad_t;
ushort input_pad_l;
ushort input_pad_b;
ushort input_pad_r;
ushort filter_width;
ushort filter_height;
ushort output_width;
ushort output_height;
ushort output_channels;
bool transposed;
FBMetalCNNConstantValues(
ushort _input_width,
ushort _input_height,
ushort _input_channels,
ushort _input_stride_x,
ushort _input_stride_y,
ushort _input_pad_t,
ushort _input_pad_l,
ushort _input_pad_b,
ushort _input_pad_r,
ushort _filter_width,
ushort _filter_height,
ushort _output_width,
ushort _output_height,
ushort _output_channels,
bool _transposed) :
input_width(_input_width),
input_height(_input_height),
input_channels(_input_channels),
input_stride_x(_input_stride_x),
input_stride_y(_input_stride_y),
input_pad_t(_input_pad_t),
input_pad_l(_input_pad_l),
input_pad_b(_input_pad_b),
input_pad_r(_input_pad_r),
filter_width(_filter_width),
filter_height(_filter_height),
output_width(_output_width),
output_height(_output_height),
output_channels(_output_channels),
transposed(_transposed) { }
std::string to_string();
};
constexpr int MAX_KERNELS_PER_CONVOLUTION = 32;
typedef float32_t data_buffer_type;
typedef float16_t weight_buffer_type;
@interface FBMetalCNNConvolution : MetalImageFilter
@property (nonatomic, strong) id<MTLBuffer> dataBuffer;
@property (nonatomic, strong) id<MTLBuffer> outputBuffer;
@property (nonatomic, strong) id<MTLBuffer> weightBuffer;
@property (nonatomic, strong) id<MTLBuffer> biasBuffer;
+ (id<MTLBuffer>)loadFilterWithImage:(const float*)weight_data
weightBuffer:(id<MTLBuffer>)weightBuffer
kernels:(NSUInteger)kernels
input_kernels:(NSUInteger)input_kernels
kernel_offset:(NSUInteger)kernel_offset
kernel_stride:(NSUInteger)kernel_stride
channels:(NSUInteger)channels
width:(NSUInteger)width
height:(NSUInteger)height
transposed:(bool)transposed
context:(MetalContext*)context;
- (void)loadFilterWithImage:(const float*)weight_data
kernels:(NSUInteger)kernels
input_kernels:(NSUInteger)input_kernels
kernel_offset:(NSUInteger)kernel_offset
kernel_stride:(NSUInteger)kernel_stride
channels:(NSUInteger)channels
width:(NSUInteger)image_width
height:(NSUInteger)image_height
transposed:(bool)transposed;
+ (id<MTLBuffer>)loadDataWithImage:(const float*)imageData
imageDataBuffer:(id<MTLBuffer>)imageDataBuffer
channels:(NSUInteger)channels
width:(NSUInteger)width
height:(NSUInteger)height
context:(MetalContext*)context;
- (void)loadBiasData:(const float *)bias
length:(NSUInteger)length;
+ (instancetype)filterWithContext:(MetalContext*)context
channels:(NSUInteger)channels
kernel_size:(NSUInteger)kernel_size
constantValues:(FBMetalCNNConstantValues*)constantValues
width:(NSUInteger)width
height:(NSUInteger)height
stride_x:(NSUInteger)stride_x
stride_y:(NSUInteger)stride_y;
@end

View File

@ -1,538 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#import "FBMetalConstantValues.h"
#import "FBMetalCNNConvolution.h"
#import "data_conversion.h"
#import "MetalShaderUtilities.h"
#include "caffe2/core/logging.h"
static constexpr size_t kThreadGroupSize_x = 4;
static constexpr size_t kThreadGroupSize_y = 8;
std::string FBMetalCNNConstantValues::to_string() {
std::ostringstream ss;
ss << "X:" <<
input_channels << "x" <<
input_width << "x" <<
input_height << "[" <<
input_stride_x << ":" <<
input_stride_y << ":" <<
input_pad_t << ":" <<
input_pad_l << ":" <<
input_pad_b << ":" <<
input_pad_r << "]-Y:" <<
output_channels << "x" <<
output_width << "x" <<
output_height << "-W:" <<
filter_width << "x" <<
filter_height << ":" <<
(transposed ? "T" : "D");
return ss.str();
}
namespace {
extern const char *metalCode;
}
@interface FBMetalCNNConvolution () {
int stride_x;
int stride_y;
uint input_batch_size;
uint output_channels;
}
@end
@implementation FBMetalCNNConvolution
+ (id<MTLBuffer>)loadFilterWithImage:(const float*)weight_data
weightBuffer:(id<MTLBuffer>)weightBuffer
kernels:(NSUInteger)kernels
input_kernels:(NSUInteger)input_kernels
kernel_offset:(NSUInteger)kernel_offset
kernel_stride:(NSUInteger)kernel_stride
channels:(NSUInteger)channels
width:(NSUInteger)width
height:(NSUInteger)height
transposed:(bool)transposed
context:(MetalContext*)context
{
reformatKernelImage<weight_buffer_type>(
weight_data,
kernels,
input_kernels,
kernel_offset,
kernel_stride,
channels,
width,
height,
transposed,
[&](size_t buffer_size) -> weight_buffer_type* {
if (weightBuffer == nil || [weightBuffer length] != sizeof(weight_buffer_type) * buffer_size) {
weightBuffer = [context.device newBufferWithLength:sizeof(weight_buffer_type) * buffer_size
options:MTLResourceOptionCPUCacheModeDefault];
if (weightBuffer == nil) {
LOG(ERROR) << "couldn't create weight buffer of size: " << buffer_size;
}
}
return (weight_buffer_type*)(weightBuffer ? [weightBuffer contents] : NULL);
});
return weightBuffer;
}
- (void)loadFilterWithImage:(const float*)weight_data
kernels:(NSUInteger)kernels
input_kernels:(NSUInteger)input_kernels
kernel_offset:(NSUInteger)kernel_offset
kernel_stride:(NSUInteger)kernel_stride
channels:(NSUInteger)channels
width:(NSUInteger)width
height:(NSUInteger)height
transposed:(bool)transposed {
_weightBuffer = [FBMetalCNNConvolution loadFilterWithImage:weight_data
weightBuffer:_weightBuffer
kernels:kernels
input_kernels:input_kernels
kernel_offset:kernel_offset
kernel_stride:kernel_stride
channels:channels
width:width
height:height
transposed:transposed
context:self.context];
}
- (void)loadBiasData:(const float *)bias
length:(NSUInteger)length {
if (_biasBuffer == nil || [_biasBuffer length] != sizeof(weight_buffer_type) * length) {
_biasBuffer = [self.context.device newBufferWithLength:sizeof(weight_buffer_type) * length
options:MTLResourceOptionCPUCacheModeDefault];
}
if (_biasBuffer) {
weight_buffer_type *bias_data = (weight_buffer_type *) [_biasBuffer contents];
for (int i = 0; i < length; i++) {
bias_data[i] = bias[i];
}
}
}
+ (id<MTLBuffer>)loadDataWithImage:(const float*)imageData
imageDataBuffer:(id<MTLBuffer>)imageDataBuffer
channels:(NSUInteger)channels
width:(NSUInteger)width
height:(NSUInteger)height
context:(MetalContext*)context {
id<MTLBuffer> newDataBuffer = imageDataBuffer;
reformatInputImage<data_buffer_type>(
imageData, channels, width, height, [&](size_t buffer_size) -> data_buffer_type* {
if (newDataBuffer == nil || [newDataBuffer length] != sizeof(data_buffer_type) * buffer_size) {
newDataBuffer = [context.device newBufferWithLength:sizeof(data_buffer_type) * buffer_size
options:MTLResourceOptionCPUCacheModeDefault];
if (newDataBuffer == nil) {
VLOG(0) << "couldn't create data buffer of size: " << buffer_size;
}
}
return newDataBuffer ? (data_buffer_type*)[newDataBuffer contents] : NULL;
});
return newDataBuffer;
}
+ (instancetype)filterWithContext:(MetalContext*)context
channels:(NSUInteger)channels
kernel_size:(NSUInteger)kernel_size
constantValues:(FBMetalCNNConstantValues*)constantValues
width:(NSUInteger)width
height:(NSUInteger)height
stride_x:(NSUInteger)stride_x
stride_y:(NSUInteger)stride_y {
return [[self alloc] initWithContext:context
channels:(NSUInteger)channels
kernel_size:kernel_size
constantValues:constantValues
width:width
height:height
stride_x:stride_x
stride_y:stride_y];
}
- (MTLSize)threadsPerThreadgroup {
if (input_batch_size > 1)
return MTLSizeMake(input_batch_size * kThreadGroupSize_x, kThreadGroupSize_y / input_batch_size, 1);
else
return MTLSizeMake(kThreadGroupSize_x, kThreadGroupSize_y, 1);
}
- (MTLSize)threadgroupsPerGrid {
MTLSize threadsPerThreadgroup = [self threadsPerThreadgroup];
return MTLSizeMake(
((input_batch_size * self.outputTextureDescriptor.width + stride_x - 1) / stride_x + threadsPerThreadgroup.width - 1) /
threadsPerThreadgroup.width,
((self.outputTextureDescriptor.height + stride_y - 1) / stride_y + threadsPerThreadgroup.height - 1) /
threadsPerThreadgroup.height,
1);
}
// TODO: this code is temporary, we need to find the optimal strategy for large channel numbers
static uint input_channel_batching(uint input_channels) {
return input_channels <= 32 ? 1 :
input_channels % 8 == 0 ? 8 :
input_channels % 4 == 0 ? 4 :
input_channels % 3 == 0 ? 3 :
input_channels % 2 == 0 ? 2 : 1;
}
- (instancetype)initWithContext:(MetalContext*)context
channels:(NSUInteger)channels
kernel_size:(NSUInteger)kernel_size
constantValues:(FBMetalCNNConstantValues*)constantValues
width:(NSUInteger)width
height:(NSUInteger)height
stride_x:(NSUInteger)_stride_x
stride_y:(NSUInteger)_stride_y {
if ((self = [super initWithFunctionName:@"cnn_convolution_kern"
libraryName:@"Convolution"
librarySource:[NSString stringWithCString:metalCode encoding:NSUTF8StringEncoding]
context:context
constantValues:constantValues])) {
stride_x = _stride_x;
stride_y = _stride_y;
input_batch_size = input_channel_batching(constantValues->input_channels);
output_channels = constantValues->output_channels;
super.outputTextureDescriptor = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA16Float
width:width
height:height
mipmapped:NO];
}
return self;
}
- (NSString*)replaceConstantValues:(FBMetalConstantValues *)constantValues
librarySource:(NSString*)librarySource {
FBMetalCNNConstantValues* convolutionConstantValues = (FBMetalCNNConstantValues *) constantValues;
std::string source = [librarySource UTF8String];
REPLACE_CONSTANT(source, convolutionConstantValues->output_width, 0);
REPLACE_CONSTANT(source, convolutionConstantValues->output_height, 1);
REPLACE_CONSTANT(source, convolutionConstantValues->input_width, 2);
REPLACE_CONSTANT(source, convolutionConstantValues->input_height, 3);
REPLACE_CONSTANT(source, convolutionConstantValues->input_stride_x, 4);
REPLACE_CONSTANT(source, convolutionConstantValues->input_stride_y, 5);
REPLACE_CONSTANT(source, convolutionConstantValues->input_pad_t, 6);
REPLACE_CONSTANT(source, convolutionConstantValues->input_pad_l, 7);
REPLACE_CONSTANT(source, convolutionConstantValues->filter_width, 8);
REPLACE_CONSTANT(source, convolutionConstantValues->filter_height, 9);
REPLACE_CONSTANT(source, convolutionConstantValues->input_channels, 10);
REPLACE_CONSTANT(source, convolutionConstantValues->output_channels, 11);
REPLACE_CONSTANT(source, convolutionConstantValues->transposed, 12);
REPLACE_CONSTANT(source, input_channel_batching(convolutionConstantValues->input_channels), 13);
return [NSString stringWithUTF8String:source.c_str()];
}
// Bind data between C and Metal
- (void) configureArgumentTableWithCommandEncoder:(id<MTLComputeCommandEncoder>)commandEncoder
weightBufferOffset:(NSInteger)weightBufferOffset
outputBufferOffset:(NSInteger)outputBufferOffset {
[commandEncoder setBuffer:_biasBuffer offset:0 atIndex:0];
[commandEncoder setBuffer:_weightBuffer offset:weightBufferOffset atIndex:1];
[commandEncoder setBuffer:_dataBuffer offset:0 atIndex:2];
[commandEncoder setBuffer:_outputBuffer offset:outputBufferOffset atIndex:3];
}
@end
namespace {
const char *metalCode = R"Metal(
// Copyright 2004-present Facebook. All Rights Reserved.
#include <metal_stdlib>
using namespace metal;
// These function_constant expressions are replaced at compile time with actual values
constant constexpr ushort output_size_x [[ function_constant(0) ]];
constant constexpr ushort output_size_y [[ function_constant(1) ]];
constant constexpr ushort input_size_x [[ function_constant(2) ]];
constant constexpr ushort input_size_y [[ function_constant(3) ]];
constant constexpr ushort input_stride_x [[ function_constant(4) ]];
constant constexpr ushort input_stride_y [[ function_constant(5) ]];
constant constexpr ushort input_pad_t [[ function_constant(6) ]];
constant constexpr ushort input_pad_l [[ function_constant(7) ]];
constant constexpr ushort filter_width [[ function_constant(8) ]];
constant constexpr ushort filter_height [[ function_constant(9) ]];
constant constexpr ushort input_channels [[ function_constant(10) ]];
constant constexpr ushort output_channels [[ function_constant(11) ]];
constant constexpr bool transposed [[ function_constant(12) ]];
constant constexpr ushort input_batch_size [[ function_constant(13) ]];
constant constexpr ushort2 input_padding = {
transposed ? (filter_width - 1 - input_pad_l) : input_pad_l,
transposed ? (filter_height - 1 - input_pad_t) : input_pad_t,
};
constant constexpr ushort DataSize = output_channels <= 2 ? output_channels : 4;
constant constexpr ushort DataLength = (output_channels + DataSize - 1) / DataSize;
typedef float data_buffer_type;
typedef vec<half, DataSize> vec_t;
typedef struct {
data_buffer_type data[input_channels][input_size_y][input_size_x];
} input_data;
typedef struct {
data_buffer_type data[output_channels][output_size_y][output_size_x];
} output_data;
typedef struct {
vec_t data[input_channels][filter_height][filter_width][DataLength];
} filter_data;
typedef struct {
half data[output_channels];
} bias_data;
typedef vec_t (thread_storage)[input_stride_y][input_stride_x][DataLength];
// Filter data sampler - returns an aligned array of vectors for the filter interleaved output channel data
template <typename T>
class planar_filter_sampler {
const constant T (&channel_data)[filter_height][filter_width][DataLength];
public:
planar_filter_sampler(constant T data[input_channels][filter_height][filter_width][DataLength], ushort channel)
: channel_data(data[channel]) {}
typedef constant T (&return_type)[DataLength];
return_type operator()(ushort2 off) {
return channel_data[off.y][off.x];
}
};
// Input data sampler - returns a scalar sample
template <typename T>
class planar_data_sampler {
const constant T (&channel_data)[input_size_y][input_size_x];
const ushort2 base;
public:
planar_data_sampler(const constant T data[input_channels][input_size_y][input_size_x], ushort2 _base, ushort channel)
: channel_data(data[channel]), base(_base) {}
inline T operator()(ushort2 off) {
/*
* Note: Even though the documentation states otherwise, Metal will execute both sides of ternary statements,
* and, if the array index is out of bounds, it will crash.
*/
const ushort2 idx = base + off;
bool in_bounds = all(idx >= input_padding) &&
all(idx < (ushort2(input_size_x, input_size_y) * ushort2(input_stride_x, input_stride_y) + input_padding));
const ushort2 padded_idx = (idx - (in_bounds ? input_padding : 0)) / ushort2(input_stride_x, input_stride_y);
return in_bounds ? channel_data[padded_idx.y][padded_idx.x] : T(0);
}
};
/*
* To speed up data access in the convolution inner loop we use vector data access for the filter.
* Every input pixel is multiplied by all the kernel values to generate all the output channels at once.
*/
template <ushort DataRemainder, typename T>
void accumulate(const ushort DataLength, T dst[], const constant T w[], const half sample) {
for (ushort i = 0; i < DataLength; i++) {
dst[i] += sample * w[i];
}
}
template <>
void accumulate<3, half4>(const ushort DataLength, half4 dst[], const constant half4 w[], const half sample) {
accumulate<0, half4>(DataLength - 1, dst, w, sample);
dst[DataLength - 1].xyz += sample * w[DataLength - 1].xyz;
}
template <>
void accumulate<2, half4>(const ushort DataLength, half4 dst[], const constant half4 w[], const half sample) {
accumulate<0, half4>(DataLength - 1, dst, w, sample);
dst[DataLength - 1].xy += sample * w[DataLength - 1].xy;
}
template <>
void accumulate<1, half4>(const ushort DataLength, half4 dst[], const constant half4 w[], const half sample) {
accumulate<0, half4>(DataLength - 1, dst, w, sample);
dst[DataLength - 1].x += sample * w[DataLength - 1].x;
}
/*
* Convolution inner loop: works for direct convolution (stride == 1) and for the transposed (stride > 1)
*/
template <typename T, typename S1, typename S2>
void convolution(T acc[input_stride_y][input_stride_x][DataLength], S1 in, S2 weights) {
for (ushort y = 0; y < filter_height; y++) {
for (ushort x = 0; x < filter_width; x++) {
const ushort2 off(x, y);
const ushort2 p0 = (input_padding + ushort2(input_stride_x, input_stride_y) - off) % ushort2(input_stride_x, input_stride_y);
const half sample = in(off + p0);
auto w = weights(off);
accumulate<output_channels % DataSize, T>(DataLength, acc[p0.y][p0.x], w, sample);
}
}
}
template <typename T>
void array_zero(T data[], ushort size) {
for (ushort i = 0; i < size; i++) {
data[i] = T(0);
}
}
template <typename T1, typename T2>
void array_copy(T1 dst[], const T2 src[], ushort size) {
for (ushort i = 0; i < size; i++) {
dst[i] = src[i];
}
}
template <typename T1, typename T2>
void array_add(T1 dst[], const T2 src[], ushort size) {
for (ushort i = 0; i < size; i++) {
dst[i] += src[i];
}
}
template <ushort DataSize, typename T = vec<half, DataSize>>
const half element(const T data[], ushort i) {
return data[i / DataSize][i % DataSize];
}
template <ushort DataSize, typename T = vec<half, DataSize>>
const half element(const threadgroup T data[], ushort i) {
return data[i / DataSize][i % DataSize];
}
template <>
const half element<1, half>(const half data[], ushort i) {
return data[i];
}
static void cnn_convolution_full(constant bias_data &bias,
constant filter_data &filters,
constant input_data &input,
device output_data &output,
ushort2 xgid)
{
ushort2 gid = ushort2(input_stride_x, input_stride_y) * xgid;
if (all(gid < ushort2(output_size_x, output_size_y))) {
thread_storage storage;
array_zero((thread vec_t *) storage, sizeof(thread_storage) / sizeof(vec_t));
for (ushort channel = 0; channel < input_channels; channel++) {
planar_data_sampler<data_buffer_type> in(input.data, gid, channel);
planar_filter_sampler<vec_t> weights(filters.data, channel);
convolution(storage, in, weights);
}
for (ushort c = 0; c < output_channels; c++) {
for (ushort y = 0; y < input_stride_y; y++) {
for (ushort x = 0; x < input_stride_x; x++) {
ushort2 idx = gid + ushort2(x, y);
if (all(idx < ushort2(output_size_x, output_size_y)))
output.data[c][idx.y][idx.x] = element<DataSize, vec_t>(storage[y][x], c) + bias.data[c];
}
}
}
}
}
static void cnn_convolution_batch(constant bias_data &bias,
constant filter_data &filters,
constant input_data &input,
device output_data &output,
threadgroup thread_storage *batch_storage,
ushort2 xgid)
{
ushort2 gid = ushort2(input_stride_x, input_stride_y) * ushort2(xgid.x / input_batch_size, xgid.y);
ushort batch = xgid.x % input_batch_size;
if (gid.x < output_size_x && gid.y < output_size_y) {
thread_storage storage;
const ushort data_size = sizeof(thread_storage) / sizeof(vec_t);
array_zero((thread vec_t *) storage, data_size);
ushort base_channel = batch * input_channels / input_batch_size;
for (ushort channel = 0; channel < input_channels / input_batch_size; channel++) {
planar_data_sampler<data_buffer_type> in(input.data, gid, base_channel + channel);
planar_filter_sampler<vec_t> weights(filters.data, base_channel + channel);
convolution(storage, in, weights);
}
for (ushort b = 0; b < input_batch_size; b++) {
threadgroup_barrier(mem_flags::mem_device);
if (b == batch) {
for (ushort y = 0; y < input_stride_y; y++) {
for (ushort x = 0; x < input_stride_x; x++) {
thread half *h_storage = (thread half *) storage[y][x];
for (ushort c = 0; c < output_channels; c++) {
ushort2 idx = gid + ushort2(x, y);
if (all(idx < ushort2(output_size_x, output_size_y))) {
device data_buffer_type *out_channel = &output.data[c][gid.y + y][gid.x + x];
*out_channel = h_storage[c] + (b == 0 ? bias.data[c] : *out_channel);
}
}
}
}
}
}
} else {
for (ushort b = 0; b < input_batch_size; b++) {
threadgroup_barrier(mem_flags::mem_device);
}
}
}
// clang-format off
kernel void cnn_convolution_kern(constant bias_data &bias [[ buffer(0) ]],
constant filter_data &filters [[ buffer(1) ]],
constant input_data &input [[ buffer(2) ]],
device output_data &output [[ buffer(3) ]],
threadgroup thread_storage *storage [[ threadgroup(0) ]],
ushort tid [[ thread_index_in_threadgroup ]],
ushort2 xgid [[ thread_position_in_grid ]])
// clang-format on
{
if (input_batch_size > 1) {
ushort batch_index = (input_batch_size-1) * (tid / (input_batch_size));
cnn_convolution_batch(bias, filters, input, output, &storage[batch_index], xgid);
} else {
cnn_convolution_full(bias, filters, input, output, xgid);
}
}
)Metal";
}

View File

@ -1,61 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#import "arm_neon_support.h"
#import "MetalImageFilter.h"
class FBMetalInstanceNormConstantValues : public FBMetalConstantValues {
public:
ushort input_width;
ushort input_height;
ushort input_channels;
ushort prelu_size;
FBMetalInstanceNormConstantValues(
ushort _input_width,
ushort _input_height,
ushort _input_channels,
ushort _prelu_size)
: input_width(_input_width),
input_height(_input_height),
input_channels(_input_channels),
prelu_size(_prelu_size) {}
std::string to_string();
};
@interface FBMetalCNNInstanceNormBase : MetalImageFilter
@property (nonatomic, strong) id<MTLBuffer> dataBuffer;
@property (nonatomic, strong) id<MTLBuffer> outputBuffer;
@property (nonatomic, strong) id<MTLBuffer> scaleBuffer;
@property (nonatomic, strong) id<MTLBuffer> biasBuffer;
@property (nonatomic, strong) id<MTLBuffer> avgBuffer;
@property (nonatomic, strong) id<MTLBuffer> stdevBuffer;
@property (nonatomic, strong) id<MTLBuffer> preluBuffer;
@property (nonatomic, strong) id<MTLBuffer> epsilonBuffer;
@property (nonatomic, copy) NSString *functionName;
+ (instancetype)filterWithContext:(MetalContext*)context
functionName:(NSString*)functionName
constantValues:(FBMetalInstanceNormConstantValues*)constantValues
width:(NSUInteger)width
height:(NSUInteger)height
channel:(NSUInteger)channel;
@end
@interface FBMetalCNNInstanceNorm: FBMetalCNNInstanceNormBase
+ (instancetype)filterWithContext:(MetalContext*)context
constantValues:(FBMetalInstanceNormConstantValues*)constantValues
width:(NSUInteger)width
height:(NSUInteger)height
channel:(NSUInteger)channel
withPRelu:(BOOL)withPRelu;
- (void)loadEpsilon:(const float)epsilon;
@end

View File

@ -1,407 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#import "FBMetalCNNInstanceNorm.h"
#import "MetalShaderUtilities.h"
std::string FBMetalInstanceNormConstantValues::to_string() {
std::ostringstream ss;
ss << ":" << input_width << ":" << input_height << ":" << input_channels << ":" << prelu_size;
return ss.str();
}
@interface FBMetalCNNInstanceNormBase () {
struct {
ushort channel;
ushort width;
ushort height;
} configuration;
}
@end
namespace {
extern const char *metalCode;
}
@implementation FBMetalCNNInstanceNormBase
static constexpr size_t kThreadGroupSize_x = 4;
static constexpr size_t kThreadGroupSize_y = 8;
static NSArray* kernelNames = @[
@"cnn_instance_norm_avg_kern",
@"cnn_instance_norm_stdev_kern",
@"cnn_instance_norm_last_kern",
@"cnn_instance_norm_prelu_kern"
];
+ (instancetype)filterWithContext:(MetalContext*)context
functionName:(NSString*)functionName
constantValues:(FBMetalInstanceNormConstantValues*)constantValues
width:(NSUInteger)width
height:(NSUInteger)height
channel:(NSUInteger)channel {
return [[self alloc] initWithContext:context
functionName:functionName
constantValues:constantValues
width:width
height:height
channel:channel];
}
- (instancetype)initWithContext:(MetalContext*)context
functionName:(NSString*)functionName
constantValues:(FBMetalInstanceNormConstantValues*)constantValues
width:(NSUInteger)width
height:(NSUInteger)height
channel:(NSUInteger)channel {
if ((self = [super initWithFunctionName:functionName
libraryName:@"InstanceNorm"
librarySource:[NSString stringWithCString:metalCode encoding:NSUTF8StringEncoding]
context:context
constantValues:constantValues])) {
_functionName = functionName;
configuration.channel = channel;
configuration.width = width;
configuration.height = height;
super.outputTextureDescriptor = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA16Float
width:width
height:height
mipmapped:NO];
}
return self;
}
- (MTLSize)threadsPerThreadgroup {
if (_functionName == kernelNames[0] || _functionName == kernelNames[1]) {
NSUInteger maxTotalThreadsPerThreadgroup = [self.pipeline maxTotalThreadsPerThreadgroup];
return MTLSizeMake(maxTotalThreadsPerThreadgroup, 1, 1);
} else {
return MTLSizeMake(kThreadGroupSize_x, kThreadGroupSize_y, 1);
}
}
- (MTLSize)threadgroupsPerGrid {
if (_functionName == kernelNames[0] || _functionName == kernelNames[1]) {
return MTLSizeMake(configuration.channel, 1, 1);
} else {
MTLSize threadsPerThreadgroup = [self threadsPerThreadgroup];
return MTLSizeMake(
(self.outputTextureDescriptor.width + threadsPerThreadgroup.width - 1) / threadsPerThreadgroup.width,
(self.outputTextureDescriptor.height + threadsPerThreadgroup.height - 1) / threadsPerThreadgroup.height,
1);
}
}
- (NSString*)replaceConstantValues:(FBMetalConstantValues*)constantValues librarySource:(NSString*)librarySource {
FBMetalInstanceNormConstantValues* convolutionConstantValues = (FBMetalInstanceNormConstantValues*)constantValues;
std::string source = [librarySource UTF8String];
REPLACE_CONSTANT(source, convolutionConstantValues->input_width * convolutionConstantValues->input_height, 0);
REPLACE_CONSTANT(source, convolutionConstantValues->input_width, 1);
REPLACE_CONSTANT(source, convolutionConstantValues->input_height, 2);
REPLACE_CONSTANT(source, convolutionConstantValues->input_channels, 3);
REPLACE_CONSTANT(source, convolutionConstantValues->prelu_size, 4);
return [NSString stringWithUTF8String:source.c_str()];
}
- (void)configureArgumentTableWithCommandEncoder:(id<MTLComputeCommandEncoder>)commandEncoder
weightBufferOffset:(NSInteger)weightBufferOffset
outputBufferOffset:(NSInteger)outputBufferOffset {
[commandEncoder setBuffer:_scaleBuffer offset:0 atIndex:0];
[commandEncoder setBuffer:_biasBuffer offset:0 atIndex:1];
[commandEncoder setBuffer:_dataBuffer offset:0 atIndex:2];
[commandEncoder setBuffer:_outputBuffer offset:0 atIndex:3];
[commandEncoder setBuffer:_avgBuffer offset:0 atIndex:4];
[commandEncoder setBuffer:_stdevBuffer offset:0 atIndex:5];
[commandEncoder setBuffer:_epsilonBuffer offset:0 atIndex:6];
if (_preluBuffer != nil) {
[commandEncoder setBuffer:_preluBuffer offset:0 atIndex:7];
}
if (_functionName == kernelNames[0] || _functionName == kernelNames[1]) {
MTLSize threads = [self threadsPerThreadgroup];
const int threadGroupMemoryLength = threads.width * sizeof(float);
[commandEncoder setThreadgroupMemoryLength:threadGroupMemoryLength atIndex:0];
}
}
@end
@implementation FBMetalCNNInstanceNorm {
NSMutableArray<FBMetalCNNInstanceNormBase*>* instanceNorm;
NSMutableArray<NSString*>* functionNames;
}
+ (instancetype)filterWithContext:(MetalContext*)context
constantValues:(FBMetalInstanceNormConstantValues*)constantValues
width:(NSUInteger)width
height:(NSUInteger)height
channel:(NSUInteger)channel
withPRelu:(BOOL)withPRelu {
return [[self alloc] initWithContext:context
constantValues:constantValues
width:width
height:height
channel:channel
withPRelu:withPRelu];
}
- (instancetype)initWithContext:(MetalContext*)context
constantValues:(FBMetalInstanceNormConstantValues*)constantValues
width:(NSUInteger)width
height:(NSUInteger)height
channel:(NSUInteger)channel
withPRelu:(BOOL)withPRelu{
self.context = context;
if (withPRelu) {
functionNames = [[NSMutableArray alloc] initWithObjects:kernelNames[0], kernelNames[1], kernelNames[3], nil];
} else {
functionNames = [[NSMutableArray alloc] initWithObjects:kernelNames[0], kernelNames[1], kernelNames[2], nil];
}
instanceNorm = [NSMutableArray<FBMetalCNNInstanceNormBase*> arrayWithCapacity:functionNames.count];
for (int i = 0; i < functionNames.count; i++) {
FBMetalCNNInstanceNormBase* instanceNormChunk = [FBMetalCNNInstanceNormBase filterWithContext:context
functionName:functionNames[i]
constantValues:constantValues
width:width
height:height
channel:channel];
instanceNorm[i] = instanceNormChunk;
}
return self;
}
- (void)loadEpsilon:(const float)epsilon {
int length = 1;
if (self.epsilonBuffer == nil || [self.epsilonBuffer length] < sizeof(float) * length) {
self.epsilonBuffer =
[self.context.device newBufferWithLength:sizeof(float) * length options:MTLResourceOptionCPUCacheModeDefault];
}
if (self.epsilonBuffer) {
float* bias_data = (float*)[self.epsilonBuffer contents];
bias_data[0] = epsilon;
}
}
- (void)applyFilter:(void (^)(NSError*))completionHandler {
for (int i = 0; i < functionNames.count; i++) {
FBMetalCNNInstanceNormBase* instanceNormChunk = instanceNorm[i];
instanceNormChunk.avgBuffer = self.avgBuffer;
instanceNormChunk.stdevBuffer = self.stdevBuffer;
instanceNormChunk.dataBuffer = self.dataBuffer;
instanceNormChunk.outputBuffer = self.outputBuffer;
instanceNormChunk.scaleBuffer = self.scaleBuffer;
instanceNormChunk.biasBuffer = self.biasBuffer;
instanceNormChunk.preluBuffer = self.preluBuffer;
instanceNormChunk.epsilonBuffer = self.epsilonBuffer;
[instanceNorm[i] applyFilter:completionHandler];
}
}
@end
namespace {
const char *metalCode = R"Metal(
// Copyright 2004-present Facebook. All Rights Reserved.
#include <metal_stdlib>
using namespace metal;
constant constexpr int input_size [[ function_constant(0) ]];
constant constexpr ushort input_size_x [[ function_constant(1) ]];
constant constexpr ushort input_size_y [[ function_constant(2) ]];
constant constexpr ushort channels [[ function_constant(3) ]];
constant constexpr ushort prelu_size [[ function_constant(4) ]];
typedef float data_buffer_type;
typedef struct {
data_buffer_type data[channels * input_size];
} input_output_data;
typedef struct {
half data[channels];
} stats_data;
typedef struct {
half data[channels];
} channel_data;
typedef struct {
float data[1];
} epsilon_data;
kernel void cnn_instance_norm_avg_kern(
constant input_output_data &inputBuffer [[ buffer(2) ]],
device stats_data &avgBuffer [[ buffer(4) ]],
threadgroup float *per_thread_sum [[ threadgroup(0) ]],
ushort channel [[ threadgroup_position_in_grid ]],
ushort num_threads [[ threads_per_threadgroup ]],
ushort tid [[ thread_index_in_threadgroup ]]
) {
const int chunk_size = (input_size + num_threads - 1) / num_threads;
constant float* input = &inputBuffer.data[channel * input_size + tid * chunk_size];
const int max_index = min(chunk_size, input_size - tid * chunk_size);
float4 sum = 0;
int i = 0;
for (; i < max_index-16; i+=16) {
sum += ((constant float4 *) input)[i/4];
sum += ((constant float4 *) input)[i/4+1];
sum += ((constant float4 *) input)[i/4+2];
sum += ((constant float4 *) input)[i/4+3];
}
for (; i < max_index-8; i+=8) {
sum += ((constant float4 *) input)[i/4];
sum += ((constant float4 *) input)[i/4+1];
}
for (; i < max_index-4; i+=4) {
sum += ((constant float4 *) input)[i/4];
}
for (; i < max_index; i++) {
sum[0] += input[i];
}
per_thread_sum[tid] = sum[0] + sum[1] + sum[2] + sum[3];
threadgroup_barrier(mem_flags::mem_threadgroup);
const int last_bit_size = 32;
const int last_bit_section = num_threads/last_bit_size;
if (tid < last_bit_size) {
sum = 0;
for (int t = 0; t < last_bit_section/4; t++) {
sum += ((threadgroup float4 *) per_thread_sum)[tid * last_bit_section/4 + t];
}
per_thread_sum[tid] = sum[0] + sum[1] + sum[2] + sum[3];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (tid == 0) {
sum = 0;
for (int t = 0; t < last_bit_size/4; t++) {
sum += ((threadgroup float4 *) per_thread_sum)[t];
}
avgBuffer.data[channel] = (sum[0] + sum[1] + sum[2] + sum[3]) / input_size;
}
}
kernel void cnn_instance_norm_stdev_kern(
constant channel_data &scaleBuffer [[ buffer(0) ]],
constant channel_data &biasBuffer [[ buffer(1) ]],
constant input_output_data &inputBuffer [[ buffer(2) ]],
device stats_data &avgBuffer [[ buffer(4) ]],
device stats_data &stdevBuffer [[ buffer(5) ]],
constant epsilon_data &epsilon [[ buffer(6) ]],
threadgroup float *per_thread_sq_norm [[ threadgroup(0) ]],
ushort channel [[ threadgroup_position_in_grid ]],
ushort num_threads [[ threads_per_threadgroup ]],
ushort tid [[ thread_index_in_threadgroup ]]
) {
const int chunk_size = (input_size + num_threads - 1) / num_threads;
constant float* input = &inputBuffer.data[channel * input_size + tid * chunk_size];
const int max_index = min(chunk_size, input_size - tid * chunk_size);
float4 sum = 0;
float mean = avgBuffer.data[channel];
int i = 0;
for (; i < max_index-16; i+=16) {
float4 delta = ((constant float4 *) input)[i/4] - mean;
sum += delta * delta;
delta = ((constant float4 *) input)[i/4+1] - mean;
sum += delta * delta;
delta = ((constant float4 *) input)[i/4+2] - mean;
sum += delta * delta;
delta = ((constant float4 *) input)[i/4+3] - mean;
sum += delta * delta;
}
for (; i < max_index-8; i+=8) {
float4 delta = ((constant float4 *) input)[i/4] - mean;
sum += delta * delta;
delta = ((constant float4 *) input)[i/4+1] - mean;
sum += delta * delta;
}
for (; i < max_index-4; i+=4) {
float4 delta = ((constant float4 *) input)[i/4] - mean;
sum += delta * delta;
}
for (; i < max_index; i++) {
half delta = input[i] - mean;
sum[0] += delta * delta;
}
per_thread_sq_norm[tid] = sum[0] + sum[1] + sum[2] + sum[3];
threadgroup_barrier(mem_flags::mem_threadgroup);
const int last_bit_size = 32;
const int last_bit_section = num_threads/last_bit_size;
if (tid < last_bit_size) {
sum = 0;
for (int t = 0; t < last_bit_section/4; t++) {
sum += ((threadgroup float4 *) per_thread_sq_norm)[tid * last_bit_section/4 + t];
}
per_thread_sq_norm[tid] = sum[0] + sum[1] + sum[2] + sum[3];
}
threadgroup_barrier(mem_flags::mem_threadgroup);
if (tid == 0) {
sum = 0;
for (int t = 0; t < last_bit_size/4; t++) {
sum += ((threadgroup float4 *) per_thread_sq_norm)[t];
}
float inv_stdev = 1.0h / sqrt((sum[0] + sum[1] + sum[2] + sum[3]) / input_size + epsilon.data[0]);
float scale = inv_stdev * scaleBuffer.data[channel];
float shift = biasBuffer.data[channel] - mean * scale;
avgBuffer.data[channel] = scale; //scale
stdevBuffer.data[channel] = shift; // shift
}
}
kernel void cnn_instance_norm_last_kern(
constant input_output_data &inputBuffer [[ buffer(2) ]],
device input_output_data &outputBuffer [[ buffer(3) ]],
constant stats_data &avgBuffer [[ buffer(4) ]], //scale
constant stats_data &stdevBuffer [[ buffer(5) ]], //shift
uint2 gid [[ thread_position_in_grid ]]) {
if (gid.x < input_size_x && gid.y < input_size_y) {
for (int c = 0; c < channels; c++) {
int idx = c * input_size + gid.y * input_size_x + gid.x;
outputBuffer.data[idx] = half(inputBuffer.data[idx]) * avgBuffer.data[c] + stdevBuffer.data[c];
}
}
}
kernel void cnn_instance_norm_prelu_kern(
constant input_output_data &inputBuffer [[ buffer(2) ]],
device input_output_data &outputBuffer [[ buffer(3) ]],
constant stats_data &avgBuffer [[ buffer(4) ]], //scale
constant stats_data &stdevBuffer [[ buffer(5) ]], //shift
constant channel_data &preluBuffer [[ buffer(7) ]], //prelu weights
uint2 gid [[ thread_position_in_grid ]]) {
if (gid.x < input_size_x && gid.y < input_size_y) {
for (int c = 0; c < channels; c++) {
int idx = c * input_size + gid.y * input_size_x + gid.x;
half weight = preluBuffer.data[prelu_size > 1 ? c : 0];
half value = half(inputBuffer.data[idx]) * avgBuffer.data[c] + stdevBuffer.data[c];
outputBuffer.data[idx] = value > 0 ? value : value * weight;
}
}
}
)Metal";
}

View File

@ -1,14 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#import "arm_neon_support.h"
#import "MetalImageFilter.h"
@interface FBMetalCNNNoOp: MetalImageFilter
+ (instancetype)filterWithContext:(MetalContext*)context
width:(NSUInteger)width
height:(NSUInteger)height;
@end

View File

@ -1,54 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#import "FBMetalCNNNoOp.h"
#import "MetalShaderUtilities.h"
namespace {
extern const char *metalCode;
}
@implementation FBMetalCNNNoOp
- (MTLSize)threadsPerThreadgroup {
return MTLSizeMake(1, 1, 1);
}
- (MTLSize)threadgroupsPerGrid {
return MTLSizeMake(1, 1, 1);
}
+ (instancetype)filterWithContext:(MetalContext*)context
width:(NSUInteger)width
height:(NSUInteger)height {
return [[self alloc] initWithContext:context width:width height:height];
}
- (instancetype)initWithContext:(MetalContext*)context
width:(NSUInteger)width
height:(NSUInteger)height {
if ((self = [super initWithFunctionName:@"cnn_no_op_kern"
libraryName:@"NoOp"
librarySource:[NSString stringWithCString:metalCode encoding:NSUTF8StringEncoding]
context:context
constantValues:nil])) {
super.outputTextureDescriptor = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA16Float
width:width
height:height
mipmapped:NO];
}
return self;
}
@end
namespace {
const char *metalCode = R"Metal(
// Copyright 2004-present Facebook. All Rights Reserved.
#include <metal_stdlib>
using namespace metal;
kernel void cnn_no_op_kern() {
return;
}
)Metal";
}

View File

@ -1,40 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#import "arm_neon_support.h"
#import "MetalImageFilter.h"
class FBMetalPReluConstantValues : public FBMetalConstantValues {
public:
ushort input_width;
ushort input_height;
ushort input_channels;
ushort weight_length;
FBMetalPReluConstantValues(
ushort _input_width,
ushort _input_height,
ushort _input_channels,
ushort _weight_length)
: input_width(_input_width),
input_height(_input_height),
input_channels(_input_channels),
weight_length(_weight_length) {}
std::string to_string();
};
@interface FBMetalCNNPRelu : MetalImageFilter
@property (nonatomic, strong) id<MTLBuffer> dataBuffer;
@property (nonatomic, strong) id<MTLBuffer> outputBuffer;
@property (nonatomic, strong) id<MTLBuffer> weightBuffer;
+ (instancetype)filterWithContext:(MetalContext*)context
constantValues:(FBMetalPReluConstantValues*)constantValues
width:(NSUInteger)width
height:(NSUInteger)height
channel:(NSUInteger)channel;
@end

View File

@ -1,131 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#import "FBMetalCNNPRelu.h"
#import "MetalShaderUtilities.h"
#include "caffe2/core/logging.h"
std::string FBMetalPReluConstantValues::to_string() {
std::ostringstream ss;
ss << ":" <<
input_width << ":" <<
input_height << ":" <<
weight_length;
return ss.str();
}
@interface FBMetalCNNPRelu() {
struct {
ushort channel;
ushort width;
ushort height;
} configuration;
}
@end
namespace {
extern const char *metalCode;
}
@implementation FBMetalCNNPRelu
static constexpr size_t kThreadGroupSize_x = 4;
static constexpr size_t kThreadGroupSize_y = 8;
+ (instancetype)filterWithContext:(MetalContext*)context
constantValues:(FBMetalPReluConstantValues*)constantValues
width:(NSUInteger)width
height:(NSUInteger)height
channel:(NSUInteger)channel {
return [[self alloc] initWithContext:context constantValues:constantValues width:width height:height channel:channel];
}
- (instancetype)initWithContext:(MetalContext*)context
constantValues:(FBMetalPReluConstantValues*)constantValues
width:(NSUInteger)width
height:(NSUInteger)height
channel:(NSUInteger)channel {
if ((self = [super initWithFunctionName:@"cnn_prelu_kern"
libraryName:@"PRelu"
librarySource:[NSString stringWithCString:metalCode encoding:NSUTF8StringEncoding]
context:context
constantValues:constantValues])) {
configuration.channel = channel;
configuration.width = width;
configuration.height = height;
super.outputTextureDescriptor = [MTLTextureDescriptor texture2DDescriptorWithPixelFormat:MTLPixelFormatRGBA16Float
width:width
height:height
mipmapped:NO];
}
return self;
}
- (MTLSize) threadsPerThreadgroup {
return MTLSizeMake(kThreadGroupSize_x, kThreadGroupSize_y, 1);
}
- (NSString*)replaceConstantValues:(FBMetalConstantValues *)constantValues
librarySource:(NSString*)librarySource {
FBMetalPReluConstantValues* convolutionConstantValues = (FBMetalPReluConstantValues *) constantValues;
std::string source = [librarySource UTF8String];
REPLACE_CONSTANT(source, convolutionConstantValues->input_width, 0);
REPLACE_CONSTANT(source, convolutionConstantValues->input_height, 1);
REPLACE_CONSTANT(source, convolutionConstantValues->input_channels, 2);
REPLACE_CONSTANT(source, convolutionConstantValues->weight_length, 3);
return [NSString stringWithUTF8String:source.c_str()];
}
- (void)configureArgumentTableWithCommandEncoder:(id<MTLComputeCommandEncoder>)commandEncoder
weightBufferOffset:(NSInteger)weightBufferOffset
outputBufferOffset:(NSInteger)outputBufferOffset {
[commandEncoder setBuffer:_weightBuffer offset:0 atIndex:0];
[commandEncoder setBuffer:_outputBuffer offset:0 atIndex:1];
}
@end
namespace {
const char *metalCode = R"Metal(
// Copyright 2004-present Facebook. All Rights Reserved.
#include <metal_stdlib>
using namespace metal;
constant constexpr ushort input_size_x [[ function_constant(0) ]];
constant constexpr ushort input_size_y [[ function_constant(1) ]];
constant constexpr ushort input_channels [[ function_constant(2) ]];
constant constexpr ushort kernels [[ function_constant(3) ]];
constant constexpr int input_size = input_size_x * input_size_y;
typedef float data_buffer_type;
typedef struct {
data_buffer_type data[input_channels][input_size];
} output_data;
typedef struct {
half data[kernels];
} filter_data;
kernel void cnn_prelu_kern(
constant filter_data &filter [[ buffer(0) ]],
device output_data &output [[ buffer(1) ]],
uint2 gid [[thread_position_in_grid]]) {
if (gid.x < input_size_x && gid.y < input_size_y) {
int idx = gid.y * input_size_x + gid.x;
for (int c = 0; c < input_channels; c++) {
half weight = filter.data[kernels > 1 ? c : 0];
data_buffer_type value = output.data[c][idx];
output.data[c][idx] = value > 0 ? value : value * weight;
}
}
}
)Metal";
}

View File

@ -1,10 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#import <string>
class FBMetalConstantValues {
public:
virtual std::string to_string() = 0;
};

View File

@ -1,38 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#include "caffe2/operators/filler_op.h"
#import "MetalCaffeContext.h"
#import "data_conversion.h"
namespace caffe2 {
template <typename T1, typename T2>
class GivenTensorMetalFillOp final : public FillerOp<MetalCaffeContext> {
public:
GivenTensorMetalFillOp(const OperatorDef& operator_def, Workspace* ws)
: FillerOp<MetalCaffeContext>(operator_def, ws) {
auto source_values = OperatorBase::template GetRepeatedArgument<float>("values");
for (float f : source_values) {
values_.push_back(f);
}
}
bool Fill(TensorMetal* output) override {
DCHECK_EQ(output->size(), values_.size()) << "output size: " << output->size() << " given size: " << values_.size();
id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void*)output->template mutable_data<T1>());
T2* output_data = (T2*)[weightBuffer contents];
CAFFE_ENFORCE(output_data != NULL);
memcpycvt(output_data, values_.data(), output->size());
return true;
}
private:
vector<float> values_;
};
// uint16_t is used because caffe2 does not support float16_t
REGISTER_CPU_OPERATOR_WITH_ENGINE(GivenTensorFloat16MetalFill, METAL, GivenTensorMetalFillOp<uint16_t, float16_t>);
OPERATOR_SCHEMA(GivenTensorFloat16MetalFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
}

View File

@ -1,200 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#include "caffe2/core/common.h"
#ifndef CAFFE2_MOBILE
#error "Caffe2 mobile state not defined"
#endif
#if CAFFE2_MOBILE
#include "caffe2/operators/filler_op.h"
#import "data_conversion.h"
#import "FBMetalCNNConvolution.h"
#import "MetalCaffeContext.h"
namespace caffe2 {
class GivenWeightTensorFillOp final : public FillerOp<MetalCaffeContext> {
public:
GivenWeightTensorFillOp(const OperatorDef &operator_def, Workspace *ws)
: FillerOp<MetalCaffeContext>(operator_def, ws) {
auto source_values = OperatorBase::template GetRepeatedArgument<float>("values");
for (float f : source_values) {
values_.push_back(f);
}
}
bool Fill(Tensor<MetalCaffeContext> *output) override { return true; }
bool RunOnDevice() override {
auto *output = Operator<MetalCaffeContext>::Output(0);
CAFFE_ENFORCE_EQ(shape_.size(), 4);
auto shape = shape_;
int kernels = shape[0];
int kernel_channels = shape[1];
int kernel_height = shape[2];
int kernel_width = shape[3];
reformatKernelImage<weight_buffer_type>(
(const float *)values_.data(),
kernels,
kernel_channels,
kernel_width,
kernel_height,
false,
[&](int multiplier) -> weight_buffer_type * {
// buffer_size = kernels * kernel_channels * kernel_width * kernel_height * multiplier;
shape.push_back(multiplier); // multiplier = ceil(aligned_kernel_stride / kernel_stride)
output->Resize(shape);
id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void *)output->template mutable_data<uint16_t>());
weight_buffer_type *output_data = (weight_buffer_type *)[weightBuffer contents];
return output_data;
});
return true;
}
private:
vector<float> values_;
};
REGISTER_CPU_OPERATOR_WITH_ENGINE(GivenWeightTensorFill, METAL, GivenWeightTensorFillOp);
OPERATOR_SCHEMA(GivenWeightTensorFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
class GivenTransposeWeightTensorFillOp final : public FillerOp<MetalCaffeContext> {
public:
GivenTransposeWeightTensorFillOp(const OperatorDef &operator_def, Workspace *ws)
: FillerOp<MetalCaffeContext>(operator_def, ws) {
auto source_values = OperatorBase::template GetRepeatedArgument<float>("values");
for (float f : source_values) {
values_.push_back(f);
}
}
bool Fill(Tensor<MetalCaffeContext> *output) override { return true; }
bool RunOnDevice() override {
auto *output = Operator<MetalCaffeContext>::Output(0);
CAFFE_ENFORCE_EQ(shape_.size(), 4);
auto shape = shape_;
int kernels = shape[0];
int kernel_channels = shape[1];
int kernel_height = shape[2];
int kernel_width = shape[3];
std::swap(kernels, kernel_channels);
reformatKernelImage<weight_buffer_type>(
(const float *)values_.data(),
kernels,
kernel_channels,
kernel_width,
kernel_height,
true,
[&](int multiplier) -> weight_buffer_type * {
// buffer_size = kernels * kernel_channels * kernel_width * kernel_height * multiplier;
shape_.push_back(multiplier); // multiplier = ceil(aligned_kernel_stride / kernel_stride)
output->Resize(shape_);
id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void *)output->template mutable_data<uint16_t>());
weight_buffer_type *output_data = (weight_buffer_type *)[weightBuffer contents];
return output_data;
});
return true;
}
private:
vector<float> values_;
};
REGISTER_CPU_OPERATOR_WITH_ENGINE(GivenTransposeWeightTensorFill, METAL, GivenTransposeWeightTensorFillOp);
OPERATOR_SCHEMA(GivenTransposeWeightTensorFill).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
class CopyWeightTensorToMetalGPUOp final : public Operator<MetalCaffeContext> {
public:
CopyWeightTensorToMetalGPUOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<MetalCaffeContext>(operator_def, ws) {}
bool RunOnDevice() override {
const Blob *blob = Inputs()[0];
const TensorCPU &X = blob->Get<TensorCPU>();
auto *output = Operator<MetalCaffeContext>::Output(0);
auto shape = X.dims();
CAFFE_ENFORCE_EQ(shape.size(), 4);
int kernels = shape[0];
int kernel_channels = shape[1];
int kernel_height = shape[2];
int kernel_width = shape[3];
reformatKernelImage<weight_buffer_type>(
(const float *)X.template data<float>(),
kernels,
kernel_channels,
kernel_width,
kernel_height,
false,
[&](int multiplier) -> weight_buffer_type * {
// buffer_size = kernels * kernel_channels * kernel_width * kernel_height * multiplier;
shape.push_back(multiplier); // multiplier = ceil(aligned_kernel_stride / kernel_stride)
output->Resize(shape);
id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void *)output->template mutable_data<uint16_t>());
weight_buffer_type *output_data = (weight_buffer_type *)[weightBuffer contents];
return output_data;
});
return true;
}
};
REGISTER_CPU_OPERATOR_WITH_ENGINE(CopyWeightTensorToMetalGPU, METAL, CopyWeightTensorToMetalGPUOp);
OPERATOR_SCHEMA(CopyWeightTensorToMetalGPU).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
class CopyTransposeWeightTensorToMetalGPUOp final : public Operator<MetalCaffeContext> {
public:
CopyTransposeWeightTensorToMetalGPUOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<MetalCaffeContext>(operator_def, ws) {}
bool RunOnDevice() override {
const Blob *blob = Inputs()[0];
const TensorCPU &X = blob->Get<TensorCPU>();
auto *output = Operator<MetalCaffeContext>::Output(0);
auto shape_ = X.dims();
CAFFE_ENFORCE_EQ(shape_.size(), 4);
auto shape = shape_;
int kernels = shape[0];
int kernel_channels = shape[1];
int kernel_height = shape[2];
int kernel_width = shape[3];
std::swap(kernels, kernel_channels);
reformatKernelImage<weight_buffer_type>(
(const float *)X.template data<float>(),
kernels,
kernel_channels,
kernel_width,
kernel_height,
true,
[&](int multiplier) -> weight_buffer_type * {
// buffer_size = kernels * kernel_channels * kernel_width * kernel_height * multiplier;
shape_.push_back(multiplier); // multiplier = ceil(aligned_kernel_stride / kernel_stride)
output->Resize(shape_);
id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void *)output->template mutable_data<uint16_t>());
weight_buffer_type *output_data = (weight_buffer_type *)[weightBuffer contents];
return output_data;
});
return true;
}
};
REGISTER_CPU_OPERATOR_WITH_ENGINE(CopyTransposeWeightTensorToMetalGPU, METAL, CopyTransposeWeightTensorToMetalGPUOp);
OPERATOR_SCHEMA(CopyTransposeWeightTensorToMetalGPU).NumInputs(0, 1).NumOutputs(1).AllowInplace({{0, 0}});
} // namespace caffe2
#endif // CAFFE2_MOBILE

View File

@ -1,87 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/math.h"
#import <Metal/Metal.h>
namespace caffe2 {
struct MetalAllocator final : CPUAllocator {
id<MTLDevice> device;
MetalAllocator(id<MTLDevice> _device);
~MetalAllocator();
void *New(size_t nbytes) override;
void Delete(void *data) override;
id<MTLBuffer> Buffer(void *data);
};
MetalAllocator *GetMetalAllocator();
class MetalCaffeContext final {
public:
MetalCaffeContext() : random_seed_(math::randomNumberSeed()) {}
explicit MetalCaffeContext(const DeviceOption &option)
: random_seed_(option.has_random_seed() ? option.random_seed() : math::randomNumberSeed()) {
CHECK_EQ(option.device_type(), CPU);
}
~MetalCaffeContext() {}
inline void SwitchToDevice(int stream_id) {}
inline void SwitchToDevice() {
SwitchToDevice(0);
}
inline bool FinishDeviceComputation() { return true; }
inline std::mt19937 &RandGenerator() {
if (!random_generator_.get()) {
random_generator_.reset(new std::mt19937(random_seed_));
}
return *random_generator_.get();
}
static void *New(size_t nbytes);
static void Delete(void *data);
// Two copy functions that deals with cross-device copies.
template <class SrcContext, class DstContext>
inline void CopyBytes(size_t nbytes, const void *src, void *dst);
template <typename T, class SrcContext, class DstContext>
inline void Copy(size_t n, const T *src, T *dst) {
if (std::is_fundamental<T>::value) {
CopyBytes<SrcContext, DstContext>(n * sizeof(T), static_cast<const void *>(src), static_cast<void *>(dst));
} else {
for (int i = 0; i < n; ++i) {
dst[i] = src[i];
}
}
}
template <class SrcContext, class DstContext>
inline void CopyItems(const TypeMeta &meta, size_t n, const void *src, void *dst) {
if (meta.copy()) {
meta.copy()(src, dst, n);
} else {
CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
}
}
protected:
int random_seed_{1701};
std::unique_ptr<std::mt19937> random_generator_;
};
typedef Tensor<MetalCaffeContext> TensorMetal;
}

View File

@ -1,59 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#import "MetalCaffeContext.h"
#import "MetalContext.h"
namespace caffe2 {
CAFFE_KNOWN_TYPE(TensorMetal);
static NSMutableDictionary<NSNumber *, id<MTLBuffer>> *buffer_cache = nil;
// class MetalAllocator
MetalAllocator::MetalAllocator(id<MTLDevice> _device) : device(_device) {
buffer_cache = [NSMutableDictionary<NSNumber *, id<MTLBuffer>> dictionary];
}
MetalAllocator::~MetalAllocator() {
for (id key in buffer_cache) {
id<MTLBuffer> buffer = buffer_cache[key];
[buffer_cache removeObjectForKey:key];
}
}
void *MetalAllocator::New(size_t nbytes) {
id<MTLBuffer> buffer = [device newBufferWithLength:nbytes options:MTLResourceCPUCacheModeDefaultCache];
void *data = [buffer contents];
NSNumber *key = @((unsigned long long)data);
buffer_cache[key] = buffer;
return data;
}
void MetalAllocator::Delete(void *data) {
NSNumber *key = @((unsigned long long)data);
id<MTLBuffer> buffer = buffer_cache[key];
[buffer_cache removeObjectForKey:key];
buffer = nil;
}
id<MTLBuffer> MetalAllocator::Buffer(void *data) {
NSNumber *key = @((unsigned long long)data);
return buffer_cache[key];
}
// the Metal Allocator
static MetalAllocator *MetalAllocatorInstance = NULL;
// Get the Metal Allocator
MetalAllocator *GetMetalAllocator() {
if (MetalAllocatorInstance == NULL) {
MetalAllocatorInstance = new MetalAllocator([MetalContext getContext].device);
}
CAFFE_ENFORCE(MetalAllocatorInstance != NULL);
return MetalAllocatorInstance;
}
// class MetalCaffeContext
void *MetalCaffeContext::New(size_t nbytes) { return MetalAllocatorInstance->New(nbytes); }
void MetalCaffeContext::Delete(void *data) { MetalAllocatorInstance->Delete(data); }
}

View File

@ -1,17 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#import <Foundation/Foundation.h>
@protocol MTLDevice, MTLLibrary, MTLCommandQueue;
@interface MetalContext : NSObject
@property (atomic, strong) id<MTLDevice> device;
@property (atomic, strong) id<MTLLibrary> library;
@property (atomic, strong) id<MTLCommandQueue> commandQueue;
+ (instancetype)getContext;
@end

View File

@ -1,27 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#import "MetalContext.h"
#import <Metal/Metal.h>
static MetalContext* metalContext = NULL;
@implementation MetalContext
+ (instancetype)getContext {
if (metalContext == NULL) {
metalContext = [[self alloc] initWithDevice:nil];
}
return metalContext;
}
- (instancetype)initWithDevice:(id<MTLDevice>)device {
if ((self = [super init])) {
_device = device ?: MTLCreateSystemDefaultDevice();
_library = [_device newDefaultLibrary];
_commandQueue = [_device newCommandQueue];
}
return self;
}
@end

View File

@ -1,163 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#include "caffe2/core/common.h"
#ifndef CAFFE2_MOBILE
#error "Caffe2 mobile state not defined"
#endif
#if CAFFE2_MOBILE
#include "caffe2/operators/conv_pool_op_base.h"
#include "caffe2/operators/conv_transpose_unpool_op_base.h"
#import "metal_convolution.h"
#import "MetalCaffeContext.h"
namespace caffe2 {
class MetalConvMTLBufferOp final : public ConvPoolOpBase<MetalCaffeContext> {
public:
MetalConvMTLBufferOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<MetalCaffeContext>(operator_def, ws) {
OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "Metal only supports NCHW order.");
}
bool RunOnDeviceWithOrderNCHW() override;
const TensorCPU& InputFromTensorCPU(int idx) {
const Blob* blob = Inputs()[idx];
return blob->Get<TensorCPU>();
}
// Input: X, W, b
// Output: Y
INPUT_TAGS(INPUT, FILTER, BIAS);
};
bool MetalConvMTLBufferOp::RunOnDeviceWithOrderNCHW() {
const TensorMetal& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& bias = InputFromTensorCPU(BIAS);
TensorMetal* Y = Output(0);
CAFFE_ENFORCE(X.ndim() == 4, "Input dim should be 4");
const int N = X.dim32(0), C = X.dim32(1);
CAFFE_ENFORCE(filter.ndim(), 5);
const int M = filter.dim32(0);
CAFFE_ENFORCE(filter.dim32(1) == C, "");
CAFFE_ENFORCE(filter.dim32(2) == this->kernel_h_, "");
CAFFE_ENFORCE(filter.dim32(3) == this->kernel_w_, "");
CAFFE_ENFORCE(bias.ndim() == 1, "");
CAFFE_ENFORCE(bias.dim32(0) == M, "");
ConvPoolOpBase<MetalCaffeContext>::SetOutputSize(X, Y, filter.dim32(0));
id<MTLBuffer> inputDataBuffer = GetMetalAllocator()->Buffer((void*)X.template data<float>());
id<MTLBuffer> outputDataBuffer = GetMetalAllocator()->Buffer((void*)Y->template mutable_data<float>());
id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void*)filter.template data<uint16_t>());
const float* biasData = bias.template data<float>();
metal_convolution(
inputDataBuffer,
X.dim32(1),
X.dim32(3),
X.dim32(2),
stride_h_,
stride_w_,
pad_t_,
pad_l_,
pad_b_,
pad_r_,
weightBuffer,
filter.dim32(0),
filter.dim32(1),
filter.dim32(3),
filter.dim32(2),
outputDataBuffer,
Y->dim32(1),
Y->dim32(3),
Y->dim32(2),
biasData,
bias.dim32(0),
false);
return true;
}
REGISTER_CPU_OPERATOR_WITH_ENGINE(MetalConv, METAL, MetalConvMTLBufferOp);
OPERATOR_SCHEMA(MetalConv).NumInputs(3).NumOutputs(1);
class MetalConvTransposeMTLBufferOp final : public ConvTransposeUnpoolBase<MetalCaffeContext> {
public:
MetalConvTransposeMTLBufferOp(const OperatorDef& operator_def, Workspace* ws)
: ConvTransposeUnpoolBase<MetalCaffeContext>(operator_def, ws) {
OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "Metal only supports NCHW order.");
}
bool RunOnDeviceWithOrderNCHW() override;
private:
// Input: X, W, b
// Output: Y
INPUT_TAGS(INPUT, FILTER, BIAS);
const TensorCPU& InputFromTensorCPU(int idx) {
const Blob* blob = Inputs()[idx];
return blob->Get<TensorCPU>();
}
};
bool MetalConvTransposeMTLBufferOp::RunOnDeviceWithOrderNCHW() {
const TensorMetal& X = Input(INPUT);
auto& filter = Input(FILTER);
auto& bias = InputFromTensorCPU(BIAS);
TensorMetal* Y = Output(0);
const int N = X.dim32(0), M = X.dim32(1), H = X.dim32(2), W = X.dim32(3);
CAFFE_ENFORCE(filter.ndim() == 5, "filter must be 4D tensor");
CAFFE_ENFORCE(filter.dim32(0) == M, "filter number must be equal to input channel number");
const int C = filter.dim32(1);
CAFFE_ENFORCE(filter.dim32(2) == kernel_h_, "filter height must be equal to kernel height");
CAFFE_ENFORCE(filter.dim32(3) == kernel_w_, "filter width must be equal to kernel width");
CAFFE_ENFORCE(bias.ndim() == 1, "bias must be 1D tensor");
CAFFE_ENFORCE(bias.dim32(0) == C, "bias dimension must be equal to output channel number");
ConvTransposeUnpoolBase<MetalCaffeContext>::SetOutputSize(X, Y, C);
id<MTLBuffer> inputDataBuffer = GetMetalAllocator()->Buffer((void*)X.template data<float>());
id<MTLBuffer> outputDataBuffer = GetMetalAllocator()->Buffer((void*)Y->template mutable_data<float>());
id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void*)filter.template data<uint16_t>());
const float* biasData = bias.template data<float>();
metal_convolution(
inputDataBuffer,
X.dim32(1),
X.dim32(3),
X.dim32(2),
stride_h_,
stride_w_,
pad_t_,
pad_l_,
pad_b_,
pad_r_,
weightBuffer,
filter.dim32(0),
filter.dim32(1),
filter.dim32(3),
filter.dim32(2),
outputDataBuffer,
Y->dim32(1),
Y->dim32(3),
Y->dim32(2),
biasData,
bias.dim32(0),
true);
return true;
}
REGISTER_CPU_OPERATOR_WITH_ENGINE(MetalConvTranspose, METAL, MetalConvTransposeMTLBufferOp);
OPERATOR_SCHEMA(MetalConvTranspose).NumInputs(3).NumOutputs(1);
} // namespace caffe2
#endif // CAFFE2_MOBILE

View File

@ -1,39 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#import "MetalContext.h"
#import "FBMetalConstantValues.h"
#import <Foundation/Foundation.h>
#import <Metal/Metal.h>
@protocol MTLBuffer, MTLComputeCommandEncoder, MTLComputePipelineState;
@interface MetalImageFilter : NSObject
@property (nonatomic, strong) MetalContext* context;
@property (nonatomic, strong) id<MTLComputePipelineState> pipeline;
@property (atomic, strong) MTLTextureDescriptor* outputTextureDescriptor;
- (instancetype)initWithFunctionName:(NSString*)functionName
libraryName:(NSString*)libraryName
librarySource:(NSString*)librarySource
context:(MetalContext*)context
constantValues:(FBMetalConstantValues*)constantValues;
- (NSString*)replaceConstantValues:(FBMetalConstantValues *)constantValues
librarySource:(NSString*)librarySource;
- (void)configureArgumentTableWithCommandEncoder:(id<MTLComputeCommandEncoder>)commandEncoder
weightBufferOffset:(NSInteger)weightBufferOffset
outputBufferOffset:(NSInteger)outputBufferOffset;
- (void)applyFilter:(void (^)(NSError*))completionHandler;
- (void) applyFilter:(void(^)(NSError*))completionHandler
weightBufferOffset:(NSInteger)weightBufferOffset
outputBufferOffset:(NSInteger)outputBufferOffset;
@end

View File

@ -1,171 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#import "MetalImageFilter.h"
#import "MetalContext.h"
#import <TargetConditionals.h>
#import <Metal/Metal.h>
#include "caffe2/core/logging.h"
@interface MetalImageFilter ()
@property (nonatomic, strong) id<MTLFunction> kernelFunction;
@end
@implementation MetalImageFilter {
NSString* _functionName;
}
static constexpr size_t kThreadGroupSize_x = 4;
static constexpr size_t kThreadGroupSize_y = 8;
static constexpr bool kEnableGPUProfiling = false; // GPU profiling is expensive, have it off by default
#import "MetalShaderUtilities.h"
@synthesize outputTextureDescriptor = _outputTextureDescriptor;
static NSMutableDictionary<NSString*, id<MTLLibrary>>* functionLibraryCache = NULL;
- (id<MTLFunction>)newFunctionWithName:(NSString*)functionName
libraryName:(NSString*)libraryName
librarySource:(NSString*)librarySource
context:(MetalContext*)context
constantValues:(FBMetalConstantValues*)constantValues {
NSString* library_version =
constantValues ? [libraryName stringByAppendingString:[[NSString alloc]
initWithUTF8String:constantValues->to_string().c_str()]]
: libraryName;
if (functionLibraryCache == NULL) {
functionLibraryCache = [NSMutableDictionary<NSString*, id<MTLLibrary>> dictionary];
}
id<MTLLibrary> library = functionLibraryCache[library_version];
if (library == nil) {
librarySource = [self replaceConstantValues:constantValues librarySource:librarySource];
MTLCompileOptions* options = [MTLCompileOptions alloc];
options.fastMathEnabled = TRUE;
options.languageVersion = MTLLanguageVersion1_0;
NSError* error = NULL;
library = [context.device newLibraryWithSource:librarySource options:options error:&error];
if (error != nil) {
NSString* description = [[NSString alloc] init];
LOG(ERROR) << "Problems with library for " << [library_version UTF8String] << " : "
<< [[error localizedDescription] UTF8String];
}
functionLibraryCache[library_version] = library;
}
return library != nil ? [library newFunctionWithName:functionName] : nil;
}
- (NSString*)replaceConstantValues:(FBMetalConstantValues*)constantValues librarySource:(NSString*)librarySource {
return librarySource;
}
- (instancetype)initWithFunctionName:(NSString*)functionName
libraryName:(NSString*)libraryName
librarySource:(NSString*)librarySource
context:(MetalContext*)context
constantValues:(FBMetalConstantValues*)constantValues {
if ((self = [super init])) {
NSError* error = nil;
_context = context;
_kernelFunction =
[self newFunctionWithName:functionName libraryName:libraryName librarySource:librarySource context:context constantValues:constantValues];
_functionName = functionName;
_pipeline = [context.device newComputePipelineStateWithFunction:_kernelFunction error:&error];
if (!_pipeline) {
LOG(ERROR) << "Error occurred when building compute pipeline for function " << [functionName UTF8String];
return nil;
}
}
return self;
}
- (void)configureArgumentTableWithCommandEncoder:(id<MTLComputeCommandEncoder>)commandEncoder
weightBufferOffset:(NSInteger)weightBufferOffset
outputBufferOffset:(NSInteger)outputBufferOffset {
}
- (bool)checkExecution:(id<MTLCommandBuffer>)commandBuffer {
NSError* error = [commandBuffer error];
if (error != nil) {
LOG(ERROR) << "Problems with " << [self->_functionName UTF8String] << " : "
<< [[error localizedDescription] UTF8String];
return false;
}
return true;
}
- (MTLSize)threadsPerThreadgroup {
return MTLSizeMake(kThreadGroupSize_x, kThreadGroupSize_y, 1);
}
- (MTLSize)threadgroupsPerGrid {
MTLSize threadsPerThreadgroup = [self threadsPerThreadgroup];
return MTLSizeMake(
(self.outputTextureDescriptor.width + threadsPerThreadgroup.width - 1) /
threadsPerThreadgroup.width,
(self.outputTextureDescriptor.height + threadsPerThreadgroup.height - 1) /
threadsPerThreadgroup.height,
1);
}
- (void)applyFilter:(void (^)(NSError*))completionHandler {
[self applyFilter:completionHandler weightBufferOffset:0 outputBufferOffset:0];
}
- (void)applyFilter:(void (^)(NSError*))completionHandler
weightBufferOffset:(NSInteger)weightBufferOffset
outputBufferOffset:(NSInteger)outputBufferOffset {
MTLTextureDescriptor* textureDescriptor = self.outputTextureDescriptor;
if (kEnableGPUProfiling) {
[self.context.commandQueue insertDebugCaptureBoundary];
}
id<MTLCommandBuffer> commandBuffer = [self.context.commandQueue commandBuffer];
/*
* It is not obvious which grid strategy is best, maximizing the number of threadgroups
* seems to give better results, but more investigation is needed
*/
id<MTLComputeCommandEncoder> commandEncoder = [commandBuffer computeCommandEncoder];
[commandEncoder setComputePipelineState:self.pipeline];
[self configureArgumentTableWithCommandEncoder:commandEncoder
weightBufferOffset:weightBufferOffset
outputBufferOffset:outputBufferOffset];
[commandEncoder dispatchThreadgroups:[self threadgroupsPerGrid] threadsPerThreadgroup:[self threadsPerThreadgroup]];
[commandEncoder endEncoding];
[commandBuffer addCompletedHandler:^(id<MTLCommandBuffer> commandBuffer) {
if (completionHandler != NULL) {
completionHandler([commandBuffer error]);
} else {
[self checkExecution:commandBuffer];
}
}];
[commandBuffer commit];
if (kEnableGPUProfiling) {
[self.context.commandQueue insertDebugCaptureBoundary];
}
}
@end

View File

@ -1,134 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/operator.h"
#import "MetalCaffeContext.h"
#import "metal_instance_norm.h"
#ifndef CAFFE2_MOBILE
#error "Caffe2 mobile state not defined"
#endif
#if CAFFE2_MOBILE
namespace caffe2 {
class MetalInstanceNormOp final : public Operator<MetalCaffeContext> {
public:
MetalInstanceNormOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<MetalCaffeContext>(operator_def, ws),
epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5)),
order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
CAFFE_ENFORCE(epsilon_ >= 0, "Must pass a nonnegative epsilon.");
OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "Metal only supports NCHW order.");
}
bool RunOnDevice() override {
return RunOnDeviceWithOrderNCHW();
}
bool RunOnDeviceWithOrderNCHW();
protected:
float epsilon_;
StorageOrder order_;
INPUT_TAGS(INPUT, SCALE, BIAS);
OUTPUT_TAGS(OUTPUT);
};
bool MetalInstanceNormOp::RunOnDeviceWithOrderNCHW() {
const auto& X = Input(INPUT);
const auto& scale = Input(SCALE);
const auto& bias = Input(BIAS);
auto* Y = Output(OUTPUT);
const int C = X.dim32(1);
const int H = X.dim32(2);
const int W = X.dim32(3);
Y->ResizeLike(X);
const auto* Xdata = X.template data<float>();
const auto* Sdata = scale.template data<uint16_t>();
const auto* Bdata = bias.template data<uint16_t>();
auto* Ydata = Y->template mutable_data<float>();
id<MTLBuffer> inputDataBuffer = GetMetalAllocator()->Buffer((void*)Xdata);
id<MTLBuffer> scaleDataBuffer = GetMetalAllocator()->Buffer((void*)Sdata);
id<MTLBuffer> biasDataBuffer = GetMetalAllocator()->Buffer((void*)Bdata);
id<MTLBuffer> outputDataBuffer = GetMetalAllocator()->Buffer((void*)Ydata);
metal_instance_norm(inputDataBuffer, C, H, W, scaleDataBuffer, biasDataBuffer, outputDataBuffer, nil, 0, epsilon_);
return true;
}
REGISTER_CPU_OPERATOR_WITH_ENGINE(MetalInstanceNorm, METAL, MetalInstanceNormOp);
OPERATOR_SCHEMA(MetalInstanceNorm).NumInputs(3, 4).NumOutputs(1, 3).AllowInplace({{0, 0}});
class MetalInstanceNormPReluOp final : public Operator<MetalCaffeContext> {
public:
MetalInstanceNormPReluOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<MetalCaffeContext>(operator_def, ws),
epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5)),
order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
CAFFE_ENFORCE(epsilon_ >= 0, "Must pass a nonnegative epsilon.");
OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "Metal only supports NCHW order.");
}
bool RunOnDevice() override {
return RunOnDeviceWithOrderNCHW();
}
bool RunOnDeviceWithOrderNCHW();
protected:
float epsilon_;
StorageOrder order_;
INPUT_TAGS(INPUT, SCALE, BIAS, PRELU);
OUTPUT_TAGS(OUTPUT);
};
bool MetalInstanceNormPReluOp::RunOnDeviceWithOrderNCHW() {
const auto& X = Input(INPUT);
const auto& scale = Input(SCALE);
const auto& bias = Input(BIAS);
const auto& prelu = Input(PRELU);
auto* Y = Output(OUTPUT);
const int C = X.dim32(1);
const int H = X.dim32(2);
const int W = X.dim32(3);
Y->ResizeLike(X);
const auto* Xdata = X.template data<float>();
const auto* Sdata = scale.template data<uint16_t>();
const auto* Bdata = bias.template data<uint16_t>();
const auto* Pdata = prelu.template data<uint16_t>();
auto* Ydata = Y->template mutable_data<float>();
id<MTLBuffer> inputDataBuffer = GetMetalAllocator()->Buffer((void*)Xdata);
id<MTLBuffer> scaleDataBuffer = GetMetalAllocator()->Buffer((void*)Sdata);
id<MTLBuffer> biasDataBuffer = GetMetalAllocator()->Buffer((void*)Bdata);
id<MTLBuffer> preluDataBuffer = GetMetalAllocator()->Buffer((void*)Pdata);
id<MTLBuffer> outputDataBuffer = GetMetalAllocator()->Buffer((void*)Ydata);
metal_instance_norm(
inputDataBuffer,
C,
H,
W,
scaleDataBuffer,
biasDataBuffer,
outputDataBuffer,
preluDataBuffer,
prelu.size(),
epsilon_);
return true;
}
REGISTER_CPU_OPERATOR_WITH_ENGINE(MetalInstanceNormPRelu, METAL, MetalInstanceNormPReluOp);
OPERATOR_SCHEMA(MetalInstanceNormPRelu).NumInputs(3, 4).NumOutputs(1, 3).AllowInplace({{0, 0}});
} // namespace caffe2
#endif // CAFFE2_MOBILE

View File

@ -1,52 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#include "caffe2/core/operator.h"
#import "MetalCaffeContext.h"
#import "metal_prelu.h"
#ifndef CAFFE2_MOBILE
#error "Caffe2 mobile state not defined"
#endif
#if CAFFE2_MOBILE
namespace caffe2 {
class MetalPReluOp final : public Operator<MetalCaffeContext> {
public:
MetalPReluOp(const OperatorDef& operator_def, Workspace* ws)
: Operator<MetalCaffeContext>(operator_def, ws),
order_(StringToStorageOrder(OperatorBase::GetSingleArgument<string>("order", "NCHW"))) {
OPERATOR_NEEDS_FEATURE(this->order_ == StorageOrder::NCHW, "Metal only supports NCHW order.");
}
bool RunOnDevice() override;
protected:
StorageOrder order_;
};
bool MetalPReluOp::RunOnDevice() {
const auto& X = Input(0);
const auto& W = Input(1);
auto* Y = Output(0);
Y->ResizeLike(X);
const auto* Xdata = X.template data<float>();
const auto* Wdata = W.template data<uint16_t>();
auto* Ydata = Y->template mutable_data<float>();
id<MTLBuffer> inputDataBuffer = GetMetalAllocator()->Buffer((void*)Xdata);
id<MTLBuffer> weightBuffer = GetMetalAllocator()->Buffer((void*)Wdata);
id<MTLBuffer> outputDataBuffer = GetMetalAllocator()->Buffer((void*)Ydata);
metal_prelu(inputDataBuffer, X.dim32(1), X.dim32(3), X.dim32(2), weightBuffer, W.size(), outputDataBuffer);
return true;
}
REGISTER_CPU_OPERATOR_WITH_ENGINE(MetalPRelu, METAL, MetalPReluOp);
OPERATOR_SCHEMA(MetalPRelu).NumInputs(2).NumOutputs(1).AllowInplace({{0, 0}}).IdenticalTypeAndShape();
} // namespace caffe2
#endif // CAFFE2_MOBILE

View File

@ -1,11 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#import <string>
#import <sstream>
#import <functional>
std::string replace_first(std::string input, std::function<void(std::stringstream &fs)> fmt, int index);
#define REPLACE_CONSTANT(src, val, idx) { src = replace_first(src, [&](std::stringstream &fmt) { fmt << val; }, idx); }

View File

@ -1,20 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#import "MetalShaderUtilities.h"
std::string replace_first(std::string input, std::function<void(std::stringstream &fs)> fmt, int index) {
std::stringstream value_format_stream;
value_format_stream << "= ";
fmt(value_format_stream);
std::stringstream token_format_stream;
token_format_stream << "[[ function_constant(" << index << ") ]]";
size_t position = input.find(token_format_stream.str());
if (position != std::string::npos) {
size_t length = token_format_stream.str().length();
return input.replace(position, length, value_format_stream.str());
} else {
return input;
}
}

View File

@ -1,11 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#ifdef __ARM_NEON__
#import <arm_neon.h>
#else
typedef unsigned short uint16_t;
typedef uint16_t float16_t;
typedef float float32_t;
#endif

View File

@ -1,67 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#import <stdlib.h>
#import <functional>
#import "arm_neon_support.h"
#import <dispatch/dispatch.h>
template <typename T1, typename T2, void process_data(T1*, const T2*, size_t), int P>
void parallelize(T1* dst, const T2* src, size_t items, dispatch_queue_t queue) {
process_data(dst, src, items);
dispatch_apply(P, queue, ^(size_t it) {
const size_t chunk_size = it == P - 1 ? items - (P - 1) * (items / P) : items / P;
const size_t offset = it * chunk_size;
process_data(dst + offset, src + offset, chunk_size);
});
}
template <typename filter_type>
filter_type* reformatKernelImage(
const float* input_data,
int kernels,
int input_kernels,
int kernel_offset,
int kernel_stride,
int channels,
int width,
int height,
bool transposed,
std::function<filter_type*(size_t)> allocator);
template <typename filter_type>
bool reformatKernelImage(
const float* input_data,
filter_type* output_data,
int kernels,
int input_kernels,
int kernel_offset,
int kernel_stride,
int channels,
int width,
int height,
bool transposed);
template <typename filter_type>
bool reformatKernelImage(
const float* input_data,
int kernels,
int kernel_channels,
int kernel_width,
int kernel_height,
bool transposed,
std::function<filter_type*(int)> allocator);
template <typename out_buffer_type>
out_buffer_type* reformatInputImage(
const float* data,
int channels,
int width,
int height,
std::function<out_buffer_type*(size_t)> allocator);
template <typename T1, typename T2>
void memcpycvt(T1* dst, const T2* src, size_t n);

View File

@ -1,265 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#import "data_conversion.h"
#import "metal_convolution.h"
#import <stdio.h>
#import <string.h>
void memcvt_F32_F16(float32_t* dst, const float16_t* src, size_t n) {
int i = 0;
#if defined(__ARM_NEON__)
for (; i < 4 * (n / 4); i += 4) {
*((float32x4_t*)&dst[i]) = vcvt_f32_f16(*((float16x4_t*)&src[i]));
}
#endif
for (; i < n; i++) {
dst[i] = src[i];
}
}
void memcvt_F16_F32(float16_t* dst, const float32_t* src, size_t n) {
int i = 0;
#if defined(__ARM_NEON__)
for (; i < 4 * (n / 4); i += 4) {
*((float16x4_t*)&dst[i]) = vcvt_f16_f32(*((float32x4_t*)&src[i]));
}
#endif
for (; i < n; i++) {
dst[i] = src[i];
}
}
template <typename T1, typename T2>
void memcpycvt(T1* dst, const T2* src, size_t n);
template <>
void memcpycvt(float* dst, const float* src, size_t n) {
memcpy(dst, src, n * sizeof(float));
}
template <>
void memcpycvt(float16_t* dst, const float* src, size_t n) {
parallelize<float16_t, float, memcvt_F16_F32, 2>(
dst, src, n, dispatch_get_global_queue(QOS_CLASS_USER_INTERACTIVE, 0));
}
template <>
void memcpycvt(float* dst, const float16_t* src, size_t n) {
parallelize<float, float16_t, memcvt_F32_F16, 2>(
dst, src, n, dispatch_get_global_queue(QOS_CLASS_USER_INTERACTIVE, 0));
}
template <typename filter_type>
bool reformatKernelImage(
const float* input_data,
filter_type* output_data,
int kernels,
int input_kernels,
int kernel_offset,
int kernel_stride,
int channels,
int width,
int height,
bool transposed) {
const int aligned_kernel_stride = kernel_stride <= 2 ? kernel_stride : 4 * ((kernel_stride + 3) / 4);
if (output_data) {
filter_type* buffer = output_data;
for (int ks = 0; ks < kernels / kernel_stride; ks++) {
for (int kb = 0; kb < kernel_stride; kb++) {
int k = kernel_offset + kernel_stride * ks + kb;
for (int c = 0; c < channels; c++) {
for (int y = 0; y < height; y++) {
for (int x = 0; x < width; x++) {
buffer[aligned_kernel_stride * (width * height * (ks * channels + c) + y * width + x) + kb] =
transposed
? input_data
[(c * input_kernels + k) * width * height + (height - 1 - y) * height + (width - 1 - x)]
: input_data[(k * channels + c) * width * height + y * width + x];
}
}
}
}
}
return true;
}
return false;
}
template bool reformatKernelImage<float16_t>(
const float* input_data,
float16_t* output_data,
int kernels,
int input_kernels,
int kernel_offset,
int kernel_stride,
int channels,
int width,
int height,
bool transposed);
template bool reformatKernelImage<float>(
const float* input_data,
float* output_data,
int kernels,
int input_kernels,
int kernel_offset,
int kernel_stride,
int channels,
int width,
int height,
bool transposed);
template <typename filter_type>
filter_type* reformatKernelImage(
const float* input_data,
int kernels,
int input_kernels,
int kernel_offset,
int kernel_stride,
int channels,
int width,
int height,
bool transposed,
std::function<filter_type*(size_t)> allocator) {
const int aligned_kernel_stride = kernel_stride <= 2 ? kernel_stride : 4 * ((kernel_stride + 3) / 4);
const int buffer_size = aligned_kernel_stride * (kernels / kernel_stride) * channels * width * height;
filter_type* output_data = allocator(sizeof(filter_type) * buffer_size);
reformatKernelImage(
input_data,
output_data,
kernels,
input_kernels,
kernel_offset,
kernel_stride,
channels,
width,
height,
transposed);
return output_data;
}
template float16_t* reformatKernelImage<float16_t>(
const float* input_data,
int kernels,
int input_kernels,
int kernel_offset,
int kernel_stride,
int channels,
int width,
int height,
bool transposed,
std::function<float16_t*(size_t)> allocator);
template float* reformatKernelImage<float>(
const float* input_data,
int kernels,
int input_kernels,
int kernel_offset,
int kernel_stride,
int channels,
int width,
int height,
bool transposed,
std::function<float*(size_t)> allocator);
template <typename filter_type>
bool reformatKernelImage(
const float* input_data,
int kernels,
int kernel_channels,
int kernel_width,
int kernel_height,
bool transposed,
std::function<filter_type*(int)> allocator) {
int kernels_per_convolution = kernels;
if (!calculate_kernels_per_convolution(kernels_per_convolution)) {
return false;
}
const int convolutions = kernels / kernels_per_convolution;
const int kernel_stride = kernels_per_convolution;
const int aligned_kernel_stride = kernel_stride <= 2 ? kernel_stride : 4 * ((kernel_stride + 3) / 4);
const int chunk_size = aligned_kernel_stride * (kernels_per_convolution / kernel_stride) * kernel_channels *
kernel_width * kernel_height;
// This will allocate more memory than needed, but I couldn't find a better solution
filter_type* output_data = allocator((aligned_kernel_stride + kernel_stride - 1) / kernel_stride);
for (int c = 0; c < convolutions; c++) {
if (!reformatKernelImage(
input_data,
output_data + c * chunk_size,
kernels_per_convolution,
kernels,
c * kernels_per_convolution,
kernel_stride,
kernel_channels,
kernel_width,
kernel_height,
transposed))
return false;
}
return true;
}
template bool reformatKernelImage<float16_t>(
const float* input_data,
int kernels,
int kernel_channels,
int kernel_width,
int kernel_height,
bool transposed,
std::function<float16_t*(int)> allocator);
template bool reformatKernelImage<float>(
const float* input_data,
int kernels,
int kernel_channels,
int kernel_width,
int kernel_height,
bool transposed,
std::function<float*(int)> allocator);
template <typename out_buffer_type>
out_buffer_type* reformatInputImage(
const float* data,
int channels,
int width,
int height,
std::function<out_buffer_type*(size_t)> allocator) {
const int buffer_size = channels * width * height;
out_buffer_type* output_data = allocator(sizeof(out_buffer_type) * buffer_size);
if (output_data) {
memcpycvt(output_data, data, buffer_size);
}
return output_data;
}
template float16_t* reformatInputImage<float16_t>(
const float* data,
int channels,
int width,
int height,
std::function<float16_t*(size_t)> allocator);
template float* reformatInputImage<float>(
const float* data,
int channels,
int width,
int height,
std::function<float*(size_t)> allocator);

View File

@ -1,31 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#import <Metal/Metal.h>
bool calculate_kernels_per_convolution(int& kernels_per_convolution);
bool metal_convolution(
id<MTLBuffer> inputBuffer,
int input_channels,
int input_width,
int input_height,
int input_stride_x,
int input_stride_y,
int input_pad_t,
int input_pad_l,
int input_pad_b,
int input_pad_r,
id<MTLBuffer> weightBuffer,
int output_channels,
int kernel_channels,
int kernel_width,
int kernel_height,
id<MTLBuffer> outputBuffer,
int output_number,
int output_width,
int output_height,
const float* bias,
int bias_length,
bool transposed);

View File

@ -1,140 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#import "arm_neon_support.h"
#import "FBMetalCNNConvolution.h"
#import "metal_convolution.h"
#include "caffe2/core/logging.h"
@interface MetalConvolutionCacheEntry : NSObject
@property (nonatomic, strong) NSMutableArray<FBMetalCNNConvolution*>* convolutions;
@end
@implementation MetalConvolutionCacheEntry
@end
static MetalContext* metalContext = NULL;
static NSMutableDictionary<NSString*, MetalConvolutionCacheEntry*>* convolutionCache = NULL;
static void init_metal_pipeline() {
if (metalContext == NULL) {
metalContext = [MetalContext getContext];
convolutionCache = [NSMutableDictionary<NSString*, MetalConvolutionCacheEntry*> dictionary];
}
}
bool calculate_kernels_per_convolution(int& kernels_per_convolution) {
while (kernels_per_convolution > MAX_KERNELS_PER_CONVOLUTION) {
if (kernels_per_convolution % 3 == 0)
kernels_per_convolution /= 3;
else if (kernels_per_convolution % 2 == 0)
kernels_per_convolution /= 2;
else {
LOG(ERROR) << "The number of output channels must be a multiple of 2 or 3\n";
return false;
}
}
return true;
}
bool metal_convolution(
id<MTLBuffer> inputBuffer,
int input_channels,
int input_width,
int input_height,
int input_stride_x,
int input_stride_y,
int input_pad_t,
int input_pad_l,
int input_pad_b,
int input_pad_r,
id<MTLBuffer> weightBuffer,
int output_channels,
int kernel_channels,
int kernel_width,
int kernel_height,
id<MTLBuffer> outputBuffer,
int output_number,
int output_width,
int output_height,
const float* bias,
int bias_length,
bool transposed) {
init_metal_pipeline();
if (transposed) {
int t = output_channels;
output_channels = kernel_channels;
kernel_channels = t;
}
int output_batch_size = output_channels;
if (!calculate_kernels_per_convolution(output_batch_size)) {
return false;
}
FBMetalCNNConstantValues constantValues = FBMetalCNNConstantValues(
input_width,
input_height,
input_channels,
input_stride_x,
input_stride_y,
input_pad_t,
input_pad_l,
input_pad_b,
input_pad_r,
kernel_width,
kernel_height,
output_width,
output_height,
output_batch_size,
transposed);
NSString* key = [NSString stringWithUTF8String:constantValues.to_string().c_str()];
const int batches = output_channels / output_batch_size;
MetalConvolutionCacheEntry* cc = convolutionCache[key];
if (cc == NULL) {
// printf("metal_convolution_mtlbuffer: %s\n", [key UTF8String]);
convolutionCache[key] = cc = [[MetalConvolutionCacheEntry alloc] init];
cc.convolutions = [NSMutableArray<FBMetalCNNConvolution*> arrayWithCapacity:batches];
for (int c = 0; c < batches; c++) {
FBMetalCNNConvolution* convolution = [FBMetalCNNConvolution filterWithContext:metalContext
channels:input_channels
kernel_size:kernel_height
constantValues:&constantValues
width:output_width
height:output_height
stride_x:input_stride_x
stride_y:input_stride_y];
cc.convolutions[c] = convolution;
}
}
for (int c = 0; c < batches; c++) {
FBMetalCNNConvolution* convolution = cc.convolutions[c];
[convolution loadBiasData:&bias[output_batch_size * c] length:(NSUInteger)output_batch_size];
convolution.dataBuffer = inputBuffer;
convolution.outputBuffer = outputBuffer;
convolution.weightBuffer = weightBuffer;
const int aligned_kernel_stride = output_batch_size <= 2 ? output_batch_size : 4 * ((output_batch_size + 3) / 4);
const int buffer_size = aligned_kernel_stride * (output_batch_size / output_batch_size) * kernel_channels *
kernel_width * kernel_height;
int weightBuffer_offset = c * sizeof(weight_buffer_type) * buffer_size;
int outputBuffer_offset = c * sizeof(data_buffer_type) * output_batch_size * output_width * output_height;
[convolution applyFilter:(void (^)(NSError* error)) nullptr
weightBufferOffset:weightBuffer_offset
outputBufferOffset:outputBuffer_offset];
}
return true;
}

View File

@ -1,17 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#import <Metal/Metal.h>
bool metal_instance_norm(
id<MTLBuffer> inputBuffer,
int input_channels,
int input_width,
int input_height,
id<MTLBuffer> scaleDataBuffer,
id<MTLBuffer> biasDataBuffer,
id<MTLBuffer> outputBuffer,
id<MTLBuffer> preluBuffer,
int prelu_length,
float epsilon_);

View File

@ -1,77 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#import "arm_neon_support.h"
#import "MetalContext.h"
#import "FBMetalCNNInstanceNorm.h"
#import <Metal/Metal.h>
@interface InstanceNormCacheEntry : NSObject
@property (nonatomic, strong) FBMetalCNNInstanceNorm* instanceNorm;
@end
@implementation InstanceNormCacheEntry
@end
static MetalContext* metalContext = NULL;
static NSMutableDictionary<NSString*, InstanceNormCacheEntry*>* instanceNormCache = NULL;
static void init_metal_pipeline() {
if (metalContext == NULL) {
metalContext = [MetalContext getContext];
instanceNormCache = [NSMutableDictionary<NSString*, InstanceNormCacheEntry*> dictionary];
}
}
bool metal_instance_norm(
id<MTLBuffer> inputBuffer,
int input_channels,
int input_width,
int input_height,
id<MTLBuffer> scaleDataBuffer,
id<MTLBuffer> biasDataBuffer,
id<MTLBuffer> outputBuffer,
id<MTLBuffer> preluBuffer,
int prelu_size,
float epsilon_) {
init_metal_pipeline();
NSString* key = [NSString stringWithFormat:@"X:%d:%d:%d-P:%d", input_channels, input_width, input_height, prelu_size];
InstanceNormCacheEntry* cc = instanceNormCache[key];
if (cc == NULL) {
cc = [[InstanceNormCacheEntry alloc] init];
instanceNormCache[key] = cc;
FBMetalInstanceNormConstantValues constantValues =
FBMetalInstanceNormConstantValues(input_width, input_height, input_channels, prelu_size);
id<MTLBuffer> avgBuffer =
[metalContext.device newBufferWithLength:sizeof(float) * input_channels options:MTLStorageModeShared];
id<MTLBuffer> stdevBuffer =
[metalContext.device newBufferWithLength:sizeof(float) * input_channels options:MTLStorageModeShared];
FBMetalCNNInstanceNorm* instanceNorm = [FBMetalCNNInstanceNorm filterWithContext:metalContext
constantValues:&constantValues
width:input_width
height:input_height
channel:input_channels
withPRelu:preluBuffer != nil];
instanceNorm.avgBuffer = avgBuffer;
instanceNorm.stdevBuffer = stdevBuffer;
[instanceNorm loadEpsilon:epsilon_];
cc.instanceNorm = instanceNorm;
}
FBMetalCNNInstanceNorm* instanceNorm = cc.instanceNorm;
instanceNorm.dataBuffer = inputBuffer;
instanceNorm.outputBuffer = outputBuffer;
instanceNorm.scaleBuffer = scaleDataBuffer;
instanceNorm.biasBuffer = biasDataBuffer;
instanceNorm.preluBuffer = preluBuffer;
[instanceNorm applyFilter:(void (^)(NSError* error)) nullptr];
return true;
}

View File

@ -1,12 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
bool metal_prelu(
id<MTLBuffer> inputBuffer,
int input_channels,
int input_width,
int input_height,
id<MTLBuffer> weightBuffer,
int weight_length,
id<MTLBuffer> outputBuffer);

View File

@ -1,63 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#import "arm_neon_support.h"
#import "MetalContext.h"
#import "FBMetalCNNPRelu.h"
#import "metal_prelu.h"
#include "caffe2/core/logging.h"
@interface PReluCacheEntry : NSObject
@property (nonatomic, strong) FBMetalCNNPRelu* prelu;
@end
@implementation PReluCacheEntry
@end
static MetalContext* metalContext = NULL;
static NSMutableDictionary<NSString*, PReluCacheEntry*>* preluCache = NULL;
static void init_metal_pipeline() {
if (metalContext == NULL) {
metalContext = [MetalContext getContext];
preluCache = [NSMutableDictionary<NSString*, PReluCacheEntry*> dictionary];
}
}
bool metal_prelu(
id<MTLBuffer> inputBuffer,
int input_channels,
int input_width,
int input_height,
id<MTLBuffer> weightBuffer,
int weight_length,
id<MTLBuffer> outputBuffer) {
init_metal_pipeline();
NSString* key =
[NSString stringWithFormat:@"X:%d:%d:%d-F:%d", input_channels, input_width, input_height, weight_length];
PReluCacheEntry* cc = preluCache[key];
if (cc == NULL) {
preluCache[key] = cc = [[PReluCacheEntry alloc] init];
FBMetalPReluConstantValues constantValues =
FBMetalPReluConstantValues(input_width, input_height, input_channels, weight_length);
cc.prelu = [FBMetalCNNPRelu filterWithContext:metalContext
constantValues:&constantValues
width:input_width
height:input_height
channel:input_channels];
}
FBMetalCNNPRelu* prelu = cc.prelu;
prelu.dataBuffer = inputBuffer;
prelu.outputBuffer = outputBuffer;
prelu.weightBuffer = weightBuffer;
[prelu applyFilter:(void (^)(NSError* error)) nullptr];
return true;
}

View File

@ -1,5 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
bool metal_sync_op();

View File

@ -1,36 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#import "arm_neon_support.h"
#import "MetalContext.h"
#import "FBMetalCNNNoOp.h"
#import "metal_sync_op.h"
static MetalContext* metalContext = NULL;
static FBMetalCNNNoOp* noOpCache = NULL;
static void init_metal_pipeline() {
if (metalContext == NULL) {
metalContext = [MetalContext getContext];
}
}
bool metal_sync_op() {
init_metal_pipeline();
if (noOpCache == NULL) {
noOpCache = [FBMetalCNNNoOp filterWithContext:metalContext
width:1
height:1];
}
static dispatch_semaphore_t gpu_execution_done = NULL;
if (gpu_execution_done == NULL)
gpu_execution_done = dispatch_semaphore_create(0);
[noOpCache applyFilter:^(NSError* error) {
dispatch_semaphore_signal(gpu_execution_done);
}];
dispatch_semaphore_wait(gpu_execution_done, DISPATCH_TIME_FOREVER);
return true;
}

View File

@ -1,883 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#include "caffe2/core/logging.h"
#include "caffe2/core/workspace.h"
#include "metal_test.h"
#include "rewrite_net.h"
#define DEBUGGING false
namespace caffe2 {
void testMetalCopyOps(int N, int C, int H, int W, float error) {
LOG(INFO) << "MetalCopyFrom/To Test";
Workspace ws;
{
auto *t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
t->Resize(N, C, H, W);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
}
NetDef netdef;
{
auto &op = *(netdef.add_op());
op.set_type("CopyToMetalGPU");
op.add_input("X_cpu");
op.add_output("X_mtl");
op.set_engine("METAL");
}
{
auto &op = *(netdef.add_op());
op.set_type("CopyFromMetalGPU");
op.add_input("X_mtl");
op.add_output("Y_cpu");
}
ws.RunNetOnce(netdef);
const auto &t1 = ws.GetBlob("X_cpu")->Get<TensorCPU>();
const auto &t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
// for (auto i = 0; i < t1.size(); ++i) {
// LOG(INFO) << "i: " << i << ", CPU: " << t1.data<float>()[i] << ", MTL: " << t2.data<float>()[i];
// }
for (auto i = 0; i < t1.size(); ++i) {
CHECK_NEAR(t1.data<float>()[i], t2.data<float>()[i], error);
}
}
void testMetalInstanceNorm(int N, int C, int H, int W, float error) {
LOG(INFO) << "MetalInstanceNorm Test";
Workspace ws;
{
auto *t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
t->Resize(N, C, H, W);
CPUContext ctx;
// Too noisy.
math::RandGaussian<float, CPUContext>(t->size(), 0, 3, t->mutable_data<float>(), &ctx);
}
{
auto *t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
t->Resize(C);
CPUContext ctx;
for (auto i = 0; i < t->size(); ++i) {
t->mutable_data<float>()[i] = i;
}
}
{
auto *t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
t->Resize(C);
CPUContext ctx;
for (auto i = 0; i < t->size(); ++i) {
t->mutable_data<float>()[i] = 8 - 2 * i;
}
}
NetDef netdef;
{
auto &op = *(netdef.add_op());
op.set_type("CopyToMetalGPU");
op.add_input("X_cpu");
op.add_output("X_mtl");
op.set_engine("METAL");
}
{
auto &op = *(netdef.add_op());
op.set_type("CopyToMetalGPUFloat16");
op.add_input("W");
op.add_output("W_mtl");
op.set_engine("METAL");
}
{
auto &op = *(netdef.add_op());
op.set_type("CopyToMetalGPUFloat16");
op.add_input("b");
op.add_output("b_mtl");
op.set_engine("METAL");
}
{
auto &op = *(netdef.add_op());
op.set_type("MetalInstanceNorm");
op.add_input("X_mtl");
op.add_input("W_mtl");
op.add_input("b_mtl");
op.add_output("Y_mtl");
op.set_engine("METAL");
}
{
auto &op = *(netdef.add_op());
op.set_type("CopyFromMetalGPU");
op.add_input("Y_mtl");
op.add_output("Y_cpu");
}
{
auto &op = *(netdef.add_op());
op.set_type("InstanceNorm");
op.add_input("X_cpu");
op.add_input("W");
op.add_input("b");
auto &arg = *(op.add_arg());
arg.set_name("order");
arg.set_s("NCHW");
op.add_output("Y_ref");
}
ws.RunNetOnce(netdef);
const auto &t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
const auto &t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
for (auto i = 0; i < t1.size(); ++i) {
const float t1_i = t1.data<float>()[i];
const float t2_i = t2.data<float>()[i];
CHECK_NEAR(t1_i, t2_i, error);
}
}
void testMetalPRelu(int N, int C, int H, int W, int K, float error) {
LOG(INFO) << "MetalPRelu Test";
Workspace ws;
{
auto *t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
t->Resize(N, C, H, W);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
}
{
auto *t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
t->Resize(K);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
}
// The MetalPRelu is an in-place operator
NetDef netdef;
{
auto &op = *(netdef.add_op());
op.set_type("CopyToMetalGPU");
op.add_input("X_cpu");
op.add_output("X_mtl");
op.set_engine("METAL");
}
{
auto &op = *(netdef.add_op());
op.set_type("CopyToMetalGPUFloat16");
op.add_input("b");
op.add_output("b_mtl");
op.set_engine("METAL");
}
{
auto &op = *(netdef.add_op());
op.set_type("MetalPRelu");
op.add_input("X_mtl");
op.add_input("b_mtl");
op.add_output("X_mtl");
op.set_engine("METAL");
}
{
auto &op = *(netdef.add_op());
op.set_type("CopyFromMetalGPU");
op.add_input("X_mtl");
op.add_output("Y_cpu");
}
{
auto &op = *(netdef.add_op());
op.set_type("PRelu");
op.add_input("X_cpu");
op.add_input("b");
auto &arg = *(op.add_arg());
arg.set_name("order");
arg.set_s("NCHW");
op.add_output("Y_ref");
}
ws.RunNetOnce(netdef);
const auto &t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
const auto &t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
for (auto i = 0; i < t1.size(); ++i) {
const float t1_i = t1.data<float>()[i];
const float t2_i = t2.data<float>()[i];
CHECK_NEAR(t1_i, t2_i, error);
}
}
void testMetalInstanceNormPRelu(int N, int C, int H, int W, float error) {
LOG(INFO) << "MetalInstanceNormPRelu Test";
Workspace ws;
{
auto *t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
t->Resize(N, C, H, W);
CPUContext ctx;
// Too noisy.
math::RandGaussian<float, CPUContext>(t->size(), 0, 3, t->mutable_data<float>(), &ctx);
}
{
auto *t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
t->Resize(C);
CPUContext ctx;
for (auto i = 0; i < t->size(); ++i) {
t->mutable_data<float>()[i] = i;
}
}
{
auto *t = ws.CreateBlob("b1")->GetMutable<TensorCPU>();
t->Resize(C);
CPUContext ctx;
for (auto i = 0; i < t->size(); ++i) {
t->mutable_data<float>()[i] = 8 - 2 * i;
}
}
// bias for PRelu
{
auto *t = ws.CreateBlob("b2")->GetMutable<TensorCPU>();
t->Resize(C);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
}
NetDef netdef;
{
auto &op = *(netdef.add_op());
op.set_type("CopyToMetalGPU");
op.add_input("X_cpu");
op.add_output("X_mtl");
op.set_engine("METAL");
}
{
auto &op = *(netdef.add_op());
op.set_type("CopyToMetalGPUFloat16");
op.add_input("W");
op.add_output("W_mtl");
op.set_engine("METAL");
}
{
auto &op = *(netdef.add_op());
op.set_type("CopyToMetalGPUFloat16");
op.add_input("b1");
op.add_output("b1_mtl");
op.set_engine("METAL");
}
{
auto &op = *(netdef.add_op());
op.set_type("CopyToMetalGPUFloat16");
op.add_input("b2");
op.add_output("b2_mtl");
op.set_engine("METAL");
}
{
auto &op = *(netdef.add_op());
op.set_type("MetalInstanceNormPRelu");
op.add_input("X_mtl");
op.add_input("W_mtl");
op.add_input("b1_mtl");
op.add_input("b2_mtl");
op.add_output("Y_mtl");
op.set_engine("METAL");
}
{
auto &op = *(netdef.add_op());
op.set_type("CopyFromMetalGPU");
op.add_input("Y_mtl");
op.add_output("Y_cpu");
}
{
auto &op = *(netdef.add_op());
op.set_type("InstanceNorm");
op.add_input("X_cpu");
op.add_input("W");
op.add_input("b1");
auto &arg = *(op.add_arg());
arg.set_name("order");
arg.set_s("NCHW");
op.add_output("Y_mid");
}
{
auto &op = *(netdef.add_op());
op.set_type("PRelu");
op.add_input("Y_mid");
op.add_input("b2");
auto &arg = *(op.add_arg());
arg.set_name("order");
arg.set_s("NCHW");
op.add_output("Y_ref");
}
ws.RunNetOnce(netdef);
const auto &t1 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
const auto &t2 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
for (auto i = 0; i < t1.size(); ++i) {
const float t1_i = t1.data<float>()[i];
const float t2_i = t2.data<float>()[i];
CHECK_NEAR(t1_i, t2_i, error);
}
}
void testMetalConv(int N, int C, int H, int W, int K, int kernel_h, int kernel_w, int pad, float error) {
LOG(INFO) << "MetalConv Test";
Workspace ws;
{
auto *t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
t->Resize(N, C, H, W);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
}
{
auto *t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
t->Resize(K, C, kernel_h, kernel_w);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
}
{
auto *t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
t->Resize(K);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
}
NetDef netdef;
{
auto &op = *(netdef.add_op());
op.set_type("CopyToMetalGPU");
op.add_input("X_cpu");
op.add_output("X_mtl");
op.set_engine("METAL");
}
{
auto &op = *(netdef.add_op());
op.set_type("CopyWeightTensorToMetalGPU");
op.add_input("W");
op.add_output("W_mtl");
op.set_engine("METAL");
}
{
auto &op = *(netdef.add_op());
op.set_type("MetalConv");
op.add_input("X_mtl");
op.add_input("W_mtl");
op.add_input("b");
op.set_engine("METAL");
{
auto &arg = *(op.add_arg());
arg.set_name("order");
arg.set_s("NCHW");
}
{
auto &arg = *(op.add_arg());
arg.set_name("kernel");
arg.set_i(kernel_h);
}
{
auto &arg = *(op.add_arg());
arg.set_name("pad");
arg.set_i(pad);
}
op.add_output("Y_mtl");
}
{
auto &op = *(netdef.add_op());
op.set_type("CopyFromMetalGPU");
op.add_input("Y_mtl");
op.add_output("Y_cpu");
}
{
auto &op = *(netdef.add_op());
op.set_type("Conv");
op.add_input("X_cpu");
op.add_input("W");
op.add_input("b");
{
auto &arg = *(op.add_arg());
arg.set_name("order");
arg.set_s("NCHW");
}
{
auto &arg = *(op.add_arg());
arg.set_name("kernel");
arg.set_i(kernel_h);
}
{
auto &arg = *(op.add_arg());
arg.set_name("pad");
arg.set_i(pad);
}
op.add_output("Y_ref");
}
ws.RunNetOnce(netdef);
const auto &t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
const auto &t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
// for (auto i = 0; i < t1.size(); ++i) {
// const float t1_i = t1.data<float>()[i];
// const float t2_i = t2.data<float>()[i];
// if (std::abs(t1_i - t2_i) > 0.5) {
// LOG(INFO) << "i: " << i << ", CPU: " << t1_i << ", MTL: " << t2_i << ", error: " << std::abs(t1_i - t2_i) / t1_i;
// }
// }
for (auto i = 0; i < t1.size(); ++i) {
// FP16 <-> FP32 round trip, accumulation, etc.
const float t1_i = t1.data<float>()[i];
const float t2_i = t2.data<float>()[i];
CHECK_NEAR(t1_i, t2_i, error);
}
}
void testMetalConvTranspose(
int N, int C, int H, int W, int K, int kernel_h, int kernel_w, int pad, int stride, float error) {
LOG(INFO) << "MetalConvTranspose Test";
Workspace ws;
{
auto *t = ws.CreateBlob("X_cpu")->GetMutable<TensorCPU>();
t->Resize(N, C, H, W);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
}
{
auto *t = ws.CreateBlob("W")->GetMutable<TensorCPU>();
t->Resize(K, C, kernel_h, kernel_w);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
}
{
auto *t = ws.CreateBlob("b")->GetMutable<TensorCPU>();
t->Resize(C);
CPUContext ctx;
math::RandGaussian<float, CPUContext>(t->size(), 0, 1, t->mutable_data<float>(), &ctx);
}
NetDef netdef;
{
auto &op = *(netdef.add_op());
op.set_type("CopyToMetalGPU");
op.add_input("X_cpu");
op.add_output("X_mtl");
op.set_engine("METAL");
}
{
auto &op = *(netdef.add_op());
op.set_type("CopyTransposeWeightTensorToMetalGPU");
op.add_input("W");
op.add_output("W_mtl");
op.set_engine("METAL");
}
{
auto &op = *(netdef.add_op());
op.set_type("MetalConvTranspose");
op.add_input("X_mtl");
op.add_input("W_mtl");
op.add_input("b");
op.set_engine("METAL");
{
auto &arg = *(op.add_arg());
arg.set_name("order");
arg.set_s("NCHW");
}
{
auto &arg = *(op.add_arg());
arg.set_name("kernel");
arg.set_i(kernel_h);
}
{
auto &arg = *(op.add_arg());
arg.set_name("pad");
arg.set_i(pad);
}
{
auto &arg = *(op.add_arg());
arg.set_name("stride");
arg.set_i(stride);
}
op.add_output("Y_mtl");
}
{
auto &op = *(netdef.add_op());
op.set_type("CopyFromMetalGPU");
op.add_input("Y_mtl");
op.add_output("Y_cpu");
}
{
auto &op = *(netdef.add_op());
op.set_type("ConvTranspose");
op.add_input("X_cpu");
op.add_input("W");
op.add_input("b");
{
auto &arg = *(op.add_arg());
arg.set_name("order");
arg.set_s("NCHW");
}
{
auto &arg = *(op.add_arg());
arg.set_name("kernel");
arg.set_i(kernel_h);
}
{
auto &arg = *(op.add_arg());
arg.set_name("pad");
arg.set_i(pad);
}
{
auto &arg = *(op.add_arg());
arg.set_name("stride");
arg.set_i(stride);
}
op.add_output("Y_ref");
}
ws.RunNetOnce(netdef);
const auto &t2 = ws.GetBlob("Y_cpu")->Get<TensorCPU>();
const auto &t1 = ws.GetBlob("Y_ref")->Get<TensorCPU>();
CAFFE_ENFORCE_EQ(t1.dims(), t2.dims());
LOG(INFO) << "N: " << t1.dim(0) << " C: " << t1.dim(1) << " H: " << t1.dim(2) << " W: " << t1.dim(3);
#if DEBUGGING
for (auto i = 0; i < t1.size(); ++i) {
const float t1_i = t1.data<float>()[i];
const float t2_i = t2.data<float>()[i];
if (std::abs(t1_i - t2_i) > error) {
LOG(INFO) << "i: " << i << ", CPU: " << t1_i << ", MTL: " << t2_i
<< ", relative error: " << 100 * std::abs(t1_i - t2_i) / t1_i << "%";
}
}
printf("CPU:");
for (int i = 0; i < t1.size(); i++) {
const float t1_i = t1.data<float>()[i];
if (i % t1.dim(2) == 0)
printf("\n");
printf("%f\t", t1_i);
}
printf("\nMETAL:");
for (int i = 0; i < t1.size(); i++) {
const float t2_i = t2.data<float>()[i];
if (i % t2.dim(2) == 0)
printf("\n");
printf("%f\t", t2_i);
}
printf("\n");
#endif
for (auto i = 0; i < t1.size(); ++i) {
// FP16 <-> FP32 round trip, accumulation, etc.
const float t1_i = t1.data<float>()[i];
const float t2_i = t2.data<float>()[i];
CHECK_NEAR(t1_i, t2_i, error);
}
}
void testMetalRewriteWithFusion() {
for (const auto &computeOp : std::vector<std::string>{"InstanceNorm"}) {
LOG(INFO) << "RewriteForMetal Fusion/Copy Test";
NetDef netdef;
netdef.add_external_input("X");
netdef.add_external_output("Y");
// These two ops can be fused.
{
auto &op = *(netdef.add_op());
op.set_type(computeOp);
op.add_input("X");
op.add_input("W");
op.add_input("b1");
op.add_output("Y");
}
{
auto &op = *(netdef.add_op());
op.set_type("PRelu");
op.add_input("Y");
op.add_input("b2");
op.add_output("Y");
}
// Can't fuse these as not in-place (can fix by using SSA).
{
auto &op = *(netdef.add_op());
op.set_type(computeOp);
op.add_input("X2");
op.add_input("W");
op.add_input("b1");
op.add_output("Y2");
}
{
auto &op = *(netdef.add_op());
op.set_type("PRelu");
op.add_input("Y2");
op.add_input("b2");
op.add_output("Y");
}
netdef = rewritePredictNetForMetal(netdef, "METAL");
// dumpDef(netdef);
auto ty = [&](size_t i) { return netdef.op(i).type(); };
auto i0 = [&](size_t i) { return netdef.op(i).input(0); };
auto o0 = [&](size_t i) { return netdef.op(i).output(0); };
CHECK_EQ(netdef.op_size(), 5);
CHECK_EQ(ty(0), "CopyToMetalGPU");
CHECK_EQ(ty(1), std::string("Metal") + computeOp + std::string("PRelu"));
CHECK_EQ(ty(2), std::string("Metal") + computeOp);
CHECK_EQ(ty(3), "MetalPRelu");
CHECK_EQ(ty(4), "CopyFromMetalGPU");
CHECK_EQ(i0(0), "X");
CHECK_EQ(i0(1), o0(0));
CHECK_EQ(o0(2), "Y2_M");
CHECK_EQ(i0(3), o0(2));
CHECK_EQ(i0(4), o0(3));
CHECK_NE(o0(4), i0(4));
CHECK_EQ(netdef.external_input(0), "X");
CHECK_EQ(netdef.external_output(0), "Y");
}
}
void testMetalRewriteWithMultiInputCPUOps() {
LOG(INFO) << "RewriteForMetal Test";
NetDef netdef;
netdef.add_external_input("X");
netdef.add_external_output("Y");
// These two ops can be fused.
{
auto &op = *(netdef.add_op());
op.set_type("Conv");
op.add_input("X");
op.add_input("W");
op.add_input("b");
op.add_output("Y1");
}
{
auto &op = *(netdef.add_op());
op.set_type("Conv");
op.add_input("Y1");
op.add_input("W");
op.add_input("b");
op.add_output("Y2");
}
{
auto &op = *(netdef.add_op());
op.set_type("Add");
op.add_input("Y1");
op.add_input("Y2");
op.add_output("Y");
}
netdef = rewritePredictNetForMetal(netdef, "METAL");
// dumpDef(netdef);
auto ty = [&](size_t i) { return netdef.op(i).type(); };
auto i0 = [&](size_t i) { return netdef.op(i).input(0); };
auto i1 = [&](size_t i) { return netdef.op(i).input(1); };
auto o0 = [&](size_t i) { return netdef.op(i).output(0); };
CHECK_EQ(netdef.op_size(), 6);
CHECK_EQ(ty(0), "CopyToMetalGPU");
CHECK_EQ(ty(1), "MetalConv");
CHECK_EQ(ty(2), "MetalConv");
CHECK_EQ(ty(3), "CopyFromMetalGPU");
CHECK_EQ(ty(4), "CopyFromMetalGPU");
CHECK_EQ(ty(5), "Add");
CHECK_EQ(i0(0), "X");
CHECK_EQ(i0(1), o0(0));
CHECK_EQ(o0(2), "Y2_M");
CHECK_EQ(i0(3), o0(1));
CHECK_EQ(i0(4), o0(2));
CHECK_EQ(i0(5), o0(3));
CHECK_EQ(i1(5), o0(4));
CHECK_EQ(o0(5), "Y");
CHECK_EQ(netdef.external_input(0), "X");
CHECK_EQ(netdef.external_output(0), "Y");
}
void testMetalRewriteFailure() {
LOG(INFO) << "RewriteForMetal Failure Test";
NetDef netdef;
netdef.add_external_input("X");
netdef.add_external_output("Y");
{
auto &op = *(netdef.add_op());
op.set_type("Conv");
op.add_input("X");
op.add_input("W");
op.add_input("b");
op.add_output("Y1");
}
{
auto &op = *(netdef.add_op());
op.set_type("Conv");
op.add_input("X");
op.add_input("W");
op.add_input("b");
op.add_output("Y2");
}
{
auto &op = *(netdef.add_op());
op.set_type("Concat");
op.add_input("Y1");
op.add_input("Y2");
op.add_output("Y");
}
try {;
netdef = rewritePredictNetForMetal(netdef, "METAL");
// dumpDef(netdef);
CHECK(false) << "Shouldn't reach here, due to multiple usages of X";
} catch (const std::exception &e) {
LOG(INFO) << "RewriteForMetal failed";
}
}
void testMetal() {
testMetalCopyOps(1, 3, 2, 1, 1e-2);
testMetalCopyOps(1, 4, 1, 1, 1e-2);
testMetalCopyOps(1, 4, 8, 3, 1e-2);
testMetalCopyOps(1, 6, 8, 3, 1e-2);
testMetalCopyOps(1, 4, 1, 2, 1e-2);
testMetalCopyOps(1, 8, 6, 1, 1e-2);
testMetalCopyOps(1, 12, 13, 18, 1e-2);
testMetalInstanceNorm(1, 3, 120, 140, 0.05);
testMetalInstanceNorm(1, 12, 120, 140, 0.05);
testMetalPRelu(1, 3, 8, 13, 3, 0.1);
testMetalPRelu(1, 3, 8, 13, 1, 0.1);
testMetalInstanceNormPRelu(1, 12, 120, 140, 0.2);
testMetalConv(1, 12, 57, 72, 8, 3, 3, 1, 1.5);
testMetalConv(1, 12, 57, 72, 8, 3, 3, 2, 1.5);
testMetalConv(1, 12, 57, 72, 8, 3, 3, 0, 1.5);
testMetalConv(1, 12, 57, 72, 8, 2, 2, 0, 1.5);
#if DEBUGGING
testMetalConvTranspose(1, 1, 6, 6, 1, 1, 1, 0, 2, 0.1);
testMetalConvTranspose(1, 1, 6, 6, 1, 2, 2, 0, 2, 0.1);
testMetalConvTranspose(1, 1, 6, 6, 1, 3, 3, 0, 2, 0.1);
testMetalConvTranspose(1, 1, 6, 6, 1, 4, 4, 0, 2, 0.1);
testMetalConvTranspose(1, 1, 6, 6, 1, 5, 5, 0, 2, 0.1);
testMetalConvTranspose(1, 1, 6, 6, 1, 6, 6, 0, 2, 0.1);
#endif
testMetalConvTranspose(1, 16, 320, 180, 16, 2, 2, 0, 2, 1.5);
testMetalConvTranspose(1, 4, 320, 180, 4, 4, 4, 1, 2, 1.5);
testMetalConvTranspose(1, 4, 320, 180, 4, 4, 4, 0, 4, 1.5);
testMetalRewriteWithFusion();
testMetalRewriteWithMultiInputCPUOps();
testMetalRewriteFailure();
}
NetDef truncateAfter(NetDef def, size_t idx) {
// idx = 0, net = 10 -> remove 9
// idx = 0, net = 1 -> remove 0
const auto toRemove = def.op_size() - idx - 1;
for (auto i = 0; i < toRemove; ++i) {
def.mutable_op()->RemoveLast();
}
CHECK_EQ(def.op_size(), idx + 1);
return def;
}
void compareModels(const NetDef &initNet, NetDef predictNet) {
auto *arg = predictNet.mutable_op(0)->mutable_arg(0);
CHECK_EQ(arg->name(), "noise_std");
arg->set_f(0.000001);
for (auto i = 0; i < predictNet.op_size(); ++i) {
auto truncatedPredictNet = truncateAfter(predictNet, i);
// The copyFromMetalGPUop is added in the rewriting process
NetDef truncatedMetalPredictNet = rewritePredictNetForMetal(truncatedMetalPredictNet, "METAL");
NetDef metalInitNet = rewriteInitNetForMetal(metalInitNet, truncatedMetalPredictNet, "METAL");
// dumpDef(truncatedPredictNet);
// dumpDef(truncatedMetalPredictNet);
Workspace cws;
cws.RunNetOnce(initNet);
{
auto *t = cws.CreateBlob(predictNet.external_input(0))->GetMutable<TensorCPU>();
t->Resize(1, 224, 224, 4);
for (auto i = 0; i < t->size(); ++i) {
t->mutable_data<uint8_t>()[i] = i % 225;
}
}
cws.RunNetOnce(truncatedPredictNet);
Workspace mws;
mws.RunNetOnce(metalInitNet);
{
auto *t = mws.CreateBlob(predictNet.external_input(0))->GetMutable<TensorCPU>();
t->Resize(1, 224, 224, 4);
for (auto i = 0; i < t->size(); ++i) {
t->mutable_data<uint8_t>()[i] = i % 225;
}
}
mws.RunNetOnce(truncatedMetalPredictNet);
const auto name = truncatedPredictNet.op(truncatedPredictNet.op_size() - 1).output(0);
LOG(INFO) << "Checking correspondence for name: " << name << ", idx: " << i;
{
const auto &mt = mws.GetBlob(name)->Get<TensorCPU>();
const auto &ct = cws.GetBlob(name)->Get<TensorCPU>();
CHECK_EQ(mt.dims(), ct.dims());
for (auto j = 0; j < mt.size(); ++j) {
if (mt.IsType<float>()) {
if (j < 10) {
LOG(INFO) << "i: " << i << ", j: " << j << ", CPU: " << ct.data<float>()[j]
<< ", MTL: " << mt.data<float>()[j];
}
CHECK_NEAR(mt.data<float>()[j], ct.data<float>()[j], 5);
} else {
CHECK(mt.IsType<uint8_t>());
if (j < 10) {
LOG(INFO) << "i: " << i << ", j: " << j << ", CPU: " << ct.data<uint8_t>()[j]
<< ", MTL: " << mt.data<uint8_t>()[j];
}
CHECK_NEAR(mt.data<uint8_t>()[j], ct.data<uint8_t>()[j], 5);
}
}
}
}
}
} // namespace caffe2

View File

@ -1,8 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#include "caffe2/proto/caffe2.pb.h"
namespace caffe2 {
void testMetal();
void compareModels(const NetDef& initNet, NetDef predictNet);
}

View File

@ -1,304 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#include "rewrite_net.h"
#include "caffe2/utils/proto_utils.h"
namespace caffe2 {
struct Analysis {
struct SSA {
using BlobVersions = std::unordered_map<std::string, size_t>;
BlobVersions inVersions;
BlobVersions outVersions;
};
std::vector<SSA> ssa;
std::unordered_map<std::string, std::unordered_map<size_t, std::vector<size_t>>> inUsages;
};
Analysis analyzeNet(const NetDef &net) {
Analysis::SSA::BlobVersions frontier;
Analysis analysis;
auto play = [&](size_t i, const OperatorDef &op) {
Analysis::SSA::BlobVersions inVersions;
for (const auto &s : op.input()) {
inVersions[s] = frontier[s];
analysis.inUsages[s][frontier[s]].push_back(i);
}
Analysis::SSA::BlobVersions outVersions;
for (const auto &s : op.output()) {
if (frontier.find(s) != frontier.end()) {
frontier[s] += 1;
}
outVersions[s] = frontier[s];
}
analysis.ssa.push_back(Analysis::SSA{inVersions, outVersions});
};
for (auto i = 0; i < net.op_size(); ++i) {
play(i, net.op(i));
}
return analysis;
}
void insertCopyToMetalGPUOp(NetDef &predictNet, const std::string &cpu_blob) {
auto *op = predictNet.add_op();
op->set_name("CopyFromCPUToGPU");
op->set_type("CopyToMetalGPU");
op->set_engine("METAL");
op->add_input(cpu_blob);
op->add_output(cpu_blob + "_M");
}
void insertCopyFromMetalGPUOp(NetDef &predictNet, const std::string &cpu_blob) {
auto *op = predictNet.add_op();
op->set_name("CopyFromGPUToCPU");
op->set_type("CopyFromMetalGPU");
op->add_input(cpu_blob + "_M");
op->add_output(cpu_blob);
}
NetDef insertInputOutputCopyOps(const NetDef &def, std::string engine) {
// Do some validation of the outputs. For this version, we require:
// - a single input (first element of external_input()) is consumed by the NetDef
// - a single output (first element of external_output()) is produced by the NetDef.
// - the input is consumed by def.op(0), and this is the only consumer.
// - the output is produced by def.op(-1).
CAFFE_ENFORCE_GE(def.external_input_size(), 1);
CAFFE_ENFORCE_GE(def.external_output_size(), 1);
auto analysis = analyzeNet(def);
// enforce a single use of the input blob.
CAFFE_ENFORCE_GE(def.op_size(), 1);
const auto &inputBlob = def.external_input(0);
// Enforce that the input blob has a single usage - in the first operator.
CAFFE_ENFORCE(analysis.inUsages[inputBlob][0] == (std::vector<size_t>{0}));
// Enforce that the external_output(0) blob is produced by the last operator in this sequence.
const auto &outputBlob = def.external_output(0);
CAFFE_ENFORCE(analysis.ssa.back().outVersions.find(outputBlob) != analysis.ssa.back().outVersions.end());
const auto &outputBlobVersion = analysis.ssa.back().outVersions[outputBlob];
// This should hold true by definition of the SSA analysis.
CAFFE_ENFORCE(analysis.inUsages[outputBlob].find(outputBlobVersion) == analysis.inUsages[outputBlob].end());
NetDef mdef;
mdef.CopyFrom(def);
mdef.clear_op();
std::unordered_map<std::string, std::set<size_t>> cpu_blobs, metal_blobs;
cpu_blobs[def.external_input(0)].insert(0);
for (auto i = 0; i < def.op_size(); i++) {
const auto &currentOp = def.op(i);
if (currentOp.engine() == engine) {
// Metal Op
// insert copyToMetalOp
for (auto j = 0; j < currentOp.input_size(); j++) {
auto &input = currentOp.input(j);
auto version = analysis.ssa[i].inVersions[input];
if (cpu_blobs[input].count(version) > 0) {
insertCopyToMetalGPUOp(mdef, input);
metal_blobs[input].insert(version);
cpu_blobs[input].erase(version);
}
}
auto *op = mdef.add_op();
op->CopyFrom(currentOp);
// swap input blob
for (auto j = 0; j < currentOp.input_size(); j++) {
auto &input = currentOp.input(j);
auto version = analysis.ssa[i].inVersions[input];
if (metal_blobs[input].count(version) > 0) {
op->set_input(j, input + "_M");
}
}
// swap output blob
for (auto j = 0; j < currentOp.output_size(); j++) {
auto &output = currentOp.output(j);
auto version = analysis.ssa[i].outVersions[output];
op->set_output(j, output + "_M");
metal_blobs[output].insert(version);
}
// insert copyFromMetalOp after the last op if the last op is a metal op
if (i == def.op_size() - 1) {
insertCopyFromMetalGPUOp(mdef, currentOp.output(0));
}
} else {
// CPU Op
// insert copyFromMetalOp
for (auto j = 0; j < currentOp.input_size(); j++) {
auto &input = currentOp.input(j);
auto version = analysis.ssa[i].inVersions[input];
if (metal_blobs[input].count(version) > 0) {
insertCopyFromMetalGPUOp(mdef, input);
}
}
auto *op = mdef.add_op();
op->CopyFrom(currentOp);
for (auto j = 0; j < currentOp.output_size(); j++) {
auto &output = currentOp.output(j);
auto version = analysis.ssa[i].outVersions[output];
cpu_blobs[output].insert(version);
}
}
}
return mdef;
}
bool tryFuseAdjacentOps(const OperatorDef &currentOp, const OperatorDef &nextOp, OperatorDef *fusedOp) {
// Check for possible invalid opportunities.
// Must be identical outputs, with in-place usage for nextOp.
if (currentOp.output_size() != 1 || nextOp.output_size() != 1) {
return false;
}
if (currentOp.output(0) != nextOp.input(0) || nextOp.input(0) != nextOp.output(0)) {
return false;
}
static const std::map<std::pair<std::string, std::string>, std::string> fusionOpportunities = {{
{{"MetalInstanceNorm", "MetalPRelu"}, "MetalInstanceNormPRelu"},
}};
auto it = fusionOpportunities.find({currentOp.type(), nextOp.type()});
if (it == fusionOpportunities.end()) {
return false;
}
LOG(INFO) << "Found a fusion between adjacent ops: (" << currentOp.type() << ", " << nextOp.type() << ") -> "
<< it->second;
fusedOp->CopyFrom(currentOp);
fusedOp->set_type(it->second);
for (auto i = 1; i < nextOp.input_size(); i++) {
fusedOp->add_input(nextOp.input(i));
}
return true;
}
NetDef runMetalFusion(const NetDef &def) {
CHECK_GE(def.op_size(), 1);
NetDef mdef;
mdef.CopyFrom(def);
mdef.clear_op();
auto i = 0;
while (i < def.op_size()) {
if (i == def.op_size() - 1) {
VLOG(2) << "Last operator, skipping";
auto *op = mdef.add_op();
op->CopyFrom(def.op(i));
i += 1;
continue;
}
const auto &currentOp = def.op(i);
const auto &nextOp = def.op(i + 1);
OperatorDef fusedOp;
if (tryFuseAdjacentOps(currentOp, nextOp, &fusedOp)) {
VLOG(2) << "Found an adjacent fusion at: " << i;
// We can fuse.
auto *op = mdef.add_op();
op->CopyFrom(fusedOp);
i += 2;
continue;
}
VLOG(2) << "No fusion available";
// Just emit the current type.
auto *op = mdef.add_op();
op->CopyFrom(currentOp);
i += 1;
}
return mdef;
}
void dumpDef(NetDef &net) {
for (const auto &op : net.op()) {
printf("***Operator: %s\n", op.type().c_str());
for (auto input : op.input()) {
printf("\tInput: %s\n", input.c_str());
}
for (auto output : op.output()) {
printf("\tOutput: %s\n", output.c_str());
}
}
}
NetDef rewritePredictNetForMetal(const NetDef &predictNet, const std::string engine) {
CAFFE_ENFORCE_GE(predictNet.op_size(), 1);
NetDef net;
net.CopyFrom(predictNet);
std::unordered_map<std::string, std::string> replacements({
{"Conv", "MetalConv"},
{"InstanceNorm", "MetalInstanceNorm"},
{"PRelu", "MetalPRelu"},
{"ConvTranspose", "MetalConvTranspose"},
});
for (OperatorDef &op : *net.mutable_op()) {
if (op.has_type() && replacements.count(op.type()) > 0) {
op.set_type(replacements[op.type()]);
op.set_engine("METAL");
}
}
net = insertInputOutputCopyOps(net, engine);
net = runMetalFusion(net);
return net;
}
NetDef rewriteInitNetForMetal(const NetDef &initNet, const NetDef &predictNet, const std::string engine) {
// Find the GivenTensorFill operators for weight tensors and change to Metal GivenTensorFill ops
NetDef net;
net.CopyFrom(initNet);
std::set<std::string> conv_weights, conv_tranpose_weights, weights_and_biases;
for (auto &op : predictNet.op()) {
if (op.engine() == engine) {
if (op.type() == "MetalConv") {
conv_weights.insert(op.input(1));
} else if (op.type() == "MetalConvTranspose") {
conv_tranpose_weights.insert(op.input(1));
} else {
// Need to add support for operators with > 1 input tensors such as Add
for (int i = 1; i < op.input_size(); i++) {
weights_and_biases.insert(op.input(i));
}
}
}
}
for (auto &op : *net.mutable_op()) {
if (op.type() == "GivenTensorFill" && op.output_size() == 1) {
if (conv_weights.count(op.output(0)) > 0) {
op.set_type("GivenWeightTensorFill");
op.set_engine(engine);
} else if (conv_tranpose_weights.count(op.output(0)) > 0) {
op.set_type("GivenTransposeWeightTensorFill");
op.set_engine(engine);
} else if (weights_and_biases.count(op.output(0)) > 0) {
op.set_type("GivenTensorFloat16MetalFill");
op.set_engine(engine);
}
}
}
return net;
}
bool tryConvertToMetal(const NetDef &initNet, const NetDef &predictNet, NetDef *metalInitNet, NetDef *metalPredictNet) {
try {
// Throws if unsupported operators are found.
*metalPredictNet = rewritePredictNetForMetal(predictNet, "METAL");
*metalInitNet = rewriteInitNetForMetal(initNet, *metalPredictNet, "METAL");
// Throws if unsupported parameters are found.
Workspace ws;
ws.RunNetOnce(*metalInitNet);
ws.CreateNet(*metalPredictNet);
LOG(INFO) << "Metal is successfully enabled";
return true;
} catch (const std::exception &e) {
LOG(ERROR) << "Caught exception trying to convert NetDef to Metal: " << e.what();
return false;
}
}
} // namespace caffe2

View File

@ -1,13 +0,0 @@
// Copyright 2004-present Facebook. All Rights Reserved.
#pragma once
#include "caffe2/core/predictor.h"
namespace caffe2 {
bool tryConvertToMetal(const NetDef &initNet, const NetDef &predictNet, NetDef *metalInitNet, NetDef *metalPredictNet);
// Exposed for testing
NetDef rewritePredictNetForMetal(const NetDef &predictNet, const std::string engine);
NetDef rewriteInitNetForMetal(const NetDef &initNet, const NetDef &predictNet, const std::string engine);
void dumpDef(NetDef &net);
}

View File

@ -0,0 +1,7 @@
if(IOS)
file(GLOB_RECURSE tmp *.mm *.cc)
# exclude test files
file(GLOB_RECURSE test_files *_test.cc)
exclude(tmp "${tmp}" ${test_files})
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)
endif()

View File

@ -85,3 +85,8 @@ if (${CMAKE_CXX_COMPILER_ID} STREQUAL "MSVC")
/wd4996 # (3): Use of a deprecated member
)
endif()
# ---[ If we are building on ios, we will enable -mfpu=neon-fp16 for iOS Metal build.
if (IOS)
add_definitions("-mfpu=neon-fp16")
endif()