Remove ComputeLibrary submodule

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/18052

Reviewed By: ezyang

Differential Revision: D14477355

fbshipit-source-id: c56b802f6d69701596c327cf9af6782f30e335fa
Authored by Junjie Bai on 2019-03-16 09:03:17 -07:00; committed by Facebook Github Bot.
parent c7448aa13c
commit 0fe6e8c870
51 changed files with 0 additions and 3318 deletions

.gitmodules (vendored), 3 lines changed

@ -52,9 +52,6 @@
[submodule "third_party/python-six"]
path = third_party/python-six
url = https://github.com/benjaminp/six.git
[submodule "third_party/ComputeLibrary"]
path = third_party/ComputeLibrary
url = https://github.com/ARM-software/ComputeLibrary.git
[submodule "third_party/onnx"]
path = third_party/onnx
url = https://github.com/onnx/onnx.git


@ -101,7 +101,6 @@ option(BUILD_TEST "Build C++ test binaries (need gtest and gbenchmark)" OFF)
cmake_dependent_option(
INSTALL_TEST "Install test binaries if BUILD_TEST is on" ON
"BUILD_TEST" OFF)
option(USE_ACL "Use ARM Compute Library" OFF)
option(USE_ASAN "Use Address Sanitizer" OFF)
option(USE_CUDA "Use CUDA" ON)
option(USE_ROCM "Use ROCm" ON)


@ -1,8 +1,4 @@
add_subdirectory(ios)
if (USE_ACL)
# add_subdirectory(arm-compute)
endif()
# Finally pass the src lists back to the parent
if (USE_NNAPI)
add_subdirectory(nnapi)


@ -1,6 +0,0 @@
add_subdirectory(core)
add_subdirectory(operators)
add_subdirectory(test)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)


@ -1,64 +0,0 @@
# Caffe2 - ARM Compute Backend
## Build
To build, first clone the repo and install scons:
```
brew install scons
```
Set ANDROID_NDK to /opt/android_ndk/xxx (e.g. /opt/android_ndk/android-ndk-r15c/).
Set up the toolchain. Let's say $PATH_TO_TOOLCHAIN is the directory where you want to store the toolchain files.
For arm:
```
rm -rf $PATH_TO_TOOLCHAIN
$ANDROID_NDK/build/tools/make_standalone_toolchain.py --arch arm --api 21 --install-dir $PATH_TO_TOOLCHAIN
```
For arm64:
```
rm -rf $PATH_TO_TOOLCHAIN
$ANDROID_NDK/build/tools/make_standalone_toolchain.py --arch arm64 --api 21 --install-dir $PATH_TO_TOOLCHAIN
```
Add the toolchain path to your .bashrc/.zshrc, e.g.
```
export PATH=$PATH:$PATH_TO_TOOLCHAIN/bin
```
Use the build\_android.sh script.
For 32-bit ARM:
```
./scripts/build_android.sh -DUSE_ACL=ON -DBUILD_TEST=ON
```
For 64-bit ARM:
```
./scripts/build_android.sh -DUSE_ACL=ON -DBUILD_TEST=ON -DANDROID_ABI=arm64-v8a -DANDROID_TOOLCHAIN=clang
```
Before switching between 32-bit and 64-bit builds, make sure to delete the build\_android folder:
```
rm -rf build_android
```
## Test
Plug in an Android device and run a test:
```
cd build_android
adb push bin/gl_conv_op_test /data/local/tmp && adb shell '/data/local/tmp/gl_conv_op_test'
```
Or use a script to run them all. From the caffe2 top-level directory:
```
./caffe2/mobile/contrib/arm-compute/run_tests.sh build_android
```
Note that some tests (fully_connected and alignment) have been disabled until the next release of ACL.
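
## Using the rewritten net from C++

End to end, the backend is driven by rewriting a predict net for OpenGL (see `rewrite_net.h` below) and running the resulting `"opengl"` net through a workspace. The following is a minimal, untested sketch; the helper name `RunOnACL`, the empty `cpu_ops` set, and the surrounding loading/feeding code are illustrative assumptions, not part of this repository:
```cpp
#include <unordered_set>

#include "caffe2/core/workspace.h"
#include "caffe2/mobile/contrib/arm-compute/core/rewrite_net.h"

using namespace caffe2;

// Hypothetical helper: init_net/predict_net are assumed to be loaded already,
// and input blobs are assumed to be fed into the workspace before Run().
bool RunOnACL(const NetDef& init_net, const NetDef& predict_net, Workspace* ws) {
  // Load the parameters on CPU first.
  if (!ws->RunNetOnce(init_net)) {
    return false;
  }

  // Rewrite the predict net: ops listed in cpu_ops stay on CPU, everything
  // else is assigned device_type OPENGL and CopyFromGL ops are inserted at
  // the GPU -> CPU boundaries (see rewrite_net.cc).
  NetDef gl_net;
  std::unordered_set<std::string> cpu_ops; // ops to keep on CPU, if any
  if (!tryConvertToOpenGL(predict_net, &gl_net, /*runFusion=*/false, cpu_ops)) {
    return false; // fall back to running the original CPU net
  }

  // The rewritten net has type "opengl", so CreateNet instantiates GLNet.
  NetBase* net = ws->CreateNet(gl_net);
  return net != nullptr && net->Run();
}
```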


@ -1,2 +0,0 @@
file(GLOB_RECURSE tmp *.cc)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)


@ -1,39 +0,0 @@
#include "context.h"
#include "caffe2/core/allocator.h"
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/tensor.h"
namespace caffe2 {
CAFFE_KNOWN_TYPE(GLTensor<GLfloat>);
CAFFE_KNOWN_TYPE(GLTensor<GLhalf>);
CAFFE_KNOWN_TYPE(GLTensor<half>);
CAFFE_KNOWN_TYPE(Tensor<GLContext>);
bool GLContext::initialized = false;
GLContext::GLContext() {
CAFFE_ENFORCE(arm_compute::opengles31_is_available());
if(!initialized) {
arm_compute::GCScheduler::get().default_init();
initialized = true;
}
}
void EventCreateOPENGL(const DeviceOption & /* unused */,
Event * /* unused */) {}
void EventRecordOPENGL(Event * /* unused */, const void * /* unused */,
const char * /* unused */) {}
void EventWaitOPENGLOPENGL(const Event * /* unused */, void * /* unused */) {}
void EventFinishOPENGL(const Event * /* unused */) {}
void EventResetOPENGL(Event * /* unused */) {}
REGISTER_EVENT_CREATE_FUNCTION(OPENGL, EventCreateOPENGL);
REGISTER_EVENT_RECORD_FUNCTION(OPENGL, EventRecordOPENGL);
REGISTER_EVENT_WAIT_FUNCTION(OPENGL, OPENGL, EventWaitOPENGLOPENGL);
REGISTER_EVENT_FINISH_FUNCTION(OPENGL, EventFinishOPENGL);
REGISTER_EVENT_RESET_FUNCTION(OPENGL, EventResetOPENGL);
} // namespace caffe2


@ -1,391 +0,0 @@
#ifndef CAFFE2_OPENGL_CONTEXT_H_
#define CAFFE2_OPENGL_CONTEXT_H_
#ifdef CAFFE2_OPENGL_BACKEND
#error Can only build one OpenGL backend at a time.
#else
#define CAFFE2_OPENGL_BACKEND
#endif
#include "caffe2/core/allocator.h"
#include "caffe2/core/blob.h"
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/tensor.h"
#include "caffe2/core/timer.h"
#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCFunctions.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/Allocator.h"
#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/MemoryManagerOnDemand.h"
#include "arm_compute/runtime/PoolManager.h"
#include "utils/Utils.h"
#include "include/half/half.hpp"
namespace caffe2 {
typedef half_float::half half;
//#define ACL_USE_FLOAT32
#ifdef ACL_USE_FLOAT32
typedef float DataType;
#else
typedef half DataType;
#endif
template <typename T> class GLTensor;
class GLContext final {
public:
static bool initialized;
explicit GLContext();
explicit GLContext(const DeviceOption &option) {
DCHECK_EQ(option.device_type(), PROTO_OPENGL);
GLContext();
}
~GLContext() {}
static void sync() { arm_compute::GCScheduler::get().memory_barrier(); }
template <typename T>
using deleted_unique_ptr = std::unique_ptr<T, std::function<void(T *)>>;
template <typename T>
static deleted_unique_ptr<const GLTensor<T>> getGLTensor(const Blob *b, const GLTensor<T>* X_old = nullptr) {
if (b->IsType<TensorCPU>()) {
auto &Xcpu = b->Get<TensorCPU>();
GLTensor<T> *X_raw_ptr;
if (X_old) {
X_raw_ptr = const_cast<GLTensor<T> *>(X_old);
X_raw_ptr->ResizeLike(Xcpu);
deleted_unique_ptr<const GLTensor<T>> X_unique_ptr(X_raw_ptr, EmptyDeleter<T>);
return X_unique_ptr;
} else {
X_raw_ptr = new GLTensor<T>();
X_raw_ptr->ResizeLike(Xcpu);
deleted_unique_ptr<const GLTensor<T>> X_unique_ptr(X_raw_ptr, GLTensorDeleter<T>);
return X_unique_ptr;
}
}
const GLTensor<T> *X_raw_ptr;
X_raw_ptr = &b->Get<GLTensor<T>>();
deleted_unique_ptr<const GLTensor<T>> X_unique_ptr(X_raw_ptr, EmptyDeleter<T>);
return X_unique_ptr;
}
/*
* Everything below is basically boiler plate for Context classes
*/
static std::pair<void *, MemoryDeleter> New(size_t nbytes) {
return std::pair<void *, MemoryDeleter>(malloc(nbytes), GLContext::Delete);
}
static void Delete(void *data) {
if (data != nullptr) {
free(data);
}
}
template <class SrcContext, class DstContext>
inline void CopyBytes(size_t nbytes, const void *src, void *dst) {}
template <typename T, class SrcContext, class DstContext>
inline void Copy(int n, const T *src, T *dst) {
CopyBytes<SrcContext, DstContext>(n * sizeof(T),
static_cast<const void *>(src),
static_cast<void *>(dst));
}
template <class SrcContext, class DstContext>
inline void CopyItems(const TypeMeta &meta, size_t n, const void *src,
void *dst) {
CAFFE_ENFORCE(!meta.copy(), "GLContext requires fundamental types.");
CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
}
void SwitchToDevice(int a, ...) { /* TODO */
}
void SwitchToDevice() { SwitchToDevice(0); }
inline void WaitEvent(const Event &ev) { /* TODO */
}
void FinishDeviceComputation() { /* TODO */
}
inline void Record(Event *ev, const char *&) const { /* TODO */
}
static bool IsStreamFree(const DeviceOption& /* unused */, int /* unused */) {
return true;
}
bool HasAsyncPartDefault() const { return false; }
bool SupportsAsyncScheduling() const { return false; }
private:
template <typename T>
static void GLTensorDeleter(const GLTensor<T> *X) {
delete X;
}
template <typename T>
static void EmptyDeleter(const GLTensor<T> *X) {
return;
}
};
template <typename T> class GLTensor {
public:
GLTensor() { tensor_ = make_unique<arm_compute::GCTensor>(); }
~GLTensor() { tensor_->allocator()->free(); }
template <typename TensorType> bool ResizeLike(TensorType &X, bool free = false) {
bool need_allocation = SetDims(X.dims());
for (int i = 0; i < dims_.size(); i++) {
shape_.set(dims_.size() - i - 1, dims_[i]);
}
if (need_allocation) {
if (free) {
tensor_->allocator()->free();
}
#ifdef ACL_USE_FLOAT32
tensor_->allocator()->init(
arm_compute::TensorInfo(shape_, 1, arm_compute::DataType::F32));
#else
tensor_->allocator()->init(
arm_compute::TensorInfo(shape_, 1, arm_compute::DataType::F16));
#endif
} else {
tensor_->info()->set_tensor_shape(shape_);
}
return need_allocation;
}
template <typename... Ts> bool Resize(Ts... dim_source) {
bool need_allocation = SetDims(dim_source...);
for (int i = 0; i < dims_.size(); i++) {
shape_.set(dims_.size() - i - 1, dims_[i]);
}
if (need_allocation) {
// TODO: Make it type generic
tensor_->allocator()->free();
#ifdef ACL_USE_FLOAT32
tensor_->allocator()->init(arm_compute::TensorInfo(shape_, 1, arm_compute::DataType::F32));
#else
tensor_->allocator()->init(arm_compute::TensorInfo(shape_, 1, arm_compute::DataType::F16));
#endif
} else {
tensor_->info()->set_tensor_shape(shape_);
}
return need_allocation;
}
// Allocates and copies data if needed
void lazy_allocate(const Blob *b, bool allocate_tensor, bool try_to_copy_from_cpu) const {
if (try_to_copy_from_cpu) {
// we skip GLTensors, nothing to copy
if (!b->IsType<GLTensor>()) {
// typically only called on the second run
if (allocate_tensor) {
allocate();
}
Timer timer;
fillGLTensor(b);
auto millis = timer.MilliSeconds();
VLOG(2) << "[C2DEBUG] fillGLTensor timer: " << millis;
}
}
}
void allocate() const {
tensor_->allocator()->allocate();
}
void fillGLTensor(const Blob *b) const {
if (b->IsType<TensorCPU>()) {
auto &Xcpu = b->Get<TensorCPU>();
VLOG(2) << "[C2DEBUG] fillGLTensor dims: " << Xcpu.dims();
T *buffer = map();
char *byte_buffer = (char *)buffer;
auto info = tensor_->info();
arm_compute::Window it_window;
it_window.use_tensor_dimensions(info->tensor_shape(), /* first_dimension =*/arm_compute::Window::DimY); // Iterate through the rows (not each element)
arm_compute::Iterator it(get_underlying(), it_window);
if (Xcpu.ndim() == 4) {
auto C = Xcpu.dim32(1);
auto H = Xcpu.dim32(2);
auto W = Xcpu.dim32(3);
arm_compute::execute_window_loop(it_window, [&](const arm_compute::Coordinates & id) {
std::copy_n(Xcpu.data<float>() + id[3] * (C * W * H) + id.z() * (W * H) + id.y() * W, W, reinterpret_cast<T *>(it.ptr()));
},
it);
} else if (Xcpu.ndim() == 3) {
auto H = Xcpu.dim32(1);
auto W = Xcpu.dim32(2);
arm_compute::execute_window_loop(it_window, [&](const arm_compute::Coordinates & id) {
std::copy_n(Xcpu.data<float>() + (id.z() * (W * H) + id.y() * W), W, reinterpret_cast<T *>(it.ptr()));
},
it);
} else if (Xcpu.ndim() == 2) {
auto W = Xcpu.dim32(1);
arm_compute::execute_window_loop(it_window, [&](const arm_compute::Coordinates & id) {
std::copy_n(Xcpu.data<float>() + id.y() * W, W, reinterpret_cast<T *>(it.ptr()));
},
it);
} else {
arm_compute::Window w;
w.use_tensor_dimensions(info->tensor_shape());
arm_compute::Iterator i(get_underlying(), w);
auto size = Xcpu.dim32(0);
std::copy_n(Xcpu.data<float>(), size, reinterpret_cast<T *>(i.ptr()));
}
unmap();
}
}
const int32_t ndim() const { return dims_.size(); }
vector<int64_t> dims() const { return dims_; }
const int32_t dim32(const int index) const { return dims_.at(index); }
const int32_t size() const {
int32_t s = 1;
for (int i = 0; i < dims_.size(); i++) {
s *= dims_[i];
}
return s;
}
arm_compute::GCTensor *get_underlying() const { return tensor_.get(); }
T *map() const {
GLContext::sync();
tensor_->map(true);
return reinterpret_cast<T *>(tensor_->buffer());
}
void unmap() const { return tensor_->unmap(); }
void sync() const {
GLContext::sync();
tensor_->map();
tensor_->unmap();
}
private:
template <typename TI, typename = typename std::enable_if<
std::is_integral<TI>::value>::type>
bool SetDims(const vector<TI> &src) {
auto old_size = size_;
dims_.resize(src.size());
int64_t new_size = 1;
for (unsigned int i = 0; i < src.size(); ++i) {
new_size *= src[i];
dims_[i] = src[i];
}
size_ = new_size;
return size_ > old_size;
}
bool SetDims() {
auto old_size = size_;
dims_.resize(0);
size_ = 1;
return size_ > old_size;
}
bool SetDims(const int64_t d0) {
auto old_size = size_;
dims_.resize(1);
dims_[0] = d0;
size_ = d0;
return size_ > old_size;
}
bool SetDims(const int64_t d0, const int64_t d1) {
auto old_size = size_;
dims_.resize(2);
dims_[0] = d0;
dims_[1] = d1;
size_ = d0 * d1;
return size_ > old_size;
}
bool SetDims(const int64_t d0, const int64_t d1, const int64_t d2) {
auto old_size = size_;
dims_.resize(3);
dims_[0] = d0;
dims_[1] = d1;
dims_[2] = d2;
size_ = d0 * d1 * d2;
return size_ > old_size;
}
bool SetDims(const int64_t d0, const int64_t d1, const int64_t d2,
const int64_t d3) {
auto old_size = size_;
dims_.resize(4);
dims_[0] = d0;
dims_[1] = d1;
dims_[2] = d2;
dims_[3] = d3;
size_ = d0 * d1 * d2 * d3;
return size_ > old_size;
}
vector<int64_t> dims_;
int64_t size_ = -1;
arm_compute::TensorShape shape_;
unique_ptr<arm_compute::GCTensor> tensor_;
};
template<typename T = DataType>
void getTensorCPU(const GLTensor<T>& g_, TensorCPU& g) {
VLOG(2) << " [C2DEBUG] getTensorCPU " << g_.dims();
g.Resize(g_.dims());
g_.map();
auto tensor = g_.get_underlying();
auto info = tensor->info();
arm_compute::Window it_window;
it_window.use_tensor_dimensions(info->tensor_shape(), /* first_dimension =*/arm_compute::Window::DimY); // Iterate through the rows (not each element)
arm_compute::Iterator it(tensor, it_window);
if (g_.ndim() == 4) {
auto C = g_.dim32(1);
auto H = g_.dim32(2);
auto W = g_.dim32(3);
arm_compute::execute_window_loop(it_window, [&](const arm_compute::Coordinates & id) {
std::copy_n(reinterpret_cast<T *>(it.ptr()), W, g.mutable_data<float>() + id[3] * (C * W * H) + id.z() * (W * H) + id.y() * W);
},
it);
} else if (g_.ndim() == 3) {
auto H = g_.dim32(1);
auto W = g_.dim32(2);
arm_compute::execute_window_loop(it_window, [&](const arm_compute::Coordinates & id) {
std::copy_n(reinterpret_cast<T *>(it.ptr()), W, g.mutable_data<float>() + (id.z() * (W * H) + id.y() * W));
},
it);
} else if (g_.ndim() == 2) {
auto W = g_.dim32(1);
arm_compute::execute_window_loop(it_window, [&](const arm_compute::Coordinates & id) {
std::copy_n(reinterpret_cast<T *>(it.ptr()), W, g.mutable_data<float>() + id.y() * W);
},
it);
} else {
arm_compute::Window w;
w.use_tensor_dimensions(info->tensor_shape());
arm_compute::Iterator i(tensor, w);
auto size = g_.dim32(0);
std::copy_n(reinterpret_cast<T *>(i.ptr()), size, g.mutable_data<float>());
}
g_.unmap();
}
} // namespace caffe2
#endif /* CAFFE2_OPENGL_CONTEXT_H_ */
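
For orientation, here is a minimal sketch (not part of the original sources) of how the helpers above move data between a CPU tensor and a `GLTensor`; the blob names, the `RoundTripSketch` helper, and the surrounding workspace setup are assumptions:
```cpp
#include "caffe2/core/workspace.h"
#include "caffe2/mobile/contrib/arm-compute/core/context.h"

namespace caffe2 {

// Hypothetical round trip: CPU blob -> GLTensor -> CPU tensor.
void RoundTripSketch(Workspace& ws) {
  // A blob assumed to already hold a float TensorCPU.
  const Blob* x_cpu = ws.GetBlob("X");

  // Wrap it as a GLTensor. For a TensorCPU input this only sets the shape;
  // the GPU allocation and the actual copy happen in lazy_allocate().
  auto x_gl = GLContext::getGLTensor<DataType>(x_cpu);
  x_gl->lazy_allocate(x_cpu, /*allocate_tensor=*/true,
                      /*try_to_copy_from_cpu=*/true);

  // ... run arm_compute GC layers against x_gl->get_underlying() here ...

  // Copy the (possibly half-precision) data back into a CPU tensor.
  auto* y_cpu = BlobGetMutableTensor(ws.CreateBlob("Y"), CPU);
  getTensorCPU(*x_gl, *y_cpu);
}

} // namespace caffe2
```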


@ -1,230 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/net_gl.h"
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/core/net.h"
#include <iostream>
#include <set>
#include <unordered_map>
#include <unordered_set>
#include "caffe2/core/operator.h"
#include "caffe2/core/static_tracepoint.h"
#include "caffe2/core/timer.h"
#include "caffe2/proto/caffe2_pb.h"
#include "caffe2/utils/proto_utils.h"
namespace caffe2 {
GLNet::GLNet(
const std::shared_ptr<const NetDef>& net_def,
Workspace* ws)
: NetBase(net_def, ws) {
ws_ = ws;
VLOG(1) << "Constructing GLNet " << net_def->name();
const bool net_def_has_device_option = net_def->has_device_option();
// Initialize the operators
for (int idx = 0; idx < net_def->op_size(); ++idx) {
const auto& operator_def = net_def->op(idx);
VLOG(1) << "Creating operator " << operator_def.name() << ": "
<< operator_def.type();
output_blobs_.push_back(operator_def.output(0));
if (operator_def.has_device_option() && operator_def.device_option().device_type() == OPENGL) {
opengl_device_.push_back(true);
} else {
opengl_device_.push_back(false);
}
std::unique_ptr<OperatorBase> op{nullptr};
OperatorDef temp_def(operator_def);
if (temp_def.type() == "GenerateProposals") {
auto* arg = temp_def.add_arg();
arg->set_name("fill_output");
arg->set_i(1);
}
if (!operator_def.has_device_option() && net_def_has_device_option) {
// In the case that the operator def does not specify a device option but
// the net def has a default option, we copy the device option over to the
// operator def.
temp_def.mutable_device_option()->CopyFrom(net_def->device_option());
op = CreateOperator(temp_def, ws, idx);
} else {
op = CreateOperator(temp_def, ws, idx);
op->set_debug_def(
std::shared_ptr<const OperatorDef>{net_def, &(net_def->op(idx))});
}
operators_.emplace_back(std::move(op));
}
}
bool GLNet::Run() {
StartAllObservers();
if (first_run_) {
first_run_ = false;
for (auto& op: operators_) {
VLOG(2) << "[C2DEBUG] configure " << ProtoDebugString(op->debug_def());
op->Run();
}
for (auto& op: operators_) {
VLOG(2) << "[C2DEBUG] second run " << ProtoDebugString(op->debug_def());
op->Run();
}
// Change the parameters for GenerateProposals
for (int i = 0; i < operators_.size(); ++i) {
if (operators_[i]->debug_def().type() == "GenerateProposals") {
OperatorDef temp_def(operators_[i]->debug_def());
auto* arg = temp_def.add_arg();
arg->set_name("fill_output");
arg->set_i(0);
operators_[i].reset(CreateOperator(temp_def, ws_, i).release());
}
}
}
VLOG(1) << "Running net " << name_;
int i = 0;
//Timer timer;
for (auto& op : operators_) {
VLOG(2) << "[C2DEBUG] running " << ProtoDebugString(op->debug_def()) << " " << i;
++i;
//timer.Start();
bool res = op->Run();
// auto millis = timer.MilliSeconds();
// LOG(ERROR) << "[C2DEBUG] OP " << op->debug_def().type() << " " << millis <<" ms.";
if (!res) {
LOG(ERROR) << "[C2DEBUG] Operator failed: " << ProtoDebugString(op->debug_def());
return false;
}
}
StopAllObservers();
return true;
}
bool GLNet::RunAsync() {
return Run();
}
namespace {
template <typename A, typename B>
bool PairLargerThan(const std::pair<A, B>& x, const std::pair<A, B>& y) {
return x.second > y.second;
}
}
vector<float> GLNet::TEST_Benchmark(
const int warmup_runs,
const int main_runs,
const bool run_individual) {
std::cout << "Starting benchmark." << std::endl;
std::cout << "Running warmup runs." << std::endl;
CAFFE_ENFORCE(
warmup_runs >= 0,
"Number of warm up runs should be non negative, provided ",
warmup_runs,
".");
for (int i = 0; i < warmup_runs; ++i) {
CAFFE_ENFORCE(Run(), "Warmup run ", i, " has failed.");
}
auto last_blob = output_blobs_[output_blobs_.size() - 1];
Blob *gpu_out_blob = ws_->GetBlob(last_blob);
if (gpu_out_blob->IsType<GLTensor<DataType>>()) {
auto &g_ = gpu_out_blob->Get<GLTensor<DataType>>();
// Enforce gpu execution
g_.sync();
}
std::cout << "Main runs." << std::endl;
CAFFE_ENFORCE(
main_runs >= 0,
"Number of main runs should be non negative, provided ",
main_runs,
".");
Timer timer;
for (int i = 0; i < main_runs; ++i) {
CAFFE_ENFORCE(Run(), "Main run ", i, " has failed.");
}
if (gpu_out_blob->IsType<GLTensor<DataType>>()) {
auto &g_ = gpu_out_blob->Get<GLTensor<DataType>>();
g_.sync();
}
auto millis = timer.MilliSeconds();
std::cout << "Main run finished. Milliseconds per iter: "
<< millis / main_runs
<< ". Iters per second: " << 1000.0 * main_runs / millis << std::endl;
vector<float> time_per_op(operators_.size(), 0);
vector<uint64_t> flops_per_op(operators_.size(), 0);
CaffeMap<string, float> time_per_op_type;
if (run_individual) {
for (int i = 0; i < main_runs; ++i) {
for (auto& op : operators_) {
op->ResetEvent();
}
int idx = 0;
for (auto& op : operators_) {
const string& op_type = op->debug_def().type();
timer.Start();
CAFFE_ENFORCE(
op->Run(),
"operator ",
op->debug_def().name(),
"(",
op_type,
") has failed.");
if (opengl_device_[idx] && op_type != "CopyFromGL" && op_type != "Reshape") {
Blob *gpu_out_blob = ws_->GetBlob(output_blobs_[idx]);
auto &g_ = gpu_out_blob->Get<GLTensor<DataType>>();
g_.sync();
}
float spent = timer.MilliSeconds();
time_per_op[idx] += spent;
time_per_op_type[op_type] += spent;
++idx;
}
}
int idx = 0;
for (auto& op : operators_) {
const string& op_type = op->debug_def().type();
const string& print_name =
(op->debug_def().name().size()
? op->debug_def().name()
: (op->debug_def().output_size() ? op->debug_def().output(0)
: "NO_OUTPUT"));
std::stringstream flops_str;
if (flops_per_op[idx]) {
flops_str << " ("
<< to_string(1.0e-6 * flops_per_op[idx] / time_per_op[idx])
<< " GFLOPS)";
}
std::cout << "[C2DEBUG] Operator #" << idx << " (" << print_name << ", " << op_type
<< ") " << time_per_op[idx] / main_runs << " ms/iter"
<< flops_str.str() << std::endl;
++idx;
}
std::cout << "[C2DEBUG] Time per operator type:" << std::endl;
// sort by decreasing time spending.
std::vector<std::pair<string, float>> time_per_op_type_vec(
time_per_op_type.begin(), time_per_op_type.end());
std::sort(
time_per_op_type_vec.begin(),
time_per_op_type_vec.end(),
PairLargerThan<string, float>);
for (const auto& item : time_per_op_type_vec) {
std::cout << "[C2DEBUG] " << std::setw(15) << std::setfill(' ') << item.second / main_runs
<< " " << item.first << std::endl;
}
}
// We will reuse time_per_op to return the result of BenchmarkNet.
for (int i = 0; i < time_per_op.size(); ++i) {
time_per_op[i] /= main_runs;
}
time_per_op.insert(time_per_op.begin(), millis / main_runs);
return time_per_op;
}
REGISTER_NET(opengl, GLNet);
} // namespace caffe2


@ -1,65 +0,0 @@
#ifndef CAFFE2_CORE_NET_GL_H_
#define CAFFE2_CORE_NET_GL_H_
#include <vector>
#include "c10/util/Registry.h"
#include "caffe2/core/common.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/net.h"
#include "caffe2/core/tensor.h"
#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2_pb.h"
namespace caffe2 {
// This is the very basic structure you need to run a network with
// ARM's compute library
class GLNet : public NetBase {
private:
bool first_run_ = true;
Workspace* ws_;
// record output blob for sync step in operator level benchmarking
std::vector<string> output_blobs_;
// record operator type and only sync after gpu op
std::vector<bool> opengl_device_;
public:
GLNet(const std::shared_ptr<const NetDef>& net_def, Workspace* ws);
bool SupportsAsync() override {
return false;
}
vector<float> TEST_Benchmark(
const int warmup_runs,
const int main_runs,
const bool run_individual) override;
/*
* This returns a list of pointers to objects stored in unique_ptrs.
* Used by Observers.
*
* Think carefully before using.
*/
vector<OperatorBase*> GetOperators() const override {
vector<OperatorBase*> op_list;
for (auto& op : operators_) {
op_list.push_back(op.get());
}
return op_list;
}
protected:
bool Run();
bool RunAsync();
bool DoRunAsync() override {
return Run();
}
vector<unique_ptr<OperatorBase>> operators_;
C10_DISABLE_COPY_AND_ASSIGN(GLNet);
};
} // namespace caffe2
#endif // CAFFE2_CORE_NET_SIMPLE_H_


@ -1,12 +0,0 @@
#include "operator.h"
namespace caffe2 {
C10_DEFINE_REGISTRY(
GLOperatorRegistry,
OperatorBase,
const OperatorDef&,
Workspace*);
CAFFE_REGISTER_DEVICE_TYPE(DeviceType::OPENGL, GLOperatorRegistry);
} // namespace caffe2


@ -1,30 +0,0 @@
#ifndef CAFFE2_OPENGL_OPERATOR_H_
#define CAFFE2_OPENGL_OPERATOR_H_
#include "c10/util/Registry.h"
#include "caffe2/core/operator.h"
namespace caffe2 {
C10_DECLARE_REGISTRY(
GLOperatorRegistry,
OperatorBase,
const OperatorDef&,
Workspace*);
#define REGISTER_GL_OPERATOR_CREATOR(key, ...) \
C10_REGISTER_CREATOR(GLOperatorRegistry, key, __VA_ARGS__)
#define REGISTER_GL_OPERATOR(name, ...) \
extern void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \
static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_GL##name() { \
CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name(); \
} \
C10_REGISTER_CLASS(GLOperatorRegistry, name, __VA_ARGS__)
#define REGISTER_GL_OPERATOR_STR(str_name, ...) \
C10_REGISTER_TYPED_CLASS(GLOperatorRegistry, str_name, __VA_ARGS__)
#define REGISTER_GL_OPERATOR_WITH_ENGINE(name, engine, ...) \
C10_REGISTER_CLASS(GLOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__)
} // namespace caffe2
#endif // CAFFE2_OPENGL_OPERATOR_H_


@ -1,257 +0,0 @@
#include "rewrite_net.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/proto_utils.h"
#include <unordered_map>
namespace caffe2 {
struct Analysis {
struct SSA {
using BlobVersions = std::unordered_map<std::string, size_t>;
BlobVersions inVersions;
BlobVersions outVersions;
};
std::vector<SSA> ssa;
std::unordered_map<std::string, std::unordered_map<size_t, std::vector<size_t>>> inUsages;
};
static Analysis analyzeNet(const NetDef& net) {
Analysis::SSA::BlobVersions frontier;
Analysis analysis;
auto play = [&](size_t i, const OperatorDef& op) {
Analysis::SSA::BlobVersions inVersions;
for (const auto& s : op.input()) {
inVersions[s] = frontier[s];
analysis.inUsages[s][frontier[s]].push_back(i);
}
Analysis::SSA::BlobVersions outVersions;
for (const auto& s : op.output()) {
if (frontier.find(s) != frontier.end()) {
frontier[s] += 1;
}
outVersions[s] = frontier[s];
}
analysis.ssa.push_back(Analysis::SSA{inVersions, outVersions});
};
for (auto i = 0; i < net.op_size(); ++i) {
play(i, net.op(i));
}
return analysis;
}
static void insertCopyFromGLOp(NetDef& predictNet, const std::string& cpu_blob) {
auto* op = predictNet.add_op();
op->set_name("CopyFromGL");
op->set_type("CopyFromGL");
op->add_input(cpu_blob + "_M");
op->add_output(cpu_blob);
}
static NetDef insertInputOutputCopyOps(const NetDef& def, std::unordered_set<std::string>& cpuOp) {
// Do some validation of the outputs. For this version, we require:
// - a single input (first element of external_input()) is consumed by the NetDef
// - a single output (first element of external_output()) is produced by the NetDef.
// - the input is consumed by def.op(0), and this is the only consumer.
// - the output is produced by def.op(-1).
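// Illustrative walk-through (not from the original source): for a net
//   data -> Conv -> conv1 -> Relu -> relu1     (external output: relu1)
// with both ops on OPENGL, intermediate GPU outputs are renamed with an
// "_M" suffix (conv1 -> conv1_M) and GPU consumers are pointed at the
// renamed blob; external outputs keep their original names. Whenever a
// CPU op consumes a GPU-produced blob, a CopyFromGL op is inserted that
// copies blob_M back into blob before the CPU op runs.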
CAFFE_ENFORCE_GE(def.external_input_size(), 1);
CAFFE_ENFORCE_GE(def.external_output_size(), 1);
auto analysis = analyzeNet(def);
// enforce a single use of the input blob.
CAFFE_ENFORCE_GE(def.op_size(), 1);
const auto& inputBlob = def.external_input(0);
// Enforce that the input blob has a single usage - in the first operator.
CAFFE_ENFORCE(analysis.inUsages[inputBlob][0] == (std::vector<size_t>{0}));
const auto& outputBlob = def.external_output(0);
const auto& outputBlobVersion = analysis.ssa.back().outVersions[outputBlob];
// This should hold true by definition of the SSA analysis.
CAFFE_ENFORCE(analysis.inUsages[outputBlob].find(outputBlobVersion) ==
analysis.inUsages[outputBlob].end());
NetDef mdef;
mdef.CopyFrom(def);
mdef.clear_op();
std::unordered_map<std::string, std::set<size_t>> cpu_blobs, gpu_blobs;
cpu_blobs[def.external_input(0)].insert(0);
VLOG(2) << "[C2DEBUG] def.op_size(): " << def.op_size();
for (auto i = 0; i < def.op_size(); i++) {
const auto& currentOp = def.op(i);
if (cpuOp.count(currentOp.type()) > 0) {
// CPU Op
// insert copyFromOpenGLOp
for (auto j = 0; j < currentOp.input_size(); j++) {
auto& input = currentOp.input(j);
auto version = analysis.ssa[i].inVersions[input];
if (gpu_blobs[input].count(version) > 0) {
insertCopyFromGLOp(mdef, input);
}
}
auto* op = mdef.add_op();
op->CopyFrom(currentOp);
for (auto j = 0; j < currentOp.output_size(); j++) {
auto& output = currentOp.output(j);
auto version = analysis.ssa[i].outVersions[output];
cpu_blobs[output].insert(version);
}
} else {
// OpenGL Op
auto* op = mdef.add_op();
op->CopyFrom(currentOp);
for (auto j = 0; j < op->input_size(); j++) {
auto* input = op->mutable_input(j);
auto version = analysis.ssa[i].inVersions[*input];
if (gpu_blobs[*input].count(version) > 0) {
*input = *input + "_M";
}
}
for (auto j = 0; j < currentOp.output_size(); j++) {
auto& output = currentOp.output(j);
auto version = analysis.ssa[i].outVersions[output];
gpu_blobs[output].insert(version);
// add _M to intermediate OpenGL op outputs
auto* output_ = op->mutable_output(j);
bool inter = true;
for(auto k = 0; k < def.external_output_size(); k++) {
if (*output_ == def.external_output(k)) {
inter = false;
}
}
if (inter) {
*output_ = *output_ + "_M";
}
}
}
}
return mdef;
}
static bool tryFuseAdjacentOps(const OperatorDef& currentOp,
const OperatorDef& nextOp,
OperatorDef* fusedOp,
std::unordered_set<std::string>& glOps) {
// Check for possible invalid opportunities.
if (currentOp.output_size() != 1 || nextOp.output_size() != 1) {
return false;
}
// The fused op cannot be inplace
if (currentOp.output(0) != nextOp.input(0) || currentOp.input(0) == nextOp.output(0)) {
return false;
}
static const std::map<std::pair<std::string, std::string>, std::string> fusionOpportunities = {
{{"Conv", "Relu"}, "ConvRelu"}};
auto it = fusionOpportunities.find({currentOp.type(), nextOp.type()});
if (it == fusionOpportunities.end()) {
return false;
}
glOps.insert(it->second);
fusedOp->CopyFrom(currentOp);
fusedOp->set_output(0, nextOp.output(0));
fusedOp->set_type(it->second);
for (auto i = 1; i < nextOp.input_size(); i++) {
fusedOp->add_input(nextOp.input(i));
}
return true;
}
static NetDef runOpenGLFusion(const NetDef& def, std::unordered_set<std::string>& glOps) {
CHECK_GE(def.op_size(), 1);
NetDef mdef;
mdef.CopyFrom(def);
mdef.clear_op();
auto i = 0;
while (i < def.op_size()) {
if (i == def.op_size() - 1) {
VLOG(2) << "Last operator, skipping";
auto* op = mdef.add_op();
op->CopyFrom(def.op(i));
i += 1;
continue;
}
const auto& currentOp = def.op(i);
const auto& nextOp = def.op(i + 1);
OperatorDef fusedOp;
if (tryFuseAdjacentOps(currentOp, nextOp, &fusedOp, glOps)) {
VLOG(2) << "Found an adjacent fusion for: " << currentOp.type() << ", " << nextOp.type();
// We can fuse.
auto* op = mdef.add_op();
op->CopyFrom(fusedOp);
i += 2;
continue;
}
VLOG(2) << "No fusion available for: " << currentOp.type() << ", " << nextOp.type();
// Just emit the current type.
auto* op = mdef.add_op();
op->CopyFrom(currentOp);
i += 1;
}
return mdef;
}
void dumpDefForOpenGL(const NetDef& d) {
for (const auto& op : d.op()) {
LOG(INFO) << op.input(0) << " -> " << op.type() << " -> " << op.output(0);
}
}
// // For debugging
// void dumpDefForOpenGL(const NetDef &net) {
// for (const auto &op : net.op()) {
// printf("***Operator: %s\n", op.type().c_str());
// for (auto input : op.input()) {
// printf("\tInput: %s\n", input.c_str());
// }
//
// for (auto output : op.output()) {
// printf("\tOutput: %s\n", output.c_str());
// }
// }
//}
NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool runFusion, std::unordered_set<std::string> cpuOps) {
CAFFE_ENFORCE_GE(predictNet.op_size(), 1);
NetDef net;
net.CopyFrom(predictNet);
// if (runFusion) {
// net = runOpenGLFusion(net, openGLOps);
// }
net = insertInputOutputCopyOps(net, cpuOps);
VLOG(2) << "[C2DEBUG] net size " << net.op().size();
net.set_type("opengl");
for (auto i = 0; i < net.op().size(); ++i) {
auto op = net.mutable_op(i);
if (std::find(cpuOps.begin(), cpuOps.end(), op->type()) == cpuOps.end()) {
op->mutable_device_option()->set_device_type(PROTO_OPENGL);
}
}
return net;
}
bool tryConvertToOpenGL(const NetDef& predictNet,
NetDef* glPredictNet,
bool runFusion,
std::unordered_set<std::string> cpuOps) {
try {
// Throws if unsupported operators are found.
VLOG(2) << "[C2DEBUG] in tryConvertToOpenGL";
*glPredictNet = rewritePredictNetForOpenGL(predictNet, runFusion, cpuOps);
dumpDefForOpenGL(*glPredictNet);
// Throws if unsupported parameters are found.
LOG(INFO) << "OpenGL is successfully enabled";
return true;
} catch (const std::exception& e) {
LOG(ERROR) << "Caught exception trying to convert NetDef to OpenGL: " << e.what();
return false;
}
}
} // namespace caffe2


@ -1,17 +0,0 @@
#pragma once
#include "caffe2/mobile/contrib/arm-compute/core/net_gl.h"
#include <unordered_set>
namespace caffe2 {
bool tryConvertToOpenGL(const NetDef& predictNet,
NetDef* glPredictNet,
bool runFusion,
std::unordered_set<std::string> cpuOps);
// Exposed for testing
NetDef rewritePredictNetForOpenGL(const NetDef& predictNet,
bool runFusion,
std::unordered_set<std::string> cpuOps);
void dumpDefForOpenGL(const NetDef& net);
} // namespace caffe2


@ -1,2 +0,0 @@
file(GLOB_RECURSE tmp *.cc)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)


@ -1,110 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"
#include "caffe2/mobile/contrib/arm-compute/operators/activation_ops.h"
#include "caffe2/operators/relu_op.h"
namespace caffe2 {
template <typename T>
bool GLReluOp<T>::RunOnDevice() {
auto *Xblob = OperatorBase::Inputs()[0];
X_ = GLContext::getGLTensor<T>(Xblob, X_.release());
GLTensor<T> *Y =
OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();
if (first_run_) {
first_run_ = false;
if (Y->get_underlying() != X_->get_underlying())
{
Y->ResizeLike(*X_);
}
relu_layer_.configure(
X_->get_underlying(), Y->get_underlying(),
arm_compute::ActivationLayerInfo(
arm_compute::ActivationLayerInfo::ActivationFunction::RELU));
} else if (second_run_) {
X_->lazy_allocate(Xblob, second_run_, true);
second_run_ = false;
// in place activation, do not need to allocate new memory
if (Y->get_underlying() != X_->get_underlying()) {
Y->ResizeLike(*X_);
Y->allocate();
}
relu_layer_.run();
} else {
X_->lazy_allocate(Xblob, second_run_, true);
bool need_allocation = false;
if (Y->get_underlying() != X_->get_underlying()) {
need_allocation = Y->ResizeLike(*X_, true);
}
relu_layer_.configure(
X_->get_underlying(), Y->get_underlying(),
arm_compute::ActivationLayerInfo(
arm_compute::ActivationLayerInfo::ActivationFunction::RELU));
if (need_allocation) {
Y->allocate();
}
relu_layer_.run();
}
return true;
}
REGISTER_GL_OPERATOR(Relu, GLReluOp<DataType>);
template <typename T>
bool GLSigmoidOp<T>::RunOnDevice() {
auto *Xblob = OperatorBase::Inputs()[0];
X_ = GLContext::getGLTensor<T>(Xblob, X_.release());
GLTensor<T> *Y =
OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();
if (first_run_) {
first_run_ = false;
if (Y->get_underlying() != X_->get_underlying())
{
Y->ResizeLike(*X_);
}
sigmoid_layer_.configure(
X_->get_underlying(), Y->get_underlying(),
arm_compute::ActivationLayerInfo(
arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC));
} else if (second_run_) {
X_->lazy_allocate(Xblob, second_run_, true);
second_run_ = false;
// in place activation, do not need to allocate new memory
if (Y->get_underlying() != X_->get_underlying()) {
Y->ResizeLike(*X_);
Y->allocate();
}
sigmoid_layer_.run();
} else {
X_->lazy_allocate(Xblob, second_run_, true);
bool need_allocation = false;
if (Y->get_underlying() != X_->get_underlying())
{
need_allocation = Y->ResizeLike(*X_, true);
}
sigmoid_layer_.configure(
X_->get_underlying(), Y->get_underlying(),
arm_compute::ActivationLayerInfo(
arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC));
if (need_allocation) {
Y->allocate();
}
sigmoid_layer_.run();
}
return true;
}
REGISTER_GL_OPERATOR(Sigmoid, GLSigmoidOp<DataType>);
} // namespace caffe2


@ -1,38 +0,0 @@
#ifndef CAFFE2_OPENGL_OPERATORS_ACTIVATION_OPS_H_
#define CAFFE2_OPENGL_OPERATORS_ACTIVATION_OPS_H_
#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"
namespace caffe2 {
template <typename T>
class GLSigmoidOp final : public Operator<GLContext> {
public:
GLSigmoidOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<GLContext>(operator_def, ws) {}
USE_OPERATOR_FUNCTIONS(GLContext);
bool RunOnDevice() override;
private:
arm_compute::GCActivationLayer sigmoid_layer_;
bool first_run_ = true, second_run_ = true;
GLContext::deleted_unique_ptr<const GLTensor<T>> X_;
};
template <typename T> class GLReluOp final : public Operator<GLContext> {
public:
GLReluOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<GLContext>(operator_def, ws) {}
virtual ~GLReluOp() noexcept {}
USE_OPERATOR_FUNCTIONS(GLContext);
bool RunOnDevice() override;
private:
arm_compute::GCActivationLayer relu_layer_;
bool first_run_ = true, second_run_ = true;
GLContext::deleted_unique_ptr<const GLTensor<T>> X_;
};
} // namespace caffe2
#endif // CAFFE2_OPENGL_OPERATORS_ACTIVATION_OPS_H_


@ -1,116 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"
#include "caffe2/operators/concat_split_op.h"
namespace caffe2 {
template <typename T> class GLConcatOp final : public Operator<GLContext> {
public:
GLConcatOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<GLContext>(operator_def, ws) {}
virtual ~GLConcatOp() noexcept {}
USE_OPERATOR_FUNCTIONS(GLContext);
bool RunOnDevice() override;
private:
arm_compute::GCDepthConcatenateLayer concat_layer_;
bool first_run_ = true, second_run_ = true;
std::vector<GLContext::deleted_unique_ptr<const GLTensor<T>>> inputs_;
int channelCount_ = 0;
};
template <typename T>
bool GLConcatOp<T>::RunOnDevice() {
CAFFE_ENFORCE(InputSize() <= 4 && InputSize() >= 2,
"Number of inputs must be between 2 and 4.");
auto *X0blob = OperatorBase::Inputs()[0];
if (first_run_) {
auto X0 = GLContext::getGLTensor<T>(X0blob);
inputs_.push_back(std::move(X0));
} else {
auto X0 = GLContext::getGLTensor<T>(X0blob, inputs_[0].release());
inputs_[0] = std::move(X0);
}
int N = inputs_[0]->dim32(0);
int channels = inputs_[0]->dim32(1);
int height = inputs_[0]->dim32(2);
int width = inputs_[0]->dim32(3);
std::vector<const Blob*> inputsBlob;
inputsBlob.push_back(X0blob);
if (first_run_) {
channelCount_ = channels;
for (int i = 1; i < Inputs().size(); ++i) {
auto *Xblob = OperatorBase::Inputs()[i];
auto X = GLContext::getGLTensor<T>(Xblob);
CAFFE_ENFORCE_EQ(N, X->dim32(0), X->dim32(0));
CAFFE_ENFORCE_EQ(height, X->dim32(2), X->dim32(2));
CAFFE_ENFORCE_EQ(width, X->dim32(3), X->dim32(3));
channelCount_ += X->dim32(1);
inputs_.push_back(std::move(X));
}
} else {
channelCount_ = channels;
for (int i = 1; i < Inputs().size(); ++i) {
auto *Xblob = OperatorBase::Inputs()[i];
auto X = GLContext::getGLTensor<T>(Xblob, inputs_[i].release());
CAFFE_ENFORCE_EQ(N, X->dim32(0), X->dim32(0));
CAFFE_ENFORCE_EQ(height, X->dim32(2), X->dim32(2));
CAFFE_ENFORCE_EQ(width, X->dim32(3), X->dim32(3));
channelCount_ += X->dim32(1);
inputs_[i] = std::move(X);
}
}
for (int i = 1; i < Inputs().size(); ++i) {
auto *Xblob = OperatorBase::Inputs()[i];
inputsBlob.push_back(Xblob);
}
std::vector<int> output_dims = {N, channelCount_, height, width};
GLTensor<T> *Y =
OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();
if (first_run_) {
first_run_ = false;
Y->Resize(output_dims);
std::vector<arm_compute::IGCTensor*> inputsGC;
for (int i = 0; i < inputs_.size(); ++i) {
inputsGC.push_back(inputs_[i]->get_underlying());
}
concat_layer_.configure(inputsGC, Y->get_underlying());
} else if (second_run_) {
for (int i = 0; i < inputs_.size(); ++i) {
auto* X = inputs_[i].get();
auto* Xblob = inputsBlob[i];
X->lazy_allocate(Xblob, second_run_, true);
}
second_run_ = false;
Y->Resize(output_dims);
Y->allocate();
concat_layer_.run();
} else {
for (int i = 0; i < inputs_.size(); ++i) {
auto* X = inputs_[i].get();
auto* Xblob = inputsBlob[i];
X->lazy_allocate(Xblob, second_run_, true);
}
bool need_allocation = Y->Resize(output_dims);
std::vector<arm_compute::IGCTensor*> inputsGC;
for (int i = 0; i < inputs_.size(); ++i) {
inputsGC.push_back(inputs_[i]->get_underlying());
}
concat_layer_.configure(inputsGC, Y->get_underlying());
if (need_allocation) {
Y->allocate();
}
concat_layer_.run();
}
return true;
}
REGISTER_GL_OPERATOR(Concat, GLConcatOp<DataType>);
} // namespace caffe2


@ -1,113 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"
#include "caffe2/operators/conv_op.h"
namespace caffe2 {
template <typename T>
class GLConvOp final : public ConvPoolOpBase<GLContext> {
public:
USE_CONV_POOL_BASE_FUNCTIONS(GLContext);
GLConvOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<GLContext>(operator_def, ws) {
// Since this is the default convolution implementation, we will
// use CAFFE_ENFORCE instead of OPERATOR_NEEDS_FEATURE.
CAFFE_ENFORCE(
group_ == 1 || order_ == StorageOrder::NCHW,
"Group convolution only supports NCHW order right now.");
}
~GLConvOp() {}
bool RunOnDevice() override;
private:
arm_compute::GCDirectConvolutionLayer conv_;
bool first_run_ = true, second_run_ = true;
GLContext::deleted_unique_ptr<const GLTensor<T>> X_, filter_, bias_;
};
template <typename T>
bool GLConvOp<T>::RunOnDevice() {
auto *Xblob = OperatorBase::Inputs()[0];
auto *filterblob = OperatorBase::Inputs()[1];
auto *biasblob = OperatorBase::Inputs()[2];
X_ = GLContext::getGLTensor<T>(Xblob, X_.release());
if (first_run_) {
filter_ = GLContext::getGLTensor<T>(filterblob);
bias_ = GLContext::getGLTensor<T>(biasblob);
}
GLTensor<T> *Y =
OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();
const int N = X_->dim32(0), H = X_->dim32(2), W = X_->dim32(3), C = X_->dim32(1);
LOG(INFO) << "[C2DEBUG] Conv " << N << " " << H << " " << W << " " << C;
CAFFE_ENFORCE_EQ(kernel_.size(), 2,
"Only 2d convolution is supported with ARM compute backend");
CAFFE_ENFORCE(X_->ndim(), filter_->ndim());
const int M = filter_->dim32(0);
CAFFE_ENFORCE(filter_->dim32(2) == kernel_h());
CAFFE_ENFORCE(filter_->dim32(3) == kernel_w());
CAFFE_ENFORCE(filter_->dim32(1) == C);
if (first_run_) {
first_run_ = false;
// resize output accordingly
TensorCPU fakeX;
fakeX.Resize(X_->dims());
TensorCPU fakeY;
ConvPoolOpBase<GLContext>::SetOutputSize(fakeX, &fakeY, filter_->dim32(0));
Y->ResizeLike(fakeY);
LOG(INFO) << "[C2DEBUG] dims of X " << X_->dims();
LOG(INFO) << "[C2DEBUG] dims of X(gctensor) "
<< X_->get_underlying()->info()->dimension(3) << " "
<< X_->get_underlying()->info()->dimension(2) << " "
<< X_->get_underlying()->info()->dimension(1) << " "
<< X_->get_underlying()->info()->dimension(0) << " "
;
LOG(INFO) << "[C2DEBUG] dims of Y " << Y->dims();
LOG(INFO) << "[C2DEBUG] dims of Y(gctensor) "
<< Y->get_underlying()->info()->dimension(3) << " "
<< Y->get_underlying()->info()->dimension(2) << " "
<< Y->get_underlying()->info()->dimension(1) << " "
<< Y->get_underlying()->info()->dimension(0) << " "
;
conv_.configure(
X_->get_underlying(), filter_->get_underlying(), bias_->get_underlying(),
Y->get_underlying(),
arm_compute::PadStrideInfo(stride_[0], stride_[1], pads_[0], pads_[1]));
} else if (second_run_) {
// Always attempt to copy the CPU to GPU on input
X_->lazy_allocate(Xblob, second_run_, true);
filter_->lazy_allocate(filterblob, second_run_, second_run_);
bias_->lazy_allocate(biasblob, second_run_, second_run_);
second_run_ = false;
Y->allocate();
conv_.run();
} else {
X_->lazy_allocate(Xblob, second_run_, true);
TensorCPU fakeX;
fakeX.Resize(X_->dims());
TensorCPU fakeY;
ConvPoolOpBase<GLContext>::SetOutputSize(fakeX, &fakeY, filter_->dim32(0));
bool need_allocation = Y->ResizeLike(fakeY, true);
if (need_allocation) {
Y->allocate();
}
conv_.configure(
X_->get_underlying(), filter_->get_underlying(), bias_->get_underlying(),
Y->get_underlying(),
arm_compute::PadStrideInfo(stride_[0], stride_[1], pads_[0], pads_[1]));
conv_.run();
}
return true;
}
REGISTER_GL_OPERATOR(Conv, GLConvOp<DataType>);
} // namespace caffe2


@ -1,71 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"
#include "caffe2/core/timer.h"
namespace caffe2 {
template <typename T> class CopyFromGLOp final : public Operator<GLContext> {
public:
CopyFromGLOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<GLContext>(operator_def, ws) {}
virtual ~CopyFromGLOp() noexcept {}
USE_OPERATOR_FUNCTIONS(GLContext);
bool RunOnDevice() override;
private:
bool first_run_ = true, second_run_ = true;
std::vector<GLContext::deleted_unique_ptr<const GLTensor<T>>> inputs_;
};
template <typename T>
bool CopyFromGLOp<T>::RunOnDevice() {
std::vector<const Blob*> inputsBlob;
for (int i = 0; i < Inputs().size(); ++i) {
auto *Xblob = OperatorBase::Inputs()[i];
inputsBlob.push_back(Xblob);
}
if (first_run_) {
for (int i = 0; i < Inputs().size(); ++i) {
auto *Xblob = inputsBlob[i];
auto X = GLContext::getGLTensor<T>(Xblob);
inputs_.push_back(std::move(X));
}
} else {
for (int i = 0; i < Inputs().size(); ++i) {
auto *Xblob = inputsBlob[i];
auto X = GLContext::getGLTensor<T>(Xblob, inputs_[i].release());
inputs_[i] = std::move(X);
}
}
if (first_run_) {
first_run_ = false;
for (int i = 0; i < Inputs().size(); ++i) {
auto* Y = BlobGetMutableTensor(OperatorBase::Outputs()[i], CPU);
Y->Resize(inputs_[i]->dims());
Y->template mutable_data<float>();
}
} else {
for (auto i = 0; i < Inputs().size(); ++i) {
// Blob
auto* Xblob = inputsBlob[i];
// GLTensor
auto* X = inputs_[i].get();
X->lazy_allocate(Xblob, second_run_, true);
auto* Y = BlobGetMutableTensor(OperatorBase::Outputs()[i], CPU);
Timer timer;
timer.Start();
getTensorCPU(*X, *Y);
auto millis = timer.MilliSeconds();
//LOG(ERROR) << "[C2DEBUG] copy_op " << X->dims() << " takes " << millis << " milliseconds";
}
}
return true;
}
REGISTER_GL_OPERATOR(CopyFromGL, CopyFromGLOp<DataType>);
} // namespace caffe2


@ -1,58 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"
#include "caffe2/operators/utility_ops.h"
namespace caffe2 {
template <typename T> class GLSumOp final : public Operator<GLContext> {
public:
GLSumOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<GLContext>(operator_def, ws) {}
virtual ~GLSumOp() noexcept {}
USE_OPERATOR_FUNCTIONS(GLContext);
bool RunOnDevice() override;
private:
arm_compute::GCArithmeticAddition add_layer_;
bool first_run_ = true, second_run_ = true;
GLContext::deleted_unique_ptr<const GLTensor<T>> A_, B_;
};
template <typename T>
bool GLSumOp<T>::RunOnDevice() {
auto *Ablob = OperatorBase::Inputs()[0];
auto *Bblob = OperatorBase::Inputs()[1];
A_ = GLContext::getGLTensor<T>(Ablob, A_.release());
B_ = GLContext::getGLTensor<T>(Bblob, B_.release());
GLTensor<T> *Y =
OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();
if (first_run_) {
first_run_ = false;
Y->ResizeLike(*A_);
add_layer_.configure(A_->get_underlying(), B_->get_underlying(), Y->get_underlying(), arm_compute::ConvertPolicy::SATURATE);
} else if (second_run_) {
A_->lazy_allocate(Ablob, second_run_, true);
B_->lazy_allocate(Bblob, second_run_, true);
second_run_ = false;
Y->allocate();
add_layer_.run();
} else {
A_->lazy_allocate(Ablob, second_run_, true);
B_->lazy_allocate(Bblob, second_run_, true);
bool need_allocation = Y->ResizeLike(*A_);
add_layer_.configure(A_->get_underlying(), B_->get_underlying(), Y->get_underlying(), arm_compute::ConvertPolicy::SATURATE);
if (need_allocation) {
Y->allocate();
}
add_layer_.run();
}
return true;
}
REGISTER_GL_OPERATOR(Sum, GLSumOp<DataType>);
REGISTER_GL_OPERATOR(Add, GLSumOp<DataType>);
} // namespace caffe2


@ -1,76 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"
#include "caffe2/operators/fully_connected_op.h"
namespace caffe2 {
template <typename T> class GLFullyConnectedOp final : public Operator<GLContext> {
public:
GLFullyConnectedOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<GLContext>(operator_def, ws) {}
virtual ~GLFullyConnectedOp() noexcept {}
USE_OPERATOR_FUNCTIONS(GLContext);
bool RunOnDevice() override;
private:
arm_compute::GCFullyConnectedLayer fc_layer_;
bool first_run_ = true, second_run_ = true;
GLContext::deleted_unique_ptr<const GLTensor<T>> X_, W_, B_;
};
template <typename T>
bool GLFullyConnectedOp<T>::RunOnDevice() {
auto Xblob = OperatorBase::Inputs()[0];
auto *Wblob = OperatorBase::Inputs()[1];
auto *Bblob = OperatorBase::Inputs()[2];
X_ = GLContext::getGLTensor<T>(Xblob, X_.release());
if (first_run_) {
W_ = GLContext::getGLTensor<T>(Wblob);
B_ = GLContext::getGLTensor<T>(Bblob);
}
auto M = X_->dim32(0);
auto CIn = X_->dim32(1);
auto Height = X_->dim32(2);
auto Width = X_->dim32(3);
auto N = W_->dim32(0);
CAFFE_ENFORCE_EQ(1, B_->ndim());
CAFFE_ENFORCE_EQ(N, B_->dim32(0));
vector<int64_t> output_dims = {M, N};
GLTensor<T> *Y =
OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();
if (first_run_) {
first_run_ = false;
Y->Resize(output_dims);
fc_layer_.configure(X_->get_underlying(), W_->get_underlying(),
B_->get_underlying(), Y->get_underlying(), true, false);
} else if (second_run_) {
X_->lazy_allocate(Xblob, second_run_, true);
W_->lazy_allocate(Wblob, second_run_, second_run_);
B_->lazy_allocate(Bblob, second_run_, second_run_);
second_run_ = false;
Y->Resize(output_dims);
Y->allocate();
fc_layer_.run();
} else {
X_->lazy_allocate(Xblob, second_run_, true);
bool need_allocation = Y->Resize(output_dims);
fc_layer_.configure(X_->get_underlying(), W_->get_underlying(),
B_->get_underlying(), Y->get_underlying(), true, false);
if (need_allocation) {
Y->allocate();
}
fc_layer_.run();
}
return true;
}
REGISTER_GL_OPERATOR(FC, GLFullyConnectedOp<DataType>);
} // namespace caffe2


@ -1,70 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"
namespace caffe2 {
template <typename T>
class GLNormalizePlanarYUVOp final : public Operator<GLContext> {
public:
GLNormalizePlanarYUVOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<GLContext>(operator_def, ws) {}
virtual ~GLNormalizePlanarYUVOp() noexcept {}
USE_OPERATOR_FUNCTIONS(GLContext);
bool RunOnDevice() override;
private:
arm_compute::GCNormalizePlanarYUVLayer norm_layer_;
bool first_run_ = true, second_run_ = true;
GLContext::deleted_unique_ptr<const GLTensor<T>> X_, mean_, sd_;
};
template <typename T> bool GLNormalizePlanarYUVOp<T>::RunOnDevice() {
auto Xblob = OperatorBase::Inputs()[0];
auto *meanblob = OperatorBase::Inputs()[1];
auto *sdblob = OperatorBase::Inputs()[2];
X_ = GLContext::getGLTensor<T>(Xblob, X_.release());
if (first_run_) {
mean_ = GLContext::getGLTensor<T>(meanblob);
sd_ = GLContext::getGLTensor<T>(sdblob);
}
CAFFE_ENFORCE_EQ(X_->ndim(), 4);
auto N = X_->dim32(0);
auto C = X_->dim32(1);
auto H = X_->dim32(2);
auto W = X_->dim32(3);
CAFFE_ENFORCE_EQ(C, mean_->dim32(1));
CAFFE_ENFORCE_EQ(C, sd_->dim32(1));
GLTensor<T> *Y =
OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();
if (first_run_) {
first_run_ = false;
Y->ResizeLike(*X_);
norm_layer_.configure(X_->get_underlying(), Y->get_underlying(), mean_->get_underlying(), sd_->get_underlying());
} else if (second_run_) {
X_->lazy_allocate(Xblob, second_run_, true);
mean_->lazy_allocate(meanblob, second_run_, second_run_);
sd_->lazy_allocate(sdblob, second_run_, second_run_);
second_run_ = false;
Y->ResizeLike(*X_);
Y->allocate();
norm_layer_.run();
} else {
X_->lazy_allocate(Xblob, second_run_, true);
bool need_allocation = Y->ResizeLike(*X_);
norm_layer_.configure(X_->get_underlying(), Y->get_underlying(), mean_->get_underlying(), sd_->get_underlying());
if (need_allocation) {
Y->allocate();
}
norm_layer_.run();
}
return true;
}
REGISTER_GL_OPERATOR(NormalizePlanarYUV, GLNormalizePlanarYUVOp<DataType>);
} // namespace caffe2


@ -1,186 +0,0 @@
#include "caffe2/operators/pool_op.h"
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"
namespace caffe2 {
template <typename T>
class GLAveragePoolOp final : public ConvPoolOpBase<GLContext> {
public:
USE_CONV_POOL_BASE_FUNCTIONS(GLContext);
GLAveragePoolOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<GLContext>(operator_def, ws) {
}
~GLAveragePoolOp() {}
bool RunOnDeviceWithOrderNCHW() override;
bool RunOnDeviceWithOrderNHWC() override;
private:
arm_compute::GCPoolingLayer pooling_layer_;
bool first_run_ = true, second_run_ = true;
GLContext::deleted_unique_ptr<const GLTensor<T>> X_;
};
template<typename T>
class GLMaxPoolOp final : public ConvPoolOpBase<GLContext> {
public:
USE_CONV_POOL_BASE_FUNCTIONS(GLContext);
GLMaxPoolOp(const OperatorDef& operator_def, Workspace* ws)
: ConvPoolOpBase<GLContext>(operator_def, ws) {
}
~GLMaxPoolOp() {}
bool RunOnDeviceWithOrderNCHW() override;
bool RunOnDeviceWithOrderNHWC() override;
private:
arm_compute::GCPoolingLayer pooling_layer_;
bool first_run_ = true, second_run_ = true;
GLContext::deleted_unique_ptr<const GLTensor<T>> X_;
};
template <>
bool GLAveragePoolOp<DataType>::RunOnDeviceWithOrderNCHW() {
auto *Xblob = OperatorBase::Inputs()[0];
if (first_run_) {
X_ = GLContext::getGLTensor<DataType>(Xblob);
} else {
X_ = GLContext::getGLTensor<DataType>(Xblob, X_.release());
}
int N = X_->dim32(0);
int channels = X_->dim32(1);
int height = X_->dim32(2);
int width = X_->dim32(3);
vector<int64_t> output_dims = {N, channels, 1, 1};
if (!global_pooling_) {
output_dims[2] = (height + pad_t() + pad_b() - kernel_h()) / stride_h() + 1;
output_dims[3] = (width + pad_l() + pad_r() - kernel_w()) / stride_w() + 1;
}
GLTensor<DataType> *Y =
OperatorBase::Outputs()[0]->template GetMutable<GLTensor<DataType>>();
if (first_run_) {
first_run_ = false;
CAFFE_ENFORCE_EQ(kernel_.size(), 2, "ARM OpenGL only supports 2D pooling");
CAFFE_ENFORCE_EQ(kernel_h(), kernel_w(),
"ARM OpenGL only supports equal kernel size");
Y->Resize(output_dims);
if (global_pooling_) {
arm_compute::PoolingLayerInfo info(arm_compute::PoolingType::AVG);
pooling_layer_.configure(X_->get_underlying(), Y->get_underlying(), info);
} else {
arm_compute::PadStrideInfo ps_info(stride_w(), stride_h(), pad_l(), pad_r(),
pad_t(), pad_b(),
arm_compute::DimensionRoundingType::FLOOR);
arm_compute::PoolingLayerInfo info(arm_compute::PoolingType::AVG, kernel_h(),
ps_info);
pooling_layer_.configure(X_->get_underlying(), Y->get_underlying(), info);
}
} else if (second_run_) {
X_->lazy_allocate(Xblob, second_run_, true);
second_run_ = false;
Y->Resize(output_dims);
Y->allocate();
pooling_layer_.run();
} else {
X_->lazy_allocate(Xblob, second_run_, true);
bool need_allocation = Y->Resize(output_dims);
if (global_pooling_) {
arm_compute::PoolingLayerInfo info(arm_compute::PoolingType::AVG);
pooling_layer_.configure(X_->get_underlying(), Y->get_underlying(), info);
} else {
arm_compute::PadStrideInfo ps_info(stride_w(), stride_h(), pad_l(), pad_r(),
pad_t(), pad_b(),
arm_compute::DimensionRoundingType::FLOOR);
arm_compute::PoolingLayerInfo info(arm_compute::PoolingType::AVG, kernel_h(),
ps_info);
pooling_layer_.configure(X_->get_underlying(), Y->get_underlying(), info);
}
if (need_allocation) {
Y->allocate();
}
pooling_layer_.run();
}
return true;
}
template <> bool GLMaxPoolOp<DataType>::RunOnDeviceWithOrderNCHW() {
auto *Xblob = OperatorBase::Inputs()[0];
X_ = GLContext::getGLTensor<DataType>(Xblob, X_.release());
int N = X_->dim32(0);
int channels = X_->dim32(1);
int height = X_->dim32(2);
int width = X_->dim32(3);
vector<int64_t> output_dims = {N, channels, 1, 1};
if (!global_pooling_) {
output_dims[2] = (height + pad_t() + pad_b() - kernel_h()) / stride_h() + 1;
output_dims[3] = (width + pad_l() + pad_r() - kernel_w()) / stride_w() + 1;
}
GLTensor<DataType> *Y =
OperatorBase::Outputs()[0]->template GetMutable<GLTensor<DataType>>();
if (first_run_) {
first_run_ = false;
CAFFE_ENFORCE_EQ(kernel_.size(), 2, "ARM OpenGL only supports 2D pooling");
CAFFE_ENFORCE_EQ(kernel_h(), kernel_w(),
"ARM OpenGL only supports equal kernel size");
Y->Resize(output_dims);
if (global_pooling_) {
arm_compute::PoolingLayerInfo info(arm_compute::PoolingType::MAX);
pooling_layer_.configure(X_->get_underlying(), Y->get_underlying(), info);
} else {
arm_compute::PadStrideInfo ps_info(stride_w(), stride_h(), pad_l(), pad_r(),
pad_t(), pad_b(),
arm_compute::DimensionRoundingType::FLOOR);
arm_compute::PoolingLayerInfo info(arm_compute::PoolingType::MAX, kernel_h(),
ps_info);
pooling_layer_.configure(X_->get_underlying(), Y->get_underlying(), info);
}
} else if (second_run_) {
X_->lazy_allocate(Xblob, second_run_, true);
second_run_ = false;
Y->Resize(output_dims);
Y->allocate();
pooling_layer_.run();
} else {
X_->lazy_allocate(Xblob, second_run_, true);
bool need_allocation = Y->Resize(output_dims);
if (global_pooling_) {
arm_compute::PoolingLayerInfo info(arm_compute::PoolingType::MAX);
pooling_layer_.configure(X_->get_underlying(), Y->get_underlying(), info);
} else {
arm_compute::PadStrideInfo ps_info(stride_w(), stride_h(), pad_l(), pad_r(),
pad_t(), pad_b(),
arm_compute::DimensionRoundingType::FLOOR);
arm_compute::PoolingLayerInfo info(arm_compute::PoolingType::MAX, kernel_h(),
ps_info);
pooling_layer_.configure(X_->get_underlying(), Y->get_underlying(), info);
}
if (need_allocation) {
Y->allocate();
}
pooling_layer_.run();
}
return true;
}
template <>
bool GLAveragePoolOp<DataType>::RunOnDeviceWithOrderNHWC() {
return false;
}
template <>
bool GLMaxPoolOp<DataType>::RunOnDeviceWithOrderNHWC() {
return false;
}
REGISTER_GL_OPERATOR(AveragePool, GLAveragePoolOp<DataType>);
REGISTER_GL_OPERATOR(MaxPool, GLMaxPoolOp<DataType>);
} // namespace caffe2

View File

@ -1,29 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"
#include "caffe2/operators/reshape_op.h"
namespace caffe2 {
template <typename T> class GLReshapeOp final : public Operator<GLContext> {
public:
GLReshapeOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<GLContext>(operator_def, ws) {}
virtual ~GLReshapeOp() noexcept {}
USE_OPERATOR_FUNCTIONS(GLContext);
bool RunOnDevice() override;
};
template <typename T>
bool GLReshapeOp<T>::RunOnDevice() {
auto *Xblob = OperatorBase::Inputs()[0];
auto X = GLContext::getGLTensor<T>(Xblob);
auto arg = OperatorBase::GetRepeatedArgument<int>("shape");
for (int i = 0; i < arg.size(); ++i) {
LOG(INFO) << "[C2DEBUG] shape: " << arg[i];
}
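  // Note: this operator only logs the requested shape; it does not perform the
  // reshape on the GLTensor.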
return true;
}
REGISTER_GL_OPERATOR(Reshape, GLReshapeOp<DataType>);
} // namespace caffe2

View File

@ -1,74 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"
#include "caffe2/operators/resize_op.h"
namespace caffe2 {
template<typename T>
class GLResizeNearestOp final : public Operator<GLContext> {
public:
GLResizeNearestOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<GLContext>(operator_def, ws), width_scale_(1), height_scale_(1) {
if (HasArgument("width_scale")) {
width_scale_ = static_cast<float>(
OperatorBase::GetSingleArgument<float>("width_scale", 1));
}
if (HasArgument("height_scale")) {
height_scale_ = static_cast<float>(
OperatorBase::GetSingleArgument<float>("height_scale", 1));
}
CAFFE_ENFORCE_GT(width_scale_, 0);
CAFFE_ENFORCE_GT(height_scale_, 0);
}
virtual ~GLResizeNearestOp() noexcept {}
USE_OPERATOR_FUNCTIONS(GLContext);
bool RunOnDevice() override;
private:
float width_scale_;
float height_scale_;
arm_compute::GCScale resize_layer_;
bool first_run_ = true, second_run_ = true;
GLContext::deleted_unique_ptr<const GLTensor<T>> X_;
};
template <typename T>
bool GLResizeNearestOp<T>::RunOnDevice() {
auto* Xblob = OperatorBase::Inputs()[0];
X_ = GLContext::getGLTensor<T>(Xblob, X_.release());
auto N = X_->dim32(0);
auto C = X_->dim32(1);
auto H = X_->dim32(2);
auto W = X_->dim32(3);
GLTensor<T> *Y =
OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();
  vector<int64_t> output_dims = {
      N, C, int64_t(H * height_scale_), int64_t(W * width_scale_)};
if (first_run_) {
Y->Resize(output_dims);
first_run_ = false;
resize_layer_.configure(X_->get_underlying(), Y->get_underlying(), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, arm_compute::BorderMode::UNDEFINED);
} else if (second_run_) {
X_->lazy_allocate(Xblob, second_run_, true);
second_run_ = false;
Y->Resize(output_dims);
Y->allocate();
resize_layer_.run();
} else {
X_->lazy_allocate(Xblob, second_run_, true);
bool need_allocation = Y->Resize(output_dims);
resize_layer_.configure(X_->get_underlying(), Y->get_underlying(), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, arm_compute::BorderMode::UNDEFINED);
if (need_allocation) {
Y->allocate();
}
}
return true;
}
REGISTER_GL_OPERATOR(ResizeNearest, GLResizeNearestOp<DataType>);
} // namespace caffe2

View File

@ -1,54 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"
#include "caffe2/operators/softmax_op.h"
namespace caffe2 {
template <typename T> class GLSoftmaxOp final : public Operator<GLContext> {
public:
GLSoftmaxOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<GLContext>(operator_def, ws) {}
virtual ~GLSoftmaxOp() noexcept {}
USE_OPERATOR_FUNCTIONS(GLContext);
bool RunOnDevice() override;
private:
arm_compute::GCSoftmaxLayer softmax_layer_;
bool first_run_ = true, second_run_ = true;
GLContext::deleted_unique_ptr<const GLTensor<T>> X_;
};
template <typename T>
bool GLSoftmaxOp<T>::RunOnDevice() {
auto *Xblob = OperatorBase::Inputs()[0];
X_ = GLContext::getGLTensor<T>(Xblob, X_.release());
GLTensor<T> *Y =
OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();
if (first_run_) {
first_run_ = false;
Y->ResizeLike(*X_);
softmax_layer_.configure(X_->get_underlying(), Y->get_underlying());
} else if (second_run_) {
X_->lazy_allocate(Xblob, second_run_, true);
second_run_ = false;
Y->ResizeLike(*X_);
Y->allocate();
softmax_layer_.run();
} else {
X_->lazy_allocate(Xblob, second_run_, true);
bool need_allocation = Y->ResizeLike(*X_);
softmax_layer_.configure(X_->get_underlying(), Y->get_underlying());
if (need_allocation) {
Y->allocate();
}
softmax_layer_.run();
}
return true;
}
REGISTER_GL_OPERATOR(Softmax, GLSoftmaxOp<DataType>);
} // namespace caffe2

View File

@ -1,93 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"
#include "caffe2/operators/spatial_batch_norm_op.h"
namespace caffe2 {
template <typename T> class GLSpatialBNOp final : public Operator<GLContext> {
public:
GLSpatialBNOp(const OperatorDef &operator_def, Workspace *ws)
: Operator<GLContext>(operator_def, ws),
is_test_(OperatorBase::GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)),
epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5)),
momentum_(OperatorBase::GetSingleArgument<float>("momentum", 0.9)),
order_(StringToStorageOrder(
OperatorBase::GetSingleArgument<string>("order", "NCHW"))) { }
virtual ~GLSpatialBNOp() noexcept {}
USE_OPERATOR_FUNCTIONS(GLContext);
bool RunOnDevice() override;
protected:
bool is_test_;
double epsilon_;
double momentum_;
StorageOrder order_;
INPUT_TAGS(INPUT, SCALE, BIAS, EST_MEAN, EST_VAR);
OUTPUT_TAGS(OUTPUT, RUNNING_MEAN, RUNNING_VAR, SAVED_MEAN, SAVED_INV_VAR);
private:
arm_compute::GCBatchNormalizationLayer bn_layer_;
bool first_run_ = true, second_run_ = true;
GLContext::deleted_unique_ptr<const GLTensor<T>> X_, mean_, var_, bias_, scale_;
};
template <typename T>
bool GLSpatialBNOp<T>::RunOnDevice() {
auto *XBlob = OperatorBase::Inputs()[0];
auto *scaleBlob = OperatorBase::Inputs()[SCALE];
auto *biasBlob = OperatorBase::Inputs()[BIAS];
auto *meanBlob = OperatorBase::Inputs()[EST_MEAN];
auto *varBlob = OperatorBase::Inputs()[EST_VAR];
X_ = GLContext::getGLTensor<T>(XBlob, X_.release());
if (first_run_) {
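    // The scale, bias and running statistics are constant across runs, so
    // their GLTensors are only created on the first run.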
scale_ = GLContext::getGLTensor<T>(scaleBlob);
bias_ = GLContext::getGLTensor<T>(biasBlob);
mean_ = GLContext::getGLTensor<T>(meanBlob);
var_ = GLContext::getGLTensor<T>(varBlob);
}
auto C = X_->dim32(1);
CAFFE_ENFORCE_EQ(scale_->ndim(), 1);
CAFFE_ENFORCE_EQ(bias_->ndim(), 1);
CAFFE_ENFORCE_EQ(mean_->ndim(), 1);
CAFFE_ENFORCE_EQ(var_->ndim(), 1);
CAFFE_ENFORCE_EQ(scale_->dim32(0), C);
CAFFE_ENFORCE_EQ(bias_->dim32(0), C);
CAFFE_ENFORCE_EQ(mean_->dim32(0), C);
CAFFE_ENFORCE_EQ(var_->dim32(0), C);
GLTensor<T> *Y =
OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();
if (first_run_) {
first_run_ = false;
Y->ResizeLike(*X_);
bn_layer_.configure(X_->get_underlying(), Y->get_underlying(),
mean_->get_underlying(), var_->get_underlying(),
bias_->get_underlying(), scale_->get_underlying(), epsilon_);
} else if (second_run_) {
X_->lazy_allocate(XBlob, second_run_, true);
scale_->lazy_allocate(scaleBlob, second_run_, second_run_);
bias_->lazy_allocate(biasBlob, second_run_, second_run_);
mean_->lazy_allocate(meanBlob, second_run_, second_run_);
var_->lazy_allocate(varBlob, second_run_, second_run_);
second_run_ = false;
Y->ResizeLike(*X_);
Y->allocate();
bn_layer_.run();
} else {
X_->lazy_allocate(XBlob, second_run_, true);
bool need_allocation = Y->ResizeLike(*X_);
bn_layer_.configure(X_->get_underlying(), Y->get_underlying(),
mean_->get_underlying(), var_->get_underlying(),
bias_->get_underlying(), scale_->get_underlying(), epsilon_);
if (need_allocation) {
Y->allocate();
}
}
return true;
}
REGISTER_GL_OPERATOR(SpatialBN, GLSpatialBNOp<DataType>);
} // namespace caffe2

View File

@ -1,22 +0,0 @@
set -vex
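# Push every gl* test binary found under the Caffe2 build directory to the
# device and run it through adb.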
if [ -z "$CAFFE2_BINARY_DIR" ] ; then
if [ -z "$1" ] ; then
CAFFE2_BINARY_DIR=.
else
CAFFE2_BINARY_DIR=$1
fi
fi
files=($(find "$CAFFE2_BINARY_DIR" -type f -name "*_test"))
for test_binary in "${files[@]}";
do
test_binary_base=$(basename "$test_binary")
if [[ $test_binary_base == gl* ]]; then
echo "Running $test_binary_base"
adb push "$test_binary" "/data/local/tmp/$test_binary_base"
adb shell "GLOG_logtostderr=1 /data/local/tmp/$test_binary_base"
fi
done
echo All tests passed.

View File

@ -1,2 +0,0 @@
file(GLOB tmp *_test.cc)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp} PARENT_SCOPE)

View File

@ -1,70 +0,0 @@
#include "gl_operator_test.h"
namespace caffe2 {
TEST(OPENGLOperatorTest, Sigmoid) {
Workspace ws;
PopulateCPUBlob(&ws, true, "cpu_X", {1, 4, 4, 4});
NetDef cpu_net;
{
AddOp(&cpu_net, "Sigmoid", {"cpu_X"}, {"ref_Y"});
}
NetDef gpu_net;
gpu_net.set_type("opengl");
{
OperatorDef* def = AddOp(&gpu_net, "Sigmoid", {"cpu_X"}, {"gpu_Y"});
MAKE_OPENGL_OPERATOR(def);
}
compareNetResult(ws, cpu_net, gpu_net);
}
TEST(OPENGLOperatorTest, ReLU) {
Workspace ws;
PopulateCPUBlob(&ws, true, "cpu_X", {1, 4, 4, 4});
NetDef cpu_net;
{
AddOp(&cpu_net, "Relu", {"cpu_X"}, {"ref_Y"});
}
NetDef gpu_net;
gpu_net.set_type("opengl");
{
OperatorDef* def = AddOp(&gpu_net, "Relu", {"cpu_X"}, {"gpu_Y"});
MAKE_OPENGL_OPERATOR(def);
}
compareNetResult(ws, cpu_net, gpu_net);
}
TEST(OPENGLOperatorTest, SigmoidTwice) {
Workspace ws;
PopulateCPUBlob(&ws, true, "cpu_X", {1, 4, 4, 4});
NetDef cpu_net;
{
AddOp(&cpu_net, "Sigmoid", {"cpu_X"}, {"ref_Y1"});
AddOp(&cpu_net, "Sigmoid", {"ref_Y1"}, {"ref_Y2"});
}
NetDef gpu_net;
gpu_net.set_type("opengl");
{
OperatorDef* def = AddOp(&gpu_net, "Sigmoid", {"cpu_X"}, {"gpu_Y1"});
MAKE_OPENGL_OPERATOR(def);
}
{
OperatorDef* def = AddOp(&gpu_net, "Sigmoid", {"gpu_Y1"}, {"gpu_Y2"});
MAKE_OPENGL_OPERATOR(def);
}
compareNetResult(ws, cpu_net, gpu_net, "ref_Y2", "gpu_Y2");
}
} // namespace caffe2

View File

@ -1,197 +0,0 @@
#include "gl_operator_test.h"
#include "caffe2/core/timer.h"
namespace caffe2 {
constexpr float tol = 5.0e-2;
// {MaxPool, Relu, Add} followed by pad 1 conv
TEST(OPENGLOperatorTest, ConvMaxPoolConv) {
Workspace ws;
auto channel_in = 16;
auto channel_out = 16;
auto spatial = 32;
auto kern = 3;
PopulateCPUBlob(&ws, true, "cpu_X", {1, channel_in, spatial, spatial}, 1337);
PopulateCPUBlob(&ws, true, "W", {channel_out, channel_in, kern, kern}, 1337);
PopulateCPUBlob(&ws, false, "b", {channel_out}, 0);
PopulateCPUBlob(&ws, true, "W2", {channel_out, channel_in, kern, kern});
PopulateCPUBlob(&ws, true, "b2", {channel_out});
#define ADD_CONV_ARGS \
{ \
ADD_ARG((*def), "kernel", i, kern); \
ADD_ARG((*def), "stride", i, 1); \
ADD_ARG((*def), "pad", i, 1); \
ADD_ARG((*def), "order", s, "NCHW"); \
}
NetDef cpu_net;
{
OperatorDef* def = AddOp(&cpu_net, "Conv", {"cpu_X", "W", "b"}, {"ref_Y"});
def->set_name("cpu_conv");
ADD_CONV_ARGS;
}
{
OperatorDef* def = AddOp(&cpu_net, "MaxPool", {"ref_Y"}, {"ref_maxpool"});
ADD_ARG((*def), "kernel", i, 2);
ADD_ARG((*def), "pad", i, 0);
ADD_ARG((*def), "stride_w", i, 2);
ADD_ARG((*def), "stride_h", i, 2);
ADD_ARG((*def), "order", s, "NCHW");
}
{
OperatorDef* def = AddOp(&cpu_net, "Conv", {"ref_maxpool", "W2", "b2"}, {"ref_Y2"});
ADD_CONV_ARGS;
}
NetDef gpu_net;
gpu_net.set_type("opengl");
{
OperatorDef* def = AddOp(&gpu_net, "Conv", {"cpu_X", "W", "b"}, {"gpu_Y"});
MAKE_OPENGL_OPERATOR(def);
ADD_CONV_ARGS;
}
{
OperatorDef* def = AddOp(&gpu_net, "MaxPool", {"gpu_Y"}, {"gpu_maxpool"});
ADD_ARG((*def), "kernel", i, 2);
ADD_ARG((*def), "pad", i, 0);
ADD_ARG((*def), "stride_w", i, 2);
ADD_ARG((*def), "stride_h", i, 2);
ADD_ARG((*def), "order", s, "NCHW");
MAKE_OPENGL_OPERATOR(def);
}
{
OperatorDef* def = AddOp(&gpu_net, "Conv", {"gpu_maxpool", "W2", "b2"}, {"gpu_Y2"});
MAKE_OPENGL_OPERATOR(def);
ADD_CONV_ARGS;
}
#undef ADD_CONV_ARGS
// will work after next release of ACL
// compareNetResult4D(ws, cpu_net, gpu_net, "ref_Y2", "gpu_Y2", tol);
}
TEST(OPENGLOperatorTest, ConvReluConv) {
Workspace ws;
auto channel_in = 16;
auto channel_out = 16;
auto spatial = 32;
auto kern = 3;
PopulateCPUBlob(&ws, true, "cpu_X", {1, channel_in, spatial, spatial}, 1337);
PopulateCPUBlob(&ws, true, "W", {channel_out, channel_in, kern, kern}, 1337);
PopulateCPUBlob(&ws, false, "b", {channel_out}, 0);
PopulateCPUBlob(&ws, true, "W2", {channel_out, channel_in, kern, kern});
PopulateCPUBlob(&ws, true, "b2", {channel_out});
#define ADD_CONV_ARGS \
{ \
ADD_ARG((*def), "kernel", i, kern); \
ADD_ARG((*def), "stride", i, 1); \
ADD_ARG((*def), "pad", i, 1); \
ADD_ARG((*def), "order", s, "NCHW"); \
}
NetDef cpu_net;
{
OperatorDef* def = AddOp(&cpu_net, "Conv", {"cpu_X", "W", "b"}, {"ref_Y"});
def->set_name("cpu_conv");
ADD_CONV_ARGS;
}
{
OperatorDef* def = AddOp(&cpu_net, "Relu", {"ref_Y"}, {"ref_relu"});
}
{
OperatorDef* def = AddOp(&cpu_net, "Conv", {"ref_relu", "W2", "b2"}, {"ref_Y2"});
ADD_CONV_ARGS;
}
NetDef gpu_net;
gpu_net.set_type("opengl");
{
OperatorDef* def = AddOp(&gpu_net, "Conv", {"cpu_X", "W", "b"}, {"gpu_Y"});
MAKE_OPENGL_OPERATOR(def);
ADD_CONV_ARGS;
}
{
OperatorDef* def = AddOp(&gpu_net, "Relu", {"gpu_Y"}, {"gpu_relu"});
MAKE_OPENGL_OPERATOR(def);
}
{
OperatorDef* def = AddOp(&gpu_net, "Conv", {"gpu_relu", "W2", "b2"}, {"gpu_Y2"});
MAKE_OPENGL_OPERATOR(def);
ADD_CONV_ARGS;
}
#undef ADD_CONV_ARGS
// will work after next release of ACL
// compareNetResult4D(ws, cpu_net, gpu_net, "ref_Y2", "gpu_Y2", tol);
}
TEST(OPENGLOperatorTest, ConvAddConv) {
Workspace ws;
auto channel_in = 16;
auto channel_out = 16;
  auto spatial = 32; // 3x3 kernels with pad 1 keep the 32x32 spatial size
auto kern = 3;
PopulateCPUBlob(&ws, true, "cpu_X", {1, channel_in, spatial, spatial}, 1337);
PopulateCPUBlob(&ws, true, "W", {channel_out, channel_in, kern, kern}, 1337);
PopulateCPUBlob(&ws, false, "b", {channel_out}, 0);
PopulateCPUBlob(&ws, true, "W2", {channel_out, channel_in, kern, kern});
PopulateCPUBlob(&ws, true, "b2", {channel_out});
PopulateCPUBlob(&ws, true, "cpu_Y", {1, channel_in, spatial, spatial}, 1337);
#define ADD_CONV_ARGS \
{ \
ADD_ARG((*def), "kernel", i, kern); \
ADD_ARG((*def), "stride", i, 1); \
ADD_ARG((*def), "pad", i, 1); \
ADD_ARG((*def), "order", s, "NCHW"); \
}
NetDef cpu_net;
{
OperatorDef* def = AddOp(&cpu_net, "Conv", {"cpu_X", "W", "b"}, {"ref_Y"});
def->set_name("cpu_conv");
ADD_CONV_ARGS;
}
{
OperatorDef* def = AddOp(&cpu_net, "Add", {"ref_Y", "cpu_Y"}, {"ref_add"});
}
{
OperatorDef* def = AddOp(&cpu_net, "Conv", {"ref_add", "W2", "b2"}, {"ref_Y2"});
ADD_CONV_ARGS;
}
NetDef gpu_net;
gpu_net.set_type("opengl");
{
OperatorDef* def = AddOp(&gpu_net, "Conv", {"cpu_X", "W", "b"}, {"gpu_Y"});
MAKE_OPENGL_OPERATOR(def);
ADD_CONV_ARGS;
}
{
OperatorDef* def = AddOp(&gpu_net, "Add", {"gpu_Y", "cpu_Y"}, {"gpu_add"});
MAKE_OPENGL_OPERATOR(def);
}
{
OperatorDef* def = AddOp(&gpu_net, "Conv", {"gpu_add", "W2", "b2"}, {"gpu_Y2"});
MAKE_OPENGL_OPERATOR(def);
ADD_CONV_ARGS;
}
#undef ADD_CONV_ARGS
// will work after next release of ACL
// compareNetResult4D(ws, cpu_net, gpu_net, "ref_Y2", "gpu_Y2", tol);
}
} // namespace caffe2

View File

@ -1,49 +0,0 @@
#include "gl_operator_test.h"
namespace caffe2 {
TEST(OPENGLOperatorTest, Concat) {
for (auto Cs: std::vector<std::vector<int>>{
{4, 4},
{4, 4, 4},
{6, 6, 6},
{16, 8, 4},
{32, 8, 16, 4},
}) {
Workspace ws;
int batchSize = 1;
int H = 8;
int W = 8;
for (int i = 0; i < Cs.size(); ++i) {
PopulateCPUBlob(
&ws,
true,
std::string("cpu_X") + c10::to_string(i),
{batchSize, Cs[i], H, W});
}
NetDef cpu_net;
{
OperatorDef* def = AddOp(&cpu_net, "Concat", {}, {"ref_Y", "cpu_dummy"});
for (int i = 0; i < Cs.size(); ++i ) {
def->add_input(std::string("cpu_X") + c10::to_string(i));
}
}
NetDef gpu_net;
gpu_net.set_type("opengl");
{
OperatorDef* def = AddOp(&gpu_net, "Concat", {}, {"gpu_Y", "gpu_dummy"});
MAKE_OPENGL_OPERATOR(def);
for (int i = 0; i < Cs.size(); ++i ) {
def->add_input(std::string("cpu_X") + c10::to_string(i));
}
}
compareNetResult(ws, cpu_net, gpu_net);
}
}
} // namespace caffe2

View File

@ -1,11 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include <gtest/gtest.h>
namespace caffe2 {
TEST(OPENGLContextTest, Initialization) {
auto gc = new GLContext();
delete gc;
}
} // namespace caffe2

View File

@ -1,162 +0,0 @@
#include "gl_operator_test.h"
#include "caffe2/core/timer.h"
namespace caffe2 {
constexpr float tol = 3.0e-2;
TEST(OPENGLOperatorTest, Conv) {
Workspace ws;
auto channel_in = 16;
auto channel_out = 16;
  auto spatial = 16; // 16x16 input, 3x3 kernel, no padding -> 14x14 output
auto kern = 3;
PopulateCPUBlob(&ws, true, "cpu_X", {1, channel_in, spatial, spatial}, 1337);
PopulateCPUBlob(&ws, true, "W", {channel_out, channel_in, kern, kern}, 1337);
PopulateCPUBlob(&ws, false, "b", {channel_out}, 0);
#define ADD_CONV_ARGS \
{ \
ADD_ARG((*def), "kernel", i, kern); \
ADD_ARG((*def), "stride", i, 1); \
ADD_ARG((*def), "pad", i, 0); \
ADD_ARG((*def), "order", s, "NCHW"); \
}
NetDef cpu_net;
{
OperatorDef* def = AddOp(&cpu_net, "Conv", {"cpu_X", "W", "b"}, {"ref_Y"});
def->set_name("cpu_conv");
ADD_CONV_ARGS;
}
ws.RunNetOnce(cpu_net);
NetDef gpu_net;
gpu_net.set_type("opengl");
{
OperatorDef* def = AddOp(&gpu_net, "Conv", {"cpu_X", "W", "b"}, {"gpu_Y"});
MAKE_OPENGL_OPERATOR(def);
ADD_CONV_ARGS;
}
#undef ADD_CONV_ARGS
compareNetResult4D(ws, cpu_net, gpu_net, "ref_Y", "gpu_Y", tol);
}
TEST(OPENGLOperatorTest, ConvReluConv) {
Workspace ws;
auto channel_in = 16;
auto channel_out = 16;
  auto spatial = 32; // 32x32 input, 3x3 kernels, no padding
auto kern = 3;
PopulateCPUBlob(&ws, true, "cpu_X", {1, channel_in, spatial, spatial}, 1337);
PopulateCPUBlob(&ws, true, "W", {channel_out, channel_in, kern, kern}, 1337);
PopulateCPUBlob(&ws, false, "b", {channel_out}, 0);
PopulateCPUBlob(&ws, true, "W2", {channel_out, channel_in, kern, kern});
PopulateCPUBlob(&ws, true, "b2", {channel_out});
#define ADD_CONV_ARGS \
{ \
ADD_ARG((*def), "kernel", i, kern); \
ADD_ARG((*def), "stride", i, 1); \
ADD_ARG((*def), "pad", i, 0); \
ADD_ARG((*def), "order", s, "NCHW"); \
}
NetDef cpu_net;
{
OperatorDef* def = AddOp(&cpu_net, "Conv", {"cpu_X", "W", "b"}, {"ref_Y"});
def->set_name("cpu_conv");
ADD_CONV_ARGS;
}
{
OperatorDef* def = AddOp(&cpu_net, "Relu", {"ref_Y"}, {"ref_relu"});
}
{
OperatorDef* def = AddOp(&cpu_net, "Conv", {"ref_relu", "W2", "b2"}, {"ref_Y2"});
ADD_CONV_ARGS;
}
ws.RunNetOnce(cpu_net);
NetDef gpu_net;
gpu_net.set_type("opengl");
{
OperatorDef* def = AddOp(&gpu_net, "Conv", {"cpu_X", "W", "b"}, {"gpu_Y"});
MAKE_OPENGL_OPERATOR(def);
ADD_CONV_ARGS;
}
{
OperatorDef* def = AddOp(&gpu_net, "Relu", {"gpu_Y"}, {"gpu_relu"});
MAKE_OPENGL_OPERATOR(def);
}
{
OperatorDef* def = AddOp(&gpu_net, "Conv", {"gpu_relu", "W2", "b2"}, {"gpu_Y2"});
MAKE_OPENGL_OPERATOR(def);
ADD_CONV_ARGS;
}
#undef ADD_CONV_ARGS
compareNetResult4D(ws, cpu_net, gpu_net, "ref_Y2", "gpu_Y2", tol);
}
TEST(OPENGLOperatorTest, ConvBenchmark) {
Workspace ws;
auto channel_in = 4;
auto channel_out = 4;
auto spatial = 10;
auto kern = 3;
long long iters = 2;
PopulateCPUBlob(&ws, false, "cpu_X", {1, channel_in, spatial, spatial}, 1, 0, 0.1);
#define ADD_CONV_ARGS(_def) \
{ \
ADD_ARG((*_def), "kernel", i, kern); \
ADD_ARG((*_def), "stride", i, 1); \
ADD_ARG((*_def), "pad", i, 0); \
ADD_ARG((*_def), "order", s, "NCHW"); \
}
NetDef gpu_net;
NetDef cpu_net;
gpu_net.set_type("opengl");
std::string prev_out = "cpu_X";
for (auto i = 0; i < iters; ++i) {
std::string weightName = "W" + to_string(i);
std::string biasName = "b" + to_string(i);
std::string output = "conv" + to_string(i);
PopulateCPUBlob(&ws, false, weightName, {channel_out, channel_in, kern, kern}, 1);
PopulateCPUBlob(&ws, false, biasName, {channel_out}, 0);
OperatorDef* def = AddOp(&gpu_net, "Conv", {prev_out, weightName, biasName}, {output});
if (i == 0) {
OperatorDef* def2 = AddOp(&cpu_net, "Conv", {prev_out, weightName, biasName}, {"cpu" + output});
ADD_CONV_ARGS(def2);
} else {
OperatorDef* def2 = AddOp(&cpu_net, "Conv", {"cpu" + prev_out, weightName, biasName}, {"cpu" + output});
ADD_CONV_ARGS(def2);
}
prev_out = output;
MAKE_OPENGL_OPERATOR(def);
ADD_CONV_ARGS(def);
}
#undef ADD_CONV_ARGS
compareNetResult4D(ws, cpu_net, gpu_net, "cpu" + prev_out, prev_out, tol);
}
} // namespace caffe2

View File

@ -1,42 +0,0 @@
#include "gl_operator_test.h"
namespace caffe2 {
TEST(OPENGLOperatorTest, CopyFromGL) {
for (auto dims: std::vector<std::vector<int>>{
{1},
{3},
{1, 2},
{2, 3},
{1, 2, 3},
{1, 2, 3, 4},
{4, 3, 2, 1},
{4, 9, 8, 13},
}) {
Workspace ws;
PopulateCPUBlob(&ws, true, std::string("cpu_X"), dims, 1, 0.2, 0.1);
NetDef gpu_net;
gpu_net.set_type("opengl");
{
OperatorDef* def = AddOp(&gpu_net, "CopyFromGL", {"cpu_X"}, {"cpu_X2"});
MAKE_OPENGL_OPERATOR(def);
}
ws.RunNetOnce(gpu_net);
Blob *cpu_out = ws.GetBlob("cpu_X");
Blob *gpu_out = ws.GetBlob("cpu_X2");
EXPECT_NE(nullptr, cpu_out);
EXPECT_NE(nullptr, gpu_out);
auto &t1 = cpu_out->Get<TensorCPU>();
auto &t2 = gpu_out->Get<TensorCPU>();
    double tol = 0.01;
for (auto i = 0; i < t1.size(); ++i) {
EXPECT_NEAR(t1.data<float>()[i], t2.data<float>()[i], tol)
<< "at index " << i;
}
}
}
} // namespace caffe2

View File

@ -1,27 +0,0 @@
#include "gl_operator_test.h"
namespace caffe2 {
TEST(OPENGLOperatorTest, Sum) {
Workspace ws;
int N = 28;
int D = 128;
PopulateCPUBlob(&ws, true, "cpu_X", {N, D}, 1);
PopulateCPUBlob(&ws, true, "cpu_Y", {N, D}, 1);
NetDef cpu_net;
{
AddOp(&cpu_net, "Sum", {"cpu_X", "cpu_Y"}, {"ref_Y"});
}
NetDef gpu_net;
gpu_net.set_type("opengl");
{
OperatorDef* def = AddOp(&gpu_net, "Sum", {"cpu_X", "cpu_Y"}, {"gpu_Y"});
MAKE_OPENGL_OPERATOR(def);
}
compareNetResult(ws, cpu_net, gpu_net);
}
} // namespace caffe2

View File

@ -1,36 +0,0 @@
#include "gl_operator_test.h"
namespace caffe2 {
TEST(OPENGLOperatorTest, FC) {
Workspace ws;
int batchSize = 1;
int CIn = 4;
int H = 8;
int W = 8;
int COut = 16;
PopulateCPUBlob(&ws, true, "cpu_X", {batchSize, CIn, H, W});
PopulateCPUBlob(&ws, true, "cpu_W", {COut, CIn * H * W});
PopulateCPUBlob(&ws, true, "cpu_B", {COut});
constexpr float tol = 0.2;
NetDef cpu_net;
{
AddOp(&cpu_net, "FC", {"cpu_X", "cpu_W", "cpu_B"}, {"ref_Y"});
}
NetDef gpu_net;
gpu_net.set_type("opengl");
{
OperatorDef* def = AddOp(&gpu_net, "FC", {"cpu_X", "cpu_W", "cpu_B"}, {"gpu_Y"});
MAKE_OPENGL_OPERATOR(def);
}
// will work after the next release of ACL
// compareNetResult(ws, cpu_net, gpu_net, "ref_Y", "gpu_Y", tol, true);
}
} // namespace caffe2

View File

@ -1,11 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/test/gl_model_test.h"
namespace caffe2 {
// The last Softmax op fails because of a dimension mismatch. This is unlikely
// to be hit in other models, and the implementation itself should be correct.
// TEST(OPENGLModelTest, SqueezenetV11) {
// std::string parent_path = "/data/local/tmp/";
// benchmarkModel(parent_path + "squeezenet_init.pb", parent_path + "squeezenet_predict.pb", "data", {1, 3, 224, 224}, "squeezenet_v11");
// }
} // namespace caffe2

View File

@ -1,62 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h"
#include "caffe2/mobile/contrib/arm-compute/core/rewrite_net.h"
#include <gtest/gtest.h>
#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"
#include <unordered_set>
C10_DEFINE_int(warmup, 3, "The number of iterations to warm up.");
C10_DEFINE_int(iter, 100, "The number of iterations to run.");
C10_DEFINE_bool(
run_individual,
true,
"Whether to benchmark individual operators.");
constexpr float tol = 0.03;
namespace caffe2 {
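// Runs the given init/predict nets, rewrites the predict net for the OpenGL
// backend with tryConvertToOpenGL, compares the final output against the CPU
// reference, and benchmarks both the OpenGL and CPU variants.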
void benchmarkModel(std::string init_net_pb, std::string predict_net_pb,
                    std::string input_name, std::vector<int> input_dims,
                    std::string net_name = "benchmark_net",
                    std::unordered_set<std::string> cpu_ops =
                        std::unordered_set<std::string>({})) {
unique_ptr<caffe2::Workspace> ws(new caffe2::Workspace());
NetDef init_net_def;
CAFFE_ENFORCE(ReadProtoFromFile(init_net_pb, &init_net_def));
CAFFE_ENFORCE(ws->RunNetOnce(init_net_def));
NetDef predict_net_def, predict_net_def_gpu;
CAFFE_ENFORCE(ReadProtoFromFile(predict_net_pb, &predict_net_def));
PopulateCPUBlob(ws.get(), true, input_name, input_dims);
LOG(ERROR) << "[C2DEBUG] rewriting OpenGL net";
tryConvertToOpenGL(predict_net_def, &predict_net_def_gpu, false, cpu_ops);
// change the name of last op
auto index = predict_net_def_gpu.op().size() - 1;
LOG(ERROR) << "[C2DEBUG] index:" << index;
auto last_blob = predict_net_def_gpu.op()[index].output()[0];
auto op = predict_net_def_gpu.mutable_op(index);
auto output = op->mutable_output(0);
*output = last_blob + "_gpu";
LOG(ERROR) << "[C2DEBUG] last blob: " << last_blob;
for (auto i = 0; i < predict_net_def_gpu.external_output_size(); ++i) {
auto out = predict_net_def_gpu.mutable_external_output(i);
if (*out == last_blob) {
*out = last_blob + "_gpu";
}
}
compareNetResult4D(*ws, predict_net_def, predict_net_def_gpu, last_blob, last_blob + "_gpu");
LOG(ERROR) << "[C2DEBUG] after compareNetResult4D";
NetBase* net = ws->CreateNet(predict_net_def_gpu);
LOG(ERROR) << "[C2DEBUG] Benchmarking OpenGL Net";
net->TEST_Benchmark(FLAGS_warmup, FLAGS_iter, FLAGS_run_individual);
// Test CPU
for (auto i = 0; i < predict_net_def.op().size(); ++i) {
auto op = predict_net_def.mutable_op(i);
if (std::find(cpu_ops.begin(), cpu_ops.end(), op->type()) == cpu_ops.end()) {
op->mutable_device_option()->set_device_type(PROTO_CPU);
}
}
predict_net_def.set_type("simple");
predict_net_def.set_name("cpu_net");
net = ws->CreateNet(predict_net_def);
LOG(INFO) << "[C2DEBUG] Benchmarking CPU Net";
net->TEST_Benchmark(FLAGS_warmup, FLAGS_iter, FLAGS_run_individual);
}
} // namespace caffe2

View File

@ -1,33 +0,0 @@
#include "gl_operator_test.h"
namespace caffe2 {
constexpr float tol = 5.0e-2;
TEST(OPENGLOperatorTest, NormPlanarYUV) {
Workspace ws;
int batchSize = 1;
int channels = 8;
PopulateCPUBlob(&ws, true, "cpu_X", {batchSize, channels, 8, 13});
PopulateCPUBlob(&ws, true, "cpu_mean", {1, channels});
PopulateCPUBlob(&ws, true, "cpu_stddev", {1, channels}, 1, 0.5);
NetDef cpu_net;
{
OperatorDef* def = AddOp(&cpu_net, "NormalizePlanarYUV", {"cpu_X", "cpu_mean", "cpu_stddev"}, {"ref_Y"});
}
NetDef gpu_net;
gpu_net.set_type("opengl");
{
OperatorDef* def = AddOp(&gpu_net, "NormalizePlanarYUV", {"cpu_X", "cpu_mean", "cpu_stddev"}, {"gpu_Y"});
MAKE_OPENGL_OPERATOR(def);
}
compareNetResult4D(ws, cpu_net, gpu_net);
}
} // namespace caffe2

View File

@ -1,121 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include <gtest/gtest.h>
#include "caffe2/core/graph.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"
namespace caffe2 {
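// Helpers for building test nets: DECLARE_OPENGL_OPERATOR/MAKE_OPENGL_OPERATOR
// mark an OperatorDef to run on the OpenGL device, and ADD_ARG attaches a
// typed argument to it.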
#define DECLARE_OPENGL_OPERATOR(_name) \
OperatorDef _name; \
_name.mutable_device_option()->set_device_type(PROTO_OPENGL);
#define MAKE_OPENGL_OPERATOR(_op) \
_op->mutable_device_option()->set_device_type(PROTO_OPENGL);
#define ADD_ARG(_op, _name, _type, _val) \
{ \
Argument *arg = _op.add_arg(); \
arg->set_name(_name); \
arg->set_##_type(_val); \
}
// Passing val == 1337 fills each element with its linear index, producing a
// deterministic blob that is unique at every position (useful for debugging).
template<typename T = float>
void PopulateCPUBlob(Workspace *ws, bool random, std::string name,
std::vector<int> dims, int val = 1, int dist_shift = 0, float variance = 1) {
Blob *blob = ws->CreateBlob(name);
auto* tensor = BlobGetMutableTensor(blob, CPU);
tensor->Resize(dims);
T *t_data = tensor->mutable_data<T>();
std::random_device rd;
std::mt19937 e2(rd());
std::normal_distribution<> dist(0 + dist_shift, variance + dist_shift);
for (int i = 0; i < tensor->size(); ++i) {
t_data[i] = T(random ? dist(e2) : (val == 1337 ? i : val));
}
}
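// Example (as used by the operator tests): fill a 1x4x4x4 NCHW blob with
// random values drawn from a normal distribution:
//   PopulateCPUBlob(&ws, true, "cpu_X", {1, 4, 4, 4});

// Runs both nets once and compares the named GPU output blob element-wise
// against the CPU reference blob within an absolute (or, optionally,
// relative) tolerance.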
template<typename T = DataType>
void compareNetResult(Workspace& ws,
NetDef& cpu_net, NetDef& gpu_net,
string cpu_blob="ref_Y",
string gpu_blob="gpu_Y",
double tol=0.01,
bool relative=false) {
ws.RunNetOnce(cpu_net);
ws.RunNetOnce(gpu_net);
Blob *cpu_out = ws.GetBlob(cpu_blob);
Blob *gpu_out = ws.GetBlob(gpu_blob);
EXPECT_NE(nullptr, cpu_out);
EXPECT_NE(nullptr, gpu_out);
TensorCPU g;
auto& g_ = gpu_out->Get<GLTensor<T>>();
getTensorCPU(g_, g);
auto &t = cpu_out->Get<TensorCPU>();
EXPECT_EQ(g.size(), t.size());
for (auto i = 0; i < g.size(); ++i) {
if (relative) {
EXPECT_NEAR(g.data<float>()[i], t.data<float>()[i], tol + tol * std::abs(t.data<float>()[i])) << "at index " << i;
} else{
EXPECT_NEAR(g.data<float>()[i], t.data<float>()[i], tol)
<< "at index " << i;
}
}
}
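// Like compareNetResult, but tolerant of occasional outliers: elements whose
// relative error exceeds tol are counted, and the check only fails when more
// than 3% of the elements differ. The GPU blob may hold either a TensorCPU or
// a GLTensor.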
template<typename T = DataType>
void compareNetResult4D(Workspace& ws,
NetDef& cpu_net, NetDef& gpu_net,
string cpu_blob="ref_Y",
string gpu_blob="gpu_Y",
double tol=0.05) {
LOG(INFO) << "[C2DEBUG] running gpu net";
bool gpu_success = ws.RunNetOnce(gpu_net);
LOG(INFO) << "[C2DEBUG] after gpu net";
bool cpu_success = ws.RunNetOnce(cpu_net);
LOG(INFO) << "[C2DEBUG] after cpu net";
if (!gpu_success || !cpu_success) {
LOG(ERROR) << "[C2DEBUG] cpu or gpu net failed.";
return;
}
Blob *cpu_out = ws.GetBlob(cpu_blob);
Blob *gpu_out = ws.GetBlob(gpu_blob);
EXPECT_NE(nullptr, cpu_out);
EXPECT_NE(nullptr, gpu_out);
auto &t = cpu_out->Get<TensorCPU>();
int diff_num = 0;
if (gpu_out->IsType<TensorCPU>()) {
auto& g = gpu_out->Get<TensorCPU>();
for (auto i = 0; i < t.size(); ++i) {
auto t_elem = t.data<float>()[i];
auto g_elem = g.data<float>()[i];
if (!isnan(t_elem) && (std::abs(t_elem - g_elem) > tol + tol * std::abs(t_elem))) {
diff_num++;
}
}
} else if (gpu_out->IsType<GLTensor<T>>()) {
TensorCPU g;
getTensorCPU(gpu_out->Get<GLTensor<T>>(), g);
for (auto i = 0; i < t.size(); ++i) {
auto t_elem = t.data<float>()[i];
auto g_elem = g.data<float>()[i];
if (!isnan(t_elem) && (std::abs(t_elem - g_elem) > tol + tol * std::abs(t_elem))) {
diff_num++;
}
}
}
CHECK(diff_num <= 0.03 * t.size());
}
} // namespace caffe2

View File

@ -1,89 +0,0 @@
#include "gl_operator_test.h"
namespace caffe2 {
TEST(OPENGLOperatorTest, AveragePool) {
Workspace ws;
PopulateCPUBlob(&ws, true, "cpu_X", {1, 1, 8, 8});
NetDef cpu_net;
{
OperatorDef* def = AddOp(&cpu_net, "AveragePool", {"cpu_X"}, {"ref_Y"});
ADD_ARG((*def), "kernel", i, 2);
ADD_ARG((*def), "pad", i, 0);
ADD_ARG((*def), "stride", i, 2);
ADD_ARG((*def), "order", s, "NCHW");
}
NetDef gpu_net;
gpu_net.set_type("opengl");
{
OperatorDef* def = AddOp(&gpu_net, "AveragePool", {"cpu_X"}, {"gpu_Y"});
ADD_ARG((*def), "kernel", i, 2);
ADD_ARG((*def), "pad", i, 0);
ADD_ARG((*def), "stride", i, 2);
ADD_ARG((*def), "order", s, "NCHW");
MAKE_OPENGL_OPERATOR(def);
}
compareNetResult(ws, cpu_net, gpu_net);
}
TEST(OPENGLOperatorTest, MaxPool) {
Workspace ws;
PopulateCPUBlob(&ws, true, "cpu_X", {1, 1, 8, 8});
NetDef cpu_net;
{
OperatorDef* def = AddOp(&cpu_net, "MaxPool", {"cpu_X"}, {"ref_Y"});
ADD_ARG((*def), "kernel", i, 2);
ADD_ARG((*def), "pad", i, 0);
ADD_ARG((*def), "stride", i, 2);
ADD_ARG((*def), "order", s, "NCHW");
}
NetDef gpu_net;
gpu_net.set_type("opengl");
{
OperatorDef* def = AddOp(&gpu_net, "MaxPool", {"cpu_X"}, {"gpu_Y"});
ADD_ARG((*def), "kernel", i, 2);
ADD_ARG((*def), "pad", i, 0);
ADD_ARG((*def), "stride", i, 2);
ADD_ARG((*def), "order", s, "NCHW");
MAKE_OPENGL_OPERATOR(def);
}
compareNetResult(ws, cpu_net, gpu_net);
}
TEST(OPENGLOperatorTest, AverageGlobalPool) {
Workspace ws;
PopulateCPUBlob(&ws, true, "cpu_X", {1, 1, 8, 8});
NetDef cpu_net;
{
OperatorDef* def = AddOp(&cpu_net, "AveragePool", {"cpu_X"}, {"ref_Y"});
ADD_ARG((*def), "global_pooling", i, 1);
ADD_ARG((*def), "pad", i, 0);
ADD_ARG((*def), "stride", i, 1);
ADD_ARG((*def), "order", s, "NCHW");
}
NetDef gpu_net;
gpu_net.set_type("opengl");
{
OperatorDef* def = AddOp(&gpu_net, "AveragePool", {"cpu_X"}, {"gpu_Y"});
ADD_ARG((*def), "global_pooling", i, 1);
ADD_ARG((*def), "pad", i, 0);
ADD_ARG((*def), "stride", i, 1);
ADD_ARG((*def), "order", s, "NCHW");
MAKE_OPENGL_OPERATOR(def);
}
compareNetResult(ws, cpu_net, gpu_net);
}
} // namespace caffe2

View File

@ -1,35 +0,0 @@
#include "gl_operator_test.h"
namespace caffe2 {
TEST(OPENGLOperatorTest, ResizeNearest) {
Workspace ws;
float height_scale = 2;
float width_scale = 2;
int N = 1;
int CIn = 7;
PopulateCPUBlob(&ws, true, "cpu_X", {N, CIn, 37, 89});
NetDef cpu_net;
{
OperatorDef* def = AddOp(&cpu_net, "ResizeNearest", {"cpu_X"}, {"ref_Y"});
ADD_ARG((*def), "height_scale", f, height_scale);
ADD_ARG((*def), "width_scale", f, width_scale);
}
NetDef gpu_net;
gpu_net.set_type("opengl");
{
OperatorDef* def = AddOp(&gpu_net, "ResizeNearest", {"cpu_X"}, {"gpu_Y"});
MAKE_OPENGL_OPERATOR(def);
ADD_ARG((*def), "height_scale", f, height_scale);
ADD_ARG((*def), "width_scale", f, width_scale);
}
compareNetResult4D(ws, cpu_net, gpu_net);
}
} // namespace caffe2

View File

@ -1,28 +0,0 @@
#include "gl_operator_test.h"
namespace caffe2 {
TEST(OPENGLOperatorTest, Softmax) {
Workspace ws;
int N = 1;
int D = 128;
PopulateCPUBlob(&ws, true, "cpu_X", {N, D}, 1);
NetDef cpu_net;
{
AddOp(&cpu_net, "Softmax", {"cpu_X"}, {"ref_Y"});
}
NetDef gpu_net;
gpu_net.set_type("opengl");
{
OperatorDef* def = AddOp(&gpu_net, "Softmax", {"cpu_X"}, {"gpu_Y"});
MAKE_OPENGL_OPERATOR(def);
}
compareNetResult(ws, cpu_net, gpu_net);
}
} // namespace caffe2

View File

@ -1,35 +0,0 @@
#include "gl_operator_test.h"
namespace caffe2 {
TEST(OPENGLOperatorTest, SpatialBN) {
Workspace ws;
  int batchSize = 3;
  int channels = 8;
  PopulateCPUBlob(&ws, true, "cpu_X", {batchSize, channels, 8, 13});
PopulateCPUBlob(&ws, true, "cpu_scale", {channels});
PopulateCPUBlob(&ws, true, "cpu_bias", {channels});
PopulateCPUBlob(&ws, true, "cpu_mean", {channels});
PopulateCPUBlob(&ws, true, "cpu_var", {channels}, 1, 0.5);
NetDef cpu_net;
{
OperatorDef* def = AddOp(&cpu_net, "SpatialBN", {"cpu_X", "cpu_scale", "cpu_bias", "cpu_mean", "cpu_var"}, {"ref_Y"});
ADD_ARG((*def), OpSchema::Arg_IsTest, i, 1);
}
NetDef gpu_net;
gpu_net.set_type("opengl");
{
OperatorDef* def = AddOp(&gpu_net, "SpatialBN", {"cpu_X", "cpu_scale", "cpu_bias", "cpu_mean", "cpu_var"}, {"gpu_Y"});
MAKE_OPENGL_OPERATOR(def);
ADD_ARG((*def), OpSchema::Arg_IsTest, i, 1);
}
compareNetResult4D(ws, cpu_net, gpu_net, "ref_Y", "gpu_Y", 0.01);
}
} // namespace caffe2

View File

@ -919,63 +919,6 @@ if(USE_PROF)
endif()
endif()
# ---[ ARM Compute Library: check compatibility.
if (USE_ACL)
if (NOT ANDROID)
message(WARNING "ARM Compute Library is only supported for Android builds.")
caffe2_update_option(USE_ACL OFF)
else()
list(APPEND Caffe2_DEPENDENCY_LIBS EGL GLESv2)
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^armv")
# 32-bit ARM (armv7, armv7-a, armv7l, etc)
set(ACL_ARCH "armv7a")
elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64|aarch64)$")
# 64-bit ARM
set(ACL_ARCH "arm64-v8a")
else()
message(WARNING "ARM Compute Library is only supported for ARM/ARM64 builds.")
caffe2_update_option(USE_ACL OFF)
endif()
endif()
endif()
# ---[ ARM Compute Library: build the target.
if (USE_ACL)
list(APPEND ARM_COMPUTE_INCLUDE_DIRS "third_party/ComputeLibrary/")
list(APPEND ARM_COMPUTE_INCLUDE_DIRS "third_party/ComputeLibrary/include")
include_directories(SYSTEM ${ARM_COMPUTE_INCLUDE_DIRS})
string (REPLACE ";" " -I" ANDROID_STL_INCLUDE_FLAGS "-I${ANDROID_STL_INCLUDE_DIRS}")
set (ARM_COMPUTE_SRC_DIR "${CMAKE_CURRENT_LIST_DIR}/../third_party/ComputeLibrary/")
set (ARM_COMPUTE_LIB "${CMAKE_CURRENT_BINARY_DIR}/libarm_compute.a")
set (ARM_COMPUTE_CORE_LIB "${CMAKE_CURRENT_BINARY_DIR}/libarm_compute_core.a")
set (ARM_COMPUTE_LIBS ${ARM_COMPUTE_LIB} ${ARM_COMPUTE_CORE_LIB})
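  # Build the ARM Compute Library with scons using the same toolchain CMake was
  # configured with; only the GLES compute backend is enabled, and the resulting
  # static archives are copied into the build directory before the scons build
  # tree is removed.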
add_custom_command(
OUTPUT ${ARM_COMPUTE_LIBS}
COMMAND
/bin/sh -c "export PATH=\"$PATH:$(dirname ${CMAKE_CXX_COMPILER})\" && \
scons -C \"${ARM_COMPUTE_SRC_DIR}\" -Q \
examples=no validation_tests=no benchmark_tests=no standalone=yes \
embed_kernels=yes opencl=no gles_compute=yes \
os=android arch=${ACL_ARCH} \
extra_cxx_flags=\"${ANDROID_CXX_FLAGS} ${ANDROID_STL_INCLUDE_FLAGS}\"" &&
/bin/sh -c "cp ${ARM_COMPUTE_SRC_DIR}/build/libarm_compute-static.a ${CMAKE_CURRENT_BINARY_DIR}/libarm_compute.a" &&
/bin/sh -c "cp ${ARM_COMPUTE_SRC_DIR}/build/libarm_compute_core-static.a ${CMAKE_CURRENT_BINARY_DIR}/libarm_compute_core.a" &&
/bin/sh -c "rm -r ${ARM_COMPUTE_SRC_DIR}/build"
COMMENT "Building ARM compute library" VERBATIM)
add_custom_target(arm_compute_build ALL DEPENDS ${ARM_COMPUTE_LIBS})
add_library(arm_compute_core STATIC IMPORTED)
add_dependencies(arm_compute_core arm_compute_build)
set_property(TARGET arm_compute_core PROPERTY IMPORTED_LOCATION ${ARM_COMPUTE_CORE_LIB})
add_library(arm_compute STATIC IMPORTED)
add_dependencies(arm_compute arm_compute_build)
set_property(TARGET arm_compute PROPERTY IMPORTED_LOCATION ${ARM_COMPUTE_LIB})
list(APPEND Caffe2_DEPENDENCY_LIBS arm_compute arm_compute_core)
endif()
if (USE_SNPE AND ANDROID)
if (SNPE_LOCATION AND SNPE_HEADERS)
message(STATUS "Using SNPE location specified by -DSNPE_LOCATION: " ${SNPE_LOCATION})

View File

@ -333,23 +333,6 @@ if (IOS)
add_definitions("-Wno-deprecated-declarations")
endif()
# ---[ If we are building with ACL, we will enable neon-fp16.
if(USE_ACL)
if (CMAKE_SYSTEM_PROCESSOR MATCHES "^armv")
# 32-bit ARM (armv7, armv7-a, armv7l, etc)
set(ACL_ARCH "armv7a")
# Compilers for 32-bit ARM need extra flags to enable NEON-FP16
add_definitions("-mfpu=neon-fp16")
include(CheckCCompilerFlag)
CHECK_C_COMPILER_FLAG(
-mfp16-format=ieee CAFFE2_COMPILER_SUPPORTS_FP16_FORMAT)
if (CAFFE2_COMPILER_SUPPORTS_FP16_FORMAT)
add_definitions("-mfp16-format=ieee")
endif()
endif()
endif()
# ---[ If we use asan, turn on the flags.
# TODO: This only works with new style gcc and clang (not the old -faddress-sanitizer).
# Change if necessary on old platforms.