mirror of https://github.com/pytorch/pytorch.git
synced 2025-10-20 21:14:14 +08:00

Remove ComputeLibrary submodule

Summary: Pull Request resolved: https://github.com/pytorch/pytorch/pull/18052
Reviewed By: ezyang
Differential Revision: D14477355
fbshipit-source-id: c56b802f6d69701596c327cf9af6782f30e335fa
committed by Facebook Github Bot
parent c7448aa13c
commit 0fe6e8c870
.gitmodules (vendored): 3 lines changed
@@ -52,9 +52,6 @@
 [submodule "third_party/python-six"]
 	path = third_party/python-six
 	url = https://github.com/benjaminp/six.git
-[submodule "third_party/ComputeLibrary"]
-	path = third_party/ComputeLibrary
-	url = https://github.com/ARM-software/ComputeLibrary.git
 [submodule "third_party/onnx"]
 	path = third_party/onnx
 	url = https://github.com/onnx/onnx.git
@@ -101,7 +101,6 @@ option(BUILD_TEST "Build C++ test binaries (need gtest and gbenchmark)" OFF)
 cmake_dependent_option(
     INSTALL_TEST "Install test binaries if BUILD_TEST is on" ON
     "BUILD_TEST" OFF)
-option(USE_ACL "Use ARM Compute Library" OFF)
 option(USE_ASAN "Use Address Sanitizer" OFF)
 option(USE_CUDA "Use CUDA" ON)
 option(USE_ROCM "Use ROCm" ON)
@@ -1,8 +1,4 @@
 add_subdirectory(ios)
-if (USE_ACL)
-  # add_subdirectory(arm-compute)
-endif()
-
 # Finally pass the src lists back to the parent

 if (USE_NNAPI)
   add_subdirectory(nnapi)
|
@ -1,6 +0,0 @@
|
||||
add_subdirectory(core)
|
||||
add_subdirectory(operators)
|
||||
add_subdirectory(test)
|
||||
|
||||
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} PARENT_SCOPE)
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
|
@@ -1,64 +0,0 @@
# Caffe2 - ARM Compute Backend

## Build

To build, clone the repository and install scons:

```
brew install scons
```

Set ANDROID_NDK to /opt/android_ndk/xxx (e.g. /opt/android_ndk/android-ndk-r15c/).

Set up the toolchain. Let's say $PATH_TO_TOOLCHAIN is the directory where you want to store the toolchain files.

arm
```
rm -rf $PATH_TO_TOOLCHAIN
$ANDROID_NDK/build/tools/make_standalone_toolchain.py --arch arm --api 21 --install-dir $PATH_TO_TOOLCHAIN
```

arm64
```
rm -rf $PATH_TO_TOOLCHAIN
$ANDROID_NDK/build/tools/make_standalone_toolchain.py --arch arm64 --api 21 --install-dir $PATH_TO_TOOLCHAIN
```

Add the toolchain path to .bashrc/.zshrc etc., e.g.
```
export PATH=$PATH:$PATH_TO_TOOLCHAIN/bin
```

Then use build_android.sh:

For 32-bit ARM:
```
./scripts/build_android.sh -DUSE_ACL=ON -DBUILD_TEST=ON
```

For 64-bit ARM:
```
./scripts/build_android.sh -DUSE_ACL=ON -DBUILD_TEST=ON -DANDROID_ABI=arm64-v8a -DANDROID_TOOLCHAIN=clang
```

Before switching between 32-bit and 64-bit builds, make sure to delete the build_android folder:
```
rm -rf build_android
```

## Test

Plug in an Android device and run a test:

```
cd build_android
adb push bin/gl_conv_op_test /data/local/tmp && adb shell '/data/local/tmp/gl_conv_op_test'
```

Or use a script to run them all. In the caffe2 top-level directory:
```
./caffe2/mobile/contrib/arm-compute/run_tests.sh build_android
```

Note that some tests (fully_connected and alignment) have been disabled until the next release of ACL.
@@ -1,2 +0,0 @@
file(GLOB_RECURSE tmp *.cc)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)
@@ -1,39 +0,0 @@
#include "context.h"

#include "caffe2/core/allocator.h"
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/tensor.h"

namespace caffe2 {

CAFFE_KNOWN_TYPE(GLTensor<GLfloat>);
CAFFE_KNOWN_TYPE(GLTensor<GLhalf>);
CAFFE_KNOWN_TYPE(GLTensor<half>);
CAFFE_KNOWN_TYPE(Tensor<GLContext>);

bool GLContext::initialized = false;

GLContext::GLContext() {
  CAFFE_ENFORCE(arm_compute::opengles31_is_available());
  if (!initialized) {
    arm_compute::GCScheduler::get().default_init();
    initialized = true;
  }
}

void EventCreateOPENGL(const DeviceOption & /* unused */,
                       Event * /* unused */) {}
void EventRecordOPENGL(Event * /* unused */, const void * /* unused */,
                       const char * /* unused */) {}
void EventWaitOPENGLOPENGL(const Event * /* unused */, void * /* unused */) {}
void EventFinishOPENGL(const Event * /* unused */) {}
void EventResetOPENGL(Event * /* unused */) {}

REGISTER_EVENT_CREATE_FUNCTION(OPENGL, EventCreateOPENGL);
REGISTER_EVENT_RECORD_FUNCTION(OPENGL, EventRecordOPENGL);
REGISTER_EVENT_WAIT_FUNCTION(OPENGL, OPENGL, EventWaitOPENGLOPENGL);
REGISTER_EVENT_FINISH_FUNCTION(OPENGL, EventFinishOPENGL);
REGISTER_EVENT_RESET_FUNCTION(OPENGL, EventResetOPENGL);

} // namespace caffe2
@@ -1,391 +0,0 @@
#ifndef CAFFE2_OPENGL_CONTEXT_H_
#define CAFFE2_OPENGL_CONTEXT_H_

#ifdef CAFFE2_OPENGL_BACKEND
#error Can only build one OpenGL backend at a time.
#else
#define CAFFE2_OPENGL_BACKEND
#endif

#include "caffe2/core/allocator.h"
#include "caffe2/core/blob.h"
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/tensor.h"
#include "caffe2/core/timer.h"

#include "arm_compute/core/GLES_COMPUTE/OpenGLES.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCFunctions.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCScheduler.h"
#include "arm_compute/runtime/GLES_COMPUTE/GCTensor.h"

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/Allocator.h"
#include "arm_compute/runtime/BlobLifetimeManager.h"
#include "arm_compute/runtime/MemoryManagerOnDemand.h"
#include "arm_compute/runtime/PoolManager.h"
#include "utils/Utils.h"
#include "include/half/half.hpp"

namespace caffe2 {

typedef half_float::half half;
// #define ACL_USE_FLOAT32
#ifdef ACL_USE_FLOAT32
typedef float DataType;
#else
typedef half DataType;
#endif

template <typename T> class GLTensor;

class GLContext final {
 public:
  static bool initialized;
  explicit GLContext();
  explicit GLContext(const DeviceOption &option) {
    DCHECK_EQ(option.device_type(), PROTO_OPENGL);
    GLContext();
  }
  ~GLContext() {}

  static void sync() { arm_compute::GCScheduler::get().memory_barrier(); }

  template <typename T>
  using deleted_unique_ptr = std::unique_ptr<T, std::function<void(T *)>>;

  template <typename T>
  static deleted_unique_ptr<const GLTensor<T>> getGLTensor(
      const Blob *b, const GLTensor<T> *X_old = nullptr) {
    if (b->IsType<TensorCPU>()) {
      auto &Xcpu = b->Get<TensorCPU>();
      GLTensor<T> *X_raw_ptr;
      if (X_old) {
        X_raw_ptr = const_cast<GLTensor<T> *>(X_old);
        X_raw_ptr->ResizeLike(Xcpu);
        deleted_unique_ptr<const GLTensor<T>> X_unique_ptr(X_raw_ptr, EmptyDeleter<T>);
        return X_unique_ptr;
      } else {
        X_raw_ptr = new GLTensor<T>();
        X_raw_ptr->ResizeLike(Xcpu);
        deleted_unique_ptr<const GLTensor<T>> X_unique_ptr(X_raw_ptr, GLTensorDeleter<T>);
        return X_unique_ptr;
      }
    }
    const GLTensor<T> *X_raw_ptr;
    X_raw_ptr = &b->Get<GLTensor<T>>();
    deleted_unique_ptr<const GLTensor<T>> X_unique_ptr(X_raw_ptr, EmptyDeleter<T>);
    return X_unique_ptr;
  }

  /*
   * Everything below is basically boilerplate for Context classes
   */
  static std::pair<void *, MemoryDeleter> New(size_t nbytes) {
    return std::pair<void *, MemoryDeleter>(malloc(nbytes), GLContext::Delete);
  }

  static void Delete(void *data) {
    if (data != nullptr) {
      free(data);
    }
  }

  template <class SrcContext, class DstContext>
  inline void CopyBytes(size_t nbytes, const void *src, void *dst) {}

  template <typename T, class SrcContext, class DstContext>
  inline void Copy(int n, const T *src, T *dst) {
    CopyBytes<SrcContext, DstContext>(n * sizeof(T),
                                      static_cast<const void *>(src),
                                      static_cast<void *>(dst));
  }

  template <class SrcContext, class DstContext>
  inline void CopyItems(const TypeMeta &meta, size_t n, const void *src,
                        void *dst) {
    CAFFE_ENFORCE(!meta.copy(), "GLContext requires fundamental types.");
    CopyBytes<SrcContext, DstContext>(n * meta.itemsize(), src, dst);
  }

  void SwitchToDevice(int a, ...) { /* TODO */ }
  void SwitchToDevice() { SwitchToDevice(0); }

  inline void WaitEvent(const Event &ev) { /* TODO */ }
  void FinishDeviceComputation() { /* TODO */ }
  inline void Record(Event *ev, const char *&) const { /* TODO */ }
  static bool IsStreamFree(const DeviceOption & /* unused */, int /* unused */) {
    return true;
  }
  bool HasAsyncPartDefault() const { return false; }
  bool SupportsAsyncScheduling() const { return false; }

 private:
  template <typename T>
  static void GLTensorDeleter(const GLTensor<T> *X) {
    delete X;
  }

  template <typename T>
  static void EmptyDeleter(const GLTensor<T> *X) {
    return;
  }
};

template <typename T> class GLTensor {
 public:
  GLTensor() { tensor_ = make_unique<arm_compute::GCTensor>(); }
  ~GLTensor() { tensor_->allocator()->free(); }

  template <typename TensorType>
  bool ResizeLike(TensorType &X, bool free = false) {
    bool need_allocation = SetDims(X.dims());
    for (int i = 0; i < dims_.size(); i++) {
      shape_.set(dims_.size() - i - 1, dims_[i]);
    }

    if (need_allocation) {
      if (free) {
        tensor_->allocator()->free();
      }
#ifdef ACL_USE_FLOAT32
      tensor_->allocator()->init(
          arm_compute::TensorInfo(shape_, 1, arm_compute::DataType::F32));
#else
      tensor_->allocator()->init(
          arm_compute::TensorInfo(shape_, 1, arm_compute::DataType::F16));
#endif
    } else {
      tensor_->info()->set_tensor_shape(shape_);
    }
    return need_allocation;
  }

  template <typename... Ts> bool Resize(Ts... dim_source) {
    bool need_allocation = SetDims(dim_source...);
    for (int i = 0; i < dims_.size(); i++) {
      shape_.set(dims_.size() - i - 1, dims_[i]);
    }
    if (need_allocation) {
      // TODO: Make it type generic
      tensor_->allocator()->free();
#ifdef ACL_USE_FLOAT32
      tensor_->allocator()->init(
          arm_compute::TensorInfo(shape_, 1, arm_compute::DataType::F32));
#else
      tensor_->allocator()->init(
          arm_compute::TensorInfo(shape_, 1, arm_compute::DataType::F16));
#endif
    } else {
      tensor_->info()->set_tensor_shape(shape_);
    }
    return need_allocation;
  }

  // Allocates and copies data if needed
  void lazy_allocate(const Blob *b, bool allocate_tensor,
                     bool try_to_copy_from_cpu) const {
    if (try_to_copy_from_cpu) {
      // we skip GLTensors, nothing to copy
      if (!b->IsType<GLTensor>()) {
        // typically only called on the second run
        if (allocate_tensor) {
          allocate();
        }
        Timer timer;
        fillGLTensor(b);
        auto millis = timer.MilliSeconds();
        VLOG(2) << "[C2DEBUG] fillGLTensor timer: " << millis;
      }
    }
  }

  void allocate() const {
    tensor_->allocator()->allocate();
  }

  void fillGLTensor(const Blob *b) const {
    if (b->IsType<TensorCPU>()) {
      auto &Xcpu = b->Get<TensorCPU>();
      VLOG(2) << "[C2DEBUG] fillGLTensor dims: " << Xcpu.dims();
      T *buffer = map();
      char *byte_buffer = (char *)buffer;
      auto info = tensor_->info();
      arm_compute::Window it_window;
      // Iterate through the rows (not each element)
      it_window.use_tensor_dimensions(info->tensor_shape(),
                                      /* first_dimension = */ arm_compute::Window::DimY);
      arm_compute::Iterator it(get_underlying(), it_window);
      if (Xcpu.ndim() == 4) {
        auto C = Xcpu.dim32(1);
        auto H = Xcpu.dim32(2);
        auto W = Xcpu.dim32(3);
        arm_compute::execute_window_loop(
            it_window,
            [&](const arm_compute::Coordinates &id) {
              std::copy_n(Xcpu.data<float>() + id[3] * (C * W * H) +
                              id.z() * (W * H) + id.y() * W,
                          W, reinterpret_cast<T *>(it.ptr()));
            },
            it);
      } else if (Xcpu.ndim() == 3) {
        auto H = Xcpu.dim32(1);
        auto W = Xcpu.dim32(2);
        arm_compute::execute_window_loop(
            it_window,
            [&](const arm_compute::Coordinates &id) {
              std::copy_n(Xcpu.data<float>() + (id.z() * (W * H) + id.y() * W),
                          W, reinterpret_cast<T *>(it.ptr()));
            },
            it);
      } else if (Xcpu.ndim() == 2) {
        auto W = Xcpu.dim32(1);
        arm_compute::execute_window_loop(
            it_window,
            [&](const arm_compute::Coordinates &id) {
              std::copy_n(Xcpu.data<float>() + id.y() * W, W,
                          reinterpret_cast<T *>(it.ptr()));
            },
            it);
      } else {
        arm_compute::Window w;
        w.use_tensor_dimensions(info->tensor_shape());
        arm_compute::Iterator i(get_underlying(), w);
        auto size = Xcpu.dim32(0);
        std::copy_n(Xcpu.data<float>(), size, reinterpret_cast<T *>(i.ptr()));
      }
      unmap();
    }
  }

  const int32_t ndim() const { return dims_.size(); }

  vector<int64_t> dims() const { return dims_; }

  const int32_t dim32(const int index) const { return dims_.at(index); }

  const int32_t size() const {
    int32_t s = 1;
    for (int i = 0; i < dims_.size(); i++) {
      s *= dims_[i];
    }
    return s;
  }

  arm_compute::GCTensor *get_underlying() const { return tensor_.get(); }

  T *map() const {
    GLContext::sync();
    tensor_->map(true);
    return reinterpret_cast<T *>(tensor_->buffer());
  }

  void unmap() const { return tensor_->unmap(); }

  void sync() const {
    GLContext::sync();
    tensor_->map();
    tensor_->unmap();
  }

 private:
  template <typename TI, typename = typename std::enable_if<
                             std::is_integral<TI>::value>::type>
  bool SetDims(const vector<TI> &src) {
    auto old_size = size_;
    dims_.resize(src.size());
    int64_t new_size = 1;
    for (unsigned int i = 0; i < src.size(); ++i) {
      new_size *= src[i];
      dims_[i] = src[i];
    }
    size_ = new_size;
    return size_ > old_size;
  }

  bool SetDims() {
    auto old_size = size_;
    dims_.resize(0);
    size_ = 1;
    return size_ > old_size;
  }

  bool SetDims(const int64_t d0) {
    auto old_size = size_;
    dims_.resize(1);
    dims_[0] = d0;
    size_ = d0;
    return size_ > old_size;
  }

  bool SetDims(const int64_t d0, const int64_t d1) {
    auto old_size = size_;
    dims_.resize(2);
    dims_[0] = d0;
    dims_[1] = d1;
    size_ = d0 * d1;
    return size_ > old_size;
  }

  bool SetDims(const int64_t d0, const int64_t d1, const int64_t d2) {
    auto old_size = size_;
    dims_.resize(3);
    dims_[0] = d0;
    dims_[1] = d1;
    dims_[2] = d2;
    size_ = d0 * d1 * d2;
    return size_ > old_size;
  }

  bool SetDims(const int64_t d0, const int64_t d1, const int64_t d2,
               const int64_t d3) {
    auto old_size = size_;
    dims_.resize(4);
    dims_[0] = d0;
    dims_[1] = d1;
    dims_[2] = d2;
    dims_[3] = d3;
    size_ = d0 * d1 * d2 * d3;
    return size_ > old_size;
  }

  vector<int64_t> dims_;
  int64_t size_ = -1;
  arm_compute::TensorShape shape_;
  unique_ptr<arm_compute::GCTensor> tensor_;
};

template <typename T = DataType>
void getTensorCPU(const GLTensor<T> &g_, TensorCPU &g) {
  VLOG(2) << " [C2DEBUG] getTensorCPU " << g_.dims();
  g.Resize(g_.dims());
  g_.map();
  auto tensor = g_.get_underlying();
  auto info = tensor->info();
  arm_compute::Window it_window;
  // Iterate through the rows (not each element)
  it_window.use_tensor_dimensions(info->tensor_shape(),
                                  /* first_dimension = */ arm_compute::Window::DimY);
  arm_compute::Iterator it(tensor, it_window);
  if (g_.ndim() == 4) {
    auto C = g_.dim32(1);
    auto H = g_.dim32(2);
    auto W = g_.dim32(3);
    arm_compute::execute_window_loop(
        it_window,
        [&](const arm_compute::Coordinates &id) {
          std::copy_n(reinterpret_cast<T *>(it.ptr()), W,
                      g.mutable_data<float>() + id[3] * (C * W * H) +
                          id.z() * (W * H) + id.y() * W);
        },
        it);
  } else if (g_.ndim() == 3) {
    auto H = g_.dim32(1);
    auto W = g_.dim32(2);
    arm_compute::execute_window_loop(
        it_window,
        [&](const arm_compute::Coordinates &id) {
          std::copy_n(reinterpret_cast<T *>(it.ptr()), W,
                      g.mutable_data<float>() + (id.z() * (W * H) + id.y() * W));
        },
        it);
  } else if (g_.ndim() == 2) {
    auto W = g_.dim32(1);
    arm_compute::execute_window_loop(
        it_window,
        [&](const arm_compute::Coordinates &id) {
          std::copy_n(reinterpret_cast<T *>(it.ptr()), W,
                      g.mutable_data<float>() + id.y() * W);
        },
        it);
  } else {
    arm_compute::Window w;
    w.use_tensor_dimensions(info->tensor_shape());
    arm_compute::Iterator i(tensor, w);
    auto size = g_.dim32(0);
    std::copy_n(reinterpret_cast<T *>(i.ptr()), size, g.mutable_data<float>());
  }
  g_.unmap();
}

} // namespace caffe2

#endif /* CAFFE2_OPENGL_CONTEXT_H_ */
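getGLTensor above returns a deleted_unique_ptr so that a single return type can either own a freshly allocated GLTensor (freed by GLTensorDeleter) or merely borrow one that lives inside a Blob (released by the no-op EmptyDeleter). A minimal, self-contained sketch of that pattern, with illustrative names not taken from the file:

```
#include <functional>
#include <iostream>
#include <memory>

struct Payload { int x = 42; };

// Same alias as GLContext::deleted_unique_ptr: the deleter is a
// std::function, so owning and non-owning pointers share one type.
template <typename T>
using deleted_unique_ptr = std::unique_ptr<T, std::function<void(T*)>>;

int main() {
  // Owning: the deleter really frees the object.
  deleted_unique_ptr<Payload> owning(new Payload(), [](Payload* p) { delete p; });

  // Borrowing: the deleter is a no-op, mirroring EmptyDeleter, so the
  // unique_ptr can wrap a pointer it does not own (e.g. one inside a Blob).
  Payload stack_payload;
  deleted_unique_ptr<Payload> borrowing(&stack_payload, [](Payload*) {});

  std::cout << owning->x + borrowing->x << "\n";  // prints 84
  return 0;
}
```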
@@ -1,230 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/net_gl.h"
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/core/net.h"

#include <iostream>
#include <set>
#include <unordered_map>
#include <unordered_set>

#include "caffe2/core/operator.h"
#include "caffe2/core/static_tracepoint.h"
#include "caffe2/core/timer.h"
#include "caffe2/proto/caffe2_pb.h"
#include "caffe2/utils/proto_utils.h"

namespace caffe2 {

GLNet::GLNet(
    const std::shared_ptr<const NetDef>& net_def,
    Workspace* ws)
    : NetBase(net_def, ws) {
  ws_ = ws;
  VLOG(1) << "Constructing GLNet " << net_def->name();
  const bool net_def_has_device_option = net_def->has_device_option();
  // Initialize the operators
  for (int idx = 0; idx < net_def->op_size(); ++idx) {
    const auto& operator_def = net_def->op(idx);
    VLOG(1) << "Creating operator " << operator_def.name() << ": "
            << operator_def.type();
    output_blobs_.push_back(operator_def.output(0));
    if (operator_def.has_device_option() &&
        operator_def.device_option().device_type() == OPENGL) {
      opengl_device_.push_back(true);
    } else {
      opengl_device_.push_back(false);
    }

    std::unique_ptr<OperatorBase> op{nullptr};
    OperatorDef temp_def(operator_def);
    if (temp_def.type() == "GenerateProposals") {
      auto* arg = temp_def.add_arg();
      arg->set_name("fill_output");
      arg->set_i(1);
    }
    if (!operator_def.has_device_option() && net_def_has_device_option) {
      // In the case that the operator def does not specify a device option but
      // the net def has a default option, we copy the device option over to the
      // operator def.
      temp_def.mutable_device_option()->CopyFrom(net_def->device_option());
      op = CreateOperator(temp_def, ws, idx);
    } else {
      op = CreateOperator(temp_def, ws, idx);
      op->set_debug_def(
          std::shared_ptr<const OperatorDef>{net_def, &(net_def->op(idx))});
    }
    operators_.emplace_back(std::move(op));
  }
}

bool GLNet::Run() {
  StartAllObservers();

  if (first_run_) {
    first_run_ = false;
    for (auto& op : operators_) {
      VLOG(2) << "[C2DEBUG] configure " << ProtoDebugString(op->debug_def());
      op->Run();
    }
    for (auto& op : operators_) {
      VLOG(2) << "[C2DEBUG] second run " << ProtoDebugString(op->debug_def());
      op->Run();
    }
    // Change the parameters for GenerateProposals
    for (int i = 0; i < operators_.size(); ++i) {
      if (operators_[i]->debug_def().type() == "GenerateProposals") {
        OperatorDef temp_def(operators_[i]->debug_def());
        auto* arg = temp_def.add_arg();
        arg->set_name("fill_output");
        arg->set_i(0);
        operators_[i].reset(CreateOperator(temp_def, ws_, i).release());
      }
    }
  }

  VLOG(1) << "Running net " << name_;
  int i = 0;
  // Timer timer;
  for (auto& op : operators_) {
    VLOG(2) << "[C2DEBUG] running " << ProtoDebugString(op->debug_def()) << " " << i;
    ++i;
    // timer.Start();
    bool res = op->Run();
    // auto millis = timer.MilliSeconds();
    // LOG(ERROR) << "[C2DEBUG] OP " << op->debug_def().type() << " " << millis << " ms.";
    if (!res) {
      LOG(ERROR) << "[C2DEBUG] Operator failed: " << ProtoDebugString(op->debug_def());
      return false;
    }
  }
  StopAllObservers();
  return true;
}

bool GLNet::RunAsync() {
  return Run();
}

namespace {
template <typename A, typename B>
bool PairLargerThan(const std::pair<A, B>& x, const std::pair<A, B>& y) {
  return x.second > y.second;
}
}

vector<float> GLNet::TEST_Benchmark(
    const int warmup_runs,
    const int main_runs,
    const bool run_individual) {
  std::cout << "Starting benchmark." << std::endl;
  std::cout << "Running warmup runs." << std::endl;
  CAFFE_ENFORCE(
      warmup_runs >= 0,
      "Number of warm up runs should be non negative, provided ",
      warmup_runs,
      ".");
  for (int i = 0; i < warmup_runs; ++i) {
    CAFFE_ENFORCE(Run(), "Warmup run ", i, " has failed.");
  }

  auto last_blob = output_blobs_[output_blobs_.size() - 1];
  Blob *gpu_out_blob = ws_->GetBlob(last_blob);
  if (gpu_out_blob->IsType<GLTensor<DataType>>()) {
    auto &g_ = gpu_out_blob->Get<GLTensor<DataType>>();
    // Enforce gpu execution
    g_.sync();
  }

  std::cout << "Main runs." << std::endl;
  CAFFE_ENFORCE(
      main_runs >= 0,
      "Number of main runs should be non negative, provided ",
      main_runs,
      ".");
  Timer timer;
  for (int i = 0; i < main_runs; ++i) {
    CAFFE_ENFORCE(Run(), "Main run ", i, " has failed.");
  }
  if (gpu_out_blob->IsType<GLTensor<DataType>>()) {
    auto &g_ = gpu_out_blob->Get<GLTensor<DataType>>();
    g_.sync();
  }

  auto millis = timer.MilliSeconds();
  std::cout << "Main run finished. Milliseconds per iter: "
            << millis / main_runs
            << ". Iters per second: " << 1000.0 * main_runs / millis << std::endl;

  vector<float> time_per_op(operators_.size(), 0);
  vector<uint64_t> flops_per_op(operators_.size(), 0);
  CaffeMap<string, float> time_per_op_type;
  if (run_individual) {
    for (int i = 0; i < main_runs; ++i) {
      for (auto& op : operators_) {
        op->ResetEvent();
      }
      int idx = 0;
      for (auto& op : operators_) {
        const string& op_type = op->debug_def().type();
        timer.Start();
        CAFFE_ENFORCE(
            op->Run(),
            "operator ",
            op->debug_def().name(),
            "(",
            op_type,
            ") has failed.");
        if (opengl_device_[idx] && op_type != "CopyFromGL" && op_type != "Reshape") {
          Blob *gpu_out_blob = ws_->GetBlob(output_blobs_[idx]);
          auto &g_ = gpu_out_blob->Get<GLTensor<DataType>>();
          g_.sync();
        }
        float spent = timer.MilliSeconds();
        time_per_op[idx] += spent;
        time_per_op_type[op_type] += spent;
        ++idx;
      }
    }

    int idx = 0;
    for (auto& op : operators_) {
      const string& op_type = op->debug_def().type();
      const string& print_name =
          (op->debug_def().name().size()
               ? op->debug_def().name()
               : (op->debug_def().output_size() ? op->debug_def().output(0)
                                                : "NO_OUTPUT"));
      std::stringstream flops_str;
      if (flops_per_op[idx]) {
        flops_str << " ("
                  << to_string(1.0e-6 * flops_per_op[idx] / time_per_op[idx])
                  << " GFLOPS)";
      }
      std::cout << "[C2DEBUG] Operator #" << idx << " (" << print_name << ", " << op_type
                << ") " << time_per_op[idx] / main_runs << " ms/iter"
                << flops_str.str() << std::endl;
      ++idx;
    }
    std::cout << "[C2DEBUG] Time per operator type:" << std::endl;
    // Sort by decreasing time spent.
    std::vector<std::pair<string, float>> time_per_op_type_vec(
        time_per_op_type.begin(), time_per_op_type.end());
    std::sort(
        time_per_op_type_vec.begin(),
        time_per_op_type_vec.end(),
        PairLargerThan<string, float>);
    for (const auto& item : time_per_op_type_vec) {
      std::cout << "[C2DEBUG] " << std::setw(15) << std::setfill(' ') << item.second / main_runs
                << " " << item.first << std::endl;
    }
  }
  // We will reuse time_per_op to return the result of BenchmarkNet.
  for (int i = 0; i < time_per_op.size(); ++i) {
    time_per_op[i] /= main_runs;
  }
  time_per_op.insert(time_per_op.begin(), millis / main_runs);
  return time_per_op;
}

REGISTER_NET(opengl, GLNet);

} // namespace caffe2
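GLNet::Run drives every operator through two warm-up passes before steady-state execution: the first pass lets each op configure its ACL layer, the second triggers allocation and weight copies, and later passes only execute (reconfiguring when shapes change). A reduced sketch of that state machine, with a hypothetical ThreePhaseOp standing in for the GL operators above:

```
#include <iostream>

// Hypothetical stand-in for the first_run_/second_run_ state machine that
// each GL operator implements inside RunOnDevice().
class ThreePhaseOp {
 public:
  bool Run() {
    if (first_run_) {
      first_run_ = false;
      std::cout << "configure layer (no GPU memory yet)\n";
    } else if (second_run_) {
      second_run_ = false;
      std::cout << "allocate output + copy inputs, then run\n";
    } else {
      std::cout << "steady state: reconfigure only if shapes changed, run\n";
    }
    return true;
  }

 private:
  bool first_run_ = true, second_run_ = true;
};

int main() {
  ThreePhaseOp op;
  // GLNet::Run drives every op twice up front (the two warm-up loops),
  // so real inference requests always hit the steady-state branch.
  for (int i = 0; i < 4; ++i) op.Run();
  return 0;
}
```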
@@ -1,65 +0,0 @@
#ifndef CAFFE2_CORE_NET_GL_H_
#define CAFFE2_CORE_NET_GL_H_

#include <vector>

#include "c10/util/Registry.h"
#include "caffe2/core/common.h"
#include "caffe2/core/logging.h"
#include "caffe2/core/net.h"
#include "caffe2/core/tensor.h"
#include "caffe2/core/workspace.h"
#include "caffe2/proto/caffe2_pb.h"

namespace caffe2 {

// This is the very basic structure you need to run a network with
// ARM's compute library
class GLNet : public NetBase {
 private:
  bool first_run_ = true;
  Workspace* ws_;
  // Record output blobs for the sync step in operator-level benchmarking
  std::vector<string> output_blobs_;
  // Record which operators run on OpenGL, so we only sync after GPU ops
  std::vector<bool> opengl_device_;

 public:
  GLNet(const std::shared_ptr<const NetDef>& net_def, Workspace* ws);
  bool SupportsAsync() override {
    return false;
  }

  vector<float> TEST_Benchmark(
      const int warmup_runs,
      const int main_runs,
      const bool run_individual) override;

  /*
   * This returns a list of pointers to objects stored in unique_ptrs.
   * Used by Observers.
   *
   * Think carefully before using.
   */
  vector<OperatorBase*> GetOperators() const override {
    vector<OperatorBase*> op_list;
    for (auto& op : operators_) {
      op_list.push_back(op.get());
    }
    return op_list;
  }

 protected:
  bool Run();
  bool RunAsync();
  bool DoRunAsync() override {
    return Run();
  }

  vector<unique_ptr<OperatorBase>> operators_;

  C10_DISABLE_COPY_AND_ASSIGN(GLNet);
};

} // namespace caffe2

#endif // CAFFE2_CORE_NET_GL_H_
@@ -1,12 +0,0 @@
#include "operator.h"

namespace caffe2 {

C10_DEFINE_REGISTRY(
    GLOperatorRegistry,
    OperatorBase,
    const OperatorDef&,
    Workspace*);
CAFFE_REGISTER_DEVICE_TYPE(DeviceType::OPENGL, GLOperatorRegistry);

} // namespace caffe2
@@ -1,30 +0,0 @@
#ifndef CAFFE2_OPENGL_OPERATOR_H_
#define CAFFE2_OPENGL_OPERATOR_H_

#include "c10/util/Registry.h"
#include "caffe2/core/operator.h"

namespace caffe2 {

C10_DECLARE_REGISTRY(
    GLOperatorRegistry,
    OperatorBase,
    const OperatorDef&,
    Workspace*);
#define REGISTER_GL_OPERATOR_CREATOR(key, ...) \
  C10_REGISTER_CREATOR(GLOperatorRegistry, key, __VA_ARGS__)
#define REGISTER_GL_OPERATOR(name, ...)                           \
  extern void CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name();     \
  static void CAFFE2_UNUSED CAFFE_ANONYMOUS_VARIABLE_GL##name() { \
    CAFFE2_PLEASE_ADD_OPERATOR_SCHEMA_FOR_##name();               \
  }                                                               \
  C10_REGISTER_CLASS(GLOperatorRegistry, name, __VA_ARGS__)
#define REGISTER_GL_OPERATOR_STR(str_name, ...) \
  C10_REGISTER_TYPED_CLASS(GLOperatorRegistry, str_name, __VA_ARGS__)

#define REGISTER_GL_OPERATOR_WITH_ENGINE(name, engine, ...) \
  C10_REGISTER_CLASS(GLOperatorRegistry, name##_ENGINE_##engine, __VA_ARGS__)

} // namespace caffe2

#endif // CAFFE2_OPENGL_OPERATOR_H_
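These macros delegate to c10's registry machinery; conceptually the registry is a string-keyed map from operator type name to a factory function, populated by static initializers before main() runs. A toy, self-contained approximation of what REGISTER_GL_OPERATOR boils down to (all names here are illustrative, not the real c10 internals):

```
#include <functional>
#include <iostream>
#include <map>
#include <memory>
#include <string>

// Toy version of what C10_DEFINE_REGISTRY / C10_REGISTER_CLASS provide:
// a string-keyed map from op type name to a factory function.
struct OpBase { virtual ~OpBase() = default; virtual void Run() = 0; };

using Factory = std::function<std::unique_ptr<OpBase>()>;

std::map<std::string, Factory>& GLRegistry() {
  static std::map<std::string, Factory> registry;
  return registry;
}

struct ReluOp final : OpBase { void Run() override { std::cout << "Relu\n"; } };

// REGISTER_GL_OPERATOR(Relu, ReluOp) amounts to an insertion like this,
// done from a static initializer so it happens before main().
const bool kReluRegistered =
    (GLRegistry()["Relu"] = [] { return std::make_unique<ReluOp>(); }, true);

int main() {
  // CreateOperator(def, ws) looks the type name up and calls the factory.
  GLRegistry().at("Relu")()->Run();
  return 0;
}
```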
@@ -1,257 +0,0 @@
#include "rewrite_net.h"
#include "caffe2/core/operator.h"
#include "caffe2/utils/proto_utils.h"
#include <unordered_map>

namespace caffe2 {

struct Analysis {
  struct SSA {
    using BlobVersions = std::unordered_map<std::string, size_t>;
    BlobVersions inVersions;
    BlobVersions outVersions;
  };
  std::vector<SSA> ssa;
  std::unordered_map<std::string, std::unordered_map<size_t, std::vector<size_t>>> inUsages;
};

static Analysis analyzeNet(const NetDef& net) {
  Analysis::SSA::BlobVersions frontier;
  Analysis analysis;

  auto play = [&](size_t i, const OperatorDef& op) {
    Analysis::SSA::BlobVersions inVersions;
    for (const auto& s : op.input()) {
      inVersions[s] = frontier[s];
      analysis.inUsages[s][frontier[s]].push_back(i);
    }
    Analysis::SSA::BlobVersions outVersions;
    for (const auto& s : op.output()) {
      if (frontier.find(s) != frontier.end()) {
        frontier[s] += 1;
      }
      outVersions[s] = frontier[s];
    }
    analysis.ssa.push_back(Analysis::SSA{inVersions, outVersions});
  };

  for (auto i = 0; i < net.op_size(); ++i) {
    play(i, net.op(i));
  }
  return analysis;
}

static void insertCopyFromGLOp(NetDef& predictNet, const std::string& cpu_blob) {
  auto* op = predictNet.add_op();
  op->set_name("CopyFromGL");
  op->set_type("CopyFromGL");
  op->add_input(cpu_blob + "_M");
  op->add_output(cpu_blob);
}

static NetDef insertInputOutputCopyOps(const NetDef& def, std::unordered_set<std::string>& cpuOp) {
  // Do some validation of the outputs. For this version, we require:
  // - a single input (first element of external_input()) is consumed by the NetDef
  // - a single output (first element of external_output()) is produced by the NetDef.
  // - the input is consumed by def.op(0), and this is the only consumer.
  // - the output is produced by def.op(-1).
  CAFFE_ENFORCE_GE(def.external_input_size(), 1);
  CAFFE_ENFORCE_GE(def.external_output_size(), 1);
  auto analysis = analyzeNet(def);
  // Enforce a single use of the input blob.
  CAFFE_ENFORCE_GE(def.op_size(), 1);

  const auto& inputBlob = def.external_input(0);
  // Enforce that the input blob has a single usage - in the first operator.
  CAFFE_ENFORCE(analysis.inUsages[inputBlob][0] == (std::vector<size_t>{0}));
  const auto& outputBlob = def.external_output(0);
  const auto& outputBlobVersion = analysis.ssa.back().outVersions[outputBlob];
  // This should hold true by definition of the SSA analysis.
  CAFFE_ENFORCE(analysis.inUsages[outputBlob].find(outputBlobVersion) ==
                analysis.inUsages[outputBlob].end());

  NetDef mdef;
  mdef.CopyFrom(def);
  mdef.clear_op();

  std::unordered_map<std::string, std::set<size_t>> cpu_blobs, gpu_blobs;
  cpu_blobs[def.external_input(0)].insert(0);
  VLOG(2) << "[C2DEBUG] def.op_size(): " << def.op_size();
  for (auto i = 0; i < def.op_size(); i++) {
    const auto& currentOp = def.op(i);
    if (cpuOp.count(currentOp.type()) > 0) {
      // CPU op: insert a CopyFromGL op for each GPU-produced input.
      for (auto j = 0; j < currentOp.input_size(); j++) {
        auto& input = currentOp.input(j);
        auto version = analysis.ssa[i].inVersions[input];
        if (gpu_blobs[input].count(version) > 0) {
          insertCopyFromGLOp(mdef, input);
        }
      }
      auto* op = mdef.add_op();
      op->CopyFrom(currentOp);
      for (auto j = 0; j < currentOp.output_size(); j++) {
        auto& output = currentOp.output(j);
        auto version = analysis.ssa[i].outVersions[output];
        cpu_blobs[output].insert(version);
      }
    } else {
      // OpenGL op
      auto* op = mdef.add_op();
      op->CopyFrom(currentOp);

      for (auto j = 0; j < op->input_size(); j++) {
        auto* input = op->mutable_input(j);
        auto version = analysis.ssa[i].inVersions[*input];
        if (gpu_blobs[*input].count(version) > 0) {
          *input = *input + "_M";
        }
      }

      for (auto j = 0; j < currentOp.output_size(); j++) {
        auto& output = currentOp.output(j);
        auto version = analysis.ssa[i].outVersions[output];
        gpu_blobs[output].insert(version);
        // Add _M to intermediate OpenGL op outputs
        auto* output_ = op->mutable_output(j);
        bool inter = true;
        for (auto k = 0; k < def.external_output_size(); k++) {
          if (*output_ == def.external_output(k)) {
            inter = false;
          }
        }
        if (inter) {
          *output_ = *output_ + "_M";
        }
      }
    }
  }
  return mdef;
}

static bool tryFuseAdjacentOps(const OperatorDef& currentOp,
                               const OperatorDef& nextOp,
                               OperatorDef* fusedOp,
                               std::unordered_set<std::string>& glOps) {
  // Check for possible invalid opportunities.
  if (currentOp.output_size() != 1 || nextOp.output_size() != 1) {
    return false;
  }
  // The fused op cannot be in-place.
  if (currentOp.output(0) != nextOp.input(0) || currentOp.input(0) == nextOp.output(0)) {
    return false;
  }

  static const std::map<std::pair<std::string, std::string>, std::string> fusionOpportunities = {
      {{"Conv", "Relu"}, "ConvRelu"}};
  auto it = fusionOpportunities.find({currentOp.type(), nextOp.type()});
  if (it == fusionOpportunities.end()) {
    return false;
  }

  glOps.insert(it->second);
  fusedOp->CopyFrom(currentOp);
  fusedOp->set_output(0, nextOp.output(0));
  fusedOp->set_type(it->second);
  for (auto i = 1; i < nextOp.input_size(); i++) {
    fusedOp->add_input(nextOp.input(i));
  }
  return true;
}

static NetDef runOpenGLFusion(const NetDef& def, std::unordered_set<std::string>& glOps) {
  CHECK_GE(def.op_size(), 1);
  NetDef mdef;
  mdef.CopyFrom(def);
  mdef.clear_op();
  auto i = 0;

  while (i < def.op_size()) {
    if (i == def.op_size() - 1) {
      VLOG(2) << "Last operator, skipping";
      auto* op = mdef.add_op();
      op->CopyFrom(def.op(i));
      i += 1;
      continue;
    }

    const auto& currentOp = def.op(i);
    const auto& nextOp = def.op(i + 1);
    OperatorDef fusedOp;
    if (tryFuseAdjacentOps(currentOp, nextOp, &fusedOp, glOps)) {
      VLOG(2) << "Found an adjacent fusion for: " << currentOp.type() << ", " << nextOp.type();
      // We can fuse.
      auto* op = mdef.add_op();
      op->CopyFrom(fusedOp);
      i += 2;
      continue;
    }
    VLOG(2) << "No fusion available for: " << currentOp.type() << ", " << nextOp.type();
    // Just emit the current op unchanged.
    auto* op = mdef.add_op();
    op->CopyFrom(currentOp);
    i += 1;
  }
  return mdef;
}

void dumpDefForOpenGL(const NetDef& d) {
  for (const auto& op : d.op()) {
    LOG(INFO) << op.input(0) << " -> " << op.type() << " -> " << op.output(0);
  }
}

// // For debugging
// void dumpDefForOpenGL(const NetDef &net) {
//   for (const auto &op : net.op()) {
//     printf("***Operator: %s\n", op.type().c_str());
//     for (auto input : op.input()) {
//       printf("\tInput: %s\n", input.c_str());
//     }
//
//     for (auto output : op.output()) {
//       printf("\tOutput: %s\n", output.c_str());
//     }
//   }
// }

NetDef rewritePredictNetForOpenGL(const NetDef& predictNet, bool runFusion, std::unordered_set<std::string> cpuOps) {
  CAFFE_ENFORCE_GE(predictNet.op_size(), 1);
  NetDef net;
  net.CopyFrom(predictNet);
  // if (runFusion) {
  //   net = runOpenGLFusion(net, openGLOps);
  // }
  net = insertInputOutputCopyOps(net, cpuOps);
  VLOG(2) << "[C2DEBUG] net size " << net.op().size();
  net.set_type("opengl");

  for (auto i = 0; i < net.op().size(); ++i) {
    auto op = net.mutable_op(i);
    if (std::find(cpuOps.begin(), cpuOps.end(), op->type()) == cpuOps.end()) {
      op->mutable_device_option()->set_device_type(PROTO_OPENGL);
    }
  }

  return net;
}

bool tryConvertToOpenGL(const NetDef& predictNet,
                        NetDef* glPredictNet,
                        bool runFusion,
                        std::unordered_set<std::string> cpuOps) {
  try {
    // Throws if unsupported operators are found.
    VLOG(2) << "[C2DEBUG] in tryConvertToOpenGL";
    *glPredictNet = rewritePredictNetForOpenGL(predictNet, runFusion, cpuOps);
    dumpDefForOpenGL(*glPredictNet);
    // Throws if unsupported parameters are found.
    LOG(INFO) << "OpenGL is successfully enabled";
    return true;
  } catch (const std::exception& e) {
    LOG(ERROR) << "Caught exception trying to convert NetDef to OpenGL: " << e.what();
    return false;
  }
}
} // namespace caffe2
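The SSA analysis above assigns each blob an integer version that increments whenever an operator rewrites it; insertInputOutputCopyOps then uses (blob, version) pairs to decide whether a given read sees a CPU-produced or GPU-produced value. A self-contained sketch of the versioning rule from analyzeNet, run on a two-op toy net:

```
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

int main() {
  // Toy net: op0 reads "data" and writes "conv"; op1 reads and rewrites
  // "conv" in place. Mirrors the frontier bookkeeping in analyzeNet().
  std::vector<std::pair<std::vector<std::string>, std::vector<std::string>>>
      net = {{{"data"}, {"conv"}}, {{"conv"}, {"conv"}}};

  std::unordered_map<std::string, size_t> frontier;
  for (size_t i = 0; i < net.size(); ++i) {
    for (const auto& in : net[i].first)
      std::cout << "op" << i << " reads " << in << " v" << frontier[in] << "\n";
    for (const auto& out : net[i].second) {
      // Same rule as analyzeNet: bump the version only on a re-definition.
      if (frontier.find(out) != frontier.end()) frontier[out] += 1;
      std::cout << "op" << i << " writes " << out << " v" << frontier[out] << "\n";
    }
  }
  // Output: op0 reads data v0, writes conv v0; op1 reads conv v0, writes conv v1.
  return 0;
}
```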
@@ -1,17 +0,0 @@
#pragma once
#include "caffe2/mobile/contrib/arm-compute/core/net_gl.h"
#include <unordered_set>

namespace caffe2 {
bool tryConvertToOpenGL(const NetDef& predictNet,
                        NetDef* glPredictNet,
                        bool runFusion,
                        std::unordered_set<std::string> cpuOps);

// Exposed for testing
NetDef rewritePredictNetForOpenGL(const NetDef& predictNet,
                                  bool runFusion,
                                  std::unordered_set<std::string> cpuOps);
void dumpDefForOpenGL(const NetDef& net);
} // namespace caffe2
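A plausible call site for this header, sketched under the assumption that the predict net has already been loaded into a NetDef; the wrapper name and the CPU-op list are illustrative, while the signatures match the declarations above:

```
// Hedged usage sketch for the API declared above; protobuf loading of the
// predict net is elided and the cpu_ops contents are hypothetical.
#include "caffe2/mobile/contrib/arm-compute/core/rewrite_net.h"

namespace caffe2 {
bool PrepareGLNet(const NetDef& predict_net, NetDef* gl_net) {
  // Ops named here stay on CPU; everything else gets device_type OPENGL.
  std::unordered_set<std::string> cpu_ops = {"GenerateProposals"};
  const bool run_fusion = false;  // the fusion pass is commented out upstream
  return tryConvertToOpenGL(predict_net, gl_net, run_fusion, cpu_ops);
}
} // namespace caffe2
```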
Binary file not shown.
Binary file not shown.
@@ -1,2 +0,0 @@
file(GLOB_RECURSE tmp *.cc)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${tmp} PARENT_SCOPE)
@@ -1,110 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"

#include "caffe2/mobile/contrib/arm-compute/operators/activation_ops.h"
#include "caffe2/operators/relu_op.h"

namespace caffe2 {

template <typename T>
bool GLReluOp<T>::RunOnDevice() {
  auto *Xblob = OperatorBase::Inputs()[0];
  X_ = GLContext::getGLTensor<T>(Xblob, X_.release());

  GLTensor<T> *Y =
      OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();

  if (first_run_) {
    first_run_ = false;
    if (Y->get_underlying() != X_->get_underlying()) {
      Y->ResizeLike(*X_);
    }
    relu_layer_.configure(
        X_->get_underlying(), Y->get_underlying(),
        arm_compute::ActivationLayerInfo(
            arm_compute::ActivationLayerInfo::ActivationFunction::RELU));
  } else if (second_run_) {
    X_->lazy_allocate(Xblob, second_run_, true);
    second_run_ = false;
    // In-place activation: no need to allocate new memory
    if (Y->get_underlying() != X_->get_underlying()) {
      Y->ResizeLike(*X_);
      Y->allocate();
    }
    relu_layer_.run();
  } else {
    X_->lazy_allocate(Xblob, second_run_, true);
    bool need_allocation = false;
    if (Y->get_underlying() != X_->get_underlying()) {
      need_allocation = Y->ResizeLike(*X_, true);
    }
    relu_layer_.configure(
        X_->get_underlying(), Y->get_underlying(),
        arm_compute::ActivationLayerInfo(
            arm_compute::ActivationLayerInfo::ActivationFunction::RELU));
    if (need_allocation) {
      Y->allocate();
    }
    relu_layer_.run();
  }

  return true;
}

REGISTER_GL_OPERATOR(Relu, GLReluOp<DataType>);

template <typename T>
bool GLSigmoidOp<T>::RunOnDevice() {
  auto *Xblob = OperatorBase::Inputs()[0];
  X_ = GLContext::getGLTensor<T>(Xblob, X_.release());

  GLTensor<T> *Y =
      OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();
  if (first_run_) {
    first_run_ = false;

    if (Y->get_underlying() != X_->get_underlying()) {
      Y->ResizeLike(*X_);
    }

    sigmoid_layer_.configure(
        X_->get_underlying(), Y->get_underlying(),
        arm_compute::ActivationLayerInfo(
            arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC));
  } else if (second_run_) {
    X_->lazy_allocate(Xblob, second_run_, true);
    second_run_ = false;
    // In-place activation: no need to allocate new memory
    if (Y->get_underlying() != X_->get_underlying()) {
      Y->ResizeLike(*X_);
      Y->allocate();
    }
    sigmoid_layer_.run();
  } else {
    X_->lazy_allocate(Xblob, second_run_, true);
    bool need_allocation = false;
    if (Y->get_underlying() != X_->get_underlying()) {
      need_allocation = Y->ResizeLike(*X_, true);
    }
    sigmoid_layer_.configure(
        X_->get_underlying(), Y->get_underlying(),
        arm_compute::ActivationLayerInfo(
            arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC));
    if (need_allocation) {
      Y->allocate();
    }
    sigmoid_layer_.run();
  }

  return true;
}

REGISTER_GL_OPERATOR(Sigmoid, GLSigmoidOp<DataType>);

} // namespace caffe2
@@ -1,38 +0,0 @@
#ifndef CAFFE2_OPENGL_OPERATORS_ACTIVATION_OPS_H_
#define CAFFE2_OPENGL_OPERATORS_ACTIVATION_OPS_H_

#include "caffe2/core/context.h"
#include "caffe2/core/operator.h"

namespace caffe2 {

template <typename T>
class GLSigmoidOp final : public Operator<GLContext> {
 public:
  GLSigmoidOp(const OperatorDef &operator_def, Workspace *ws)
      : Operator<GLContext>(operator_def, ws) {}
  USE_OPERATOR_FUNCTIONS(GLContext);
  bool RunOnDevice() override;

 private:
  arm_compute::GCActivationLayer sigmoid_layer_;
  bool first_run_ = true, second_run_ = true;
  GLContext::deleted_unique_ptr<const GLTensor<T>> X_;
};

template <typename T>
class GLReluOp final : public Operator<GLContext> {
 public:
  GLReluOp(const OperatorDef &operator_def, Workspace *ws)
      : Operator<GLContext>(operator_def, ws) {}
  virtual ~GLReluOp() noexcept {}
  USE_OPERATOR_FUNCTIONS(GLContext);
  bool RunOnDevice() override;

 private:
  arm_compute::GCActivationLayer relu_layer_;
  bool first_run_ = true, second_run_ = true;
  GLContext::deleted_unique_ptr<const GLTensor<T>> X_;
};

} // namespace caffe2

#endif // CAFFE2_OPENGL_OPERATORS_ACTIVATION_OPS_H_
@@ -1,116 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"
#include "caffe2/operators/concat_split_op.h"

namespace caffe2 {

template <typename T>
class GLConcatOp final : public Operator<GLContext> {
 public:
  GLConcatOp(const OperatorDef &operator_def, Workspace *ws)
      : Operator<GLContext>(operator_def, ws) {}
  virtual ~GLConcatOp() noexcept {}
  USE_OPERATOR_FUNCTIONS(GLContext);
  bool RunOnDevice() override;

 private:
  arm_compute::GCDepthConcatenateLayer concat_layer_;
  bool first_run_ = true, second_run_ = true;
  std::vector<GLContext::deleted_unique_ptr<const GLTensor<T>>> inputs_;
  int channelCount_ = 0;
};

template <typename T>
bool GLConcatOp<T>::RunOnDevice() {
  CAFFE_ENFORCE(InputSize() <= 4 && InputSize() >= 2,
                "Number of inputs must be between 2 and 4.");

  auto *X0blob = OperatorBase::Inputs()[0];
  if (first_run_) {
    auto X0 = GLContext::getGLTensor<T>(X0blob);
    inputs_.push_back(std::move(X0));
  } else {
    auto X0 = GLContext::getGLTensor<T>(X0blob, inputs_[0].release());
    inputs_[0] = std::move(X0);
  }

  int N = inputs_[0]->dim32(0);
  int channels = inputs_[0]->dim32(1);
  int height = inputs_[0]->dim32(2);
  int width = inputs_[0]->dim32(3);
  std::vector<const Blob*> inputsBlob;
  inputsBlob.push_back(X0blob);

  if (first_run_) {
    channelCount_ = channels;
    for (int i = 1; i < Inputs().size(); ++i) {
      auto *Xblob = OperatorBase::Inputs()[i];
      auto X = GLContext::getGLTensor<T>(Xblob);
      CAFFE_ENFORCE_EQ(N, X->dim32(0), X->dim32(0));
      CAFFE_ENFORCE_EQ(height, X->dim32(2), X->dim32(2));
      CAFFE_ENFORCE_EQ(width, X->dim32(3), X->dim32(3));
      channelCount_ += X->dim32(1);
      inputs_.push_back(std::move(X));
    }
  } else {
    channelCount_ = channels;
    for (int i = 1; i < Inputs().size(); ++i) {
      auto *Xblob = OperatorBase::Inputs()[i];
      auto X = GLContext::getGLTensor<T>(Xblob, inputs_[i].release());
      CAFFE_ENFORCE_EQ(N, X->dim32(0), X->dim32(0));
      CAFFE_ENFORCE_EQ(height, X->dim32(2), X->dim32(2));
      CAFFE_ENFORCE_EQ(width, X->dim32(3), X->dim32(3));
      channelCount_ += X->dim32(1);
      inputs_[i] = std::move(X);
    }
  }

  for (int i = 1; i < Inputs().size(); ++i) {
    auto *Xblob = OperatorBase::Inputs()[i];
    inputsBlob.push_back(Xblob);
  }
  std::vector<int> output_dims = {N, channelCount_, height, width};
  GLTensor<T> *Y =
      OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();
  if (first_run_) {
    first_run_ = false;
    Y->Resize(output_dims);
    std::vector<arm_compute::IGCTensor*> inputsGC;
    for (int i = 0; i < inputs_.size(); ++i) {
      inputsGC.push_back(inputs_[i]->get_underlying());
    }
    concat_layer_.configure(inputsGC, Y->get_underlying());
  } else if (second_run_) {
    for (int i = 0; i < inputs_.size(); ++i) {
      auto* X = inputs_[i].get();
      auto* Xblob = inputsBlob[i];
      X->lazy_allocate(Xblob, second_run_, true);
    }
    second_run_ = false;
    Y->Resize(output_dims);
    Y->allocate();
    concat_layer_.run();
  } else {
    for (int i = 0; i < inputs_.size(); ++i) {
      auto* X = inputs_[i].get();
      auto* Xblob = inputsBlob[i];
      X->lazy_allocate(Xblob, second_run_, true);
    }
    bool need_allocation = Y->Resize(output_dims);
    std::vector<arm_compute::IGCTensor*> inputsGC;
    for (int i = 0; i < inputs_.size(); ++i) {
      inputsGC.push_back(inputs_[i]->get_underlying());
    }
    concat_layer_.configure(inputsGC, Y->get_underlying());
    if (need_allocation) {
      Y->allocate();
    }
    concat_layer_.run();
  }

  return true;
}

REGISTER_GL_OPERATOR(Concat, GLConcatOp<DataType>);

} // namespace caffe2
@@ -1,113 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"

#include "caffe2/operators/conv_op.h"

namespace caffe2 {

template <typename T>
class GLConvOp final : public ConvPoolOpBase<GLContext> {
 public:
  USE_CONV_POOL_BASE_FUNCTIONS(GLContext);
  GLConvOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<GLContext>(operator_def, ws) {
    // Since this is the default convolution implementation, we will
    // use CAFFE_ENFORCE instead of OPERATOR_NEEDS_FEATURE.
    CAFFE_ENFORCE(
        group_ == 1 || order_ == StorageOrder::NCHW,
        "Group convolution only supports NCHW order right now.");
  }
  ~GLConvOp() {}

  bool RunOnDevice() override;

 private:
  arm_compute::GCDirectConvolutionLayer conv_;
  bool first_run_ = true, second_run_ = true;
  GLContext::deleted_unique_ptr<const GLTensor<T>> X_, filter_, bias_;
};

template <typename T>
bool GLConvOp<T>::RunOnDevice() {
  auto *Xblob = OperatorBase::Inputs()[0];
  auto *filterblob = OperatorBase::Inputs()[1];
  auto *biasblob = OperatorBase::Inputs()[2];
  X_ = GLContext::getGLTensor<T>(Xblob, X_.release());
  if (first_run_) {
    filter_ = GLContext::getGLTensor<T>(filterblob);
    bias_ = GLContext::getGLTensor<T>(biasblob);
  }

  GLTensor<T> *Y =
      OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();

  const int N = X_->dim32(0), H = X_->dim32(2), W = X_->dim32(3), C = X_->dim32(1);
  LOG(INFO) << "[C2DEBUG] Conv " << N << " " << H << " " << W << " " << C;
  CAFFE_ENFORCE_EQ(kernel_.size(), 2,
                   "Only 2d convolution is supported with ARM compute backend");

  CAFFE_ENFORCE(X_->ndim(), filter_->ndim());
  const int M = filter_->dim32(0);
  CAFFE_ENFORCE(filter_->dim32(2) == kernel_h());
  CAFFE_ENFORCE(filter_->dim32(3) == kernel_w());
  CAFFE_ENFORCE(filter_->dim32(1) == C);

  if (first_run_) {
    first_run_ = false;

    // Resize output accordingly
    TensorCPU fakeX;
    fakeX.Resize(X_->dims());
    TensorCPU fakeY;
    ConvPoolOpBase<GLContext>::SetOutputSize(fakeX, &fakeY, filter_->dim32(0));
    Y->ResizeLike(fakeY);
    LOG(INFO) << "[C2DEBUG] dims of X " << X_->dims();
    LOG(INFO) << "[C2DEBUG] dims of X(gctensor) "
              << X_->get_underlying()->info()->dimension(3) << " "
              << X_->get_underlying()->info()->dimension(2) << " "
              << X_->get_underlying()->info()->dimension(1) << " "
              << X_->get_underlying()->info()->dimension(0);
    LOG(INFO) << "[C2DEBUG] dims of Y " << Y->dims();
    LOG(INFO) << "[C2DEBUG] dims of Y(gctensor) "
              << Y->get_underlying()->info()->dimension(3) << " "
              << Y->get_underlying()->info()->dimension(2) << " "
              << Y->get_underlying()->info()->dimension(1) << " "
              << Y->get_underlying()->info()->dimension(0);

    conv_.configure(
        X_->get_underlying(), filter_->get_underlying(), bias_->get_underlying(),
        Y->get_underlying(),
        arm_compute::PadStrideInfo(stride_[0], stride_[1], pads_[0], pads_[1]));
  } else if (second_run_) {
    // Always attempt to copy the CPU input to the GPU
    X_->lazy_allocate(Xblob, second_run_, true);
    filter_->lazy_allocate(filterblob, second_run_, second_run_);
    bias_->lazy_allocate(biasblob, second_run_, second_run_);
    second_run_ = false;
    Y->allocate();
    conv_.run();
  } else {
    X_->lazy_allocate(Xblob, second_run_, true);
    TensorCPU fakeX;
    fakeX.Resize(X_->dims());
    TensorCPU fakeY;
    ConvPoolOpBase<GLContext>::SetOutputSize(fakeX, &fakeY, filter_->dim32(0));
    bool need_allocation = Y->ResizeLike(fakeY, true);
    if (need_allocation) {
      Y->allocate();
    }
    conv_.configure(
        X_->get_underlying(), filter_->get_underlying(), bias_->get_underlying(),
        Y->get_underlying(),
        arm_compute::PadStrideInfo(stride_[0], stride_[1], pads_[0], pads_[1]));
    conv_.run();
  }

  return true;
}

REGISTER_GL_OPERATOR(Conv, GLConvOp<DataType>);

} // namespace caffe2
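GLConvOp reuses ConvPoolOpBase's CPU shape inference by materializing throwaway TensorCPU objects (fakeX/fakeY) and letting SetOutputSize fill in the output shape, which is then used to resize the GL output tensor. The sketch below illustrates the arithmetic such inference performs for the no-dilation case; the formula is the standard convolution output-size rule, shown for illustration rather than taken from this file:

```
#include <cassert>
#include <vector>

// Output extent of a convolution along one spatial axis, assuming no
// dilation: floor((in + 2*pad - kernel) / stride) + 1.
int ConvOutExtent(int in, int kernel, int pad, int stride) {
  return (in + 2 * pad - kernel) / stride + 1;
}

int main() {
  // NCHW input 1x3x224x224, 3x3 kernel, pad 1, stride 2 -> 112x112 output.
  std::vector<int> x_dims = {1, 3, 224, 224};
  int M = 16;  // output channels, i.e. filter_->dim32(0) in GLConvOp
  std::vector<int> y_dims = {x_dims[0], M,
                             ConvOutExtent(x_dims[2], 3, 1, 2),
                             ConvOutExtent(x_dims[3], 3, 1, 2)};
  assert(y_dims[2] == 112 && y_dims[3] == 112);
  return 0;
}
```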
@@ -1,71 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"
#include "caffe2/core/timer.h"

namespace caffe2 {

template <typename T>
class CopyFromGLOp final : public Operator<GLContext> {
 public:
  CopyFromGLOp(const OperatorDef &operator_def, Workspace *ws)
      : Operator<GLContext>(operator_def, ws) {}
  virtual ~CopyFromGLOp() noexcept {}
  USE_OPERATOR_FUNCTIONS(GLContext);
  bool RunOnDevice() override;

 private:
  bool first_run_ = true, second_run_ = true;
  std::vector<GLContext::deleted_unique_ptr<const GLTensor<T>>> inputs_;
};

template <typename T>
bool CopyFromGLOp<T>::RunOnDevice() {
  std::vector<const Blob*> inputsBlob;

  for (int i = 0; i < Inputs().size(); ++i) {
    auto *Xblob = OperatorBase::Inputs()[i];
    inputsBlob.push_back(Xblob);
  }

  if (first_run_) {
    for (int i = 0; i < Inputs().size(); ++i) {
      auto *Xblob = inputsBlob[i];
      auto X = GLContext::getGLTensor<T>(Xblob);
      inputs_.push_back(std::move(X));
    }
  } else {
    for (int i = 0; i < Inputs().size(); ++i) {
      auto *Xblob = inputsBlob[i];
      auto X = GLContext::getGLTensor<T>(Xblob, inputs_[i].release());
      inputs_[i] = std::move(X);
    }
  }

  if (first_run_) {
    first_run_ = false;
    for (int i = 0; i < Inputs().size(); ++i) {
      auto* Y = BlobGetMutableTensor(OperatorBase::Outputs()[i], CPU);
      Y->Resize(inputs_[i]->dims());
      Y->template mutable_data<float>();
    }
  } else {
    for (auto i = 0; i < Inputs().size(); ++i) {
      // Blob
      auto* Xblob = inputsBlob[i];
      // GLTensor
      auto* X = inputs_[i].get();
      X->lazy_allocate(Xblob, second_run_, true);
      auto* Y = BlobGetMutableTensor(OperatorBase::Outputs()[i], CPU);
      Timer timer;
      timer.Start();
      getTensorCPU(*X, *Y);
      auto millis = timer.MilliSeconds();
      // LOG(ERROR) << "[C2DEBUG] copy_op " << X->dims() << " takes " << millis << " milliseconds";
    }
  }

  return true;
}

REGISTER_GL_OPERATOR(CopyFromGL, CopyFromGLOp<DataType>);

} // namespace caffe2
@@ -1,58 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"
#include "caffe2/operators/utility_ops.h"

namespace caffe2 {

template <typename T>
class GLSumOp final : public Operator<GLContext> {
 public:
  GLSumOp(const OperatorDef &operator_def, Workspace *ws)
      : Operator<GLContext>(operator_def, ws) {}
  virtual ~GLSumOp() noexcept {}
  USE_OPERATOR_FUNCTIONS(GLContext);
  bool RunOnDevice() override;

 private:
  arm_compute::GCArithmeticAddition add_layer_;
  bool first_run_ = true, second_run_ = true;
  GLContext::deleted_unique_ptr<const GLTensor<T>> A_, B_;
};

template <typename T>
bool GLSumOp<T>::RunOnDevice() {
  auto *Ablob = OperatorBase::Inputs()[0];
  auto *Bblob = OperatorBase::Inputs()[1];

  A_ = GLContext::getGLTensor<T>(Ablob, A_.release());
  B_ = GLContext::getGLTensor<T>(Bblob, B_.release());

  GLTensor<T> *Y =
      OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();
  if (first_run_) {
    first_run_ = false;
    Y->ResizeLike(*A_);
    add_layer_.configure(A_->get_underlying(), B_->get_underlying(),
                         Y->get_underlying(), arm_compute::ConvertPolicy::SATURATE);
  } else if (second_run_) {
    A_->lazy_allocate(Ablob, second_run_, true);
    B_->lazy_allocate(Bblob, second_run_, true);
    second_run_ = false;
    Y->allocate();
    add_layer_.run();
  } else {
    A_->lazy_allocate(Ablob, second_run_, true);
    B_->lazy_allocate(Bblob, second_run_, true);
    bool need_allocation = Y->ResizeLike(*A_);
    add_layer_.configure(A_->get_underlying(), B_->get_underlying(),
                         Y->get_underlying(), arm_compute::ConvertPolicy::SATURATE);
    if (need_allocation) {
      Y->allocate();
    }
    add_layer_.run();
  }

  return true;
}

REGISTER_GL_OPERATOR(Sum, GLSumOp<DataType>);
REGISTER_GL_OPERATOR(Add, GLSumOp<DataType>);

} // namespace caffe2
@@ -1,76 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"

#include "caffe2/operators/fully_connected_op.h"

namespace caffe2 {

template <typename T> class GLFullyConnectedOp final : public Operator<GLContext> {
 public:
  GLFullyConnectedOp(const OperatorDef &operator_def, Workspace *ws)
      : Operator<GLContext>(operator_def, ws) {}
  virtual ~GLFullyConnectedOp() noexcept {}
  USE_OPERATOR_FUNCTIONS(GLContext);
  bool RunOnDevice() override;
 private:
  arm_compute::GCFullyConnectedLayer fc_layer_;
  bool first_run_ = true, second_run_ = true;
  GLContext::deleted_unique_ptr<const GLTensor<T>> X_, W_, B_;
};

template <typename T>
bool GLFullyConnectedOp<T>::RunOnDevice() {

  auto Xblob = OperatorBase::Inputs()[0];
  auto *Wblob = OperatorBase::Inputs()[1];
  auto *Bblob = OperatorBase::Inputs()[2];

  X_ = GLContext::getGLTensor<T>(Xblob, X_.release());
  if (first_run_) {
    W_ = GLContext::getGLTensor<T>(Wblob);
    B_ = GLContext::getGLTensor<T>(Bblob);
  }

  auto M = X_->dim32(0);
  auto CIn = X_->dim32(1);
  auto Height = X_->dim32(2);
  auto Width = X_->dim32(3);
  auto N = W_->dim32(0);

  CAFFE_ENFORCE_EQ(1, B_->ndim());
  CAFFE_ENFORCE_EQ(N, B_->dim32(0));

  vector<int64_t> output_dims = {M, N};
  GLTensor<T> *Y =
      OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();
  if (first_run_) {
    first_run_ = false;
    Y->Resize(output_dims);

    fc_layer_.configure(X_->get_underlying(), W_->get_underlying(),
                        B_->get_underlying(), Y->get_underlying(), true, false);
  } else if (second_run_) {
    X_->lazy_allocate(Xblob, second_run_, true);
    W_->lazy_allocate(Wblob, second_run_, second_run_);
    B_->lazy_allocate(Bblob, second_run_, second_run_);
    second_run_ = false;
    Y->Resize(output_dims);
    Y->allocate();
    fc_layer_.run();
  } else {
    X_->lazy_allocate(Xblob, second_run_, true);
    bool need_allocation = Y->Resize(output_dims);
    fc_layer_.configure(X_->get_underlying(), W_->get_underlying(),
                        B_->get_underlying(), Y->get_underlying(), true, false);
    if (need_allocation) {
      Y->allocate();
    }
    fc_layer_.run();
  }

  return true;
}

REGISTER_GL_OPERATOR(FC, GLFullyConnectedOp<DataType>);

} // namespace caffe2
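The FC op flattens every non-batch dimension of X against the weight's inner dimension, so the output shape is {M, N} regardless of how CIn, Height, and Width are split. A quick worked check of that arithmetic, using the hypothetical shapes from the fc test further below:

```cpp
#include <cassert>
#include <cstdio>

int main() {
  // X: {M, CIn, H, W}; W: {N, CIn * H * W}; B: {N}  (hypothetical test shapes)
  int M = 1, CIn = 4, H = 8, W = 8;
  int N = 16;
  int K = CIn * H * W;  // 256 inner elements per sample
  assert(K == 256);
  // The FC output is {M, N}, independent of H and W individually.
  std::printf("Y: {%d, %d}\n", M, N);  // prints "Y: {1, 16}"
}
```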
@@ -1,70 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"

namespace caffe2 {

template <typename T>
class GLNormalizePlanarYUVOp final : public Operator<GLContext> {
 public:
  GLNormalizePlanarYUVOp(const OperatorDef &operator_def, Workspace *ws)
      : Operator<GLContext>(operator_def, ws) {}
  virtual ~GLNormalizePlanarYUVOp() noexcept {}
  USE_OPERATOR_FUNCTIONS(GLContext);
  bool RunOnDevice() override;
 private:
  arm_compute::GCNormalizePlanarYUVLayer norm_layer_;
  bool first_run_ = true, second_run_ = true;
  GLContext::deleted_unique_ptr<const GLTensor<T>> X_, mean_, sd_;
};

template <typename T> bool GLNormalizePlanarYUVOp<T>::RunOnDevice() {

  auto Xblob = OperatorBase::Inputs()[0];
  auto *meanblob = OperatorBase::Inputs()[1];
  auto *sdblob = OperatorBase::Inputs()[2];

  X_ = GLContext::getGLTensor<T>(Xblob, X_.release());
  if (first_run_) {
    mean_ = GLContext::getGLTensor<T>(meanblob);
    sd_ = GLContext::getGLTensor<T>(sdblob);
  }

  CAFFE_ENFORCE_EQ(X_->ndim(), 4);
  auto N = X_->dim32(0);
  auto C = X_->dim32(1);
  auto H = X_->dim32(2);
  auto W = X_->dim32(3);

  CAFFE_ENFORCE_EQ(C, mean_->dim32(1));
  CAFFE_ENFORCE_EQ(C, sd_->dim32(1));

  GLTensor<T> *Y =
      OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();
  if (first_run_) {
    first_run_ = false;
    Y->ResizeLike(*X_);
    norm_layer_.configure(X_->get_underlying(), Y->get_underlying(), mean_->get_underlying(), sd_->get_underlying());
  } else if (second_run_) {
    X_->lazy_allocate(Xblob, second_run_, true);
    mean_->lazy_allocate(meanblob, second_run_, second_run_);
    sd_->lazy_allocate(sdblob, second_run_, second_run_);
    second_run_ = false;
    Y->ResizeLike(*X_);
    Y->allocate();
    norm_layer_.run();
  } else {
    X_->lazy_allocate(Xblob, second_run_, true);
    bool need_allocation = Y->ResizeLike(*X_);
    norm_layer_.configure(X_->get_underlying(), Y->get_underlying(), mean_->get_underlying(), sd_->get_underlying());
    if (need_allocation) {
      Y->allocate();
    }
    norm_layer_.run();
  }

  return true;
}

REGISTER_GL_OPERATOR(NormalizePlanarYUV, GLNormalizePlanarYUVOp<DataType>);

} // namespace caffe2
@@ -1,186 +0,0 @@
#include "caffe2/operators/pool_op.h"
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"

namespace caffe2 {

template <typename T>
class GLAveragePoolOp final : public ConvPoolOpBase<GLContext> {
 public:
  USE_CONV_POOL_BASE_FUNCTIONS(GLContext);
  GLAveragePoolOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<GLContext>(operator_def, ws) {
  }
  ~GLAveragePoolOp() {}

  bool RunOnDeviceWithOrderNCHW() override;
  bool RunOnDeviceWithOrderNHWC() override;
 private:
  arm_compute::GCPoolingLayer pooling_layer_;
  bool first_run_ = true, second_run_ = true;
  GLContext::deleted_unique_ptr<const GLTensor<T>> X_;
};

template<typename T>
class GLMaxPoolOp final : public ConvPoolOpBase<GLContext> {
 public:
  USE_CONV_POOL_BASE_FUNCTIONS(GLContext);
  GLMaxPoolOp(const OperatorDef& operator_def, Workspace* ws)
      : ConvPoolOpBase<GLContext>(operator_def, ws) {
  }
  ~GLMaxPoolOp() {}

  bool RunOnDeviceWithOrderNCHW() override;
  bool RunOnDeviceWithOrderNHWC() override;
 private:
  arm_compute::GCPoolingLayer pooling_layer_;
  bool first_run_ = true, second_run_ = true;
  GLContext::deleted_unique_ptr<const GLTensor<T>> X_;
};

template <>
bool GLAveragePoolOp<DataType>::RunOnDeviceWithOrderNCHW() {

  auto *Xblob = OperatorBase::Inputs()[0];
  if (first_run_) {
    X_ = GLContext::getGLTensor<DataType>(Xblob);
  } else {
    X_ = GLContext::getGLTensor<DataType>(Xblob, X_.release());
  }

  int N = X_->dim32(0);
  int channels = X_->dim32(1);
  int height = X_->dim32(2);
  int width = X_->dim32(3);

  vector<int64_t> output_dims = {N, channels, 1, 1};
  if (!global_pooling_) {
    output_dims[2] = (height + pad_t() + pad_b() - kernel_h()) / stride_h() + 1;
    output_dims[3] = (width + pad_l() + pad_r() - kernel_w()) / stride_w() + 1;
  }

  GLTensor<DataType> *Y =
      OperatorBase::Outputs()[0]->template GetMutable<GLTensor<DataType>>();
  if (first_run_) {
    first_run_ = false;
    CAFFE_ENFORCE_EQ(kernel_.size(), 2, "ARM OpenGL only supports 2D pooling");
    CAFFE_ENFORCE_EQ(kernel_h(), kernel_w(),
                     "ARM OpenGL only supports equal kernel size");
    Y->Resize(output_dims);
    if (global_pooling_) {
      arm_compute::PoolingLayerInfo info(arm_compute::PoolingType::AVG);
      pooling_layer_.configure(X_->get_underlying(), Y->get_underlying(), info);
    } else {
      arm_compute::PadStrideInfo ps_info(stride_w(), stride_h(), pad_l(), pad_r(),
                                         pad_t(), pad_b(),
                                         arm_compute::DimensionRoundingType::FLOOR);
      arm_compute::PoolingLayerInfo info(arm_compute::PoolingType::AVG, kernel_h(),
                                         ps_info);
      pooling_layer_.configure(X_->get_underlying(), Y->get_underlying(), info);
    }
  } else if (second_run_) {
    X_->lazy_allocate(Xblob, second_run_, true);
    second_run_ = false;
    Y->Resize(output_dims);
    Y->allocate();
    pooling_layer_.run();
  } else {
    X_->lazy_allocate(Xblob, second_run_, true);
    bool need_allocation = Y->Resize(output_dims);
    if (global_pooling_) {
      arm_compute::PoolingLayerInfo info(arm_compute::PoolingType::AVG);
      pooling_layer_.configure(X_->get_underlying(), Y->get_underlying(), info);
    } else {
      arm_compute::PadStrideInfo ps_info(stride_w(), stride_h(), pad_l(), pad_r(),
                                         pad_t(), pad_b(),
                                         arm_compute::DimensionRoundingType::FLOOR);
      arm_compute::PoolingLayerInfo info(arm_compute::PoolingType::AVG, kernel_h(),
                                         ps_info);
      pooling_layer_.configure(X_->get_underlying(), Y->get_underlying(), info);
    }
    if (need_allocation) {
      Y->allocate();
    }
    pooling_layer_.run();
  }
  return true;
}

template <> bool GLMaxPoolOp<DataType>::RunOnDeviceWithOrderNCHW() {

  auto *Xblob = OperatorBase::Inputs()[0];
  X_ = GLContext::getGLTensor<DataType>(Xblob, X_.release());

  int N = X_->dim32(0);
  int channels = X_->dim32(1);
  int height = X_->dim32(2);
  int width = X_->dim32(3);

  vector<int64_t> output_dims = {N, channels, 1, 1};
  if (!global_pooling_) {
    output_dims[2] = (height + pad_t() + pad_b() - kernel_h()) / stride_h() + 1;
    output_dims[3] = (width + pad_l() + pad_r() - kernel_w()) / stride_w() + 1;
  }
  GLTensor<DataType> *Y =
      OperatorBase::Outputs()[0]->template GetMutable<GLTensor<DataType>>();

  if (first_run_) {
    first_run_ = false;
    CAFFE_ENFORCE_EQ(kernel_.size(), 2, "ARM OpenGL only supports 2D pooling");
    CAFFE_ENFORCE_EQ(kernel_h(), kernel_w(),
                     "ARM OpenGL only supports equal kernel size");
    Y->Resize(output_dims);
    if (global_pooling_) {
      arm_compute::PoolingLayerInfo info(arm_compute::PoolingType::MAX);
      pooling_layer_.configure(X_->get_underlying(), Y->get_underlying(), info);
    } else {
      arm_compute::PadStrideInfo ps_info(stride_w(), stride_h(), pad_l(), pad_r(),
                                         pad_t(), pad_b(),
                                         arm_compute::DimensionRoundingType::FLOOR);
      arm_compute::PoolingLayerInfo info(arm_compute::PoolingType::MAX, kernel_h(),
                                         ps_info);
      pooling_layer_.configure(X_->get_underlying(), Y->get_underlying(), info);
    }
  } else if (second_run_) {
    X_->lazy_allocate(Xblob, second_run_, true);
    second_run_ = false;
    Y->Resize(output_dims);
    Y->allocate();
    pooling_layer_.run();
  } else {
    X_->lazy_allocate(Xblob, second_run_, true);
    bool need_allocation = Y->Resize(output_dims);
    if (global_pooling_) {
      arm_compute::PoolingLayerInfo info(arm_compute::PoolingType::MAX);
      pooling_layer_.configure(X_->get_underlying(), Y->get_underlying(), info);
    } else {
      arm_compute::PadStrideInfo ps_info(stride_w(), stride_h(), pad_l(), pad_r(),
                                         pad_t(), pad_b(),
                                         arm_compute::DimensionRoundingType::FLOOR);
      arm_compute::PoolingLayerInfo info(arm_compute::PoolingType::MAX, kernel_h(),
                                         ps_info);
      pooling_layer_.configure(X_->get_underlying(), Y->get_underlying(), info);
    }
    if (need_allocation) {
      Y->allocate();
    }
    pooling_layer_.run();
  }

  return true;
}

template <>
bool GLAveragePoolOp<DataType>::RunOnDeviceWithOrderNHWC() {
  return false;
}

template <>
bool GLMaxPoolOp<DataType>::RunOnDeviceWithOrderNHWC() {
  return false;
}

REGISTER_GL_OPERATOR(AveragePool, GLAveragePoolOp<DataType>);
REGISTER_GL_OPERATOR(MaxPool, GLMaxPoolOp<DataType>);

} // namespace caffe2
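The output spatial size above follows the standard floor-division pooling formula, matching the DimensionRoundingType::FLOOR the op passes to ACL. A small standalone check of that computation, using the shapes from the pool tests further below (kernel 2, stride 2, pad 0):

```cpp
#include <cstdio>

// (dim + pad_begin + pad_end - kernel) / stride + 1, with integer (floor)
// division -- the same expression the pooling ops use for output_dims.
int pooled_size(int dim, int pad_begin, int pad_end, int kernel, int stride) {
  return (dim + pad_begin + pad_end - kernel) / stride + 1;
}

int main() {
  std::printf("%d\n", pooled_size(8, 0, 0, 2, 2));   // 4: the 8x8 test input
  std::printf("%d\n", pooled_size(32, 0, 0, 2, 2));  // 16: the conv+maxpool chain
}
```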
@@ -1,29 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"
#include "caffe2/operators/reshape_op.h"

namespace caffe2 {

template <typename T> class GLReshapeOp final : public Operator<GLContext> {
 public:
  GLReshapeOp(const OperatorDef &operator_def, Workspace *ws)
      : Operator<GLContext>(operator_def, ws) {}
  virtual ~GLReshapeOp() noexcept {}
  USE_OPERATOR_FUNCTIONS(GLContext);
  bool RunOnDevice() override;
};

template <typename T>
bool GLReshapeOp<T>::RunOnDevice() {
  // Stub: this op only logs the requested shape argument; it does not
  // produce a reshaped output tensor.
  auto *Xblob = OperatorBase::Inputs()[0];
  auto X = GLContext::getGLTensor<T>(Xblob);
  auto arg = OperatorBase::GetRepeatedArgument<int>("shape");
  for (int i = 0; i < arg.size(); ++i) {
    LOG(INFO) << "[C2DEBUG] shape: " << arg[i];
  }
  return true;
}

REGISTER_GL_OPERATOR(Reshape, GLReshapeOp<DataType>);

} // namespace caffe2
@@ -1,74 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"
#include "caffe2/operators/resize_op.h"

namespace caffe2 {

template<typename T>
class GLResizeNearestOp final : public Operator<GLContext> {
 public:
  GLResizeNearestOp(const OperatorDef &operator_def, Workspace *ws)
      : Operator<GLContext>(operator_def, ws), width_scale_(1), height_scale_(1) {
    if (HasArgument("width_scale")) {
      width_scale_ = static_cast<float>(
          OperatorBase::GetSingleArgument<float>("width_scale", 1));
    }
    if (HasArgument("height_scale")) {
      height_scale_ = static_cast<float>(
          OperatorBase::GetSingleArgument<float>("height_scale", 1));
    }
    CAFFE_ENFORCE_GT(width_scale_, 0);
    CAFFE_ENFORCE_GT(height_scale_, 0);
  }
  virtual ~GLResizeNearestOp() noexcept {}
  USE_OPERATOR_FUNCTIONS(GLContext);
  bool RunOnDevice() override;
 private:
  float width_scale_;
  float height_scale_;
  arm_compute::GCScale resize_layer_;
  bool first_run_ = true, second_run_ = true;
  GLContext::deleted_unique_ptr<const GLTensor<T>> X_;
};

template <typename T>
bool GLResizeNearestOp<T>::RunOnDevice() {

  auto* Xblob = OperatorBase::Inputs()[0];

  X_ = GLContext::getGLTensor<T>(Xblob, X_.release());

  auto N = X_->dim32(0);
  auto C = X_->dim32(1);
  auto H = X_->dim32(2);
  auto W = X_->dim32(3);

  GLTensor<T> *Y =
      OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();
  vector<int64_t> output_dims = {N, C, H * height_scale_, W * width_scale_};

  if (first_run_) {
    Y->Resize(output_dims);
    first_run_ = false;
    resize_layer_.configure(X_->get_underlying(), Y->get_underlying(), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, arm_compute::BorderMode::UNDEFINED);
  } else if (second_run_) {
    X_->lazy_allocate(Xblob, second_run_, true);
    second_run_ = false;
    Y->Resize(output_dims);
    Y->allocate();
    resize_layer_.run();
  } else {
    X_->lazy_allocate(Xblob, second_run_, true);
    bool need_allocation = Y->Resize(output_dims);
    resize_layer_.configure(X_->get_underlying(), Y->get_underlying(), arm_compute::InterpolationPolicy::NEAREST_NEIGHBOR, arm_compute::BorderMode::UNDEFINED);
    if (need_allocation) {
      Y->allocate();
    }
    // Execute the re-configured scale layer (mirrors the second-run branch).
    resize_layer_.run();
  }

  return true;
}

REGISTER_GL_OPERATOR(ResizeNearest, GLResizeNearestOp<DataType>);

} // namespace caffe2
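The output dims above are {N, C, H * height_scale, W * width_scale}, with the float products truncated on conversion to int64_t, so they are exact only for integral scales. A quick check with the hypothetical shapes used in the resize test further below:

```cpp
#include <cstdio>

int main() {
  // Same computation as the op's output_dims: scale 2 on a 37x89 input
  // (hypothetical values taken from the ResizeNearest test).
  float height_scale = 2, width_scale = 2;
  int N = 1, C = 7, H = 37, W = 89;
  // Conversion from float truncates toward zero; exact for integral scales.
  long long out_h = H * height_scale;
  long long out_w = W * width_scale;
  std::printf("{%d, %d, %lld, %lld}\n", N, C, out_h, out_w);  // {1, 7, 74, 178}
}
```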
@@ -1,54 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"

#include "caffe2/operators/softmax_op.h"

namespace caffe2 {

template <typename T> class GLSoftmaxOp final : public Operator<GLContext> {
 public:
  GLSoftmaxOp(const OperatorDef &operator_def, Workspace *ws)
      : Operator<GLContext>(operator_def, ws) {}
  virtual ~GLSoftmaxOp() noexcept {}
  USE_OPERATOR_FUNCTIONS(GLContext);
  bool RunOnDevice() override;
 private:
  arm_compute::GCSoftmaxLayer softmax_layer_;
  bool first_run_ = true, second_run_ = true;
  GLContext::deleted_unique_ptr<const GLTensor<T>> X_;
};

template <typename T>
bool GLSoftmaxOp<T>::RunOnDevice() {

  auto *Xblob = OperatorBase::Inputs()[0];
  X_ = GLContext::getGLTensor<T>(Xblob, X_.release());

  GLTensor<T> *Y =
      OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();
  if (first_run_) {
    first_run_ = false;
    Y->ResizeLike(*X_);
    softmax_layer_.configure(X_->get_underlying(), Y->get_underlying());
  } else if (second_run_) {
    X_->lazy_allocate(Xblob, second_run_, true);
    second_run_ = false;
    Y->ResizeLike(*X_);
    Y->allocate();
    softmax_layer_.run();
  } else {
    X_->lazy_allocate(Xblob, second_run_, true);
    bool need_allocation = Y->ResizeLike(*X_);
    softmax_layer_.configure(X_->get_underlying(), Y->get_underlying());
    if (need_allocation) {
      Y->allocate();
    }
    softmax_layer_.run();
  }

  return true;
}

REGISTER_GL_OPERATOR(Softmax, GLSoftmaxOp<DataType>);

} // namespace caffe2
@@ -1,93 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/core/operator.h"

#include "caffe2/operators/spatial_batch_norm_op.h"

namespace caffe2 {

template <typename T> class GLSpatialBNOp final : public Operator<GLContext> {
 public:
  GLSpatialBNOp(const OperatorDef &operator_def, Workspace *ws)
      : Operator<GLContext>(operator_def, ws),
        is_test_(OperatorBase::GetSingleArgument<int>(OpSchema::Arg_IsTest, 0)),
        epsilon_(OperatorBase::GetSingleArgument<float>("epsilon", 1e-5)),
        momentum_(OperatorBase::GetSingleArgument<float>("momentum", 0.9)),
        order_(StringToStorageOrder(
            OperatorBase::GetSingleArgument<string>("order", "NCHW"))) { }
  virtual ~GLSpatialBNOp() noexcept {}
  USE_OPERATOR_FUNCTIONS(GLContext);
  bool RunOnDevice() override;
 protected:
  bool is_test_;
  double epsilon_;
  double momentum_;
  StorageOrder order_;
  INPUT_TAGS(INPUT, SCALE, BIAS, EST_MEAN, EST_VAR);
  OUTPUT_TAGS(OUTPUT, RUNNING_MEAN, RUNNING_VAR, SAVED_MEAN, SAVED_INV_VAR);
 private:
  arm_compute::GCBatchNormalizationLayer bn_layer_;
  bool first_run_ = true, second_run_ = true;
  GLContext::deleted_unique_ptr<const GLTensor<T>> X_, mean_, var_, bias_, scale_;
};

template <typename T>
bool GLSpatialBNOp<T>::RunOnDevice() {
  auto *XBlob = OperatorBase::Inputs()[0];
  auto *scaleBlob = OperatorBase::Inputs()[SCALE];
  auto *biasBlob = OperatorBase::Inputs()[BIAS];
  auto *meanBlob = OperatorBase::Inputs()[EST_MEAN];
  auto *varBlob = OperatorBase::Inputs()[EST_VAR];

  X_ = GLContext::getGLTensor<T>(XBlob, X_.release());
  if (first_run_) {
    scale_ = GLContext::getGLTensor<T>(scaleBlob);
    bias_ = GLContext::getGLTensor<T>(biasBlob);
    mean_ = GLContext::getGLTensor<T>(meanBlob);
    var_ = GLContext::getGLTensor<T>(varBlob);
  }

  auto C = X_->dim32(1);
  CAFFE_ENFORCE_EQ(scale_->ndim(), 1);
  CAFFE_ENFORCE_EQ(bias_->ndim(), 1);
  CAFFE_ENFORCE_EQ(mean_->ndim(), 1);
  CAFFE_ENFORCE_EQ(var_->ndim(), 1);

  CAFFE_ENFORCE_EQ(scale_->dim32(0), C);
  CAFFE_ENFORCE_EQ(bias_->dim32(0), C);
  CAFFE_ENFORCE_EQ(mean_->dim32(0), C);
  CAFFE_ENFORCE_EQ(var_->dim32(0), C);

  GLTensor<T> *Y =
      OperatorBase::Outputs()[0]->template GetMutable<GLTensor<T>>();
  if (first_run_) {
    first_run_ = false;
    Y->ResizeLike(*X_);
    bn_layer_.configure(X_->get_underlying(), Y->get_underlying(),
                        mean_->get_underlying(), var_->get_underlying(),
                        bias_->get_underlying(), scale_->get_underlying(), epsilon_);
  } else if (second_run_) {
    X_->lazy_allocate(XBlob, second_run_, true);
    scale_->lazy_allocate(scaleBlob, second_run_, second_run_);
    bias_->lazy_allocate(biasBlob, second_run_, second_run_);
    mean_->lazy_allocate(meanBlob, second_run_, second_run_);
    var_->lazy_allocate(varBlob, second_run_, second_run_);
    second_run_ = false;
    Y->ResizeLike(*X_);
    Y->allocate();
    bn_layer_.run();
  } else {
    X_->lazy_allocate(XBlob, second_run_, true);
    bool need_allocation = Y->ResizeLike(*X_);
    bn_layer_.configure(X_->get_underlying(), Y->get_underlying(),
                        mean_->get_underlying(), var_->get_underlying(),
                        bias_->get_underlying(), scale_->get_underlying(), epsilon_);
    if (need_allocation) {
      Y->allocate();
    }
    // Execute the re-configured layer (mirrors the second-run branch).
    bn_layer_.run();
  }
  return true;
}

REGISTER_GL_OPERATOR(SpatialBN, GLSpatialBNOp<DataType>);

} // namespace caffe2
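In inference mode (is_test), spatial batch normalization applies a per-channel affine transform from the estimated statistics; that is the standard formula y = scale * (x - mean) / sqrt(var + epsilon) + bias, which is what the configured layer computes per channel. A scalar sketch of that transform:

```cpp
#include <cmath>
#include <cstdio>

// Per-element inference-mode batch norm for one channel's statistics.
float spatial_bn(float x, float mean, float var, float scale, float bias,
                 float epsilon = 1e-5f) {
  return scale * (x - mean) / std::sqrt(var + epsilon) + bias;
}

int main() {
  // Hypothetical channel statistics, just to exercise the formula.
  std::printf("%f\n", spatial_bn(2.0f, 1.0f, 4.0f, 0.5f, 0.1f));  // ~0.35
}
```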
@@ -1,22 +0,0 @@
set -vex

if [ -z "$CAFFE2_BINARY_DIR" ] ; then
  if [ -z "$1" ] ; then
    CAFFE2_BINARY_DIR=.
  else
    CAFFE2_BINARY_DIR=$1
  fi
fi

files=($(find "$CAFFE2_BINARY_DIR" -type f -name "*_test"))
for test_binary in "${files[@]}";
do
  test_binary_base=$(basename $test_binary)
  if [[ $test_binary_base == gl* ]]; then
    echo Running $test_binary_base
    adb push $test_binary "/data/local/tmp/$test_binary_base"
    adb shell "GLOG_logtostderr=1 /data/local/tmp/$test_binary_base"
  fi
done

echo All tests passed.
@@ -1,2 +0,0 @@
file(GLOB tmp *_test.cc)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${tmp} PARENT_SCOPE)
@@ -1,70 +0,0 @@
#include "gl_operator_test.h"

namespace caffe2 {

TEST(OPENGLOperatorTest, Sigmoid) {
  Workspace ws;

  PopulateCPUBlob(&ws, true, "cpu_X", {1, 4, 4, 4});

  NetDef cpu_net;
  {
    AddOp(&cpu_net, "Sigmoid", {"cpu_X"}, {"ref_Y"});
  }

  NetDef gpu_net;
  gpu_net.set_type("opengl");
  {
    OperatorDef* def = AddOp(&gpu_net, "Sigmoid", {"cpu_X"}, {"gpu_Y"});
    MAKE_OPENGL_OPERATOR(def);
  }
  compareNetResult(ws, cpu_net, gpu_net);
}

TEST(OPENGLOperatorTest, ReLU) {
  Workspace ws;

  PopulateCPUBlob(&ws, true, "cpu_X", {1, 4, 4, 4});

  NetDef cpu_net;
  {
    AddOp(&cpu_net, "Relu", {"cpu_X"}, {"ref_Y"});
  }

  NetDef gpu_net;
  gpu_net.set_type("opengl");
  {
    OperatorDef* def = AddOp(&gpu_net, "Relu", {"cpu_X"}, {"gpu_Y"});
    MAKE_OPENGL_OPERATOR(def);
  }

  compareNetResult(ws, cpu_net, gpu_net);
}

TEST(OPENGLOperatorTest, SigmoidTwice) {
  Workspace ws;

  PopulateCPUBlob(&ws, true, "cpu_X", {1, 4, 4, 4});

  NetDef cpu_net;
  {
    AddOp(&cpu_net, "Sigmoid", {"cpu_X"}, {"ref_Y1"});
    AddOp(&cpu_net, "Sigmoid", {"ref_Y1"}, {"ref_Y2"});
  }

  NetDef gpu_net;
  gpu_net.set_type("opengl");
  {
    OperatorDef* def = AddOp(&gpu_net, "Sigmoid", {"cpu_X"}, {"gpu_Y1"});
    MAKE_OPENGL_OPERATOR(def);
  }
  {
    OperatorDef* def = AddOp(&gpu_net, "Sigmoid", {"gpu_Y1"}, {"gpu_Y2"});
    MAKE_OPENGL_OPERATOR(def);
  }

  compareNetResult(ws, cpu_net, gpu_net, "ref_Y2", "gpu_Y2");
}

} // namespace caffe2
@@ -1,197 +0,0 @@
#include "gl_operator_test.h"
#include "caffe2/core/timer.h"

namespace caffe2 {

constexpr float tol = 5.0e-2;

// {MaxPool, Relu, Add} followed by a pad-1 conv
TEST(OPENGLOperatorTest, ConvMaxPoolConv) {

  Workspace ws;
  auto channel_in = 16;
  auto channel_out = 16;
  auto spatial = 32;
  auto kern = 3;

  PopulateCPUBlob(&ws, true, "cpu_X", {1, channel_in, spatial, spatial}, 1337);
  PopulateCPUBlob(&ws, true, "W", {channel_out, channel_in, kern, kern}, 1337);
  PopulateCPUBlob(&ws, false, "b", {channel_out}, 0);
  PopulateCPUBlob(&ws, true, "W2", {channel_out, channel_in, kern, kern});
  PopulateCPUBlob(&ws, true, "b2", {channel_out});

#define ADD_CONV_ARGS                    \
  {                                      \
    ADD_ARG((*def), "kernel", i, kern);  \
    ADD_ARG((*def), "stride", i, 1);     \
    ADD_ARG((*def), "pad", i, 1);        \
    ADD_ARG((*def), "order", s, "NCHW"); \
  }

  NetDef cpu_net;
  {
    OperatorDef* def = AddOp(&cpu_net, "Conv", {"cpu_X", "W", "b"}, {"ref_Y"});
    def->set_name("cpu_conv");
    ADD_CONV_ARGS;
  }
  {
    OperatorDef* def = AddOp(&cpu_net, "MaxPool", {"ref_Y"}, {"ref_maxpool"});
    ADD_ARG((*def), "kernel", i, 2);
    ADD_ARG((*def), "pad", i, 0);
    ADD_ARG((*def), "stride_w", i, 2);
    ADD_ARG((*def), "stride_h", i, 2);
    ADD_ARG((*def), "order", s, "NCHW");
  }
  {
    OperatorDef* def = AddOp(&cpu_net, "Conv", {"ref_maxpool", "W2", "b2"}, {"ref_Y2"});
    ADD_CONV_ARGS;
  }

  NetDef gpu_net;
  gpu_net.set_type("opengl");
  {
    OperatorDef* def = AddOp(&gpu_net, "Conv", {"cpu_X", "W", "b"}, {"gpu_Y"});
    MAKE_OPENGL_OPERATOR(def);
    ADD_CONV_ARGS;
  }
  {
    OperatorDef* def = AddOp(&gpu_net, "MaxPool", {"gpu_Y"}, {"gpu_maxpool"});
    ADD_ARG((*def), "kernel", i, 2);
    ADD_ARG((*def), "pad", i, 0);
    ADD_ARG((*def), "stride_w", i, 2);
    ADD_ARG((*def), "stride_h", i, 2);
    ADD_ARG((*def), "order", s, "NCHW");
    MAKE_OPENGL_OPERATOR(def);
  }
  {
    OperatorDef* def = AddOp(&gpu_net, "Conv", {"gpu_maxpool", "W2", "b2"}, {"gpu_Y2"});
    MAKE_OPENGL_OPERATOR(def);
    ADD_CONV_ARGS;
  }

#undef ADD_CONV_ARGS

  // will work after the next release of ACL
  // compareNetResult4D(ws, cpu_net, gpu_net, "ref_Y2", "gpu_Y2", tol);
}

TEST(OPENGLOperatorTest, ConvReluConv) {

  Workspace ws;
  auto channel_in = 16;
  auto channel_out = 16;
  auto spatial = 32;
  auto kern = 3;

  PopulateCPUBlob(&ws, true, "cpu_X", {1, channel_in, spatial, spatial}, 1337);
  PopulateCPUBlob(&ws, true, "W", {channel_out, channel_in, kern, kern}, 1337);
  PopulateCPUBlob(&ws, false, "b", {channel_out}, 0);
  PopulateCPUBlob(&ws, true, "W2", {channel_out, channel_in, kern, kern});
  PopulateCPUBlob(&ws, true, "b2", {channel_out});

#define ADD_CONV_ARGS                    \
  {                                      \
    ADD_ARG((*def), "kernel", i, kern);  \
    ADD_ARG((*def), "stride", i, 1);     \
    ADD_ARG((*def), "pad", i, 1);        \
    ADD_ARG((*def), "order", s, "NCHW"); \
  }

  NetDef cpu_net;
  {
    OperatorDef* def = AddOp(&cpu_net, "Conv", {"cpu_X", "W", "b"}, {"ref_Y"});
    def->set_name("cpu_conv");
    ADD_CONV_ARGS;
  }
  {
    OperatorDef* def = AddOp(&cpu_net, "Relu", {"ref_Y"}, {"ref_relu"});
  }
  {
    OperatorDef* def = AddOp(&cpu_net, "Conv", {"ref_relu", "W2", "b2"}, {"ref_Y2"});
    ADD_CONV_ARGS;
  }

  NetDef gpu_net;
  gpu_net.set_type("opengl");
  {
    OperatorDef* def = AddOp(&gpu_net, "Conv", {"cpu_X", "W", "b"}, {"gpu_Y"});
    MAKE_OPENGL_OPERATOR(def);
    ADD_CONV_ARGS;
  }
  {
    OperatorDef* def = AddOp(&gpu_net, "Relu", {"gpu_Y"}, {"gpu_relu"});
    MAKE_OPENGL_OPERATOR(def);
  }
  {
    OperatorDef* def = AddOp(&gpu_net, "Conv", {"gpu_relu", "W2", "b2"}, {"gpu_Y2"});
    MAKE_OPENGL_OPERATOR(def);
    ADD_CONV_ARGS;
  }

#undef ADD_CONV_ARGS

  // will work after the next release of ACL
  // compareNetResult4D(ws, cpu_net, gpu_net, "ref_Y2", "gpu_Y2", tol);
}

TEST(OPENGLOperatorTest, ConvAddConv) {

  Workspace ws;
  auto channel_in = 16;
  auto channel_out = 16;
  auto spatial = 32;
  auto kern = 3;

  PopulateCPUBlob(&ws, true, "cpu_X", {1, channel_in, spatial, spatial}, 1337);
  PopulateCPUBlob(&ws, true, "W", {channel_out, channel_in, kern, kern}, 1337);
  PopulateCPUBlob(&ws, false, "b", {channel_out}, 0);
  PopulateCPUBlob(&ws, true, "W2", {channel_out, channel_in, kern, kern});
  PopulateCPUBlob(&ws, true, "b2", {channel_out});
  PopulateCPUBlob(&ws, true, "cpu_Y", {1, channel_in, spatial, spatial}, 1337);

#define ADD_CONV_ARGS                    \
  {                                      \
    ADD_ARG((*def), "kernel", i, kern);  \
    ADD_ARG((*def), "stride", i, 1);     \
    ADD_ARG((*def), "pad", i, 1);        \
    ADD_ARG((*def), "order", s, "NCHW"); \
  }

  NetDef cpu_net;
  {
    OperatorDef* def = AddOp(&cpu_net, "Conv", {"cpu_X", "W", "b"}, {"ref_Y"});
    def->set_name("cpu_conv");
    ADD_CONV_ARGS;
  }
  {
    OperatorDef* def = AddOp(&cpu_net, "Add", {"ref_Y", "cpu_Y"}, {"ref_add"});
  }
  {
    OperatorDef* def = AddOp(&cpu_net, "Conv", {"ref_add", "W2", "b2"}, {"ref_Y2"});
    ADD_CONV_ARGS;
  }

  NetDef gpu_net;
  gpu_net.set_type("opengl");
  {
    OperatorDef* def = AddOp(&gpu_net, "Conv", {"cpu_X", "W", "b"}, {"gpu_Y"});
    MAKE_OPENGL_OPERATOR(def);
    ADD_CONV_ARGS;
  }
  {
    OperatorDef* def = AddOp(&gpu_net, "Add", {"gpu_Y", "cpu_Y"}, {"gpu_add"});
    MAKE_OPENGL_OPERATOR(def);
  }
  {
    OperatorDef* def = AddOp(&gpu_net, "Conv", {"gpu_add", "W2", "b2"}, {"gpu_Y2"});
    MAKE_OPENGL_OPERATOR(def);
    ADD_CONV_ARGS;
  }
#undef ADD_CONV_ARGS

  // will work after the next release of ACL
  // compareNetResult4D(ws, cpu_net, gpu_net, "ref_Y2", "gpu_Y2", tol);
}
} // namespace caffe2
@@ -1,49 +0,0 @@
#include "gl_operator_test.h"

namespace caffe2 {

TEST(OPENGLOperatorTest, Concat) {

  for (auto Cs: std::vector<std::vector<int>>{
    {4, 4},
    {4, 4, 4},
    {6, 6, 6},
    {16, 8, 4},
    {32, 8, 16, 4},
  }) {
    Workspace ws;
    int batchSize = 1;
    int H = 8;
    int W = 8;
    for (int i = 0; i < Cs.size(); ++i) {
      PopulateCPUBlob(
          &ws,
          true,
          std::string("cpu_X") + c10::to_string(i),
          {batchSize, Cs[i], H, W});
    }

    NetDef cpu_net;
    {
      OperatorDef* def = AddOp(&cpu_net, "Concat", {}, {"ref_Y", "cpu_dummy"});
      for (int i = 0; i < Cs.size(); ++i) {
        def->add_input(std::string("cpu_X") + c10::to_string(i));
      }
    }

    NetDef gpu_net;
    gpu_net.set_type("opengl");
    {
      OperatorDef* def = AddOp(&gpu_net, "Concat", {}, {"gpu_Y", "gpu_dummy"});
      MAKE_OPENGL_OPERATOR(def);
      for (int i = 0; i < Cs.size(); ++i) {
        def->add_input(std::string("cpu_X") + c10::to_string(i));
      }
    }

    compareNetResult(ws, cpu_net, gpu_net);
  }
}

} // namespace caffe2
@@ -1,11 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include <gtest/gtest.h>

namespace caffe2 {

TEST(OPENGLContextTest, Initialization) {
  auto gc = new GLContext();
  delete gc;
}

} // namespace caffe2
@@ -1,162 +0,0 @@
#include "gl_operator_test.h"
#include "caffe2/core/timer.h"

namespace caffe2 {

constexpr float tol = 3.0e-2;

TEST(OPENGLOperatorTest, Conv) {

  Workspace ws;
  auto channel_in = 16;
  auto channel_out = 16;
  auto spatial = 16;
  auto kern = 3;

  PopulateCPUBlob(&ws, true, "cpu_X", {1, channel_in, spatial, spatial}, 1337);
  PopulateCPUBlob(&ws, true, "W", {channel_out, channel_in, kern, kern}, 1337);
  PopulateCPUBlob(&ws, false, "b", {channel_out}, 0);

#define ADD_CONV_ARGS                    \
  {                                      \
    ADD_ARG((*def), "kernel", i, kern);  \
    ADD_ARG((*def), "stride", i, 1);     \
    ADD_ARG((*def), "pad", i, 0);        \
    ADD_ARG((*def), "order", s, "NCHW"); \
  }

  NetDef cpu_net;
  {
    OperatorDef* def = AddOp(&cpu_net, "Conv", {"cpu_X", "W", "b"}, {"ref_Y"});
    def->set_name("cpu_conv");
    ADD_CONV_ARGS;
  }
  ws.RunNetOnce(cpu_net);

  NetDef gpu_net;
  gpu_net.set_type("opengl");
  {
    OperatorDef* def = AddOp(&gpu_net, "Conv", {"cpu_X", "W", "b"}, {"gpu_Y"});
    MAKE_OPENGL_OPERATOR(def);
    ADD_CONV_ARGS;
  }

#undef ADD_CONV_ARGS

  compareNetResult4D(ws, cpu_net, gpu_net, "ref_Y", "gpu_Y", tol);
}

TEST(OPENGLOperatorTest, ConvReluConv) {

  Workspace ws;
  auto channel_in = 16;
  auto channel_out = 16;
  auto spatial = 32;
  auto kern = 3;

  PopulateCPUBlob(&ws, true, "cpu_X", {1, channel_in, spatial, spatial}, 1337);
  PopulateCPUBlob(&ws, true, "W", {channel_out, channel_in, kern, kern}, 1337);
  PopulateCPUBlob(&ws, false, "b", {channel_out}, 0);
  PopulateCPUBlob(&ws, true, "W2", {channel_out, channel_in, kern, kern});
  PopulateCPUBlob(&ws, true, "b2", {channel_out});

#define ADD_CONV_ARGS                    \
  {                                      \
    ADD_ARG((*def), "kernel", i, kern);  \
    ADD_ARG((*def), "stride", i, 1);     \
    ADD_ARG((*def), "pad", i, 0);        \
    ADD_ARG((*def), "order", s, "NCHW"); \
  }

  NetDef cpu_net;
  {
    OperatorDef* def = AddOp(&cpu_net, "Conv", {"cpu_X", "W", "b"}, {"ref_Y"});
    def->set_name("cpu_conv");
    ADD_CONV_ARGS;
  }
  {
    OperatorDef* def = AddOp(&cpu_net, "Relu", {"ref_Y"}, {"ref_relu"});
  }
  {
    OperatorDef* def = AddOp(&cpu_net, "Conv", {"ref_relu", "W2", "b2"}, {"ref_Y2"});
    ADD_CONV_ARGS;
  }

  ws.RunNetOnce(cpu_net);

  NetDef gpu_net;
  gpu_net.set_type("opengl");
  {
    OperatorDef* def = AddOp(&gpu_net, "Conv", {"cpu_X", "W", "b"}, {"gpu_Y"});
    MAKE_OPENGL_OPERATOR(def);
    ADD_CONV_ARGS;
  }
  {
    OperatorDef* def = AddOp(&gpu_net, "Relu", {"gpu_Y"}, {"gpu_relu"});
    MAKE_OPENGL_OPERATOR(def);
  }
  {
    OperatorDef* def = AddOp(&gpu_net, "Conv", {"gpu_relu", "W2", "b2"}, {"gpu_Y2"});
    MAKE_OPENGL_OPERATOR(def);
    ADD_CONV_ARGS;
  }

#undef ADD_CONV_ARGS

  compareNetResult4D(ws, cpu_net, gpu_net, "ref_Y2", "gpu_Y2", tol);
}

TEST(OPENGLOperatorTest, ConvBenchmark) {

  Workspace ws;
  auto channel_in = 4;
  auto channel_out = 4;
  auto spatial = 10;
  auto kern = 3;
  long long iters = 2;

  PopulateCPUBlob(&ws, false, "cpu_X", {1, channel_in, spatial, spatial}, 1, 0, 0.1);

#define ADD_CONV_ARGS(_def)               \
  {                                       \
    ADD_ARG((*_def), "kernel", i, kern);  \
    ADD_ARG((*_def), "stride", i, 1);     \
    ADD_ARG((*_def), "pad", i, 0);        \
    ADD_ARG((*_def), "order", s, "NCHW"); \
  }

  NetDef gpu_net;
  NetDef cpu_net;
  gpu_net.set_type("opengl");

  std::string prev_out = "cpu_X";
  for (auto i = 0; i < iters; ++i) {
    std::string weightName = "W" + to_string(i);
    std::string biasName = "b" + to_string(i);
    std::string output = "conv" + to_string(i);
    PopulateCPUBlob(&ws, false, weightName, {channel_out, channel_in, kern, kern}, 1);
    PopulateCPUBlob(&ws, false, biasName, {channel_out}, 0);
    OperatorDef* def = AddOp(&gpu_net, "Conv", {prev_out, weightName, biasName}, {output});
    if (i == 0) {
      OperatorDef* def2 = AddOp(&cpu_net, "Conv", {prev_out, weightName, biasName}, {"cpu" + output});
      ADD_CONV_ARGS(def2);
    } else {
      OperatorDef* def2 = AddOp(&cpu_net, "Conv", {"cpu" + prev_out, weightName, biasName}, {"cpu" + output});
      ADD_CONV_ARGS(def2);
    }
    prev_out = output;
    MAKE_OPENGL_OPERATOR(def);
    ADD_CONV_ARGS(def);
  }

#undef ADD_CONV_ARGS

  compareNetResult4D(ws, cpu_net, gpu_net, "cpu" + prev_out, prev_out, tol);
}

} // namespace caffe2
@@ -1,42 +0,0 @@
#include "gl_operator_test.h"

namespace caffe2 {

TEST(OPENGLOperatorTest, CopyFromGL) {

  for (auto dims: std::vector<std::vector<int>>{
    {1},
    {3},
    {1, 2},
    {2, 3},
    {1, 2, 3},
    {1, 2, 3, 4},
    {4, 3, 2, 1},
    {4, 9, 8, 13},
  }) {
    Workspace ws;
    PopulateCPUBlob(&ws, true, std::string("cpu_X"), dims, 1, 0.2, 0.1);

    NetDef gpu_net;
    gpu_net.set_type("opengl");
    {
      OperatorDef* def = AddOp(&gpu_net, "CopyFromGL", {"cpu_X"}, {"cpu_X2"});
      MAKE_OPENGL_OPERATOR(def);
    }
    ws.RunNetOnce(gpu_net);
    Blob *cpu_out = ws.GetBlob("cpu_X");
    Blob *gpu_out = ws.GetBlob("cpu_X2");
    EXPECT_NE(nullptr, cpu_out);
    EXPECT_NE(nullptr, gpu_out);

    auto &t1 = cpu_out->Get<TensorCPU>();
    auto &t2 = gpu_out->Get<TensorCPU>();
    double tol = 0.01;
    for (auto i = 0; i < t1.size(); ++i) {
      EXPECT_NEAR(t1.data<float>()[i], t2.data<float>()[i], tol)
          << "at index " << i;
    }
  }
}

} // namespace caffe2
@@ -1,27 +0,0 @@
#include "gl_operator_test.h"

namespace caffe2 {

TEST(OPENGLOperatorTest, Sum) {
  Workspace ws;
  int N = 28;
  int D = 128;
  PopulateCPUBlob(&ws, true, "cpu_X", {N, D}, 1);
  PopulateCPUBlob(&ws, true, "cpu_Y", {N, D}, 1);

  NetDef cpu_net;
  {
    AddOp(&cpu_net, "Sum", {"cpu_X", "cpu_Y"}, {"ref_Y"});
  }

  NetDef gpu_net;
  gpu_net.set_type("opengl");
  {
    OperatorDef* def = AddOp(&gpu_net, "Sum", {"cpu_X", "cpu_Y"}, {"gpu_Y"});
    MAKE_OPENGL_OPERATOR(def);
  }

  compareNetResult(ws, cpu_net, gpu_net);
}

} // namespace caffe2
@@ -1,36 +0,0 @@
#include "gl_operator_test.h"

namespace caffe2 {

TEST(OPENGLOperatorTest, FC) {

  Workspace ws;
  int batchSize = 1;
  int CIn = 4;
  int H = 8;
  int W = 8;
  int COut = 16;

  PopulateCPUBlob(&ws, true, "cpu_X", {batchSize, CIn, H, W});
  PopulateCPUBlob(&ws, true, "cpu_W", {COut, CIn * H * W});
  PopulateCPUBlob(&ws, true, "cpu_B", {COut});

  constexpr float tol = 0.2;

  NetDef cpu_net;
  {
    AddOp(&cpu_net, "FC", {"cpu_X", "cpu_W", "cpu_B"}, {"ref_Y"});
  }

  NetDef gpu_net;
  gpu_net.set_type("opengl");
  {
    OperatorDef* def = AddOp(&gpu_net, "FC", {"cpu_X", "cpu_W", "cpu_B"}, {"gpu_Y"});
    MAKE_OPENGL_OPERATOR(def);
  }

  // will work after the next release of ACL
  // compareNetResult(ws, cpu_net, gpu_net, "ref_Y", "gpu_Y", tol, true);
}

} // namespace caffe2
@@ -1,11 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/test/gl_model_test.h"

namespace caffe2 {

// The last softmax op didn't pass because of a dimension mismatch. We are
// not likely to hit it in other models, but the implementation should be
// correct.
// TEST(OPENGLModelTest, SqueezenetV11) {
//   std::string parent_path = "/data/local/tmp/";
//   benchmarkModel(parent_path + "squeezenet_init.pb", parent_path + "squeezenet_predict.pb", "data", {1, 3, 224, 224}, "squeezenet_v11");
// }

} // namespace caffe2
@@ -1,62 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include "caffe2/mobile/contrib/arm-compute/test/gl_operator_test.h"
#include "caffe2/mobile/contrib/arm-compute/core/rewrite_net.h"
#include <gtest/gtest.h>

#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"
#include <unordered_set>

C10_DEFINE_int(warmup, 3, "The number of iterations to warm up.");
C10_DEFINE_int(iter, 100, "The number of iterations to run.");
C10_DEFINE_bool(
    run_individual,
    true,
    "Whether to benchmark individual operators.");

constexpr float tol = 0.03;
namespace caffe2 {
void benchmarkModel(std::string init_net_pb, std::string predict_net_pb, std::string input_name, std::vector<int> input_dims, std::string net_name="benchmark_net", std::unordered_set<std::string> cpu_ops = std::unordered_set<std::string>({})) {
  unique_ptr<caffe2::Workspace> ws(new caffe2::Workspace());
  NetDef init_net_def;
  CAFFE_ENFORCE(ReadProtoFromFile(init_net_pb, &init_net_def));
  CAFFE_ENFORCE(ws->RunNetOnce(init_net_def));
  NetDef predict_net_def, predict_net_def_gpu;
  CAFFE_ENFORCE(ReadProtoFromFile(predict_net_pb, &predict_net_def));
  PopulateCPUBlob(ws.get(), true, input_name, input_dims);
  LOG(ERROR) << "[C2DEBUG] rewriting OpenGL net";
  tryConvertToOpenGL(predict_net_def, &predict_net_def_gpu, false, cpu_ops);
  // change the name of the last op's output
  auto index = predict_net_def_gpu.op().size() - 1;
  LOG(ERROR) << "[C2DEBUG] index:" << index;
  auto last_blob = predict_net_def_gpu.op()[index].output()[0];
  auto op = predict_net_def_gpu.mutable_op(index);
  auto output = op->mutable_output(0);
  *output = last_blob + "_gpu";
  LOG(ERROR) << "[C2DEBUG] last blob: " << last_blob;
  for (auto i = 0; i < predict_net_def_gpu.external_output_size(); ++i) {
    auto out = predict_net_def_gpu.mutable_external_output(i);
    if (*out == last_blob) {
      *out = last_blob + "_gpu";
    }
  }

  compareNetResult4D(*ws, predict_net_def, predict_net_def_gpu, last_blob, last_blob + "_gpu");
  LOG(ERROR) << "[C2DEBUG] after compareNetResult4D";
  NetBase* net = ws->CreateNet(predict_net_def_gpu);
  LOG(ERROR) << "[C2DEBUG] Benchmarking OpenGL Net";
  net->TEST_Benchmark(FLAGS_warmup, FLAGS_iter, FLAGS_run_individual);
  // Test CPU
  for (auto i = 0; i < predict_net_def.op().size(); ++i) {
    auto op = predict_net_def.mutable_op(i);
    if (std::find(cpu_ops.begin(), cpu_ops.end(), op->type()) == cpu_ops.end()) {
      op->mutable_device_option()->set_device_type(PROTO_CPU);
    }
  }
  predict_net_def.set_type("simple");
  predict_net_def.set_name("cpu_net");
  net = ws->CreateNet(predict_net_def);
  LOG(INFO) << "[C2DEBUG] Benchmarking CPU Net";
  net->TEST_Benchmark(FLAGS_warmup, FLAGS_iter, FLAGS_run_individual);
}
} // namespace caffe2
@@ -1,33 +0,0 @@
#include "gl_operator_test.h"

namespace caffe2 {

constexpr float tol = 5.0e-2;

TEST(OPENGLOperatorTest, NormPlanarYUV) {

  Workspace ws;
  int batchSize = 1;
  int channels = 8;

  PopulateCPUBlob(&ws, true, "cpu_X", {batchSize, channels, 8, 13});

  PopulateCPUBlob(&ws, true, "cpu_mean", {1, channels});
  PopulateCPUBlob(&ws, true, "cpu_stddev", {1, channels}, 1, 0.5);

  NetDef cpu_net;
  {
    OperatorDef* def = AddOp(&cpu_net, "NormalizePlanarYUV", {"cpu_X", "cpu_mean", "cpu_stddev"}, {"ref_Y"});
  }

  NetDef gpu_net;
  gpu_net.set_type("opengl");
  {
    OperatorDef* def = AddOp(&gpu_net, "NormalizePlanarYUV", {"cpu_X", "cpu_mean", "cpu_stddev"}, {"gpu_Y"});
    MAKE_OPENGL_OPERATOR(def);
  }

  compareNetResult4D(ws, cpu_net, gpu_net);
}

} // namespace caffe2
@@ -1,121 +0,0 @@
#include "caffe2/mobile/contrib/arm-compute/core/context.h"
#include <gtest/gtest.h>

#include "caffe2/core/graph.h"
#include "caffe2/core/operator.h"
#include "caffe2/core/workspace.h"

namespace caffe2 {

#define DECLARE_OPENGL_OPERATOR(_name) \
  OperatorDef _name;                   \
  _name.mutable_device_option()->set_device_type(PROTO_OPENGL);

#define MAKE_OPENGL_OPERATOR(_op) \
  _op->mutable_device_option()->set_device_type(PROTO_OPENGL);

#define ADD_ARG(_op, _name, _type, _val) \
  {                                      \
    Argument *arg = _op.add_arg();       \
    arg->set_name(_name);                \
    arg->set_##_type(_val);              \
  }

// Use value 1337 to generate a blob that is deterministic
// and unique at each value (for debugging purposes)
template<typename T = float>
void PopulateCPUBlob(Workspace *ws, bool random, std::string name,
                     std::vector<int> dims, int val = 1, int dist_shift = 0, float variance = 1) {
  Blob *blob = ws->CreateBlob(name);
  auto* tensor = BlobGetMutableTensor(blob, CPU);
  tensor->Resize(dims);
  T *t_data = tensor->mutable_data<T>();
  std::random_device rd;
  std::mt19937 e2(rd());
  std::normal_distribution<> dist(0 + dist_shift, variance + dist_shift);
  for (int i = 0; i < tensor->size(); ++i) {
    t_data[i] = T(random ? dist(e2) : (val == 1337 ? i : val));
  }
}

template<typename T = DataType>
void compareNetResult(Workspace& ws,
                      NetDef& cpu_net, NetDef& gpu_net,
                      string cpu_blob="ref_Y",
                      string gpu_blob="gpu_Y",
                      double tol=0.01,
                      bool relative=false) {
  ws.RunNetOnce(cpu_net);
  ws.RunNetOnce(gpu_net);

  Blob *cpu_out = ws.GetBlob(cpu_blob);
  Blob *gpu_out = ws.GetBlob(gpu_blob);
  EXPECT_NE(nullptr, cpu_out);
  EXPECT_NE(nullptr, gpu_out);

  TensorCPU g;
  auto& g_ = gpu_out->Get<GLTensor<T>>();
  getTensorCPU(g_, g);

  auto &t = cpu_out->Get<TensorCPU>();
  EXPECT_EQ(g.size(), t.size());

  for (auto i = 0; i < g.size(); ++i) {
    if (relative) {
      EXPECT_NEAR(g.data<float>()[i], t.data<float>()[i], tol + tol * std::abs(t.data<float>()[i])) << "at index " << i;
    } else {
      EXPECT_NEAR(g.data<float>()[i], t.data<float>()[i], tol)
          << "at index " << i;
    }
  }
}

template<typename T = DataType>
void compareNetResult4D(Workspace& ws,
                        NetDef& cpu_net, NetDef& gpu_net,
                        string cpu_blob="ref_Y",
                        string gpu_blob="gpu_Y",
                        double tol=0.05) {
  LOG(INFO) << "[C2DEBUG] running gpu net";
  bool gpu_success = ws.RunNetOnce(gpu_net);
  LOG(INFO) << "[C2DEBUG] after gpu net";
  bool cpu_success = ws.RunNetOnce(cpu_net);
  LOG(INFO) << "[C2DEBUG] after cpu net";

  if (!gpu_success || !cpu_success) {
    LOG(ERROR) << "[C2DEBUG] cpu or gpu net failed.";
    return;
  }
  Blob *cpu_out = ws.GetBlob(cpu_blob);
  Blob *gpu_out = ws.GetBlob(gpu_blob);

  EXPECT_NE(nullptr, cpu_out);
  EXPECT_NE(nullptr, gpu_out);

  auto &t = cpu_out->Get<TensorCPU>();
  int diff_num = 0;
  if (gpu_out->IsType<TensorCPU>()) {
    auto& g = gpu_out->Get<TensorCPU>();
    for (auto i = 0; i < t.size(); ++i) {
      auto t_elem = t.data<float>()[i];
      auto g_elem = g.data<float>()[i];
      if (!isnan(t_elem) && (std::abs(t_elem - g_elem) > tol + tol * std::abs(t_elem))) {
        diff_num++;
      }
    }
  } else if (gpu_out->IsType<GLTensor<T>>()) {
    TensorCPU g;
    getTensorCPU(gpu_out->Get<GLTensor<T>>(), g);
    for (auto i = 0; i < t.size(); ++i) {
      auto t_elem = t.data<float>()[i];
      auto g_elem = g.data<float>()[i];
      if (!isnan(t_elem) && (std::abs(t_elem - g_elem) > tol + tol * std::abs(t_elem))) {
        diff_num++;
      }
    }
  }
  CHECK(diff_num <= 0.03 * t.size());
}

} // namespace caffe2
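compareNetResult4D above accepts an element when |cpu - gpu| <= tol + tol * |cpu| (a mixed absolute/relative bound) and then tolerates up to 3% of elements missing that bound, which absorbs fp16 rounding on the GPU path. A standalone illustration of that acceptance rule, with hypothetical sample values:

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Same acceptance rule as compareNetResult4D: each element may differ by an
// absolute tol plus a relative tol, and up to 3% of elements may violate it.
bool nets_match(const std::vector<float>& cpu, const std::vector<float>& gpu,
                float tol = 0.05f) {
  int diff_num = 0;
  for (size_t i = 0; i < cpu.size(); ++i) {
    if (!std::isnan(cpu[i]) &&
        std::abs(cpu[i] - gpu[i]) > tol + tol * std::abs(cpu[i])) {
      ++diff_num;
    }
  }
  return diff_num <= 0.03 * cpu.size();
}

int main() {
  std::vector<float> cpu = {1.0f, 10.0f, 100.0f};
  std::vector<float> gpu = {1.04f, 10.4f, 104.0f};  // within the 5% slack
  std::printf("%s\n", nets_match(cpu, gpu) ? "match" : "mismatch");
}
```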
@@ -1,89 +0,0 @@
#include "gl_operator_test.h"

namespace caffe2 {

TEST(OPENGLOperatorTest, AveragePool) {
  Workspace ws;
  PopulateCPUBlob(&ws, true, "cpu_X", {1, 1, 8, 8});

  NetDef cpu_net;
  {
    OperatorDef* def = AddOp(&cpu_net, "AveragePool", {"cpu_X"}, {"ref_Y"});
    ADD_ARG((*def), "kernel", i, 2);
    ADD_ARG((*def), "pad", i, 0);
    ADD_ARG((*def), "stride", i, 2);
    ADD_ARG((*def), "order", s, "NCHW");
  }

  NetDef gpu_net;
  gpu_net.set_type("opengl");
  {
    OperatorDef* def = AddOp(&gpu_net, "AveragePool", {"cpu_X"}, {"gpu_Y"});
    ADD_ARG((*def), "kernel", i, 2);
    ADD_ARG((*def), "pad", i, 0);
    ADD_ARG((*def), "stride", i, 2);
    ADD_ARG((*def), "order", s, "NCHW");
    MAKE_OPENGL_OPERATOR(def);
  }

  compareNetResult(ws, cpu_net, gpu_net);
}

TEST(OPENGLOperatorTest, MaxPool) {
  Workspace ws;
  PopulateCPUBlob(&ws, true, "cpu_X", {1, 1, 8, 8});

  NetDef cpu_net;
  {
    OperatorDef* def = AddOp(&cpu_net, "MaxPool", {"cpu_X"}, {"ref_Y"});
    ADD_ARG((*def), "kernel", i, 2);
    ADD_ARG((*def), "pad", i, 0);
    ADD_ARG((*def), "stride", i, 2);
    ADD_ARG((*def), "order", s, "NCHW");
  }

  NetDef gpu_net;
  gpu_net.set_type("opengl");
  {
    OperatorDef* def = AddOp(&gpu_net, "MaxPool", {"cpu_X"}, {"gpu_Y"});
    ADD_ARG((*def), "kernel", i, 2);
    ADD_ARG((*def), "pad", i, 0);
    ADD_ARG((*def), "stride", i, 2);
    ADD_ARG((*def), "order", s, "NCHW");
    MAKE_OPENGL_OPERATOR(def);
  }

  compareNetResult(ws, cpu_net, gpu_net);
}

TEST(OPENGLOperatorTest, AverageGlobalPool) {
  Workspace ws;
  PopulateCPUBlob(&ws, true, "cpu_X", {1, 1, 8, 8});

  NetDef cpu_net;
  {
    OperatorDef* def = AddOp(&cpu_net, "AveragePool", {"cpu_X"}, {"ref_Y"});
    ADD_ARG((*def), "global_pooling", i, 1);
    ADD_ARG((*def), "pad", i, 0);
    ADD_ARG((*def), "stride", i, 1);
    ADD_ARG((*def), "order", s, "NCHW");
  }

  NetDef gpu_net;
  gpu_net.set_type("opengl");
  {
    OperatorDef* def = AddOp(&gpu_net, "AveragePool", {"cpu_X"}, {"gpu_Y"});
    ADD_ARG((*def), "global_pooling", i, 1);
    ADD_ARG((*def), "pad", i, 0);
    ADD_ARG((*def), "stride", i, 1);
    ADD_ARG((*def), "order", s, "NCHW");
    MAKE_OPENGL_OPERATOR(def);
  }

  compareNetResult(ws, cpu_net, gpu_net);
}

} // namespace caffe2
@@ -1,35 +0,0 @@
#include "gl_operator_test.h"

namespace caffe2 {

TEST(OPENGLOperatorTest, ResizeNearest) {

  Workspace ws;
  float height_scale = 2;
  float width_scale = 2;
  int N = 1;
  int CIn = 7;

  PopulateCPUBlob(&ws, true, "cpu_X", {N, CIn, 37, 89});

  NetDef cpu_net;
  {
    OperatorDef* def = AddOp(&cpu_net, "ResizeNearest", {"cpu_X"}, {"ref_Y"});
    ADD_ARG((*def), "height_scale", f, height_scale);
    ADD_ARG((*def), "width_scale", f, width_scale);
  }

  NetDef gpu_net;
  gpu_net.set_type("opengl");
  {
    OperatorDef* def = AddOp(&gpu_net, "ResizeNearest", {"cpu_X"}, {"gpu_Y"});
    MAKE_OPENGL_OPERATOR(def);
    ADD_ARG((*def), "height_scale", f, height_scale);
    ADD_ARG((*def), "width_scale", f, width_scale);
  }

  compareNetResult4D(ws, cpu_net, gpu_net);
}

} // namespace caffe2
@@ -1,28 +0,0 @@
#include "gl_operator_test.h"

namespace caffe2 {

TEST(OPENGLOperatorTest, Softmax) {

  Workspace ws;
  int N = 1;
  int D = 128;
  PopulateCPUBlob(&ws, true, "cpu_X", {N, D}, 1);

  NetDef cpu_net;
  {
    AddOp(&cpu_net, "Softmax", {"cpu_X"}, {"ref_Y"});
  }

  NetDef gpu_net;
  gpu_net.set_type("opengl");
  {
    OperatorDef* def = AddOp(&gpu_net, "Softmax", {"cpu_X"}, {"gpu_Y"});
    MAKE_OPENGL_OPERATOR(def);
  }

  compareNetResult(ws, cpu_net, gpu_net);
}

} // namespace caffe2
@@ -1,35 +0,0 @@
#include "gl_operator_test.h"

namespace caffe2 {

TEST(OPENGLOperatorTest, SpatialBN) {

  Workspace ws;
  int batchSize = 1;
  int channels = 8;

  PopulateCPUBlob(&ws, true, "cpu_X", {3, channels, 8, 13});
  PopulateCPUBlob(&ws, true, "cpu_scale", {channels});
  PopulateCPUBlob(&ws, true, "cpu_bias", {channels});
  PopulateCPUBlob(&ws, true, "cpu_mean", {channels});
  PopulateCPUBlob(&ws, true, "cpu_var", {channels}, 1, 0.5);

  NetDef cpu_net;
  {
    OperatorDef* def = AddOp(&cpu_net, "SpatialBN", {"cpu_X", "cpu_scale", "cpu_bias", "cpu_mean", "cpu_var"}, {"ref_Y"});
    ADD_ARG((*def), OpSchema::Arg_IsTest, i, 1);
  }

  NetDef gpu_net;
  gpu_net.set_type("opengl");
  {
    OperatorDef* def = AddOp(&gpu_net, "SpatialBN", {"cpu_X", "cpu_scale", "cpu_bias", "cpu_mean", "cpu_var"}, {"gpu_Y"});
    MAKE_OPENGL_OPERATOR(def);
    ADD_ARG((*def), OpSchema::Arg_IsTest, i, 1);
  }

  compareNetResult4D(ws, cpu_net, gpu_net, "ref_Y", "gpu_Y", 0.01);
}

} // namespace caffe2
@@ -919,63 +919,6 @@ if(USE_PROF)
  endif()
endif()

# ---[ ARM Compute Library: check compatibility.
if (USE_ACL)
  if (NOT ANDROID)
    message(WARNING "ARM Compute Library is only supported for Android builds.")
    caffe2_update_option(USE_ACL OFF)
  else()
    list(APPEND Caffe2_DEPENDENCY_LIBS EGL GLESv2)
    if (CMAKE_SYSTEM_PROCESSOR MATCHES "^armv")
      # 32-bit ARM (armv7, armv7-a, armv7l, etc)
      set(ACL_ARCH "armv7a")
    elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm64|aarch64)$")
      # 64-bit ARM
      set(ACL_ARCH "arm64-v8a")
    else()
      message(WARNING "ARM Compute Library is only supported for ARM/ARM64 builds.")
      caffe2_update_option(USE_ACL OFF)
    endif()
  endif()
endif()

# ---[ ARM Compute Library: build the target.
if (USE_ACL)
  list(APPEND ARM_COMPUTE_INCLUDE_DIRS "third_party/ComputeLibrary/")
  list(APPEND ARM_COMPUTE_INCLUDE_DIRS "third_party/ComputeLibrary/include")
  include_directories(SYSTEM ${ARM_COMPUTE_INCLUDE_DIRS})
  string (REPLACE ";" " -I" ANDROID_STL_INCLUDE_FLAGS "-I${ANDROID_STL_INCLUDE_DIRS}")
  set (ARM_COMPUTE_SRC_DIR "${CMAKE_CURRENT_LIST_DIR}/../third_party/ComputeLibrary/")
  set (ARM_COMPUTE_LIB "${CMAKE_CURRENT_BINARY_DIR}/libarm_compute.a")
  set (ARM_COMPUTE_CORE_LIB "${CMAKE_CURRENT_BINARY_DIR}/libarm_compute_core.a")
  set (ARM_COMPUTE_LIBS ${ARM_COMPUTE_LIB} ${ARM_COMPUTE_CORE_LIB})

  add_custom_command(
    OUTPUT ${ARM_COMPUTE_LIBS}
    COMMAND
      /bin/sh -c "export PATH=\"$PATH:$(dirname ${CMAKE_CXX_COMPILER})\" && \
        scons -C \"${ARM_COMPUTE_SRC_DIR}\" -Q \
          examples=no validation_tests=no benchmark_tests=no standalone=yes \
          embed_kernels=yes opencl=no gles_compute=yes \
          os=android arch=${ACL_ARCH} \
          extra_cxx_flags=\"${ANDROID_CXX_FLAGS} ${ANDROID_STL_INCLUDE_FLAGS}\"" &&
      /bin/sh -c "cp ${ARM_COMPUTE_SRC_DIR}/build/libarm_compute-static.a ${CMAKE_CURRENT_BINARY_DIR}/libarm_compute.a" &&
      /bin/sh -c "cp ${ARM_COMPUTE_SRC_DIR}/build/libarm_compute_core-static.a ${CMAKE_CURRENT_BINARY_DIR}/libarm_compute_core.a" &&
      /bin/sh -c "rm -r ${ARM_COMPUTE_SRC_DIR}/build"
    COMMENT "Building ARM compute library" VERBATIM)
  add_custom_target(arm_compute_build ALL DEPENDS ${ARM_COMPUTE_LIBS})

  add_library(arm_compute_core STATIC IMPORTED)
  add_dependencies(arm_compute_core arm_compute_build)
  set_property(TARGET arm_compute_core PROPERTY IMPORTED_LOCATION ${ARM_COMPUTE_CORE_LIB})

  add_library(arm_compute STATIC IMPORTED)
  add_dependencies(arm_compute arm_compute_build)
  set_property(TARGET arm_compute PROPERTY IMPORTED_LOCATION ${ARM_COMPUTE_LIB})

  list(APPEND Caffe2_DEPENDENCY_LIBS arm_compute arm_compute_core)
endif()

if (USE_SNPE AND ANDROID)
  if (SNPE_LOCATION AND SNPE_HEADERS)
    message(STATUS "Using SNPE location specified by -DSNPE_LOCATION: " ${SNPE_LOCATION})
@ -333,23 +333,6 @@ if (IOS)
  add_definitions("-Wno-deprecated-declarations")
endif()

# ---[ If we are building with ACL, we will enable neon-fp16.
if (USE_ACL)
  if (CMAKE_SYSTEM_PROCESSOR MATCHES "^armv")
    # 32-bit ARM (armv7, armv7-a, armv7l, etc.)
    set(ACL_ARCH "armv7a")
    # Compilers for 32-bit ARM need extra flags to enable NEON-FP16.
    add_definitions("-mfpu=neon-fp16")

    include(CheckCCompilerFlag)
    CHECK_C_COMPILER_FLAG(
      -mfp16-format=ieee CAFFE2_COMPILER_SUPPORTS_FP16_FORMAT)
    if (CAFFE2_COMPILER_SUPPORTS_FP16_FORMAT)
      add_definitions("-mfp16-format=ieee")
    endif()
  endif()
endif()
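The -mfp16-format=ieee probe exists because 32-bit ARM compilers support the __fp16 storage type in two incompatible layouts (IEEE and the older alternative format). A small C++ illustration of what the flag enables, shown only as an assumption about the toolchain:

```
// Compiles on 32-bit ARM GCC/Clang with -mfpu=neon-fp16 and
// -mfp16-format=ieee: __fp16 is a half-precision storage type that is
// promoted to float for arithmetic.
float halve(float x) {
  __fp16 h = (__fp16)x;    // rounds to IEEE half precision on store
  return (float)h * 0.5f;  // promoted back to float for the multiply
}
```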

# ---[ If we use asan, turn on the flags.
# TODO: This only works with new style gcc and clang (not the old -faddress-sanitizer).
# Change if necessary on old platforms.
1
third_party/ComputeLibrary
vendored
Submodule third_party/ComputeLibrary deleted from 292227986e