NNPACK: Use new bindings and custom thread pool

Summary:
This change should dramatically (~10X) improve the performance of convolution with the NNPACK engine.
Closes https://github.com/caffe2/caffe2/pull/1730

Reviewed By: sf-wind

Differential Revision: D6695895

Pulled By: Maratyszcza

fbshipit-source-id: 26291916811ef4cb819a59aec848c4e23668e568
This commit is contained in:
Marat Dukhan
2018-01-11 10:43:31 -08:00
committed by Facebook Github Bot
parent 0a8a18ca01
commit 224493d9ce
19 changed files with 43 additions and 98 deletions

View File

@ -17,7 +17,7 @@ set(CAFFE2_VERSION
option(BUILD_BINARY "Build C++ binaries" ON)
option(BUILD_PYTHON "Build Python binaries" ON)
option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON)
option(BUILD_SHARE_DIR "Build files in the share/ directory" OFF)
option(BUILD_OBSERVERS "Build performance observers/loggers in caffe2/share/observers directory" OFF)
option(BUILD_TEST "Build C++ test binaries (need gtest and gbenchmark)" ON)
option(USE_ATEN "Use ATen" OFF)
option(USE_ASAN "Use Address Sanitizer" OFF)

View File

@ -34,9 +34,7 @@ add_subdirectory(perfkernels)
add_subdirectory(python)
add_subdirectory(queue)
add_subdirectory(sgd)
if (BUILD_SHARE_DIR)
add_subdirectory(share)
endif()
add_subdirectory(share)
# add_subdirectory(test) # todo: use caffe2_gtest_main instead of gtest_main because we will need to call GlobalInit
add_subdirectory(transforms)
add_subdirectory(utils)
@ -300,9 +298,10 @@ endif()
# Binaries
if (BUILD_BINARY)
add_subdirectory(binaries)
# Benchmarking binaries require observers included in the build
# There is a linking issue that happens in some of the Windows builds.
# TODO(Yangqing): after the module redesing, enable this back.
if (NOT MSVC AND BUILD_SHARE_DIR)
# TODO(Yangqing): after the module redesign, enable this back.
if (BUILD_OBSERVERS AND NOT MSVC)
add_subdirectory(share/contrib/binaries)
endif()
endif()

View File

@ -1,7 +1,6 @@
add_subdirectory(aten)
add_subdirectory(gloo)
add_subdirectory(nccl)
add_subdirectory(nnpack)
add_subdirectory(prof)
add_subdirectory(shm_mutex)
add_subdirectory(script)

View File

@ -1,8 +0,0 @@
if(USE_NNPACK)
message(STATUS "Include NNPACK operators")
set(Caffe2_CONTRIB_NNPACK_CPU_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/nnpack_ops.cc"
)
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${Caffe2_CONTRIB_NNPACK_CPU_SRC} PARENT_SCOPE)
endif()

View File

@ -320,7 +320,6 @@ bool Workspace::RunPlan(const PlanDef& plan, ShouldContinue shouldContinue) {
return RunPlanOnWorkspace(this, plan, shouldContinue);
}
#if CAFFE2_MOBILE
ThreadPool* Workspace::GetThreadPool() {
std::lock_guard<std::mutex> guard(thread_pool_creation_mutex_);
if (!thread_pool_) {
@ -328,6 +327,5 @@ ThreadPool* Workspace::GetThreadPool() {
}
return thread_pool_.get();
}
#endif // CAFFE2_MOBILE
} // namespace caffe2

View File

@ -20,10 +20,6 @@
#include "caffe2/core/common.h"
#include "caffe2/core/observer.h"
#ifndef CAFFE2_MOBILE
#error "mobile build state not defined"
#endif
#include <climits>
#include <cstddef>
#include <mutex>
@ -36,9 +32,7 @@
#include "caffe2/core/net.h"
#include "caffe2/proto/caffe2.pb.h"
#include "caffe2/utils/signal_handler.h"
#if CAFFE2_MOBILE
#include "caffe2/utils/threadpool/ThreadPool.h"
#endif // CAFFE2_MOBILE
CAFFE2_DECLARE_bool(caffe2_print_blob_sizes_at_exit);
@ -291,14 +285,12 @@ class Workspace {
bool RunPlan(const PlanDef& plan_def,
ShouldContinue should_continue = StopOnSignal{});
#if CAFFE2_MOBILE
/*
* Returns a CPU threadpool instance for parallel execution of
* work. The threadpool is created lazily; if no operators use it,
* then no threadpool will be created.
*/
ThreadPool* GetThreadPool();
#endif
// RunOperatorOnce and RunNetOnce runs an operator or net once. The difference
// between RunNet and RunNetOnce lies in the fact that RunNet allows you to
@ -318,10 +310,8 @@ class Workspace {
const Workspace* shared_;
std::unordered_map<string, std::pair<const Workspace*, string>>
forwarded_blobs_;
#if CAFFE2_MOBILE
std::unique_ptr<ThreadPool> thread_pool_;
std::mutex thread_pool_creation_mutex_;
#endif // CAFFE2_MOBILE
DISABLE_COPY_AND_ASSIGN(Workspace);
};

View File

@ -1,11 +1,18 @@
set(Caffe2_CPU_OBSERVER_SRCS)
if (USE_NNPACK)
add_subdirectory(nnpack)
endif()
if (BUILD_OBSERVERS)
add_subdirectory(observers)
endif()
if (USE_ZSTD)
add_subdirectory(zstd)
endif()
add_library(Caffe2_CPU_OBSERVER STATIC ${Caffe2_CPU_OBSERVER_SRCS})
if (BUILD_OBSERVERS)
add_library(Caffe2_CPU_OBSERVER STATIC ${Caffe2_CPU_OBSERVER_SRCS})
endif()
# CPU source, test sources, binary sources
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)

View File

@ -0,0 +1,9 @@
# Registers this directory's NNPACK sources with the enclosing build by
# appending them to the aggregate Caffe2 CPU source lists.

# Implementation source(s) contributed by this directory.
set(Caffe2_CONTRIB_NNPACK_CPU_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/conv_op.cc"
)
# Test source(s) contributed by this directory (presumably the unit tests for
# conv_op.cc, judging by the file name).
set(Caffe2_CONTRIB_NNPACK_TEST_CPU_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/conv_op_test.cc"
)
# PARENT_SCOPE writes the extended lists into the scope of the CMakeLists.txt
# that add_subdirectory()'d this one; the variables of the same name in this
# file's own scope are not modified. The parent has to re-export them itself
# if directories further up need to see the additions.
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${Caffe2_CONTRIB_NNPACK_CPU_SRC} PARENT_SCOPE)
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${Caffe2_CONTRIB_NNPACK_TEST_CPU_SRC} PARENT_SCOPE)

View File

@ -16,11 +16,6 @@
#include "caffe2/core/common.h"
#ifndef CAFFE2_MOBILE
#error "Caffe2 mobile state not defined"
#endif
#if CAFFE2_MOBILE
#include "caffe2/core/context.h"
#include "caffe2/core/logging.h"
@ -258,7 +253,7 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
transformStrategy_ = nnp_convolution_transform_strategy_reuse;
}
} else {
LOG(ERROR)
LOG(WARNING)
<< "Failed to query workspace size to precompute kernels, falling back to re-compute strategy";
transformStrategy_ = nnp_convolution_transform_strategy_compute;
}
@ -406,5 +401,3 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv, NNPACK, NNPACKConvOp);
} // namespace caffe2
#endif // CAFFE2_MOBILE

View File

@ -238,13 +238,13 @@ constexpr size_t kIters = 20;
// TODO(#14383029) cblas_sgemm not yet implemented on limited mobile cases.
#if !defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
TEST(MobileNNPACK, Conv_3x3s1) {
TEST(NNPACK, Conv_3x3s1) {
for (int i = 0; i < kIters; ++i) {
runConv(3, 3, 1, 1);
}
}
TEST(MobileNNPACK, Conv_3x3s1_precompute) {
TEST(NNPACK, Conv_3x3s1_precompute) {
for (int i = 0; i < kIters; ++i) {
int group = randInt(1, 2);
runConv(
@ -261,13 +261,13 @@ TEST(MobileNNPACK, Conv_3x3s1_precompute) {
}
}
TEST(MobileNNPACK, Conv_3x3s1_FP16) {
TEST(NNPACK, Conv_3x3s1_FP16) {
for (int i = 0; i < kIters; ++i) {
runConv(3, 3, 1, 1, 1, "WINOGRAD_FP16");
}
}
TEST(MobileNNPACK, Conv_3x3s1_FP16_precompute) {
TEST(NNPACK, Conv_3x3s1_FP16_precompute) {
for (int i = 0; i < kIters; ++i) {
int group = randInt(1, 2);
runConv(
@ -284,14 +284,14 @@ TEST(MobileNNPACK, Conv_3x3s1_FP16_precompute) {
}
}
TEST(MobileNNPACK, Conv_NxNs1) {
TEST(NNPACK, Conv_NxNs1) {
for (int i = 0; i < kIters; ++i) {
int kernel = randInt(2, 10);
runConv(kernel, kernel, 1, 1);
}
}
TEST(MobileNNPACK, Conv_1x1s1) {
TEST(NNPACK, Conv_1x1s1) {
for (int i = 0; i < kIters; ++i) {
auto group = randInt(1, 3);
auto inChannels = randInt(1, 8) * group;
@ -301,7 +301,7 @@ TEST(MobileNNPACK, Conv_1x1s1) {
}
}
TEST(MobileNNPACK, Conv_1x1s1_precompute) {
TEST(NNPACK, Conv_1x1s1_precompute) {
for (int i = 0; i < kIters; ++i) {
auto group = randInt(1, 3);
auto inChannels = randInt(1, 8) * group;
@ -312,7 +312,7 @@ TEST(MobileNNPACK, Conv_1x1s1_precompute) {
}
}
TEST(MobileNNPACK, Conv_NxNs_grouped) {
TEST(NNPACK, Conv_NxNs_grouped) {
for (int i = 0; i < kIters; ++i) {
int group = randInt(2, 3);
int iC = randInt(1, 6) * group;
@ -323,7 +323,7 @@ TEST(MobileNNPACK, Conv_NxNs_grouped) {
}
}
TEST(MobileNNPACK, Conv_NxNs_grouped_precompute) {
TEST(NNPACK, Conv_NxNs_grouped_precompute) {
for (int i = 0; i < kIters; ++i) {
int group = randInt(2, 3);
int iC = randInt(1, 6) * group;
@ -334,7 +334,7 @@ TEST(MobileNNPACK, Conv_NxNs_grouped_precompute) {
}
}
TEST(MobileNNPACK, Conv_NxNsW) {
TEST(NNPACK, Conv_NxNsW) {
for (int i = 0; i < 3; ++i) {
int kernel = randInt(3, 5);
int stride = randInt(1, kernel - 1);
@ -342,7 +342,7 @@ TEST(MobileNNPACK, Conv_NxNsW) {
}
}
TEST(MobileNNPACK, Conv_HxWsHxW) {
TEST(NNPACK, Conv_HxWsHxW) {
for (int i = 0; i < 3; ++i) {
int kernelH = randInt(2, 5);
int kernelW = randInt(2, 5);
@ -352,7 +352,7 @@ TEST(MobileNNPACK, Conv_HxWsHxW) {
}
}
TEST(MobileNNPACK, Depthwise3x3Conv) {
TEST(NNPACK, Depthwise3x3Conv) {
for (int i = 0; i < kIters; ++i) {
int channel = 2;
runConv(

View File

@ -21,12 +21,12 @@ exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp})
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_GPU_SRCS})
# ---[ threadpool/pthreadpool* is a local modification of the NNPACK
# pthreadpool with a very similar interface. We used it internally to
# wait on such changes to be merged to the upstream pthreadpool repo.
# For the open source version, we do not need to use these files as we
# will directly link nnpack pthreadpool.
file(GLOB_RECURSE tmp pthreadpool*)
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp})
# pthreadpool with a very similar interface. Neither NNPACK, nor this
# thread pool supports Windows.
if (MSVC)
file(GLOB_RECURSE tmp pthreadpool*)
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp})
endif()
# ---[ GPU test files
file(GLOB_RECURSE tmp *_gpu_test.cc)

View File

@ -31,7 +31,6 @@ CAFFE2_DEFINE_int(caffe2_threadpool_android_cap, true, "");
// Whether or not threadpool caps apply to iOS
CAFFE2_DEFINE_int(caffe2_threadpool_ios_cap, false, "");
#if CAFFE2_THREADPOOL_MOBILE
namespace caffe2 {
@ -62,8 +61,6 @@ std::unique_ptr<ThreadPool> ThreadPool::defaultThreadPool() {
applyCap = caffe2::FLAGS_caffe2_threadpool_android_cap;
#elif CAFFE2_IOS
applyCap = caffe2::FLAGS_caffe2_threadpool_ios_cap;
#else
#error Undefined architecture
#endif
if (applyCap) {
@ -185,5 +182,3 @@ void ThreadPool::run(const std::function<void(int, size_t)>& fn, size_t range) {
}
} // namespace caffe2
#endif // CAFFE2_THREADPOOL_MOBILE

View File

@ -19,13 +19,7 @@
#include "ThreadPoolCommon.h"
#ifndef CAFFE2_THREADPOOL_MOBILE
#error "mobile build state not defined"
#endif
// ThreadPool only used in mobile builds at the moment
#if CAFFE2_THREADPOOL_MOBILE
#include <functional>
#include <memory>
#include <mutex>
#include <vector>
@ -68,6 +62,4 @@ private:
} // namespace caffe2
#endif // CAFFE2_THREADPOOL_MOBILE
#endif // CAFFE2_UTILS_THREADPOOL_H_

View File

@ -26,20 +26,14 @@
// This is copied from core/common.h's definition of CAFFE2_MOBILE
// Define enabled when building for iOS or Android devices
#if !defined(CAFFE2_THREADPOOL_MOBILE)
#if defined(__ANDROID__)
#define CAFFE2_ANDROID 1
#define CAFFE2_THREADPOOL_MOBILE 1
#elif (defined(__APPLE__) && \
(TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE))
#define CAFFE2_IOS 1
#define CAFFE2_THREADPOOL_MOBILE 1
#elif (defined(__APPLE__) && TARGET_OS_MAC)
#define CAFFE2_IOS 1
#define CAFFE2_THREADPOOL_MOBILE 1
#else
#define CAFFE2_THREADPOOL_MOBILE 0
#endif // ANDROID / IOS / MACOS
#endif // CAFFE2_THREADPOOL_MOBILE
#endif // CAFFE2_UTILS_THREADPOOL_COMMON_H_

View File

@ -30,7 +30,6 @@
#include "caffe2/utils/fixed_divisor.h"
#include "caffe2/utils/threadpool/pthreadpool.h"
#if CAFFE2_THREADPOOL_MOBILE
static inline size_t divide_round_up(size_t dividend, size_t divisor) {
if (dividend % divisor == 0) {
@ -181,5 +180,3 @@ void pthreadpool_compute_2d_tiled(
pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d_tiled, &context, tile_range_i * tile_range_j);
}
}
#endif // CAFFE2_THREADPOOL_MOBILE

View File

@ -21,12 +21,6 @@
#include "ThreadPoolCommon.h"
#ifndef CAFFE2_THREADPOOL_MOBILE
#error "mobile build state not defined"
#endif
// ThreadPool only used in mobile builds at the moment
#if CAFFE2_THREADPOOL_MOBILE
#include <stddef.h> // for size_t
@ -122,6 +116,4 @@ void pthreadpool_destroy(pthreadpool_t threadpool);
} /* extern "C" */
#endif
#endif // CAFFE2_THREADPOOL_MOBILE
#endif // CAFFE2_UTILS_PTHREADPOOL_H_

View File

@ -18,7 +18,6 @@
#include "caffe2/utils/threadpool/pthreadpool_impl.h"
#include "caffe2/utils/threadpool/ThreadPool.h"
#if CAFFE2_THREADPOOL_MOBILE
//
// External API
@ -38,5 +37,3 @@ void pthreadpool_compute_1d(struct pthreadpool* threadpool,
size_t pthreadpool_get_threads_count(struct pthreadpool* threadpool) {
return threadpool->pool_->getNumThreads();
}
#endif // CAFFE2_THREADPOOL_MOBILE

View File

@ -19,11 +19,6 @@
#include "ThreadPoolCommon.h"
#ifndef CAFFE2_THREADPOOL_MOBILE
#error "mobile build state not defined"
#endif
#if CAFFE2_THREADPOOL_MOBILE
namespace caffe2 {
@ -41,6 +36,4 @@ struct pthreadpool {
} // extern "C"
#endif // CAFFE2_THREADPOOL_MOBILE
#endif // CAFFE2_UTILS_PTHREADPOOL_IMPL_H_

View File

@ -76,9 +76,7 @@ if (ANDROID OR IOS OR ${CMAKE_SYSTEM_NAME} STREQUAL "Linux" OR ${CMAKE_SYSTEM_NA
set(NNPACK_INCLUDE_DIRS
$<TARGET_PROPERTY:nnpack,INCLUDE_DIRECTORIES>
$<TARGET_PROPERTY:pthreadpool,INCLUDE_DIRECTORIES>)
set(NNPACK_LIBRARIES
$<TARGET_FILE:nnpack>
$<TARGET_FILE:pthreadpool>)
set(NNPACK_LIBRARIES $<TARGET_FILE:nnpack>)
return()
endif()