mirror of
https://github.com/pytorch/pytorch.git
synced 2025-10-20 12:54:11 +08:00
NNPACK: Use new bindings and custom thread pool
Summary: This change should dramatically (~10X) improve the performance of convolution with the NNPACK engine.
Closes https://github.com/caffe2/caffe2/pull/1730
Reviewed By: sf-wind
Differential Revision: D6695895
Pulled By: Maratyszcza
fbshipit-source-id: 26291916811ef4cb819a59aec848c4e23668e568
This commit is contained in:
committed by
Facebook Github Bot
parent
0a8a18ca01
commit
224493d9ce
@ -17,7 +17,7 @@ set(CAFFE2_VERSION
|
||||
option(BUILD_BINARY "Build C++ binaries" ON)
|
||||
option(BUILD_PYTHON "Build Python binaries" ON)
|
||||
option(BUILD_SHARED_LIBS "Build libcaffe2.so" ON)
|
||||
option(BUILD_SHARE_DIR "Build files in the share/ directory" OFF)
|
||||
option(BUILD_OBSERVERS "Build performance observers/loggers in caffe2/share/observers directory" OFF)
|
||||
option(BUILD_TEST "Build C++ test binaries (need gtest and gbenchmark)" ON)
|
||||
option(USE_ATEN "Use ATen" OFF)
|
||||
option(USE_ASAN "Use Address Sanitizer" OFF)
|
||||
|
@ -34,9 +34,7 @@ add_subdirectory(perfkernels)
|
||||
add_subdirectory(python)
|
||||
add_subdirectory(queue)
|
||||
add_subdirectory(sgd)
|
||||
if (BUILD_SHARE_DIR)
|
||||
add_subdirectory(share)
|
||||
endif()
|
||||
add_subdirectory(share)
|
||||
# add_subdirectory(test) # todo: use caffe2_gtest_main instead of gtest_main because we will need to call GlobalInit
|
||||
add_subdirectory(transforms)
|
||||
add_subdirectory(utils)
|
||||
@ -300,9 +298,10 @@ endif()
|
||||
# Binaries
|
||||
if (BUILD_BINARY)
|
||||
add_subdirectory(binaries)
|
||||
# Benchmarking binaries require observers included in the build
|
||||
# There is a linking issue that happens in some of the Windows builds.
|
||||
# TODO(Yangqing): after the module redesing, enable this back.
|
||||
if (NOT MSVC AND BUILD_SHARE_DIR)
|
||||
# TODO(Yangqing): after the module redesign, enable this back.
|
||||
if (BUILD_OBSERVERS AND NOT MSVC)
|
||||
add_subdirectory(share/contrib/binaries)
|
||||
endif()
|
||||
endif()
|
||||
|
@ -1,7 +1,6 @@
|
||||
add_subdirectory(aten)
|
||||
add_subdirectory(gloo)
|
||||
add_subdirectory(nccl)
|
||||
add_subdirectory(nnpack)
|
||||
add_subdirectory(prof)
|
||||
add_subdirectory(shm_mutex)
|
||||
add_subdirectory(script)
|
||||
|
@ -1,8 +0,0 @@
|
||||
if(USE_NNPACK)
|
||||
message(STATUS "Include NNPACK operators")
|
||||
set(Caffe2_CONTRIB_NNPACK_CPU_SRC
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/nnpack_ops.cc"
|
||||
)
|
||||
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${Caffe2_CONTRIB_NNPACK_CPU_SRC} PARENT_SCOPE)
|
||||
endif()
|
@ -320,7 +320,6 @@ bool Workspace::RunPlan(const PlanDef& plan, ShouldContinue shouldContinue) {
|
||||
return RunPlanOnWorkspace(this, plan, shouldContinue);
|
||||
}
|
||||
|
||||
#if CAFFE2_MOBILE
|
||||
ThreadPool* Workspace::GetThreadPool() {
|
||||
std::lock_guard<std::mutex> guard(thread_pool_creation_mutex_);
|
||||
if (!thread_pool_) {
|
||||
@ -328,6 +327,5 @@ ThreadPool* Workspace::GetThreadPool() {
|
||||
}
|
||||
return thread_pool_.get();
|
||||
}
|
||||
#endif // CAFFE2_MOBILE
|
||||
|
||||
} // namespace caffe2
|
||||
|
@ -20,10 +20,6 @@
|
||||
#include "caffe2/core/common.h"
|
||||
#include "caffe2/core/observer.h"
|
||||
|
||||
#ifndef CAFFE2_MOBILE
|
||||
#error "mobile build state not defined"
|
||||
#endif
|
||||
|
||||
#include <climits>
|
||||
#include <cstddef>
|
||||
#include <mutex>
|
||||
@ -36,9 +32,7 @@
|
||||
#include "caffe2/core/net.h"
|
||||
#include "caffe2/proto/caffe2.pb.h"
|
||||
#include "caffe2/utils/signal_handler.h"
|
||||
#if CAFFE2_MOBILE
|
||||
#include "caffe2/utils/threadpool/ThreadPool.h"
|
||||
#endif // CAFFE2_MOBILE
|
||||
|
||||
CAFFE2_DECLARE_bool(caffe2_print_blob_sizes_at_exit);
|
||||
|
||||
@ -291,14 +285,12 @@ class Workspace {
|
||||
bool RunPlan(const PlanDef& plan_def,
|
||||
ShouldContinue should_continue = StopOnSignal{});
|
||||
|
||||
#if CAFFE2_MOBILE
|
||||
/*
|
||||
* Returns a CPU threadpool instance for parallel execution of
|
||||
* work. The threadpool is created lazily; if no operators use it,
|
||||
* then no threadpool will be created.
|
||||
*/
|
||||
ThreadPool* GetThreadPool();
|
||||
#endif
|
||||
|
||||
// RunOperatorOnce and RunNetOnce runs an operator or net once. The difference
|
||||
// between RunNet and RunNetOnce lies in the fact that RunNet allows you to
|
||||
@ -318,10 +310,8 @@ class Workspace {
|
||||
const Workspace* shared_;
|
||||
std::unordered_map<string, std::pair<const Workspace*, string>>
|
||||
forwarded_blobs_;
|
||||
#if CAFFE2_MOBILE
|
||||
std::unique_ptr<ThreadPool> thread_pool_;
|
||||
std::mutex thread_pool_creation_mutex_;
|
||||
#endif // CAFFE2_MOBILE
|
||||
|
||||
DISABLE_COPY_AND_ASSIGN(Workspace);
|
||||
};
|
||||
|
@ -1,11 +1,18 @@
|
||||
set(Caffe2_CPU_OBSERVER_SRCS)
|
||||
|
||||
if (USE_NNPACK)
|
||||
add_subdirectory(nnpack)
|
||||
endif()
|
||||
if (BUILD_OBSERVERS)
|
||||
add_subdirectory(observers)
|
||||
endif()
|
||||
if (USE_ZSTD)
|
||||
add_subdirectory(zstd)
|
||||
endif()
|
||||
|
||||
add_library(Caffe2_CPU_OBSERVER STATIC ${Caffe2_CPU_OBSERVER_SRCS})
|
||||
if (BUILD_OBSERVERS)
|
||||
add_library(Caffe2_CPU_OBSERVER STATIC ${Caffe2_CPU_OBSERVER_SRCS})
|
||||
endif()
|
||||
|
||||
# CPU source, test sources, binary sources
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} PARENT_SCOPE)
|
||||
|
9
caffe2/share/contrib/nnpack/CMakeLists.txt
Normal file
9
caffe2/share/contrib/nnpack/CMakeLists.txt
Normal file
@ -0,0 +1,9 @@
|
||||
set(Caffe2_CONTRIB_NNPACK_CPU_SRC
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/conv_op.cc"
|
||||
)
|
||||
set(Caffe2_CONTRIB_NNPACK_TEST_CPU_SRC
|
||||
"${CMAKE_CURRENT_SOURCE_DIR}/conv_op_test.cc"
|
||||
)
|
||||
|
||||
set(Caffe2_CPU_SRCS ${Caffe2_CPU_SRCS} ${Caffe2_CONTRIB_NNPACK_CPU_SRC} PARENT_SCOPE)
|
||||
set(Caffe2_CPU_TEST_SRCS ${Caffe2_CPU_TEST_SRCS} ${Caffe2_CONTRIB_NNPACK_TEST_CPU_SRC} PARENT_SCOPE)
|
@ -16,11 +16,6 @@
|
||||
|
||||
#include "caffe2/core/common.h"
|
||||
|
||||
#ifndef CAFFE2_MOBILE
|
||||
#error "Caffe2 mobile state not defined"
|
||||
#endif
|
||||
|
||||
#if CAFFE2_MOBILE
|
||||
|
||||
#include "caffe2/core/context.h"
|
||||
#include "caffe2/core/logging.h"
|
||||
@ -258,7 +253,7 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
|
||||
transformStrategy_ = nnp_convolution_transform_strategy_reuse;
|
||||
}
|
||||
} else {
|
||||
LOG(ERROR)
|
||||
LOG(WARNING)
|
||||
<< "Failed to query workspace size to precompute kernels, falling back to re-compute strategy";
|
||||
transformStrategy_ = nnp_convolution_transform_strategy_compute;
|
||||
}
|
||||
@ -406,5 +401,3 @@ bool NNPACKConvOp::RunOnDeviceWithOrderNCHW() {
|
||||
REGISTER_CPU_OPERATOR_WITH_ENGINE(Conv, NNPACK, NNPACKConvOp);
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_MOBILE
|
||||
|
@ -238,13 +238,13 @@ constexpr size_t kIters = 20;
|
||||
// TODO(#14383029) cblas_sgemm not yet implemented on limited mobile cases.
|
||||
#if !defined(CAFFE2_FB_LIMITED_MOBILE_CAPABILITY)
|
||||
|
||||
TEST(MobileNNPACK, Conv_3x3s1) {
|
||||
TEST(NNPACK, Conv_3x3s1) {
|
||||
for (int i = 0; i < kIters; ++i) {
|
||||
runConv(3, 3, 1, 1);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(MobileNNPACK, Conv_3x3s1_precompute) {
|
||||
TEST(NNPACK, Conv_3x3s1_precompute) {
|
||||
for (int i = 0; i < kIters; ++i) {
|
||||
int group = randInt(1, 2);
|
||||
runConv(
|
||||
@ -261,13 +261,13 @@ TEST(MobileNNPACK, Conv_3x3s1_precompute) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(MobileNNPACK, Conv_3x3s1_FP16) {
|
||||
TEST(NNPACK, Conv_3x3s1_FP16) {
|
||||
for (int i = 0; i < kIters; ++i) {
|
||||
runConv(3, 3, 1, 1, 1, "WINOGRAD_FP16");
|
||||
}
|
||||
}
|
||||
|
||||
TEST(MobileNNPACK, Conv_3x3s1_FP16_precompute) {
|
||||
TEST(NNPACK, Conv_3x3s1_FP16_precompute) {
|
||||
for (int i = 0; i < kIters; ++i) {
|
||||
int group = randInt(1, 2);
|
||||
runConv(
|
||||
@ -284,14 +284,14 @@ TEST(MobileNNPACK, Conv_3x3s1_FP16_precompute) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(MobileNNPACK, Conv_NxNs1) {
|
||||
TEST(NNPACK, Conv_NxNs1) {
|
||||
for (int i = 0; i < kIters; ++i) {
|
||||
int kernel = randInt(2, 10);
|
||||
runConv(kernel, kernel, 1, 1);
|
||||
}
|
||||
}
|
||||
|
||||
TEST(MobileNNPACK, Conv_1x1s1) {
|
||||
TEST(NNPACK, Conv_1x1s1) {
|
||||
for (int i = 0; i < kIters; ++i) {
|
||||
auto group = randInt(1, 3);
|
||||
auto inChannels = randInt(1, 8) * group;
|
||||
@ -301,7 +301,7 @@ TEST(MobileNNPACK, Conv_1x1s1) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(MobileNNPACK, Conv_1x1s1_precompute) {
|
||||
TEST(NNPACK, Conv_1x1s1_precompute) {
|
||||
for (int i = 0; i < kIters; ++i) {
|
||||
auto group = randInt(1, 3);
|
||||
auto inChannels = randInt(1, 8) * group;
|
||||
@ -312,7 +312,7 @@ TEST(MobileNNPACK, Conv_1x1s1_precompute) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(MobileNNPACK, Conv_NxNs_grouped) {
|
||||
TEST(NNPACK, Conv_NxNs_grouped) {
|
||||
for (int i = 0; i < kIters; ++i) {
|
||||
int group = randInt(2, 3);
|
||||
int iC = randInt(1, 6) * group;
|
||||
@ -323,7 +323,7 @@ TEST(MobileNNPACK, Conv_NxNs_grouped) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(MobileNNPACK, Conv_NxNs_grouped_precompute) {
|
||||
TEST(NNPACK, Conv_NxNs_grouped_precompute) {
|
||||
for (int i = 0; i < kIters; ++i) {
|
||||
int group = randInt(2, 3);
|
||||
int iC = randInt(1, 6) * group;
|
||||
@ -334,7 +334,7 @@ TEST(MobileNNPACK, Conv_NxNs_grouped_precompute) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(MobileNNPACK, Conv_NxNsW) {
|
||||
TEST(NNPACK, Conv_NxNsW) {
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
int kernel = randInt(3, 5);
|
||||
int stride = randInt(1, kernel - 1);
|
||||
@ -342,7 +342,7 @@ TEST(MobileNNPACK, Conv_NxNsW) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(MobileNNPACK, Conv_HxWsHxW) {
|
||||
TEST(NNPACK, Conv_HxWsHxW) {
|
||||
for (int i = 0; i < 3; ++i) {
|
||||
int kernelH = randInt(2, 5);
|
||||
int kernelW = randInt(2, 5);
|
||||
@ -352,7 +352,7 @@ TEST(MobileNNPACK, Conv_HxWsHxW) {
|
||||
}
|
||||
}
|
||||
|
||||
TEST(MobileNNPACK, Depthwise3x3Conv) {
|
||||
TEST(NNPACK, Depthwise3x3Conv) {
|
||||
for (int i = 0; i < kIters; ++i) {
|
||||
int channel = 2;
|
||||
runConv(
|
||||
|
@ -21,12 +21,12 @@ exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp})
|
||||
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${Caffe2_GPU_SRCS})
|
||||
|
||||
# ---[ threadpool/pthreadpool* is a local modification of the NNPACK
|
||||
# pthreadpool with a very similar interface. We used it internally to
|
||||
# wait on such changes to be merged to the upstream pthreadpool repo.
|
||||
# For the open source version, we do not need to use these files as we
|
||||
# will directly link nnpack pthreadpool.
|
||||
file(GLOB_RECURSE tmp pthreadpool*)
|
||||
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp})
|
||||
# pthreadpool with a very similar interface. Neither NNPACK, nor this
|
||||
# thread pool supports Windows.
|
||||
if (MSVC)
|
||||
file(GLOB_RECURSE tmp pthreadpool*)
|
||||
exclude(Caffe2_CPU_SRCS "${Caffe2_CPU_SRCS}" ${tmp})
|
||||
endif()
|
||||
|
||||
# ---[ GPU test files
|
||||
file(GLOB_RECURSE tmp *_gpu_test.cc)
|
||||
|
@ -31,7 +31,6 @@ CAFFE2_DEFINE_int(caffe2_threadpool_android_cap, true, "");
|
||||
// Whether or not threadpool caps apply to iOS
|
||||
CAFFE2_DEFINE_int(caffe2_threadpool_ios_cap, false, "");
|
||||
|
||||
#if CAFFE2_THREADPOOL_MOBILE
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
@ -62,8 +61,6 @@ std::unique_ptr<ThreadPool> ThreadPool::defaultThreadPool() {
|
||||
applyCap = caffe2::FLAGS_caffe2_threadpool_android_cap;
|
||||
#elif CAFFE2_IOS
|
||||
applyCap = caffe2::FLAGS_caffe2_threadpool_ios_cap;
|
||||
#else
|
||||
#error Undefined architecture
|
||||
#endif
|
||||
|
||||
if (applyCap) {
|
||||
@ -185,5 +182,3 @@ void ThreadPool::run(const std::function<void(int, size_t)>& fn, size_t range) {
|
||||
}
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_THREADPOOL_MOBILE
|
||||
|
@ -19,13 +19,7 @@
|
||||
|
||||
#include "ThreadPoolCommon.h"
|
||||
|
||||
#ifndef CAFFE2_THREADPOOL_MOBILE
|
||||
#error "mobile build state not defined"
|
||||
#endif
|
||||
|
||||
// ThreadPool only used in mobile builds at the moment
|
||||
#if CAFFE2_THREADPOOL_MOBILE
|
||||
|
||||
#include <functional>
|
||||
#include <memory>
|
||||
#include <mutex>
|
||||
#include <vector>
|
||||
@ -68,6 +62,4 @@ private:
|
||||
|
||||
} // namespace caffe2
|
||||
|
||||
#endif // CAFFE2_THREADPOOL_MOBILE
|
||||
|
||||
#endif // CAFFE2_UTILS_THREADPOOL_H_
|
||||
|
@ -26,20 +26,14 @@
|
||||
|
||||
// This is copied from core/common.h's definition of CAFFE2_MOBILE
|
||||
// Define enabled when building for iOS or Android devices
|
||||
#if !defined(CAFFE2_THREADPOOL_MOBILE)
|
||||
#if defined(__ANDROID__)
|
||||
#define CAFFE2_ANDROID 1
|
||||
#define CAFFE2_THREADPOOL_MOBILE 1
|
||||
#elif (defined(__APPLE__) && \
|
||||
(TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE))
|
||||
#define CAFFE2_IOS 1
|
||||
#define CAFFE2_THREADPOOL_MOBILE 1
|
||||
#elif (defined(__APPLE__) && TARGET_OS_MAC)
|
||||
#define CAFFE2_IOS 1
|
||||
#define CAFFE2_THREADPOOL_MOBILE 1
|
||||
#else
|
||||
#define CAFFE2_THREADPOOL_MOBILE 0
|
||||
#endif // ANDROID / IOS / MACOS
|
||||
#endif // CAFFE2_THREADPOOL_MOBILE
|
||||
|
||||
#endif // CAFFE2_UTILS_THREADPOOL_COMMON_H_
|
||||
|
@ -30,7 +30,6 @@
|
||||
#include "caffe2/utils/fixed_divisor.h"
|
||||
#include "caffe2/utils/threadpool/pthreadpool.h"
|
||||
|
||||
#if CAFFE2_THREADPOOL_MOBILE
|
||||
|
||||
static inline size_t divide_round_up(size_t dividend, size_t divisor) {
|
||||
if (dividend % divisor == 0) {
|
||||
@ -181,5 +180,3 @@ void pthreadpool_compute_2d_tiled(
|
||||
pthreadpool_compute_1d(threadpool, (pthreadpool_function_1d_t) compute_2d_tiled, &context, tile_range_i * tile_range_j);
|
||||
}
|
||||
}
|
||||
|
||||
#endif // CAFFE2_THREADPOOL_MOBILE
|
||||
|
@ -21,12 +21,6 @@
|
||||
|
||||
#include "ThreadPoolCommon.h"
|
||||
|
||||
#ifndef CAFFE2_THREADPOOL_MOBILE
|
||||
#error "mobile build state not defined"
|
||||
#endif
|
||||
|
||||
// ThreadPool only used in mobile builds at the moment
|
||||
#if CAFFE2_THREADPOOL_MOBILE
|
||||
|
||||
#include <stddef.h> // for size_t
|
||||
|
||||
@ -122,6 +116,4 @@ void pthreadpool_destroy(pthreadpool_t threadpool);
|
||||
} /* extern "C" */
|
||||
#endif
|
||||
|
||||
#endif // CAFFE2_THREADPOOL_MOBILE
|
||||
|
||||
#endif // CAFFE2_UTILS_PTHREADPOOL_H_
|
||||
|
@ -18,7 +18,6 @@
|
||||
#include "caffe2/utils/threadpool/pthreadpool_impl.h"
|
||||
#include "caffe2/utils/threadpool/ThreadPool.h"
|
||||
|
||||
#if CAFFE2_THREADPOOL_MOBILE
|
||||
|
||||
//
|
||||
// External API
|
||||
@ -38,5 +37,3 @@ void pthreadpool_compute_1d(struct pthreadpool* threadpool,
|
||||
size_t pthreadpool_get_threads_count(struct pthreadpool* threadpool) {
|
||||
return threadpool->pool_->getNumThreads();
|
||||
}
|
||||
|
||||
#endif // CAFFE2_THREADPOOL_MOBILE
|
||||
|
@ -19,11 +19,6 @@
|
||||
|
||||
#include "ThreadPoolCommon.h"
|
||||
|
||||
#ifndef CAFFE2_THREADPOOL_MOBILE
|
||||
#error "mobile build state not defined"
|
||||
#endif
|
||||
|
||||
#if CAFFE2_THREADPOOL_MOBILE
|
||||
|
||||
namespace caffe2 {
|
||||
|
||||
@ -41,6 +36,4 @@ struct pthreadpool {
|
||||
|
||||
} // extern "C"
|
||||
|
||||
#endif // CAFFE2_THREADPOOL_MOBILE
|
||||
|
||||
#endif // CAFFE2_UTILS_PTHREADPOOL_IMPL_H_
|
||||
|
4
cmake/External/nnpack.cmake
vendored
4
cmake/External/nnpack.cmake
vendored
@ -76,9 +76,7 @@ if (ANDROID OR IOS OR ${CMAKE_SYSTEM_NAME} STREQUAL "Linux" OR ${CMAKE_SYSTEM_NA
|
||||
set(NNPACK_INCLUDE_DIRS
|
||||
$<TARGET_PROPERTY:nnpack,INCLUDE_DIRECTORIES>
|
||||
$<TARGET_PROPERTY:pthreadpool,INCLUDE_DIRECTORIES>)
|
||||
set(NNPACK_LIBRARIES
|
||||
$<TARGET_FILE:nnpack>
|
||||
$<TARGET_FILE:pthreadpool>)
|
||||
set(NNPACK_LIBRARIES $<TARGET_FILE:nnpack>)
|
||||
return()
|
||||
endif()
|
||||
|
||||
|
Reference in New Issue
Block a user