Temporarily remove TBB (#8255)

This commit is contained in:
cpuhrsch
2018-06-18 19:31:57 -04:00
committed by GitHub
parent 4f37a6481d
commit 05c473b85c
14 changed files with 73 additions and 547 deletions

4
.gitmodules vendored
View File

@ -1,7 +1,3 @@
[submodule "third_party/tbb"]
path = third_party/tbb
url = https://github.com/01org/tbb
branch = tbb_2018
[submodule "third_party/catch"] [submodule "third_party/catch"]
path = third_party/catch path = third_party/catch
url = https://github.com/catchorg/Catch2.git url = https://github.com/catchorg/Catch2.git

View File

@ -89,24 +89,6 @@ IF(NOT AT_LINK_STYLE)
SET(AT_LINK_STYLE SHARED) SET(AT_LINK_STYLE SHARED)
ENDIF() ENDIF()
# Unset our restrictive C++ flags here and reset them later.
# Remove this once we use proper target_compile_options.
set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS})
set(CMAKE_CXX_FLAGS)
set(TBB_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/tbb")
set(TBB_BUILD_STATIC ON CACHE BOOL " " FORCE)
set(TBB_BUILD_SHARED OFF CACHE BOOL " " FORCE)
set(TBB_BUILD_TBBMALLOC OFF CACHE BOOL " " FORCE)
set(TBB_BUILD_TBBMALLOC_PROXY OFF CACHE BOOL " " FORCE)
set(TBB_BUILD_TESTS OFF CACHE BOOL " " FORCE)
add_subdirectory(cpu/tbb)
set_property(TARGET tbb_static tbb_def_files PROPERTY FOLDER "dependencies")
list(APPEND ATen_THIRD_PARTY_INCLUDE ${TBB_ROOT_DIR}/include)
list(APPEND ATen_CPU_DEPENDENCY_LIBS tbb_static)
set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS})
IF(BLAS_FOUND) IF(BLAS_FOUND)
IF ($ENV{TH_BINARY_BUILD}) IF ($ENV{TH_BINARY_BUILD})
MESSAGE(STATUS "TH_BINARY_BUILD detected. Enabling special linkage.") MESSAGE(STATUS "TH_BINARY_BUILD detected. Enabling special linkage.")

View File

@ -149,7 +149,6 @@ inline bool _apply_preamble(ArrayRef<Tensor> tensors) {
for (auto& t : tensors) for (auto& t : tensors)
if (t.sizes().equals({0})) if (t.sizes().equals({0}))
return false; return false;
internal::init_tbb_num_threads();
return true; return true;
} }
@ -351,7 +350,7 @@ template <typename scalar1, typename Op>
inline void CPU_tensor_parallel_apply1( inline void CPU_tensor_parallel_apply1(
Tensor tensor1, Tensor tensor1,
const Op op, const Op op,
int64_t grain_size = internal::TBB_GRAIN_SIZE) { int64_t grain_size = internal::GRAIN_SIZE) {
if (!_apply_preamble({tensor1})) if (!_apply_preamble({tensor1}))
return; return;
if (tensor1.ndimension() < 8) { if (tensor1.ndimension() < 8) {
@ -383,7 +382,7 @@ inline void CPU_tensor_parallel_apply2(
Tensor tensor1, Tensor tensor1,
Tensor tensor2, Tensor tensor2,
const Op op, const Op op,
int64_t grain_size = internal::TBB_GRAIN_SIZE) { int64_t grain_size = internal::GRAIN_SIZE) {
if (!_apply_preamble({tensor1, tensor2})) if (!_apply_preamble({tensor1, tensor2}))
return; return;
if (tensor1.ndimension() < 8 && tensor2.ndimension() < 8) { if (tensor1.ndimension() < 8 && tensor2.ndimension() < 8) {

View File

@ -1,56 +0,0 @@
#include <ATen/CPUGeneral.h>
#include <ATen/Parallel.h>
#include <tbb/blocked_range.h>
#include <tbb/parallel_reduce.h>
#include <tbb/partitioner.h>
#include <tbb/tbb.h>
#include <cassert>
#include <thread>
namespace at { namespace internal {
// thread_local variable with internal linkage
// requires no guarding as it's storage duration is defined to be per thread
static thread_local tbb::task_scheduler_init tbbinit(
tbb::task_scheduler_init::deferred);
// Tracks number of threads uses which TBB doesn't track.
static thread_local int num_threads_ = -1;
// Negative number of threads means default value
void init_tbb_num_threads() {
static thread_local bool first_call = true;
int num_threads = at::get_num_threads();
// In order to have control over the number of threads this function
// must be called first before any other tbb parallel construct is
// excercised within a particular thread. Otherwise the default
// scheduler will be created over which we do not have control.
// The following code will and must throw an error if tbb has
// already been initialized before this function was called.
if (!tbbinit.is_active() && !first_call)
throw std::runtime_error(
"tbb initialization failed: scheduler not active after first call");
if (first_call) {
if (tbbinit.is_active())
throw std::runtime_error(
"tbb initialization failed: scheduler active on first call");
if (num_threads < 0) {
int max_threads = tbbinit.default_num_threads();
tbbinit.initialize(max_threads);
} else {
tbbinit.initialize(num_threads);
}
first_call = false;
}
if (num_threads == 0) {
// TODO: For PyTorch 0 means 1
num_threads = 1;
}
if (num_threads > 0 && (num_threads_ != num_threads)) {
tbbinit.terminate();
tbbinit.initialize(num_threads);
num_threads_ = num_threads;
}
}
} // namespace internal
} // namespace at

View File

@ -1,54 +1,59 @@
#pragma once #pragma once
#include <ATen/ATen.h> #include <ATen/ATen.h>
#include <cstddef> #include <cstddef>
#include <tbb/tbb.h>
#ifdef _OPENMP
#include <omp.h>
#endif
namespace at { namespace at {
namespace internal { namespace internal {
// This needs to be called before the first use of any algorithm such as
// parallel or it will have no effect and the default task scheduler is
// created which uses all available cores.
// See
// https://www.threadingbuildingblocks.org/docs/help/reference/task_scheduler/task_scheduler_init_cls.html
// This does not initializes the number of workers in the market (the overall
// of workers available to a process). It is merely a request to the market
// for a certain number of workers. If there are multiple threads making
// a request at the size of the maximum number of threads, they will
// be allocated a number proportional to the other requests.
AT_API void init_tbb_num_threads();
// This parameter is heuristically chosen to determine the minimum amount of // This parameter is heuristically chosen to determine the minimum amount of
// work that warrants parallelism. For example, when summing an array, it is // work that warrants parallelism. For example, when summing an array, it is
// deemed inefficient to parallelise over arrays shorter than 32768. Further, // deemed inefficient to parallelise over arrays shorter than 32768. Further,
// no parallel algorithm (such as parallel_reduce) should split work into // no parallel algorithm (such as parallel_reduce) should split work into
// smaller than GRAIN_SIZE chunks. // smaller than GRAIN_SIZE chunks.
constexpr int64_t TBB_GRAIN_SIZE = 32768; constexpr int64_t GRAIN_SIZE = 32768;
} // namespace internal } // namespace internal
// Integer division helper: evaluates (x + y - 1) / y, which is x / y rounded
// up to the next whole number whenever x >= 0 and y > 0.
inline int64_t divup(int64_t x, int64_t y) {
  const int64_t biased_numerator = x + y - 1;
  return biased_numerator / y;
}
template <class F> template <class F>
inline void parallel_for( inline void parallel_for(
int64_t begin, const int64_t begin,
int64_t end, const int64_t end,
int64_t grain_size, const int64_t grain_size_,
const F& f) { const F f) {
internal::init_tbb_num_threads(); const int64_t min_grain_size = divup((end - begin), get_num_threads());
const int64_t grain_size = std::max(min_grain_size, grain_size_);
#ifdef __PPC64__ #pragma omp parallel for if ((end - begin) >= grain_size && get_num_threads() > 1)
using default_partitioner_type = tbb::simple_partitioner; for (int64_t i = begin; i < end; i += grain_size) {
#else f(i, i + std::min(end - i, grain_size));
using default_partitioner_type = tbb::affinity_partitioner;
#endif
thread_local static default_partitioner_type ap;
if ((end - begin) < grain_size || get_num_threads() == 1) {
f(begin, end);
} else {
tbb::parallel_for(
tbb::blocked_range<int64_t>(begin, end, grain_size),
[f](const tbb::blocked_range<int64_t>& r) { f(r.begin(), r.end()); },
ap);
} }
} }
// Reduces the half-open index range [begin, end) in chunks.
//
// The range is cut into contiguous chunks, `f` is applied to every chunk
// (under an OpenMP `parallel for` when the range is at least one chunk long
// and more than one thread is available), and the per-chunk partial results
// are then folded sequentially with `sf`.
//
//   begin, end   bounds of the range to reduce
//   grain_size_  requested minimum chunk length; the effective chunk length is
//                the larger of this and ceil((end - begin) / num_threads)
//   ident        value passed to every call of `f` as its initial value and
//                used as the seed of the final fold
//   f            callable f(chunk_begin, chunk_end, ident) -> partial result
//   sf           binary callable combining two partial results
//
// Returns `ident` when the range is empty or inverted (begin >= end).
template <class scalar_t, class F, class SF>
inline scalar_t parallel_reduce(
    const int64_t begin,
    const int64_t end,
    const int64_t grain_size_,
    const scalar_t ident,
    const F f,
    const SF sf) {
  // Nothing to reduce. Bailing out here also keeps the chunk arithmetic below
  // away from a zero divisor (empty range with grain_size_ <= 0) and from a
  // negative chunk count (end < begin), which would otherwise be turned into
  // an enormous vector size.
  if (begin >= end) {
    return ident;
  }

  // NOTE(review): get_num_threads() is defined outside this block; a value
  // below 1 would make the division in divup() invalid, so it is treated as a
  // single thread here -- confirm that this matches the intended meaning.
  const int64_t num_threads = std::max<int64_t>(get_num_threads(), 1);

  // Never produce more chunks than there are threads.
  const int64_t min_grain_size = divup(end - begin, num_threads);
  const int64_t grain_size = std::max(min_grain_size, grain_size_);
  const int64_t num_results = divup(end - begin, grain_size);

  // One slot per chunk; each loop iteration writes only its own slot.
  std::vector<scalar_t> results(num_results);
  scalar_t* results_data = results.data();
#pragma omp parallel for if ((end - begin) >= grain_size && num_threads > 1)
  for (int64_t id = 0; id < num_results; id++) {
    const int64_t i = begin + id * grain_size;
    // The last chunk may be shorter than grain_size.
    results_data[id] = f(i, i + std::min(end - i, grain_size), ident);
  }
  return std::accumulate(
      results_data, results_data + results.size(), ident, sf);
}
} // namespace at } // namespace at

View File

@ -1,376 +0,0 @@
# Based on https://github.com/wjakob/tbb/blob/master/CMakeLists.txt
# All credit goes to Wenzel Jakob!
cmake_minimum_required (VERSION 2.8.12 FATAL_ERROR)
project (tbb CXX)
include(CheckCXXCompilerFlag)
include(CheckCXXSourceRuns)
if(POLICY CMP0058)
cmake_policy(SET CMP0058 NEW)
endif()
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
message(STATUS "Setting build type to 'Release' as none was specified.")
set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release"
"MinSizeRel" "RelWithDebInfo")
endif()
if(NOT TBB_ROOT_DIR)
set(TBB_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}")
endif()
if(NOT TBB_INSTALL_RUNTIME_DIR)
set(TBB_INSTALL_RUNTIME_DIR bin)
endif()
if(NOT TBB_INSTALL_LIBRARY_DIR)
set(TBB_INSTALL_LIBRARY_DIR lib)
endif()
if(NOT TBB_INSTALL_ARCHIVE_DIR)
set(TBB_INSTALL_ARCHIVE_DIR lib)
endif()
if(NOT TBB_INSTALL_INCLUDE_DIR)
set(TBB_INSTALL_INCLUDE_DIR "${TBB_ROOT_DIR}/include")
endif()
set(TBB_INCLUDES
"${TBB_ROOT_DIR}/include"
"${TBB_ROOT_DIR}/src"
"${TBB_ROOT_DIR}/src/rml/include"
${CMAKE_CURRENT_BINARY_DIR})
option(TBB_BUILD_SHARED "Build TBB shared library" ON)
option(TBB_BUILD_STATIC "Build TBB static library" ON)
option(TBB_BUILD_TBBMALLOC "Build TBB malloc library" ON)
option(TBB_BUILD_TBBMALLOC_PROXY "Build TBB malloc proxy library" ON)
option(TBB_BUILD_TESTS "Build TBB tests and enable testing infrastructure" ON)
option(TBB_CI_BUILD "Is this a continuous integration build?" OFF)
if(APPLE)
set(CMAKE_MACOSX_RPATH ON)
endif()
file(GLOB tbb_src "${TBB_ROOT_DIR}/src/tbb/*.cpp" "${TBB_ROOT_DIR}/src/old/*.cpp")
list(APPEND tbb_src ${TBB_ROOT_DIR}/src/rml/client/rml_tbb.cpp)
file(GLOB to_remove "${TBB_ROOT_DIR}/src/old/test*.cpp")
if (NOT "${to_remove}" STREQUAL "")
list(REMOVE_ITEM tbb_src ${to_remove})
endif()
set(tbbmalloc_static_src
src/tbbmalloc/backend.cpp
src/tbbmalloc/large_objects.cpp
src/tbbmalloc/backref.cpp
src/tbbmalloc/tbbmalloc.cpp
src/tbbmalloc/frontend.cpp
src/tbb/itt_notify.cpp)
set(tbbmalloc_src ${tbbmalloc_static_src})
set(tbbmalloc_proxy_src
src/tbbmalloc/proxy.cpp
src/tbbmalloc/tbb_function_replacement.cpp)
if (CMAKE_SYSTEM_PROCESSOR MATCHES "(i386|x86_64)")
if (NOT APPLE AND NOT MINGW)
add_definitions(-DDO_ITT_NOTIFY)
endif()
endif()
if (APPLE)
# Disable annoying "has no symbols" warnings
set(CMAKE_C_ARCHIVE_CREATE "<CMAKE_AR> Scr <TARGET> <LINK_FLAGS> <OBJECTS>")
set(CMAKE_CXX_ARCHIVE_CREATE "<CMAKE_AR> Scr <TARGET> <LINK_FLAGS> <OBJECTS>")
set(CMAKE_C_ARCHIVE_FINISH "<CMAKE_RANLIB> -no_warning_for_no_symbols -c <TARGET>")
set(CMAKE_CXX_ARCHIVE_FINISH "<CMAKE_RANLIB> -no_warning_for_no_symbols -c <TARGET>")
endif()
macro(CHECK_CXX_COMPILER_AND_LINKER_FLAGS _RESULT _CXX_FLAGS _LINKER_FLAGS)
set(CMAKE_REQUIRED_FLAGS ${_CXX_FLAGS})
set(CMAKE_REQUIRED_LIBRARIES ${_LINKER_FLAGS})
set(CMAKE_REQUIRED_QUIET TRUE)
check_cxx_source_runs("#include <iostream>\nint main(int argc, char **argv) { std::cout << \"test\"; return 0; }" ${_RESULT})
set(CMAKE_REQUIRED_FLAGS "")
set(CMAKE_REQUIRED_LIBRARIES "")
endmacro()
# Prefer libc++ in conjunction with Clang
if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
if (CMAKE_CXX_FLAGS MATCHES "-stdlib=libc\\+\\+")
message(STATUS "TBB: using libc++.")
else()
CHECK_CXX_COMPILER_AND_LINKER_FLAGS(HAS_LIBCPP "-stdlib=libc++" "-stdlib=libc++")
if (HAS_LIBCPP)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -stdlib=libc++ -D_LIBCPP_VERSION")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -stdlib=libc++")
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -stdlib=libc++")
message(STATUS "TBB: using libc++.")
else()
message(STATUS "TBB: NOT using libc++.")
endif()
endif()
endif()
if (UNIX)
add_definitions (-DUSE_PTHREAD)
check_cxx_compiler_flag ("-std=c++11" SUPPORTS_STDCXX11)
if (SUPPORTS_STDCXX11)
set (CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}")
endif ()
check_cxx_compiler_flag ("-mrtm -Werror" SUPPORTS_MRTM)
if (SUPPORTS_MRTM)
set (CMAKE_CXX_FLAGS "-mrtm ${CMAKE_CXX_FLAGS}")
endif ()
elseif(WIN32)
if (MSVC)
cmake_minimum_required (VERSION 3.1)
enable_language(ASM_MASM)
set(CMAKE_CXX_FLAGS "/GS- /Zc:wchar_t /Zc:forScope /DUSE_WINTHREAD ${CMAKE_CXX_FLAGS}")
set(CMAKE_CXX_FLAGS "/D_CRT_SECURE_NO_DEPRECATE /D_WIN32_WINNT=0x0600 ${CMAKE_CXX_FLAGS}")
check_cxx_compiler_flag ("/volatile:iso" SUPPORTS_VOLATILE_FLAG)
if (SUPPORTS_VOLATILE_FLAG)
set(CMAKE_CXX_FLAGS "/volatile:iso ${CMAKE_CXX_FLAGS}")
endif ()
set(CMAKE_CXX_FLAGS "/wd4267 /wd4800 /wd4146 /wd4244 /wd4577 /wd4018 ${CMAKE_CXX_FLAGS}")
if (NOT CMAKE_SIZEOF_VOID_P)
message(FATAL_ERROR "'CMAKE_SIZEOF_VOID_P' is undefined. Please delete your build directory and rerun CMake again!")
endif()
if (CMAKE_SIZEOF_VOID_P EQUAL 8)
list(APPEND tbb_src "${TBB_ROOT_DIR}/src/tbb/intel64-masm/atomic_support.asm")
list(APPEND tbb_src "${TBB_ROOT_DIR}/src/tbb/intel64-masm/itsx.asm")
list(APPEND tbb_src "${TBB_ROOT_DIR}/src/tbb/intel64-masm/intel64_misc.asm")
list(APPEND tbbmalloc_src "${TBB_ROOT_DIR}/src/tbb/intel64-masm/atomic_support.asm")
set(CMAKE_ASM_MASM_FLAGS "/DEM64T=1 ${CMAKE_ASM_MASM_FLAGS}")
else()
list(APPEND tbb_src "${TBB_ROOT_DIR}/src/tbb/ia32-masm/atomic_support.asm"
"${TBB_ROOT_DIR}/src/tbb/ia32-masm/itsx.asm src/tbb/ia32-masm/lock_byte.asm")
# Enable SAFESEH feature for assembly (x86 builds only).
set(CMAKE_ASM_MASM_FLAGS "/safeseh ${CMAKE_ASM_MASM_FLAGS}")
endif()
elseif (MINGW)
add_definitions(-DUSE_WINTHREAD)
add_definitions(-D_WIN32_WINNT=0x0502)
set(CMAKE_CXX_FLAGS "-mthreads ${CMAKE_CXX_FLAGS}")
endif ()
endif()
if (MSVC)
set(ENABLE_RTTI "/EHsc /GR ")
set(DISABLE_RTTI "/EHs- /GR- ")
elseif (UNIX)
set(ENABLE_RTTI "-frtti -fexceptions ")
set(DISABLE_RTTI "-fno-rtti -fno-exceptions ")
endif ()
##--------
# - Added TBB_USE_GLIBCXX_VERSION macro to specify the version of GNU
# libstdc++ when it cannot be properly recognized, e.g. when used
# with Clang on Linux* OS. Inspired by a contribution from David A.
if (NOT TBB_USE_GLIBCXX_VERSION AND UNIX AND NOT APPLE)
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
# using Clang
string(REPLACE "." "0" TBB_USE_GLIBCXX_VERSION ${CMAKE_CXX_COMPILER_VERSION})
endif()
endif()
if (TBB_USE_GLIBCXX_VERSION)
add_definitions(-DTBB_USE_GLIBCXX_VERSION=${TBB_USE_GLIBCXX_VERSION})
endif()
##-------
if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
check_cxx_compiler_flag ("-flifetime-dse=1" SUPPORTS_FLIFETIME)
if (SUPPORTS_FLIFETIME)
add_definitions(-flifetime-dse=1)
endif()
endif()
# Linker export definitions
if (APPLE)
set (ARCH_PREFIX "mac")
elseif(WIN32)
set (ARCH_PREFIX "win")
else()
set (ARCH_PREFIX "lin")
endif()
if (CMAKE_SIZEOF_VOID_P EQUAL 8)
set(ARCH_PREFIX "${ARCH_PREFIX}64")
else()
set(ARCH_PREFIX "${ARCH_PREFIX}32")
endif()
if (MINGW)
set (ARCH_PREFIX "${ARCH_PREFIX}-gcc")
# there's no win32-gcc-tbb-export.def, use lin32-tbb-export.def
execute_process (COMMAND ${CMAKE_COMMAND} -E copy ${TBB_ROOT_DIR}/src/tbb/lin32-tbb-export.def ${TBB_ROOT_DIR}/src/tbb/win32-gcc-tbb-export.def)
endif()
if (MSVC)
add_custom_command(OUTPUT tbb.def
COMMAND ${CMAKE_CXX_COMPILER} /TC /EP ${TBB_ROOT_DIR}/src/tbb/${ARCH_PREFIX}-tbb-export.def -I ${TBB_ROOT_DIR}/include > tbb.def
MAIN_DEPENDENCY ${TBB_ROOT_DIR}/src/tbb/${ARCH_PREFIX}-tbb-export.def
COMMENT "Preprocessing tbb.def"
)
add_custom_command(OUTPUT tbbmalloc.def
COMMAND ${CMAKE_CXX_COMPILER} /TC /EP ${TBB_ROOT_DIR}/src/tbbmalloc/${ARCH_PREFIX}-tbbmalloc-export.def -I ${TBB_ROOT_DIR}/include > tbbmalloc.def
MAIN_DEPENDENCY ${TBB_ROOT_DIR}/src/tbbmalloc/${ARCH_PREFIX}-tbbmalloc-export.def
COMMENT "Preprocessing tbbmalloc.def"
)
else()
add_custom_command(OUTPUT tbb.def
COMMAND ${CMAKE_CXX_COMPILER} -xc++ -E ${TBB_ROOT_DIR}/src/tbb/${ARCH_PREFIX}-tbb-export.def -I ${TBB_ROOT_DIR}/include -o tbb.def
MAIN_DEPENDENCY ${TBB_ROOT_DIR}/src/tbb/${ARCH_PREFIX}-tbb-export.def
COMMENT "Preprocessing tbb.def"
)
add_custom_command(OUTPUT tbbmalloc.def
COMMAND ${CMAKE_CXX_COMPILER} -xc++ -E ${TBB_ROOT_DIR}/src/tbbmalloc/${ARCH_PREFIX}-tbbmalloc-export.def -I ${TBB_ROOT_DIR}/include -o tbbmalloc.def
MAIN_DEPENDENCY ${TBB_ROOT_DIR}/src/tbbmalloc/${ARCH_PREFIX}-tbbmalloc-export.def
COMMENT "Preprocessing tbbmalloc.def"
)
endif()
add_custom_target(tbb_def_files DEPENDS tbb.def tbbmalloc.def)
# TBB library
if (TBB_BUILD_STATIC)
add_library(tbb_static STATIC ${tbb_src})
target_include_directories(tbb_static PRIVATE ${TBB_INCLUDES})
set_property(TARGET tbb_static APPEND PROPERTY COMPILE_DEFINITIONS "__TBB_BUILD=1")
set_property(TARGET tbb_static APPEND_STRING PROPERTY COMPILE_FLAGS ${ENABLE_RTTI})
install(TARGETS tbb_static ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR})
if (MSVC)
target_compile_definitions(tbb_static PUBLIC __TBB_NO_IMPLICIT_LINKAGE=1)
endif()
if (UNIX AND NOT APPLE)
target_link_libraries(tbb_static PUBLIC pthread dl)
endif()
endif()
if (TBB_BUILD_SHARED)
add_library(tbb SHARED ${tbb_src})
target_include_directories(tbb PRIVATE ${TBB_INCLUDES})
set_property(TARGET tbb APPEND PROPERTY COMPILE_DEFINITIONS "__TBB_BUILD=1")
set_property(TARGET tbb APPEND_STRING PROPERTY COMPILE_FLAGS ${ENABLE_RTTI})
add_dependencies(tbb tbb_def_files)
if (APPLE)
set_property(TARGET tbb APPEND PROPERTY LINK_FLAGS "-Wl,-exported_symbols_list,\"${CMAKE_CURRENT_BINARY_DIR}/tbb.def\"")
elseif (MSVC)
set_property(TARGET tbb APPEND PROPERTY LINK_FLAGS "/DEF:\"${CMAKE_CURRENT_BINARY_DIR}/tbb.def\"")
else ()
set_property(TARGET tbb APPEND PROPERTY LINK_FLAGS "-Wl,-version-script,\"${CMAKE_CURRENT_BINARY_DIR}/tbb.def\"")
endif()
install(TARGETS tbb
LIBRARY DESTINATION ${TBB_INSTALL_LIBRARY_DIR}
ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR}
RUNTIME DESTINATION ${TBB_INSTALL_RUNTIME_DIR})
if (UNIX AND NOT APPLE)
target_link_libraries(tbb PUBLIC pthread dl)
endif()
if (MSVC)
target_compile_definitions(tbb PUBLIC __TBB_NO_IMPLICIT_LINKAGE=1)
endif()
endif()
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
# Quench a warning on GCC
set_source_files_properties(${TBB_ROOT_DIR}/src/tbb/governor.cpp COMPILE_FLAGS "-Wno-missing-field-initializers ")
elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
# Quench a warning on Clang
set_source_files_properties(${TBB_ROOT_DIR}/src/tbb/itt_notify.cpp COMPILE_FLAGS "-Wno-varargs ")
elseif(MSVC)
# Quench a warning on MSVC
set_source_files_properties(${TBB_ROOT_DIR}/src/tbb/scheduler.cpp COMPILE_FLAGS "/wd4458 ")
endif()
if(TBB_BUILD_TBBMALLOC)
# TBB malloc library
if (TBB_BUILD_STATIC)
add_library(tbbmalloc_static STATIC ${tbbmalloc_static_src})
target_include_directories(tbbmalloc_static PRIVATE ${TBB_INCLUDES})
set_property(TARGET tbbmalloc_static APPEND PROPERTY COMPILE_DEFINITIONS "__TBBMALLOC_BUILD=1")
set_property(TARGET tbbmalloc_static APPEND_STRING PROPERTY COMPILE_FLAGS ${DISABLE_RTTI})
if (MSVC)
target_compile_definitions(tbbmalloc_static PUBLIC __TBB_NO_IMPLICIT_LINKAGE=1 __TBBMALLOC_NO_IMPLICIT_LINKAGE=1)
endif()
install(TARGETS tbbmalloc_static ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR})
endif()
if (TBB_BUILD_SHARED)
add_library(tbbmalloc SHARED ${tbbmalloc_src})
target_include_directories(tbbmalloc PRIVATE ${TBB_INCLUDES})
set_property(TARGET tbbmalloc APPEND PROPERTY COMPILE_DEFINITIONS "__TBBMALLOC_BUILD=1")
set_property(TARGET tbbmalloc APPEND_STRING PROPERTY COMPILE_FLAGS ${DISABLE_RTTI})
add_dependencies(tbbmalloc tbb_def_files)
if (APPLE)
set_property(TARGET tbbmalloc APPEND PROPERTY LINK_FLAGS "-Wl,-exported_symbols_list,\"${CMAKE_CURRENT_BINARY_DIR}/tbbmalloc.def\"")
elseif (MSVC)
set_property(TARGET tbbmalloc APPEND PROPERTY LINK_FLAGS "/DEF:\"${CMAKE_CURRENT_BINARY_DIR}/tbbmalloc.def\"")
else ()
set_property(TARGET tbbmalloc APPEND PROPERTY LINK_FLAGS "-Wl,-version-script,\"${CMAKE_CURRENT_BINARY_DIR}/tbbmalloc.def\"")
endif()
if (MSVC)
target_compile_definitions(tbbmalloc PUBLIC __TBB_NO_IMPLICIT_LINKAGE=1 __TBBMALLOC_NO_IMPLICIT_LINKAGE=1)
endif()
install(TARGETS tbbmalloc
LIBRARY DESTINATION ${TBB_INSTALL_LIBRARY_DIR}
ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR}
RUNTIME DESTINATION ${TBB_INSTALL_RUNTIME_DIR})
if (UNIX AND NOT APPLE)
target_link_libraries(tbbmalloc PUBLIC pthread dl)
endif()
endif()
endif()
if(TBB_BUILD_TBBMALLOC_PROXY)
# TBB malloc proxy library
if (TBB_BUILD_STATIC)
add_library(tbbmalloc_proxy_static STATIC ${tbbmalloc_proxy_src})
set_property(TARGET tbbmalloc_proxy_static APPEND PROPERTY COMPILE_DEFINITIONS "__TBBMALLOC_BUILD=1")
set_property(TARGET tbbmalloc_proxy_static APPEND_STRING PROPERTY COMPILE_FLAGS ${DISABLE_RTTI})
install(TARGETS tbbmalloc_proxy_static ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR})
endif()
if (TBB_BUILD_SHARED)
add_library(tbbmalloc_proxy SHARED ${tbbmalloc_proxy_src})
set_property(TARGET tbbmalloc_proxy APPEND PROPERTY COMPILE_DEFINITIONS "__TBBMALLOC_BUILD=1")
set_property(TARGET tbbmalloc_proxy APPEND_STRING PROPERTY COMPILE_FLAGS ${DISABLE_RTTI})
target_link_libraries(tbbmalloc_proxy PUBLIC tbbmalloc)
install(TARGETS tbbmalloc_proxy
LIBRARY DESTINATION ${TBB_INSTALL_LIBRARY_DIR}
ARCHIVE DESTINATION ${TBB_INSTALL_ARCHIVE_DIR}
RUNTIME DESTINATION ${TBB_INSTALL_RUNTIME_DIR})
if (UNIX AND NOT APPLE)
target_link_libraries(tbbmalloc_proxy PUBLIC pthread dl)
endif()
endif()
endif()
install(DIRECTORY "${TBB_ROOT_DIR}/include/tbb" DESTINATION ${TBB_INSTALL_INCLUDE_DIR})
# version_string.ver
if (UNIX)
execute_process (COMMAND date "+%a, %d %b %Y %H:%M:%S %z"
OUTPUT_VARIABLE _configure_date
OUTPUT_STRIP_TRAILING_WHITESPACE)
elseif (WIN32)
execute_process (COMMAND cmd " /C date /T"
OUTPUT_VARIABLE _configure_date
OUTPUT_STRIP_TRAILING_WHITESPACE)
else ()
set (_configure_date "Unknown")
endif()
include_directories (${CMAKE_BINARY_DIR})
configure_file (extra/version_string.ver.in version_string.ver @ONLY)

View File

@ -1,11 +0,0 @@
#define __TBB_VERSION_STRINGS(N) \
#N": BUILD_HOST @CMAKE_SYSTEM_NAME@" ENDL \
#N": BUILD_OS @CMAKE_SYSTEM@" ENDL \
#N": BUILD_KERNEL @CMAKE_SYSTEM_VERSION@" ENDL \
#N": BUILD_GCC @CMAKE_CXX_COMPILER_ID@" ENDL \
#N": BUILD_LIBC Unknown" ENDL \
#N": BUILD_LD Unknown" ENDL \
#N": BUILD_TARGET Unknown" ENDL \
#N": BUILD_COMMAND Unknown" ENDL
#define __TBB_DATETIME "@_configure_date@"

View File

@ -22,7 +22,7 @@ namespace {
template <class T> template <class T>
struct Vec256 { struct Vec256 {
static constexpr int size = 32 / sizeof(T); static constexpr int size = 32 / sizeof(T);
T values[32 / sizeof(T)]; T values[32 / sizeof(T)] = {0};
Vec256() {} Vec256() {}
Vec256(T val) { Vec256(T val) {
for (int i = 0; i != size; i++) { for (int i = 0; i != size; i++) {

View File

@ -23,7 +23,7 @@ void host_softmax(Tensor output, const Tensor& input, const int64_t dim) {
int64_t outer_stride = dim_size * dim_stride; int64_t outer_stride = dim_size * dim_stride;
scalar_t* input_data_base = input.data<scalar_t>(); scalar_t* input_data_base = input.data<scalar_t>();
scalar_t* output_data_base = output.data<scalar_t>(); scalar_t* output_data_base = output.data<scalar_t>();
int64_t grain_size = std::min(internal::TBB_GRAIN_SIZE / dim_size, (int64_t)1); int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1);
parallel_for( parallel_for(
0, outer_size * inner_size, grain_size, 0, outer_size * inner_size, grain_size,
[&](int64_t begin, int64_t end) { [&](int64_t begin, int64_t end) {
@ -80,7 +80,7 @@ void host_softmax_backward(
scalar_t* gradInput_data_base = gI.data<scalar_t>(); scalar_t* gradInput_data_base = gI.data<scalar_t>();
scalar_t* output_data_base = output.data<scalar_t>(); scalar_t* output_data_base = output.data<scalar_t>();
scalar_t* gradOutput_data_base = grad.data<scalar_t>(); scalar_t* gradOutput_data_base = grad.data<scalar_t>();
int64_t grain_size = std::min(internal::TBB_GRAIN_SIZE / dim_size, (int64_t)1); int64_t grain_size = std::min(internal::GRAIN_SIZE / dim_size, (int64_t)1);
parallel_for( parallel_for(
0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) { 0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) {
for (int64_t i = begin; i < end; i++) { for (int64_t i = begin; i < end; i++) {

View File

@ -9,12 +9,6 @@
#include "ATen/cpu/vec256/vec256.h" #include "ATen/cpu/vec256/vec256.h"
#include "ATen/optional.h" #include "ATen/optional.h"
#ifdef __PPC64__
using default_partitioner_type = tbb::simple_partitioner;
#else
using default_partitioner_type = tbb::affinity_partitioner;
#endif
namespace at { namespace native { namespace { namespace at { namespace native { namespace {
using namespace vec256; using namespace vec256;
@ -23,19 +17,22 @@ static inline int64_t round_down(int64_t a, int64_t m) {
return a - (a % m); return a - (a % m);
} }
template<typename F> template <typename F>
static void parallel_for(int64_t end, int64_t step, bool parallelize, F func) { static void _parallel_for(int64_t size, int64_t step, bool parallelize, F func) {
if (parallelize) { if (parallelize) {
tbb::parallel_for<int64_t>(0, end, step, func); parallel_for(0, size / step, 1, [func, step](int64_t begin, int64_t end) {
int64_t k = begin * step;
for (int64_t i = begin; i < end; i++, k += step) {
func(k);
}
});
} else { } else {
for (int64_t i = 0; i != end; i += step) { for (int64_t i = 0; i != size; i += step) {
func(i); func(i);
} }
} }
} }
static default_partitioner_type ap;
// Vectorized reduction defined by reduce operation `Op` with identity `ident`. // Vectorized reduction defined by reduce operation `Op` with identity `ident`.
// The reduction is built on top of reduce128, which reduces down a column // The reduction is built on top of reduce128, which reduces down a column
// 128 bytes wide (WIDTH scalar elements). The width of 128 bytes is chosen // 128 bytes wide (WIDTH scalar elements). The width of 128 bytes is chosen
@ -50,8 +47,6 @@ struct Reduction {
using ReduceScalar = Op<scalar_t>; using ReduceScalar = Op<scalar_t>;
static void apply(Tensor& res, const Tensor& self, at::optional<int64_t> dim) { static void apply(Tensor& res, const Tensor& self, at::optional<int64_t> dim) {
internal::init_tbb_num_threads();
auto out = res.data<scalar_t>(); auto out = res.data<scalar_t>();
auto data = self.data<scalar_t>(); auto data = self.data<scalar_t>();
auto numel = self.numel(); auto numel = self.numel();
@ -71,8 +66,8 @@ struct Reduction {
} }
} }
int64_t batch = numel / (n * stride); int64_t batch = numel / (n * stride);
bool paralellize = batch * n > internal::TBB_GRAIN_SIZE; bool paralellize = batch * n > internal::GRAIN_SIZE;
parallel_for(batch, 1, paralellize, [=](int64_t b) { _parallel_for(batch, 1, paralellize, [=](int64_t b) {
if (stride == 1) { if (stride == 1) {
out[b] = reduce_all(&data[b * n], n); out[b] = reduce_all(&data[b * n], n);
} else { } else {
@ -84,23 +79,17 @@ struct Reduction {
static scalar_t reduce_all(const scalar_t* data, int64_t size) { static scalar_t reduce_all(const scalar_t* data, int64_t size) {
int64_t k = size / WIDTH; int64_t k = size / WIDTH;
scalar_t sum; scalar_t sum = parallel_reduce(
if (size > internal::TBB_GRAIN_SIZE) { 0,
sum = tbb::parallel_reduce( k,
tbb::blocked_range<int64_t>(0, k, internal::TBB_GRAIN_SIZE / WIDTH), internal::GRAIN_SIZE / WIDTH,
scalar_t(ident), (scalar_t)ident,
[=](const tbb::blocked_range<int64_t>& r, scalar_t init) { [data](int64_t begin, int64_t end, scalar_t init) {
scalar_t buf[WIDTH]; scalar_t buf[WIDTH];
reduce128(&data[r.begin() * WIDTH], buf, r.end() - r.begin(), WIDTH); reduce128(&data[begin * WIDTH], buf, end - begin, WIDTH);
return std::accumulate(buf, buf + WIDTH, init, ReduceScalar()); return std::accumulate(buf, buf + WIDTH, init, ReduceScalar());
}, },
ReduceScalar(), ReduceScalar());
ap);
} else {
scalar_t buf[WIDTH];
reduce128(data, buf, k, WIDTH);
sum = std::accumulate(buf, buf + WIDTH, scalar_t(ident), ReduceScalar());
}
for (int64_t i = k * WIDTH; i != size; i++) { for (int64_t i = k * WIDTH; i != size; i++) {
sum = ReduceScalar()(sum, data[i]); sum = ReduceScalar()(sum, data[i]);
@ -127,8 +116,8 @@ struct Reduction {
// Reduce a 2d matrix down each column. Stores the results in out[0 ... cols-1] // Reduce a 2d matrix down each column. Stores the results in out[0 ... cols-1]
static void reduce2d(const scalar_t* data, scalar_t* out, int64_t rows, int64_t cols, int64_t stride) { static void reduce2d(const scalar_t* data, scalar_t* out, int64_t rows, int64_t cols, int64_t stride) {
int64_t cols_rounded = round_down(cols, WIDTH); int64_t cols_rounded = round_down(cols, WIDTH);
bool paralellize = cols * rows > internal::TBB_GRAIN_SIZE; bool paralellize = cols * rows > internal::GRAIN_SIZE;
parallel_for(cols_rounded, WIDTH, paralellize, [=](int64_t col) { _parallel_for(cols_rounded, WIDTH, paralellize, [=](int64_t col) {
reduce128(&data[col], &out[col], rows, stride); reduce128(&data[col], &out[col], rows, stride);
}); });

View File

@ -14,7 +14,7 @@
// compiled with AVX/AVX2 This is because of SSE-AVX transitions and a bug in // compiled with AVX/AVX2 This is because of SSE-AVX transitions and a bug in
// Glibc2.23 See https://bugs.launchpad.net/ubuntu/+source/glibc/+bug/1663280 // Glibc2.23 See https://bugs.launchpad.net/ubuntu/+source/glibc/+bug/1663280
// //
// On grainsize: The grainsize is chosen to roughly get TBB_GRAIN_SIZE number of // On grainsize: The grainsize is chosen to roughly get GRAIN_SIZE number of
// computations per task. Each task works across dim_size elements. 16 should be // computations per task. Each task works across dim_size elements. 16 should be
// a very rough approximation of the number of computations per dim_size element // a very rough approximation of the number of computations per dim_size element
// by counting simple computations (*, +, -) as 1 and exp or log as 4. // by counting simple computations (*, +, -) as 1 and exp or log as 4.
@ -30,7 +30,7 @@ inline void _vec_log_softmax_lastdim(
int64_t dim_size) { int64_t dim_size) {
using Vec = vec256::Vec256<scalar_t>; using Vec = vec256::Vec256<scalar_t>;
static constexpr int64_t CHUNK_SIZE = (128 / sizeof(scalar_t)) * Vec::size; static constexpr int64_t CHUNK_SIZE = (128 / sizeof(scalar_t)) * Vec::size;
int64_t grain_size = internal::TBB_GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE); int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE);
if (grain_size < CHUNK_SIZE) if (grain_size < CHUNK_SIZE)
grain_size = CHUNK_SIZE; grain_size = CHUNK_SIZE;
@ -93,7 +93,7 @@ inline void _vec_softmax_lastdim(
int64_t outer_size, int64_t outer_size,
int64_t dim_size) { int64_t dim_size) {
using Vec = vec256::Vec256<scalar_t>; using Vec = vec256::Vec256<scalar_t>;
int64_t grain_size = internal::TBB_GRAIN_SIZE / (16 * dim_size); int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size);
if (grain_size < 1) if (grain_size < 1)
grain_size = 1; grain_size = 1;
@ -134,7 +134,7 @@ inline void _vec_host_softmax_backward_lastdim(
int64_t outer_size, int64_t outer_size,
int64_t dim_size) { int64_t dim_size) {
using Vec = vec256::Vec256<scalar_t>; using Vec = vec256::Vec256<scalar_t>;
int64_t grain_size = internal::TBB_GRAIN_SIZE / (16 * dim_size); int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size);
if (grain_size < 1) if (grain_size < 1)
grain_size = 1; grain_size = 1;

View File

@ -355,7 +355,6 @@ class build_deps(PytorchCommand):
check_file(os.path.join(third_party_path, "nanopb", "CMakeLists.txt")) check_file(os.path.join(third_party_path, "nanopb", "CMakeLists.txt"))
check_file(os.path.join(third_party_path, "pybind11", "CMakeLists.txt")) check_file(os.path.join(third_party_path, "pybind11", "CMakeLists.txt"))
check_file(os.path.join(third_party_path, 'cpuinfo', 'CMakeLists.txt')) check_file(os.path.join(third_party_path, 'cpuinfo', 'CMakeLists.txt'))
check_file(os.path.join(third_party_path, 'tbb', 'Makefile'))
check_file(os.path.join(third_party_path, 'catch', 'CMakeLists.txt')) check_file(os.path.join(third_party_path, 'catch', 'CMakeLists.txt'))
check_file(os.path.join(third_party_path, 'onnx', 'CMakeLists.txt')) check_file(os.path.join(third_party_path, 'onnx', 'CMakeLists.txt'))

1
third_party/tbb vendored

Submodule third_party/tbb deleted from 633b01ad27

View File

@ -27,7 +27,7 @@ git fetch fullrepo
git checkout -b temporary-split-branch fullrepo/master git checkout -b temporary-split-branch fullrepo/master
# Cribbed from https://stackoverflow.com/questions/2982055/detach-many-subdirectories-into-a-new-separate-git-repository # Cribbed from https://stackoverflow.com/questions/2982055/detach-many-subdirectories-into-a-new-separate-git-repository
# and https://stackoverflow.com/questions/42355621/git-filter-branch-moving-a-folder-with-index-filter-does-not-work # and https://stackoverflow.com/questions/42355621/git-filter-branch-moving-a-folder-with-index-filter-does-not-work
git filter-branch -f --index-filter 'git rm --cached -qr --ignore-unmatch -- . && git reset -q $GIT_COMMIT -- aten cmake third_party/tbb third_party/catch third_party/cpuinfo && (git ls-files -s | sed "s-.travis.aten.yml-.travis.yml-" | sed "s-.gitmodules.aten-.gitmodules-" | git update-index --index-info)' git filter-branch -f --index-filter 'git rm --cached -qr --ignore-unmatch -- . && git reset -q $GIT_COMMIT -- aten cmake third_party/catch third_party/cpuinfo && (git ls-files -s | sed "s-.travis.aten.yml-.travis.yml-" | sed "s-.gitmodules.aten-.gitmodules-" | git update-index --index-info)'
git checkout master git checkout master
git merge temporary-split-branch git merge temporary-split-branch
git push git push